author     trav90 <travawine@palemoon.org>  2018-10-15 21:45:30 -0500
committer  trav90 <travawine@palemoon.org>  2018-10-15 21:45:30 -0500
commit     68569dee1416593955c1570d638b3d9250b33012 (patch)
tree       d960f017cd7eba3f125b7e8a813789ee2e076310 /third_party/aom/av1
parent     07c17b6b98ed32fcecff15c083ab0fd878de3cf0 (diff)
Import aom library
This is the reference implementation for the Alliance for Open Media's AV1 video codec. The commit used was 4d668d7feb1f8abd809d1bca0418570a7f142a36.
Diffstat (limited to 'third_party/aom/av1')
-rw-r--r-- third_party/aom/av1/av1.cmake | 518
-rw-r--r-- third_party/aom/av1/av1_common.mk | 180
-rw-r--r-- third_party/aom/av1/av1_cx.mk | 165
-rw-r--r-- third_party/aom/av1/av1_cx_iface.c | 1605
-rw-r--r-- third_party/aom/av1/av1_dx.mk | 71
-rw-r--r-- third_party/aom/av1/av1_dx_iface.c | 1223
-rw-r--r-- third_party/aom/av1/av1_iface_common.h | 146
-rw-r--r-- third_party/aom/av1/common/alloccommon.c | 209
-rw-r--r-- third_party/aom/av1/common/alloccommon.h | 47
-rw-r--r-- third_party/aom/av1/common/arm/neon/iht4x4_add_neon.c | 227
-rw-r--r-- third_party/aom/av1/common/arm/neon/iht8x8_add_neon.c | 593
-rw-r--r-- third_party/aom/av1/common/av1_fwd_txfm1d.c | 2312
-rw-r--r-- third_party/aom/av1/common/av1_fwd_txfm1d.h | 45
-rw-r--r-- third_party/aom/av1/common/av1_fwd_txfm2d.c | 230
-rw-r--r-- third_party/aom/av1/common/av1_fwd_txfm2d_cfg.h | 444
-rw-r--r-- third_party/aom/av1/common/av1_inv_txfm1d.c | 2334
-rw-r--r-- third_party/aom/av1/common/av1_inv_txfm1d.h | 45
-rw-r--r-- third_party/aom/av1/common/av1_inv_txfm2d.c | 256
-rw-r--r-- third_party/aom/av1/common/av1_inv_txfm2d_cfg.h | 447
-rw-r--r-- third_party/aom/av1/common/av1_loopfilter.c | 2336
-rw-r--r-- third_party/aom/av1/common/av1_loopfilter.h | 162
-rw-r--r-- third_party/aom/av1/common/av1_rtcd.c | 20
-rwxr-xr-x third_party/aom/av1/common/av1_rtcd_defs.pl | 644
-rw-r--r-- third_party/aom/av1/common/av1_txfm.h | 211
-rw-r--r-- third_party/aom/av1/common/blockd.c | 296
-rw-r--r-- third_party/aom/av1/common/blockd.h | 1371
-rw-r--r-- third_party/aom/av1/common/cdef.c | 445
-rw-r--r-- third_party/aom/av1/common/cdef.h | 53
-rw-r--r-- third_party/aom/av1/common/cdef_simd.h | 27
-rw-r--r-- third_party/aom/av1/common/cfl.c | 240
-rw-r--r-- third_party/aom/av1/common/cfl.h | 52
-rw-r--r-- third_party/aom/av1/common/clpf.c | 116
-rw-r--r-- third_party/aom/av1/common/clpf.h | 18
-rw-r--r-- third_party/aom/av1/common/clpf_neon.c | 14
-rw-r--r-- third_party/aom/av1/common/clpf_simd.h | 446
-rw-r--r-- third_party/aom/av1/common/clpf_sse2.c | 14
-rw-r--r-- third_party/aom/av1/common/clpf_sse4.c | 14
-rw-r--r-- third_party/aom/av1/common/clpf_ssse3.c | 14
-rw-r--r-- third_party/aom/av1/common/common.h | 64
-rw-r--r-- third_party/aom/av1/common/common_data.h | 1405
-rw-r--r-- third_party/aom/av1/common/convolve.c | 775
-rw-r--r-- third_party/aom/av1/common/convolve.h | 119
-rw-r--r-- third_party/aom/av1/common/debugmodes.c | 90
-rw-r--r-- third_party/aom/av1/common/entropy.c | 6438
-rw-r--r-- third_party/aom/av1/common/entropy.h | 428
-rw-r--r-- third_party/aom/av1/common/entropymode.c | 3792
-rw-r--r-- third_party/aom/av1/common/entropymode.h | 575
-rw-r--r-- third_party/aom/av1/common/entropymv.c | 315
-rw-r--r-- third_party/aom/av1/common/entropymv.h | 149
-rw-r--r-- third_party/aom/av1/common/enums.h | 543
-rw-r--r-- third_party/aom/av1/common/filter.c | 360
-rw-r--r-- third_party/aom/av1/common/filter.h | 111
-rw-r--r-- third_party/aom/av1/common/frame_buffers.c | 79
-rw-r--r-- third_party/aom/av1/common/frame_buffers.h | 54
-rw-r--r-- third_party/aom/av1/common/generic_code.c | 114
-rw-r--r-- third_party/aom/av1/common/generic_code.h | 81
-rw-r--r-- third_party/aom/av1/common/idct.c | 3067
-rw-r--r-- third_party/aom/av1/common/idct.h | 99
-rw-r--r-- third_party/aom/av1/common/laplace_tables.c | 657
-rw-r--r-- third_party/aom/av1/common/mips/dspr2/av1_itrans16_dspr2.c | 97
-rw-r--r-- third_party/aom/av1/common/mips/dspr2/av1_itrans4_dspr2.c | 91
-rw-r--r-- third_party/aom/av1/common/mips/dspr2/av1_itrans8_dspr2.c | 85
-rw-r--r-- third_party/aom/av1/common/mips/msa/av1_idct16x16_msa.c | 80
-rw-r--r-- third_party/aom/av1/common/mips/msa/av1_idct4x4_msa.c | 61
-rw-r--r-- third_party/aom/av1/common/mips/msa/av1_idct8x8_msa.c | 79
-rw-r--r-- third_party/aom/av1/common/mv.h | 302
-rw-r--r-- third_party/aom/av1/common/mvref_common.c | 1164
-rw-r--r-- third_party/aom/av1/common/mvref_common.h | 580
-rw-r--r-- third_party/aom/av1/common/od_dering.c | 416
-rw-r--r-- third_party/aom/av1/common/od_dering.h | 54
-rw-r--r-- third_party/aom/av1/common/od_dering_neon.c | 14
-rw-r--r-- third_party/aom/av1/common/od_dering_simd.h | 390
-rw-r--r-- third_party/aom/av1/common/od_dering_sse2.c | 14
-rw-r--r-- third_party/aom/av1/common/od_dering_sse4.c | 14
-rw-r--r-- third_party/aom/av1/common/od_dering_ssse3.c | 14
-rw-r--r-- third_party/aom/av1/common/odintrin.c | 551
-rw-r--r-- third_party/aom/av1/common/odintrin.h | 267
-rw-r--r-- third_party/aom/av1/common/onyxc_int.h | 1027
-rw-r--r-- third_party/aom/av1/common/partition.c | 256
-rw-r--r-- third_party/aom/av1/common/partition.h | 40
-rw-r--r-- third_party/aom/av1/common/pred_common.c | 1408
-rw-r--r-- third_party/aom/av1/common/pred_common.h | 266
-rw-r--r-- third_party/aom/av1/common/pvq.c | 1007
-rw-r--r-- third_party/aom/av1/common/pvq.h | 183
-rw-r--r-- third_party/aom/av1/common/pvq_state.c | 50
-rw-r--r-- third_party/aom/av1/common/pvq_state.h | 52
-rw-r--r-- third_party/aom/av1/common/quant_common.c | 11369
-rw-r--r-- third_party/aom/av1/common/quant_common.h | 111
-rw-r--r-- third_party/aom/av1/common/reconinter.c | 3083
-rw-r--r-- third_party/aom/av1/common/reconinter.h | 828
-rw-r--r-- third_party/aom/av1/common/reconintra.c | 2467
-rw-r--r-- third_party/aom/av1/common/reconintra.h | 67
-rw-r--r-- third_party/aom/av1/common/resize.c | 821
-rw-r--r-- third_party/aom/av1/common/resize.h | 69
-rw-r--r-- third_party/aom/av1/common/restoration.c | 1401
-rw-r--r-- third_party/aom/av1/common/restoration.h | 257
-rw-r--r-- third_party/aom/av1/common/scale.c | 164
-rw-r--r-- third_party/aom/av1/common/scale.h | 72
-rw-r--r-- third_party/aom/av1/common/scan.c | 6853
-rw-r--r-- third_party/aom/av1/common/scan.h | 95
-rw-r--r-- third_party/aom/av1/common/seg_common.c | 62
-rw-r--r-- third_party/aom/av1/common/seg_common.h | 86
-rw-r--r-- third_party/aom/av1/common/thread_common.c | 529
-rw-r--r-- third_party/aom/av1/common/thread_common.h | 64
-rw-r--r-- third_party/aom/av1/common/tile_common.c | 125
-rw-r--r-- third_party/aom/av1/common/tile_common.h | 59
-rw-r--r-- third_party/aom/av1/common/txb_common.c | 149
-rw-r--r-- third_party/aom/av1/common/txb_common.h | 304
-rw-r--r-- third_party/aom/av1/common/warped_motion.c | 1773
-rw-r--r-- third_party/aom/av1/common/warped_motion.h | 97
-rw-r--r-- third_party/aom/av1/common/x86/av1_convolve_ssse3.c | 1029
-rw-r--r-- third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c | 839
-rw-r--r-- third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c | 81
-rw-r--r-- third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c | 533
-rw-r--r-- third_party/aom/av1/common/x86/av1_txfm1d_sse4.h | 144
-rw-r--r-- third_party/aom/av1/common/x86/filterintra_sse4.c | 898
-rw-r--r-- third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c | 557
-rw-r--r-- third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c | 1398
-rw-r--r-- third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h | 92
-rw-r--r-- third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c | 286
-rw-r--r-- third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c | 507
-rw-r--r-- third_party/aom/av1/common/x86/idct_intrin_sse2.c | 1402
-rw-r--r-- third_party/aom/av1/common/x86/pvq_sse4.c | 252
-rw-r--r-- third_party/aom/av1/common/x86/pvq_sse4.h | 13
-rw-r--r-- third_party/aom/av1/common/x86/selfguided_sse4.c | 1805
-rw-r--r-- third_party/aom/av1/common/x86/warp_plane_sse2.c | 297
-rw-r--r-- third_party/aom/av1/common/zigzag.h | 33
-rw-r--r-- third_party/aom/av1/common/zigzag16.c | 157
-rw-r--r-- third_party/aom/av1/common/zigzag32.c | 199
-rw-r--r-- third_party/aom/av1/common/zigzag4.c | 22
-rw-r--r-- third_party/aom/av1/common/zigzag8.c | 50
-rw-r--r-- third_party/aom/av1/decoder/accounting.c | 138
-rw-r--r-- third_party/aom/av1/decoder/accounting.h | 83
-rw-r--r-- third_party/aom/av1/decoder/decint.h | 35
-rw-r--r-- third_party/aom/av1/decoder/decodeframe.c | 5159
-rw-r--r-- third_party/aom/av1/decoder/decodeframe.h | 39
-rw-r--r-- third_party/aom/av1/decoder/decodemv.c | 2405
-rw-r--r-- third_party/aom/av1/decoder/decodemv.h | 44
-rw-r--r-- third_party/aom/av1/decoder/decoder.c | 583
-rw-r--r-- third_party/aom/av1/decoder/decoder.h | 224
-rw-r--r-- third_party/aom/av1/decoder/decodetxb.c | 286
-rw-r--r-- third_party/aom/av1/decoder/decodetxb.h | 31
-rw-r--r-- third_party/aom/av1/decoder/detokenize.c | 467
-rw-r--r-- third_party/aom/av1/decoder/detokenize.h | 38
-rw-r--r-- third_party/aom/av1/decoder/dsubexp.c | 82
-rw-r--r-- third_party/aom/av1/decoder/dsubexp.h | 32
-rw-r--r-- third_party/aom/av1/decoder/dthread.c | 194
-rw-r--r-- third_party/aom/av1/decoder/dthread.h | 75
-rw-r--r-- third_party/aom/av1/decoder/generic_decoder.c | 110
-rw-r--r-- third_party/aom/av1/decoder/inspection.c | 103
-rw-r--r-- third_party/aom/av1/decoder/inspection.h | 82
-rw-r--r-- third_party/aom/av1/decoder/laplace_decoder.c | 121
-rw-r--r-- third_party/aom/av1/decoder/pvq_decoder.c | 378
-rw-r--r-- third_party/aom/av1/decoder/pvq_decoder.h | 40
-rw-r--r-- third_party/aom/av1/encoder/aq_complexity.c | 163
-rw-r--r-- third_party/aom/av1/encoder/aq_complexity.h | 37
-rw-r--r-- third_party/aom/av1/encoder/aq_cyclicrefresh.c | 566
-rw-r--r-- third_party/aom/av1/encoder/aq_cyclicrefresh.h | 98
-rw-r--r-- third_party/aom/av1/encoder/aq_variance.c | 207
-rw-r--r-- third_party/aom/av1/encoder/aq_variance.h | 31
-rw-r--r-- third_party/aom/av1/encoder/arm/neon/dct_neon.c | 36
-rw-r--r-- third_party/aom/av1/encoder/arm/neon/error_neon.c | 42
-rw-r--r-- third_party/aom/av1/encoder/arm/neon/quantize_neon.c | 118
-rw-r--r-- third_party/aom/av1/encoder/av1_quantize.c | 1790
-rw-r--r-- third_party/aom/av1/encoder/av1_quantize.h | 184
-rw-r--r-- third_party/aom/av1/encoder/bitstream.c | 5399
-rw-r--r-- third_party/aom/av1/encoder/bitstream.h | 53
-rw-r--r-- third_party/aom/av1/encoder/block.h | 241
-rw-r--r-- third_party/aom/av1/encoder/blockiness.c | 142
-rw-r--r-- third_party/aom/av1/encoder/context_tree.c | 331
-rw-r--r-- third_party/aom/av1/encoder/context_tree.h | 111
-rw-r--r-- third_party/aom/av1/encoder/corner_detect.c | 37
-rw-r--r-- third_party/aom/av1/encoder/corner_detect.h | 22
-rw-r--r-- third_party/aom/av1/encoder/corner_match.c | 193
-rw-r--r-- third_party/aom/av1/encoder/corner_match.h | 29
-rw-r--r-- third_party/aom/av1/encoder/cost.c | 67
-rw-r--r-- third_party/aom/av1/encoder/cost.h | 63
-rw-r--r-- third_party/aom/av1/encoder/daala_compat_enc.c | 30
-rw-r--r-- third_party/aom/av1/encoder/dct.c | 2228
-rw-r--r-- third_party/aom/av1/encoder/encint.h | 51
-rw-r--r-- third_party/aom/av1/encoder/encodeframe.c | 7160
-rw-r--r-- third_party/aom/av1/encoder/encodeframe.h | 58
-rw-r--r-- third_party/aom/av1/encoder/encodemb.c | 1671
-rw-r--r-- third_party/aom/av1/encoder/encodemb.h | 92
-rw-r--r-- third_party/aom/av1/encoder/encodemv.c | 497
-rw-r--r-- third_party/aom/av1/encoder/encodemv.h | 43
-rw-r--r-- third_party/aom/av1/encoder/encoder.c | 5980
-rw-r--r-- third_party/aom/av1/encoder/encoder.h | 883
-rw-r--r-- third_party/aom/av1/encoder/encodetxb.c | 784
-rw-r--r-- third_party/aom/av1/encoder/encodetxb.h | 53
-rw-r--r-- third_party/aom/av1/encoder/ethread.c | 176
-rw-r--r-- third_party/aom/av1/encoder/ethread.h | 34
-rw-r--r-- third_party/aom/av1/encoder/extend.c | 192
-rw-r--r-- third_party/aom/av1/encoder/extend.h | 32
-rw-r--r-- third_party/aom/av1/encoder/firstpass.c | 3026
-rw-r--r-- third_party/aom/av1/encoder/firstpass.h | 202
-rw-r--r-- third_party/aom/av1/encoder/generic_encoder.c | 157
-rw-r--r-- third_party/aom/av1/encoder/global_motion.c | 319
-rw-r--r-- third_party/aom/av1/encoder/global_motion.h | 62
-rw-r--r-- third_party/aom/av1/encoder/hybrid_fwd_txfm.c | 499
-rw-r--r-- third_party/aom/av1/encoder/hybrid_fwd_txfm.h | 44
-rw-r--r-- third_party/aom/av1/encoder/laplace_encoder.c | 107
-rw-r--r-- third_party/aom/av1/encoder/lookahead.c | 225
-rw-r--r-- third_party/aom/av1/encoder/lookahead.h | 114
-rw-r--r-- third_party/aom/av1/encoder/mbgraph.c | 398
-rw-r--r-- third_party/aom/av1/encoder/mbgraph.h | 39
-rw-r--r-- third_party/aom/av1/encoder/mcomp.c | 3493
-rw-r--r-- third_party/aom/av1/encoder/mcomp.h | 163
-rw-r--r-- third_party/aom/av1/encoder/mips/msa/error_msa.c | 108
-rw-r--r-- third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c | 436
-rw-r--r-- third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c | 98
-rw-r--r-- third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c | 65
-rw-r--r-- third_party/aom/av1/encoder/mips/msa/fdct_msa.h | 117
-rw-r--r-- third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c | 284
-rw-r--r-- third_party/aom/av1/encoder/palette.c | 277
-rw-r--r-- third_party/aom/av1/encoder/palette.h | 73
-rw-r--r-- third_party/aom/av1/encoder/pickcdef.c | 490
-rw-r--r-- third_party/aom/av1/encoder/picklpf.c | 211
-rw-r--r-- third_party/aom/av1/encoder/picklpf.h | 32
-rw-r--r-- third_party/aom/av1/encoder/pickrst.c | 1269
-rw-r--r-- third_party/aom/av1/encoder/pickrst.h | 30
-rw-r--r-- third_party/aom/av1/encoder/pvq_encoder.c | 988
-rw-r--r-- third_party/aom/av1/encoder/pvq_encoder.h | 53
-rw-r--r-- third_party/aom/av1/encoder/ransac.c | 1210
-rw-r--r-- third_party/aom/av1/encoder/ransac.h | 44
-rw-r--r-- third_party/aom/av1/encoder/ratectrl.c | 1759
-rw-r--r-- third_party/aom/av1/encoder/ratectrl.h | 284
-rw-r--r-- third_party/aom/av1/encoder/ratectrl_xiph.c | 1244
-rw-r--r-- third_party/aom/av1/encoder/ratectrl_xiph.h | 200
-rw-r--r-- third_party/aom/av1/encoder/rd.c | 1204
-rw-r--r-- third_party/aom/av1/encoder/rd.h | 505
-rw-r--r-- third_party/aom/av1/encoder/rdopt.c | 12713
-rw-r--r-- third_party/aom/av1/encoder/rdopt.h | 142
-rw-r--r-- third_party/aom/av1/encoder/segmentation.c | 394
-rw-r--r-- third_party/aom/av1/encoder/segmentation.h | 51
-rw-r--r-- third_party/aom/av1/encoder/speed_features.c | 506
-rw-r--r-- third_party/aom/av1/encoder/speed_features.h | 484
-rw-r--r-- third_party/aom/av1/encoder/subexp.c | 282
-rw-r--r-- third_party/aom/av1/encoder/subexp.h | 49
-rw-r--r-- third_party/aom/av1/encoder/temporal_filter.c | 719
-rw-r--r-- third_party/aom/av1/encoder/temporal_filter.h | 25
-rw-r--r-- third_party/aom/av1/encoder/tokenize.c | 887
-rw-r--r-- third_party/aom/av1/encoder/tokenize.h | 151
-rw-r--r-- third_party/aom/av1/encoder/treewriter.c | 59
-rw-r--r-- third_party/aom/av1/encoder/treewriter.h | 42
-rw-r--r-- third_party/aom/av1/encoder/variance_tree.c | 61
-rw-r--r-- third_party/aom/av1/encoder/variance_tree.h | 96
-rw-r--r-- third_party/aom/av1/encoder/wedge_utils.c | 125
-rw-r--r-- third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c | 193
-rw-r--r-- third_party/aom/av1/encoder/x86/av1_quantize_sse2.c | 211
-rw-r--r-- third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm | 204
-rw-r--r-- third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm | 219
-rw-r--r-- third_party/aom/av1/encoder/x86/dct_intrin_sse2.c | 3884
-rw-r--r-- third_party/aom/av1/encoder/x86/dct_sse2.asm | 87
-rw-r--r-- third_party/aom/av1/encoder/x86/dct_ssse3.c | 469
-rw-r--r-- third_party/aom/av1/encoder/x86/error_intrin_avx2.c | 73
-rw-r--r-- third_party/aom/av1/encoder/x86/error_sse2.asm | 125
-rw-r--r-- third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c | 72
-rw-r--r-- third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 1895
-rw-r--r-- third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c | 1678
-rw-r--r-- third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm | 215
-rw-r--r-- third_party/aom/av1/encoder/x86/wedge_utils_sse2.c | 254
-rw-r--r-- third_party/aom/av1/exports_dec | 2
-rw-r--r-- third_party/aom/av1/exports_enc | 2
264 files changed, 173837 insertions, 0 deletions
diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
new file mode 100644
index 000000000..00f687a0d
--- /dev/null
+++ b/third_party/aom/av1/av1.cmake
@@ -0,0 +1,518 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+set(AOM_AV1_COMMON_SOURCES
+ "${AOM_ROOT}/av1/av1_iface_common.h"
+ "${AOM_ROOT}/av1/common/alloccommon.c"
+ "${AOM_ROOT}/av1/common/alloccommon.h"
+ # TODO(tomfinegan): Forward transform belongs in encoder.
+ "${AOM_ROOT}/av1/common/av1_fwd_txfm1d.c"
+ "${AOM_ROOT}/av1/common/av1_fwd_txfm1d.h"
+ "${AOM_ROOT}/av1/common/av1_fwd_txfm2d.c"
+ "${AOM_ROOT}/av1/common/av1_fwd_txfm2d_cfg.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm2d_cfg.h"
+ "${AOM_ROOT}/av1/common/av1_loopfilter.c"
+ "${AOM_ROOT}/av1/common/av1_loopfilter.h"
+ "${AOM_ROOT}/av1/common/av1_txfm.h"
+ "${AOM_ROOT}/av1/common/blockd.c"
+ "${AOM_ROOT}/av1/common/blockd.h"
+ "${AOM_ROOT}/av1/common/common.h"
+ "${AOM_ROOT}/av1/common/common_data.h"
+ "${AOM_ROOT}/av1/common/convolve.c"
+ "${AOM_ROOT}/av1/common/convolve.h"
+ "${AOM_ROOT}/av1/common/debugmodes.c"
+ "${AOM_ROOT}/av1/common/entropy.c"
+ "${AOM_ROOT}/av1/common/entropy.h"
+ "${AOM_ROOT}/av1/common/entropymode.c"
+ "${AOM_ROOT}/av1/common/entropymode.h"
+ "${AOM_ROOT}/av1/common/entropymv.c"
+ "${AOM_ROOT}/av1/common/entropymv.h"
+ "${AOM_ROOT}/av1/common/enums.h"
+ "${AOM_ROOT}/av1/common/filter.c"
+ "${AOM_ROOT}/av1/common/filter.h"
+ "${AOM_ROOT}/av1/common/frame_buffers.c"
+ "${AOM_ROOT}/av1/common/frame_buffers.h"
+ "${AOM_ROOT}/av1/common/idct.c"
+ "${AOM_ROOT}/av1/common/idct.h"
+ "${AOM_ROOT}/av1/common/mv.h"
+ "${AOM_ROOT}/av1/common/mvref_common.c"
+ "${AOM_ROOT}/av1/common/mvref_common.h"
+ "${AOM_ROOT}/av1/common/odintrin.c"
+ "${AOM_ROOT}/av1/common/odintrin.h"
+ "${AOM_ROOT}/av1/common/onyxc_int.h"
+ "${AOM_ROOT}/av1/common/pred_common.c"
+ "${AOM_ROOT}/av1/common/pred_common.h"
+ "${AOM_ROOT}/av1/common/quant_common.c"
+ "${AOM_ROOT}/av1/common/quant_common.h"
+ "${AOM_ROOT}/av1/common/reconinter.c"
+ "${AOM_ROOT}/av1/common/reconinter.h"
+ "${AOM_ROOT}/av1/common/reconintra.c"
+ "${AOM_ROOT}/av1/common/reconintra.h"
+ "${AOM_ROOT}/av1/common/resize.c"
+ "${AOM_ROOT}/av1/common/resize.h"
+ "${AOM_ROOT}/av1/common/restoration.h"
+ "${AOM_ROOT}/av1/common/scale.c"
+ "${AOM_ROOT}/av1/common/scale.h"
+ "${AOM_ROOT}/av1/common/scan.c"
+ "${AOM_ROOT}/av1/common/scan.h"
+ "${AOM_ROOT}/av1/common/seg_common.c"
+ "${AOM_ROOT}/av1/common/seg_common.h"
+ "${AOM_ROOT}/av1/common/thread_common.c"
+ "${AOM_ROOT}/av1/common/thread_common.h"
+ "${AOM_ROOT}/av1/common/tile_common.c"
+ "${AOM_ROOT}/av1/common/tile_common.h")
+
+set(AOM_AV1_DECODER_SOURCES
+ "${AOM_ROOT}/av1/av1_dx_iface.c"
+ "${AOM_ROOT}/av1/decoder/decodeframe.c"
+ "${AOM_ROOT}/av1/decoder/decodeframe.h"
+ "${AOM_ROOT}/av1/decoder/decodemv.c"
+ "${AOM_ROOT}/av1/decoder/decodemv.h"
+ "${AOM_ROOT}/av1/decoder/decoder.c"
+ "${AOM_ROOT}/av1/decoder/decoder.h"
+ "${AOM_ROOT}/av1/decoder/detokenize.c"
+ "${AOM_ROOT}/av1/decoder/detokenize.h"
+ "${AOM_ROOT}/av1/decoder/dsubexp.c"
+ "${AOM_ROOT}/av1/decoder/dsubexp.h"
+ "${AOM_ROOT}/av1/decoder/dthread.c"
+ "${AOM_ROOT}/av1/decoder/dthread.h")
+
+set(AOM_AV1_ENCODER_SOURCES
+ "${AOM_ROOT}/av1/av1_cx_iface.c"
+ "${AOM_ROOT}/av1/encoder/aq_complexity.c"
+ "${AOM_ROOT}/av1/encoder/aq_complexity.h"
+ "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
+ "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
+ "${AOM_ROOT}/av1/encoder/aq_variance.c"
+ "${AOM_ROOT}/av1/encoder/aq_variance.h"
+ "${AOM_ROOT}/av1/encoder/av1_quantize.c"
+ "${AOM_ROOT}/av1/encoder/av1_quantize.h"
+ "${AOM_ROOT}/av1/encoder/bitstream.c"
+ "${AOM_ROOT}/av1/encoder/bitstream.h"
+ "${AOM_ROOT}/av1/encoder/block.h"
+ "${AOM_ROOT}/av1/encoder/context_tree.c"
+ "${AOM_ROOT}/av1/encoder/context_tree.h"
+ "${AOM_ROOT}/av1/encoder/cost.c"
+ "${AOM_ROOT}/av1/encoder/cost.h"
+ "${AOM_ROOT}/av1/encoder/dct.c"
+ "${AOM_ROOT}/av1/encoder/encodeframe.c"
+ "${AOM_ROOT}/av1/encoder/encodeframe.h"
+ "${AOM_ROOT}/av1/encoder/encodemb.c"
+ "${AOM_ROOT}/av1/encoder/encodemb.h"
+ "${AOM_ROOT}/av1/encoder/encodemv.c"
+ "${AOM_ROOT}/av1/encoder/encodemv.h"
+ "${AOM_ROOT}/av1/encoder/encoder.c"
+ "${AOM_ROOT}/av1/encoder/encoder.h"
+ "${AOM_ROOT}/av1/encoder/ethread.c"
+ "${AOM_ROOT}/av1/encoder/ethread.h"
+ "${AOM_ROOT}/av1/encoder/extend.c"
+ "${AOM_ROOT}/av1/encoder/extend.h"
+ "${AOM_ROOT}/av1/encoder/firstpass.c"
+ "${AOM_ROOT}/av1/encoder/firstpass.h"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
+ "${AOM_ROOT}/av1/encoder/lookahead.c"
+ "${AOM_ROOT}/av1/encoder/lookahead.h"
+ "${AOM_ROOT}/av1/encoder/mbgraph.c"
+ "${AOM_ROOT}/av1/encoder/mbgraph.h"
+ "${AOM_ROOT}/av1/encoder/mcomp.c"
+ "${AOM_ROOT}/av1/encoder/mcomp.h"
+ "${AOM_ROOT}/av1/encoder/picklpf.c"
+ "${AOM_ROOT}/av1/encoder/picklpf.h"
+ "${AOM_ROOT}/av1/encoder/ratectrl.c"
+ "${AOM_ROOT}/av1/encoder/ratectrl.h"
+ "${AOM_ROOT}/av1/encoder/rd.c"
+ "${AOM_ROOT}/av1/encoder/rd.h"
+ "${AOM_ROOT}/av1/encoder/rdopt.c"
+ "${AOM_ROOT}/av1/encoder/rdopt.h"
+ "${AOM_ROOT}/av1/encoder/segmentation.c"
+ "${AOM_ROOT}/av1/encoder/segmentation.h"
+ "${AOM_ROOT}/av1/encoder/speed_features.c"
+ "${AOM_ROOT}/av1/encoder/speed_features.h"
+ "${AOM_ROOT}/av1/encoder/subexp.c"
+ "${AOM_ROOT}/av1/encoder/subexp.h"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+ "${AOM_ROOT}/av1/encoder/tokenize.c"
+ "${AOM_ROOT}/av1/encoder/tokenize.h"
+ "${AOM_ROOT}/av1/encoder/treewriter.c"
+ "${AOM_ROOT}/av1/encoder/treewriter.h"
+ "${AOM_ROOT}/av1/encoder/variance_tree.c"
+ "${AOM_ROOT}/av1/encoder/variance_tree.h")
+
+set(AOM_AV1_COMMON_INTRIN_SSE2
+ # Requires CONFIG_GLOBAL_MOTION or CONFIG_WARPED_MOTION
+ #"${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/idct_intrin_sse2.c")
+
+set(AOM_AV1_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/av1/common/x86/av1_convolve_ssse3.c")
+
+set(AOM_AV1_COMMON_INTRIN_SSE4_1
+ "${AOM_ROOT}/av1/common/x86/av1_fwd_txfm1d_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_fwd_txfm2d_sse4.c")
+
+set(AOM_AV1_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/av1/common/x86/hybrid_inv_txfm_avx2.c")
+
+set(AOM_AV1_COMMON_INTRIN_DSPR2
+ "${AOM_ROOT}/av1/common/mips/dspr2/av1_itrans16_dspr2.c"
+ "${AOM_ROOT}/av1/common/mips/dspr2/av1_itrans4_dspr2.c"
+ "${AOM_ROOT}/av1/common/mips/dspr2/av1_itrans8_dspr2.c")
+
+set(AOM_AV1_COMMON_INTRIN_MSA
+ "${AOM_ROOT}/av1/common/mips/msa/av1_idct16x16_msa.c"
+ "${AOM_ROOT}/av1/common/mips/msa/av1_idct4x4_msa.c"
+ "${AOM_ROOT}/av1/common/mips/msa/av1_idct8x8_msa.c")
+
+set(AOM_AV1_ENCODER_ASM_SSE2
+ "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm"
+ "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm"
+ "${AOM_ROOT}/av1/encoder/x86/temporal_filter_apply_sse2.asm")
+
+set(AOM_AV1_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/av1/encoder/x86/dct_intrin_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c")
+
+set(AOM_AV1_ENCODER_ASM_SSSE3_X86_64
+ "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm")
+
+set(AOM_AV1_ENCODER_INTRIN_SSSE3
+ "${AOM_ROOT}/av1/encoder/x86/dct_ssse3.c")
+
+set(AOM_AV1_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/hybrid_fwd_txfm_avx2.c")
+
+set(AOM_AV1_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
+
+set(AOM_AV1_ENCODER_INTRIN_MSA
+ "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c"
+ "${AOM_ROOT}/av1/encoder/mips/msa/fdct16x16_msa.c"
+ "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c"
+ "${AOM_ROOT}/av1/encoder/mips/msa/fdct8x8_msa.c"
+ "${AOM_ROOT}/av1/encoder/mips/msa/fdct_msa.h"
+ "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c")
+
+if (CONFIG_HIGHBITDEPTH)
+ set(AOM_AV1_COMMON_INTRIN_SSE4_1
+ ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+ "${AOM_ROOT}/av1/common/x86/av1_highbd_convolve_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c")
+
+ set(AOM_AV1_COMMON_INTRIN_AVX2
+ ${AOM_AV1_COMMON_INTRIN_AVX2}
+ "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c")
+
+ set(AOM_AV1_ENCODER_INTRIN_SSE4_1
+ ${AOM_AV1_ENCODER_INTRIN_SSE4_1}
+ "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c")
+else ()
+ set(AOM_AV1_COMMON_INTRIN_NEON
+ ${AOM_AV1_COMMON_INTRIN_NEON}
+ "${AOM_ROOT}/av1/encoder/arm/neon/dct_neon.c"
+ "${AOM_ROOT}/av1/common/arm/neon/iht4x4_add_neon.c"
+ "${AOM_ROOT}/av1/common/arm/neon/iht8x8_add_neon.c")
+
+ set(AOM_AV1_ENCODER_INTRIN_NEON
+ ${AOM_AV1_ENCODER_INTRIN_NEON}
+ "${AOM_ROOT}/av1/encoder/arm/neon/error_neon.c")
+endif ()
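+
+# (Editor's note, not upstream text) In the else() branch above, the encoder
+# source arm/neon/dct_neon.c is appended to the *common* NEON list, whereas
+# the make build (av1_cx.mk) keeps it in the encoder list; this placement
+# appears to be a quirk carried over from the original build system rather
+# than a deliberate layering choice.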
+
+if (CONFIG_CDEF)
+ set(AOM_AV1_COMMON_SOURCES
+ ${AOM_AV1_COMMON_SOURCES}
+ "${AOM_ROOT}/av1/common/clpf.c"
+ "${AOM_ROOT}/av1/common/clpf.h"
+ "${AOM_ROOT}/av1/common/clpf_simd.h"
+ "${AOM_ROOT}/av1/common/cdef_simd.h"
+ "${AOM_ROOT}/av1/common/cdef.c"
+ "${AOM_ROOT}/av1/common/cdef.h"
+ "${AOM_ROOT}/av1/common/od_dering.c"
+ "${AOM_ROOT}/av1/common/od_dering.h"
+ "${AOM_ROOT}/av1/common/od_dering_simd.h")
+
+ set(AOM_AV1_ENCODER_SOURCES
+ ${AOM_AV1_ENCODER_SOURCES}
+ "${AOM_ROOT}/av1/encoder/pickcdef.c")
+
+ set(AOM_AV1_COMMON_INTRIN_SSE2
+ ${AOM_AV1_COMMON_INTRIN_SSE2}
+ "${AOM_ROOT}/av1/common/clpf_sse2.c"
+ "${AOM_ROOT}/av1/common/od_dering_sse2.c")
+
+ set(AOM_AV1_COMMON_INTRIN_SSSE3
+ ${AOM_AV1_COMMON_INTRIN_SSSE3}
+ "${AOM_ROOT}/av1/common/clpf_ssse3.c"
+ "${AOM_ROOT}/av1/common/od_dering_ssse3.c")
+
+ set(AOM_AV1_COMMON_INTRIN_SSE4_1
+ ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+ "${AOM_ROOT}/av1/common/clpf_sse4.c"
+ "${AOM_ROOT}/av1/common/od_dering_sse4.c")
+
+ set(AOM_AV1_COMMON_INTRIN_NEON
+ ${AOM_AV1_COMMON_INTRIN_NEON}
+ "${AOM_ROOT}/av1/common/clpf_neon.c"
+ "${AOM_ROOT}/av1/common/od_dering_neon.c")
+endif ()
+
+if (CONFIG_EXT_INTER)
+ set(AOM_AV1_ENCODER_SOURCES
+ ${AOM_AV1_ENCODER_SOURCES}
+ "${AOM_ROOT}/av1/encoder/wedge_utils.c")
+
+ set(AOM_AV1_ENCODER_INTRIN_SSE2
+ ${AOM_AV1_ENCODER_INTRIN_SSE2}
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
+endif ()
+
+if (CONFIG_FILTER_INTRA)
+ set(AOM_AV1_COMMON_INTRIN_SSE4_1
+ ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+ "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c")
+endif ()
+
+if (CONFIG_ACCOUNTING)
+ set(AOM_AV1_DECODER_SOURCES
+ ${AOM_AV1_DECODER_SOURCES}
+ "${AOM_ROOT}/av1/decoder/accounting.c"
+ "${AOM_ROOT}/av1/decoder/accounting.h")
+endif ()
+
+if (CONFIG_INSPECTION)
+ set(AOM_AV1_DECODER_SOURCES
+ ${AOM_AV1_DECODER_SOURCES}
+ "${AOM_ROOT}/av1/decoder/inspection.c"
+ "${AOM_ROOT}/av1/decoder/inspection.h")
+endif ()
+
+if (CONFIG_INTERNAL_STATS)
+ set(AOM_AV1_ENCODER_SOURCES
+ ${AOM_AV1_ENCODER_SOURCES}
+ "${AOM_ROOT}/av1/encoder/blockiness.c")
+endif ()
+
+if (CONFIG_PALETTE)
+ set(AOM_AV1_ENCODER_SOURCES
+ ${AOM_AV1_ENCODER_SOURCES}
+ "${AOM_ROOT}/av1/encoder/palette.c"
+ "${AOM_ROOT}/av1/encoder/palette.h")
+endif ()
+
+if (CONFIG_CFL)
+ set(AOM_AV1_COMMON_SOURCES
+ ${AOM_AV1_COMMON_SOURCES}
+ "${AOM_ROOT}/av1/common/cfl.c"
+ "${AOM_ROOT}/av1/common/cfl.h")
+endif ()
+
+if (CONFIG_PVQ)
+ set(AOM_AV1_COMMON_SOURCES
+ ${AOM_AV1_COMMON_SOURCES}
+ "${AOM_ROOT}/av1/common/laplace_tables.c"
+ "${AOM_ROOT}/av1/common/pvq.c"
+ "${AOM_ROOT}/av1/common/pvq.h"
+ "${AOM_ROOT}/av1/common/pvq_state.c"
+ "${AOM_ROOT}/av1/common/pvq_state.h"
+ "${AOM_ROOT}/av1/common/partition.c"
+ "${AOM_ROOT}/av1/common/partition.h"
+ "${AOM_ROOT}/av1/common/generic_code.c"
+ "${AOM_ROOT}/av1/common/generic_code.h"
+ "${AOM_ROOT}/av1/common/zigzag4.c"
+ "${AOM_ROOT}/av1/common/zigzag8.c"
+ "${AOM_ROOT}/av1/common/zigzag16.c"
+ "${AOM_ROOT}/av1/common/zigzag32.c")
+
+ set(AOM_AV1_DECODER_SOURCES
+ ${AOM_AV1_DECODER_SOURCES}
+ "${AOM_ROOT}/av1/decoder/decint.h"
+ "${AOM_ROOT}/av1/decoder/pvq_decoder.c"
+ "${AOM_ROOT}/av1/decoder/pvq_decoder.h"
+ "${AOM_ROOT}/av1/decoder/generic_decoder.c"
+ "${AOM_ROOT}/av1/decoder/laplace_decoder.c")
+
+ set(AOM_AV1_ENCODER_SOURCES
+ ${AOM_AV1_ENCODER_SOURCES}
+ "${AOM_ROOT}/av1/encoder/daala_compat_enc.c"
+ "${AOM_ROOT}/av1/encoder/encint.h"
+ "${AOM_ROOT}/av1/encoder/pvq_encoder.c"
+ "${AOM_ROOT}/av1/encoder/pvq_encoder.h"
+ "${AOM_ROOT}/av1/encoder/generic_encoder.c"
+ "${AOM_ROOT}/av1/encoder/laplace_encoder.c")
+
+ set(AOM_AV1_COMMON_INTRIN_SSE4_1
+ ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+ "${AOM_ROOT}/av1/common/x86/pvq_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/pvq_sse4.h")
+
+ if (NOT CONFIG_AV1_ENCODER)
+ # TODO(tomfinegan): These should probably be in av1/common, and in a
+ # common source list. For now this mirrors the original build system.
+ set(AOM_AV1_DECODER_SOURCES
+ ${AOM_AV1_DECODER_SOURCES}
+ "${AOM_ROOT}/av1/encoder/dct.c"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h")
+
+ set(AOM_AV1_DECODER_ASM_SSE2
+ ${AOM_AV1_DECODER_ASM_SSE2}
+ "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm")
+
+ set(AOM_AV1_DECODER_INTRIN_SSE2
+ ${AOM_AV1_DECODER_INTRIN_SSE2}
+ "${AOM_ROOT}/av1/encoder/x86/dct_intrin_sse2.c")
+
+ set(AOM_AV1_DECODER_INTRIN_SSSE3
+ ${AOM_AV1_DECODER_INTRIN_SSSE3}
+ "${AOM_ROOT}/av1/encoder/x86/dct_ssse3.c")
+ endif ()
+endif ()
+
+if (CONFIG_WARPED_MOTION)
+ set(AOM_AV1_COMMON_SOURCES
+ ${AOM_AV1_COMMON_SOURCES}
+ "${AOM_ROOT}/av1/common/warped_motion.c"
+ "${AOM_ROOT}/av1/common/warped_motion.h")
+
+ set(AOM_AV1_COMMON_INTRIN_SSE2
+ ${AOM_AV1_COMMON_INTRIN_SSE2}
+ "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c")
+endif ()
+
+# Setup AV1 common/decoder/encoder targets. The libaom target must exist before
+# this function is called.
+function (setup_av1_targets)
+ add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_common)
+ target_sources(aom PUBLIC $<TARGET_OBJECTS:aom_av1_common>)
+
+ if (CONFIG_AV1_DECODER)
+ add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_decoder)
+ target_sources(aom PUBLIC $<TARGET_OBJECTS:aom_av1_decoder>)
+ endif ()
+
+ if (CONFIG_AV1_ENCODER)
+ add_library(aom_av1_encoder OBJECT ${AOM_AV1_ENCODER_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_encoder)
+ target_sources(aom PUBLIC $<TARGET_OBJECTS:aom_av1_encoder>)
+ endif ()
+
+ if (HAVE_SSE2)
+ require_flag_nomsvc("-msse2" NO)
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSE2")
+ if (CONFIG_AV1_DECODER)
+ if (AOM_AV1_DECODER_ASM_SSE2)
+ add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2" "aom")
+ endif ()
+
+ if (AOM_AV1_DECODER_INTRIN_SSE2)
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_decoder"
+ "AOM_AV1_DECODER_INTRIN_SSE2")
+ endif ()
+ endif ()
+
+ if (CONFIG_AV1_ENCODER)
+ add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2" "aom")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE2")
+ endif ()
+ endif ()
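+
+ # (Editor's sketch of the helper contract, inferred from the calls above;
+ # the helpers themselves come from aom's cmake support files, which are not
+ # part of this diff.) add_intrinsics_object_library(flag suffix target
+ # listvar) compiles the sources named by ${listvar} into an object library
+ # built with the given ISA flag (e.g. -msse2) and attaches the objects to
+ # the target, so ISA-specific flags never leak onto generic sources;
+ # add_asm_library assembles a source list into a library linked into "aom".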
+
+ if (HAVE_SSSE3)
+ require_flag_nomsvc("-mssse3" NO)
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSSE3")
+
+ if (CONFIG_AV1_DECODER)
+ if (AOM_AV1_DECODER_INTRIN_SSSE3)
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_decoder"
+ "AOM_AV1_DECODER_INTRIN_SSSE3")
+ endif ()
+ endif ()
+
+ if (CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSSE3")
+ endif ()
+ endif ()
+
+ if (HAVE_SSE4_1)
+ require_flag_nomsvc("-msse4.1" NO)
+ add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSE4_1")
+
+ if (CONFIG_AV1_ENCODER)
+ if ("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ add_asm_library("aom_av1_encoder_ssse3"
+ "AOM_AV1_ENCODER_ASM_SSSE3_X86_64" "aom")
+ endif ()
+
+ if (AOM_AV1_ENCODER_INTRIN_SSE4_1)
+ add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE4_1")
+ endif ()
+ endif ()
+ endif ()
+
+ if (HAVE_AVX2)
+ require_flag_nomsvc("-mavx2" NO)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_AVX2")
+
+ if (CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_AVX2")
+ endif ()
+ endif ()
+
+ if (HAVE_NEON)
+ if (AOM_AV1_COMMON_INTRIN_NEON)
+ add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}"
+ "neon"
+ "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_NEON")
+ endif ()
+
+ if (AOM_AV1_ENCODER_INTRIN_NEON)
+ add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}"
+ "neon"
+ "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_NEON")
+ endif ()
+ endif ()
+
+ if (HAVE_DSPR2)
+ add_intrinsics_object_library("" "dspr2" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_DSPR2")
+ endif ()
+
+ if (HAVE_MSA)
+ add_intrinsics_object_library("" "msa" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_MSA")
+ add_intrinsics_object_library("" "msa" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_MSA")
+ endif ()
+
+ # Pass the new lib targets up to the parent scope instance of
+ # $AOM_LIB_TARGETS.
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+endfunction ()
+
+function (setup_av1_test_targets)
+endfunction ()
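+
+# A minimal usage sketch (editor's addition, not part of the import; the real
+# call site is the top-level CMakeLists.txt, which this diff does not show).
+# As the comment above setup_av1_targets() notes, the "aom" library target
+# must already exist when the function runs:
+#
+#   include("${AOM_ROOT}/av1/av1.cmake")
+#   setup_av1_targets()  # appends aom_av1_common/_decoder/_encoder to AOM_LIB_TARGETS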
diff --git a/third_party/aom/av1/av1_common.mk b/third_party/aom/av1/av1_common.mk
new file mode 100644
index 000000000..6b9a289af
--- /dev/null
+++ b/third_party/aom/av1/av1_common.mk
@@ -0,0 +1,180 @@
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+
+AV1_COMMON_SRCS-yes += av1_common.mk
+AV1_COMMON_SRCS-yes += av1_iface_common.h
+AV1_COMMON_SRCS-yes += common/alloccommon.c
+AV1_COMMON_SRCS-yes += common/av1_loopfilter.c
+AV1_COMMON_SRCS-yes += common/av1_loopfilter.h
+AV1_COMMON_SRCS-yes += common/blockd.c
+AV1_COMMON_SRCS-yes += common/debugmodes.c
+AV1_COMMON_SRCS-yes += common/entropy.c
+AV1_COMMON_SRCS-yes += common/entropymode.c
+AV1_COMMON_SRCS-yes += common/entropymv.c
+AV1_COMMON_SRCS-yes += common/frame_buffers.c
+AV1_COMMON_SRCS-yes += common/frame_buffers.h
+AV1_COMMON_SRCS-yes += common/alloccommon.h
+AV1_COMMON_SRCS-yes += common/blockd.h
+AV1_COMMON_SRCS-yes += common/common.h
+AV1_COMMON_SRCS-yes += common/entropy.h
+AV1_COMMON_SRCS-yes += common/entropymode.h
+AV1_COMMON_SRCS-yes += common/entropymv.h
+AV1_COMMON_SRCS-yes += common/enums.h
+AV1_COMMON_SRCS-yes += common/filter.h
+AV1_COMMON_SRCS-yes += common/filter.c
+AV1_COMMON_SRCS-yes += common/idct.h
+AV1_COMMON_SRCS-yes += common/idct.c
+AV1_COMMON_SRCS-yes += common/thread_common.h
+AV1_COMMON_SRCS-$(CONFIG_LV_MAP) += common/txb_common.h
+AV1_COMMON_SRCS-$(CONFIG_LV_MAP) += common/txb_common.c
+AV1_COMMON_SRCS-yes += common/mv.h
+AV1_COMMON_SRCS-yes += common/onyxc_int.h
+AV1_COMMON_SRCS-yes += common/pred_common.h
+AV1_COMMON_SRCS-yes += common/pred_common.c
+AV1_COMMON_SRCS-yes += common/quant_common.h
+AV1_COMMON_SRCS-yes += common/reconinter.h
+AV1_COMMON_SRCS-yes += common/reconintra.h
+AV1_COMMON_SRCS-yes += common/av1_rtcd.c
+AV1_COMMON_SRCS-yes += common/av1_rtcd_defs.pl
+AV1_COMMON_SRCS-yes += common/scale.h
+AV1_COMMON_SRCS-yes += common/scale.c
+AV1_COMMON_SRCS-yes += common/seg_common.h
+AV1_COMMON_SRCS-yes += common/seg_common.c
+AV1_COMMON_SRCS-yes += common/tile_common.h
+AV1_COMMON_SRCS-yes += common/tile_common.c
+AV1_COMMON_SRCS-yes += common/thread_common.c
+AV1_COMMON_SRCS-yes += common/mvref_common.c
+AV1_COMMON_SRCS-yes += common/mvref_common.h
+AV1_COMMON_SRCS-yes += common/quant_common.c
+AV1_COMMON_SRCS-yes += common/reconinter.c
+AV1_COMMON_SRCS-yes += common/reconintra.c
+AV1_COMMON_SRCS-yes += common/resize.c
+AV1_COMMON_SRCS-yes += common/resize.h
+AV1_COMMON_SRCS-yes += common/restoration.h
+AV1_COMMON_SRCS-yes += common/common_data.h
+AV1_COMMON_SRCS-yes += common/scan.c
+AV1_COMMON_SRCS-yes += common/scan.h
+# TODO(angiebird) the forward transform belongs under encoder/
+AV1_COMMON_SRCS-yes += common/av1_txfm.h
+AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.h
+AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.c
+AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d.h
+AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d.c
+AV1_COMMON_SRCS-yes += common/av1_fwd_txfm2d.c
+AV1_COMMON_SRCS-yes += common/av1_fwd_txfm2d_cfg.h
+AV1_COMMON_SRCS-yes += common/av1_inv_txfm2d.c
+AV1_COMMON_SRCS-yes += common/av1_inv_txfm2d_cfg.h
+AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_ssse3.c
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c
+endif
+AV1_COMMON_SRCS-yes += common/convolve.c
+AV1_COMMON_SRCS-yes += common/convolve.h
+ifeq ($(CONFIG_LOOP_RESTORATION),yes)
+AV1_COMMON_SRCS-yes += common/restoration.h
+AV1_COMMON_SRCS-yes += common/restoration.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/selfguided_sse4.c
+endif
+ifeq (yes,$(filter $(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION),yes))
+AV1_COMMON_SRCS-yes += common/warped_motion.h
+AV1_COMMON_SRCS-yes += common/warped_motion.c
+endif
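+
+# (Editor's note) The $(filter $(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION),yes)
+# test above, like the $(findstring yes,...) tests further down, reads as
+# "enabled if at least one of the listed CONFIG_ flags expands to yes".
+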
+ifeq ($(CONFIG_CDEF),yes)
+AV1_COMMON_SRCS-yes += common/clpf.c
+AV1_COMMON_SRCS-yes += common/clpf.h
+AV1_COMMON_SRCS-yes += common/clpf_simd.h
+AV1_COMMON_SRCS-yes += common/cdef_simd.h
+AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c
+AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
+AV1_COMMON_SRCS-$(HAVE_NEON) += common/clpf_neon.c
+AV1_COMMON_SRCS-$(HAVE_SSE2) += common/od_dering_sse2.c
+AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/od_dering_ssse3.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/od_dering_sse4.c
+AV1_COMMON_SRCS-$(HAVE_NEON) += common/od_dering_neon.c
+AV1_COMMON_SRCS-yes += common/od_dering.c
+AV1_COMMON_SRCS-yes += common/od_dering.h
+AV1_COMMON_SRCS-yes += common/od_dering_simd.h
+AV1_COMMON_SRCS-yes += common/cdef.c
+AV1_COMMON_SRCS-yes += common/cdef.h
+endif
+AV1_COMMON_SRCS-yes += common/odintrin.c
+AV1_COMMON_SRCS-yes += common/odintrin.h
+
+ifeq ($(CONFIG_CFL),yes)
+AV1_COMMON_SRCS-yes += common/cfl.h
+AV1_COMMON_SRCS-yes += common/cfl.c
+endif
+
+ifeq ($(CONFIG_PVQ),yes)
+# PVQ from daala
+AV1_COMMON_SRCS-yes += common/pvq.c
+AV1_COMMON_SRCS-yes += common/partition.c
+AV1_COMMON_SRCS-yes += common/partition.h
+AV1_COMMON_SRCS-yes += common/zigzag4.c
+AV1_COMMON_SRCS-yes += common/zigzag8.c
+AV1_COMMON_SRCS-yes += common/zigzag16.c
+AV1_COMMON_SRCS-yes += common/zigzag32.c
+AV1_COMMON_SRCS-yes += common/zigzag.h
+AV1_COMMON_SRCS-yes += common/generic_code.c
+AV1_COMMON_SRCS-yes += common/pvq_state.c
+AV1_COMMON_SRCS-yes += common/laplace_tables.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/pvq_sse4.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/pvq_sse4.h
+endif
+ifneq ($(findstring yes,$(CONFIG_PVQ)$(CONFIG_DAALA_DIST)$(CONFIG_XIPHRC)),)
+AV1_COMMON_SRCS-yes += common/pvq.h
+AV1_COMMON_SRCS-yes += common/pvq_state.h
+AV1_COMMON_SRCS-yes += common/generic_code.h
+endif
+
+ifneq ($(CONFIG_HIGHBITDEPTH),yes)
+AV1_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/av1_itrans4_dspr2.c
+AV1_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/av1_itrans8_dspr2.c
+AV1_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/av1_itrans16_dspr2.c
+endif
+
+# common (msa)
+AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/av1_idct4x4_msa.c
+AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/av1_idct8x8_msa.c
+AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/av1_idct16x16_msa.c
+
+AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/hybrid_inv_txfm_avx2.c
+
+ifeq ($(CONFIG_AV1_ENCODER),yes)
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm1d_sse4.h
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm1d_sse4.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm2d_sse4.c
+endif
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_txfm_utility_sse4.h
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_inv_txfm_sse4.c
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/highbd_inv_txfm_avx2.c
+endif
+
+ifneq ($(CONFIG_HIGHBITDEPTH),yes)
+AV1_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c
+AV1_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c
+endif
+
+ifeq ($(CONFIG_FILTER_INTRA),yes)
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filterintra_sse4.c
+endif
+
+ifneq ($(findstring yes,$(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION)),)
+AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/warp_plane_sse2.c
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_warp_plane_ssse3.c
+endif
+endif
+
+$(eval $(call rtcd_h_template,av1_rtcd,av1/common/av1_rtcd_defs.pl))
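+
+# (Editor's note) rtcd_h_template is assumed to be provided by the enclosing
+# make build: the eval above instantiates it so that av1_rtcd.h -- the
+# run-time CPU-dispatch header whose function pointers are initialized in
+# common/av1_rtcd.c -- is generated from the signatures listed in
+# common/av1_rtcd_defs.pl.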
diff --git a/third_party/aom/av1/av1_cx.mk b/third_party/aom/av1/av1_cx.mk
new file mode 100644
index 000000000..0a0d770ce
--- /dev/null
+++ b/third_party/aom/av1/av1_cx.mk
@@ -0,0 +1,165 @@
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+
+AV1_CX_EXPORTS += exports_enc
+
+AV1_CX_SRCS-yes += $(AV1_COMMON_SRCS-yes)
+AV1_CX_SRCS-no += $(AV1_COMMON_SRCS-no)
+AV1_CX_SRCS_REMOVE-yes += $(AV1_COMMON_SRCS_REMOVE-yes)
+AV1_CX_SRCS_REMOVE-no += $(AV1_COMMON_SRCS_REMOVE-no)
+
+AV1_CX_SRCS-yes += av1_cx_iface.c
+
+AV1_CX_SRCS-yes += encoder/av1_quantize.c
+AV1_CX_SRCS-yes += encoder/av1_quantize.h
+AV1_CX_SRCS-yes += encoder/bitstream.c
+AV1_CX_SRCS-yes += encoder/context_tree.c
+AV1_CX_SRCS-yes += encoder/context_tree.h
+AV1_CX_SRCS-yes += encoder/variance_tree.c
+AV1_CX_SRCS-yes += encoder/variance_tree.h
+AV1_CX_SRCS-yes += encoder/cost.h
+AV1_CX_SRCS-yes += encoder/cost.c
+AV1_CX_SRCS-yes += encoder/dct.c
+AV1_CX_SRCS-yes += encoder/hybrid_fwd_txfm.c
+AV1_CX_SRCS-yes += encoder/hybrid_fwd_txfm.h
+AV1_CX_SRCS-yes += encoder/encodeframe.c
+AV1_CX_SRCS-yes += encoder/encodeframe.h
+AV1_CX_SRCS-yes += encoder/encodemb.c
+AV1_CX_SRCS-yes += encoder/encodemv.c
+AV1_CX_SRCS-yes += encoder/ethread.h
+AV1_CX_SRCS-yes += encoder/ethread.c
+AV1_CX_SRCS-yes += encoder/extend.c
+AV1_CX_SRCS-yes += encoder/firstpass.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast.h
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/nonmax.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast_9.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_match.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_match.h
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_detect.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_detect.h
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/global_motion.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/global_motion.h
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/ransac.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/ransac.h
+AV1_CX_SRCS-yes += encoder/block.h
+AV1_CX_SRCS-yes += encoder/bitstream.h
+AV1_CX_SRCS-yes += encoder/encodemb.h
+AV1_CX_SRCS-yes += encoder/encodemv.h
+AV1_CX_SRCS-$(CONFIG_LV_MAP) += encoder/encodetxb.c
+AV1_CX_SRCS-$(CONFIG_LV_MAP) += encoder/encodetxb.h
+AV1_CX_SRCS-yes += encoder/extend.h
+AV1_CX_SRCS-yes += encoder/firstpass.h
+AV1_CX_SRCS-yes += encoder/lookahead.c
+AV1_CX_SRCS-yes += encoder/lookahead.h
+AV1_CX_SRCS-yes += encoder/mcomp.h
+AV1_CX_SRCS-yes += encoder/encoder.h
+AV1_CX_SRCS-yes += encoder/ratectrl.h
+ifeq ($(CONFIG_XIPHRC),yes)
+AV1_CX_SRCS-yes += encoder/ratectrl_xiph.h
+endif
+AV1_CX_SRCS-yes += encoder/rd.h
+AV1_CX_SRCS-yes += encoder/rdopt.h
+AV1_CX_SRCS-yes += encoder/tokenize.h
+AV1_CX_SRCS-yes += encoder/treewriter.h
+AV1_CX_SRCS-yes += encoder/mcomp.c
+AV1_CX_SRCS-yes += encoder/encoder.c
+ifeq ($(CONFIG_PALETTE),yes)
+AV1_CX_SRCS-yes += encoder/palette.h
+AV1_CX_SRCS-yes += encoder/palette.c
+endif
+AV1_CX_SRCS-yes += encoder/picklpf.c
+AV1_CX_SRCS-yes += encoder/picklpf.h
+AV1_CX_SRCS-$(CONFIG_LOOP_RESTORATION) += encoder/pickrst.c
+AV1_CX_SRCS-$(CONFIG_LOOP_RESTORATION) += encoder/pickrst.h
+AV1_CX_SRCS-yes += encoder/ratectrl.c
+ifeq ($(CONFIG_XIPHRC),yes)
+AV1_CX_SRCS-yes += encoder/ratectrl_xiph.c
+endif
+AV1_CX_SRCS-yes += encoder/rd.c
+AV1_CX_SRCS-yes += encoder/rdopt.c
+AV1_CX_SRCS-yes += encoder/segmentation.c
+AV1_CX_SRCS-yes += encoder/segmentation.h
+AV1_CX_SRCS-yes += encoder/speed_features.c
+AV1_CX_SRCS-yes += encoder/speed_features.h
+AV1_CX_SRCS-yes += encoder/subexp.c
+AV1_CX_SRCS-yes += encoder/subexp.h
+AV1_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/blockiness.c
+
+AV1_CX_SRCS-yes += encoder/tokenize.c
+AV1_CX_SRCS-yes += encoder/treewriter.c
+AV1_CX_SRCS-yes += encoder/aq_variance.c
+AV1_CX_SRCS-yes += encoder/aq_variance.h
+AV1_CX_SRCS-yes += encoder/aq_cyclicrefresh.c
+AV1_CX_SRCS-yes += encoder/aq_cyclicrefresh.h
+AV1_CX_SRCS-yes += encoder/aq_complexity.c
+AV1_CX_SRCS-yes += encoder/aq_complexity.h
+AV1_CX_SRCS-yes += encoder/temporal_filter.c
+AV1_CX_SRCS-yes += encoder/temporal_filter.h
+AV1_CX_SRCS-yes += encoder/mbgraph.c
+AV1_CX_SRCS-yes += encoder/mbgraph.h
+ifeq ($(CONFIG_CDEF),yes)
+AV1_CX_SRCS-yes += encoder/pickcdef.c
+endif
+ifeq ($(CONFIG_PVQ),yes)
+# PVQ from daala
+AV1_CX_SRCS-yes += encoder/daala_compat_enc.c
+AV1_CX_SRCS-yes += encoder/pvq_encoder.c
+AV1_CX_SRCS-yes += encoder/pvq_encoder.h
+AV1_CX_SRCS-yes += encoder/generic_encoder.c
+AV1_CX_SRCS-yes += encoder/laplace_encoder.c
+endif
+ifneq ($(findstring yes,$(CONFIG_XIPHRC)$(CONFIG_PVQ)),)
+AV1_CX_SRCS-yes += encoder/encint.h
+endif
+
+AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/av1_quantize_sse2.c
+AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c
+endif
+
+AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
+AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm
+
+ifeq ($(ARCH_X86_64),yes)
+AV1_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/av1_quantize_ssse3_x86_64.asm
+endif
+
+AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
+AV1_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
+AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/hybrid_fwd_txfm_avx2.c
+ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/av1_highbd_quantize_sse4.c
+AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
+endif
+
+ifeq ($(CONFIG_EXT_INTER),yes)
+AV1_CX_SRCS-yes += encoder/wedge_utils.c
+AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/wedge_utils_sse2.c
+endif
+
+AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/error_intrin_avx2.c
+
+ifneq ($(CONFIG_HIGHBITDEPTH),yes)
+AV1_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/dct_neon.c
+AV1_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/error_neon.c
+endif
+AV1_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/quantize_neon.c
+
+AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/error_msa.c
+AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct4x4_msa.c
+AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct8x8_msa.c
+AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct16x16_msa.c
+AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct_msa.h
+AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
+
+AV1_CX_SRCS-yes := $(filter-out $(AV1_CX_SRCS_REMOVE-yes),$(AV1_CX_SRCS-yes))
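+
+# (Editor's note) How the -yes/-no suffix convention above resolves: with
+# HAVE_SSE2=yes, a line such as
+#   AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/av1_quantize_sse2.c
+# appends to AV1_CX_SRCS-yes and the file is built; with HAVE_SSE2=no it
+# lands in the discarded AV1_CX_SRCS-no list. The filter-out above then
+# drops anything queued in AV1_CX_SRCS_REMOVE-yes from the final source list.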
diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c
new file mode 100644
index 000000000..d4832a15c
--- /dev/null
+++ b/third_party/aom/av1/av1_cx_iface.c
@@ -0,0 +1,1605 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "./aom_config.h"
+#include "aom/aom_encoder.h"
+#include "aom_ports/aom_once.h"
+#include "aom_ports/system_state.h"
+#include "aom/internal/aom_codec_internal.h"
+#include "./aom_version.h"
+#include "av1/encoder/encoder.h"
+#include "aom/aomcx.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/av1_iface_common.h"
+
+struct av1_extracfg {
+ int cpu_used; // available cpu percentage in 1/16
+ unsigned int enable_auto_alt_ref;
+#if CONFIG_EXT_REFS
+ unsigned int enable_auto_bwd_ref;
+#endif // CONFIG_EXT_REFS
+ unsigned int noise_sensitivity;
+ unsigned int sharpness;
+ unsigned int static_thresh;
+ unsigned int tile_columns;
+ unsigned int tile_rows;
+#if CONFIG_DEPENDENT_HORZTILES
+ unsigned int dependent_horz_tiles;
+#endif
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ unsigned int loop_filter_across_tiles_enabled;
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+ unsigned int arnr_max_frames;
+ unsigned int arnr_strength;
+ unsigned int min_gf_interval;
+ unsigned int max_gf_interval;
+ aom_tune_metric tuning;
+ unsigned int cq_level; // constrained quality level
+ unsigned int rc_max_intra_bitrate_pct;
+ unsigned int rc_max_inter_bitrate_pct;
+ unsigned int gf_cbr_boost_pct;
+ unsigned int lossless;
+#if CONFIG_AOM_QM
+ unsigned int enable_qm;
+ unsigned int qm_min;
+ unsigned int qm_max;
+#endif
+#if CONFIG_TILE_GROUPS
+ unsigned int num_tg;
+ unsigned int mtu_size;
+#endif
+#if CONFIG_TEMPMV_SIGNALING
+ unsigned int disable_tempmv;
+#endif
+ unsigned int frame_parallel_decoding_mode;
+ AQ_MODE aq_mode;
+#if CONFIG_EXT_DELTA_Q
+ DELTAQ_MODE deltaq_mode;
+#endif
+ unsigned int frame_periodic_boost;
+ aom_bit_depth_t bit_depth;
+ aom_tune_content content;
+ aom_color_space_t color_space;
+ int color_range;
+ int render_width;
+ int render_height;
+ aom_superblock_size_t superblock_size;
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ int ans_window_size_log2;
+#endif
+#if CONFIG_EXT_TILE
+ unsigned int tile_encoding_mode;
+#endif // CONFIG_EXT_TILE
+
+ unsigned int motion_vector_unit_test;
+};
+
+static struct av1_extracfg default_extra_cfg = {
+ 0, // cpu_used
+ 1, // enable_auto_alt_ref
+#if CONFIG_EXT_REFS
+ 0, // enable_auto_bwd_ref
+#endif // CONFIG_EXT_REFS
+ 0, // noise_sensitivity
+ 0, // sharpness
+ 0, // static_thresh
+#if CONFIG_EXT_TILE
+ UINT_MAX, // tile_columns
+ UINT_MAX, // tile_rows
+#else
+ 0, // tile_columns
+ 0, // tile_rows
+#endif // CONFIG_EXT_TILE
+#if CONFIG_DEPENDENT_HORZTILES
+ 0, // Dependent horizontal tiles
+#endif
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ 1, // loop_filter_across_tiles_enabled
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+ 7, // arnr_max_frames
+ 5, // arnr_strength
+ 0, // min_gf_interval; 0 -> default decision
+ 0, // max_gf_interval; 0 -> default decision
+ AOM_TUNE_PSNR, // tuning
+ 10, // cq_level
+ 0, // rc_max_intra_bitrate_pct
+ 0, // rc_max_inter_bitrate_pct
+ 0, // gf_cbr_boost_pct
+ 0, // lossless
+#if CONFIG_AOM_QM
+ 0, // enable_qm
+ DEFAULT_QM_FIRST, // qm_min
+ DEFAULT_QM_LAST, // qm_max
+#endif
+#if CONFIG_TILE_GROUPS
+ 1, // max number of tile groups
+ 0, // mtu_size
+#endif
+#if CONFIG_TEMPMV_SIGNALING
+ 0, // disable temporal mv prediction
+#endif
+ 1, // frame_parallel_decoding_mode
+ NO_AQ, // aq_mode
+#if CONFIG_EXT_DELTA_Q
+ NO_DELTA_Q, // deltaq_mode
+#endif
+ CONFIG_XIPHRC, // frame_periodic_boost
+ AOM_BITS_8, // Bit depth
+ AOM_CONTENT_DEFAULT, // content
+ AOM_CS_UNKNOWN, // color space
+ 0, // color range
+ 0, // render width
+ 0, // render height
+ AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ 23, // ans_window_size_log2
+#endif
+#if CONFIG_EXT_TILE
+ 0, // Tile encoding mode is TILE_NORMAL by default.
+#endif // CONFIG_EXT_TILE
+
+ 0, // motion_vector_unit_test
+};
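+
+/* Editor's note: the initializer above is positional, so every
+ * #if-conditional member of struct av1_extracfg must appear here in the same
+ * order and under the same guards. With CONFIG_EXT_TILE, the UINT_MAX
+ * defaults for tile_columns/tile_rows act as an "unset" sentinel that
+ * validate_config() below only range-checks once the caller changes it. */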
+
+struct aom_codec_alg_priv {
+ aom_codec_priv_t base;
+ aom_codec_enc_cfg_t cfg;
+ struct av1_extracfg extra_cfg;
+ AV1EncoderConfig oxcf;
+ AV1_COMP *cpi;
+ unsigned char *cx_data;
+ size_t cx_data_sz;
+ unsigned char *pending_cx_data;
+ size_t pending_cx_data_sz;
+ int pending_frame_count;
+ size_t pending_frame_sizes[8];
+ aom_image_t preview_img;
+ aom_enc_frame_flags_t next_frame_flags;
+ aom_postproc_cfg_t preview_ppcfg;
+ aom_codec_pkt_list_decl(256) pkt_list;
+ unsigned int fixed_kf_cntr;
+ // BufferPool that holds all reference frames.
+ BufferPool *buffer_pool;
+};
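+
+/* Editor's note (inferred from the field names, not from upstream
+ * documentation): pending_cx_data/pending_cx_data_sz and
+ * pending_frame_sizes[] accumulate compressed frames that are not emitted
+ * immediately -- e.g. invisible alt-ref frames -- so they can be packed
+ * together with the next shown frame into a single output packet. */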
+
+static aom_codec_err_t update_error_state(
+ aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
+ const aom_codec_err_t res = error->error_code;
+
+ if (res != AOM_CODEC_OK)
+ ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+
+ return res;
+}
+
+#undef ERROR
+#define ERROR(str) \
+ do { \
+ ctx->base.err_detail = str; \
+ return AOM_CODEC_INVALID_PARAM; \
+ } while (0)
+
+#define RANGE_CHECK(p, memb, lo, hi) \
+ do { \
+ if (!((p)->memb >= (lo) && (p)->memb <= (hi))) \
+ ERROR(#memb " out of range [" #lo ".." #hi "]"); \
+ } while (0)
+
+#define RANGE_CHECK_HI(p, memb, hi) \
+ do { \
+ if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]"); \
+ } while (0)
+
+#define RANGE_CHECK_LO(p, memb, lo) \
+ do { \
+ if (!((p)->memb >= (lo))) ERROR(#memb " out of range [" #lo "..]"); \
+ } while (0)
+
+#define RANGE_CHECK_BOOL(p, memb) \
+ do { \
+ if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \
+ } while (0)
+
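+/* Illustrative expansion (editor's sketch, not upstream code):
+ * RANGE_CHECK_HI(cfg, rc_max_quantizer, 63) expands to
+ *
+ *   if (!((cfg)->rc_max_quantizer <= (63)))
+ *     ERROR("rc_max_quantizer out of range [..63]");
+ *
+ * so each failed check stores its message in ctx->base.err_detail and makes
+ * the surrounding validator return AOM_CODEC_INVALID_PARAM. */
+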
+static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
+ const aom_codec_enc_cfg_t *cfg,
+ const struct av1_extracfg *extra_cfg) {
+ RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available
+ RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available
+ RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
+ RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
+ RANGE_CHECK_HI(cfg, g_profile, 3);
+
+ RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
+ RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
+ RANGE_CHECK_BOOL(extra_cfg, lossless);
+ RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 1);
+#if CONFIG_EXT_DELTA_Q
+ RANGE_CHECK(extra_cfg, deltaq_mode, 0, DELTAQ_MODE_COUNT - 1);
+#endif
+ RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
+ RANGE_CHECK_HI(cfg, g_threads, 64);
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
+ RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
+ RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
+ RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
+ RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+ RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO);
+ RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
+ RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
+ RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100);
+ RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
+ RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_LAST_PASS);
+ RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
+ RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
+ if (extra_cfg->max_gf_interval > 0) {
+ RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
+ }
+ if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) {
+ RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval,
+ (MAX_LAG_BUFFERS - 1));
+ }
+
+ if (cfg->rc_resize_allowed == 1) {
+ RANGE_CHECK_HI(cfg, rc_scaled_width, cfg->g_w);
+ RANGE_CHECK_HI(cfg, rc_scaled_height, cfg->g_h);
+ }
+
+ // AV1 does not support a lower bound on the keyframe interval in
+ // automatic keyframe placement mode.
+ if (cfg->kf_mode != AOM_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist &&
+ cfg->kf_min_dist > 0)
+ ERROR(
+ "kf_min_dist not supported in auto mode, use 0 "
+ "or kf_max_dist instead.");
+
+ RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2);
+ RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2);
+#if CONFIG_EXT_REFS
+ RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
+#endif // CONFIG_EXT_REFS
+ RANGE_CHECK(extra_cfg, cpu_used, 0, 8);
+ RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
+ RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
+ AOM_SUPERBLOCK_SIZE_DYNAMIC);
+#if CONFIG_EXT_TILE
+// TODO(any): Warning. If CONFIG_EXT_TILE is true, tile_columns really
+// means tile_width, and tile_rows really means tile_height. The interface
+// should be sanitized.
+#if CONFIG_EXT_PARTITION
+ if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64) {
+ if (extra_cfg->tile_columns != UINT_MAX)
+ RANGE_CHECK(extra_cfg, tile_columns, 1, 32);
+ if (extra_cfg->tile_rows != UINT_MAX)
+ RANGE_CHECK(extra_cfg, tile_rows, 1, 32);
+ } else
+#endif // CONFIG_EXT_PARTITION
+ {
+ if (extra_cfg->tile_columns != UINT_MAX)
+ RANGE_CHECK(extra_cfg, tile_columns, 1, 64);
+ if (extra_cfg->tile_rows != UINT_MAX)
+ RANGE_CHECK(extra_cfg, tile_rows, 1, 64);
+ }
+ RANGE_CHECK_HI(extra_cfg, tile_encoding_mode, 1);
+#else
+ RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
+ RANGE_CHECK_HI(extra_cfg, tile_rows, 2);
+#endif // CONFIG_EXT_TILE
+#if CONFIG_DEPENDENT_HORZTILES
+ RANGE_CHECK_HI(extra_cfg, dependent_horz_tiles, 1);
+#endif
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ RANGE_CHECK_HI(extra_cfg, loop_filter_across_tiles_enabled, 1);
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+ RANGE_CHECK_HI(extra_cfg, sharpness, 7);
+ RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15);
+ RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
+ RANGE_CHECK_HI(extra_cfg, cq_level, 63);
+ RANGE_CHECK(cfg, g_bit_depth, AOM_BITS_8, AOM_BITS_12);
+ RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
+ RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1);
+
+ // TODO(yaowu): remove this when ssim tuning is implemented for av1
+ if (extra_cfg->tuning == AOM_TUNE_SSIM)
+ ERROR("Option --tune=ssim is not currently supported in AV1.");
+
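+ // For the final pass, the first-pass stats buffer must hold at least one
+ // frame packet plus the trailing EOS packet, whose count field records how
+ // many frame packets precede it (checked below).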
+ if (cfg->g_pass == AOM_RC_LAST_PASS) {
+#if !CONFIG_XIPHRC
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+ const FIRSTPASS_STATS *stats;
+#endif
+
+ if (cfg->rc_twopass_stats_in.buf == NULL)
+ ERROR("rc_twopass_stats_in.buf not set.");
+
+#if !CONFIG_XIPHRC
+ if (cfg->rc_twopass_stats_in.sz % packet_sz)
+ ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+
+ if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+ ERROR("rc_twopass_stats_in requires at least two packets.");
+
+ stats =
+ (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1;
+
+ if ((int)(stats->count + 0.5) != n_packets - 1)
+ ERROR("rc_twopass_stats_in missing EOS stats packet");
+#endif
+ }
+
+#if !CONFIG_HIGHBITDEPTH
+ if (cfg->g_profile > (unsigned int)PROFILE_1) {
+ ERROR("Profile > 1 not supported in this build configuration");
+ }
+#endif
+ if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+ cfg->g_bit_depth > AOM_BITS_8) {
+ ERROR("Codec high bit-depth not supported in profile < 2");
+ }
+ if (cfg->g_profile <= (unsigned int)PROFILE_1 && cfg->g_input_bit_depth > 8) {
+ ERROR("Source high bit-depth not supported in profile < 2");
+ }
+ if (cfg->g_profile > (unsigned int)PROFILE_1 &&
+ cfg->g_bit_depth == AOM_BITS_8) {
+ ERROR("Codec bit-depth 8 not supported in profile > 1");
+ }
+ RANGE_CHECK(extra_cfg, color_space, AOM_CS_UNKNOWN, AOM_CS_SRGB);
+ RANGE_CHECK(extra_cfg, color_range, 0, 1);
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ RANGE_CHECK(extra_cfg, ans_window_size_log2, 8, 23);
+#endif
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx,
+ const aom_image_t *img) {
+ switch (img->fmt) {
+ case AOM_IMG_FMT_YV12:
+ case AOM_IMG_FMT_I420:
+ case AOM_IMG_FMT_I42016: break;
+ case AOM_IMG_FMT_I422:
+ case AOM_IMG_FMT_I444:
+ case AOM_IMG_FMT_I440:
+ if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) {
+ ERROR(
+ "Invalid image format. I422, I444, I440 images are "
+ "not supported in this profile.");
+ }
+ break;
+ case AOM_IMG_FMT_I42216:
+ case AOM_IMG_FMT_I44416:
+ case AOM_IMG_FMT_I44016:
+ if (ctx->cfg.g_profile != (unsigned int)PROFILE_1 &&
+ ctx->cfg.g_profile != (unsigned int)PROFILE_3) {
+ ERROR(
+ "Invalid image format. 16-bit I422, I444, I440 images are "
+ "not supported in this profile.");
+ }
+ break;
+ default:
+ ERROR(
+ "Invalid image format. Only YV12, I420, I422, I444 and I440 "
+ "images (8- or 16-bit) are supported.");
+ break;
+ }
+
+ if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h)
+ ERROR("Image size must match encoder init configuration size");
+
+ return AOM_CODEC_OK;
+}
+
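+// Bits per pixel for each supported input format: 8-bit 4:2:0 (YV12/I420)
+// needs 8 (Y) + 8/4 (U) + 8/4 (V) = 12 bits, 4:2:2 and 4:4:0 need 16,
+// 4:4:4 needs 24, and the 16-bit variants double each figure.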
+static int get_image_bps(const aom_image_t *img) {
+ switch (img->fmt) {
+ case AOM_IMG_FMT_YV12:
+ case AOM_IMG_FMT_I420: return 12;
+ case AOM_IMG_FMT_I422: return 16;
+ case AOM_IMG_FMT_I444: return 24;
+ case AOM_IMG_FMT_I440: return 16;
+ case AOM_IMG_FMT_I42016: return 24;
+ case AOM_IMG_FMT_I42216: return 32;
+ case AOM_IMG_FMT_I44416: return 48;
+ case AOM_IMG_FMT_I44016: return 32;
+ default: assert(0 && "Invalid image format"); break;
+ }
+ return 0;
+}
+
+static aom_codec_err_t set_encoder_config(
+ AV1EncoderConfig *oxcf, const aom_codec_enc_cfg_t *cfg,
+ const struct av1_extracfg *extra_cfg) {
+ const int is_vbr = cfg->rc_end_usage == AOM_VBR;
+ oxcf->profile = cfg->g_profile;
+ oxcf->max_threads = (int)cfg->g_threads;
+ oxcf->width = cfg->g_w;
+ oxcf->height = cfg->g_h;
+ oxcf->bit_depth = cfg->g_bit_depth;
+ oxcf->input_bit_depth = cfg->g_input_bit_depth;
+ // Derive the initial frame rate from the timebase; if the result looks
+ // implausible (> 180 fps), fall back to 30.
+ oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+ if (oxcf->init_framerate > 180) oxcf->init_framerate = 30;
+
+ oxcf->mode = GOOD;
+
+ switch (cfg->g_pass) {
+ case AOM_RC_ONE_PASS: oxcf->pass = 0; break;
+ case AOM_RC_FIRST_PASS: oxcf->pass = 1; break;
+ case AOM_RC_LAST_PASS: oxcf->pass = 2; break;
+ }
+
+ oxcf->lag_in_frames =
+ cfg->g_pass == AOM_RC_FIRST_PASS ? 0 : cfg->g_lag_in_frames;
+ oxcf->rc_mode = cfg->rc_end_usage;
+
+ // Convert target bandwidth from Kbit/s to Bit/s
+ oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
+ oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+ oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+ oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
+
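+ // Lossless pins both quantizer bounds to qindex 0; otherwise the 0..63 API
+ // quantizer values are mapped onto the encoder's internal qindex scale.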
+ oxcf->best_allowed_q =
+ extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_min_quantizer);
+ oxcf->worst_allowed_q =
+ extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_max_quantizer);
+ oxcf->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
+ oxcf->fixed_q = -1;
+
+#if CONFIG_AOM_QM
+ oxcf->using_qm = extra_cfg->enable_qm;
+ oxcf->qm_minlevel = extra_cfg->qm_min;
+ oxcf->qm_maxlevel = extra_cfg->qm_max;
+#endif
+
+#if CONFIG_TILE_GROUPS
+ oxcf->num_tile_groups = extra_cfg->num_tg;
+ oxcf->mtu = extra_cfg->mtu_size;
+#endif
+
+#if CONFIG_TEMPMV_SIGNALING
+ oxcf->disable_tempmv = extra_cfg->disable_tempmv;
+#endif
+ oxcf->under_shoot_pct = cfg->rc_undershoot_pct;
+ oxcf->over_shoot_pct = cfg->rc_overshoot_pct;
+
+ oxcf->scaled_frame_width = cfg->rc_scaled_width;
+ oxcf->scaled_frame_height = cfg->rc_scaled_height;
+ if (cfg->rc_resize_allowed == 1) {
+ oxcf->resize_mode =
+ (oxcf->scaled_frame_width == 0 || oxcf->scaled_frame_height == 0)
+ ? RESIZE_DYNAMIC
+ : RESIZE_FIXED;
+ } else {
+ oxcf->resize_mode = RESIZE_NONE;
+ }
+
+ oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
+ oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+ oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+
+ oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh;
+
+ oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct;
+ oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct;
+ oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct;
+
+ oxcf->auto_key =
+ cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist;
+
+ oxcf->key_freq = cfg->kf_max_dist;
+
+ oxcf->speed = extra_cfg->cpu_used;
+ oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
+#if CONFIG_EXT_REFS
+ oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
+#endif // CONFIG_EXT_REFS
+ oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
+ oxcf->sharpness = extra_cfg->sharpness;
+
+ oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in;
+
+#if CONFIG_FP_MB_STATS
+ oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
+#endif
+
+ oxcf->color_space = extra_cfg->color_space;
+ oxcf->color_range = extra_cfg->color_range;
+ oxcf->render_width = extra_cfg->render_width;
+ oxcf->render_height = extra_cfg->render_height;
+ oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
+ oxcf->arnr_strength = extra_cfg->arnr_strength;
+ oxcf->min_gf_interval = extra_cfg->min_gf_interval;
+ oxcf->max_gf_interval = extra_cfg->max_gf_interval;
+
+ oxcf->tuning = extra_cfg->tuning;
+ oxcf->content = extra_cfg->content;
+
+#if CONFIG_EXT_PARTITION
+ oxcf->superblock_size = extra_cfg->superblock_size;
+#endif // CONFIG_EXT_PARTITION
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ oxcf->ans_window_size_log2 = extra_cfg->ans_window_size_log2;
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+
+#if CONFIG_EXT_TILE
+ {
+#if CONFIG_EXT_PARTITION
+ const unsigned int max =
+ extra_cfg->superblock_size == AOM_SUPERBLOCK_SIZE_64X64 ? 64 : 32;
+#else
+ const unsigned int max = 64;
+#endif // CONFIG_EXT_PARTITION
+ oxcf->tile_columns = AOMMIN(extra_cfg->tile_columns, max);
+ oxcf->tile_rows = AOMMIN(extra_cfg->tile_rows, max);
+ oxcf->tile_encoding_mode = extra_cfg->tile_encoding_mode;
+ }
+#else
+ oxcf->tile_columns = extra_cfg->tile_columns;
+ oxcf->tile_rows = extra_cfg->tile_rows;
+#endif // CONFIG_EXT_TILE
+#if CONFIG_DEPENDENT_HORZTILES
+ oxcf->dependent_horz_tiles = extra_cfg->dependent_horz_tiles;
+#endif
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ oxcf->loop_filter_across_tiles_enabled =
+ extra_cfg->loop_filter_across_tiles_enabled;
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+ oxcf->error_resilient_mode = cfg->g_error_resilient;
+ oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
+
+ oxcf->aq_mode = extra_cfg->aq_mode;
+#if CONFIG_EXT_DELTA_Q
+ oxcf->deltaq_mode = extra_cfg->deltaq_mode;
+#endif
+
+ oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
+
+ oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
+ /*
+ printf("Current AV1 Settings: \n");
+ printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+ printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
+ printf("sharpness: %d\n", oxcf->sharpness);
+ printf("cpu_used: %d\n", oxcf->cpu_used);
+ printf("Mode: %d\n", oxcf->mode);
+ printf("auto_key: %d\n", oxcf->auto_key);
+ printf("key_freq: %d\n", oxcf->key_freq);
+ printf("end_usage: %d\n", oxcf->end_usage);
+ printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+ printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
+ printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
+ printf("optimal_buffer_level: %d\n", oxcf->optimal_buffer_level);
+ printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
+ printf("fixed_q: %d\n", oxcf->fixed_q);
+ printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
+ printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
+ printf("allow_spatial_resampling: %d\n", oxcf->allow_spatial_resampling);
+ printf("scaled_frame_width: %d\n", oxcf->scaled_frame_width);
+ printf("scaled_frame_height: %d\n", oxcf->scaled_frame_height);
+ printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias);
+ printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
+ printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
+ printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
+ printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf);
+ printf("Version: %d\n", oxcf->Version);
+ printf("error resilient: %d\n", oxcf->error_resilient_mode);
+ printf("frame parallel detokenization: %d\n",
+ oxcf->frame_parallel_decoding_mode);
+ */
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
+ const aom_codec_enc_cfg_t *cfg) {
+ aom_codec_err_t res;
+ int force_key = 0;
+
+ if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
+ if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS)
+ ERROR("Cannot change width or height after initialization");
+ if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
+ (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+ (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+ force_key = 1;
+ }
+
+ // Prevent increasing lag_in_frames. This check is stricter than it needs
+ // to be -- the real limit is not increasing past the first lag_in_frames
+ // value -- but we only track the last successful config, not the initial
+ // one.
+ if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames)
+ ERROR("Cannot increase lag_in_frames");
+
+ res = validate_config(ctx, cfg, &ctx->extra_cfg);
+
+ if (res == AOM_CODEC_OK) {
+ ctx->cfg = *cfg;
+ set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+ // On profile change, request a key frame
+ force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
+ av1_change_config(ctx->cpi, &ctx->oxcf);
+ }
+
+ if (force_key) ctx->next_frame_flags |= AOM_EFLAG_FORCE_KF;
+
+ return res;
+}
+
+static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const arg = va_arg(args, int *);
+ if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+ *arg = av1_get_quantizer(ctx->cpi);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_quantizer64(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const arg = va_arg(args, int *);
+ if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+ *arg = av1_qindex_to_quantizer(av1_get_quantizer(ctx->cpi));
+ return AOM_CODEC_OK;
+}
+
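+// Shared by the ctrl_set_* handlers below: validate a candidate extra config
+// against the current encoder config and, only on success, commit it and push
+// the merged settings into the running encoder.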
+static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx,
+ const struct av1_extracfg *extra_cfg) {
+ const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg);
+ if (res == AOM_CODEC_OK) {
+ ctx->extra_cfg = *extra_cfg;
+ set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+ av1_change_config(ctx->cpi, &ctx->oxcf);
+ }
+ return res;
+}
+
+static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.cpu_used = CAST(AOME_SET_CPUUSED, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_auto_alt_ref = CAST(AOME_SET_ENABLEAUTOALTREF, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+#if CONFIG_EXT_REFS
+static aom_codec_err_t ctrl_set_enable_auto_bwd_ref(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_auto_bwd_ref = CAST(AOME_SET_ENABLEAUTOBWDREF, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif // CONFIG_EXT_REFS
+
+static aom_codec_err_t ctrl_set_noise_sensitivity(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.noise_sensitivity = CAST(AV1E_SET_NOISE_SENSITIVITY, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_sharpness(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.sharpness = CAST(AOME_SET_SHARPNESS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.static_thresh = CAST(AOME_SET_STATIC_THRESHOLD, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.tile_columns = CAST(AV1E_SET_TILE_COLUMNS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.tile_rows = CAST(AV1E_SET_TILE_ROWS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#if CONFIG_DEPENDENT_HORZTILES
+static aom_codec_err_t ctrl_set_tile_dependent_rows(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.dependent_horz_tiles = CAST(AV1E_SET_TILE_DEPENDENT_ROWS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+static aom_codec_err_t ctrl_set_tile_loopfilter(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.loop_filter_across_tiles_enabled =
+ CAST(AV1E_SET_TILE_LOOPFILTER, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+
+static aom_codec_err_t ctrl_set_arnr_max_frames(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.arnr_max_frames = CAST(AOME_SET_ARNR_MAXFRAMES, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_arnr_strength(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.arnr_strength = CAST(AOME_SET_ARNR_STRENGTH, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tuning(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.tuning = CAST(AOME_SET_TUNING, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_cq_level(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.cq_level = CAST(AOME_SET_CQ_LEVEL, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_rc_max_intra_bitrate_pct(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.rc_max_intra_bitrate_pct =
+ CAST(AOME_SET_MAX_INTRA_BITRATE_PCT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_rc_max_inter_bitrate_pct(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.rc_max_inter_bitrate_pct =
+ CAST(AOME_SET_MAX_INTER_BITRATE_PCT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.gf_cbr_boost_pct = CAST(AV1E_SET_GF_CBR_BOOST_PCT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_lossless(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.lossless = CAST(AV1E_SET_LOSSLESS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+#if CONFIG_AOM_QM
+static aom_codec_err_t ctrl_set_enable_qm(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_qm_min(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.qm_min = CAST(AV1E_SET_QM_MIN, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_qm_max(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.qm_max = CAST(AV1E_SET_QM_MAX, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif
+
+#if CONFIG_TILE_GROUPS
+static aom_codec_err_t ctrl_set_num_tg(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.num_tg = CAST(AV1E_SET_NUM_TG, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_mtu(aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.mtu_size = CAST(AV1E_SET_MTU, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif
+#if CONFIG_TEMPMV_SIGNALING
+static aom_codec_err_t ctrl_set_disable_tempmv(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.disable_tempmv = CAST(AV1E_SET_DISABLE_TEMPMV, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif
+static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.frame_parallel_decoding_mode =
+ CAST(AV1E_SET_FRAME_PARALLEL_DECODING, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+#if CONFIG_EXT_TILE
+static aom_codec_err_t ctrl_set_tile_encoding_mode(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.tile_encoding_mode = CAST(AV1E_SET_TILE_ENCODING_MODE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif // CONFIG_EXT_TILE
+
+static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.aq_mode = CAST(AV1E_SET_AQ_MODE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+#if CONFIG_EXT_DELTA_Q
+static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.deltaq_mode = CAST(AV1E_SET_DELTAQ_MODE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif
+static aom_codec_err_t ctrl_set_min_gf_interval(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.min_gf_interval = CAST(AV1E_SET_MIN_GF_INTERVAL, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_max_gf_interval(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.max_gf_interval = CAST(AV1E_SET_MAX_GF_INTERVAL, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_frame_periodic_boost(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.frame_periodic_boost = CAST(AV1E_SET_FRAME_PERIODIC_BOOST, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_enable_motion_vector_unit_test(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.motion_vector_unit_test =
+ CAST(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx,
+ aom_codec_priv_enc_mr_cfg_t *data) {
+ aom_codec_err_t res = AOM_CODEC_OK;
+ (void)data;
+
+ if (ctx->priv == NULL) {
+ aom_codec_alg_priv_t *const priv = aom_calloc(1, sizeof(*priv));
+ if (priv == NULL) return AOM_CODEC_MEM_ERROR;
+
+ ctx->priv = (aom_codec_priv_t *)priv;
+ ctx->priv->init_flags = ctx->init_flags;
+ ctx->priv->enc.total_encoders = 1;
+ priv->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+ if (priv->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) {
+ return AOM_CODEC_MEM_ERROR;
+ }
+#endif
+
+ if (ctx->config.enc) {
+ // Point ctx->config.enc at an internal copy of the caller's config.
+ priv->cfg = *ctx->config.enc;
+ ctx->config.enc = &priv->cfg;
+ }
+
+ priv->extra_cfg = default_extra_cfg;
+ once(av1_initialize_enc);
+
+ res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
+
+ if (res == AOM_CODEC_OK) {
+ set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+#if CONFIG_HIGHBITDEPTH
+ priv->oxcf.use_highbitdepth =
+ (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+#endif
+ priv->cpi = av1_create_compressor(&priv->oxcf, priv->buffer_pool);
+ if (priv->cpi == NULL)
+ res = AOM_CODEC_MEM_ERROR;
+ else
+ priv->cpi->output_pkt_list = &priv->pkt_list.head;
+ }
+ }
+
+ return res;
+}
+
+static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) {
+ free(ctx->cx_data);
+ av1_remove_compressor(ctx->cpi);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+ aom_free(ctx->buffer_pool);
+ aom_free(ctx);
+ return AOM_CODEC_OK;
+}
+
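+// Every deadline currently maps to GOOD (the switch below has only default
+// paths), so this only reconfigures the encoder if the mode changed elsewhere.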
+static void pick_quickcompress_mode(aom_codec_alg_priv_t *ctx,
+ unsigned long deadline) {
+ MODE new_mode = GOOD;
+
+ switch (ctx->cfg.g_pass) {
+ case AOM_RC_ONE_PASS:
+ switch (deadline) {
+ default: new_mode = GOOD; break;
+ }
+ break;
+ case AOM_RC_FIRST_PASS: break;
+ case AOM_RC_LAST_PASS: new_mode = GOOD;
+ }
+
+ if (ctx->oxcf.mode != new_mode) {
+ ctx->oxcf.mode = new_mode;
+ av1_change_config(ctx->cpi, &ctx->oxcf);
+ }
+}
+
+// Turn on to test if supplemental superframe data breaks decoding
+// #define TEST_SUPPLEMENTAL_SUPERFRAME_DATA
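+//
+// Superframe index layout (as produced below): one marker byte, then a
+// (mag + 1)-byte little-endian size (stored minus one) for every frame except
+// the last, then the marker byte again. Marker bits: 7..6 = 0b11, 5..3 = mag,
+// 2..0 = frame count minus one. Example: three pending frames whose largest
+// indexed size is 300 bytes give mag = 1 and
+// marker = 0xc0 | (1 << 3) | 2 = 0xca, for index_sz = 2 + 2 * 2 = 6 bytes.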
+static int write_superframe_index(aom_codec_alg_priv_t *ctx) {
+ uint8_t marker = 0xc0;
+ unsigned int mask;
+ int mag, index_sz;
+ int i;
+ size_t max_frame_sz = 0;
+
+ assert(ctx->pending_frame_count);
+ assert(ctx->pending_frame_count <= 8);
+
+ // Add the number of frames to the marker byte
+ marker |= ctx->pending_frame_count - 1;
+ for (i = 0; i < ctx->pending_frame_count - 1; i++) {
+ const size_t frame_sz = (unsigned int)ctx->pending_frame_sizes[i] - 1;
+ max_frame_sz = frame_sz > max_frame_sz ? frame_sz : max_frame_sz;
+ }
+
+ // Choose the magnitude
+ for (mag = 0, mask = 0xff; mag < 4; mag++) {
+ if (max_frame_sz <= mask) break;
+ mask <<= 8;
+ mask |= 0xff;
+ }
+ marker |= mag << 3;
+
+ // Write the index
+ index_sz = 2 + (mag + 1) * (ctx->pending_frame_count - 1);
+ if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) {
+ uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz;
+#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA
+ uint8_t marker_test = 0xc0;
+ int mag_test = 2; // 1 - 4
+ int frames_test = 4; // 1 - 8
+ int index_sz_test = 2 + mag_test * frames_test;
+ marker_test |= frames_test - 1;
+ marker_test |= (mag_test - 1) << 3;
+ *x++ = marker_test;
+ for (i = 0; i < mag_test * frames_test; ++i)
+ *x++ = 0; // fill up with arbitrary data
+ *x++ = marker_test;
+ ctx->pending_cx_data_sz += index_sz_test;
+ printf("Added supplemental superframe data\n");
+#endif
+
+ *x++ = marker;
+ for (i = 0; i < ctx->pending_frame_count - 1; i++) {
+ unsigned int this_sz;
+ int j;
+
+ assert(ctx->pending_frame_sizes[i] > 0);
+ this_sz = (unsigned int)ctx->pending_frame_sizes[i] - 1;
+ for (j = 0; j <= mag; j++) {
+ *x++ = this_sz & 0xff;
+ this_sz >>= 8;
+ }
+ }
+ *x++ = marker;
+ ctx->pending_cx_data_sz += index_sz;
+#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA
+ index_sz += index_sz_test;
+#endif
+ }
+ return index_sz;
+}
+
+// AV1 uses 10,000,000 ticks/second as its timestamp unit
+#define TICKS_PER_SEC 10000000LL
+
+static int64_t timebase_units_to_ticks(const aom_rational_t *timebase,
+ int64_t n) {
+ return n * TICKS_PER_SEC * timebase->num / timebase->den;
+}
+
+static int64_t ticks_to_timebase_units(const aom_rational_t *timebase,
+ int64_t n) {
+ const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
+ return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
+}
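+
+// Example: with the default { 1, 30 } timebase, pts = 3 becomes
+// 3 * 10000000 * 1 / 30 = 1000000 ticks, and ticks_to_timebase_units()
+// rounds the inverse conversion back to 3.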
+
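+// Translate libaom-internal frame flags into the public packet flags; the
+// raw lib_flags are preserved in the upper 16 bits.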
+static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi,
+ unsigned int lib_flags) {
+ aom_codec_frame_flags_t flags = lib_flags << 16;
+
+ if (lib_flags & FRAMEFLAGS_KEY) flags |= AOM_FRAME_IS_KEY;
+
+ if (cpi->droppable) flags |= AOM_FRAME_IS_DROPPABLE;
+
+ return flags;
+}
+
+static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
+ const aom_image_t *img,
+ aom_codec_pts_t pts,
+ unsigned long duration,
+ aom_enc_frame_flags_t enc_flags,
+ unsigned long deadline) {
+ const size_t kMinCompressedSize = 8192;
+ volatile aom_codec_err_t res = AOM_CODEC_OK;
+ volatile aom_enc_frame_flags_t flags = enc_flags;
+ AV1_COMP *const cpi = ctx->cpi;
+ const aom_rational_t *const timebase = &ctx->cfg.g_timebase;
+ size_t data_sz;
+
+ if (cpi == NULL) return AOM_CODEC_INVALID_PARAM;
+
+ if (img != NULL) {
+ res = validate_img(ctx, img);
+ // TODO(jzern): the checks related to cpi's validity should be treated as a
+ // failure condition; encoder setup is currently done fully in init().
+ if (res == AOM_CODEC_OK) {
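+ // Size the worst-case output buffer from the 32-pixel-aligned frame
+ // dimensions and the input format's bits per pixel.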
+#if CONFIG_EXT_REFS
+ data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
+ ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) * get_image_bps(img);
+#else
+ // There's no codec control for multiple alt-refs, so check the encoder
+ // instance for its status to determine the compressed data size.
+ data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
+ ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) * get_image_bps(img) / 8 *
+ (cpi->multi_arf_allowed ? 8 : 2);
+#endif // CONFIG_EXT_REFS
+ if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize;
+ if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
+ ctx->cx_data_sz = data_sz;
+ free(ctx->cx_data);
+ ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
+ if (ctx->cx_data == NULL) {
+ return AOM_CODEC_MEM_ERROR;
+ }
+ }
+ }
+ }
+
+ pick_quickcompress_mode(ctx, deadline);
+ aom_codec_pkt_list_init(&ctx->pkt_list);
+
+ // Handle Flags
+ if (((flags & AOM_EFLAG_NO_UPD_GF) && (flags & AOM_EFLAG_FORCE_GF)) ||
+ ((flags & AOM_EFLAG_NO_UPD_ARF) && (flags & AOM_EFLAG_FORCE_ARF))) {
+ ctx->base.err_detail = "Conflicting flags.";
+ return AOM_CODEC_INVALID_PARAM;
+ }
+
+ if (setjmp(cpi->common.error.jmp)) {
+ cpi->common.error.setjmp = 0;
+ res = update_error_state(ctx, &cpi->common.error);
+ aom_clear_system_state();
+ return res;
+ }
+ cpi->common.error.setjmp = 1;
+
+ av1_apply_encoding_flags(cpi, flags);
+
+ // Handle fixed keyframe intervals
+ if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
+ ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+ if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
+ flags |= AOM_EFLAG_FORCE_KF;
+ ctx->fixed_kf_cntr = 1;
+ }
+ }
+
+ if (res == AOM_CODEC_OK) {
+ unsigned int lib_flags = 0;
+ YV12_BUFFER_CONFIG sd;
+ int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
+ int64_t dst_end_time_stamp =
+ timebase_units_to_ticks(timebase, pts + duration);
+ size_t size, cx_data_sz;
+ unsigned char *cx_data;
+
+ // Set up internal flags
+ if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
+
+ if (img != NULL) {
+ res = image2yuvconfig(img, &sd);
+
+ // Store the original flags in to the frame buffer. Will extract the
+ // key frame flag when we actually encode this frame.
+ if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
+ dst_time_stamp, dst_end_time_stamp)) {
+ res = update_error_state(ctx, &cpi->common.error);
+ }
+ ctx->next_frame_flags = 0;
+ }
+
+ cx_data = ctx->cx_data;
+ cx_data_sz = ctx->cx_data_sz;
+
+ /* Any pending invisible frames? */
+ if (ctx->pending_cx_data) {
+ memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);
+ ctx->pending_cx_data = cx_data;
+ cx_data += ctx->pending_cx_data_sz;
+ cx_data_sz -= ctx->pending_cx_data_sz;
+
+ /* TODO: this is a minimal check; the underlying codec doesn't respect
+ * the buffer size anyway.
+ */
+ if (cx_data_sz < ctx->cx_data_sz / 2) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+ "Compressed data buffer too small");
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ while (cx_data_sz >= ctx->cx_data_sz / 2 &&
+ -1 != av1_get_compressed_data(cpi, &lib_flags, &size, cx_data,
+ &dst_time_stamp, &dst_end_time_stamp,
+ !img)) {
+#if CONFIG_REFERENCE_BUFFER
+ if (cpi->common.invalid_delta_frame_id_minus1) {
+ ctx->base.err_detail = "Invalid delta_frame_id_minus1";
+ return AOM_CODEC_ERROR;
+ }
+#endif
+ if (size) {
+ aom_codec_cx_pkt_t pkt;
+
+ // Pack invisible frames with the next visible frame
+ if (!cpi->common.show_frame) {
+ if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data;
+ ctx->pending_cx_data_sz += size;
+ ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+ cx_data += size;
+ cx_data_sz -= size;
+
+ continue;
+ }
+
+ // Add the frame packet to the list of returned packets.
+ pkt.kind = AOM_CODEC_CX_FRAME_PKT;
+ pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp);
+ pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
+ timebase, dst_end_time_stamp - dst_time_stamp);
+ pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+
+ if (ctx->pending_cx_data) {
+ ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+ ctx->pending_cx_data_sz += size;
+ size += write_superframe_index(ctx);
+ pkt.data.frame.buf = ctx->pending_cx_data;
+ pkt.data.frame.sz = ctx->pending_cx_data_sz;
+ ctx->pending_cx_data = NULL;
+ ctx->pending_cx_data_sz = 0;
+ ctx->pending_frame_count = 0;
+ } else {
+ pkt.data.frame.buf = cx_data;
+ pkt.data.frame.sz = size;
+ }
+ pkt.data.frame.partition_id = -1;
+
+ aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+ cx_data += size;
+ cx_data_sz -= size;
+ }
+ }
+ }
+
+ cpi->common.error.setjmp = 0;
+ return res;
+}
+
+static const aom_codec_cx_pkt_t *encoder_get_cxdata(aom_codec_alg_priv_t *ctx,
+ aom_codec_iter_t *iter) {
+ return aom_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+}
+
+static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_ref_frame_t *const frame = va_arg(args, aom_ref_frame_t *);
+
+ if (frame != NULL) {
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+ av1_set_reference_enc(ctx->cpi, ref_frame_to_av1_reframe(frame->frame_type),
+ &sd);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_ref_frame_t *const frame = va_arg(args, aom_ref_frame_t *);
+
+ if (frame != NULL) {
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+ av1_copy_reference_enc(ctx->cpi,
+ ref_frame_to_av1_reframe(frame->frame_type), &sd);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+ if (frame != NULL) {
+ YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx);
+ if (fb == NULL) return AOM_CODEC_ERROR;
+
+ yuvconfig2image(&frame->img, fb, NULL);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_image_t *const new_img = va_arg(args, aom_image_t *);
+
+ if (new_img != NULL) {
+ YV12_BUFFER_CONFIG new_frame;
+
+ if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+ yuvconfig2image(new_img, &new_frame, NULL);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_set_previewpp(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ (void)ctx;
+ (void)args;
+ return AOM_CODEC_INCAPABLE;
+}
+
+static aom_image_t *encoder_get_preview(aom_codec_alg_priv_t *ctx) {
+ YV12_BUFFER_CONFIG sd;
+
+ if (av1_get_preview_raw_frame(ctx->cpi, &sd) == 0) {
+ yuvconfig2image(&ctx->preview_img, &sd, NULL);
+ return &ctx->preview_img;
+ } else {
+ return NULL;
+ }
+}
+
+static aom_codec_err_t ctrl_use_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ const int reference_flag = va_arg(args, int);
+
+ av1_use_as_reference(ctx->cpi, reference_flag);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_roi_map(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ (void)ctx;
+ (void)args;
+
+ // TODO(yaowu): Need to re-implement and test for AV1.
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_set_active_map(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
+
+ if (map) {
+ if (!av1_set_active_map(ctx->cpi, map->active_map, (int)map->rows,
+ (int)map->cols))
+ return AOM_CODEC_OK;
+ else
+ return AOM_CODEC_INVALID_PARAM;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_get_active_map(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
+
+ if (map) {
+ if (!av1_get_active_map(ctx->cpi, map->active_map, (int)map->rows,
+ (int)map->cols))
+ return AOM_CODEC_OK;
+ else
+ return AOM_CODEC_INVALID_PARAM;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *);
+
+ if (mode) {
+ const int res =
+ av1_set_internal_size(ctx->cpi, (AOM_SCALING)mode->h_scaling_mode,
+ (AOM_SCALING)mode->v_scaling_mode);
+ return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.content = CAST(AV1E_SET_TUNE_CONTENT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_color_space(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.color_space = CAST(AV1E_SET_COLOR_SPACE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_color_range(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.color_range = CAST(AV1E_SET_COLOR_RANGE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_render_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ int *const render_size = va_arg(args, int *);
+ extra_cfg.render_width = render_size[0];
+ extra_cfg.render_height = render_size[1];
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.superblock_size = CAST(AV1E_SET_SUPERBLOCK_SIZE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+static aom_codec_err_t ctrl_set_ans_window_size_log2(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.ans_window_size_log2 = CAST(AV1E_SET_ANS_WINDOW_SIZE_LOG2, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif
+
+static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
+ { AOM_COPY_REFERENCE, ctrl_copy_reference },
+ { AOME_USE_REFERENCE, ctrl_use_reference },
+
+ // Setters
+ { AOM_SET_REFERENCE, ctrl_set_reference },
+ { AOM_SET_POSTPROC, ctrl_set_previewpp },
+ { AOME_SET_ROI_MAP, ctrl_set_roi_map },
+ { AOME_SET_ACTIVEMAP, ctrl_set_active_map },
+ { AOME_SET_SCALEMODE, ctrl_set_scale_mode },
+ { AOME_SET_CPUUSED, ctrl_set_cpuused },
+ { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref },
+#if CONFIG_EXT_REFS
+ { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref },
+#endif // CONFIG_EXT_REFS
+ { AOME_SET_SHARPNESS, ctrl_set_sharpness },
+ { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
+ { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
+ { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
+#if CONFIG_DEPENDENT_HORZTILES
+ { AV1E_SET_TILE_DEPENDENT_ROWS, ctrl_set_tile_dependent_rows },
+#endif
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ { AV1E_SET_TILE_LOOPFILTER, ctrl_set_tile_loopfilter },
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+ { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
+ { AOME_SET_ARNR_STRENGTH, ctrl_set_arnr_strength },
+ { AOME_SET_TUNING, ctrl_set_tuning },
+ { AOME_SET_CQ_LEVEL, ctrl_set_cq_level },
+ { AOME_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct },
+ { AV1E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct },
+ { AV1E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
+ { AV1E_SET_LOSSLESS, ctrl_set_lossless },
+#if CONFIG_AOM_QM
+ { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm },
+ { AV1E_SET_QM_MIN, ctrl_set_qm_min },
+ { AV1E_SET_QM_MAX, ctrl_set_qm_max },
+#endif
+#if CONFIG_TILE_GROUPS
+ { AV1E_SET_NUM_TG, ctrl_set_num_tg },
+ { AV1E_SET_MTU, ctrl_set_mtu },
+#endif
+#if CONFIG_TEMPMV_SIGNALING
+ { AV1E_SET_DISABLE_TEMPMV, ctrl_set_disable_tempmv },
+#endif
+ { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode },
+ { AV1E_SET_AQ_MODE, ctrl_set_aq_mode },
+#if CONFIG_EXT_DELTA_Q
+ { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
+#endif
+ { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
+ { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
+ { AV1E_SET_COLOR_SPACE, ctrl_set_color_space },
+ { AV1E_SET_COLOR_RANGE, ctrl_set_color_range },
+ { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity },
+ { AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval },
+ { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval },
+ { AV1E_SET_RENDER_SIZE, ctrl_set_render_size },
+ { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ { AV1E_SET_ANS_WINDOW_SIZE_LOG2, ctrl_set_ans_window_size_log2 },
+#endif
+#if CONFIG_EXT_TILE
+ { AV1E_SET_TILE_ENCODING_MODE, ctrl_set_tile_encoding_mode },
+#endif // CONFIG_EXT_TILE
+ { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
+
+ // Getters
+ { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
+ { AOME_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
+ { AV1_GET_REFERENCE, ctrl_get_reference },
+ { AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
+ { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+
+ { -1, NULL },
+};
+
+static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
+ { 0,
+ {
+ // NOLINT
+ 0, // g_usage
+ 8, // g_threads
+ 0, // g_profile
+
+ 320, // g_w
+ 240, // g_h
+ AOM_BITS_8, // g_bit_depth
+ 8, // g_input_bit_depth
+
+ { 1, 30 }, // g_timebase
+
+ 0, // g_error_resilient
+
+ AOM_RC_ONE_PASS, // g_pass
+
+ 25, // g_lag_in_frames
+
+ 0, // rc_dropframe_thresh
+ 0, // rc_resize_allowed
+ 0, // rc_scaled_width
+ 0, // rc_scaled_height
+ 60, // rc_resize_down_thresh
+ 30, // rc_resize_up_thresh
+
+ AOM_VBR, // rc_end_usage
+ { NULL, 0 }, // rc_twopass_stats_in
+ { NULL, 0 }, // rc_firstpass_mb_stats_in
+ 256, // rc_target_bitrate (in kbps)
+ 0, // rc_min_quantizer
+ 63, // rc_max_quantizer
+ 25, // rc_undershoot_pct
+ 25, // rc_overshoot_pct
+
+ 6000, // rc_buf_sz
+ 4000, // rc_buf_initial_sz
+ 5000, // rc_buf_optimal_sz
+
+ 50, // rc_2pass_vbr_bias_pct
+ 0, // rc_2pass_vbr_minsection_pct
+ 2000, // rc_2pass_vbr_maxsection_pct
+
+ // keyframing settings (kf)
+ AOM_KF_AUTO, // kf_mode
+ 0, // kf_min_dist
+ 9999, // kf_max_dist
+ } },
+};
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(aom_codec_av1_cx) = {
+ "AOMedia Project AV1 Encoder" VERSION_STRING,
+ AOM_CODEC_INTERNAL_ABI_VERSION,
+#if CONFIG_HIGHBITDEPTH
+ AOM_CODEC_CAP_HIGHBITDEPTH |
+#endif
+ AOM_CODEC_CAP_ENCODER | AOM_CODEC_CAP_PSNR, // aom_codec_caps_t
+ encoder_init, // aom_codec_init_fn_t
+ encoder_destroy, // aom_codec_destroy_fn_t
+ encoder_ctrl_maps, // aom_codec_ctrl_fn_map_t
+ {
+ // NOLINT
+ NULL, // aom_codec_peek_si_fn_t
+ NULL, // aom_codec_get_si_fn_t
+ NULL, // aom_codec_decode_fn_t
+ NULL, // aom_codec_frame_get_fn_t
+ NULL // aom_codec_set_fb_fn_t
+ },
+ {
+ // NOLINT
+ 1, // 1 cfg map
+ encoder_usage_cfg_map, // aom_codec_enc_cfg_map_t
+ encoder_encode, // aom_codec_encode_fn_t
+ encoder_get_cxdata, // aom_codec_get_cx_data_fn_t
+ encoder_set_config, // aom_codec_enc_config_set_fn_t
+ NULL, // aom_codec_get_global_headers_fn_t
+ encoder_get_preview, // aom_codec_get_preview_frame_fn_t
+ NULL // aom_codec_enc_mr_get_mem_loc_fn_t
+ }
+};
diff --git a/third_party/aom/av1/av1_dx.mk b/third_party/aom/av1/av1_dx.mk
new file mode 100644
index 000000000..1a54ea22a
--- /dev/null
+++ b/third_party/aom/av1/av1_dx.mk
@@ -0,0 +1,71 @@
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+
+AV1_DX_EXPORTS += exports_dec
+
+AV1_DX_SRCS-yes += $(AV1_COMMON_SRCS-yes)
+AV1_DX_SRCS-no += $(AV1_COMMON_SRCS-no)
+AV1_DX_SRCS_REMOVE-yes += $(AV1_COMMON_SRCS_REMOVE-yes)
+AV1_DX_SRCS_REMOVE-no += $(AV1_COMMON_SRCS_REMOVE-no)
+
+AV1_DX_SRCS-yes += av1_dx_iface.c
+
+AV1_DX_SRCS-yes += decoder/decodemv.c
+AV1_DX_SRCS-yes += decoder/decodeframe.c
+AV1_DX_SRCS-yes += decoder/decodeframe.h
+AV1_DX_SRCS-yes += decoder/detokenize.c
+AV1_DX_SRCS-yes += decoder/decodemv.h
+AV1_DX_SRCS-$(CONFIG_LV_MAP) += decoder/decodetxb.c
+AV1_DX_SRCS-$(CONFIG_LV_MAP) += decoder/decodetxb.h
+AV1_DX_SRCS-yes += decoder/detokenize.h
+AV1_DX_SRCS-yes += decoder/dthread.c
+AV1_DX_SRCS-yes += decoder/dthread.h
+AV1_DX_SRCS-yes += decoder/decoder.c
+AV1_DX_SRCS-yes += decoder/decoder.h
+AV1_DX_SRCS-yes += decoder/dsubexp.c
+AV1_DX_SRCS-yes += decoder/dsubexp.h
+
+ifeq ($(CONFIG_ACCOUNTING),yes)
+AV1_DX_SRCS-yes += decoder/accounting.h
+AV1_DX_SRCS-yes += decoder/accounting.c
+endif
+
+ifeq ($(CONFIG_INSPECTION),yes)
+AV1_DX_SRCS-yes += decoder/inspection.c
+AV1_DX_SRCS-yes += decoder/inspection.h
+endif
+
+ifeq ($(CONFIG_PVQ),yes)
+# PVQ from daala
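+# (PVQ reconstructs residuals against a transformed predictor, which appears
+# to be why the decoder build also pulls in the encoder's forward-transform
+# sources below.)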
+AV1_DX_SRCS-yes += decoder/pvq_decoder.c
+AV1_DX_SRCS-yes += decoder/pvq_decoder.h
+AV1_DX_SRCS-yes += decoder/decint.h
+AV1_DX_SRCS-yes += decoder/generic_decoder.c
+AV1_DX_SRCS-yes += decoder/laplace_decoder.c
+AV1_DX_SRCS-yes += encoder/hybrid_fwd_txfm.c
+AV1_DX_SRCS-yes += encoder/hybrid_fwd_txfm.h
+
+AV1_DX_SRCS-yes += encoder/dct.c
+AV1_DX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
+AV1_DX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
+AV1_DX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
+
+ifneq ($(CONFIG_HIGHBITDEPTH),yes)
+AV1_DX_SRCS-$(HAVE_NEON) += encoder/arm/neon/dct_neon.c
+endif
+
+AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct4x4_msa.c
+AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct8x8_msa.c
+AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct16x16_msa.c
+AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct_msa.h
+endif
+
+AV1_DX_SRCS-yes := $(filter-out $(AV1_DX_SRCS_REMOVE-yes),$(AV1_DX_SRCS-yes))
diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c
new file mode 100644
index 000000000..f20ea4815
--- /dev/null
+++ b/third_party/aom/av1/av1_dx_iface.c
@@ -0,0 +1,1223 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "./aom_config.h"
+#include "./aom_version.h"
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom/aomdx.h"
+#include "aom/aom_decoder.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/enums.h"
+
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodeframe.h"
+
+#include "av1/av1_iface_common.h"
+
+typedef aom_codec_stream_info_t av1_stream_info_t;
+
+// This limit comes from the fixed number of frame buffers.
+// TODO(hkuang): Remove this limit after implementing on-demand frame buffers.
+#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames.
+
+typedef struct cache_frame {
+ int fb_idx;
+ aom_image_t img;
+} cache_frame;
+
+struct aom_codec_alg_priv {
+ aom_codec_priv_t base;
+ aom_codec_dec_cfg_t cfg;
+ av1_stream_info_t si;
+ int postproc_cfg_set;
+ aom_postproc_cfg_t postproc_cfg;
+ aom_decrypt_cb decrypt_cb;
+ void *decrypt_state;
+ aom_image_t img;
+ int img_avail;
+ int flushed;
+ int invert_tile_order;
+ int last_show_frame; // Index of last output frame.
+ int byte_alignment;
+ int skip_loop_filter;
+ int decode_tile_row;
+ int decode_tile_col;
+
+ // Frame parallel related.
+ int frame_parallel_decode; // frame-based threading.
+ AVxWorker *frame_workers;
+ int num_frame_workers;
+ int next_submit_worker_id;
+ int last_submit_worker_id;
+ int next_output_worker_id;
+ int available_threads;
+ cache_frame frame_cache[FRAME_CACHE_SIZE];
+ int frame_cache_write;
+ int frame_cache_read;
+ int num_cache_frames;
+ int need_resync; // wait for key/intra-only frame
+ // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+ BufferPool *buffer_pool;
+
+ // External frame buffer info to save for AV1 common.
+ void *ext_priv; // Private data associated with the external frame buffers.
+ aom_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+
+#if CONFIG_INSPECTION
+ aom_inspect_cb inspect_cb;
+ void *inspect_ctx;
+#endif
+};
+
+static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx,
+ aom_codec_priv_enc_mr_cfg_t *data) {
+ // This function only allocates space for the aom_codec_alg_priv_t
+ // structure. More memory may be required at the time the stream
+ // information becomes known.
+ (void)data;
+
+ if (!ctx->priv) {
+ aom_codec_alg_priv_t *const priv =
+ (aom_codec_alg_priv_t *)aom_calloc(1, sizeof(*priv));
+ if (priv == NULL) return AOM_CODEC_MEM_ERROR;
+
+ ctx->priv = (aom_codec_priv_t *)priv;
+ ctx->priv->init_flags = ctx->init_flags;
+ priv->si.sz = sizeof(priv->si);
+ priv->flushed = 0;
+ // Only do frame parallel decode when threads > 1.
+ priv->frame_parallel_decode =
+ (ctx->config.dec && (ctx->config.dec->threads > 1) &&
+ (ctx->init_flags & AOM_CODEC_USE_FRAME_THREADING))
+ ? 1
+ : 0;
+ if (ctx->config.dec) {
+ priv->cfg = *ctx->config.dec;
+ ctx->config.dec = &priv->cfg;
+ }
+ }
+
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
+ if (ctx->frame_workers != NULL) {
+ int i;
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ AVxWorker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ aom_get_worker_interface()->end(worker);
+ av1_remove_common(&frame_worker_data->pbi->common);
+#if CONFIG_LOOP_RESTORATION
+ av1_free_restoration_buffers(&frame_worker_data->pbi->common);
+#endif // CONFIG_LOOP_RESTORATION
+ av1_decoder_remove(frame_worker_data->pbi);
+ aom_free(frame_worker_data->scratch_buffer);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&frame_worker_data->stats_mutex);
+ pthread_cond_destroy(&frame_worker_data->stats_cond);
+#endif
+ aom_free(frame_worker_data);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+ }
+
+ if (ctx->buffer_pool) {
+ av1_free_ref_frame_buffers(ctx->buffer_pool);
+ av1_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
+ }
+
+ aom_free(ctx->frame_workers);
+ aom_free(ctx->buffer_pool);
+ aom_free(ctx);
+ return AOM_CODEC_OK;
+}
+
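+// Peek helper: skips the bit-depth, color-space, and subsampling fields of
+// the uncompressed header without recording them; returns 0 for an invalid
+// profile/color-space combination (sRGB outside profiles 1 and 3).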
+static int parse_bitdepth_colorspace_sampling(BITSTREAM_PROFILE profile,
+ struct aom_read_bit_buffer *rb) {
+ aom_color_space_t color_space;
+ if (profile >= PROFILE_2) rb->bit_offset += 1; // Bit-depth 10 or 12.
+ color_space = (aom_color_space_t)aom_rb_read_literal(rb, 3);
+ if (color_space != AOM_CS_SRGB) {
+ rb->bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range.
+ if (profile == PROFILE_1 || profile == PROFILE_3) {
+ rb->bit_offset += 2; // subsampling x/y.
+ rb->bit_offset += 1; // unused.
+ }
+ } else {
+ if (profile == PROFILE_1 || profile == PROFILE_3) {
+ rb->bit_offset += 1; // unused
+ } else {
+ // RGB is only available in profiles 1 and 3.
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static aom_codec_err_t decoder_peek_si_internal(
+ const uint8_t *data, unsigned int data_sz, aom_codec_stream_info_t *si,
+ int *is_intra_only, aom_decrypt_cb decrypt_cb, void *decrypt_state) {
+ int intra_only_flag = 0;
+ uint8_t clear_buffer[9];
+
+ if (data + data_sz <= data) return AOM_CODEC_INVALID_PARAM;
+
+ si->is_kf = 0;
+ si->w = si->h = 0;
+
+ if (decrypt_cb) {
+ data_sz = AOMMIN(sizeof(clear_buffer), data_sz);
+ decrypt_cb(decrypt_state, data, clear_buffer, data_sz);
+ data = clear_buffer;
+ }
+
+ {
+ int show_frame;
+ int error_resilient;
+ struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+ const int frame_marker = aom_rb_read_literal(&rb, 2);
+ const BITSTREAM_PROFILE profile = av1_read_profile(&rb);
+
+ if (frame_marker != AOM_FRAME_MARKER) return AOM_CODEC_UNSUP_BITSTREAM;
+
+ if (profile >= MAX_PROFILES) return AOM_CODEC_UNSUP_BITSTREAM;
+
+ if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
+ return AOM_CODEC_UNSUP_BITSTREAM;
+
+ if (aom_rb_read_bit(&rb)) { // show an existing frame
+ aom_rb_read_literal(&rb, 3); // Frame buffer to show.
+ return AOM_CODEC_OK;
+ }
+
+ if (data_sz <= 8) return AOM_CODEC_UNSUP_BITSTREAM;
+
+ si->is_kf = !aom_rb_read_bit(&rb);
+ show_frame = aom_rb_read_bit(&rb);
+ error_resilient = aom_rb_read_bit(&rb);
+#if CONFIG_REFERENCE_BUFFER
+ {
+ /* TODO: Move outside frame loop or inside key-frame branch */
+ int frame_id_len;
+ SequenceHeader seq_params;
+ read_sequence_header(&seq_params);
+ if (seq_params.frame_id_numbers_present_flag) {
+ frame_id_len = seq_params.frame_id_length_minus7 + 7;
+ aom_rb_read_literal(&rb, frame_id_len);
+ }
+ }
+#endif
+ if (si->is_kf) {
+ if (!av1_read_sync_code(&rb)) return AOM_CODEC_UNSUP_BITSTREAM;
+
+ if (!parse_bitdepth_colorspace_sampling(profile, &rb))
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ av1_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
+ } else {
+ intra_only_flag = show_frame ? 0 : aom_rb_read_bit(&rb);
+
+ rb.bit_offset += error_resilient ? 0 : 2; // reset_frame_context
+
+ if (intra_only_flag) {
+ if (!av1_read_sync_code(&rb)) return AOM_CODEC_UNSUP_BITSTREAM;
+ if (profile > PROFILE_0) {
+ if (!parse_bitdepth_colorspace_sampling(profile, &rb))
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ }
+ rb.bit_offset += REF_FRAMES; // refresh_frame_flags
+ av1_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
+ }
+ }
+ }
+ if (is_intra_only != NULL) *is_intra_only = intra_only_flag;
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t decoder_peek_si(const uint8_t *data,
+ unsigned int data_sz,
+ aom_codec_stream_info_t *si) {
+ return decoder_peek_si_internal(data, data_sz, si, NULL, NULL, NULL);
+}
+
+static aom_codec_err_t decoder_get_si(aom_codec_alg_priv_t *ctx,
+ aom_codec_stream_info_t *si) {
+ const size_t sz = (si->sz >= sizeof(av1_stream_info_t))
+ ? sizeof(av1_stream_info_t)
+ : sizeof(aom_codec_stream_info_t);
+ memcpy(si, &ctx->si, sz);
+ si->sz = (unsigned int)sz;
+
+ return AOM_CODEC_OK;
+}
+
+static void set_error_detail(aom_codec_alg_priv_t *ctx,
+ const char *const error) {
+ ctx->base.err_detail = error;
+}
+
+static aom_codec_err_t update_error_state(
+ aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
+ if (error->error_code)
+ set_error_detail(ctx, error->has_detail ? error->detail : NULL);
+
+ return error->error_code;
+}
+
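+// Point each worker's buffer pool at the application-supplied frame-buffer
+// callbacks when both get/release hooks are set; otherwise fall back to the
+// library's internal frame-buffer allocator.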
+static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
+ int i;
+
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ AVxWorker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
+
+ cm->new_fb_idx = INVALID_IDX;
+ cm->byte_alignment = ctx->byte_alignment;
+ cm->skip_loop_filter = ctx->skip_loop_filter;
+
+ if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+ pool->get_fb_cb = ctx->get_ext_fb_cb;
+ pool->release_fb_cb = ctx->release_ext_fb_cb;
+ pool->cb_priv = ctx->ext_priv;
+ } else {
+ pool->get_fb_cb = av1_get_frame_buffer;
+ pool->release_fb_cb = av1_release_frame_buffer;
+
+ if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to initialize internal frame buffers");
+
+ pool->cb_priv = &pool->int_frame_buffers;
+ }
+ }
+}
+
+static void set_default_ppflags(aom_postproc_cfg_t *cfg) {
+ cfg->post_proc_flag = AOM_DEBLOCK | AOM_DEMACROBLOCK;
+ cfg->deblocking_level = 4;
+ cfg->noise_level = 0;
+}
+
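+// Hook run on each FrameWorker thread: decode one compressed frame; on
+// failure, mark the frame buffer corrupt, request a resync, and (in
+// frame-parallel mode) wake any threads waiting on this frame's context.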
+static int frame_worker_hook(void *arg1, void *arg2) {
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
+ const uint8_t *data = frame_worker_data->data;
+ (void)arg2;
+
+ frame_worker_data->result = av1_receive_compressed_data(
+ frame_worker_data->pbi, frame_worker_data->data_size, &data);
+ frame_worker_data->data_end = data;
+
+ if (frame_worker_data->pbi->common.frame_parallel_decode) {
+ // In frame parallel decoding, a worker thread must successfully decode all
+ // the compressed data.
+ if (frame_worker_data->result != 0 ||
+ frame_worker_data->data + frame_worker_data->data_size - 1 > data) {
+ AVxWorker *const worker = frame_worker_data->pbi->frame_worker_owner;
+ BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool;
+ // Signal all the other threads that are waiting for this frame.
+ av1_frameworker_lock_stats(worker);
+ frame_worker_data->frame_context_ready = 1;
+ lock_buffer_pool(pool);
+ frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+ unlock_buffer_pool(pool);
+ frame_worker_data->pbi->need_resync = 1;
+ av1_frameworker_signal_stats(worker);
+ av1_frameworker_unlock_stats(worker);
+ return 0;
+ }
+ } else if (frame_worker_data->result != 0) {
+ // Check decode result in serial decode.
+ frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+ frame_worker_data->pbi->need_resync = 1;
+ }
+ return !frame_worker_data->result;
+}
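+
+/* A minimal sketch of the AVxWorker hook contract relied on above (the
+ * helper name below is hypothetical): hooks return nonzero on success and
+ * zero on failure, and a zero return is recorded in worker->had_error by
+ * the worker interface.
+ *
+ *   static int my_hook(void *arg1, void *arg2) {
+ *     return do_work(arg1, arg2) == 0;  // 1 = success, 0 = failure
+ *   }
+ */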
+
+static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
+ int i;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+ ctx->last_show_frame = -1;
+ ctx->next_submit_worker_id = 0;
+ ctx->last_submit_worker_id = 0;
+ ctx->next_output_worker_id = 0;
+ ctx->frame_cache_read = 0;
+ ctx->frame_cache_write = 0;
+ ctx->num_cache_frames = 0;
+ ctx->need_resync = 1;
+ ctx->num_frame_workers =
+ (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads : 1;
+ if (ctx->num_frame_workers > MAX_DECODE_THREADS)
+ ctx->num_frame_workers = MAX_DECODE_THREADS;
+ ctx->available_threads = ctx->num_frame_workers;
+ ctx->flushed = 0;
+
+ ctx->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+ if (ctx->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
+ set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+ return AOM_CODEC_MEM_ERROR;
+ }
+#endif
+
+ ctx->frame_workers = (AVxWorker *)aom_malloc(ctx->num_frame_workers *
+ sizeof(*ctx->frame_workers));
+ if (ctx->frame_workers == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_workers");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ AVxWorker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *frame_worker_data = NULL;
+ winterface->init(worker);
+ worker->data1 = aom_memalign(32, sizeof(FrameWorkerData));
+ if (worker->data1 == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data");
+ return AOM_CODEC_MEM_ERROR;
+ }
+ frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool);
+ if (frame_worker_data->pbi == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data");
+ return AOM_CODEC_MEM_ERROR;
+ }
+ frame_worker_data->pbi->frame_worker_owner = worker;
+ frame_worker_data->worker_id = i;
+ frame_worker_data->scratch_buffer = NULL;
+ frame_worker_data->scratch_buffer_size = 0;
+ frame_worker_data->frame_context_ready = 0;
+ frame_worker_data->received_frame = 0;
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
+ return AOM_CODEC_MEM_ERROR;
+ }
+#endif
+    // In serial decoding mode, the FrameWorker thread may create tile worker
+    // or loopfilter threads.
+ frame_worker_data->pbi->max_threads =
+ (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
+
+ frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+ frame_worker_data->pbi->common.frame_parallel_decode =
+ ctx->frame_parallel_decode;
+ worker->hook = (AVxWorkerHook)frame_worker_hook;
+ if (!winterface->reset(worker)) {
+ set_error_detail(ctx, "Frame Worker thread creation failed");
+ return AOM_CODEC_MEM_ERROR;
+ }
+ }
+
+ // If postprocessing was enabled by the application and a
+ // configuration has not been provided, default it.
+ if (!ctx->postproc_cfg_set && (ctx->base.init_flags & AOM_CODEC_USE_POSTPROC))
+ set_default_ppflags(&ctx->postproc_cfg);
+
+ init_buffer_callbacks(ctx);
+
+ return AOM_CODEC_OK;
+}
+
+static INLINE void check_resync(aom_codec_alg_priv_t *const ctx,
+ const AV1Decoder *const pbi) {
+  // Clear the resync flag if the worker got a key frame or intra-only frame.
+ if (ctx->need_resync == 1 && pbi->need_resync == 0 &&
+ (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME))
+ ctx->need_resync = 0;
+}
+
+static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
+ const uint8_t **data, unsigned int data_sz,
+ void *user_priv, int64_t deadline) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ (void)deadline;
+
+ // Determine the stream parameters. Note that we rely on peek_si to
+ // validate that we have a buffer that does not wrap around the top
+ // of the heap.
+ if (!ctx->si.h) {
+ int is_intra_only = 0;
+ const aom_codec_err_t res =
+ decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only,
+ ctx->decrypt_cb, ctx->decrypt_state);
+ if (res != AOM_CODEC_OK) return res;
+
+ if (!ctx->si.is_kf && !is_intra_only) return AOM_CODEC_ERROR;
+ }
+
+ if (!ctx->frame_parallel_decode) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->data = *data;
+ frame_worker_data->data_size = data_sz;
+ frame_worker_data->user_priv = user_priv;
+ frame_worker_data->received_frame = 1;
+
+ // Set these even if already initialized. The caller may have changed the
+ // decrypt config between frames.
+ frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
+ frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
+#if CONFIG_INSPECTION
+ frame_worker_data->pbi->inspect_cb = ctx->inspect_cb;
+ frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx;
+#endif
+
+#if CONFIG_EXT_TILE
+ frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
+ frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
+#endif // CONFIG_EXT_TILE
+
+ worker->had_error = 0;
+ winterface->execute(worker);
+
+ // Update data pointer after decode.
+ *data = frame_worker_data->data_end;
+
+ if (worker->had_error)
+ return update_error_state(ctx, &frame_worker_data->pbi->common.error);
+
+ check_resync(ctx, frame_worker_data->pbi);
+ } else {
+ AVxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ // Copy context from last worker thread to next worker thread.
+ if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+ av1_frameworker_copy_context(
+ &ctx->frame_workers[ctx->next_submit_worker_id],
+ &ctx->frame_workers[ctx->last_submit_worker_id]);
+
+ frame_worker_data->pbi->ready_for_new_data = 0;
+    // Copy the compressed data into the worker's internal buffer.
+    // TODO(hkuang): Would it be better for all workers to allocate a buffer
+    // the size of the first intra frame? That would avoid repeated
+    // deallocations and allocations.
+ if (frame_worker_data->scratch_buffer_size < data_sz) {
+ aom_free(frame_worker_data->scratch_buffer);
+ frame_worker_data->scratch_buffer = (uint8_t *)aom_malloc(data_sz);
+ if (frame_worker_data->scratch_buffer == NULL) {
+ set_error_detail(ctx, "Failed to reallocate scratch buffer");
+ return AOM_CODEC_MEM_ERROR;
+ }
+ frame_worker_data->scratch_buffer_size = data_sz;
+ }
+ frame_worker_data->data_size = data_sz;
+ memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
+
+ frame_worker_data->frame_decoded = 0;
+ frame_worker_data->frame_context_ready = 0;
+ frame_worker_data->received_frame = 1;
+ frame_worker_data->data = frame_worker_data->scratch_buffer;
+ frame_worker_data->user_priv = user_priv;
+
+ if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+ ctx->last_submit_worker_id =
+ (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
+
+ ctx->next_submit_worker_id =
+ (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
+ --ctx->available_threads;
+ worker->had_error = 0;
+ winterface->launch(worker);
+ }
+
+ return AOM_CODEC_OK;
+}
+
+static void wait_worker_and_cache_frame(aom_codec_alg_priv_t *ctx) {
+ YV12_BUFFER_CONFIG sd;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ ctx->next_output_worker_id =
+ (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+ // TODO(hkuang): Add worker error handling here.
+ winterface->sync(worker);
+ frame_worker_data->received_frame = 0;
+ ++ctx->available_threads;
+
+ check_resync(ctx, frame_worker_data->pbi);
+
+ if (av1_get_raw_frame(frame_worker_data->pbi, &sd) == 0) {
+ AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
+ yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
+ frame_worker_data->user_priv);
+ ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
+ frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+ ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
+ ++ctx->num_cache_frames;
+ }
+}
+
+static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
+ const uint8_t *data, unsigned int data_sz,
+ void *user_priv, long deadline) {
+ const uint8_t *data_start = data;
+ const uint8_t *const data_end = data + data_sz;
+ aom_codec_err_t res;
+ uint32_t frame_sizes[8];
+ int frame_count;
+
+ if (data == NULL && data_sz == 0) {
+ ctx->flushed = 1;
+ return AOM_CODEC_OK;
+ }
+
+ // Reset flushed when receiving a valid frame.
+ ctx->flushed = 0;
+
+ // Initialize the decoder workers on the first frame.
+ if (ctx->frame_workers == NULL) {
+ res = init_decoder(ctx);
+ if (res != AOM_CODEC_OK) return res;
+ }
+
+ res = av1_parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
+ ctx->decrypt_cb, ctx->decrypt_state);
+ if (res != AOM_CODEC_OK) return res;
+
+ if (ctx->frame_parallel_decode) {
+    // Decode in frame parallel mode. When decoding in this mode, the frame
+    // passed to the decoder must be either a normal frame or a superframe
+    // with a superframe index so that the decoder can locate each frame's
+    // start position within the superframe.
+ if (frame_count > 0) {
+ int i;
+
+ for (i = 0; i < frame_count; ++i) {
+ const uint8_t *data_start_copy = data_start;
+ const uint32_t frame_size = frame_sizes[i];
+ if (data_start < data ||
+ frame_size > (uint32_t)(data_end - data_start)) {
+ set_error_detail(ctx, "Invalid frame size in index");
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ if (ctx->available_threads == 0) {
+ // No more threads for decoding. Wait until the next output worker
+ // finishes decoding. Then copy the decoded frame into cache.
+ if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+ wait_worker_and_cache_frame(ctx);
+ } else {
+ // TODO(hkuang): Add unit test to test this path.
+ set_error_detail(ctx, "Frame output cache is full.");
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ res =
+ decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline);
+ if (res != AOM_CODEC_OK) return res;
+ data_start += frame_size;
+ }
+ } else {
+ if (ctx->available_threads == 0) {
+ // No more threads for decoding. Wait until the next output worker
+ // finishes decoding. Then copy the decoded frame into cache.
+ if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+ wait_worker_and_cache_frame(ctx);
+ } else {
+ // TODO(hkuang): Add unit test to test this path.
+ set_error_detail(ctx, "Frame output cache is full.");
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ res = decode_one(ctx, &data, data_sz, user_priv, deadline);
+ if (res != AOM_CODEC_OK) return res;
+ }
+ } else {
+ // Decode in serial mode.
+ if (frame_count > 0) {
+ int i;
+
+ for (i = 0; i < frame_count; ++i) {
+ const uint8_t *data_start_copy = data_start;
+ const uint32_t frame_size = frame_sizes[i];
+ if (data_start < data ||
+ frame_size > (uint32_t)(data_end - data_start)) {
+ set_error_detail(ctx, "Invalid frame size in index");
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ res =
+ decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline);
+ if (res != AOM_CODEC_OK) return res;
+
+ data_start += frame_size;
+ }
+ } else {
+ while (data_start < data_end) {
+ const uint32_t frame_size = (uint32_t)(data_end - data_start);
+ res = decode_one(ctx, &data_start, frame_size, user_priv, deadline);
+ if (res != AOM_CODEC_OK) return res;
+
+ // Account for suboptimal termination by the encoder.
+ while (data_start < data_end) {
+ const uint8_t marker =
+ read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start);
+ if (marker) break;
+ ++data_start;
+ }
+ }
+ }
+ }
+
+ return res;
+}
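+
+/* A minimal caller-side sketch of this decode path, assuming the standard
+ * aom_codec_decode()/aom_codec_get_frame() wrappers; die() and render() are
+ * placeholders and error handling is elided:
+ *
+ *   aom_codec_iter_t iter = NULL;
+ *   aom_image_t *img;
+ *   if (aom_codec_decode(&codec, frame_data, frame_size, NULL, 0))
+ *     die("decode failed");
+ *   while ((img = aom_codec_get_frame(&codec, &iter)) != NULL)
+ *     render(img);
+ *
+ * Passing data == NULL with data_sz == 0 flushes the decoder, as handled at
+ * the top of decoder_decode().
+ */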
+
+static void release_last_output_frame(aom_codec_alg_priv_t *ctx) {
+ RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs;
+ // Decrease reference count of last output frame in frame parallel mode.
+ if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
+ BufferPool *const pool = ctx->buffer_pool;
+ lock_buffer_pool(pool);
+ decrease_ref_count(ctx->last_show_frame, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+ }
+}
+
+static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
+ aom_codec_iter_t *iter) {
+ aom_image_t *img = NULL;
+
+  // Only return a frame when all the CPUs are busy or the application has
+  // flushed the decoder, in frame parallel decode.
+ if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
+ !ctx->flushed) {
+ return NULL;
+ }
+
+ // Output the frames in the cache first.
+ if (ctx->num_cache_frames > 0) {
+ release_last_output_frame(ctx);
+ ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
+ if (ctx->need_resync) return NULL;
+ img = &ctx->frame_cache[ctx->frame_cache_read].img;
+ ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
+ --ctx->num_cache_frames;
+ return img;
+ }
+
+  // iter acts as a flip-flop, so an image is only returned on the first
+  // call to get_frame.
+ if (*iter == NULL && ctx->frame_workers != NULL) {
+ do {
+ YV12_BUFFER_CONFIG sd;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ ctx->next_output_worker_id =
+ (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+ // Wait for the frame from worker thread.
+ if (winterface->sync(worker)) {
+ // Check if worker has received any frames.
+ if (frame_worker_data->received_frame == 1) {
+ ++ctx->available_threads;
+ frame_worker_data->received_frame = 0;
+ check_resync(ctx, frame_worker_data->pbi);
+ }
+ if (av1_get_raw_frame(frame_worker_data->pbi, &sd) == 0) {
+ AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ release_last_output_frame(ctx);
+ ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
+ if (ctx->need_resync) return NULL;
+ yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
+
+#if CONFIG_EXT_TILE
+ if (cm->tile_encoding_mode &&
+ frame_worker_data->pbi->dec_tile_row >= 0) {
+ const int tile_row =
+ AOMMIN(frame_worker_data->pbi->dec_tile_row, cm->tile_rows - 1);
+ const int mi_row = tile_row * cm->tile_height;
+ const int ssy = ctx->img.y_chroma_shift;
+ int plane;
+ ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ ctx->img.planes[plane] +=
+ mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
+ }
+ ctx->img.d_h =
+ AOMMIN(cm->tile_height, cm->mi_rows - mi_row) * MI_SIZE;
+ }
+
+ if (cm->tile_encoding_mode &&
+ frame_worker_data->pbi->dec_tile_col >= 0) {
+ const int tile_col =
+ AOMMIN(frame_worker_data->pbi->dec_tile_col, cm->tile_cols - 1);
+ const int mi_col = tile_col * cm->tile_width;
+ const int ssx = ctx->img.x_chroma_shift;
+ int plane;
+ ctx->img.planes[0] += mi_col * MI_SIZE;
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx);
+ }
+ ctx->img.d_w =
+ AOMMIN(cm->tile_width, cm->mi_cols - mi_col) * MI_SIZE;
+ }
+#endif // CONFIG_EXT_TILE
+
+ ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+ img = &ctx->img;
+ return img;
+ }
+ } else {
+ // Decoding failed. Release the worker thread.
+ frame_worker_data->received_frame = 0;
+ ++ctx->available_threads;
+ ctx->need_resync = 1;
+ if (ctx->flushed != 1) return NULL;
+ }
+ } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
+ }
+ return NULL;
+}
+
+static aom_codec_err_t decoder_set_fb_fn(
+ aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+ if (cb_get == NULL || cb_release == NULL) {
+ return AOM_CODEC_INVALID_PARAM;
+ } else if (ctx->frame_workers == NULL) {
+    // The decoder has not been initialized yet, so record the frame buffer
+    // functions now. Once the decoder is initialized, changes are rejected.
+ ctx->get_ext_fb_cb = cb_get;
+ ctx->release_ext_fb_cb = cb_release;
+ ctx->ext_priv = cb_priv;
+ return AOM_CODEC_OK;
+ }
+
+ return AOM_CODEC_ERROR;
+}
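+
+/* A sketch of how an application would reach this function, assuming the
+ * standard aom_codec_set_frame_buffer_functions() wrapper; my_pool and the
+ * callback bodies are hypothetical. Per the check above, the callbacks must
+ * be installed before the first frame is decoded:
+ *
+ *   static int get_fb(void *priv, size_t min_size,
+ *                     aom_codec_frame_buffer_t *fb);
+ *   static int release_fb(void *priv, aom_codec_frame_buffer_t *fb);
+ *   ...
+ *   aom_codec_set_frame_buffer_functions(&codec, get_fb, release_fb,
+ *                                        &my_pool);
+ */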
+
+static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_ref_frame_t *const data = va_arg(args, aom_ref_frame_t *);
+
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return AOM_CODEC_INCAPABLE;
+ }
+
+ if (data) {
+ aom_ref_frame_t *const frame = (aom_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ image2yuvconfig(&frame->img, &sd);
+ return av1_set_reference_dec(&frame_worker_data->pbi->common,
+ ref_frame_to_av1_reframe(frame->frame_type),
+ &sd);
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ const aom_ref_frame_t *const frame = va_arg(args, aom_ref_frame_t *);
+
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return AOM_CODEC_INCAPABLE;
+ }
+
+ if (frame) {
+ YV12_BUFFER_CONFIG sd;
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ image2yuvconfig(&frame->img, &sd);
+ return av1_copy_reference_dec(frame_worker_data->pbi,
+ (AOM_REFFRAME)frame->frame_type, &sd);
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ av1_ref_frame_t *data = va_arg(args, av1_ref_frame_t *);
+
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return AOM_CODEC_INCAPABLE;
+ }
+
+ if (data) {
+ YV12_BUFFER_CONFIG *fb;
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx);
+ if (fb == NULL) return AOM_CODEC_ERROR;
+ yuvconfig2image(&data->img, fb, NULL);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_image_t *new_img = va_arg(args, aom_image_t *);
+
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return AOM_CODEC_INCAPABLE;
+ }
+
+ if (new_img) {
+ YV12_BUFFER_CONFIG new_frame;
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+
+ if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) {
+ yuvconfig2image(new_img, &new_frame, NULL);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_set_postproc(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ (void)ctx;
+ (void)args;
+ return AOM_CODEC_INCAPABLE;
+}
+
+static aom_codec_err_t ctrl_set_dbg_options(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ (void)ctx;
+ (void)args;
+ return AOM_CODEC_INCAPABLE;
+}
+
+static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const update_info = va_arg(args, int *);
+
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return AOM_CODEC_INCAPABLE;
+ }
+
+ if (update_info) {
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ *update_info = frame_worker_data->pbi->refresh_frame_flags;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_last_quantizer(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const arg = va_arg(args, int *);
+ if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+ *arg =
+ ((FrameWorkerData *)ctx->frame_workers[0].data1)->pbi->common.base_qindex;
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *corrupted = va_arg(args, int *);
+
+ if (corrupted) {
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ RefCntBuffer *const frame_bufs =
+ frame_worker_data->pbi->common.buffer_pool->frame_bufs;
+ if (frame_worker_data->pbi->common.frame_to_show == NULL)
+ return AOM_CODEC_ERROR;
+ if (ctx->last_show_frame >= 0)
+ *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const frame_size = va_arg(args, int *);
+
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return AOM_CODEC_INCAPABLE;
+ }
+
+ if (frame_size) {
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ frame_size[0] = cm->width;
+ frame_size[1] = cm->height;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const render_size = va_arg(args, int *);
+
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return AOM_CODEC_INCAPABLE;
+ }
+
+ if (render_size) {
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ render_size[0] = cm->render_width;
+ render_size[1] = cm->render_height;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ unsigned int *const bit_depth = va_arg(args, unsigned int *);
+ AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+
+ if (bit_depth) {
+ if (worker) {
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ *bit_depth = cm->bit_depth;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->invert_tile_order = va_arg(args, int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_decryptor(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_decrypt_init *init = va_arg(args, aom_decrypt_init *);
+ ctx->decrypt_cb = init ? init->decrypt_cb : NULL;
+ ctx->decrypt_state = init ? init->decrypt_state : NULL;
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_byte_alignment(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ const int legacy_byte_alignment = 0;
+ const int min_byte_alignment = 32;
+ const int max_byte_alignment = 1024;
+ const int byte_alignment = va_arg(args, int);
+
+ if (byte_alignment != legacy_byte_alignment &&
+ (byte_alignment < min_byte_alignment ||
+ byte_alignment > max_byte_alignment ||
+ (byte_alignment & (byte_alignment - 1)) != 0))
+ return AOM_CODEC_INVALID_PARAM;
+
+ ctx->byte_alignment = byte_alignment;
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi->common.byte_alignment = byte_alignment;
+ }
+ return AOM_CODEC_OK;
+}
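+
+/* Worked examples of the validation above: 0 (legacy) and any power of two
+ * in [32, 1024] are accepted (32, 64, ..., 1024); 16 (below the minimum),
+ * 48 (not a power of two) and 2048 (above the maximum) all return
+ * AOM_CODEC_INVALID_PARAM. */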
+
+static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->skip_loop_filter = va_arg(args, int);
+
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+#if !CONFIG_ACCOUNTING
+ (void)ctx;
+ (void)args;
+ return AOM_CODEC_INCAPABLE;
+#else
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ AV1Decoder *pbi = frame_worker_data->pbi;
+ Accounting **acct = va_arg(args, Accounting **);
+ *acct = &pbi->accounting;
+ return AOM_CODEC_OK;
+ }
+ return AOM_CODEC_ERROR;
+#endif
+}
+static aom_codec_err_t ctrl_set_decode_tile_row(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->decode_tile_row = va_arg(args, int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_decode_tile_col(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->decode_tile_col = va_arg(args, int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+#if !CONFIG_INSPECTION
+ (void)ctx;
+ (void)args;
+ return AOM_CODEC_INCAPABLE;
+#else
+ aom_inspect_init *init = va_arg(args, aom_inspect_init *);
+ ctx->inspect_cb = init->inspect_cb;
+ ctx->inspect_ctx = init->inspect_ctx;
+ return AOM_CODEC_OK;
+#endif
+}
+
+static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
+ { AOM_COPY_REFERENCE, ctrl_copy_reference },
+
+ // Setters
+ { AOM_SET_REFERENCE, ctrl_set_reference },
+ { AOM_SET_POSTPROC, ctrl_set_postproc },
+ { AOM_SET_DBG_COLOR_REF_FRAME, ctrl_set_dbg_options },
+ { AOM_SET_DBG_COLOR_MB_MODES, ctrl_set_dbg_options },
+ { AOM_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options },
+ { AOM_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options },
+ { AV1_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order },
+ { AOMD_SET_DECRYPTOR, ctrl_set_decryptor },
+ { AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
+ { AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
+ { AV1_SET_DECODE_TILE_ROW, ctrl_set_decode_tile_row },
+ { AV1_SET_DECODE_TILE_COL, ctrl_set_decode_tile_col },
+ { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback },
+
+ // Getters
+ { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted },
+ { AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer },
+ { AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
+ { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth },
+ { AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size },
+ { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size },
+ { AV1_GET_ACCOUNTING, ctrl_get_accounting },
+ { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+ { AV1_GET_REFERENCE, ctrl_get_reference },
+
+ { -1, NULL },
+};
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(aom_codec_av1_dx) = {
+ "AOMedia Project AV1 Decoder" VERSION_STRING,
+ AOM_CODEC_INTERNAL_ABI_VERSION,
+ AOM_CODEC_CAP_DECODER |
+ AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // aom_codec_caps_t
+ decoder_init, // aom_codec_init_fn_t
+ decoder_destroy, // aom_codec_destroy_fn_t
+ decoder_ctrl_maps, // aom_codec_ctrl_fn_map_t
+ {
+ // NOLINT
+ decoder_peek_si, // aom_codec_peek_si_fn_t
+ decoder_get_si, // aom_codec_get_si_fn_t
+ decoder_decode, // aom_codec_decode_fn_t
+ decoder_get_frame, // aom_codec_frame_get_fn_t
+ decoder_set_fb_fn, // aom_codec_set_fb_fn_t
+ },
+ {
+ // NOLINT
+ 0,
+ NULL, // aom_codec_enc_cfg_map_t
+ NULL, // aom_codec_encode_fn_t
+ NULL, // aom_codec_get_cx_data_fn_t
+ NULL, // aom_codec_enc_config_set_fn_t
+ NULL, // aom_codec_get_global_headers_fn_t
+ NULL, // aom_codec_get_preview_frame_fn_t
+ NULL // aom_codec_enc_mr_get_mem_loc_fn_t
+ }
+};
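+
+/* A minimal initialization sketch for this interface, assuming the standard
+ * aom_codec_dec_init() wrapper; die() is a placeholder and cfg fields other
+ * than threads are left at their defaults:
+ *
+ *   aom_codec_ctx_t codec;
+ *   aom_codec_dec_cfg_t cfg = { 0 };
+ *   cfg.threads = 4;
+ *   if (aom_codec_dec_init(&codec, aom_codec_av1_dx(), &cfg, 0))
+ *     die("failed to initialize decoder");
+ */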
diff --git a/third_party/aom/av1/av1_iface_common.h b/third_party/aom/av1/av1_iface_common.h
new file mode 100644
index 000000000..df3614212
--- /dev/null
+++ b/third_party/aom/av1/av1_iface_common.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_AV1_IFACE_COMMON_H_
+#define AV1_AV1_IFACE_COMMON_H_
+
+#include "aom_ports/mem.h"
+
+static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
+ void *user_priv) {
+ /** aom_img_wrap() doesn't allow specifying independent strides for
+ * the Y, U, and V planes, nor other alignment adjustments that
+ * might be representable by a YV12_BUFFER_CONFIG, so we just
+   * initialize all the fields. */
+ int bps;
+ if (!yv12->subsampling_y) {
+ if (!yv12->subsampling_x) {
+ img->fmt = AOM_IMG_FMT_I444;
+ bps = 24;
+ } else {
+ img->fmt = AOM_IMG_FMT_I422;
+ bps = 16;
+ }
+ } else {
+ if (!yv12->subsampling_x) {
+ img->fmt = AOM_IMG_FMT_I440;
+ bps = 16;
+ } else {
+ img->fmt = AOM_IMG_FMT_I420;
+ bps = 12;
+ }
+ }
+ img->cs = yv12->color_space;
+ img->range = yv12->color_range;
+ img->bit_depth = 8;
+ img->w = yv12->y_stride;
+ img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * AOM_BORDER_IN_PIXELS, 3);
+ img->d_w = yv12->y_crop_width;
+ img->d_h = yv12->y_crop_height;
+ img->r_w = yv12->render_width;
+ img->r_h = yv12->render_height;
+ img->x_chroma_shift = yv12->subsampling_x;
+ img->y_chroma_shift = yv12->subsampling_y;
+ img->planes[AOM_PLANE_Y] = yv12->y_buffer;
+ img->planes[AOM_PLANE_U] = yv12->u_buffer;
+ img->planes[AOM_PLANE_V] = yv12->v_buffer;
+ img->planes[AOM_PLANE_ALPHA] = NULL;
+ img->stride[AOM_PLANE_Y] = yv12->y_stride;
+ img->stride[AOM_PLANE_U] = yv12->uv_stride;
+ img->stride[AOM_PLANE_V] = yv12->uv_stride;
+ img->stride[AOM_PLANE_ALPHA] = yv12->y_stride;
+#if CONFIG_HIGHBITDEPTH
+ if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+ // aom_image_t uses byte strides and a pointer to the first byte
+ // of the image.
+ img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH);
+ img->bit_depth = yv12->bit_depth;
+ img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+ img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+ img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+ img->planes[AOM_PLANE_ALPHA] = NULL;
+ img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride;
+ img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride;
+ img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride;
+ img->stride[AOM_PLANE_ALPHA] = 2 * yv12->y_stride;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ img->bps = bps;
+ img->user_priv = user_priv;
+ img->img_data = yv12->buffer_alloc;
+ img->img_data_owner = 0;
+ img->self_allocd = 0;
+}
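+
+/* Quick reference for the subsampling-to-format mapping implemented above:
+ *
+ *   subsampling_x  subsampling_y  fmt               bps
+ *        0              0         AOM_IMG_FMT_I444   24
+ *        1              0         AOM_IMG_FMT_I422   16
+ *        0              1         AOM_IMG_FMT_I440   16
+ *        1              1         AOM_IMG_FMT_I420   12
+ */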
+
+static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
+ YV12_BUFFER_CONFIG *yv12) {
+ yv12->y_buffer = img->planes[AOM_PLANE_Y];
+ yv12->u_buffer = img->planes[AOM_PLANE_U];
+ yv12->v_buffer = img->planes[AOM_PLANE_V];
+
+ yv12->y_crop_width = img->d_w;
+ yv12->y_crop_height = img->d_h;
+ yv12->render_width = img->r_w;
+ yv12->render_height = img->r_h;
+ yv12->y_width = img->d_w;
+ yv12->y_height = img->d_h;
+
+ yv12->uv_width =
+ img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2 : yv12->y_width;
+ yv12->uv_height =
+ img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height;
+ yv12->uv_crop_width = yv12->uv_width;
+ yv12->uv_crop_height = yv12->uv_height;
+
+ yv12->y_stride = img->stride[AOM_PLANE_Y];
+ yv12->uv_stride = img->stride[AOM_PLANE_U];
+ yv12->color_space = img->cs;
+ yv12->color_range = img->range;
+
+#if CONFIG_HIGHBITDEPTH
+ if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+    // In aom_image_t:
+    //   planes point to the uint8 address of the start of the data
+    //   stride counts uint8s to reach the next row
+    // In YV12_BUFFER_CONFIG:
+    //   y_buffer, u_buffer, v_buffer point to the uint16 address of the data
+    //   stride and border are counted in uint16s
+    // This means that all the address calculations in the main body of code
+    // should work correctly. However, before we do any pixel operations we
+    // need to cast the address to a uint16 pointer and double its value.
+ yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+ yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+ yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+ yv12->y_stride >>= 1;
+ yv12->uv_stride >>= 1;
+ yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+ } else {
+ yv12->flags = 0;
+ }
+ yv12->border = (yv12->y_stride - img->w) / 2;
+#else
+ yv12->border = (img->stride[AOM_PLANE_Y] - img->w) / 2;
+#endif // CONFIG_HIGHBITDEPTH
+ yv12->subsampling_x = img->x_chroma_shift;
+ yv12->subsampling_y = img->y_chroma_shift;
+ return AOM_CODEC_OK;
+}
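+
+/* A worked example of the chroma dimension math above: for a 4:2:0 image
+ * (x_chroma_shift == y_chroma_shift == 1) with d_w = 1919 and d_h = 1079,
+ * uv_width = (1 + 1919) / 2 = 960 and uv_height = (1 + 1079) / 2 = 540;
+ * odd luma dimensions round up. */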
+
+static AOM_REFFRAME ref_frame_to_av1_reframe(aom_ref_frame_type_t frame) {
+ switch (frame) {
+ case AOM_LAST_FRAME: return AOM_LAST_FLAG;
+ case AOM_GOLD_FRAME: return AOM_GOLD_FLAG;
+ case AOM_ALTR_FRAME: return AOM_ALT_FLAG;
+ }
+ assert(0 && "Invalid Reference Frame");
+ return AOM_LAST_FLAG;
+}
+#endif // AV1_AV1_IFACE_COMMON_H_
diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c
new file mode 100644
index 000000000..79d41a9c8
--- /dev/null
+++ b/third_party/aom/av1/common/alloccommon.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/onyxc_int.h"
+
+void av1_set_mb_mi(AV1_COMMON *cm, int width, int height) {
+ // TODO(jingning): Fine tune the loop filter operations and bring this
+ // back to integer multiple of 4 for cb4x4.
+ const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
+ const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
+
+ cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
+ cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
+ cm->mi_stride = calc_mi_size(cm->mi_cols);
+
+#if CONFIG_CB4X4
+ cm->mb_cols = (cm->mi_cols + 2) >> 2;
+ cm->mb_rows = (cm->mi_rows + 2) >> 2;
+#else
+ cm->mb_cols = (cm->mi_cols + 1) >> 1;
+ cm->mb_rows = (cm->mi_rows + 1) >> 1;
+#endif
+ cm->MBs = cm->mb_rows * cm->mb_cols;
+}
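+
+/* A worked example of the arithmetic above, assuming MI_SIZE_LOG2 == 3 and
+ * CONFIG_CB4X4 disabled: for a 1920x1080 frame the aligned size is already
+ * 1920x1080, so mi_cols = 1920 >> 3 = 240, mi_rows = 1080 >> 3 = 135,
+ * mb_cols = (240 + 1) >> 1 = 120, mb_rows = (135 + 1) >> 1 = 68, and
+ * MBs = 120 * 68 = 8160. */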
+
+static int alloc_seg_map(AV1_COMMON *cm, int seg_map_size) {
+ int i;
+
+ for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+ cm->seg_map_array[i] = (uint8_t *)aom_calloc(seg_map_size, 1);
+ if (cm->seg_map_array[i] == NULL) return 1;
+ }
+ cm->seg_map_alloc_size = seg_map_size;
+
+ // Init the index.
+ cm->seg_map_idx = 0;
+ cm->prev_seg_map_idx = 1;
+
+ cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+ if (!cm->frame_parallel_decode)
+ cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+
+ return 0;
+}
+
+static void free_seg_map(AV1_COMMON *cm) {
+ int i;
+
+ for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+ aom_free(cm->seg_map_array[i]);
+ cm->seg_map_array[i] = NULL;
+ }
+
+ cm->current_frame_seg_map = NULL;
+
+ if (!cm->frame_parallel_decode) {
+ cm->last_frame_seg_map = NULL;
+ }
+}
+
+void av1_free_ref_frame_buffers(BufferPool *pool) {
+ int i;
+
+ for (i = 0; i < FRAME_BUFFERS; ++i) {
+ if (pool->frame_bufs[i].ref_count > 0 &&
+ pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
+ pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
+ pool->frame_bufs[i].ref_count = 0;
+ }
+ aom_free(pool->frame_bufs[i].mvs);
+ pool->frame_bufs[i].mvs = NULL;
+ aom_free_frame_buffer(&pool->frame_bufs[i].buf);
+ }
+}
+
+#if CONFIG_LOOP_RESTORATION
+// Assumes cm->rst_info[p].restoration_tilesize is already initialized
+void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
+ int p;
+ av1_alloc_restoration_struct(cm, &cm->rst_info[0], cm->width, cm->height);
+ for (p = 1; p < MAX_MB_PLANE; ++p)
+ av1_alloc_restoration_struct(
+ cm, &cm->rst_info[p], ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x),
+ ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y));
+ aom_free(cm->rst_internal.tmpbuf);
+ CHECK_MEM_ERROR(cm, cm->rst_internal.tmpbuf,
+ (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+}
+
+void av1_free_restoration_buffers(AV1_COMMON *cm) {
+ int p;
+ for (p = 0; p < MAX_MB_PLANE; ++p)
+ av1_free_restoration_struct(&cm->rst_info[p]);
+ aom_free(cm->rst_internal.tmpbuf);
+ cm->rst_internal.tmpbuf = NULL;
+}
+#endif // CONFIG_LOOP_RESTORATION
+
+void av1_free_context_buffers(AV1_COMMON *cm) {
+ int i;
+ cm->free_mi(cm);
+ free_seg_map(cm);
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ aom_free(cm->above_context[i]);
+ cm->above_context[i] = NULL;
+ }
+ aom_free(cm->above_seg_context);
+ cm->above_seg_context = NULL;
+#if CONFIG_VAR_TX
+ aom_free(cm->above_txfm_context);
+ cm->above_txfm_context = NULL;
+#endif
+}
+
+int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) {
+ int new_mi_size;
+
+ av1_set_mb_mi(cm, width, height);
+ new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+ if (cm->mi_alloc_size < new_mi_size) {
+ cm->free_mi(cm);
+ if (cm->alloc_mi(cm, new_mi_size)) goto fail;
+ }
+
+ if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
+ // Create the segmentation map structure and set to 0.
+ free_seg_map(cm);
+ if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail;
+ }
+
+ if (cm->above_context_alloc_cols < cm->mi_cols) {
+ // TODO(geza.lore): These are bigger than they need to be.
+ // cm->tile_width would be enough but it complicates indexing a
+ // little elsewhere.
+ const int aligned_mi_cols =
+ ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ aom_free(cm->above_context[i]);
+ cm->above_context[i] = (ENTROPY_CONTEXT *)aom_calloc(
+ 2 * aligned_mi_cols, sizeof(*cm->above_context[0]));
+ if (!cm->above_context[i]) goto fail;
+ }
+
+ aom_free(cm->above_seg_context);
+ cm->above_seg_context = (PARTITION_CONTEXT *)aom_calloc(
+ aligned_mi_cols, sizeof(*cm->above_seg_context));
+ if (!cm->above_seg_context) goto fail;
+
+#if CONFIG_VAR_TX
+ aom_free(cm->above_txfm_context);
+ cm->above_txfm_context = (TXFM_CONTEXT *)aom_calloc(
+ aligned_mi_cols, sizeof(*cm->above_txfm_context));
+ if (!cm->above_txfm_context) goto fail;
+#endif
+
+ cm->above_context_alloc_cols = aligned_mi_cols;
+ }
+
+ return 0;
+
+fail:
+ // clear the mi_* values to force a realloc on resync
+ av1_set_mb_mi(cm, 0, 0);
+ av1_free_context_buffers(cm);
+ return 1;
+}
+
+void av1_remove_common(AV1_COMMON *cm) {
+ av1_free_context_buffers(cm);
+
+ aom_free(cm->fc);
+ cm->fc = NULL;
+ aom_free(cm->frame_contexts);
+ cm->frame_contexts = NULL;
+}
+
+void av1_init_context_buffers(AV1_COMMON *cm) {
+ cm->setup_mi(cm);
+ if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+ memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
+}
+
+void av1_swap_current_and_last_seg_map(AV1_COMMON *cm) {
+ // Swap indices.
+ const int tmp = cm->seg_map_idx;
+ cm->seg_map_idx = cm->prev_seg_map_idx;
+ cm->prev_seg_map_idx = tmp;
+
+ cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+ cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+}
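+
+/* Illustration of the seg map ping-pong above: with buffers A (index 0) and
+ * B (index 1), frame N writes its segmentation map into A while reading
+ * frame N-1's map from B; after the swap, frame N+1 writes into B and reads
+ * frame N's map from A. */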
diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h
new file mode 100644
index 000000000..51863cd04
--- /dev/null
+++ b/third_party/aom/av1/common/alloccommon.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_ALLOCCOMMON_H_
+#define AV1_COMMON_ALLOCCOMMON_H_
+
+#define INVALID_IDX -1 // Invalid buffer index.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct BufferPool;
+
+void av1_remove_common(struct AV1Common *cm);
+
+int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height);
+void av1_init_context_buffers(struct AV1Common *cm);
+void av1_free_context_buffers(struct AV1Common *cm);
+
+void av1_free_ref_frame_buffers(struct BufferPool *pool);
+#if CONFIG_LOOP_RESTORATION
+void av1_alloc_restoration_buffers(struct AV1Common *cm);
+void av1_free_restoration_buffers(struct AV1Common *cm);
+#endif // CONFIG_LOOP_RESTORATION
+
+int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height);
+void av1_free_state_buffers(struct AV1Common *cm);
+
+void av1_set_mb_mi(struct AV1Common *cm, int width, int height);
+
+void av1_swap_current_and_last_seg_map(struct AV1Common *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_ALLOCCOMMON_H_
diff --git a/third_party/aom/av1/common/arm/neon/iht4x4_add_neon.c b/third_party/aom/av1/common/arm/neon/iht4x4_add_neon.c
new file mode 100644
index 000000000..02572d405
--- /dev/null
+++ b/third_party/aom/av1/common/arm/neon/iht4x4_add_neon.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+
+static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
+ int32x4_t q8s32, q9s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+
+ d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
+ d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
+
+ q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
+ q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
+ q0x2s32 = vtrnq_s32(q8s32, q9s32);
+
+ *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
+ *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
+ return;
+}
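+
+/* Illustration of the vtrn-based transpose above: treating q8/q9 as a 4x4
+ * row-major matrix,
+ *
+ *   a0 a1 a2 a3         a0 b0 c0 d0
+ *   b0 b1 b2 b3   -->   a1 b1 c1 d1
+ *   c0 c1 c2 c3         a2 b2 c2 d2
+ *   d0 d1 d2 d3         a3 b3 c3 d3
+ *
+ * vtrn_s16 transposes the 16-bit elements within each 2x2 sub-block and
+ * vtrnq_s32 then transposes the 2x2 arrangement of 32-bit sub-blocks. */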
+
+static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16,
+ int16x4_t *d2s16) {
+ *d0s16 = vdup_n_s16((int16_t)cospi_8_64);
+ *d1s16 = vdup_n_s16((int16_t)cospi_16_64);
+ *d2s16 = vdup_n_s16((int16_t)cospi_24_64);
+ return;
+}
+
+static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16,
+ int16x4_t *d5s16, int16x8_t *q3s16) {
+ *d3s16 = vdup_n_s16((int16_t)sinpi_1_9);
+ *d4s16 = vdup_n_s16((int16_t)sinpi_2_9);
+ *q3s16 = vdupq_n_s16((int16_t)sinpi_3_9);
+ *d5s16 = vdup_n_s16((int16_t)sinpi_4_9);
+ return;
+}
+
+static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
+ int16x4_t *d2s16, int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ int32x4_t q10s32, q13s32, q14s32, q15s32;
+ int16x8_t q13s16, q14s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, *d2s16);
+ q10s32 = vmull_s16(d17s16, *d0s16);
+ q13s32 = vmull_s16(d23s16, *d1s16);
+ q14s32 = vmull_s16(d24s16, *d1s16);
+ q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
+ q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q10s32, 14);
+
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+ *q8s16 = vaddq_s16(q13s16, q14s16);
+ *q9s16 = vsubq_s16(q13s16, q14s16);
+ *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp
+ return;
+}
+
+static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
+ int16x4_t *d5s16, int16x8_t *q3s16,
+ int16x8_t *q8s16, int16x8_t *q9s16) {
+ int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d6s16 = vget_low_s16(*q3s16);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+
+ q10s32 = vmull_s16(*d3s16, d16s16);
+ q11s32 = vmull_s16(*d4s16, d16s16);
+ q12s32 = vmull_s16(d6s16, d17s16);
+ q13s32 = vmull_s16(*d5s16, d18s16);
+ q14s32 = vmull_s16(*d3s16, d18s16);
+ q15s32 = vmovl_s16(d16s16);
+ q15s32 = vaddw_s16(q15s32, d19s16);
+ q8s32 = vmull_s16(*d4s16, d19s16);
+ q15s32 = vsubw_s16(q15s32, d18s16);
+ q9s32 = vmull_s16(*d5s16, d19s16);
+
+ q10s32 = vaddq_s32(q10s32, q13s32);
+ q10s32 = vaddq_s32(q10s32, q8s32);
+ q11s32 = vsubq_s32(q11s32, q14s32);
+ q8s32 = vdupq_n_s32((int32_t)sinpi_3_9);
+ q11s32 = vsubq_s32(q11s32, q9s32);
+ q15s32 = vmulq_s32(q15s32, q8s32);
+
+ q13s32 = vaddq_s32(q10s32, q12s32);
+ q10s32 = vaddq_s32(q10s32, q11s32);
+ q14s32 = vaddq_s32(q11s32, q12s32);
+ q10s32 = vsubq_s32(q10s32, q12s32);
+
+ d16s16 = vqrshrn_n_s32(q13s32, 14);
+ d17s16 = vqrshrn_n_s32(q14s32, 14);
+ d18s16 = vqrshrn_n_s32(q15s32, 14);
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+
+ *q8s16 = vcombine_s16(d16s16, d17s16);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ return;
+}
+
+void av1_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ uint8x8_t d26u8, d27u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
+ uint32x2_t d26u32, d27u32;
+ int16x8_t q3s16, q8s16, q9s16;
+ uint16x8_t q8u16, q9u16;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ switch (tx_type) {
+    case 0:  // idct_idct is not supported. Fall back to C.
+      av1_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+      return;
+ case 1: // iadst_idct
+ // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+ case 2: // idct_iadst
+      // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+ break;
+ case 3: // iadst_iadst
+ // generate constants
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+    default:  // invalid tx_type
+ assert(0);
+ break;
+ }
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
+ dest += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+ return;
+}
diff --git a/third_party/aom/av1/common/arm/neon/iht8x8_add_neon.c b/third_party/aom/av1/common/arm/neon/iht8x8_add_neon.c
new file mode 100644
index 000000000..86798ccf1
--- /dev/null
+++ b/third_party/aom/av1/common/arm/neon/iht8x8_add_neon.c
@@ -0,0 +1,593 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+
+static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
+ int16x8_t *q10s16, int16x8_t *q11s16,
+ int16x8_t *q12s16, int16x8_t *q13s16,
+ int16x8_t *q14s16, int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
+ int16x8_t *q10s16, int16x8_t *q11s16,
+ int16x8_t *q12s16, int16x8_t *q13s16,
+ int16x8_t *q14s16, int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16((int16_t)cospi_28_64);
+ d1s16 = vdup_n_s16((int16_t)cospi_4_64);
+ d2s16 = vdup_n_s16((int16_t)cospi_12_64);
+ d3s16 = vdup_n_s16((int16_t)cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d0s16);
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16((int16_t)cospi_24_64);
+ d1s16 = vdup_n_s16((int16_t)cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+}
+
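+// The 8-point inverse ADST counterpart of IDCT8x8_1D. The first rotation
+// stage uses the odd-angle constants cospi_2_64 ... cospi_30_64, and the
+// final stage writes the odd outputs (q9, q11, q13, q15) as zero minus the
+// corresponding intermediate, giving the alternating sign pattern of the
+// scalar iadst8.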
+static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
+ int16x8_t *q10s16, int16x8_t *q11s16,
+ int16x8_t *q12s16, int16x8_t *q13s16,
+ int16x8_t *q14s16, int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q2s16, q4s16, q5s16, q6s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
+ int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ d14s16 = vdup_n_s16((int16_t)cospi_2_64);
+ d15s16 = vdup_n_s16((int16_t)cospi_30_64);
+
+ q1s32 = vmull_s16(d30s16, d14s16);
+ q2s32 = vmull_s16(d31s16, d14s16);
+ q3s32 = vmull_s16(d30s16, d15s16);
+ q4s32 = vmull_s16(d31s16, d15s16);
+
+ d30s16 = vdup_n_s16((int16_t)cospi_18_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_14_64);
+
+ q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
+ q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
+ q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
+ q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+
+ q5s32 = vmull_s16(d22s16, d30s16);
+ q6s32 = vmull_s16(d23s16, d30s16);
+ q7s32 = vmull_s16(d22s16, d31s16);
+ q8s32 = vmull_s16(d23s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
+ q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
+ q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+
+ q11s32 = vaddq_s32(q1s32, q5s32);
+ q12s32 = vaddq_s32(q2s32, q6s32);
+ q1s32 = vsubq_s32(q1s32, q5s32);
+ q2s32 = vsubq_s32(q2s32, q6s32);
+
+ d22s16 = vqrshrn_n_s32(q11s32, 14);
+ d23s16 = vqrshrn_n_s32(q12s32, 14);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q12s32 = vaddq_s32(q3s32, q7s32);
+ q15s32 = vaddq_s32(q4s32, q8s32);
+ q3s32 = vsubq_s32(q3s32, q7s32);
+ q4s32 = vsubq_s32(q4s32, q8s32);
+
+ d2s16 = vqrshrn_n_s32(q1s32, 14);
+ d3s16 = vqrshrn_n_s32(q2s32, 14);
+ d24s16 = vqrshrn_n_s32(q12s32, 14);
+ d25s16 = vqrshrn_n_s32(q15s32, 14);
+ d6s16 = vqrshrn_n_s32(q3s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ *q12s16 = vcombine_s16(d24s16, d25s16);
+
+ d0s16 = vdup_n_s16((int16_t)cospi_10_64);
+ d1s16 = vdup_n_s16((int16_t)cospi_22_64);
+ q4s32 = vmull_s16(d26s16, d0s16);
+ q5s32 = vmull_s16(d27s16, d0s16);
+ q2s32 = vmull_s16(d26s16, d1s16);
+ q6s32 = vmull_s16(d27s16, d1s16);
+
+ d30s16 = vdup_n_s16((int16_t)cospi_26_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_6_64);
+
+ q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
+ q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
+ q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
+ q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+
+ q0s32 = vmull_s16(d18s16, d30s16);
+ q13s32 = vmull_s16(d19s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
+ q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+
+ q10s32 = vmull_s16(d18s16, d31s16);
+ q9s32 = vmull_s16(d19s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
+ q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
+
+ q14s32 = vaddq_s32(q2s32, q10s32);
+ q15s32 = vaddq_s32(q6s32, q9s32);
+ q2s32 = vsubq_s32(q2s32, q10s32);
+ q6s32 = vsubq_s32(q6s32, q9s32);
+
+ d28s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);
+
+ q9s32 = vaddq_s32(q4s32, q0s32);
+ q10s32 = vaddq_s32(q5s32, q13s32);
+ q4s32 = vsubq_s32(q4s32, q0s32);
+ q5s32 = vsubq_s32(q5s32, q13s32);
+
+ d30s16 = vdup_n_s16((int16_t)cospi_8_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_24_64);
+
+ d18s16 = vqrshrn_n_s32(q9s32, 14);
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+ d8s16 = vqrshrn_n_s32(q4s32, 14);
+ d9s16 = vqrshrn_n_s32(q5s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q5s32 = vmull_s16(d2s16, d30s16);
+ q6s32 = vmull_s16(d3s16, d30s16);
+ q7s32 = vmull_s16(d2s16, d31s16);
+ q0s32 = vmull_s16(d3s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
+ q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
+
+ q1s32 = vmull_s16(d4s16, d30s16);
+ q3s32 = vmull_s16(d5s16, d30s16);
+ q10s32 = vmull_s16(d4s16, d31s16);
+ q2s32 = vmull_s16(d5s16, d31s16);
+
+ q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
+ q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
+
+ *q8s16 = vaddq_s16(*q11s16, *q9s16);
+ *q11s16 = vsubq_s16(*q11s16, *q9s16);
+ q4s16 = vaddq_s16(*q12s16, *q14s16);
+ *q12s16 = vsubq_s16(*q12s16, *q14s16);
+
+ q14s32 = vaddq_s32(q5s32, q1s32);
+ q15s32 = vaddq_s32(q6s32, q3s32);
+ q5s32 = vsubq_s32(q5s32, q1s32);
+ q6s32 = vsubq_s32(q6s32, q3s32);
+
+ d18s16 = vqrshrn_n_s32(q14s32, 14);
+ d19s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q1s32 = vaddq_s32(q7s32, q10s32);
+ q3s32 = vaddq_s32(q0s32, q2s32);
+ q7s32 = vsubq_s32(q7s32, q10s32);
+ q0s32 = vsubq_s32(q0s32, q2s32);
+
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ d29s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q7s32, 14);
+ d15s16 = vqrshrn_n_s32(q0s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);
+
+ d30s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ q2s32 = vmull_s16(d22s16, d30s16);
+ q3s32 = vmull_s16(d23s16, d30s16);
+ q13s32 = vmull_s16(d22s16, d30s16);
+ q1s32 = vmull_s16(d23s16, d30s16);
+
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
+ q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q3s32, 14);
+ d24s16 = vqrshrn_n_s32(q13s32, 14);
+ d25s16 = vqrshrn_n_s32(q1s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ *q12s16 = vcombine_s16(d24s16, d25s16);
+
+ q13s32 = vmull_s16(d10s16, d30s16);
+ q1s32 = vmull_s16(d11s16, d30s16);
+ q11s32 = vmull_s16(d10s16, d30s16);
+ q0s32 = vmull_s16(d11s16, d30s16);
+
+ q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
+ q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+
+ d20s16 = vqrshrn_n_s32(q13s32, 14);
+ d21s16 = vqrshrn_n_s32(q1s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q0s32, 14);
+ *q10s16 = vcombine_s16(d20s16, d21s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q5s16 = vdupq_n_s16(0);
+
+ *q9s16 = vsubq_s16(q5s16, *q9s16);
+ *q11s16 = vsubq_s16(q5s16, q2s16);
+ *q13s16 = vsubq_s16(q5s16, q6s16);
+ *q15s16 = vsubq_s16(q5s16, q4s16);
+}
+
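+// 2-D hybrid inverse transform for an 8x8 block: transform the rows, then
+// transpose and transform the columns with the 1-D kernels selected by
+// tx_type, shift the result down by 5 with rounding, and add it to the
+// destination with unsigned saturation. DCT_DCT (tx_type 0) is delegated
+// to the C implementation.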
+void av1_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i;
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 8 * 2);
+ q11s16 = vld1q_s16(input + 8 * 3);
+ q12s16 = vld1q_s16(input + 8 * 4);
+ q13s16 = vld1q_s16(input + 8 * 5);
+ q14s16 = vld1q_s16(input + 8 * 6);
+ q15s16 = vld1q_s16(input + 8 * 7);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ switch (tx_type) {
+ case 0: // idct_idct is not supported. Fall back to C
+ av1_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+      return;
+ case 1: // iadst_idct
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // first transform rows
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+ break;
+ case 2: // idct_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // then transform columns
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+ break;
+ case 3: // iadst_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+ &q15s16);
+ break;
+    default: // invalid tx_type
+ assert(0);
+ break;
+ }
+
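+  // Undo the remaining fixed-point headroom: each 1-D pass has already
+  // narrowed with a rounded >> 14, and this rounded >> 5 matches the final
+  // scaling the generic C path applies before reconstruction.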
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ for (d1 = d2 = dest, i = 0; i < 2; i++) {
+ if (i != 0) {
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+ }
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
+ q10u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
+ q11u16 =
+ vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ }
+}
diff --git a/third_party/aom/av1/common/av1_fwd_txfm1d.c b/third_party/aom/av1/common/av1_fwd_txfm1d.c
new file mode 100644
index 000000000..7a691e03f
--- /dev/null
+++ b/third_party/aom/av1/common/av1_fwd_txfm1d.c
@@ -0,0 +1,2312 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/common/av1_fwd_txfm1d.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+
+void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf,
+ int32_t size, int8_t bit);
+
+#define range_check(stage, input, buf, size, bit) \
+ range_check_func(stage, input, buf, size, bit)
+#else
+#define range_check(stage, input, buf, size, bit) \
+ { \
+ (void)stage; \
+ (void)input; \
+ (void)buf; \
+ (void)size; \
+ (void)bit; \
+ }
+#endif
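+// When the check is compiled out, the macro still "consumes" its arguments
+// through (void) casts so no unused-variable warnings are raised.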
+
+// TODO(angiebird): Make 1-d txfm functions static
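+//
+// Each transform below ping-pongs between `output` and the local `step`
+// buffer, one butterfly stage per hop. half_btf(w0, in0, w1, in1, bit) is
+// assumed (per its definition in av1/common/av1_txfm.h) to compute the
+// rounded rotation (w0 * in0 + w1 * in1 + (1 << (bit - 1))) >> bit, and
+// cospi_arr[cos_bit - cos_bit_min] selects the cosine table for the current
+// precision, with cospi[j] ~= cos(j * PI / 128) * 2^cos_bit.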
+void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+  // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[3];
+ bf1[1] = input[1] + input[2];
+ bf1[2] = -input[2] + input[1];
+ bf1[3] = -input[3] + input[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[2];
+ bf1[2] = bf0[1];
+ bf1[3] = bf0[3];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
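+
+// For reference, with c(k) = cos(k * PI / 128) and the per-stage rounding
+// of half_btf left implicit, av1_fdct4_new reduces to the 4-point DCT-II:
+//   out[0] = c(32) * ((in0 + in3) + (in1 + in2))
+//   out[1] = c(16) * (in0 - in3) + c(48) * (in1 - in2)
+//   out[2] = c(32) * ((in0 + in3) - (in1 + in2))
+//   out[3] = c(48) * (in0 - in3) - c(16) * (in1 - in2)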
+
+void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+  // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[7];
+ bf1[1] = input[1] + input[6];
+ bf1[2] = input[2] + input[5];
+ bf1[3] = input[3] + input[4];
+ bf1[4] = -input[4] + input[3];
+ bf1[5] = -input[5] + input[2];
+ bf1[6] = -input[6] + input[1];
+ bf1[7] = -input[7] + input[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[4];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[6];
+ bf1[4] = bf0[1];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[3];
+ bf1[7] = bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
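+
+// Note the recursive structure: stages 2-3 of av1_fdct8_new act on
+// bf0[0..3] exactly as av1_fdct4_new acts on its four inputs, while the
+// odd half (bf0[4..7]) runs its own rotation/butterfly chain. The same
+// nesting continues in av1_fdct16_new and av1_fdct32_new below.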
+
+void av1_fdct16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+  // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[15];
+ bf1[1] = input[1] + input[14];
+ bf1[2] = input[2] + input[13];
+ bf1[3] = input[3] + input[12];
+ bf1[4] = input[4] + input[11];
+ bf1[5] = input[5] + input[10];
+ bf1[6] = input[6] + input[9];
+ bf1[7] = input[7] + input[8];
+ bf1[8] = -input[8] + input[7];
+ bf1[9] = -input[9] + input[6];
+ bf1[10] = -input[10] + input[5];
+ bf1[11] = -input[11] + input[4];
+ bf1[12] = -input[12] + input[3];
+ bf1[13] = -input[13] + input[2];
+ bf1[14] = -input[14] + input[1];
+ bf1[15] = -input[15] + input[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[8];
+ bf1[2] = bf0[4];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[2];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[14];
+ bf1[8] = bf0[1];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[5];
+ bf1[11] = bf0[13];
+ bf1[12] = bf0[3];
+ bf1[13] = bf0[11];
+ bf1[14] = bf0[7];
+ bf1[15] = bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+  // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[31];
+ bf1[1] = input[1] + input[30];
+ bf1[2] = input[2] + input[29];
+ bf1[3] = input[3] + input[28];
+ bf1[4] = input[4] + input[27];
+ bf1[5] = input[5] + input[26];
+ bf1[6] = input[6] + input[25];
+ bf1[7] = input[7] + input[24];
+ bf1[8] = input[8] + input[23];
+ bf1[9] = input[9] + input[22];
+ bf1[10] = input[10] + input[21];
+ bf1[11] = input[11] + input[20];
+ bf1[12] = input[12] + input[19];
+ bf1[13] = input[13] + input[18];
+ bf1[14] = input[14] + input[17];
+ bf1[15] = input[15] + input[16];
+ bf1[16] = -input[16] + input[15];
+ bf1[17] = -input[17] + input[14];
+ bf1[18] = -input[18] + input[13];
+ bf1[19] = -input[19] + input[12];
+ bf1[20] = -input[20] + input[11];
+ bf1[21] = -input[21] + input[10];
+ bf1[22] = -input[22] + input[9];
+ bf1[23] = -input[23] + input[8];
+ bf1[24] = -input[24] + input[7];
+ bf1[25] = -input[25] + input[6];
+ bf1[26] = -input[26] + input[5];
+ bf1[27] = -input[27] + input[4];
+ bf1[28] = -input[28] + input[3];
+ bf1[29] = -input[29] + input[2];
+ bf1[30] = -input[30] + input[1];
+ bf1[31] = -input[31] + input[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
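+  // (pure permutation: output k receives input bit-reverse(k), 5 bits wide)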
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[16];
+ bf1[2] = bf0[8];
+ bf1[3] = bf0[24];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[20];
+ bf1[6] = bf0[12];
+ bf1[7] = bf0[28];
+ bf1[8] = bf0[2];
+ bf1[9] = bf0[18];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[26];
+ bf1[12] = bf0[6];
+ bf1[13] = bf0[22];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[30];
+ bf1[16] = bf0[1];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[9];
+ bf1[19] = bf0[25];
+ bf1[20] = bf0[5];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[13];
+ bf1[23] = bf0[29];
+ bf1[24] = bf0[3];
+ bf1[25] = bf0[19];
+ bf1[26] = bf0[11];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[7];
+ bf1[29] = bf0[23];
+ bf1[30] = bf0[15];
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fadst4_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+  // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[3];
+ bf1[1] = input[0];
+ bf1[2] = input[1];
+ bf1[3] = input[2];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[8], bf0[1], cospi[56], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[40], bf0[3], cospi[24], bf0[2], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = -bf0[2] + bf0[0];
+ bf1[3] = -bf0[3] + bf0[1];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[2];
+ bf1[2] = bf0[3];
+ bf1[3] = -bf0[1];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
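+
+// The last stage of each fadst both permutes and negates: here out[1] and
+// out[3] pick up a sign flip, and the larger fadst sizes below negate every
+// odd-indexed output in the same way.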
+
+void av1_fadst8_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+  // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[7];
+ bf1[1] = input[0];
+ bf1[2] = input[5];
+ bf1[3] = input[2];
+ bf1[4] = input[3];
+ bf1[5] = input[4];
+ bf1[6] = input[1];
+ bf1[7] = input[6];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[4], bf0[1], cospi[60], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[20], bf0[3], cospi[44], bf0[2], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[36], bf0[5], cospi[28], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[52], bf0[7], cospi[12], bf0[6], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = -bf0[4] + bf0[0];
+ bf1[5] = -bf0[5] + bf0[1];
+ bf1[6] = -bf0[6] + bf0[2];
+ bf1[7] = -bf0[7] + bf0[3];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = -bf0[2] + bf0[0];
+ bf1[3] = -bf0[3] + bf0[1];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = -bf0[6] + bf0[4];
+ bf1[7] = -bf0[7] + bf0[5];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[4];
+ bf1[2] = bf0[6];
+ bf1[3] = -bf0[2];
+ bf1[4] = bf0[3];
+ bf1[5] = -bf0[7];
+ bf1[6] = bf0[5];
+ bf1[7] = -bf0[1];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fadst16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+  // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[15];
+ bf1[1] = input[0];
+ bf1[2] = input[13];
+ bf1[3] = input[2];
+ bf1[4] = input[11];
+ bf1[5] = input[4];
+ bf1[6] = input[9];
+ bf1[7] = input[6];
+ bf1[8] = input[7];
+ bf1[9] = input[8];
+ bf1[10] = input[5];
+ bf1[11] = input[10];
+ bf1[12] = input[3];
+ bf1[13] = input[12];
+ bf1[14] = input[1];
+ bf1[15] = input[14];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[2], bf0[1], cospi[62], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[10], bf0[3], cospi[54], bf0[2], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[18], bf0[5], cospi[46], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[26], bf0[7], cospi[38], bf0[6], cos_bit[stage]);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(-cospi[34], bf0[9], cospi[30], bf0[8], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[42], bf0[11], cospi[22], bf0[10], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(-cospi[50], bf0[13], cospi[14], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(-cospi[58], bf0[15], cospi[6], bf0[14], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = -bf0[8] + bf0[0];
+ bf1[9] = -bf0[9] + bf0[1];
+ bf1[10] = -bf0[10] + bf0[2];
+ bf1[11] = -bf0[11] + bf0[3];
+ bf1[12] = -bf0[12] + bf0[4];
+ bf1[13] = -bf0[13] + bf0[5];
+ bf1[14] = -bf0[14] + bf0[6];
+ bf1[15] = -bf0[15] + bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = -bf0[4] + bf0[0];
+ bf1[5] = -bf0[5] + bf0[1];
+ bf1[6] = -bf0[6] + bf0[2];
+ bf1[7] = -bf0[7] + bf0[3];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = -bf0[12] + bf0[8];
+ bf1[13] = -bf0[13] + bf0[9];
+ bf1[14] = -bf0[14] + bf0[10];
+ bf1[15] = -bf0[15] + bf0[11];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = -bf0[2] + bf0[0];
+ bf1[3] = -bf0[3] + bf0[1];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = -bf0[6] + bf0[4];
+ bf1[7] = -bf0[7] + bf0[5];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = -bf0[10] + bf0[8];
+ bf1[11] = -bf0[11] + bf0[9];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = -bf0[14] + bf0[12];
+ bf1[15] = -bf0[15] + bf0[13];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[8];
+ bf1[2] = bf0[12];
+ bf1[3] = -bf0[4];
+ bf1[4] = bf0[6];
+ bf1[5] = -bf0[14];
+ bf1[6] = bf0[10];
+ bf1[7] = -bf0[2];
+ bf1[8] = bf0[3];
+ bf1[9] = -bf0[11];
+ bf1[10] = bf0[15];
+ bf1[11] = -bf0[7];
+ bf1[12] = bf0[5];
+ bf1[13] = -bf0[13];
+ bf1[14] = bf0[9];
+ bf1[15] = -bf0[1];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fadst32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+  // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[31];
+ bf1[1] = input[0];
+ bf1[2] = input[29];
+ bf1[3] = input[2];
+ bf1[4] = input[27];
+ bf1[5] = input[4];
+ bf1[6] = input[25];
+ bf1[7] = input[6];
+ bf1[8] = input[23];
+ bf1[9] = input[8];
+ bf1[10] = input[21];
+ bf1[11] = input[10];
+ bf1[12] = input[19];
+ bf1[13] = input[12];
+ bf1[14] = input[17];
+ bf1[15] = input[14];
+ bf1[16] = input[15];
+ bf1[17] = input[16];
+ bf1[18] = input[13];
+ bf1[19] = input[18];
+ bf1[20] = input[11];
+ bf1[21] = input[20];
+ bf1[22] = input[9];
+ bf1[23] = input[22];
+ bf1[24] = input[7];
+ bf1[25] = input[24];
+ bf1[26] = input[5];
+ bf1[27] = input[26];
+ bf1[28] = input[3];
+ bf1[29] = input[28];
+ bf1[30] = input[1];
+ bf1[31] = input[30];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[1], bf0[1], cospi[63], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[5], bf0[3], cospi[59], bf0[2], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[9], bf0[5], cospi[55], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[13], bf0[7], cospi[51], bf0[6], cos_bit[stage]);
+ bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(-cospi[17], bf0[9], cospi[47], bf0[8], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[21], bf0[11], cospi[43], bf0[10], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(-cospi[25], bf0[13], cospi[39], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(-cospi[29], bf0[15], cospi[35], bf0[14], cos_bit[stage]);
+ bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
+ bf1[17] = half_btf(-cospi[33], bf0[17], cospi[31], bf0[16], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[37], bf0[19], cospi[27], bf0[18], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[41], bf0[21], cospi[23], bf0[20], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[45], bf0[23], cospi[19], bf0[22], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(-cospi[49], bf0[25], cospi[15], bf0[24], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(-cospi[53], bf0[27], cospi[11], bf0[26], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(-cospi[57], bf0[29], cospi[7], bf0[28], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(-cospi[61], bf0[31], cospi[3], bf0[30], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[16];
+ bf1[1] = bf0[1] + bf0[17];
+ bf1[2] = bf0[2] + bf0[18];
+ bf1[3] = bf0[3] + bf0[19];
+ bf1[4] = bf0[4] + bf0[20];
+ bf1[5] = bf0[5] + bf0[21];
+ bf1[6] = bf0[6] + bf0[22];
+ bf1[7] = bf0[7] + bf0[23];
+ bf1[8] = bf0[8] + bf0[24];
+ bf1[9] = bf0[9] + bf0[25];
+ bf1[10] = bf0[10] + bf0[26];
+ bf1[11] = bf0[11] + bf0[27];
+ bf1[12] = bf0[12] + bf0[28];
+ bf1[13] = bf0[13] + bf0[29];
+ bf1[14] = bf0[14] + bf0[30];
+ bf1[15] = bf0[15] + bf0[31];
+ bf1[16] = -bf0[16] + bf0[0];
+ bf1[17] = -bf0[17] + bf0[1];
+ bf1[18] = -bf0[18] + bf0[2];
+ bf1[19] = -bf0[19] + bf0[3];
+ bf1[20] = -bf0[20] + bf0[4];
+ bf1[21] = -bf0[21] + bf0[5];
+ bf1[22] = -bf0[22] + bf0[6];
+ bf1[23] = -bf0[23] + bf0[7];
+ bf1[24] = -bf0[24] + bf0[8];
+ bf1[25] = -bf0[25] + bf0[9];
+ bf1[26] = -bf0[26] + bf0[10];
+ bf1[27] = -bf0[27] + bf0[11];
+ bf1[28] = -bf0[28] + bf0[12];
+ bf1[29] = -bf0[29] + bf0[13];
+ bf1[30] = -bf0[30] + bf0[14];
+ bf1[31] = -bf0[31] + bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
+ bf1[17] = half_btf(-cospi[4], bf0[17], cospi[60], bf0[16], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[20], bf0[19], cospi[44], bf0[18], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[36], bf0[21], cospi[28], bf0[20], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[52], bf0[23], cospi[12], bf0[22], cos_bit[stage]);
+ bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[60], bf0[25], cospi[4], bf0[24], cos_bit[stage]);
+ bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[44], bf0[27], cospi[20], bf0[26], cos_bit[stage]);
+ bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[28], bf0[29], cospi[36], bf0[28], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[12], bf0[31], cospi[52], bf0[30], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = -bf0[8] + bf0[0];
+ bf1[9] = -bf0[9] + bf0[1];
+ bf1[10] = -bf0[10] + bf0[2];
+ bf1[11] = -bf0[11] + bf0[3];
+ bf1[12] = -bf0[12] + bf0[4];
+ bf1[13] = -bf0[13] + bf0[5];
+ bf1[14] = -bf0[14] + bf0[6];
+ bf1[15] = -bf0[15] + bf0[7];
+ bf1[16] = bf0[16] + bf0[24];
+ bf1[17] = bf0[17] + bf0[25];
+ bf1[18] = bf0[18] + bf0[26];
+ bf1[19] = bf0[19] + bf0[27];
+ bf1[20] = bf0[20] + bf0[28];
+ bf1[21] = bf0[21] + bf0[29];
+ bf1[22] = bf0[22] + bf0[30];
+ bf1[23] = bf0[23] + bf0[31];
+ bf1[24] = -bf0[24] + bf0[16];
+ bf1[25] = -bf0[25] + bf0[17];
+ bf1[26] = -bf0[26] + bf0[18];
+ bf1[27] = -bf0[27] + bf0[19];
+ bf1[28] = -bf0[28] + bf0[20];
+ bf1[29] = -bf0[29] + bf0[21];
+ bf1[30] = -bf0[30] + bf0[22];
+ bf1[31] = -bf0[31] + bf0[23];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(-cospi[8], bf0[25], cospi[56], bf0[24], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(-cospi[40], bf0[27], cospi[24], bf0[26], cos_bit[stage]);
+ bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[56], bf0[29], cospi[8], bf0[28], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[24], bf0[31], cospi[40], bf0[30], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = -bf0[4] + bf0[0];
+ bf1[5] = -bf0[5] + bf0[1];
+ bf1[6] = -bf0[6] + bf0[2];
+ bf1[7] = -bf0[7] + bf0[3];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = -bf0[12] + bf0[8];
+ bf1[13] = -bf0[13] + bf0[9];
+ bf1[14] = -bf0[14] + bf0[10];
+ bf1[15] = -bf0[15] + bf0[11];
+ bf1[16] = bf0[16] + bf0[20];
+ bf1[17] = bf0[17] + bf0[21];
+ bf1[18] = bf0[18] + bf0[22];
+ bf1[19] = bf0[19] + bf0[23];
+ bf1[20] = -bf0[20] + bf0[16];
+ bf1[21] = -bf0[21] + bf0[17];
+ bf1[22] = -bf0[22] + bf0[18];
+ bf1[23] = -bf0[23] + bf0[19];
+ bf1[24] = bf0[24] + bf0[28];
+ bf1[25] = bf0[25] + bf0[29];
+ bf1[26] = bf0[26] + bf0[30];
+ bf1[27] = bf0[27] + bf0[31];
+ bf1[28] = -bf0[28] + bf0[24];
+ bf1[29] = -bf0[29] + bf0[25];
+ bf1[30] = -bf0[30] + bf0[26];
+ bf1[31] = -bf0[31] + bf0[27];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[20], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[48], bf0[23], cospi[16], bf0[22], cos_bit[stage]);
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(-cospi[16], bf0[29], cospi[48], bf0[28], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[48], bf0[31], cospi[16], bf0[30], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = -bf0[2] + bf0[0];
+ bf1[3] = -bf0[3] + bf0[1];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = -bf0[6] + bf0[4];
+ bf1[7] = -bf0[7] + bf0[5];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = -bf0[10] + bf0[8];
+ bf1[11] = -bf0[11] + bf0[9];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = -bf0[14] + bf0[12];
+ bf1[15] = -bf0[15] + bf0[13];
+ bf1[16] = bf0[16] + bf0[18];
+ bf1[17] = bf0[17] + bf0[19];
+ bf1[18] = -bf0[18] + bf0[16];
+ bf1[19] = -bf0[19] + bf0[17];
+ bf1[20] = bf0[20] + bf0[22];
+ bf1[21] = bf0[21] + bf0[23];
+ bf1[22] = -bf0[22] + bf0[20];
+ bf1[23] = -bf0[23] + bf0[21];
+ bf1[24] = bf0[24] + bf0[26];
+ bf1[25] = bf0[25] + bf0[27];
+ bf1[26] = -bf0[26] + bf0[24];
+ bf1[27] = -bf0[27] + bf0[25];
+ bf1[28] = bf0[28] + bf0[30];
+ bf1[29] = bf0[29] + bf0[31];
+ bf1[30] = -bf0[30] + bf0[28];
+ bf1[31] = -bf0[31] + bf0[29];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[32], bf0[19], cospi[32], bf0[18], cos_bit[stage]);
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[22], cos_bit[stage]);
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(-cospi[32], bf0[27], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(-cospi[32], bf0[31], cospi[32], bf0[30], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[16];
+ bf1[2] = bf0[24];
+ bf1[3] = -bf0[8];
+ bf1[4] = bf0[12];
+ bf1[5] = -bf0[28];
+ bf1[6] = bf0[20];
+ bf1[7] = -bf0[4];
+ bf1[8] = bf0[6];
+ bf1[9] = -bf0[22];
+ bf1[10] = bf0[30];
+ bf1[11] = -bf0[14];
+ bf1[12] = bf0[10];
+ bf1[13] = -bf0[26];
+ bf1[14] = bf0[18];
+ bf1[15] = -bf0[2];
+ bf1[16] = bf0[3];
+ bf1[17] = -bf0[19];
+ bf1[18] = bf0[27];
+ bf1[19] = -bf0[11];
+ bf1[20] = bf0[15];
+ bf1[21] = -bf0[31];
+ bf1[22] = bf0[23];
+ bf1[23] = -bf0[7];
+ bf1[24] = bf0[5];
+ bf1[25] = -bf0[21];
+ bf1[26] = bf0[29];
+ bf1[27] = -bf0[13];
+ bf1[28] = bf0[9];
+ bf1[29] = -bf0[25];
+ bf1[30] = bf0[17];
+ bf1[31] = -bf0[1];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
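
Every multiply stage above funnels through half_btf(). A minimal sketch of that primitive, assuming the usual round-shift definition from av1/common/av1_txfm.h (64-bit accumulation is used here for clarity; the in-tree version may accumulate differently):

    // Rounding right-shift: round(value / 2^bit).
    static inline int32_t round_shift(int64_t value, int bit) {
      return (int32_t)((value + (1LL << (bit - 1))) >> bit);
    }

    // Half-butterfly: round((w0*in0 + w1*in1) / 2^bit), where w0/w1 are
    // cosine values pre-scaled by 2^cos_bit.
    static inline int32_t half_btf(int32_t w0, int32_t in0, int32_t w1,
                                   int32_t in1, int bit) {
      const int64_t result = (int64_t)w0 * in0 + (int64_t)w1 * in1;
      return round_shift(result, bit);
    }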
+
+#if CONFIG_TX64X64
+void av1_fdct64_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 64;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
+ // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf1 = output;
+ bf1[0] = input[0] + input[63];
+ bf1[1] = input[1] + input[62];
+ bf1[2] = input[2] + input[61];
+ bf1[3] = input[3] + input[60];
+ bf1[4] = input[4] + input[59];
+ bf1[5] = input[5] + input[58];
+ bf1[6] = input[6] + input[57];
+ bf1[7] = input[7] + input[56];
+ bf1[8] = input[8] + input[55];
+ bf1[9] = input[9] + input[54];
+ bf1[10] = input[10] + input[53];
+ bf1[11] = input[11] + input[52];
+ bf1[12] = input[12] + input[51];
+ bf1[13] = input[13] + input[50];
+ bf1[14] = input[14] + input[49];
+ bf1[15] = input[15] + input[48];
+ bf1[16] = input[16] + input[47];
+ bf1[17] = input[17] + input[46];
+ bf1[18] = input[18] + input[45];
+ bf1[19] = input[19] + input[44];
+ bf1[20] = input[20] + input[43];
+ bf1[21] = input[21] + input[42];
+ bf1[22] = input[22] + input[41];
+ bf1[23] = input[23] + input[40];
+ bf1[24] = input[24] + input[39];
+ bf1[25] = input[25] + input[38];
+ bf1[26] = input[26] + input[37];
+ bf1[27] = input[27] + input[36];
+ bf1[28] = input[28] + input[35];
+ bf1[29] = input[29] + input[34];
+ bf1[30] = input[30] + input[33];
+ bf1[31] = input[31] + input[32];
+ bf1[32] = -input[32] + input[31];
+ bf1[33] = -input[33] + input[30];
+ bf1[34] = -input[34] + input[29];
+ bf1[35] = -input[35] + input[28];
+ bf1[36] = -input[36] + input[27];
+ bf1[37] = -input[37] + input[26];
+ bf1[38] = -input[38] + input[25];
+ bf1[39] = -input[39] + input[24];
+ bf1[40] = -input[40] + input[23];
+ bf1[41] = -input[41] + input[22];
+ bf1[42] = -input[42] + input[21];
+ bf1[43] = -input[43] + input[20];
+ bf1[44] = -input[44] + input[19];
+ bf1[45] = -input[45] + input[18];
+ bf1[46] = -input[46] + input[17];
+ bf1[47] = -input[47] + input[16];
+ bf1[48] = -input[48] + input[15];
+ bf1[49] = -input[49] + input[14];
+ bf1[50] = -input[50] + input[13];
+ bf1[51] = -input[51] + input[12];
+ bf1[52] = -input[52] + input[11];
+ bf1[53] = -input[53] + input[10];
+ bf1[54] = -input[54] + input[9];
+ bf1[55] = -input[55] + input[8];
+ bf1[56] = -input[56] + input[7];
+ bf1[57] = -input[57] + input[6];
+ bf1[58] = -input[58] + input[5];
+ bf1[59] = -input[59] + input[4];
+ bf1[60] = -input[60] + input[3];
+ bf1[61] = -input[61] + input[2];
+ bf1[62] = -input[62] + input[1];
+ bf1[63] = -input[63] + input[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[31];
+ bf1[1] = bf0[1] + bf0[30];
+ bf1[2] = bf0[2] + bf0[29];
+ bf1[3] = bf0[3] + bf0[28];
+ bf1[4] = bf0[4] + bf0[27];
+ bf1[5] = bf0[5] + bf0[26];
+ bf1[6] = bf0[6] + bf0[25];
+ bf1[7] = bf0[7] + bf0[24];
+ bf1[8] = bf0[8] + bf0[23];
+ bf1[9] = bf0[9] + bf0[22];
+ bf1[10] = bf0[10] + bf0[21];
+ bf1[11] = bf0[11] + bf0[20];
+ bf1[12] = bf0[12] + bf0[19];
+ bf1[13] = bf0[13] + bf0[18];
+ bf1[14] = bf0[14] + bf0[17];
+ bf1[15] = bf0[15] + bf0[16];
+ bf1[16] = -bf0[16] + bf0[15];
+ bf1[17] = -bf0[17] + bf0[14];
+ bf1[18] = -bf0[18] + bf0[13];
+ bf1[19] = -bf0[19] + bf0[12];
+ bf1[20] = -bf0[20] + bf0[11];
+ bf1[21] = -bf0[21] + bf0[10];
+ bf1[22] = -bf0[22] + bf0[9];
+ bf1[23] = -bf0[23] + bf0[8];
+ bf1[24] = -bf0[24] + bf0[7];
+ bf1[25] = -bf0[25] + bf0[6];
+ bf1[26] = -bf0[26] + bf0[5];
+ bf1[27] = -bf0[27] + bf0[4];
+ bf1[28] = -bf0[28] + bf0[3];
+ bf1[29] = -bf0[29] + bf0[2];
+ bf1[30] = -bf0[30] + bf0[1];
+ bf1[31] = -bf0[31] + bf0[0];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+ bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit[stage]);
+ bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit[stage]);
+ bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit[stage]);
+ bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit[stage]);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[47];
+ bf1[33] = bf0[33] + bf0[46];
+ bf1[34] = bf0[34] + bf0[45];
+ bf1[35] = bf0[35] + bf0[44];
+ bf1[36] = bf0[36] + bf0[43];
+ bf1[37] = bf0[37] + bf0[42];
+ bf1[38] = bf0[38] + bf0[41];
+ bf1[39] = bf0[39] + bf0[40];
+ bf1[40] = -bf0[40] + bf0[39];
+ bf1[41] = -bf0[41] + bf0[38];
+ bf1[42] = -bf0[42] + bf0[37];
+ bf1[43] = -bf0[43] + bf0[36];
+ bf1[44] = -bf0[44] + bf0[35];
+ bf1[45] = -bf0[45] + bf0[34];
+ bf1[46] = -bf0[46] + bf0[33];
+ bf1[47] = -bf0[47] + bf0[32];
+ bf1[48] = -bf0[48] + bf0[63];
+ bf1[49] = -bf0[49] + bf0[62];
+ bf1[50] = -bf0[50] + bf0[61];
+ bf1[51] = -bf0[51] + bf0[60];
+ bf1[52] = -bf0[52] + bf0[59];
+ bf1[53] = -bf0[53] + bf0[58];
+ bf1[54] = -bf0[54] + bf0[57];
+ bf1[55] = -bf0[55] + bf0[56];
+ bf1[56] = bf0[56] + bf0[55];
+ bf1[57] = bf0[57] + bf0[54];
+ bf1[58] = bf0[58] + bf0[53];
+ bf1[59] = bf0[59] + bf0[52];
+ bf1[60] = bf0[60] + bf0[51];
+ bf1[61] = bf0[61] + bf0[50];
+ bf1[62] = bf0[62] + bf0[49];
+ bf1[63] = bf0[63] + bf0[48];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit[stage]);
+ bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit[stage]);
+ bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit[stage]);
+ bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit[stage]);
+ bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit[stage]);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[39];
+ bf1[33] = bf0[33] + bf0[38];
+ bf1[34] = bf0[34] + bf0[37];
+ bf1[35] = bf0[35] + bf0[36];
+ bf1[36] = -bf0[36] + bf0[35];
+ bf1[37] = -bf0[37] + bf0[34];
+ bf1[38] = -bf0[38] + bf0[33];
+ bf1[39] = -bf0[39] + bf0[32];
+ bf1[40] = -bf0[40] + bf0[47];
+ bf1[41] = -bf0[41] + bf0[46];
+ bf1[42] = -bf0[42] + bf0[45];
+ bf1[43] = -bf0[43] + bf0[44];
+ bf1[44] = bf0[44] + bf0[43];
+ bf1[45] = bf0[45] + bf0[42];
+ bf1[46] = bf0[46] + bf0[41];
+ bf1[47] = bf0[47] + bf0[40];
+ bf1[48] = bf0[48] + bf0[55];
+ bf1[49] = bf0[49] + bf0[54];
+ bf1[50] = bf0[50] + bf0[53];
+ bf1[51] = bf0[51] + bf0[52];
+ bf1[52] = -bf0[52] + bf0[51];
+ bf1[53] = -bf0[53] + bf0[50];
+ bf1[54] = -bf0[54] + bf0[49];
+ bf1[55] = -bf0[55] + bf0[48];
+ bf1[56] = -bf0[56] + bf0[63];
+ bf1[57] = -bf0[57] + bf0[62];
+ bf1[58] = -bf0[58] + bf0[61];
+ bf1[59] = -bf0[59] + bf0[60];
+ bf1[60] = bf0[60] + bf0[59];
+ bf1[61] = bf0[61] + bf0[58];
+ bf1[62] = bf0[62] + bf0[57];
+ bf1[63] = bf0[63] + bf0[56];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit[stage]);
+ bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit[stage]);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit[stage]);
+ bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit[stage]);
+ bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit[stage]);
+ bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit[stage]);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[35];
+ bf1[33] = bf0[33] + bf0[34];
+ bf1[34] = -bf0[34] + bf0[33];
+ bf1[35] = -bf0[35] + bf0[32];
+ bf1[36] = -bf0[36] + bf0[39];
+ bf1[37] = -bf0[37] + bf0[38];
+ bf1[38] = bf0[38] + bf0[37];
+ bf1[39] = bf0[39] + bf0[36];
+ bf1[40] = bf0[40] + bf0[43];
+ bf1[41] = bf0[41] + bf0[42];
+ bf1[42] = -bf0[42] + bf0[41];
+ bf1[43] = -bf0[43] + bf0[40];
+ bf1[44] = -bf0[44] + bf0[47];
+ bf1[45] = -bf0[45] + bf0[46];
+ bf1[46] = bf0[46] + bf0[45];
+ bf1[47] = bf0[47] + bf0[44];
+ bf1[48] = bf0[48] + bf0[51];
+ bf1[49] = bf0[49] + bf0[50];
+ bf1[50] = -bf0[50] + bf0[49];
+ bf1[51] = -bf0[51] + bf0[48];
+ bf1[52] = -bf0[52] + bf0[55];
+ bf1[53] = -bf0[53] + bf0[54];
+ bf1[54] = bf0[54] + bf0[53];
+ bf1[55] = bf0[55] + bf0[52];
+ bf1[56] = bf0[56] + bf0[59];
+ bf1[57] = bf0[57] + bf0[58];
+ bf1[58] = -bf0[58] + bf0[57];
+ bf1[59] = -bf0[59] + bf0[56];
+ bf1[60] = -bf0[60] + bf0[63];
+ bf1[61] = -bf0[61] + bf0[62];
+ bf1[62] = bf0[62] + bf0[61];
+ bf1[63] = bf0[63] + bf0[60];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit[stage]);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit[stage]);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit[stage]);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit[stage]);
+ bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit[stage]);
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
+ bf1[32] = bf0[32] + bf0[33];
+ bf1[33] = -bf0[33] + bf0[32];
+ bf1[34] = -bf0[34] + bf0[35];
+ bf1[35] = bf0[35] + bf0[34];
+ bf1[36] = bf0[36] + bf0[37];
+ bf1[37] = -bf0[37] + bf0[36];
+ bf1[38] = -bf0[38] + bf0[39];
+ bf1[39] = bf0[39] + bf0[38];
+ bf1[40] = bf0[40] + bf0[41];
+ bf1[41] = -bf0[41] + bf0[40];
+ bf1[42] = -bf0[42] + bf0[43];
+ bf1[43] = bf0[43] + bf0[42];
+ bf1[44] = bf0[44] + bf0[45];
+ bf1[45] = -bf0[45] + bf0[44];
+ bf1[46] = -bf0[46] + bf0[47];
+ bf1[47] = bf0[47] + bf0[46];
+ bf1[48] = bf0[48] + bf0[49];
+ bf1[49] = -bf0[49] + bf0[48];
+ bf1[50] = -bf0[50] + bf0[51];
+ bf1[51] = bf0[51] + bf0[50];
+ bf1[52] = bf0[52] + bf0[53];
+ bf1[53] = -bf0[53] + bf0[52];
+ bf1[54] = -bf0[54] + bf0[55];
+ bf1[55] = bf0[55] + bf0[54];
+ bf1[56] = bf0[56] + bf0[57];
+ bf1[57] = -bf0[57] + bf0[56];
+ bf1[58] = -bf0[58] + bf0[59];
+ bf1[59] = bf0[59] + bf0[58];
+ bf1[60] = bf0[60] + bf0[61];
+ bf1[61] = -bf0[61] + bf0[60];
+ bf1[62] = -bf0[62] + bf0[63];
+ bf1[63] = bf0[63] + bf0[62];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit[stage]);
+ bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit[stage]);
+ bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit[stage]);
+ bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit[stage]);
+ bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit[stage]);
+ bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit[stage]);
+ bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit[stage]);
+ bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit[stage]);
+ bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit[stage]);
+ bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit[stage]);
+ bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit[stage]);
+ bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit[stage]);
+ bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit[stage]);
+ bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit[stage]);
+ bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit[stage]);
+ bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit[stage]);
+ bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit[stage]);
+ bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit[stage]);
+ bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[32];
+ bf1[2] = bf0[16];
+ bf1[3] = bf0[48];
+ bf1[4] = bf0[8];
+ bf1[5] = bf0[40];
+ bf1[6] = bf0[24];
+ bf1[7] = bf0[56];
+ bf1[8] = bf0[4];
+ bf1[9] = bf0[36];
+ bf1[10] = bf0[20];
+ bf1[11] = bf0[52];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[44];
+ bf1[14] = bf0[28];
+ bf1[15] = bf0[60];
+ bf1[16] = bf0[2];
+ bf1[17] = bf0[34];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[50];
+ bf1[20] = bf0[10];
+ bf1[21] = bf0[42];
+ bf1[22] = bf0[26];
+ bf1[23] = bf0[58];
+ bf1[24] = bf0[6];
+ bf1[25] = bf0[38];
+ bf1[26] = bf0[22];
+ bf1[27] = bf0[54];
+ bf1[28] = bf0[14];
+ bf1[29] = bf0[46];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[62];
+ bf1[32] = bf0[1];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[17];
+ bf1[35] = bf0[49];
+ bf1[36] = bf0[9];
+ bf1[37] = bf0[41];
+ bf1[38] = bf0[25];
+ bf1[39] = bf0[57];
+ bf1[40] = bf0[5];
+ bf1[41] = bf0[37];
+ bf1[42] = bf0[21];
+ bf1[43] = bf0[53];
+ bf1[44] = bf0[13];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[29];
+ bf1[47] = bf0[61];
+ bf1[48] = bf0[3];
+ bf1[49] = bf0[35];
+ bf1[50] = bf0[19];
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[11];
+ bf1[53] = bf0[43];
+ bf1[54] = bf0[27];
+ bf1[55] = bf0[59];
+ bf1[56] = bf0[7];
+ bf1[57] = bf0[39];
+ bf1[58] = bf0[23];
+ bf1[59] = bf0[55];
+ bf1[60] = bf0[15];
+ bf1[61] = bf0[47];
+ bf1[62] = bf0[31];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+#endif // CONFIG_TX64X64
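
The output permutation in stage 11 of av1_fdct64_new is the 6-bit bit-reversal: output[i] takes bf0[bitrev6(i)], e.g. i = 3 (0b000011) reads index 0b110000 = 48. A hypothetical helper (not part of the library) that reproduces the mapping:

    #include <stdint.h>

    // Reverse the low n bits of v; stage 11 above computes
    // bf1[i] = bf0[reverse_bits(i, 6)] for i = 0..63.
    static uint32_t reverse_bits(uint32_t v, int n) {
      uint32_t r = 0;
      for (int i = 0; i < n; ++i) r |= ((v >> i) & 1u) << (n - 1 - i);
      return r;
    }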
diff --git a/third_party/aom/av1/common/av1_fwd_txfm1d.h b/third_party/aom/av1/common/av1_fwd_txfm1d.h
new file mode 100644
index 000000000..9f246717e
--- /dev/null
+++ b/third_party/aom/av1/common/av1_fwd_txfm1d.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_FWD_TXFM1D_H_
+#define AV1_FWD_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range);
+void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range);
+void av1_fdct16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct64_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_fadst4_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst8_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AV1_FWD_TXFM1D_H_
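
Each of these 1-D transforms consumes one cos_bit and one stage_range entry per stage. A hypothetical call, borrowing the 4-point DCT_DCT values from av1_fwd_txfm2d_cfg.h below:

    #include "av1/common/av1_fwd_txfm1d.h"

    void fdct4_example(void) {
      const int32_t in[4] = { 64, 64, 64, 64 };
      int32_t out[4];
      // One entry per stage; the 4-point DCT has 4 stages.
      const int8_t cos_bit[4] = { 13, 13, 13, 13 };
      const int8_t stage_range[4] = { 15, 16, 17, 17 };
      av1_fdct4_new(in, out, cos_bit, stage_range);
    }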
diff --git a/third_party/aom/av1/common/av1_fwd_txfm2d.c b/third_party/aom/av1/common/av1_fwd_txfm2d.c
new file mode 100644
index 000000000..d1dba82ca
--- /dev/null
+++ b/third_party/aom/av1/common/av1_fwd_txfm2d.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_fwd_txfm1d.h"
+#include "av1/common/av1_fwd_txfm2d_cfg.h"
+#include "av1/common/av1_txfm.h"
+
+static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT4: return av1_fdct4_new;
+ case TXFM_TYPE_DCT8: return av1_fdct8_new;
+ case TXFM_TYPE_DCT16: return av1_fdct16_new;
+ case TXFM_TYPE_DCT32: return av1_fdct32_new;
+ case TXFM_TYPE_ADST4: return av1_fadst4_new;
+ case TXFM_TYPE_ADST8: return av1_fadst8_new;
+ case TXFM_TYPE_ADST16: return av1_fadst16_new;
+ case TXFM_TYPE_ADST32: return av1_fadst32_new;
+ default: assert(0); return NULL;
+ }
+}
+
+static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+ const int stride, const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *buf) {
+ int c, r;
+ const int txfm_size = cfg->cfg->txfm_size;
+ const int8_t *shift = cfg->cfg->shift;
+ const int8_t *stage_range_col = cfg->cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->cfg->stage_range_row;
+ const int8_t *cos_bit_col = cfg->cfg->cos_bit_col;
+ const int8_t *cos_bit_row = cfg->cfg->cos_bit_row;
+ const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->cfg->txfm_type_col);
+ const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->cfg->txfm_type_row);
+
+ // Re-use the output buffer as scratch space for the column pass.
+ int32_t *temp_in = output;
+ int32_t *temp_out = output + txfm_size;
+
+ // Columns
+ for (c = 0; c < txfm_size; ++c) {
+ if (cfg->ud_flip == 0) {
+ for (r = 0; r < txfm_size; ++r) temp_in[r] = input[r * stride + c];
+ } else {
+ for (r = 0; r < txfm_size; ++r)
+ // flip upside down
+ temp_in[r] = input[(txfm_size - r - 1) * stride + c];
+ }
+ round_shift_array(temp_in, txfm_size, -shift[0]);
+ txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+ round_shift_array(temp_out, txfm_size, -shift[1]);
+ if (cfg->lr_flip == 0) {
+ for (r = 0; r < txfm_size; ++r) buf[r * txfm_size + c] = temp_out[r];
+ } else {
+ for (r = 0; r < txfm_size; ++r)
+ // flip from left to right
+ buf[r * txfm_size + (txfm_size - c - 1)] = temp_out[r];
+ }
+ }
+
+ // Rows
+ for (r = 0; r < txfm_size; ++r) {
+ txfm_func_row(buf + r * txfm_size, output + r * txfm_size, cos_bit_row,
+ stage_range_row);
+ round_shift_array(output + r * txfm_size, txfm_size, -shift[2]);
+ }
+}
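
Note that fwd_txfm2d_c negates the cfg shifts, so the positive shift[0] entries in the config header (e.g. 2 for the 4x4 case) scale the input up before the column pass, while negative mid shifts scale intermediates back down. A sketch of the assumed round_shift_array semantics from av1/common/av1_txfm.h:

    // Assumed semantics: bit > 0 is a rounding right-shift,
    // bit < 0 a left-shift, bit == 0 a no-op.
    static inline void round_shift_array_sketch(int32_t *arr, int size,
                                                int bit) {
      if (bit == 0) return;
      for (int i = 0; i < size; ++i) {
        arr[i] = bit > 0 ? (arr[i] + (1 << (bit - 1))) >> bit
                         : arr[i] * (1 << (-bit));
      }
    }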
+
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd) {
+ int32_t txfm_buf[4 * 4];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_4X4);
+ (void)bd;
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd) {
+ int32_t txfm_buf[8 * 8];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_8X8);
+ (void)bd;
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd) {
+ int32_t txfm_buf[16 * 16];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_16X16);
+ (void)bd;
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd) {
+ int32_t txfm_buf[32 * 32];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32);
+ (void)bd;
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd) {
+ int32_t txfm_buf[64 * 64];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type);
+ (void)bd;
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
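
A minimal usage sketch for the wrappers above (hypothetical values; DCT_DCT comes from av1/common/enums.h):

    void fwd_4x4_example(void) {
      int16_t residual[4 * 4] = { 0 }; // e.g. prediction residuals
      int32_t coeff[4 * 4];            // row-major transform output
      av1_fwd_txfm2d_4x4_c(residual, coeff, /*stride=*/4, DCT_DCT, /*bd=*/8);
    }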
+
+#if CONFIG_EXT_TX
+static const TXFM_2D_CFG *fwd_txfm_cfg_ls[FLIPADST_ADST + 1][TX_SIZES] = {
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_dct_dct_4, &fwd_txfm_2d_cfg_dct_dct_8,
+ &fwd_txfm_2d_cfg_dct_dct_16, &fwd_txfm_2d_cfg_dct_dct_32 },
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_adst_dct_4, &fwd_txfm_2d_cfg_adst_dct_8,
+ &fwd_txfm_2d_cfg_adst_dct_16, &fwd_txfm_2d_cfg_adst_dct_32 },
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_dct_adst_4, &fwd_txfm_2d_cfg_dct_adst_8,
+ &fwd_txfm_2d_cfg_dct_adst_16, &fwd_txfm_2d_cfg_dct_adst_32 },
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+ &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32 },
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_adst_dct_4, &fwd_txfm_2d_cfg_adst_dct_8,
+ &fwd_txfm_2d_cfg_adst_dct_16, &fwd_txfm_2d_cfg_adst_dct_32 },
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_dct_adst_4, &fwd_txfm_2d_cfg_dct_adst_8,
+ &fwd_txfm_2d_cfg_dct_adst_16, &fwd_txfm_2d_cfg_dct_adst_32 },
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+ &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32 },
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+ &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32 },
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+ &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32 },
+};
+#else // CONFIG_EXT_TX
+static const TXFM_2D_CFG *fwd_txfm_cfg_ls[TX_TYPES][TX_SIZES] = {
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_dct_dct_4, &fwd_txfm_2d_cfg_dct_dct_8,
+ &fwd_txfm_2d_cfg_dct_dct_16, &fwd_txfm_2d_cfg_dct_dct_32 },
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_adst_dct_4, &fwd_txfm_2d_cfg_adst_dct_8,
+ &fwd_txfm_2d_cfg_adst_dct_16, &fwd_txfm_2d_cfg_adst_dct_32 },
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_dct_adst_4, &fwd_txfm_2d_cfg_dct_adst_8,
+ &fwd_txfm_2d_cfg_dct_adst_16, &fwd_txfm_2d_cfg_dct_adst_32 },
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+ &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32 },
+};
+#endif // CONFIG_EXT_TX
+
+TXFM_2D_FLIP_CFG av1_get_fwd_txfm_cfg(int tx_type, int tx_size) {
+ TXFM_2D_FLIP_CFG cfg;
+ set_flip_cfg(tx_type, &cfg);
+ cfg.cfg = fwd_txfm_cfg_ls[tx_type][tx_size];
+ return cfg;
+}
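
set_flip_cfg (from av1/common/av1_txfm.h) derives the two flip flags from the transform type; the assumed mapping is that FLIPADST in the vertical (column) position sets ud_flip and in the horizontal (row) position sets lr_flip. A condensed sketch under that assumption:

    // Condensed sketch of the assumed flip mapping.
    static void set_flip_cfg_sketch(int tx_type, TXFM_2D_FLIP_CFG *cfg) {
      cfg->ud_flip = tx_type == FLIPADST_DCT || tx_type == FLIPADST_ADST ||
                     tx_type == FLIPADST_FLIPADST;
      cfg->lr_flip = tx_type == DCT_FLIPADST || tx_type == ADST_FLIPADST ||
                     tx_type == FLIPADST_FLIPADST;
    }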
+
+TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x64_cfg(int tx_type) {
+ TXFM_2D_FLIP_CFG cfg;
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg.cfg = &fwd_txfm_2d_cfg_dct_dct_64;
+ cfg.ud_flip = 0;
+ cfg.lr_flip = 0;
+ break;
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ default:
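+ // Only DCT_DCT is supported at 64x64; cfg.cfg is left unset here.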
+ cfg.ud_flip = 0;
+ cfg.lr_flip = 0;
+ assert(0);
+ }
+ return cfg;
+}
diff --git a/third_party/aom/av1/common/av1_fwd_txfm2d_cfg.h b/third_party/aom/av1/common/av1_fwd_txfm2d_cfg.h
new file mode 100644
index 000000000..b5c828286
--- /dev/null
+++ b/third_party/aom/av1/common/av1_fwd_txfm2d_cfg.h
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_FWD_TXFM2D_CFG_H_
+#define AV1_FWD_TXFM2D_CFG_H_
+#include "av1/common/enums.h"
+#include "av1/common/av1_fwd_txfm1d.h"
+// ---------------- config fwd_dct_dct_4 ----------------
+static const int8_t fwd_shift_dct_dct_4[3] = { 2, 0, 0 };
+static const int8_t fwd_stage_range_col_dct_dct_4[4] = { 15, 16, 17, 17 };
+static const int8_t fwd_stage_range_row_dct_dct_4[4] = { 17, 18, 18, 18 };
+static const int8_t fwd_cos_bit_col_dct_dct_4[4] = { 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_dct_4[4] = { 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_4 = {
+ 4, // .txfm_size
+ 4, // .stage_num_col
+ 4, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_dct_4, // .shift
+ fwd_stage_range_col_dct_dct_4, // .stage_range_col
+ fwd_stage_range_row_dct_dct_4, // .stage_range_row
+ fwd_cos_bit_col_dct_dct_4, // .cos_bit_col
+ fwd_cos_bit_row_dct_dct_4, // .cos_bit_row
+ TXFM_TYPE_DCT4, // .txfm_type_col
+ TXFM_TYPE_DCT4 // .txfm_type_row
+};
+
+// ---------------- config fwd_dct_dct_8 ----------------
+static const int8_t fwd_shift_dct_dct_8[3] = { 2, -1, 0 };
+static const int8_t fwd_stage_range_col_dct_dct_8[6] = {
+ 15, 16, 17, 18, 18, 18
+};
+static const int8_t fwd_stage_range_row_dct_dct_8[6] = {
+ 17, 18, 19, 19, 19, 19
+};
+static const int8_t fwd_cos_bit_col_dct_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_8 = {
+ 8, // .txfm_size
+ 6, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_dct_8, // .shift
+ fwd_stage_range_col_dct_dct_8, // .stage_range_col
+ fwd_stage_range_row_dct_dct_8, // .stage_range_row
+ fwd_cos_bit_col_dct_dct_8, // .cos_bit_col
+ fwd_cos_bit_row_dct_dct_8, // .cos_bit_row
+ TXFM_TYPE_DCT8, // .txfm_type_col
+ TXFM_TYPE_DCT8 // .txfm_type_row
+};
+
+// ---------------- config fwd_dct_dct_16 ----------------
+static const int8_t fwd_shift_dct_dct_16[3] = { 2, -2, 0 };
+static const int8_t fwd_stage_range_col_dct_dct_16[8] = { 15, 16, 17, 18,
+ 19, 19, 19, 19 };
+static const int8_t fwd_stage_range_row_dct_dct_16[8] = { 17, 18, 19, 20,
+ 20, 20, 20, 20 };
+static const int8_t fwd_cos_bit_col_dct_dct_16[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_dct_16[8] = { 12, 12, 12, 12,
+ 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_16 = {
+ 16, // .txfm_size
+ 8, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_dct_16, // .shift
+ fwd_stage_range_col_dct_dct_16, // .stage_range_col
+ fwd_stage_range_row_dct_dct_16, // .stage_range_row
+ fwd_cos_bit_col_dct_dct_16, // .cos_bit_col
+ fwd_cos_bit_row_dct_dct_16, // .cos_bit_row
+ TXFM_TYPE_DCT16, // .txfm_type_col
+ TXFM_TYPE_DCT16 // .txfm_type_row
+};
+
+// ---------------- config fwd_dct_dct_32 ----------------
+static const int8_t fwd_shift_dct_dct_32[3] = { 2, -4, 0 };
+static const int8_t fwd_stage_range_col_dct_dct_32[10] = { 15, 16, 17, 18, 19,
+ 20, 20, 20, 20, 20 };
+static const int8_t fwd_stage_range_row_dct_dct_32[10] = { 16, 17, 18, 19, 20,
+ 20, 20, 20, 20, 20 };
+static const int8_t fwd_cos_bit_col_dct_dct_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+static const int8_t fwd_cos_bit_row_dct_dct_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_32 = {
+ 32, // .txfm_size
+ 10, // .stage_num_col
+ 10, // .stage_num_row
+ // 1, // .log_scale
+ fwd_shift_dct_dct_32, // .shift
+ fwd_stage_range_col_dct_dct_32, // .stage_range_col
+ fwd_stage_range_row_dct_dct_32, // .stage_range_row
+ fwd_cos_bit_col_dct_dct_32, // .cos_bit_col
+ fwd_cos_bit_row_dct_dct_32, // .cos_bit_row
+ TXFM_TYPE_DCT32, // .txfm_type_col
+ TXFM_TYPE_DCT32 // .txfm_type_row
+};
+
+// ---------------- config fwd_dct_dct_64 ----------------
+static const int8_t fwd_shift_dct_dct_64[3] = { 0, -2, -2 };
+static const int8_t fwd_stage_range_col_dct_dct_64[12] = {
+ 13, 14, 15, 16, 17, 18, 19, 19, 19, 19, 19, 19
+};
+static const int8_t fwd_stage_range_row_dct_dct_64[12] = {
+ 17, 18, 19, 20, 21, 22, 22, 22, 22, 22, 22, 22
+};
+static const int8_t fwd_cos_bit_col_dct_dct_64[12] = { 15, 15, 15, 15, 15, 14,
+ 13, 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_dct_64[12] = { 15, 14, 13, 12, 11, 10,
+ 10, 10, 10, 10, 10, 10 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_64 = {
+ 64, // .txfm_size
+ 12, // .stage_num_col
+ 12, // .stage_num_row
+ fwd_shift_dct_dct_64, // .shift
+ fwd_stage_range_col_dct_dct_64, // .stage_range_col
+ fwd_stage_range_row_dct_dct_64, // .stage_range_row
+ fwd_cos_bit_col_dct_dct_64, // .cos_bit_col
+ fwd_cos_bit_row_dct_dct_64, // .cos_bit_row
+ TXFM_TYPE_DCT64, // .txfm_type_col
+ TXFM_TYPE_DCT64 // .txfm_type_row
+};
+
+// ---------------- config fwd_dct_adst_4 ----------------
+static const int8_t fwd_shift_dct_adst_4[3] = { 2, 0, 0 };
+static const int8_t fwd_stage_range_col_dct_adst_4[4] = { 15, 16, 17, 17 };
+static const int8_t fwd_stage_range_row_dct_adst_4[6] = {
+ 17, 17, 17, 18, 18, 18
+};
+static const int8_t fwd_cos_bit_col_dct_adst_4[4] = { 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_4 = {
+ 4, // .txfm_size
+ 4, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_adst_4, // .shift
+ fwd_stage_range_col_dct_adst_4, // .stage_range_col
+ fwd_stage_range_row_dct_adst_4, // .stage_range_row
+ fwd_cos_bit_col_dct_adst_4, // .cos_bit_col
+ fwd_cos_bit_row_dct_adst_4, // .cos_bit_row
+ TXFM_TYPE_DCT4, // .txfm_type_col
+ TXFM_TYPE_ADST4 // .txfm_type_row
+};
+
+// ---------------- config fwd_dct_adst_8 ----------------
+static const int8_t fwd_shift_dct_adst_8[3] = { 2, -1, 0 };
+static const int8_t fwd_stage_range_col_dct_adst_8[6] = {
+ 15, 16, 17, 18, 18, 18
+};
+static const int8_t fwd_stage_range_row_dct_adst_8[8] = { 17, 17, 17, 18,
+ 18, 19, 19, 19 };
+static const int8_t fwd_cos_bit_col_dct_adst_8[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_8 = {
+ 8, // .txfm_size
+ 6, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_adst_8, // .shift
+ fwd_stage_range_col_dct_adst_8, // .stage_range_col
+ fwd_stage_range_row_dct_adst_8, // .stage_range_row
+ fwd_cos_bit_col_dct_adst_8, // .cos_bit_col
+ fwd_cos_bit_row_dct_adst_8, // .cos_bit_row
+ TXFM_TYPE_DCT8, // .txfm_type_col
+ TXFM_TYPE_ADST8 // .txfm_type_row
+};
+
+// ---------------- config fwd_dct_adst_16 ----------------
+static const int8_t fwd_shift_dct_adst_16[3] = { 2, -2, 0 };
+static const int8_t fwd_stage_range_col_dct_adst_16[8] = { 15, 16, 17, 18,
+ 19, 19, 19, 19 };
+static const int8_t fwd_stage_range_row_dct_adst_16[10] = {
+ 17, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_cos_bit_col_dct_adst_16[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_adst_16[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_16 = {
+ 16, // .txfm_size
+ 8, // .stage_num_col
+ 10, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_adst_16, // .shift
+ fwd_stage_range_col_dct_adst_16, // .stage_range_col
+ fwd_stage_range_row_dct_adst_16, // .stage_range_row
+ fwd_cos_bit_col_dct_adst_16, // .cos_bit_col
+ fwd_cos_bit_row_dct_adst_16, // .cos_bit_row
+ TXFM_TYPE_DCT16, // .txfm_type_col
+ TXFM_TYPE_ADST16 // .txfm_type_row
+};
+
+// ---------------- config fwd_dct_adst_32 ----------------
+static const int8_t fwd_shift_dct_adst_32[3] = { 2, -4, 0 };
+static const int8_t fwd_stage_range_col_dct_adst_32[10] = {
+ 15, 16, 17, 18, 19, 20, 20, 20, 20, 20
+};
+static const int8_t fwd_stage_range_row_dct_adst_32[12] = {
+ 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_cos_bit_col_dct_adst_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+static const int8_t fwd_cos_bit_row_dct_adst_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_32 = {
+ 32, // .txfm_size
+ 10, // .stage_num_col
+ 12, // .stage_num_row
+ // 1, // .log_scale
+ fwd_shift_dct_adst_32, // .shift
+ fwd_stage_range_col_dct_adst_32, // .stage_range_col
+ fwd_stage_range_row_dct_adst_32, // .stage_range_row
+ fwd_cos_bit_col_dct_adst_32, // .cos_bit_col
+ fwd_cos_bit_row_dct_adst_32, // .cos_bit_row
+ TXFM_TYPE_DCT32, // .txfm_type_col
+ TXFM_TYPE_ADST32 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_adst_4 ----------------
+static const int8_t fwd_shift_adst_adst_4[3] = { 2, 0, 0 };
+static const int8_t fwd_stage_range_col_adst_adst_4[6] = { 15, 15, 16,
+ 17, 17, 17 };
+static const int8_t fwd_stage_range_row_adst_adst_4[6] = { 17, 17, 17,
+ 18, 18, 18 };
+static const int8_t fwd_cos_bit_col_adst_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_4 = {
+ 4, // .txfm_size
+ 6, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_adst_4, // .shift
+ fwd_stage_range_col_adst_adst_4, // .stage_range_col
+ fwd_stage_range_row_adst_adst_4, // .stage_range_row
+ fwd_cos_bit_col_adst_adst_4, // .cos_bit_col
+ fwd_cos_bit_row_adst_adst_4, // .cos_bit_row
+ TXFM_TYPE_ADST4, // .txfm_type_col
+ TXFM_TYPE_ADST4 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_adst_8 ----------------
+static const int8_t fwd_shift_adst_adst_8[3] = { 2, -1, 0 };
+static const int8_t fwd_stage_range_col_adst_adst_8[8] = { 15, 15, 16, 17,
+ 17, 18, 18, 18 };
+static const int8_t fwd_stage_range_row_adst_adst_8[8] = { 17, 17, 17, 18,
+ 18, 19, 19, 19 };
+static const int8_t fwd_cos_bit_col_adst_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_8 = {
+ 8, // .txfm_size
+ 8, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_adst_8, // .shift
+ fwd_stage_range_col_adst_adst_8, // .stage_range_col
+ fwd_stage_range_row_adst_adst_8, // .stage_range_row
+ fwd_cos_bit_col_adst_adst_8, // .cos_bit_col
+ fwd_cos_bit_row_adst_adst_8, // .cos_bit_row
+ TXFM_TYPE_ADST8, // .txfm_type_col
+ TXFM_TYPE_ADST8 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_adst_16 ----------------
+static const int8_t fwd_shift_adst_adst_16[3] = { 2, -2, 0 };
+static const int8_t fwd_stage_range_col_adst_adst_16[10] = {
+ 15, 15, 16, 17, 17, 18, 18, 19, 19, 19
+};
+static const int8_t fwd_stage_range_row_adst_adst_16[10] = {
+ 17, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_cos_bit_col_adst_adst_16[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_adst_16[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_16 = {
+ 16, // .txfm_size
+ 10, // .stage_num_col
+ 10, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_adst_16, // .shift
+ fwd_stage_range_col_adst_adst_16, // .stage_range_col
+ fwd_stage_range_row_adst_adst_16, // .stage_range_row
+ fwd_cos_bit_col_adst_adst_16, // .cos_bit_col
+ fwd_cos_bit_row_adst_adst_16, // .cos_bit_row
+ TXFM_TYPE_ADST16, // .txfm_type_col
+ TXFM_TYPE_ADST16 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_adst_32 ----------------
+static const int8_t fwd_shift_adst_adst_32[3] = { 2, -4, 0 };
+static const int8_t fwd_stage_range_col_adst_adst_32[12] = {
+ 15, 15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_stage_range_row_adst_adst_32[12] = {
+ 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_cos_bit_col_adst_adst_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+static const int8_t fwd_cos_bit_row_adst_adst_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_32 = {
+ 32, // .txfm_size
+ 12, // .stage_num_col
+ 12, // .stage_num_row
+ // 1, // .log_scale
+ fwd_shift_adst_adst_32, // .shift
+ fwd_stage_range_col_adst_adst_32, // .stage_range_col
+ fwd_stage_range_row_adst_adst_32, // .stage_range_row
+ fwd_cos_bit_col_adst_adst_32, // .cos_bit_col
+ fwd_cos_bit_row_adst_adst_32, // .cos_bit_row
+ TXFM_TYPE_ADST32, // .txfm_type_col
+ TXFM_TYPE_ADST32 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_dct_4 ----------------
+static const int8_t fwd_shift_adst_dct_4[3] = { 2, 0, 0 };
+static const int8_t fwd_stage_range_col_adst_dct_4[6] = {
+ 15, 15, 16, 17, 17, 17
+};
+static const int8_t fwd_stage_range_row_adst_dct_4[4] = { 17, 18, 18, 18 };
+static const int8_t fwd_cos_bit_col_adst_dct_4[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_dct_4[4] = { 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_4 = {
+ 4, // .txfm_size
+ 6, // .stage_num_col
+ 4, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_dct_4, // .shift
+ fwd_stage_range_col_adst_dct_4, // .stage_range_col
+ fwd_stage_range_row_adst_dct_4, // .stage_range_row
+ fwd_cos_bit_col_adst_dct_4, // .cos_bit_col
+ fwd_cos_bit_row_adst_dct_4, // .cos_bit_row
+ TXFM_TYPE_ADST4, // .txfm_type_col
+ TXFM_TYPE_DCT4 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_dct_8 ----------------
+static const int8_t fwd_shift_adst_dct_8[3] = { 2, -1, 0 };
+static const int8_t fwd_stage_range_col_adst_dct_8[8] = { 15, 15, 16, 17,
+ 17, 18, 18, 18 };
+static const int8_t fwd_stage_range_row_adst_dct_8[6] = {
+ 17, 18, 19, 19, 19, 19
+};
+static const int8_t fwd_cos_bit_col_adst_dct_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_8 = {
+ 8, // .txfm_size
+ 8, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_dct_8, // .shift
+ fwd_stage_range_col_adst_dct_8, // .stage_range_col
+ fwd_stage_range_row_adst_dct_8, // .stage_range_row
+ fwd_cos_bit_col_adst_dct_8, // .cos_bit_col
+ fwd_cos_bit_row_adst_dct_8, // .cos_bit_row
+ TXFM_TYPE_ADST8, // .txfm_type_col
+ TXFM_TYPE_DCT8 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_dct_16 ----------------
+static const int8_t fwd_shift_adst_dct_16[3] = { 2, -2, 0 };
+static const int8_t fwd_stage_range_col_adst_dct_16[10] = {
+ 15, 15, 16, 17, 17, 18, 18, 19, 19, 19
+};
+static const int8_t fwd_stage_range_row_adst_dct_16[8] = { 17, 18, 19, 20,
+ 20, 20, 20, 20 };
+static const int8_t fwd_cos_bit_col_adst_dct_16[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_dct_16[8] = { 12, 12, 12, 12,
+ 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_16 = {
+ 16, // .txfm_size
+ 10, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_dct_16, // .shift
+ fwd_stage_range_col_adst_dct_16, // .stage_range_col
+ fwd_stage_range_row_adst_dct_16, // .stage_range_row
+ fwd_cos_bit_col_adst_dct_16, // .cos_bit_col
+ fwd_cos_bit_row_adst_dct_16, // .cos_bit_row
+ TXFM_TYPE_ADST16, // .txfm_type_col
+ TXFM_TYPE_DCT16 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_dct_32 ----------------
+static const int8_t fwd_shift_adst_dct_32[3] = { 2, -4, 0 };
+static const int8_t fwd_stage_range_col_adst_dct_32[12] = {
+ 15, 15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_stage_range_row_adst_dct_32[10] = {
+ 16, 17, 18, 19, 20, 20, 20, 20, 20, 20
+};
+static const int8_t fwd_cos_bit_col_adst_dct_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+static const int8_t fwd_cos_bit_row_adst_dct_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_32 = {
+ 32, // .txfm_size
+ 12, // .stage_num_col
+ 10, // .stage_num_row
+ // 1, // .log_scale
+ fwd_shift_adst_dct_32, // .shift
+ fwd_stage_range_col_adst_dct_32, // .stage_range_col
+ fwd_stage_range_row_adst_dct_32, // .stage_range_row
+ fwd_cos_bit_col_adst_dct_32, // .cos_bit_col
+ fwd_cos_bit_row_adst_dct_32, // .cos_bit_row
+ TXFM_TYPE_ADST32, // .txfm_type_col
+ TXFM_TYPE_DCT32 // .txfm_type_row
+};
+#endif // AV1_FWD_TXFM2D_CFG_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c
new file mode 100644
index 000000000..54bbe9adf
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.c
@@ -0,0 +1,2334 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/common/av1_inv_txfm1d.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+// range_check_func() below uses fprintf(), PRId64 and assert(); include what
+// it needs so the range-checking build is self-contained.
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf,
+ int32_t size, int8_t bit) {
+ const int64_t maxValue = (1LL << (bit - 1)) - 1;
+ const int64_t minValue = -(1LL << (bit - 1));
+
+ for (int i = 0; i < size; ++i) {
+ if (buf[i] < minValue || buf[i] > maxValue) {
+ fprintf(stderr, "Error: coeffs contain out-of-range values\n");
+ fprintf(stderr, "stage: %d\n", stage);
+ fprintf(stderr, "node: %d\n", i);
+ fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", minValue,
+ maxValue);
+ fprintf(stderr, "coeffs: ");
+
+ fprintf(stderr, "[");
+ for (int j = 0; j < size; j++) {
+ if (j > 0) fprintf(stderr, ", ");
+ fprintf(stderr, "%d", input[j]);
+ }
+ fprintf(stderr, "]\n");
+ assert(0);
+ }
+ }
+}
+
+#define range_check(stage, input, buf, size, bit) \
+ range_check_func(stage, input, buf, size, bit)
+#else
+#define range_check(stage, input, buf, size, bit) \
+ { \
+ (void)stage; \
+ (void)input; \
+ (void)buf; \
+ (void)size; \
+ (void)bit; \
+ }
+#endif
+
+// TODO(angiebird): Make 1-d txfm functions static
+void av1_idct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[2];
+ bf1[2] = input[1];
+ bf1[3] = input[3];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = bf0[1] - bf0[2];
+ bf1[3] = bf0[0] - bf0[3];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
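+
+// Minimal usage sketch (hypothetical, for illustration only -- the real
+// caller is the 2-D driver in av1_inv_txfm2d.c): one 4-point row pass using
+// the row tables of the 4x4 DCT_DCT inverse config.  The inv_* table names
+// are assumed by symmetry with the forward config header above.
+//
+//   int32_t in[4] = { 256, 0, 0, 0 };  // a DC-only row of coefficients
+//   int32_t out[4];
+//   av1_idct4_new(in, out, inv_cos_bit_row_dct_dct_4,
+//                 inv_stage_range_row_dct_dct_4);
+//   // out[] now holds four (nearly) equal samples of the flat row.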
+
+void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[4];
+ bf1[2] = input[2];
+ bf1[3] = input[6];
+ bf1[4] = input[1];
+ bf1[5] = input[5];
+ bf1[6] = input[3];
+ bf1[7] = input[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = bf0[4] - bf0[5];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[6] + bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = bf0[1] - bf0[2];
+ bf1[3] = bf0[0] - bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = bf0[3] - bf0[4];
+ bf1[5] = bf0[2] - bf0[5];
+ bf1[6] = bf0[1] - bf0[6];
+ bf1[7] = bf0[0] - bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
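+
+// Note: stages 3-4 on bf[0..3] above are exactly the idct4 butterflies, with
+// bf[4..7] carrying the odd-frequency half.  The larger kernels below repeat
+// this even/odd split recursively: idct16 embeds idct8, and idct32 embeds
+// idct16.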
+
+void av1_idct16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[8];
+ bf1[2] = input[4];
+ bf1[3] = input[12];
+ bf1[4] = input[2];
+ bf1[5] = input[10];
+ bf1[6] = input[6];
+ bf1[7] = input[14];
+ bf1[8] = input[1];
+ bf1[9] = input[9];
+ bf1[10] = input[5];
+ bf1[11] = input[13];
+ bf1[12] = input[3];
+ bf1[13] = input[11];
+ bf1[14] = input[7];
+ bf1[15] = input[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = bf0[8] - bf0[9];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[10] + bf0[11];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = bf0[12] - bf0[13];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[14] + bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = bf0[4] - bf0[5];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[6] + bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = bf0[1] - bf0[2];
+ bf1[3] = bf0[0] - bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = bf0[9] - bf0[10];
+ bf1[11] = bf0[8] - bf0[11];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[13] + bf0[14];
+ bf1[15] = bf0[12] + bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = bf0[3] - bf0[4];
+ bf1[5] = bf0[2] - bf0[5];
+ bf1[6] = bf0[1] - bf0[6];
+ bf1[7] = bf0[0] - bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = bf0[7] - bf0[8];
+ bf1[9] = bf0[6] - bf0[9];
+ bf1[10] = bf0[5] - bf0[10];
+ bf1[11] = bf0[4] - bf0[11];
+ bf1[12] = bf0[3] - bf0[12];
+ bf1[13] = bf0[2] - bf0[13];
+ bf1[14] = bf0[1] - bf0[14];
+ bf1[15] = bf0[0] - bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_idct32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[16];
+ bf1[2] = input[8];
+ bf1[3] = input[24];
+ bf1[4] = input[4];
+ bf1[5] = input[20];
+ bf1[6] = input[12];
+ bf1[7] = input[28];
+ bf1[8] = input[2];
+ bf1[9] = input[18];
+ bf1[10] = input[10];
+ bf1[11] = input[26];
+ bf1[12] = input[6];
+ bf1[13] = input[22];
+ bf1[14] = input[14];
+ bf1[15] = input[30];
+ bf1[16] = input[1];
+ bf1[17] = input[17];
+ bf1[18] = input[9];
+ bf1[19] = input[25];
+ bf1[20] = input[5];
+ bf1[21] = input[21];
+ bf1[22] = input[13];
+ bf1[23] = input[29];
+ bf1[24] = input[3];
+ bf1[25] = input[19];
+ bf1[26] = input[11];
+ bf1[27] = input[27];
+ bf1[28] = input[7];
+ bf1[29] = input[23];
+ bf1[30] = input[15];
+ bf1[31] = input[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = bf0[16] - bf0[17];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[18] + bf0[19];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = bf0[20] - bf0[21];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[22] + bf0[23];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = bf0[24] - bf0[25];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[26] + bf0[27];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = bf0[28] - bf0[29];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[30] + bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = bf0[8] - bf0[9];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[10] + bf0[11];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = bf0[12] - bf0[13];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[14] + bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = bf0[4] - bf0[5];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[6] + bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = bf0[17] - bf0[18];
+ bf1[19] = bf0[16] - bf0[19];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[21] + bf0[22];
+ bf1[23] = bf0[20] + bf0[23];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = bf0[25] - bf0[26];
+ bf1[27] = bf0[24] - bf0[27];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[29] + bf0[30];
+ bf1[31] = bf0[28] + bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = bf0[1] - bf0[2];
+ bf1[3] = bf0[0] - bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = bf0[9] - bf0[10];
+ bf1[11] = bf0[8] - bf0[11];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[13] + bf0[14];
+ bf1[15] = bf0[12] + bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = bf0[3] - bf0[4];
+ bf1[5] = bf0[2] - bf0[5];
+ bf1[6] = bf0[1] - bf0[6];
+ bf1[7] = bf0[0] - bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = bf0[19] - bf0[20];
+ bf1[21] = bf0[18] - bf0[21];
+ bf1[22] = bf0[17] - bf0[22];
+ bf1[23] = bf0[16] - bf0[23];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[27] + bf0[28];
+ bf1[29] = bf0[26] + bf0[29];
+ bf1[30] = bf0[25] + bf0[30];
+ bf1[31] = bf0[24] + bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = bf0[7] - bf0[8];
+ bf1[9] = bf0[6] - bf0[9];
+ bf1[10] = bf0[5] - bf0[10];
+ bf1[11] = bf0[4] - bf0[11];
+ bf1[12] = bf0[3] - bf0[12];
+ bf1[13] = bf0[2] - bf0[13];
+ bf1[14] = bf0[1] - bf0[14];
+ bf1[15] = bf0[0] - bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[31];
+ bf1[1] = bf0[1] + bf0[30];
+ bf1[2] = bf0[2] + bf0[29];
+ bf1[3] = bf0[3] + bf0[28];
+ bf1[4] = bf0[4] + bf0[27];
+ bf1[5] = bf0[5] + bf0[26];
+ bf1[6] = bf0[6] + bf0[25];
+ bf1[7] = bf0[7] + bf0[24];
+ bf1[8] = bf0[8] + bf0[23];
+ bf1[9] = bf0[9] + bf0[22];
+ bf1[10] = bf0[10] + bf0[21];
+ bf1[11] = bf0[11] + bf0[20];
+ bf1[12] = bf0[12] + bf0[19];
+ bf1[13] = bf0[13] + bf0[18];
+ bf1[14] = bf0[14] + bf0[17];
+ bf1[15] = bf0[15] + bf0[16];
+ bf1[16] = bf0[15] - bf0[16];
+ bf1[17] = bf0[14] - bf0[17];
+ bf1[18] = bf0[13] - bf0[18];
+ bf1[19] = bf0[12] - bf0[19];
+ bf1[20] = bf0[11] - bf0[20];
+ bf1[21] = bf0[10] - bf0[21];
+ bf1[22] = bf0[9] - bf0[22];
+ bf1[23] = bf0[8] - bf0[23];
+ bf1[24] = bf0[7] - bf0[24];
+ bf1[25] = bf0[6] - bf0[25];
+ bf1[26] = bf0[5] - bf0[26];
+ bf1[27] = bf0[4] - bf0[27];
+ bf1[28] = bf0[3] - bf0[28];
+ bf1[29] = bf0[2] - bf0[29];
+ bf1[30] = bf0[1] - bf0[30];
+ bf1[31] = bf0[0] - bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_iadst4_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[3];
+ bf1[2] = -input[1];
+ bf1[3] = input[2];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[56], bf0[0], -cospi[8], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[24], bf0[2], -cospi[40], bf0[3], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[2];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
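+
+// The iadst kernels differ from the idct kernels only at their ends: stage 1
+// is a sign-flipping permutation of the input, the final stage a pure
+// permutation of the output, and the middle stages alternate +/- butterflies
+// with half_btf() rotations as above.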
+
+void av1_iadst8_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[7];
+ bf1[2] = -input[3];
+ bf1[3] = input[4];
+ bf1[4] = -input[1];
+ bf1[5] = input[6];
+ bf1[6] = input[2];
+ bf1[7] = -input[5];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[6];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[4];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[2];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_iadst16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[15];
+ bf1[2] = -input[7];
+ bf1[3] = input[8];
+ bf1[4] = -input[3];
+ bf1[5] = input[12];
+ bf1[6] = input[4];
+ bf1[7] = -input[11];
+ bf1[8] = -input[1];
+ bf1[9] = input[14];
+ bf1[10] = input[6];
+ bf1[11] = -input[9];
+ bf1[12] = input[2];
+ bf1[13] = -input[13];
+ bf1[14] = -input[5];
+ bf1[15] = input[10];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = bf0[8] - bf0[10];
+ bf1[11] = bf0[9] - bf0[11];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = bf0[12] - bf0[14];
+ bf1[15] = bf0[13] - bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = bf0[8] - bf0[12];
+ bf1[13] = bf0[9] - bf0[13];
+ bf1[14] = bf0[10] - bf0[14];
+ bf1[15] = bf0[11] - bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = bf0[0] - bf0[8];
+ bf1[9] = bf0[1] - bf0[9];
+ bf1[10] = bf0[2] - bf0[10];
+ bf1[11] = bf0[3] - bf0[11];
+ bf1[12] = bf0[4] - bf0[12];
+ bf1[13] = bf0[5] - bf0[13];
+ bf1[14] = bf0[6] - bf0[14];
+ bf1[15] = bf0[7] - bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit[stage]);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[14];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[8];
+ bf1[8] = bf0[9];
+ bf1[9] = bf0[6];
+ bf1[10] = bf0[11];
+ bf1[11] = bf0[4];
+ bf1[12] = bf0[13];
+ bf1[13] = bf0[2];
+ bf1[14] = bf0[15];
+ bf1[15] = bf0[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_iadst32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[31];
+ bf1[2] = -input[15];
+ bf1[3] = input[16];
+ bf1[4] = -input[7];
+ bf1[5] = input[24];
+ bf1[6] = input[8];
+ bf1[7] = -input[23];
+ bf1[8] = -input[3];
+ bf1[9] = input[28];
+ bf1[10] = input[12];
+ bf1[11] = -input[19];
+ bf1[12] = input[4];
+ bf1[13] = -input[27];
+ bf1[14] = -input[11];
+ bf1[15] = input[20];
+ bf1[16] = -input[1];
+ bf1[17] = input[30];
+ bf1[18] = input[14];
+ bf1[19] = -input[17];
+ bf1[20] = input[6];
+ bf1[21] = -input[25];
+ bf1[22] = -input[9];
+ bf1[23] = input[22];
+ bf1[24] = input[2];
+ bf1[25] = -input[29];
+ bf1[26] = -input[13];
+ bf1[27] = input[18];
+ bf1[28] = -input[5];
+ bf1[29] = input[26];
+ bf1[30] = input[10];
+ bf1[31] = -input[21];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[32], bf0[18], -cospi[32], bf0[19], cos_bit[stage]);
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[32], bf0[22], -cospi[32], bf0[23], cos_bit[stage]);
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[32], bf0[26], -cospi[32], bf0[27], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[32], bf0[30], -cospi[32], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = bf0[8] - bf0[10];
+ bf1[11] = bf0[9] - bf0[11];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = bf0[12] - bf0[14];
+ bf1[15] = bf0[13] - bf0[15];
+ bf1[16] = bf0[16] + bf0[18];
+ bf1[17] = bf0[17] + bf0[19];
+ bf1[18] = bf0[16] - bf0[18];
+ bf1[19] = bf0[17] - bf0[19];
+ bf1[20] = bf0[20] + bf0[22];
+ bf1[21] = bf0[21] + bf0[23];
+ bf1[22] = bf0[20] - bf0[22];
+ bf1[23] = bf0[21] - bf0[23];
+ bf1[24] = bf0[24] + bf0[26];
+ bf1[25] = bf0[25] + bf0[27];
+ bf1[26] = bf0[24] - bf0[26];
+ bf1[27] = bf0[25] - bf0[27];
+ bf1[28] = bf0[28] + bf0[30];
+ bf1[29] = bf0[29] + bf0[31];
+ bf1[30] = bf0[28] - bf0[30];
+ bf1[31] = bf0[29] - bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[48], bf0[20], -cospi[16], bf0[21], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[16], bf0[22], cospi[48], bf0[23], cos_bit[stage]);
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[48], bf0[28], -cospi[16], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[16], bf0[30], cospi[48], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = bf0[8] - bf0[12];
+ bf1[13] = bf0[9] - bf0[13];
+ bf1[14] = bf0[10] - bf0[14];
+ bf1[15] = bf0[11] - bf0[15];
+ bf1[16] = bf0[16] + bf0[20];
+ bf1[17] = bf0[17] + bf0[21];
+ bf1[18] = bf0[18] + bf0[22];
+ bf1[19] = bf0[19] + bf0[23];
+ bf1[20] = bf0[16] - bf0[20];
+ bf1[21] = bf0[17] - bf0[21];
+ bf1[22] = bf0[18] - bf0[22];
+ bf1[23] = bf0[19] - bf0[23];
+ bf1[24] = bf0[24] + bf0[28];
+ bf1[25] = bf0[25] + bf0[29];
+ bf1[26] = bf0[26] + bf0[30];
+ bf1[27] = bf0[27] + bf0[31];
+ bf1[28] = bf0[24] - bf0[28];
+ bf1[29] = bf0[25] - bf0[29];
+ bf1[30] = bf0[26] - bf0[30];
+ bf1[31] = bf0[27] - bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[56], bf0[24], -cospi[8], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[24], bf0[26], -cospi[40], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[8], bf0[28], cospi[56], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[40], bf0[30], cospi[24], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = bf0[0] - bf0[8];
+ bf1[9] = bf0[1] - bf0[9];
+ bf1[10] = bf0[2] - bf0[10];
+ bf1[11] = bf0[3] - bf0[11];
+ bf1[12] = bf0[4] - bf0[12];
+ bf1[13] = bf0[5] - bf0[13];
+ bf1[14] = bf0[6] - bf0[14];
+ bf1[15] = bf0[7] - bf0[15];
+ bf1[16] = bf0[16] + bf0[24];
+ bf1[17] = bf0[17] + bf0[25];
+ bf1[18] = bf0[18] + bf0[26];
+ bf1[19] = bf0[19] + bf0[27];
+ bf1[20] = bf0[20] + bf0[28];
+ bf1[21] = bf0[21] + bf0[29];
+ bf1[22] = bf0[22] + bf0[30];
+ bf1[23] = bf0[23] + bf0[31];
+ bf1[24] = bf0[16] - bf0[24];
+ bf1[25] = bf0[17] - bf0[25];
+ bf1[26] = bf0[18] - bf0[26];
+ bf1[27] = bf0[19] - bf0[27];
+ bf1[28] = bf0[20] - bf0[28];
+ bf1[29] = bf0[21] - bf0[29];
+ bf1[30] = bf0[22] - bf0[30];
+ bf1[31] = bf0[23] - bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[60], bf0[16], -cospi[4], bf0[17], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[44], bf0[18], -cospi[20], bf0[19], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[28], bf0[20], -cospi[36], bf0[21], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[12], bf0[22], -cospi[52], bf0[23], cos_bit[stage]);
+ bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[4], bf0[24], cospi[60], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[20], bf0[26], cospi[44], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[36], bf0[28], cospi[28], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[52], bf0[30], cospi[12], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[16];
+ bf1[1] = bf0[1] + bf0[17];
+ bf1[2] = bf0[2] + bf0[18];
+ bf1[3] = bf0[3] + bf0[19];
+ bf1[4] = bf0[4] + bf0[20];
+ bf1[5] = bf0[5] + bf0[21];
+ bf1[6] = bf0[6] + bf0[22];
+ bf1[7] = bf0[7] + bf0[23];
+ bf1[8] = bf0[8] + bf0[24];
+ bf1[9] = bf0[9] + bf0[25];
+ bf1[10] = bf0[10] + bf0[26];
+ bf1[11] = bf0[11] + bf0[27];
+ bf1[12] = bf0[12] + bf0[28];
+ bf1[13] = bf0[13] + bf0[29];
+ bf1[14] = bf0[14] + bf0[30];
+ bf1[15] = bf0[15] + bf0[31];
+ bf1[16] = bf0[0] - bf0[16];
+ bf1[17] = bf0[1] - bf0[17];
+ bf1[18] = bf0[2] - bf0[18];
+ bf1[19] = bf0[3] - bf0[19];
+ bf1[20] = bf0[4] - bf0[20];
+ bf1[21] = bf0[5] - bf0[21];
+ bf1[22] = bf0[6] - bf0[22];
+ bf1[23] = bf0[7] - bf0[23];
+ bf1[24] = bf0[8] - bf0[24];
+ bf1[25] = bf0[9] - bf0[25];
+ bf1[26] = bf0[10] - bf0[26];
+ bf1[27] = bf0[11] - bf0[27];
+ bf1[28] = bf0[12] - bf0[28];
+ bf1[29] = bf0[13] - bf0[29];
+ bf1[30] = bf0[14] - bf0[30];
+ bf1[31] = bf0[15] - bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[63], bf0[0], -cospi[1], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[59], bf0[2], -cospi[5], bf0[3], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[55], bf0[4], -cospi[9], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[51], bf0[6], -cospi[13], bf0[7], cos_bit[stage]);
+ bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[47], bf0[8], -cospi[17], bf0[9], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[43], bf0[10], -cospi[21], bf0[11], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[39], bf0[12], -cospi[25], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[35], bf0[14], -cospi[29], bf0[15], cos_bit[stage]);
+ bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[31], bf0[16], -cospi[33], bf0[17], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[27], bf0[18], -cospi[37], bf0[19], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[23], bf0[20], -cospi[41], bf0[21], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[19], bf0[22], -cospi[45], bf0[23], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[15], bf0[24], -cospi[49], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[11], bf0[26], -cospi[53], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[7], bf0[28], -cospi[57], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[3], bf0[30], -cospi[61], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[30];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[28];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[26];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[24];
+ bf1[8] = bf0[9];
+ bf1[9] = bf0[22];
+ bf1[10] = bf0[11];
+ bf1[11] = bf0[20];
+ bf1[12] = bf0[13];
+ bf1[13] = bf0[18];
+ bf1[14] = bf0[15];
+ bf1[15] = bf0[16];
+ bf1[16] = bf0[17];
+ bf1[17] = bf0[14];
+ bf1[18] = bf0[19];
+ bf1[19] = bf0[12];
+ bf1[20] = bf0[21];
+ bf1[21] = bf0[10];
+ bf1[22] = bf0[23];
+ bf1[23] = bf0[8];
+ bf1[24] = bf0[25];
+ bf1[25] = bf0[6];
+ bf1[26] = bf0[27];
+ bf1[27] = bf0[4];
+ bf1[28] = bf0[29];
+ bf1[29] = bf0[2];
+ bf1[30] = bf0[31];
+ bf1[31] = bf0[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
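+  // The reordering above is the final output permutation of the iADST32:
+  // bf1[2 * i] = bf0[2 * i + 1] and bf1[2 * i + 1] = bf0[30 - 2 * i],
+  // e.g. bf1[31] = bf0[30 - 2 * 15] = bf0[0].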
+}
+
+#if CONFIG_TX64X64
+void av1_idct64_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 64;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
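+  // Each stage below reads from bf0 and writes to bf1, ping-ponging between
+  // `output` and the local `step` buffer so no extra scratch space is
+  // needed. half_btf() (a sketch of the definition in av1_txfm.h) is the
+  // half-butterfly primitive:
+  //   half_btf(w0, in0, w1, in1, bit) == round_shift(w0 * in0 + w1 * in1, bit)
+  // where cospi[j] holds cos(j * PI / 128) scaled by (1 << cos_bit[stage]).
+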
+  // stage 0
+ range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[32];
+ bf1[2] = input[16];
+ bf1[3] = input[48];
+ bf1[4] = input[8];
+ bf1[5] = input[40];
+ bf1[6] = input[24];
+ bf1[7] = input[56];
+ bf1[8] = input[4];
+ bf1[9] = input[36];
+ bf1[10] = input[20];
+ bf1[11] = input[52];
+ bf1[12] = input[12];
+ bf1[13] = input[44];
+ bf1[14] = input[28];
+ bf1[15] = input[60];
+ bf1[16] = input[2];
+ bf1[17] = input[34];
+ bf1[18] = input[18];
+ bf1[19] = input[50];
+ bf1[20] = input[10];
+ bf1[21] = input[42];
+ bf1[22] = input[26];
+ bf1[23] = input[58];
+ bf1[24] = input[6];
+ bf1[25] = input[38];
+ bf1[26] = input[22];
+ bf1[27] = input[54];
+ bf1[28] = input[14];
+ bf1[29] = input[46];
+ bf1[30] = input[30];
+ bf1[31] = input[62];
+ bf1[32] = input[1];
+ bf1[33] = input[33];
+ bf1[34] = input[17];
+ bf1[35] = input[49];
+ bf1[36] = input[9];
+ bf1[37] = input[41];
+ bf1[38] = input[25];
+ bf1[39] = input[57];
+ bf1[40] = input[5];
+ bf1[41] = input[37];
+ bf1[42] = input[21];
+ bf1[43] = input[53];
+ bf1[44] = input[13];
+ bf1[45] = input[45];
+ bf1[46] = input[29];
+ bf1[47] = input[61];
+ bf1[48] = input[3];
+ bf1[49] = input[35];
+ bf1[50] = input[19];
+ bf1[51] = input[51];
+ bf1[52] = input[11];
+ bf1[53] = input[43];
+ bf1[54] = input[27];
+ bf1[55] = input[59];
+ bf1[56] = input[7];
+ bf1[57] = input[39];
+ bf1[58] = input[23];
+ bf1[59] = input[55];
+ bf1[60] = input[15];
+ bf1[61] = input[47];
+ bf1[62] = input[31];
+ bf1[63] = input[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
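+  // Note: the load order above is the 6-bit bit-reversal permutation of the
+  // coefficient index, e.g. bf1[1] = input[32] because reversing the bits of
+  // 000001b gives 100000b = 32, and bf1[3] = input[48] because 000011b
+  // reversed is 110000b = 48.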
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit[stage]);
+ bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit[stage]);
+ bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit[stage]);
+ bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit[stage]);
+ bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit[stage]);
+ bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit[stage]);
+ bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit[stage]);
+ bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit[stage]);
+ bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit[stage]);
+ bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit[stage]);
+ bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit[stage]);
+ bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit[stage]);
+ bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit[stage]);
+ bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit[stage]);
+ bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit[stage]);
+ bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit[stage]);
+ bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit[stage]);
+ bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit[stage]);
+ bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
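+  // Stage 2 rotates each coefficient pair (bf0[32 + i], bf0[63 - i]) in
+  // place. The weights come in complementary pairs (cospi[j], cospi[64 - j]);
+  // since cos((64 - j) * PI / 128) == sin(j * PI / 128), each butterfly is a
+  // planar rotation of the pair.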
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
+ bf1[32] = bf0[32] + bf0[33];
+ bf1[33] = bf0[32] - bf0[33];
+ bf1[34] = -bf0[34] + bf0[35];
+ bf1[35] = bf0[34] + bf0[35];
+ bf1[36] = bf0[36] + bf0[37];
+ bf1[37] = bf0[36] - bf0[37];
+ bf1[38] = -bf0[38] + bf0[39];
+ bf1[39] = bf0[38] + bf0[39];
+ bf1[40] = bf0[40] + bf0[41];
+ bf1[41] = bf0[40] - bf0[41];
+ bf1[42] = -bf0[42] + bf0[43];
+ bf1[43] = bf0[42] + bf0[43];
+ bf1[44] = bf0[44] + bf0[45];
+ bf1[45] = bf0[44] - bf0[45];
+ bf1[46] = -bf0[46] + bf0[47];
+ bf1[47] = bf0[46] + bf0[47];
+ bf1[48] = bf0[48] + bf0[49];
+ bf1[49] = bf0[48] - bf0[49];
+ bf1[50] = -bf0[50] + bf0[51];
+ bf1[51] = bf0[50] + bf0[51];
+ bf1[52] = bf0[52] + bf0[53];
+ bf1[53] = bf0[52] - bf0[53];
+ bf1[54] = -bf0[54] + bf0[55];
+ bf1[55] = bf0[54] + bf0[55];
+ bf1[56] = bf0[56] + bf0[57];
+ bf1[57] = bf0[56] - bf0[57];
+ bf1[58] = -bf0[58] + bf0[59];
+ bf1[59] = bf0[58] + bf0[59];
+ bf1[60] = bf0[60] + bf0[61];
+ bf1[61] = bf0[60] - bf0[61];
+ bf1[62] = -bf0[62] + bf0[63];
+ bf1[63] = bf0[62] + bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = bf0[16] - bf0[17];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[18] + bf0[19];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = bf0[20] - bf0[21];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[22] + bf0[23];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = bf0[24] - bf0[25];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[26] + bf0[27];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = bf0[28] - bf0[29];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[30] + bf0[31];
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit[stage]);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit[stage]);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit[stage]);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit[stage]);
+ bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit[stage]);
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = bf0[8] - bf0[9];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[10] + bf0[11];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = bf0[12] - bf0[13];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[14] + bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[35];
+ bf1[33] = bf0[33] + bf0[34];
+ bf1[34] = bf0[33] - bf0[34];
+ bf1[35] = bf0[32] - bf0[35];
+ bf1[36] = -bf0[36] + bf0[39];
+ bf1[37] = -bf0[37] + bf0[38];
+ bf1[38] = bf0[37] + bf0[38];
+ bf1[39] = bf0[36] + bf0[39];
+ bf1[40] = bf0[40] + bf0[43];
+ bf1[41] = bf0[41] + bf0[42];
+ bf1[42] = bf0[41] - bf0[42];
+ bf1[43] = bf0[40] - bf0[43];
+ bf1[44] = -bf0[44] + bf0[47];
+ bf1[45] = -bf0[45] + bf0[46];
+ bf1[46] = bf0[45] + bf0[46];
+ bf1[47] = bf0[44] + bf0[47];
+ bf1[48] = bf0[48] + bf0[51];
+ bf1[49] = bf0[49] + bf0[50];
+ bf1[50] = bf0[49] - bf0[50];
+ bf1[51] = bf0[48] - bf0[51];
+ bf1[52] = -bf0[52] + bf0[55];
+ bf1[53] = -bf0[53] + bf0[54];
+ bf1[54] = bf0[53] + bf0[54];
+ bf1[55] = bf0[52] + bf0[55];
+ bf1[56] = bf0[56] + bf0[59];
+ bf1[57] = bf0[57] + bf0[58];
+ bf1[58] = bf0[57] - bf0[58];
+ bf1[59] = bf0[56] - bf0[59];
+ bf1[60] = -bf0[60] + bf0[63];
+ bf1[61] = -bf0[61] + bf0[62];
+ bf1[62] = bf0[61] + bf0[62];
+ bf1[63] = bf0[60] + bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = bf0[4] - bf0[5];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[6] + bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = bf0[17] - bf0[18];
+ bf1[19] = bf0[16] - bf0[19];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[21] + bf0[22];
+ bf1[23] = bf0[20] + bf0[23];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = bf0[25] - bf0[26];
+ bf1[27] = bf0[24] - bf0[27];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[29] + bf0[30];
+ bf1[31] = bf0[28] + bf0[31];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit[stage]);
+ bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit[stage]);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit[stage]);
+ bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit[stage]);
+ bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit[stage]);
+ bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit[stage]);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = bf0[1] - bf0[2];
+ bf1[3] = bf0[0] - bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = bf0[9] - bf0[10];
+ bf1[11] = bf0[8] - bf0[11];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[13] + bf0[14];
+ bf1[15] = bf0[12] + bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[39];
+ bf1[33] = bf0[33] + bf0[38];
+ bf1[34] = bf0[34] + bf0[37];
+ bf1[35] = bf0[35] + bf0[36];
+ bf1[36] = bf0[35] - bf0[36];
+ bf1[37] = bf0[34] - bf0[37];
+ bf1[38] = bf0[33] - bf0[38];
+ bf1[39] = bf0[32] - bf0[39];
+ bf1[40] = -bf0[40] + bf0[47];
+ bf1[41] = -bf0[41] + bf0[46];
+ bf1[42] = -bf0[42] + bf0[45];
+ bf1[43] = -bf0[43] + bf0[44];
+ bf1[44] = bf0[43] + bf0[44];
+ bf1[45] = bf0[42] + bf0[45];
+ bf1[46] = bf0[41] + bf0[46];
+ bf1[47] = bf0[40] + bf0[47];
+ bf1[48] = bf0[48] + bf0[55];
+ bf1[49] = bf0[49] + bf0[54];
+ bf1[50] = bf0[50] + bf0[53];
+ bf1[51] = bf0[51] + bf0[52];
+ bf1[52] = bf0[51] - bf0[52];
+ bf1[53] = bf0[50] - bf0[53];
+ bf1[54] = bf0[49] - bf0[54];
+ bf1[55] = bf0[48] - bf0[55];
+ bf1[56] = -bf0[56] + bf0[63];
+ bf1[57] = -bf0[57] + bf0[62];
+ bf1[58] = -bf0[58] + bf0[61];
+ bf1[59] = -bf0[59] + bf0[60];
+ bf1[60] = bf0[59] + bf0[60];
+ bf1[61] = bf0[58] + bf0[61];
+ bf1[62] = bf0[57] + bf0[62];
+ bf1[63] = bf0[56] + bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = bf0[3] - bf0[4];
+ bf1[5] = bf0[2] - bf0[5];
+ bf1[6] = bf0[1] - bf0[6];
+ bf1[7] = bf0[0] - bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = bf0[19] - bf0[20];
+ bf1[21] = bf0[18] - bf0[21];
+ bf1[22] = bf0[17] - bf0[22];
+ bf1[23] = bf0[16] - bf0[23];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[27] + bf0[28];
+ bf1[29] = bf0[26] + bf0[29];
+ bf1[30] = bf0[25] + bf0[30];
+ bf1[31] = bf0[24] + bf0[31];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit[stage]);
+ bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit[stage]);
+ bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit[stage]);
+ bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit[stage]);
+ bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit[stage]);
+ bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit[stage]);
+ bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit[stage]);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = bf0[7] - bf0[8];
+ bf1[9] = bf0[6] - bf0[9];
+ bf1[10] = bf0[5] - bf0[10];
+ bf1[11] = bf0[4] - bf0[11];
+ bf1[12] = bf0[3] - bf0[12];
+ bf1[13] = bf0[2] - bf0[13];
+ bf1[14] = bf0[1] - bf0[14];
+ bf1[15] = bf0[0] - bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[47];
+ bf1[33] = bf0[33] + bf0[46];
+ bf1[34] = bf0[34] + bf0[45];
+ bf1[35] = bf0[35] + bf0[44];
+ bf1[36] = bf0[36] + bf0[43];
+ bf1[37] = bf0[37] + bf0[42];
+ bf1[38] = bf0[38] + bf0[41];
+ bf1[39] = bf0[39] + bf0[40];
+ bf1[40] = bf0[39] - bf0[40];
+ bf1[41] = bf0[38] - bf0[41];
+ bf1[42] = bf0[37] - bf0[42];
+ bf1[43] = bf0[36] - bf0[43];
+ bf1[44] = bf0[35] - bf0[44];
+ bf1[45] = bf0[34] - bf0[45];
+ bf1[46] = bf0[33] - bf0[46];
+ bf1[47] = bf0[32] - bf0[47];
+ bf1[48] = -bf0[48] + bf0[63];
+ bf1[49] = -bf0[49] + bf0[62];
+ bf1[50] = -bf0[50] + bf0[61];
+ bf1[51] = -bf0[51] + bf0[60];
+ bf1[52] = -bf0[52] + bf0[59];
+ bf1[53] = -bf0[53] + bf0[58];
+ bf1[54] = -bf0[54] + bf0[57];
+ bf1[55] = -bf0[55] + bf0[56];
+ bf1[56] = bf0[55] + bf0[56];
+ bf1[57] = bf0[54] + bf0[57];
+ bf1[58] = bf0[53] + bf0[58];
+ bf1[59] = bf0[52] + bf0[59];
+ bf1[60] = bf0[51] + bf0[60];
+ bf1[61] = bf0[50] + bf0[61];
+ bf1[62] = bf0[49] + bf0[62];
+ bf1[63] = bf0[48] + bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[31];
+ bf1[1] = bf0[1] + bf0[30];
+ bf1[2] = bf0[2] + bf0[29];
+ bf1[3] = bf0[3] + bf0[28];
+ bf1[4] = bf0[4] + bf0[27];
+ bf1[5] = bf0[5] + bf0[26];
+ bf1[6] = bf0[6] + bf0[25];
+ bf1[7] = bf0[7] + bf0[24];
+ bf1[8] = bf0[8] + bf0[23];
+ bf1[9] = bf0[9] + bf0[22];
+ bf1[10] = bf0[10] + bf0[21];
+ bf1[11] = bf0[11] + bf0[20];
+ bf1[12] = bf0[12] + bf0[19];
+ bf1[13] = bf0[13] + bf0[18];
+ bf1[14] = bf0[14] + bf0[17];
+ bf1[15] = bf0[15] + bf0[16];
+ bf1[16] = bf0[15] - bf0[16];
+ bf1[17] = bf0[14] - bf0[17];
+ bf1[18] = bf0[13] - bf0[18];
+ bf1[19] = bf0[12] - bf0[19];
+ bf1[20] = bf0[11] - bf0[20];
+ bf1[21] = bf0[10] - bf0[21];
+ bf1[22] = bf0[9] - bf0[22];
+ bf1[23] = bf0[8] - bf0[23];
+ bf1[24] = bf0[7] - bf0[24];
+ bf1[25] = bf0[6] - bf0[25];
+ bf1[26] = bf0[5] - bf0[26];
+ bf1[27] = bf0[4] - bf0[27];
+ bf1[28] = bf0[3] - bf0[28];
+ bf1[29] = bf0[2] - bf0[29];
+ bf1[30] = bf0[1] - bf0[30];
+ bf1[31] = bf0[0] - bf0[31];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+ bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+ bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+ bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+ bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[63];
+ bf1[1] = bf0[1] + bf0[62];
+ bf1[2] = bf0[2] + bf0[61];
+ bf1[3] = bf0[3] + bf0[60];
+ bf1[4] = bf0[4] + bf0[59];
+ bf1[5] = bf0[5] + bf0[58];
+ bf1[6] = bf0[6] + bf0[57];
+ bf1[7] = bf0[7] + bf0[56];
+ bf1[8] = bf0[8] + bf0[55];
+ bf1[9] = bf0[9] + bf0[54];
+ bf1[10] = bf0[10] + bf0[53];
+ bf1[11] = bf0[11] + bf0[52];
+ bf1[12] = bf0[12] + bf0[51];
+ bf1[13] = bf0[13] + bf0[50];
+ bf1[14] = bf0[14] + bf0[49];
+ bf1[15] = bf0[15] + bf0[48];
+ bf1[16] = bf0[16] + bf0[47];
+ bf1[17] = bf0[17] + bf0[46];
+ bf1[18] = bf0[18] + bf0[45];
+ bf1[19] = bf0[19] + bf0[44];
+ bf1[20] = bf0[20] + bf0[43];
+ bf1[21] = bf0[21] + bf0[42];
+ bf1[22] = bf0[22] + bf0[41];
+ bf1[23] = bf0[23] + bf0[40];
+ bf1[24] = bf0[24] + bf0[39];
+ bf1[25] = bf0[25] + bf0[38];
+ bf1[26] = bf0[26] + bf0[37];
+ bf1[27] = bf0[27] + bf0[36];
+ bf1[28] = bf0[28] + bf0[35];
+ bf1[29] = bf0[29] + bf0[34];
+ bf1[30] = bf0[30] + bf0[33];
+ bf1[31] = bf0[31] + bf0[32];
+ bf1[32] = bf0[31] - bf0[32];
+ bf1[33] = bf0[30] - bf0[33];
+ bf1[34] = bf0[29] - bf0[34];
+ bf1[35] = bf0[28] - bf0[35];
+ bf1[36] = bf0[27] - bf0[36];
+ bf1[37] = bf0[26] - bf0[37];
+ bf1[38] = bf0[25] - bf0[38];
+ bf1[39] = bf0[24] - bf0[39];
+ bf1[40] = bf0[23] - bf0[40];
+ bf1[41] = bf0[22] - bf0[41];
+ bf1[42] = bf0[21] - bf0[42];
+ bf1[43] = bf0[20] - bf0[43];
+ bf1[44] = bf0[19] - bf0[44];
+ bf1[45] = bf0[18] - bf0[45];
+ bf1[46] = bf0[17] - bf0[46];
+ bf1[47] = bf0[16] - bf0[47];
+ bf1[48] = bf0[15] - bf0[48];
+ bf1[49] = bf0[14] - bf0[49];
+ bf1[50] = bf0[13] - bf0[50];
+ bf1[51] = bf0[12] - bf0[51];
+ bf1[52] = bf0[11] - bf0[52];
+ bf1[53] = bf0[10] - bf0[53];
+ bf1[54] = bf0[9] - bf0[54];
+ bf1[55] = bf0[8] - bf0[55];
+ bf1[56] = bf0[7] - bf0[56];
+ bf1[57] = bf0[6] - bf0[57];
+ bf1[58] = bf0[5] - bf0[58];
+ bf1[59] = bf0[4] - bf0[59];
+ bf1[60] = bf0[3] - bf0[60];
+ bf1[61] = bf0[2] - bf0[61];
+ bf1[62] = bf0[1] - bf0[62];
+ bf1[63] = bf0[0] - bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+#endif // CONFIG_TX64X64
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h
new file mode 100644
index 000000000..9e7a2323b
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_INV_TXFM1D_H_
+#define AV1_INV_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
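+// All of the 1-D inverse transforms below share one signature: `input` and
+// `output` are coefficient arrays of the transform's length, `cos_bit[i]`
+// gives the bit precision of the cosine constants used at stage i, and
+// `stage_range[i]` bounds the signal bit width checked at stage i.
+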
+void av1_idct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range);
+void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range);
+void av1_idct16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct64_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst8_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AV1_INV_TXFM1D_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm2d.c b/third_party/aom/av1/common/av1_inv_txfm2d.c
new file mode 100644
index 000000000..d56c7d11f
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm2d.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+
+static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT4: return av1_idct4_new;
+ case TXFM_TYPE_DCT8: return av1_idct8_new;
+ case TXFM_TYPE_DCT16: return av1_idct16_new;
+    case TXFM_TYPE_DCT32: return av1_idct32_new;
+#if CONFIG_TX64X64
+    // Without this case, av1_inv_txfm2d_add_64x64_c() below would reach the
+    // assert, since its config uses TXFM_TYPE_DCT64.
+    case TXFM_TYPE_DCT64: return av1_idct64_new;
+#endif  // CONFIG_TX64X64
+ case TXFM_TYPE_ADST4: return av1_iadst4_new;
+ case TXFM_TYPE_ADST8: return av1_iadst8_new;
+ case TXFM_TYPE_ADST16: return av1_iadst16_new;
+ case TXFM_TYPE_ADST32: return av1_iadst32_new;
+ default: assert(0); return NULL;
+ }
+}
+
+const TXFM_2D_CFG *inv_txfm_cfg_ls[TX_TYPES][TX_SIZES] = {
+ // DCT_DCT
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_dct_dct_4, &inv_txfm_2d_cfg_dct_dct_8,
+ &inv_txfm_2d_cfg_dct_dct_16, &inv_txfm_2d_cfg_dct_dct_32 },
+ // ADST_DCT
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_dct_4, &inv_txfm_2d_cfg_adst_dct_8,
+ &inv_txfm_2d_cfg_adst_dct_16, &inv_txfm_2d_cfg_adst_dct_32 },
+ // DCT_ADST
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_dct_adst_4, &inv_txfm_2d_cfg_dct_adst_8,
+ &inv_txfm_2d_cfg_dct_adst_16, &inv_txfm_2d_cfg_dct_adst_32 },
+ // ADST_ADST
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+#if CONFIG_EXT_TX
+ // FLIPADST_DCT
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_dct_4, &inv_txfm_2d_cfg_adst_dct_8,
+ &inv_txfm_2d_cfg_adst_dct_16, &inv_txfm_2d_cfg_adst_dct_32 },
+ // DCT_FLIPADST
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_dct_adst_4, &inv_txfm_2d_cfg_dct_adst_8,
+ &inv_txfm_2d_cfg_dct_adst_16, &inv_txfm_2d_cfg_dct_adst_32 },
+ // FLIPADST_FLIPADST
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+ // ADST_FLIPADST
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+ // FLIPADST_ADST
+ {
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+ { // IDTX
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+ { // V_DCT
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_dct_adst_4, &inv_txfm_2d_cfg_dct_adst_8,
+ &inv_txfm_2d_cfg_dct_adst_16, &inv_txfm_2d_cfg_dct_adst_32 },
+ { // H_DCT
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_dct_4, &inv_txfm_2d_cfg_adst_dct_8,
+ &inv_txfm_2d_cfg_adst_dct_16, &inv_txfm_2d_cfg_adst_dct_32 },
+ { // V_ADST
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+ { // H_ADST
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+ { // V_FLIP_ADST
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+ { // H_FLIP_ADST
+#if CONFIG_CB4X4
+ NULL,
+#endif
+ &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+#endif // CONFIG_EXT_TX
+};
+
+TXFM_2D_FLIP_CFG av1_get_inv_txfm_cfg(int tx_type, int tx_size) {
+ TXFM_2D_FLIP_CFG cfg;
+ set_flip_cfg(tx_type, &cfg);
+ cfg.cfg = inv_txfm_cfg_ls[tx_type][tx_size];
+ return cfg;
+}
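+
+// A usage sketch (hypothetical caller): for a 16x16 ADST_DCT block,
+//   TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_cfg(ADST_DCT, TX_16X16);
+// returns cfg.cfg == &inv_txfm_2d_cfg_adst_dct_16 with both flip flags
+// clear; the FLIPADST variants reuse the same config tables and differ only
+// in the ud_flip/lr_flip flags that set_flip_cfg() sets.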
+
+TXFM_2D_FLIP_CFG av1_get_inv_txfm_64x64_cfg(int tx_type) {
+ TXFM_2D_FLIP_CFG cfg = { 0, 0, NULL };
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg.cfg = &inv_txfm_2d_cfg_dct_dct_64;
+ set_flip_cfg(tx_type, &cfg);
+ break;
+ default: assert(0);
+ }
+ return cfg;
+}
+
+static INLINE void inv_txfm2d_add_c(const int32_t *input, int16_t *output,
+ int stride, TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ const int txfm_size = cfg->cfg->txfm_size;
+ const int8_t *shift = cfg->cfg->shift;
+ const int8_t *stage_range_col = cfg->cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->cfg->stage_range_row;
+ const int8_t *cos_bit_col = cfg->cfg->cos_bit_col;
+ const int8_t *cos_bit_row = cfg->cfg->cos_bit_row;
+ const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->cfg->txfm_type_col);
+ const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->cfg->txfm_type_row);
+
+  // txfm_buf must have length txfm_size * txfm_size + 2 * txfm_size;
+  // it is used to buffer intermediate results between the two passes.
+ int32_t *temp_in = txfm_buf;
+ int32_t *temp_out = temp_in + txfm_size;
+ int32_t *buf = temp_out + txfm_size;
+ int32_t *buf_ptr = buf;
+ int c, r;
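+
+  // txfm_buf layout (N == txfm_size):
+  //   [0, N)         temp_in:  one column gathered for the column transform
+  //   [N, 2N)        temp_out: the transformed column before the final add
+  //   [2N, 2N + N*N) buf:      row-pass results for the whole block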
+
+ // Rows
+ for (r = 0; r < txfm_size; ++r) {
+ txfm_func_row(input, buf_ptr, cos_bit_row, stage_range_row);
+ round_shift_array(buf_ptr, txfm_size, -shift[0]);
+ input += txfm_size;
+ buf_ptr += txfm_size;
+ }
+
+ // Columns
+ for (c = 0; c < txfm_size; ++c) {
+ if (cfg->lr_flip == 0) {
+ for (r = 0; r < txfm_size; ++r) temp_in[r] = buf[r * txfm_size + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size; ++r)
+ temp_in[r] = buf[r * txfm_size + (txfm_size - c - 1)];
+ }
+ txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+ round_shift_array(temp_out, txfm_size, -shift[1]);
+ if (cfg->ud_flip == 0) {
+ for (r = 0; r < txfm_size; ++r) output[r * stride + c] += temp_out[r];
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size; ++r)
+ output[r * stride + c] += temp_out[txfm_size - r - 1];
+ }
+ }
+}
+
+static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output,
+ int stride, int32_t *txfm_buf,
+ int tx_type, int tx_size, int bd) {
+  // The output buffer holds the prediction signal, which is always
+  // non-negative and at most (1 << bd) - 1. Since bd < 15, these values
+  // also fit in int16_t, so the uint16_t *output buffer can safely be
+  // treated as an int16_t *.
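+  // (For example, with bd = 10 the prediction samples lie in [0, 1023],
+  // which is representable in int16_t.)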
+ TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_cfg(tx_type, tx_size);
+ inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+ clamp_block((int16_t *)output, cfg.cfg->txfm_size, stride, 0, (1 << bd) - 1);
+}
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
+ int stride, int tx_type, int bd) {
+  int32_t txfm_buf[4 * 4 + 4 + 4];
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd);
+}
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output,
+ int stride, int tx_type, int bd) {
+  int32_t txfm_buf[8 * 8 + 8 + 8];
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd);
+}
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output,
+ int stride, int tx_type, int bd) {
+  int32_t txfm_buf[16 * 16 + 16 + 16];
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd);
+}
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output,
+ int stride, int tx_type, int bd) {
+  int32_t txfm_buf[32 * 32 + 32 + 32];
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd);
+}
+
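+// 64x64 is special-cased: inv_txfm_cfg_ls[][] only covers sizes up to
+// TX_32X32 and only DCT_DCT is supported at this size, so the config comes
+// from av1_get_inv_txfm_64x64_cfg() instead of the facade above.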
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
+ int stride, int tx_type, int bd) {
+  int32_t txfm_buf[64 * 64 + 64 + 64];
+  // The output buffer holds the prediction signal, which is always
+  // non-negative and at most (1 << bd) - 1. Since bd < 15, these values
+  // also fit in int16_t, so the uint16_t *output buffer can safely be
+  // treated as an int16_t *.
+ TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_64x64_cfg(tx_type);
+ inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+ clamp_block((int16_t *)output, 64, stride, 0, (1 << bd) - 1);
+}
diff --git a/third_party/aom/av1/common/av1_inv_txfm2d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm2d_cfg.h
new file mode 100644
index 000000000..9eabc2e5a
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm2d_cfg.h
@@ -0,0 +1,447 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_INV_TXFM2D_CFG_H_
+#define AV1_INV_TXFM2D_CFG_H_
+#include "av1/common/av1_inv_txfm1d.h"
+// ---------------- config inv_dct_dct_4 ----------------
+static const int8_t inv_shift_dct_dct_4[2] = { 0, -4 };
+static const int8_t inv_stage_range_col_dct_dct_4[4] = { 18, 18, 17, 17 };
+static const int8_t inv_stage_range_row_dct_dct_4[4] = { 18, 18, 18, 18 };
+static const int8_t inv_cos_bit_col_dct_dct_4[4] = { 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_dct_4[4] = { 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_4 = {
+ 4, // .txfm_size
+ 4, // .stage_num_col
+ 4, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_dct_4, // .shift
+ inv_stage_range_col_dct_dct_4, // .stage_range_col
+ inv_stage_range_row_dct_dct_4, // .stage_range_row
+ inv_cos_bit_col_dct_dct_4, // .cos_bit_col
+ inv_cos_bit_row_dct_dct_4, // .cos_bit_row
+ TXFM_TYPE_DCT4, // .txfm_type_col
+  TXFM_TYPE_DCT4  // .txfm_type_row
+};
+
+// ---------------- config inv_dct_dct_8 ----------------
+static const int8_t inv_shift_dct_dct_8[2] = { 0, -5 };
+static const int8_t inv_stage_range_col_dct_dct_8[6] = {
+ 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_dct_dct_8[6] = {
+ 19, 19, 19, 19, 19, 19
+};
+static const int8_t inv_cos_bit_col_dct_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_8 = {
+ 8, // .txfm_size
+ 6, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_dct_8, // .shift
+ inv_stage_range_col_dct_dct_8, // .stage_range_col
+ inv_stage_range_row_dct_dct_8, // .stage_range_row
+ inv_cos_bit_col_dct_dct_8, // .cos_bit_col
+ inv_cos_bit_row_dct_dct_8, // .cos_bit_row
+ TXFM_TYPE_DCT8, // .txfm_type_col
+  TXFM_TYPE_DCT8  // .txfm_type_row
+};
+
+// ---------------- config inv_dct_dct_16 ----------------
+static const int8_t inv_shift_dct_dct_16[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_dct_dct_16[8] = { 19, 19, 19, 19,
+ 19, 19, 18, 18 };
+static const int8_t inv_stage_range_row_dct_dct_16[8] = { 20, 20, 20, 20,
+ 20, 20, 20, 20 };
+static const int8_t inv_cos_bit_col_dct_dct_16[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_dct_16[8] = { 12, 12, 12, 12,
+ 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_16 = {
+ 16, // .txfm_size
+ 8, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_dct_16, // .shift
+ inv_stage_range_col_dct_dct_16, // .stage_range_col
+ inv_stage_range_row_dct_dct_16, // .stage_range_row
+ inv_cos_bit_col_dct_dct_16, // .cos_bit_col
+ inv_cos_bit_row_dct_dct_16, // .cos_bit_row
+ TXFM_TYPE_DCT16, // .txfm_type_col
+  TXFM_TYPE_DCT16  // .txfm_type_row
+};
+
+// ---------------- config inv_dct_dct_32 ----------------
+static const int8_t inv_shift_dct_dct_32[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_dct_dct_32[10] = { 19, 19, 19, 19, 19,
+ 19, 19, 19, 18, 18 };
+static const int8_t inv_stage_range_row_dct_dct_32[10] = { 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20 };
+static const int8_t inv_cos_bit_col_dct_dct_32[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_dct_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_32 = {
+ 32, // .txfm_size
+ 10, // .stage_num_col
+ 10, // .stage_num_row
+ // 1, // .log_scale
+ inv_shift_dct_dct_32, // .shift
+ inv_stage_range_col_dct_dct_32, // .stage_range_col
+ inv_stage_range_row_dct_dct_32, // .stage_range_row
+ inv_cos_bit_col_dct_dct_32, // .cos_bit_col
+ inv_cos_bit_row_dct_dct_32, // .cos_bit_row
+ TXFM_TYPE_DCT32, // .txfm_type_col
+  TXFM_TYPE_DCT32  // .txfm_type_row
+};
+
+// ---------------- config inv_dct_dct_64 ----------------
+static const int8_t inv_shift_dct_dct_64[2] = { -1, -7 };
+static const int8_t inv_stage_range_col_dct_dct_64[12] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_dct_dct_64[12] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_dct_dct_64[12] = { 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_dct_64[12] = { 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_64 = {
+ 64, // .txfm_size
+ 12, // .stage_num_col
+ 12, // .stage_num_row
+ inv_shift_dct_dct_64, // .shift
+ inv_stage_range_col_dct_dct_64, // .stage_range_col
+ inv_stage_range_row_dct_dct_64, // .stage_range_row
+ inv_cos_bit_col_dct_dct_64, // .cos_bit_col
+ inv_cos_bit_row_dct_dct_64, // .cos_bit_row
+ TXFM_TYPE_DCT64, // .txfm_type_col
+  TXFM_TYPE_DCT64  // .txfm_type_row
+};
+
+// ---------------- config inv_dct_adst_4 ----------------
+static const int8_t inv_shift_dct_adst_4[2] = { 0, -4 };
+static const int8_t inv_stage_range_col_dct_adst_4[4] = { 18, 18, 17, 17 };
+static const int8_t inv_stage_range_row_dct_adst_4[6] = {
+ 18, 18, 18, 18, 18, 18
+};
+static const int8_t inv_cos_bit_col_dct_adst_4[4] = { 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_4 = {
+ 4, // .txfm_size
+ 4, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_adst_4, // .shift
+ inv_stage_range_col_dct_adst_4, // .stage_range_col
+ inv_stage_range_row_dct_adst_4, // .stage_range_row
+ inv_cos_bit_col_dct_adst_4, // .cos_bit_col
+ inv_cos_bit_row_dct_adst_4, // .cos_bit_row
+ TXFM_TYPE_DCT4, // .txfm_type_col
+  TXFM_TYPE_ADST4  // .txfm_type_row
+};
+
+// ---------------- config inv_dct_adst_8 ----------------
+static const int8_t inv_shift_dct_adst_8[2] = { 0, -5 };
+static const int8_t inv_stage_range_col_dct_adst_8[6] = {
+ 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_dct_adst_8[8] = { 19, 19, 19, 19,
+ 19, 19, 19, 19 };
+static const int8_t inv_cos_bit_col_dct_adst_8[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_8 = {
+ 8, // .txfm_size
+ 6, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_adst_8, // .shift
+ inv_stage_range_col_dct_adst_8, // .stage_range_col
+ inv_stage_range_row_dct_adst_8, // .stage_range_row
+ inv_cos_bit_col_dct_adst_8, // .cos_bit_col
+ inv_cos_bit_row_dct_adst_8, // .cos_bit_row
+ TXFM_TYPE_DCT8, // .txfm_type_col
+  TXFM_TYPE_ADST8  // .txfm_type_row
+};
+
+// ---------------- config inv_dct_adst_16 ----------------
+static const int8_t inv_shift_dct_adst_16[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_dct_adst_16[8] = { 19, 19, 19, 19,
+ 19, 19, 18, 18 };
+static const int8_t inv_stage_range_row_dct_adst_16[10] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_dct_adst_16[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_adst_16[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_16 = {
+ 16, // .txfm_size
+ 8, // .stage_num_col
+ 10, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_adst_16, // .shift
+ inv_stage_range_col_dct_adst_16, // .stage_range_col
+ inv_stage_range_row_dct_adst_16, // .stage_range_row
+ inv_cos_bit_col_dct_adst_16, // .cos_bit_col
+ inv_cos_bit_row_dct_adst_16, // .cos_bit_row
+ TXFM_TYPE_DCT16, // .txfm_type_col
+  TXFM_TYPE_ADST16  // .txfm_type_row
+};
+
+// ---------------- config inv_dct_adst_32 ----------------
+static const int8_t inv_shift_dct_adst_32[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_dct_adst_32[10] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_dct_adst_32[12] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_dct_adst_32[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_adst_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_32 = {
+ 32, // .txfm_size
+ 10, // .stage_num_col
+ 12, // .stage_num_row
+ // 1, // .log_scale
+ inv_shift_dct_adst_32, // .shift
+ inv_stage_range_col_dct_adst_32, // .stage_range_col
+ inv_stage_range_row_dct_adst_32, // .stage_range_row
+ inv_cos_bit_col_dct_adst_32, // .cos_bit_col
+ inv_cos_bit_row_dct_adst_32, // .cos_bit_row
+ TXFM_TYPE_DCT32, // .txfm_type_col
+  TXFM_TYPE_ADST32  // .txfm_type_row
+};
+
+// ---------------- config inv_adst_adst_4 ----------------
+static const int8_t inv_shift_adst_adst_4[2] = { 0, -4 };
+static const int8_t inv_stage_range_col_adst_adst_4[6] = { 18, 18, 18,
+ 18, 17, 17 };
+static const int8_t inv_stage_range_row_adst_adst_4[6] = { 18, 18, 18,
+ 18, 18, 18 };
+static const int8_t inv_cos_bit_col_adst_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_4 = {
+ 4, // .txfm_size
+ 6, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_adst_4, // .shift
+ inv_stage_range_col_adst_adst_4, // .stage_range_col
+ inv_stage_range_row_adst_adst_4, // .stage_range_row
+ inv_cos_bit_col_adst_adst_4, // .cos_bit_col
+ inv_cos_bit_row_adst_adst_4, // .cos_bit_row
+ TXFM_TYPE_ADST4, // .txfm_type_col
+  TXFM_TYPE_ADST4  // .txfm_type_row
+};
+
+// ---------------- config inv_adst_adst_8 ----------------
+static const int8_t inv_shift_adst_adst_8[2] = { 0, -5 };
+static const int8_t inv_stage_range_col_adst_adst_8[8] = { 19, 19, 19, 19,
+ 19, 19, 18, 18 };
+static const int8_t inv_stage_range_row_adst_adst_8[8] = { 19, 19, 19, 19,
+ 19, 19, 19, 19 };
+static const int8_t inv_cos_bit_col_adst_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_8 = {
+ 8, // .txfm_size
+ 8, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_adst_8, // .shift
+ inv_stage_range_col_adst_adst_8, // .stage_range_col
+ inv_stage_range_row_adst_adst_8, // .stage_range_row
+ inv_cos_bit_col_adst_adst_8, // .cos_bit_col
+ inv_cos_bit_row_adst_adst_8, // .cos_bit_row
+ TXFM_TYPE_ADST8, // .txfm_type_col
+  TXFM_TYPE_ADST8  // .txfm_type_row
+};
+
+// ---------------- config inv_adst_adst_16 ----------------
+static const int8_t inv_shift_adst_adst_16[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_adst_adst_16[10] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_adst_adst_16[10] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_adst_adst_16[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_adst_16[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_16 = {
+ 16, // .txfm_size
+ 10, // .stage_num_col
+ 10, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_adst_16, // .shift
+ inv_stage_range_col_adst_adst_16, // .stage_range_col
+ inv_stage_range_row_adst_adst_16, // .stage_range_row
+ inv_cos_bit_col_adst_adst_16, // .cos_bit_col
+ inv_cos_bit_row_adst_adst_16, // .cos_bit_row
+ TXFM_TYPE_ADST16, // .txfm_type_col
+  TXFM_TYPE_ADST16  // .txfm_type_row
+};
+
+// ---------------- config inv_adst_adst_32 ----------------
+static const int8_t inv_shift_adst_adst_32[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_adst_adst_32[12] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_adst_adst_32[12] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_adst_adst_32[12] = {
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13
+};
+static const int8_t inv_cos_bit_row_adst_adst_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_32 = {
+ 32, // .txfm_size
+ 12, // .stage_num_col
+ 12, // .stage_num_row
+ // 1, // .log_scale
+ inv_shift_adst_adst_32, // .shift
+ inv_stage_range_col_adst_adst_32, // .stage_range_col
+ inv_stage_range_row_adst_adst_32, // .stage_range_row
+ inv_cos_bit_col_adst_adst_32, // .cos_bit_col
+ inv_cos_bit_row_adst_adst_32, // .cos_bit_row
+ TXFM_TYPE_ADST32, // .txfm_type_col
+  TXFM_TYPE_ADST32  // .txfm_type_row
+};
+
+// ---------------- config inv_adst_dct_4 ----------------
+static const int8_t inv_shift_adst_dct_4[2] = { 0, -4 };
+static const int8_t inv_stage_range_col_adst_dct_4[6] = {
+ 18, 18, 18, 18, 17, 17
+};
+static const int8_t inv_stage_range_row_adst_dct_4[4] = { 18, 18, 18, 18 };
+static const int8_t inv_cos_bit_col_adst_dct_4[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_dct_4[4] = { 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_4 = {
+ 4, // .txfm_size
+ 6, // .stage_num_col
+ 4, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_dct_4, // .shift
+ inv_stage_range_col_adst_dct_4, // .stage_range_col
+ inv_stage_range_row_adst_dct_4, // .stage_range_row
+ inv_cos_bit_col_adst_dct_4, // .cos_bit_col
+ inv_cos_bit_row_adst_dct_4, // .cos_bit_row
+ TXFM_TYPE_ADST4, // .txfm_type_col
+  TXFM_TYPE_DCT4  // .txfm_type_row
+};
+
+// ---------------- config inv_adst_dct_8 ----------------
+static const int8_t inv_shift_adst_dct_8[2] = { 0, -5 };
+static const int8_t inv_stage_range_col_adst_dct_8[8] = { 19, 19, 19, 19,
+ 19, 19, 18, 18 };
+static const int8_t inv_stage_range_row_adst_dct_8[6] = {
+ 19, 19, 19, 19, 19, 19
+};
+static const int8_t inv_cos_bit_col_adst_dct_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_8 = {
+ 8, // .txfm_size
+ 8, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_dct_8, // .shift
+ inv_stage_range_col_adst_dct_8, // .stage_range_col
+ inv_stage_range_row_adst_dct_8, // .stage_range_row
+ inv_cos_bit_col_adst_dct_8, // .cos_bit_col
+ inv_cos_bit_row_adst_dct_8, // .cos_bit_row
+ TXFM_TYPE_ADST8, // .txfm_type_col
+  TXFM_TYPE_DCT8          // .txfm_type_row
+};
+
+// ---------------- config inv_adst_dct_16 ----------------
+static const int8_t inv_shift_adst_dct_16[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_adst_dct_16[10] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_adst_dct_16[8] = { 20, 20, 20, 20,
+ 20, 20, 20, 20 };
+static const int8_t inv_cos_bit_col_adst_dct_16[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_dct_16[8] = { 12, 12, 12, 12,
+ 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_16 = {
+ 16, // .txfm_size
+ 10, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_dct_16, // .shift
+ inv_stage_range_col_adst_dct_16, // .stage_range_col
+ inv_stage_range_row_adst_dct_16, // .stage_range_row
+ inv_cos_bit_col_adst_dct_16, // .cos_bit_col
+ inv_cos_bit_row_adst_dct_16, // .cos_bit_row
+ TXFM_TYPE_ADST16, // .txfm_type_col
+  TXFM_TYPE_DCT16         // .txfm_type_row
+};
+
+// ---------------- config inv_adst_dct_32 ----------------
+static const int8_t inv_shift_adst_dct_32[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_adst_dct_32[12] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_adst_dct_32[10] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_adst_dct_32[12] = {
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13
+};
+static const int8_t inv_cos_bit_row_adst_dct_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_32 = {
+ 32, // .txfm_size
+ 12, // .stage_num_col
+ 10, // .stage_num_row
+ // 1, // .log_scale
+ inv_shift_adst_dct_32, // .shift
+ inv_stage_range_col_adst_dct_32, // .stage_range_col
+ inv_stage_range_row_adst_dct_32, // .stage_range_row
+ inv_cos_bit_col_adst_dct_32, // .cos_bit_col
+ inv_cos_bit_row_adst_dct_32, // .cos_bit_row
+ TXFM_TYPE_ADST32, // .txfm_type_col
+  TXFM_TYPE_DCT32         // .txfm_type_row
+};
+
+extern const TXFM_2D_CFG *inv_txfm_cfg_ls[TX_TYPES][TX_SIZES];
+
+#endif // AV1_INV_TXFM2D_CFG_H_
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
new file mode 100644
index 000000000..530871795
--- /dev/null
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -0,0 +1,2336 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/seg_common.h"
+
+#define CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY 0
+
+// 64 bit masks for left transform size. Each 1 represents a position where
+// we should apply a loop filter across the left border of an 8x8 block
+// boundary.
+//
+// In the case of TX_16X16 (in low order byte first) we end up with
+// a mask that looks like this:
+//
+// 10101010
+// 10101010
+// 10101010
+// 10101010
+// 10101010
+// 10101010
+// 10101010
+// 10101010
+//
+// A loopfilter should be applied to every other 8x8 horizontally.
+static const uint64_t left_64x64_txform_mask[TX_SIZES] = {
+#if CONFIG_CB4X4
+ 0xffffffffffffffffULL, // TX_2X2
+#endif
+ 0xffffffffffffffffULL, // TX_4X4
+ 0xffffffffffffffffULL, // TX_8x8
+ 0x5555555555555555ULL, // TX_16x16
+ 0x1111111111111111ULL, // TX_32x32
+#if CONFIG_TX64X64
+ 0x0101010101010101ULL, // TX_64x64
+#endif // CONFIG_TX64X64
+};
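+// As a concrete reading of the table above: bit (row * 8 + col) of a mask
+// covers the 8x8 block at (row, col) inside the 64x64 area, with bit 0 at
+// the top-left. For TX_16x16 every byte is 0x55 (binary 01010101), so only
+// even columns are set and the left edge of every other 8x8 column gets
+// filtered, exactly as drawn above.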
+
+// 64 bit masks for above transform size. Each 1 represents a position where
+// we should apply a loop filter across the top border of an 8x8 block
+// boundary.
+//
+// In the case of TX_32x32 (in low order byte first) we end up with
+// a mask that looks like this:
+//
+// 11111111
+// 00000000
+// 00000000
+// 00000000
+// 11111111
+// 00000000
+// 00000000
+// 00000000
+//
+// A loopfilter should be applied to every fourth 8x8 row vertically.
+static const uint64_t above_64x64_txform_mask[TX_SIZES] = {
+#if CONFIG_CB4X4
+  0xffffffffffffffffULL,  // TX_2X2
+#endif
+ 0xffffffffffffffffULL, // TX_4X4
+ 0xffffffffffffffffULL, // TX_8x8
+ 0x00ff00ff00ff00ffULL, // TX_16x16
+ 0x000000ff000000ffULL, // TX_32x32
+#if CONFIG_TX64X64
+ 0x00000000000000ffULL, // TX_64x64
+#endif // CONFIG_TX64X64
+};
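+// Read the above table the same way: each byte of the mask is one row of
+// 8x8 blocks. For TX_16x16 (0x00ff00ff00ff00ff) rows 0, 2, 4 and 6 are set;
+// for TX_32x32 (0x000000ff000000ff) only rows 0 and 4 are set, i.e. the top
+// edge of every fourth 8x8 row is filtered.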
+
+// 64 bit masks for prediction sizes (left). Each 1 represents a position
+// along the left border of an 8x8 block. These are aligned to the right-most
+// appropriate bit, and then shifted into place.
+//
+// In the case of BLOCK_16X32 (low order byte first) we end up with
+// a mask that looks like this :
+//
+// 10000000
+// 10000000
+// 10000000
+// 10000000
+// 00000000
+// 00000000
+// 00000000
+// 00000000
+static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0x0000000000000001ULL, // BLOCK_2X2,
+ 0x0000000000000001ULL, // BLOCK_2X4,
+ 0x0000000000000001ULL, // BLOCK_4X2,
+#endif
+ 0x0000000000000001ULL, // BLOCK_4X4,
+ 0x0000000000000001ULL, // BLOCK_4X8,
+ 0x0000000000000001ULL, // BLOCK_8X4,
+ 0x0000000000000001ULL, // BLOCK_8X8,
+ 0x0000000000000101ULL, // BLOCK_8X16,
+ 0x0000000000000001ULL, // BLOCK_16X8,
+ 0x0000000000000101ULL, // BLOCK_16X16,
+ 0x0000000001010101ULL, // BLOCK_16X32,
+ 0x0000000000000101ULL, // BLOCK_32X16,
+ 0x0000000001010101ULL, // BLOCK_32X32,
+ 0x0101010101010101ULL, // BLOCK_32X64,
+ 0x0000000001010101ULL, // BLOCK_64X32,
+ 0x0101010101010101ULL, // BLOCK_64X64
+};
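+// For example, left_prediction_mask[BLOCK_16X32] = 0x0000000001010101ULL
+// sets bits 0, 8, 16 and 24, i.e. column 0 of rows 0..3: a one-block-wide
+// strip along the left edge of a 16x32 (2x4 in 8x8 units) prediction block,
+// which is the pattern drawn above.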
+
+// 64 bit mask to shift and set for each prediction size (above).
+static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0x0000000000000001ULL, // BLOCK_2X2
+ 0x0000000000000001ULL, // BLOCK_2X4
+ 0x0000000000000001ULL, // BLOCK_4X2
+#endif
+ 0x0000000000000001ULL, // BLOCK_4X4
+ 0x0000000000000001ULL, // BLOCK_4X8
+ 0x0000000000000001ULL, // BLOCK_8X4
+ 0x0000000000000001ULL, // BLOCK_8X8
+ 0x0000000000000001ULL, // BLOCK_8X16,
+ 0x0000000000000003ULL, // BLOCK_16X8
+ 0x0000000000000003ULL, // BLOCK_16X16
+ 0x0000000000000003ULL, // BLOCK_16X32,
+ 0x000000000000000fULL, // BLOCK_32X16,
+ 0x000000000000000fULL, // BLOCK_32X32,
+ 0x000000000000000fULL, // BLOCK_32X64,
+ 0x00000000000000ffULL, // BLOCK_64X32,
+ 0x00000000000000ffULL, // BLOCK_64X64
+};
+// 64 bit mask to shift and set for each prediction size. A bit is set for
+// each 8x8 block that would be in the top-left-most block of the given block
+// size in the 64x64 block.
+static const uint64_t size_mask[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0x0000000000000001ULL, // BLOCK_2X2
+ 0x0000000000000001ULL, // BLOCK_2X4
+ 0x0000000000000001ULL, // BLOCK_4X2
+#endif
+ 0x0000000000000001ULL, // BLOCK_4X4
+ 0x0000000000000001ULL, // BLOCK_4X8
+ 0x0000000000000001ULL, // BLOCK_8X4
+ 0x0000000000000001ULL, // BLOCK_8X8
+ 0x0000000000000101ULL, // BLOCK_8X16,
+ 0x0000000000000003ULL, // BLOCK_16X8
+ 0x0000000000000303ULL, // BLOCK_16X16
+ 0x0000000003030303ULL, // BLOCK_16X32,
+ 0x0000000000000f0fULL, // BLOCK_32X16,
+ 0x000000000f0f0f0fULL, // BLOCK_32X32,
+ 0x0f0f0f0f0f0f0f0fULL, // BLOCK_32X64,
+ 0x00000000ffffffffULL, // BLOCK_64X32,
+ 0xffffffffffffffffULL, // BLOCK_64X64
+};
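+// For example, size_mask[BLOCK_32X32] = 0x000000000f0f0f0fULL sets columns
+// 0..3 of rows 0..3: every 8x8 block covered by a 32x32 prediction block
+// anchored at the top-left corner, before the mask is shifted into place.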
+
+// These are used for masking the left and above 32x32 borders.
+static const uint64_t left_border = 0x1111111111111111ULL;
+static const uint64_t above_border = 0x000000ff000000ffULL;
+
+// 16 bit masks for uv transform sizes.
+static const uint16_t left_64x64_txform_mask_uv[TX_SIZES] = {
+#if CONFIG_CB4X4
+ 0xffff, // TX_2X2
+#endif
+ 0xffff, // TX_4X4
+ 0xffff, // TX_8x8
+ 0x5555, // TX_16x16
+ 0x1111, // TX_32x32
+#if CONFIG_TX64X64
+ 0x0101, // TX_64x64, never used
+#endif // CONFIG_TX64X64
+};
+
+static const uint16_t above_64x64_txform_mask_uv[TX_SIZES] = {
+#if CONFIG_CB4X4
+ 0xffff, // TX_2X2
+#endif
+ 0xffff, // TX_4X4
+ 0xffff, // TX_8x8
+ 0x0f0f, // TX_16x16
+ 0x000f, // TX_32x32
+#if CONFIG_TX64X64
+ 0x0003, // TX_64x64, never used
+#endif // CONFIG_TX64X64
+};
+
+// 16 bit left mask to shift and set for each uv prediction size.
+static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0x0001, // BLOCK_2X2,
+ 0x0001, // BLOCK_2X4,
+ 0x0001, // BLOCK_4X2,
+#endif
+ 0x0001, // BLOCK_4X4,
+ 0x0001, // BLOCK_4X8,
+ 0x0001, // BLOCK_8X4,
+ 0x0001, // BLOCK_8X8,
+ 0x0001, // BLOCK_8X16,
+ 0x0001, // BLOCK_16X8,
+ 0x0001, // BLOCK_16X16,
+ 0x0011, // BLOCK_16X32,
+ 0x0001, // BLOCK_32X16,
+ 0x0011, // BLOCK_32X32,
+ 0x1111, // BLOCK_32X64
+ 0x0011, // BLOCK_64X32,
+ 0x1111, // BLOCK_64X64
+};
+// 16 bit above mask to shift and set for each uv prediction size.
+static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0x0001, // BLOCK_2X2
+ 0x0001, // BLOCK_2X4
+ 0x0001, // BLOCK_4X2
+#endif
+ 0x0001, // BLOCK_4X4
+ 0x0001, // BLOCK_4X8
+ 0x0001, // BLOCK_8X4
+ 0x0001, // BLOCK_8X8
+ 0x0001, // BLOCK_8X16,
+ 0x0001, // BLOCK_16X8
+ 0x0001, // BLOCK_16X16
+ 0x0001, // BLOCK_16X32,
+ 0x0003, // BLOCK_32X16,
+ 0x0003, // BLOCK_32X32,
+ 0x0003, // BLOCK_32X64,
+ 0x000f, // BLOCK_64X32,
+ 0x000f, // BLOCK_64X64
+};
+
+// 16 bit mask to shift and set for each uv prediction size.
+static const uint16_t size_mask_uv[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0x0001, // BLOCK_2X2
+ 0x0001, // BLOCK_2X4
+ 0x0001, // BLOCK_4X2
+#endif
+ 0x0001, // BLOCK_4X4
+ 0x0001, // BLOCK_4X8
+ 0x0001, // BLOCK_8X4
+ 0x0001, // BLOCK_8X8
+ 0x0001, // BLOCK_8X16,
+ 0x0001, // BLOCK_16X8
+ 0x0001, // BLOCK_16X16
+ 0x0011, // BLOCK_16X32,
+ 0x0003, // BLOCK_32X16,
+ 0x0033, // BLOCK_32X32,
+ 0x3333, // BLOCK_32X64,
+ 0x00ff, // BLOCK_64X32,
+ 0xffff, // BLOCK_64X64
+};
+static const uint16_t left_border_uv = 0x1111;
+static const uint16_t above_border_uv = 0x000f;
+
+static const int mode_lf_lut[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES
+#if CONFIG_ALT_INTRA
+ 0,
+#endif
+ 1, 1, 0, 1, // INTER_MODES (ZEROMV == 0)
+#if CONFIG_EXT_INTER
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (ZERO_ZEROMV == 0)
+#endif // CONFIG_EXT_INTER
+};
+
+static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
+ int lvl;
+
+ // For each possible value for the loop filter fill out limits
+ for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
+ // Set loop filter parameters that control sharpness.
+ int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
+
+ if (sharpness_lvl > 0) {
+ if (block_inside_limit > (9 - sharpness_lvl))
+ block_inside_limit = (9 - sharpness_lvl);
+ }
+
+ if (block_inside_limit < 1) block_inside_limit = 1;
+
+ memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
+ memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
+ }
+}
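+// Worked example for update_sharpness(): with sharpness_lvl = 4 and
+// lvl = 32, block_inside_limit = 32 >> 1 = 16, which is then capped at
+// 9 - 4 = 5, so lim = 5 and mblim = 2 * (32 + 2) + 5 = 73 for that level.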
+#if CONFIG_EXT_DELTA_Q
+static uint8_t get_filter_level(const AV1_COMMON *cm,
+ const loop_filter_info_n *lfi_n,
+ const MB_MODE_INFO *mbmi) {
+#if CONFIG_SUPERTX
+ const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx);
+ assert(
+ IMPLIES(supertx_enabled(mbmi), mbmi->segment_id_supertx != MAX_SEGMENTS));
+ assert(IMPLIES(supertx_enabled(mbmi),
+ mbmi->segment_id_supertx <= mbmi->segment_id));
+#else
+ const int segment_id = mbmi->segment_id;
+#endif // CONFIG_SUPERTX
+ if (cm->delta_lf_present_flag) {
+ int lvl_seg = clamp(mbmi->current_delta_lf_from_base + cm->lf.filter_level,
+ 0, MAX_LOOP_FILTER);
+ const int scale = 1 << (lvl_seg >> 5);
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_ALT_LF)) {
+ const int data = get_segdata(&cm->seg, segment_id, SEG_LVL_ALT_LF);
+ lvl_seg =
+ clamp(cm->seg.abs_delta == SEGMENT_ABSDATA ? data : lvl_seg + data, 0,
+ MAX_LOOP_FILTER);
+ }
+
+ if (cm->lf.mode_ref_delta_enabled) {
+ lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale;
+ if (mbmi->ref_frame[0] > INTRA_FRAME)
+ lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale;
+ lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER);
+ }
+ return lvl_seg;
+ } else {
+ return lfi_n->lvl[segment_id][mbmi->ref_frame[0]][mode_lf_lut[mbmi->mode]];
+ }
+}
+#else
+static uint8_t get_filter_level(const loop_filter_info_n *lfi_n,
+ const MB_MODE_INFO *mbmi) {
+#if CONFIG_SUPERTX
+ const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx);
+ assert(
+ IMPLIES(supertx_enabled(mbmi), mbmi->segment_id_supertx != MAX_SEGMENTS));
+ assert(IMPLIES(supertx_enabled(mbmi),
+ mbmi->segment_id_supertx <= mbmi->segment_id));
+#else
+ const int segment_id = mbmi->segment_id;
+#endif // CONFIG_SUPERTX
+ return lfi_n->lvl[segment_id][mbmi->ref_frame[0]][mode_lf_lut[mbmi->mode]];
+}
+#endif
+
+#define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0]))
+
+void av1_loop_filter_init(AV1_COMMON *cm) {
+ assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut));
+ loop_filter_info_n *lfi = &cm->lf_info;
+ struct loopfilter *lf = &cm->lf;
+ int lvl;
+
+ // init limits for given sharpness
+ update_sharpness(lfi, lf->sharpness_level);
+ lf->last_sharpness_level = lf->sharpness_level;
+
+ // init hev threshold const vectors
+ for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
+ memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
+}
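+// The hev (high edge variance) threshold above is simply lvl >> 4: levels
+// 0..15 get threshold 0, levels 16..31 get 1, and so on up to
+// MAX_LOOP_FILTER.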
+
+void av1_loop_filter_frame_init(AV1_COMMON *cm, int default_filt_lvl) {
+ int seg_id;
+  // 'scale' is the multiplier for lf_deltas:
+  // the multiplier is 1 when filter_lvl is between 0 and 31,
+  // and 2 when filter_lvl is between 32 and 63.
+ const int scale = 1 << (default_filt_lvl >> 5);
+ loop_filter_info_n *const lfi = &cm->lf_info;
+ struct loopfilter *const lf = &cm->lf;
+ const struct segmentation *const seg = &cm->seg;
+
+ // update limits if sharpness has changed
+ if (lf->last_sharpness_level != lf->sharpness_level) {
+ update_sharpness(lfi, lf->sharpness_level);
+ lf->last_sharpness_level = lf->sharpness_level;
+ }
+
+ for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
+ int lvl_seg = default_filt_lvl;
+ if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
+ const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
+ lvl_seg = clamp(
+ seg->abs_delta == SEGMENT_ABSDATA ? data : default_filt_lvl + data, 0,
+ MAX_LOOP_FILTER);
+ }
+
+ if (!lf->mode_ref_delta_enabled) {
+ // we could get rid of this if we assume that deltas are set to
+ // zero when not in use; encoder always uses deltas
+ memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
+ } else {
+ int ref, mode;
+ const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
+ lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
+
+ for (ref = LAST_FRAME; ref < TOTAL_REFS_PER_FRAME; ++ref) {
+ for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+ const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
+ lf->mode_deltas[mode] * scale;
+ lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+ }
+ }
+ }
+ }
+}
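+// Worked example: with default_filt_lvl = 40, scale = 1 << (40 >> 5) = 2,
+// so every ref/mode delta is doubled before being added to the per-segment
+// level and clamped to [0, MAX_LOOP_FILTER].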
+
+static void filter_selectively_vert_row2(int subsampling_factor, uint8_t *s,
+ int pitch, unsigned int mask_16x16_l,
+ unsigned int mask_8x8_l,
+ unsigned int mask_4x4_l,
+ unsigned int mask_4x4_int_l,
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl) {
+ const int mask_shift = subsampling_factor ? 4 : 8;
+ const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+ const int lfl_forward = subsampling_factor ? 4 : 8;
+
+ unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
+ unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
+ unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
+ unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
+ unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+ unsigned int mask;
+
+ for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
+ mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
+ mask; mask >>= 1) {
+ const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+ const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+
+ if (mask & 1) {
+ if ((mask_16x16_0 | mask_16x16_1) & 1) {
+ if ((mask_16x16_0 & mask_16x16_1) & 1) {
+ aom_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr);
+ } else if (mask_16x16_0 & 1) {
+ aom_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ aom_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+
+ if ((mask_8x8_0 | mask_8x8_1) & 1) {
+ if ((mask_8x8_0 & mask_8x8_1) & 1) {
+ aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else if (mask_8x8_0 & 1) {
+ aom_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ aom_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+
+ if ((mask_4x4_0 | mask_4x4_1) & 1) {
+ if ((mask_4x4_0 & mask_4x4_1) & 1) {
+ aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else if (mask_4x4_0 & 1) {
+ aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ aom_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+
+ if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
+ if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
+ aom_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else if (mask_4x4_int_0 & 1) {
+ aom_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr);
+ } else {
+ aom_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+ }
+
+ s += 8;
+ lfl += 1;
+ mask_16x16_0 >>= 1;
+ mask_8x8_0 >>= 1;
+ mask_4x4_0 >>= 1;
+ mask_4x4_int_0 >>= 1;
+ mask_16x16_1 >>= 1;
+ mask_8x8_1 >>= 1;
+ mask_4x4_1 >>= 1;
+ mask_4x4_int_1 >>= 1;
+ }
+}
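+// Note on the mask layout in filter_selectively_vert_row2(): the incoming
+// masks pack two rows of 8x8 edges. For a non-subsampled plane the low 8
+// bits are the current row and bits 8..15 the row below (mask_shift = 8),
+// while lfl + lfl_forward points at the filter levels of that second row.
+// Each loop iteration then consumes one 8-pixel column from both rows.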
+
+#if CONFIG_HIGHBITDEPTH
+static void highbd_filter_selectively_vert_row2(
+ int subsampling_factor, uint16_t *s, int pitch, unsigned int mask_16x16_l,
+ unsigned int mask_8x8_l, unsigned int mask_4x4_l,
+ unsigned int mask_4x4_int_l, const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl, int bd) {
+ const int mask_shift = subsampling_factor ? 4 : 8;
+ const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+ const int lfl_forward = subsampling_factor ? 4 : 8;
+
+ unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
+ unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
+ unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
+ unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
+ unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+ unsigned int mask;
+
+ for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
+ mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
+ mask; mask >>= 1) {
+ const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+ const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+
+ if (mask & 1) {
+ if ((mask_16x16_0 | mask_16x16_1) & 1) {
+ if ((mask_16x16_0 & mask_16x16_1) & 1) {
+ aom_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ } else if (mask_16x16_0 & 1) {
+ aom_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ }
+
+ if ((mask_8x8_0 | mask_8x8_1) & 1) {
+ if ((mask_8x8_0 & mask_8x8_1) & 1) {
+ aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ } else if (mask_8x8_0 & 1) {
+ aom_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ }
+
+ if ((mask_4x4_0 | mask_4x4_1) & 1) {
+ if ((mask_4x4_0 & mask_4x4_1) & 1) {
+ aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ } else if (mask_4x4_0 & 1) {
+ aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ }
+
+ if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
+ if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
+ aom_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ } else if (mask_4x4_int_0 & 1) {
+ aom_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ }
+ }
+
+ s += 8;
+ lfl += 1;
+ mask_16x16_0 >>= 1;
+ mask_8x8_0 >>= 1;
+ mask_4x4_0 >>= 1;
+ mask_4x4_int_0 >>= 1;
+ mask_16x16_1 >>= 1;
+ mask_8x8_1 >>= 1;
+ mask_4x4_1 >>= 1;
+ mask_4x4_int_1 >>= 1;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+static void filter_selectively_horiz(
+ uint8_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8,
+ unsigned int mask_4x4, unsigned int mask_4x4_int,
+ const loop_filter_info_n *lfi_n, const uint8_t *lfl) {
+ unsigned int mask;
+ int count;
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask;
+ mask >>= count) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+ count = 1;
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ if ((mask_16x16 & 3) == 3) {
+ aom_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr);
+ count = 2;
+ } else {
+ aom_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr);
+ }
+ } else if (mask_8x8 & 1) {
+ if ((mask_8x8 & 3) == 3) {
+ // Next block's thresholds.
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+ aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+
+ if ((mask_4x4_int & 3) == 3) {
+ aom_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr);
+ } else {
+ if (mask_4x4_int & 1)
+ aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr);
+ else if (mask_4x4_int & 2)
+ aom_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+ lfin->lim, lfin->hev_thr);
+ }
+ count = 2;
+ } else {
+ aom_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+
+ if (mask_4x4_int & 1)
+ aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr);
+ }
+ } else if (mask_4x4 & 1) {
+ if ((mask_4x4 & 3) == 3) {
+ // Next block's thresholds.
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+ aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ if ((mask_4x4_int & 3) == 3) {
+ aom_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr);
+ } else {
+ if (mask_4x4_int & 1)
+ aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr);
+ else if (mask_4x4_int & 2)
+ aom_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+ lfin->lim, lfin->hev_thr);
+ }
+ count = 2;
+ } else {
+ aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+
+ if (mask_4x4_int & 1)
+ aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr);
+ }
+ } else if (mask_4x4_int & 1) {
+ aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr);
+ }
+ }
+ s += 8 * count;
+ lfl += count;
+ mask_16x16 >>= count;
+ mask_8x8 >>= count;
+ mask_4x4 >>= count;
+ mask_4x4_int >>= count;
+ }
+}
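+// Note on the count variable above: when two horizontally adjacent blocks
+// share the same edge type ((mask & 3) == 3), the dual (16-pixel-wide)
+// variants filter both in one call and the loop advances two 8-pixel units
+// instead of one.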
+
+#if CONFIG_HIGHBITDEPTH
+static void highbd_filter_selectively_horiz(
+ uint16_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8,
+ unsigned int mask_4x4, unsigned int mask_4x4_int,
+ const loop_filter_info_n *lfi_n, const uint8_t *lfl, int bd) {
+ unsigned int mask;
+ int count;
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask;
+ mask >>= count) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+ count = 1;
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ if ((mask_16x16 & 3) == 3) {
+ aom_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, bd);
+ count = 2;
+ } else {
+ aom_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, bd);
+ }
+ } else if (mask_8x8 & 1) {
+ if ((mask_8x8 & 3) == 3) {
+ // Next block's thresholds.
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+ aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr, bd);
+
+ if ((mask_4x4_int & 3) == 3) {
+ aom_highbd_lpf_horizontal_4_dual(
+ s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+ lfin->mblim, lfin->lim, lfin->hev_thr, bd);
+ } else {
+ if (mask_4x4_int & 1) {
+ aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, bd);
+ } else if (mask_4x4_int & 2) {
+ aom_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
+ }
+ count = 2;
+ } else {
+ aom_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, bd);
+
+ if (mask_4x4_int & 1) {
+ aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, bd);
+ }
+ }
+ } else if (mask_4x4 & 1) {
+ if ((mask_4x4 & 3) == 3) {
+ // Next block's thresholds.
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+ aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr, bd);
+ if ((mask_4x4_int & 3) == 3) {
+ aom_highbd_lpf_horizontal_4_dual(
+ s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+ lfin->mblim, lfin->lim, lfin->hev_thr, bd);
+ } else {
+ if (mask_4x4_int & 1) {
+ aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, bd);
+ } else if (mask_4x4_int & 2) {
+ aom_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
+ }
+ count = 2;
+ } else {
+ aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, bd);
+
+ if (mask_4x4_int & 1) {
+ aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, bd);
+ }
+ }
+ } else if (mask_4x4_int & 1) {
+ aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, bd);
+ }
+ }
+ s += 8 * count;
+ lfl += count;
+ mask_16x16 >>= count;
+ mask_8x8 >>= count;
+ mask_4x4 >>= count;
+ mask_4x4_int >>= count;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+// This function ORs into the current lfm structure the positions at which to
+// apply loop filters for the specific mi we are looking at. It uses
+// information including the block size type (32x16, 32x32, etc.), the
+// transform size, whether there were any coefficients encoded, and the loop
+// filter strength of the block we are currently looking at. Shift is used to
+// position the 1's we produce.
+// TODO(JBB) Need another function for different resolution color..
+static void build_masks(AV1_COMMON *const cm,
+ const loop_filter_info_n *const lfi_n,
+ const MODE_INFO *mi, const int shift_y,
+ const int shift_uv, LOOP_FILTER_MASK *lfm) {
+ const MB_MODE_INFO *mbmi = &mi->mbmi;
+ const BLOCK_SIZE block_size = mbmi->sb_type;
+  // TODO(debargha): Check if masks can be set up correctly when
+  // rectangular transforms are used with the EXT_TX expt.
+ const TX_SIZE tx_size_y = txsize_sqr_map[mbmi->tx_size];
+ const TX_SIZE tx_size_y_left = txsize_horz_map[mbmi->tx_size];
+ const TX_SIZE tx_size_y_above = txsize_vert_map[mbmi->tx_size];
+ const TX_SIZE tx_size_uv =
+ txsize_sqr_map[uv_txsize_lookup[block_size][mbmi->tx_size][1][1]];
+ const TX_SIZE tx_size_uv_left =
+ txsize_horz_map[uv_txsize_lookup[block_size][mbmi->tx_size][1][1]];
+ const TX_SIZE tx_size_uv_above =
+ txsize_vert_map[uv_txsize_lookup[block_size][mbmi->tx_size][1][1]];
+#if CONFIG_EXT_DELTA_Q
+ const int filter_level = get_filter_level(cm, lfi_n, mbmi);
+#else
+ const int filter_level = get_filter_level(lfi_n, mbmi);
+ (void)cm;
+#endif
+ uint64_t *const left_y = &lfm->left_y[tx_size_y_left];
+ uint64_t *const above_y = &lfm->above_y[tx_size_y_above];
+ uint64_t *const int_4x4_y = &lfm->int_4x4_y;
+ uint16_t *const left_uv = &lfm->left_uv[tx_size_uv_left];
+ uint16_t *const above_uv = &lfm->above_uv[tx_size_uv_above];
+ uint16_t *const int_4x4_uv = &lfm->left_int_4x4_uv;
+ int i;
+
+ // If filter level is 0 we don't loop filter.
+ if (!filter_level) {
+ return;
+ } else {
+ const int w = num_8x8_blocks_wide_lookup[block_size];
+ const int h = num_8x8_blocks_high_lookup[block_size];
+ const int row = (shift_y >> MAX_MIB_SIZE_LOG2);
+ const int col = shift_y - (row << MAX_MIB_SIZE_LOG2);
+
+ for (i = 0; i < h; i++) memset(&lfm->lfl_y[row + i][col], filter_level, w);
+ }
+
+  // These set a 1 in the current block size for the block size edges.
+  // For instance if the block size is 32x16, we'll set:
+  //    above =  1111
+  //             0000
+  //    and
+  //    left  =  1000
+  //             1000
+  // NOTE: In this example the low bit is leftmost, so ( 1000 ) is stored
+  // as 1, not 8.
+ //
+ // U and V set things on a 16 bit scale.
+ //
+ *above_y |= above_prediction_mask[block_size] << shift_y;
+ *above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
+ *left_y |= left_prediction_mask[block_size] << shift_y;
+ *left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
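+  // For instance, a BLOCK_32X16 block at shift_y = 0 ORs 0x0f (row 0,
+  // columns 0..3) into *above_y and 0x0101 (column 0, rows 0..1) into
+  // *left_y, which is the 1111 / 1000 pattern sketched above.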
+
+ // If the block has no coefficients and is not intra we skip applying
+ // the loop filter on block edges.
+ if (mbmi->skip && is_inter_block(mbmi)) return;
+
+  // Here we are adding a mask for the transform size. The transform size
+  // mask is set to be correct for a 64x64 prediction block size. We mask it
+  // to match the size of the block we are working on and then shift it into
+  // place.
+ *above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y_above])
+ << shift_y;
+ *above_uv |=
+ (size_mask_uv[block_size] & above_64x64_txform_mask_uv[tx_size_uv_above])
+ << shift_uv;
+
+ *left_y |= (size_mask[block_size] & left_64x64_txform_mask[tx_size_y_left])
+ << shift_y;
+ *left_uv |=
+ (size_mask_uv[block_size] & left_64x64_txform_mask_uv[tx_size_uv_left])
+ << shift_uv;
+
+ // Here we are trying to determine what to do with the internal 4x4 block
+ // boundaries. These differ from the 4x4 boundaries on the outside edge of
+ // an 8x8 in that the internal ones can be skipped and don't depend on
+ // the prediction block size.
+ if (tx_size_y == TX_4X4)
+ *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
+
+ if (tx_size_uv == TX_4X4)
+ *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
+}
+
+// This function does the same thing as the one above with the exception that
+// it only affects the y masks. It exists because for blocks < 16x16 in size,
+// we only update u and v masks on the first block.
+static void build_y_mask(AV1_COMMON *const cm,
+ const loop_filter_info_n *const lfi_n,
+ const MODE_INFO *mi, const int shift_y,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif // CONFIG_SUPERTX
+ LOOP_FILTER_MASK *lfm) {
+ const MB_MODE_INFO *mbmi = &mi->mbmi;
+ const TX_SIZE tx_size_y = txsize_sqr_map[mbmi->tx_size];
+ const TX_SIZE tx_size_y_left = txsize_horz_map[mbmi->tx_size];
+ const TX_SIZE tx_size_y_above = txsize_vert_map[mbmi->tx_size];
+#if CONFIG_SUPERTX
+ const BLOCK_SIZE block_size =
+ supertx_enabled ? (BLOCK_SIZE)(3 * tx_size_y) : mbmi->sb_type;
+#else
+ const BLOCK_SIZE block_size = mbmi->sb_type;
+#endif
+#if CONFIG_EXT_DELTA_Q
+ const int filter_level = get_filter_level(cm, lfi_n, mbmi);
+#else
+ const int filter_level = get_filter_level(lfi_n, mbmi);
+ (void)cm;
+#endif
+ uint64_t *const left_y = &lfm->left_y[tx_size_y_left];
+ uint64_t *const above_y = &lfm->above_y[tx_size_y_above];
+ uint64_t *const int_4x4_y = &lfm->int_4x4_y;
+ int i;
+
+ if (!filter_level) {
+ return;
+ } else {
+ const int w = num_8x8_blocks_wide_lookup[block_size];
+ const int h = num_8x8_blocks_high_lookup[block_size];
+ const int row = (shift_y >> MAX_MIB_SIZE_LOG2);
+ const int col = shift_y - (row << MAX_MIB_SIZE_LOG2);
+
+ for (i = 0; i < h; i++) memset(&lfm->lfl_y[row + i][col], filter_level, w);
+ }
+
+ *above_y |= above_prediction_mask[block_size] << shift_y;
+ *left_y |= left_prediction_mask[block_size] << shift_y;
+
+ if (mbmi->skip && is_inter_block(mbmi)) return;
+
+ *above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y_above])
+ << shift_y;
+
+ *left_y |= (size_mask[block_size] & left_64x64_txform_mask[tx_size_y_left])
+ << shift_y;
+
+ if (tx_size_y == TX_4X4)
+ *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
+}
+
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+// This function updates the bit masks for the entire 64x64 region represented
+// by mi_row, mi_col. In case one of the edges is a tile boundary, loop
+// filtering for that edge is disabled. This function only checks the tile
+// boundary info for the top-left corner mi to determine the boundary
+// information for the top and left edges of the whole superblock.
+static void update_tile_boundary_filter_mask(AV1_COMMON *const cm,
+ const int mi_row, const int mi_col,
+ LOOP_FILTER_MASK *lfm) {
+ int i;
+ MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride + mi_col;
+
+ if (mi->mbmi.boundary_info & TILE_LEFT_BOUNDARY) {
+ for (i = 0; i <= TX_32X32; i++) {
+ lfm->left_y[i] &= 0xfefefefefefefefeULL;
+ lfm->left_uv[i] &= 0xeeee;
+ }
+ }
+
+ if (mi->mbmi.boundary_info & TILE_ABOVE_BOUNDARY) {
+ for (i = 0; i <= TX_32X32; i++) {
+ lfm->above_y[i] &= 0xffffffffffffff00ULL;
+ lfm->above_uv[i] &= 0xfff0;
+ }
+ }
+}
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+
+// This function sets up the bit masks for the entire 64x64 region represented
+// by mi_row, mi_col.
+// TODO(JBB): This function only works for yv12.
+void av1_setup_mask(AV1_COMMON *const cm, const int mi_row, const int mi_col,
+ MODE_INFO **mi, const int mode_info_stride,
+ LOOP_FILTER_MASK *lfm) {
+ int idx_32, idx_16, idx_8;
+ const loop_filter_info_n *const lfi_n = &cm->lf_info;
+ MODE_INFO **mip = mi;
+ MODE_INFO **mip2 = mi;
+
+  // These are offsets to the next mi in the 64x64 block. It is what gets
+  // added to the mi ptr as we go through each loop. It helps us to avoid
+  // setting up special row and column counters for each index. The last step
+  // brings us back out to the starting position.
+ const int offset_32[] = { 4, (mode_info_stride << 2) - 4, 4,
+ -(mode_info_stride << 2) - 4 };
+ const int offset_16[] = { 2, (mode_info_stride << 1) - 2, 2,
+ -(mode_info_stride << 1) - 2 };
+ const int offset[] = { 1, mode_info_stride - 1, 1, -mode_info_stride - 1 };
+
+  // The following variables represent shifts to position the current block
+  // mask over the appropriate block. A shift of 36 to the left will move the
+  // bits for the final 32 by 32 block in the 64x64 down 4 rows and over 4
+  // columns to the appropriate spot.
+ const int shift_32_y[] = { 0, 4, 32, 36 };
+ const int shift_16_y[] = { 0, 2, 16, 18 };
+ const int shift_8_y[] = { 0, 1, 8, 9 };
+ const int shift_32_uv[] = { 0, 2, 8, 10 };
+ const int shift_16_uv[] = { 0, 1, 4, 5 };
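+  // For instance, shift_32_y[3] = 36 = 4 * 8 + 4: it places the mask of the
+  // last 32x32 block at rows 4..7, columns 4..7 of the 8x8 grid, i.e. the
+  // bottom-right quadrant of the 64x64 area.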
+ int i;
+ const int max_rows = AOMMIN(cm->mi_rows - mi_row, MAX_MIB_SIZE);
+ const int max_cols = AOMMIN(cm->mi_cols - mi_col, MAX_MIB_SIZE);
+#if CONFIG_EXT_PARTITION
+ assert(0 && "Not yet updated");
+#endif // CONFIG_EXT_PARTITION
+
+ av1_zero(*lfm);
+ assert(mip[0] != NULL);
+
+ // TODO(jimbankoski): Try moving most of the following code into decode
+ // loop and storing lfm in the mbmi structure so that we don't have to go
+ // through the recursive loop structure multiple times.
+ switch (mip[0]->mbmi.sb_type) {
+ case BLOCK_64X64: build_masks(cm, lfi_n, mip[0], 0, 0, lfm); break;
+ case BLOCK_64X32: build_masks(cm, lfi_n, mip[0], 0, 0, lfm);
+#if CONFIG_SUPERTX && CONFIG_TX64X64
+ if (supertx_enabled(&mip[0]->mbmi)) break;
+#endif // CONFIG_SUPERTX && CONFIG_TX64X64
+ mip2 = mip + mode_info_stride * 4;
+ if (4 >= max_rows) break;
+ build_masks(cm, lfi_n, mip2[0], 32, 8, lfm);
+ break;
+ case BLOCK_32X64: build_masks(cm, lfi_n, mip[0], 0, 0, lfm);
+#if CONFIG_SUPERTX && CONFIG_TX64X64
+ if (supertx_enabled(&mip[0]->mbmi)) break;
+#endif // CONFIG_SUPERTX && CONFIG_TX64X64
+ mip2 = mip + 4;
+ if (4 >= max_cols) break;
+ build_masks(cm, lfi_n, mip2[0], 4, 2, lfm);
+ break;
+ default:
+#if CONFIG_SUPERTX && CONFIG_TX64X64
+ if (mip[0]->mbmi.tx_size == TX_64X64) {
+ build_masks(cm, lfi_n, mip[0], 0, 0, lfm);
+ } else {
+#endif // CONFIG_SUPERTX && CONFIG_TX64X64
+ for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
+ const int shift_y_32 = shift_32_y[idx_32];
+ const int shift_uv_32 = shift_32_uv[idx_32];
+ const int mi_32_col_offset = ((idx_32 & 1) << 2);
+ const int mi_32_row_offset = ((idx_32 >> 1) << 2);
+ if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
+ continue;
+ switch (mip[0]->mbmi.sb_type) {
+ case BLOCK_32X32:
+ build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
+ break;
+ case BLOCK_32X16:
+ build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
+#if CONFIG_SUPERTX
+ if (supertx_enabled(&mip[0]->mbmi)) break;
+#endif
+ if (mi_32_row_offset + 2 >= max_rows) continue;
+ mip2 = mip + mode_info_stride * 2;
+ build_masks(cm, lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4,
+ lfm);
+ break;
+ case BLOCK_16X32:
+ build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
+#if CONFIG_SUPERTX
+ if (supertx_enabled(&mip[0]->mbmi)) break;
+#endif
+ if (mi_32_col_offset + 2 >= max_cols) continue;
+ mip2 = mip + 2;
+ build_masks(cm, lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1,
+ lfm);
+ break;
+ default:
+#if CONFIG_SUPERTX
+ if (mip[0]->mbmi.tx_size == TX_32X32) {
+ build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
+ break;
+ }
+#endif
+ for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
+ const int shift_y_32_16 = shift_y_32 + shift_16_y[idx_16];
+ const int shift_uv_32_16 = shift_uv_32 + shift_16_uv[idx_16];
+ const int mi_16_col_offset =
+ mi_32_col_offset + ((idx_16 & 1) << 1);
+ const int mi_16_row_offset =
+ mi_32_row_offset + ((idx_16 >> 1) << 1);
+
+ if (mi_16_col_offset >= max_cols ||
+ mi_16_row_offset >= max_rows)
+ continue;
+
+ switch (mip[0]->mbmi.sb_type) {
+ case BLOCK_16X16:
+ build_masks(cm, lfi_n, mip[0], shift_y_32_16,
+ shift_uv_32_16, lfm);
+ break;
+ case BLOCK_16X8:
+#if CONFIG_SUPERTX
+ if (supertx_enabled(&mip[0]->mbmi)) break;
+#endif
+ build_masks(cm, lfi_n, mip[0], shift_y_32_16,
+ shift_uv_32_16, lfm);
+ if (mi_16_row_offset + 1 >= max_rows) continue;
+ mip2 = mip + mode_info_stride;
+ build_y_mask(cm, lfi_n, mip2[0], shift_y_32_16 + 8,
+#if CONFIG_SUPERTX
+ 0,
+#endif
+ lfm);
+ break;
+ case BLOCK_8X16:
+#if CONFIG_SUPERTX
+ if (supertx_enabled(&mip[0]->mbmi)) break;
+#endif
+ build_masks(cm, lfi_n, mip[0], shift_y_32_16,
+ shift_uv_32_16, lfm);
+ if (mi_16_col_offset + 1 >= max_cols) continue;
+ mip2 = mip + 1;
+ build_y_mask(cm, lfi_n, mip2[0], shift_y_32_16 + 1,
+#if CONFIG_SUPERTX
+ 0,
+#endif
+ lfm);
+ break;
+ default: {
+ const int shift_y_32_16_8_zero =
+ shift_y_32_16 + shift_8_y[0];
+#if CONFIG_SUPERTX
+ if (mip[0]->mbmi.tx_size == TX_16X16) {
+ build_masks(cm, lfi_n, mip[0], shift_y_32_16_8_zero,
+ shift_uv_32_16, lfm);
+ break;
+ }
+#endif
+ build_masks(cm, lfi_n, mip[0], shift_y_32_16_8_zero,
+ shift_uv_32_16, lfm);
+ mip += offset[0];
+ for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
+ const int shift_y_32_16_8 =
+ shift_y_32_16 + shift_8_y[idx_8];
+ const int mi_8_col_offset =
+ mi_16_col_offset + ((idx_8 & 1));
+ const int mi_8_row_offset =
+ mi_16_row_offset + ((idx_8 >> 1));
+
+ if (mi_8_col_offset >= max_cols ||
+ mi_8_row_offset >= max_rows)
+ continue;
+ build_y_mask(cm, lfi_n, mip[0], shift_y_32_16_8,
+#if CONFIG_SUPERTX
+ supertx_enabled(&mip[0]->mbmi),
+#endif
+ lfm);
+ }
+ break;
+ }
+ }
+ }
+ break;
+ }
+ }
+#if CONFIG_SUPERTX && CONFIG_TX64X64
+ }
+#endif // CONFIG_SUPERTX && CONFIG_TX64X64
+ break;
+ }
+ // The largest loopfilter we have is 16x16 so we use the 16x16 mask
+ // for 32x32 transforms also.
+ lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
+ lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
+ lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
+ lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
+
+  // We apply at least an 8-tap filter on every 32x32 boundary even if the
+  // transform size is 4x4. So if the 4x4 is set on a border pixel, add it to
+  // the 8x8 and remove it from the 4x4.
+ lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
+ lfm->left_y[TX_4X4] &= ~left_border;
+ lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
+ lfm->above_y[TX_4X4] &= ~above_border;
+ lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
+ lfm->left_uv[TX_4X4] &= ~left_border_uv;
+ lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
+ lfm->above_uv[TX_4X4] &= ~above_border_uv;
+
+ // We do some special edge handling.
+ if (mi_row + MAX_MIB_SIZE > cm->mi_rows) {
+ const uint64_t rows = cm->mi_rows - mi_row;
+
+  // Each pixel inside the border gets a 1.
+ const uint64_t mask_y = (((uint64_t)1 << (rows << MAX_MIB_SIZE_LOG2)) - 1);
+ const uint16_t mask_uv =
+ (((uint16_t)1 << (((rows + 1) >> 1) << (MAX_MIB_SIZE_LOG2 - 1))) - 1);
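+  // For example (assuming MAX_MIB_SIZE_LOG2 == 3, i.e. 8 mi units per
+  // superblock edge), rows = 3 gives mask_y = (1 << 24) - 1, keeping the top
+  // three 8-bit mask rows, and mask_uv = (1 << 8) - 1, keeping the top two
+  // rows of the 4-bit-per-row uv grid.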
+
+ // Remove values completely outside our border.
+ for (i = 0; i < TX_32X32; i++) {
+ lfm->left_y[i] &= mask_y;
+ lfm->above_y[i] &= mask_y;
+ lfm->left_uv[i] &= mask_uv;
+ lfm->above_uv[i] &= mask_uv;
+ }
+ lfm->int_4x4_y &= mask_y;
+ lfm->above_int_4x4_uv = lfm->left_int_4x4_uv & mask_uv;
+
+  // We don't apply a wide loop filter on the last uv block row. If set,
+  // apply the shorter one instead.
+ if (rows == 1) {
+ lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
+ lfm->above_uv[TX_16X16] = 0;
+ }
+ if (rows == 5) {
+ lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
+ lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
+ }
+ } else {
+ lfm->above_int_4x4_uv = lfm->left_int_4x4_uv;
+ }
+
+ if (mi_col + MAX_MIB_SIZE > cm->mi_cols) {
+ const uint64_t columns = cm->mi_cols - mi_col;
+
+  // Each pixel inside the border gets a 1; the multiply replicates the
+  // single-column mask into every row where we need it.
+ const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101ULL;
+ const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
+
+  // Internal edges are not applied on the last column of the image, so we
+  // mask one more column for the internal edges.
+ const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
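+  // For example, columns = 4 gives mask_y = 0x0f0f0f0f0f0f0f0fULL (columns
+  // 0..3 of every row), mask_uv = 3 * 0x1111 = 0x3333 (uv columns 0..1), and
+  // mask_uv_int = 0x3333 as well.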
+
+ // Remove the bits outside the image edge.
+ for (i = 0; i < TX_32X32; i++) {
+ lfm->left_y[i] &= mask_y;
+ lfm->above_y[i] &= mask_y;
+ lfm->left_uv[i] &= mask_uv;
+ lfm->above_uv[i] &= mask_uv;
+ }
+ lfm->int_4x4_y &= mask_y;
+ lfm->left_int_4x4_uv &= mask_uv_int;
+
+  // We don't apply a wide loop filter on the last uv column. If set,
+  // apply the shorter one instead.
+ if (columns == 1) {
+ lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
+ lfm->left_uv[TX_16X16] = 0;
+ }
+ if (columns == 5) {
+ lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
+ lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
+ }
+ }
+  // We don't apply a loop filter on the first column in the image, so mask
+  // that out.
+ if (mi_col == 0) {
+ for (i = 0; i < TX_32X32; i++) {
+ lfm->left_y[i] &= 0xfefefefefefefefeULL;
+ lfm->left_uv[i] &= 0xeeee;
+ }
+ }
+
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ if (av1_disable_loopfilter_on_tile_boundary(cm)) {
+ update_tile_boundary_filter_mask(cm, mi_row, mi_col, lfm);
+ }
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+
+ // Assert if we try to apply 2 different loop filters at the same position.
+ assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
+ assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
+ assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
+ assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
+ assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_8X8]));
+ assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
+ assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
+ assert(!(lfm->left_int_4x4_uv & lfm->left_uv[TX_16X16]));
+ assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
+ assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
+ assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
+ assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
+ assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
+ assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
+ assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
+ assert(!(lfm->above_int_4x4_uv & lfm->above_uv[TX_16X16]));
+}
+
+static void filter_selectively_vert(
+ uint8_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8,
+ unsigned int mask_4x4, unsigned int mask_4x4_int,
+ const loop_filter_info_n *lfi_n, const uint8_t *lfl) {
+ unsigned int mask;
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask;
+ mask >>= 1) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ aom_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ } else if (mask_8x8 & 1) {
+ aom_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ } else if (mask_4x4 & 1) {
+ aom_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ }
+ }
+ if (mask_4x4_int & 1)
+ aom_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ s += 8;
+ lfl += 1;
+ mask_16x16 >>= 1;
+ mask_8x8 >>= 1;
+ mask_4x4 >>= 1;
+ mask_4x4_int >>= 1;
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+static void highbd_filter_selectively_vert(
+ uint16_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8,
+ unsigned int mask_4x4, unsigned int mask_4x4_int,
+ const loop_filter_info_n *lfi_n, const uint8_t *lfl, int bd) {
+ unsigned int mask;
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask;
+ mask >>= 1) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ aom_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+ bd);
+ } else if (mask_8x8 & 1) {
+ aom_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+ bd);
+ } else if (mask_4x4 & 1) {
+ aom_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+ bd);
+ }
+ }
+ if (mask_4x4_int & 1)
+ aom_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, bd);
+ s += 8;
+ lfl += 1;
+ mask_16x16 >>= 1;
+ mask_8x8 >>= 1;
+ mask_4x4 >>= 1;
+ mask_4x4_int >>= 1;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+typedef struct {
+ unsigned int m16x16;
+ unsigned int m8x8;
+ unsigned int m4x4;
+} FilterMasks;
+
+// Get filter level and masks for the given row index 'idx_r'. (Only used for
+// the non420 case).
+// Note: 'row_masks_ptr' and/or 'col_masks_ptr' can be passed NULL.
+static void get_filter_level_and_masks_non420(
+ AV1_COMMON *const cm, const struct macroblockd_plane *const plane,
+ MODE_INFO **mib, int mi_row, int mi_col, int idx_r, uint8_t *const lfl_r,
+ unsigned int *const mask_4x4_int_r, FilterMasks *const row_masks_ptr,
+ FilterMasks *const col_masks_ptr) {
+ const int ss_x = plane->subsampling_x;
+ const int ss_y = plane->subsampling_y;
+ const int col_step = mi_size_wide[BLOCK_8X8] << ss_x;
+ FilterMasks row_masks, col_masks;
+ memset(&row_masks, 0, sizeof(row_masks));
+ memset(&col_masks, 0, sizeof(col_masks));
+ *mask_4x4_int_r = 0;
+ const int r = idx_r >> mi_height_log2_lookup[BLOCK_8X8];
+
+ // Determine the vertical edges that need filtering
+ int idx_c;
+ for (idx_c = 0; idx_c < cm->mib_size && mi_col + idx_c < cm->mi_cols;
+ idx_c += col_step) {
+ const MODE_INFO *mi = mib[idx_r * cm->mi_stride + idx_c];
+ const MB_MODE_INFO *mbmi = &mi[0].mbmi;
+ const BLOCK_SIZE sb_type = mbmi->sb_type;
+ const int skip_this = mbmi->skip && is_inter_block(mbmi);
+ // Map index to 8x8 unit
+ const int c = idx_c >> mi_width_log2_lookup[BLOCK_8X8];
+
+ const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1);
+ const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1);
+
+ // left edge of current unit is block/partition edge -> no skip
+ const int block_edge_left =
+ (num_4x4_blocks_wide_lookup[sb_type] > 1) ? !blk_col : 1;
+ const int skip_this_c = skip_this && !block_edge_left;
+ // top edge of current unit is block/partition edge -> no skip
+ const int block_edge_above =
+ (num_4x4_blocks_high_lookup[sb_type] > 1) ? !blk_row : 1;
+ const int skip_this_r = skip_this && !block_edge_above;
+
+#if CONFIG_VAR_TX
+ const TX_SIZE mb_tx_size = mbmi->inter_tx_size[blk_row][blk_col];
+#endif
+
+ TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
+ ? get_uv_tx_size(mbmi, plane)
+ : mbmi->tx_size;
+
+ const int skip_border_4x4_c =
+ ss_x && mi_col + idx_c >= cm->mi_cols - mi_size_wide[BLOCK_8X8];
+ const int skip_border_4x4_r =
+ ss_y && mi_row + idx_r >= cm->mi_rows - mi_size_high[BLOCK_8X8];
+
+ TX_SIZE tx_size_c = txsize_horz_map[tx_size];
+ TX_SIZE tx_size_r = txsize_vert_map[tx_size];
+
+ int tx_size_mask = 0;
+ const int c_step = (c >> ss_x);
+ const int r_step = (r >> ss_y);
+ const int col_mask = 1 << c_step;
+
+#if CONFIG_VAR_TX
+ if (is_inter_block(mbmi) && !mbmi->skip) {
+ tx_size = (plane->plane_type == PLANE_TYPE_UV)
+ ? uv_txsize_lookup[sb_type][mb_tx_size][ss_x][ss_y]
+ : mb_tx_size;
+ }
+#endif
+
+// Filter level can vary per MI
+#if CONFIG_EXT_DELTA_Q
+ if (!(lfl_r[c_step] = get_filter_level(cm, &cm->lf_info, mbmi))) continue;
+#else
+ if (!(lfl_r[c_step] = get_filter_level(&cm->lf_info, mbmi))) continue;
+#endif
+
+#if CONFIG_VAR_TX
+ tx_size_r = AOMMIN(tx_size, cm->above_txfm_context[mi_col + c]);
+ tx_size_c =
+ AOMMIN(tx_size, cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]);
+
+ cm->above_txfm_context[mi_col + c] = tx_size;
+ cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] = tx_size;
+#endif // CONFIG_VAR_TX
+
+ if (tx_size_c == TX_32X32)
+ tx_size_mask = 3;
+ else if (tx_size_c == TX_16X16)
+ tx_size_mask = 1;
+ else
+ tx_size_mask = 0;
+
+ // Build masks based on the transform size of each block
+ // handle vertical mask
+ if (tx_size_c == TX_32X32) {
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+ if (!skip_border_4x4_c)
+ col_masks.m16x16 |= col_mask;
+ else
+ col_masks.m8x8 |= col_mask;
+ }
+ } else if (tx_size_c == TX_16X16) {
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+ if (!skip_border_4x4_c)
+ col_masks.m16x16 |= col_mask;
+ else
+ col_masks.m8x8 |= col_mask;
+ }
+ } else {
+ // force 8x8 filtering on 32x32 boundaries
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+ if (tx_size_c == TX_8X8 || ((c >> ss_x) & 3) == 0)
+ col_masks.m8x8 |= col_mask;
+ else
+ col_masks.m4x4 |= col_mask;
+ }
+
+ if (!skip_this && tx_size_c < TX_8X8 && !skip_border_4x4_c &&
+ (c_step & tx_size_mask) == 0)
+ *mask_4x4_int_r |= col_mask;
+ }
+
+ if (tx_size_r == TX_32X32)
+ tx_size_mask = 3;
+ else if (tx_size_r == TX_16X16)
+ tx_size_mask = 1;
+ else
+ tx_size_mask = 0;
+
+ // set horizontal mask
+ if (tx_size_r == TX_32X32) {
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+ if (!skip_border_4x4_r)
+ row_masks.m16x16 |= col_mask;
+ else
+ row_masks.m8x8 |= col_mask;
+ }
+ } else if (tx_size_r == TX_16X16) {
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+ if (!skip_border_4x4_r)
+ row_masks.m16x16 |= col_mask;
+ else
+ row_masks.m8x8 |= col_mask;
+ }
+ } else {
+ // force 8x8 filtering on 32x32 boundaries
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+ if (tx_size_r == TX_8X8 || (r_step & 3) == 0)
+ row_masks.m8x8 |= col_mask;
+ else
+ row_masks.m4x4 |= col_mask;
+ }
+
+ if (!skip_this && tx_size_r < TX_8X8 && !skip_border_4x4_c &&
+ ((r >> ss_y) & tx_size_mask) == 0)
+ *mask_4x4_int_r |= col_mask;
+ }
+ }
+
+ if (row_masks_ptr) *row_masks_ptr = row_masks;
+ if (col_masks_ptr) *col_masks_ptr = col_masks;
+}
+
+void av1_filter_block_plane_non420_ver(AV1_COMMON *const cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mib, int mi_row,
+ int mi_col) {
+ const int ss_y = plane->subsampling_y;
+ const int row_step = mi_size_high[BLOCK_8X8] << ss_y;
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE] = { { 0 } };
+
+ int idx_r;
+ for (idx_r = 0; idx_r < cm->mib_size && mi_row + idx_r < cm->mi_rows;
+ idx_r += row_step) {
+ unsigned int mask_4x4_int;
+ FilterMasks col_masks;
+ const int r = idx_r >> mi_height_log2_lookup[BLOCK_8X8];
+ get_filter_level_and_masks_non420(cm, plane, mib, mi_row, mi_col, idx_r,
+ &lfl[r][0], &mask_4x4_int, NULL,
+ &col_masks);
+
+ // Disable filtering on the leftmost column or tile boundary
+ unsigned int border_mask = ~(mi_col == 0);
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ if (av1_disable_loopfilter_on_tile_boundary(cm) &&
+ ((mib[0]->mbmi.boundary_info & TILE_LEFT_BOUNDARY) != 0)) {
+ border_mask = 0xfffffffe;
+ }
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ highbd_filter_selectively_vert(
+ CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+ col_masks.m16x16 & border_mask, col_masks.m8x8 & border_mask,
+ col_masks.m4x4 & border_mask, mask_4x4_int, &cm->lf_info, &lfl[r][0],
+ (int)cm->bit_depth);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ filter_selectively_vert(
+ dst->buf, dst->stride, col_masks.m16x16 & border_mask,
+ col_masks.m8x8 & border_mask, col_masks.m4x4 & border_mask,
+ mask_4x4_int, &cm->lf_info, &lfl[r][0]);
+ dst->buf += 8 * dst->stride;
+ }
+
+  // Restore the buffer pointer; the horizontal pass is done separately.
+  dst->buf = dst0;
+}
+
+void av1_filter_block_plane_non420_hor(AV1_COMMON *const cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mib, int mi_row,
+ int mi_col) {
+ const int ss_y = plane->subsampling_y;
+ const int row_step = mi_size_high[BLOCK_8X8] << ss_y;
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ FilterMasks row_masks_array[MAX_MIB_SIZE];
+ unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 };
+ uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE] = { { 0 } };
+ int idx_r;
+ for (idx_r = 0; idx_r < cm->mib_size && mi_row + idx_r < cm->mi_rows;
+ idx_r += row_step) {
+ const int r = idx_r >> mi_height_log2_lookup[BLOCK_8X8];
+ get_filter_level_and_masks_non420(cm, plane, mib, mi_row, mi_col, idx_r,
+ &lfl[r][0], mask_4x4_int + r,
+ row_masks_array + r, NULL);
+ }
+ for (idx_r = 0; idx_r < cm->mib_size && mi_row + idx_r < cm->mi_rows;
+ idx_r += row_step) {
+ const int skip_border_4x4_r =
+        ss_y && mi_row + idx_r >= cm->mi_rows - mi_size_high[BLOCK_8X8];
+    const int r = idx_r >> mi_height_log2_lookup[BLOCK_8X8];
+ const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
+ FilterMasks row_masks;
+
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+    // Disable filtering on the topmost row or at a tile boundary
+ const MODE_INFO *mi = cm->mi + (mi_row + r) * cm->mi_stride;
+ if ((av1_disable_loopfilter_on_tile_boundary(cm) &&
+ (mi->mbmi.boundary_info & TILE_ABOVE_BOUNDARY)) ||
+ (mi_row + idx_r == 0)) {
+ memset(&row_masks, 0, sizeof(row_masks));
+#else
+ if (mi_row + idx_r == 0) {
+ memset(&row_masks, 0, sizeof(row_masks));
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+ } else {
+ memcpy(&row_masks, row_masks_array + r, sizeof(row_masks));
+ }
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ highbd_filter_selectively_horiz(
+ CONVERT_TO_SHORTPTR(dst->buf), dst->stride, row_masks.m16x16,
+ row_masks.m8x8, row_masks.m4x4, mask_4x4_int_r, &cm->lf_info,
+ &lfl[r][0], (int)cm->bit_depth);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ filter_selectively_horiz(dst->buf, dst->stride, row_masks.m16x16,
+ row_masks.m8x8, row_masks.m4x4, mask_4x4_int_r,
+ &cm->lf_info, &lfl[r][0]);
+ dst->buf += 8 * dst->stride;
+ }
+ dst->buf = dst0;
+}
+
+void av1_filter_block_plane_ss00_ver(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ int r;
+ uint64_t mask_16x16 = lfm->left_y[TX_16X16];
+ uint64_t mask_8x8 = lfm->left_y[TX_8X8];
+ uint64_t mask_4x4 = lfm->left_y[TX_4X4];
+ uint64_t mask_4x4_int = lfm->int_4x4_y;
+
+ assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
+
+  // Vertical pass: filter two rows at a time
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
+ unsigned int mask_16x16_l = mask_16x16 & 0xffff;
+ unsigned int mask_8x8_l = mask_8x8 & 0xffff;
+ unsigned int mask_4x4_l = mask_4x4 & 0xffff;
+ unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
+
+// Disable filtering on the leftmost column.
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ highbd_filter_selectively_vert_row2(
+ plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+ mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+ &lfm->lfl_y[r][0], (int)cm->bit_depth);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ filter_selectively_vert_row2(
+ plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
+ mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r][0]);
+
+ dst->buf += 2 * MI_SIZE * dst->stride;
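+    // Each iteration consumes two 8-pixel rows: with one mask bit per 8x8
+    // block and eight blocks per mask row, that is 2 * MI_SIZE == 16 bits of
+    // each row-major 64-bit mask (assuming MI_SIZE == 8, as in this build).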
+ mask_16x16 >>= 2 * MI_SIZE;
+ mask_8x8 >>= 2 * MI_SIZE;
+ mask_4x4 >>= 2 * MI_SIZE;
+ mask_4x4_int >>= 2 * MI_SIZE;
+ }
+
+  // Restore the buf pointer for the horizontal pass, which runs separately.
+  dst->buf = dst0;
+}
+
+void av1_filter_block_plane_ss00_hor(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ int r;
+ uint64_t mask_16x16 = lfm->above_y[TX_16X16];
+ uint64_t mask_8x8 = lfm->above_y[TX_8X8];
+ uint64_t mask_4x4 = lfm->above_y[TX_4X4];
+ uint64_t mask_4x4_int = lfm->int_4x4_y;
+
+ assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
+
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r++) {
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
+
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16 & 0xff;
+ mask_8x8_r = mask_8x8 & 0xff;
+ mask_4x4_r = mask_4x4 & 0xff;
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ highbd_filter_selectively_horiz(
+ CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r][0],
+ (int)cm->bit_depth);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
+ &lfm->lfl_y[r][0]);
+
+ dst->buf += MI_SIZE * dst->stride;
+ mask_16x16 >>= MI_SIZE;
+ mask_8x8 >>= MI_SIZE;
+ mask_4x4 >>= MI_SIZE;
+ mask_4x4_int >>= MI_SIZE;
+ }
+  // Restore the buf pointer in case there is an additional filter pass.
+ dst->buf = dst0;
+}
+
+void av1_filter_block_plane_ss11_ver(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ int r, c;
+
+ uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
+ uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
+ uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
+ uint16_t mask_4x4_int = lfm->left_int_4x4_uv;
+
+ assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+ assert(plane->plane_type == PLANE_TYPE_UV);
+ memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv));
+
+  // Vertical pass: filter two rows at a time
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
+ for (c = 0; c < (cm->mib_size >> 1); c++) {
+ lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1];
+ lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1];
+ }
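+      // Chroma is subsampled by 2 in each direction here, so lfl_uv is
+      // indexed at half resolution and reuses the filter level of every
+      // other luma row and column.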
+
+ {
+ unsigned int mask_16x16_l = mask_16x16 & 0xff;
+ unsigned int mask_8x8_l = mask_8x8 & 0xff;
+ unsigned int mask_4x4_l = mask_4x4 & 0xff;
+ unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
+
+// Disable filtering on the leftmost column.
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ highbd_filter_selectively_vert_row2(
+ plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+ mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+ &lfm->lfl_uv[r >> 1][0], (int)cm->bit_depth);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ filter_selectively_vert_row2(plane->subsampling_x, dst->buf,
+ dst->stride, mask_16x16_l, mask_8x8_l,
+ mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+ &lfm->lfl_uv[r >> 1][0]);
+
+ dst->buf += 2 * MI_SIZE * dst->stride;
+ mask_16x16 >>= MI_SIZE;
+ mask_8x8 >>= MI_SIZE;
+ mask_4x4 >>= MI_SIZE;
+ mask_4x4_int >>= MI_SIZE;
+ }
+ }
+
+  // Restore the buf pointer for the horizontal pass, which runs separately.
+  dst->buf = dst0;
+}
+
+void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ int r, c;
+ uint64_t mask_16x16 = lfm->above_uv[TX_16X16];
+ uint64_t mask_8x8 = lfm->above_uv[TX_8X8];
+ uint64_t mask_4x4 = lfm->above_uv[TX_4X4];
+ uint64_t mask_4x4_int = lfm->above_int_4x4_uv;
+
+ assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+ memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv));
+
+  // Re-populate the filter levels for the uv planes, mirroring the code for
+  // the vertical filter in av1_filter_block_plane_ss11_ver.
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
+ for (c = 0; c < (cm->mib_size >> 1); c++) {
+ lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1];
+ lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1];
+ }
+ }
+
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
+ const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
+ const unsigned int mask_4x4_int_r =
+ skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
+
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16 & 0xf;
+ mask_8x8_r = mask_8x8 & 0xf;
+ mask_4x4_r = mask_4x4 & 0xf;
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ highbd_filter_selectively_horiz(
+ CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int_r, &cm->lf_info, &lfm->lfl_uv[r >> 1][0],
+ (int)cm->bit_depth);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+ &lfm->lfl_uv[r >> 1][0]);
+
+ dst->buf += MI_SIZE * dst->stride;
+ mask_16x16 >>= MI_SIZE / 2;
+ mask_8x8 >>= MI_SIZE / 2;
+ mask_4x4 >>= MI_SIZE / 2;
+ mask_4x4_int >>= MI_SIZE / 2;
+ }
+  // Restore the buf pointer in case there is an additional filter pass.
+ dst->buf = dst0;
+}
+
+#if !(CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \
+ CONFIG_CB4X4)
+#if CONFIG_PARALLEL_DEBLOCKING
+typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;
+static const uint32_t av1_prediction_masks[NUM_EDGE_DIRS][BLOCK_SIZES] = {
+ // mask for vertical edges filtering
+ {
+#if CONFIG_CB4X4
+ 2 - 1, // BLOCK_2X2
+ 2 - 1, // BLOCK_2X4
+ 4 - 1, // BLOCK_4X2
+#endif // CONFIG_CB4X4
+ 4 - 1, // BLOCK_4X4
+ 4 - 1, // BLOCK_4X8
+ 8 - 1, // BLOCK_8X4
+ 8 - 1, // BLOCK_8X8
+ 8 - 1, // BLOCK_8X16
+ 16 - 1, // BLOCK_16X8
+ 16 - 1, // BLOCK_16X16
+ 16 - 1, // BLOCK_16X32
+ 32 - 1, // BLOCK_32X16
+ 32 - 1, // BLOCK_32X32
+ 32 - 1, // BLOCK_32X64
+ 64 - 1, // BLOCK_64X32
+ 64 - 1, // BLOCK_64X64
+#if CONFIG_EXT_PARTITION
+ 64 - 1, // BLOCK_64X128
+ 128 - 1, // BLOCK_128X64
+ 128 - 1 // BLOCK_128X128
+#endif // CONFIG_EXT_PARTITION
+ },
+ // mask for horizontal edges filtering
+ {
+#if CONFIG_CB4X4
+ 2 - 1, // BLOCK_2X2
+ 4 - 1, // BLOCK_2X4
+ 2 - 1, // BLOCK_4X2
+#endif // CONFIG_CB4X4
+ 4 - 1, // BLOCK_4X4
+ 8 - 1, // BLOCK_4X8
+ 4 - 1, // BLOCK_8X4
+ 8 - 1, // BLOCK_8X8
+ 16 - 1, // BLOCK_8X16
+ 8 - 1, // BLOCK_16X8
+ 16 - 1, // BLOCK_16X16
+ 32 - 1, // BLOCK_16X32
+ 16 - 1, // BLOCK_32X16
+ 32 - 1, // BLOCK_32X32
+ 64 - 1, // BLOCK_32X64
+ 32 - 1, // BLOCK_64X32
+ 64 - 1, // BLOCK_64X64
+#if CONFIG_EXT_PARTITION
+ 128 - 1, // BLOCK_64X128
+ 64 - 1, // BLOCK_128X64
+ 128 - 1 // BLOCK_128X128
+#endif // CONFIG_EXT_PARTITION
+ },
+};
+
+static const uint32_t av1_transform_masks[NUM_EDGE_DIRS][TX_SIZES_ALL] = {
+ {
+#if CONFIG_CB4X4
+ 2 - 1, // TX_2X2
+#endif
+ 4 - 1, // TX_4X4
+ 8 - 1, // TX_8X8
+ 16 - 1, // TX_16X16
+ 32 - 1, // TX_32X32
+#if CONFIG_TX64X64
+ 64 - 1, // TX_64X64
+#endif // CONFIG_TX64X64
+ 4 - 1, // TX_4X8
+ 8 - 1, // TX_8X4
+ 8 - 1, // TX_8X16
+ 16 - 1, // TX_16X8
+ 16 - 1, // TX_16X32
+ 32 - 1, // TX_32X16
+ 4 - 1, // TX_4X16
+ 16 - 1, // TX_16X4
+ 8 - 1, // TX_8X32
+ 32 - 1 // TX_32X8
+ },
+ {
+#if CONFIG_CB4X4
+ 2 - 1, // TX_2X2
+#endif
+ 4 - 1, // TX_4X4
+ 8 - 1, // TX_8X8
+ 16 - 1, // TX_16X16
+ 32 - 1, // TX_32X32
+#if CONFIG_TX64X64
+ 64 - 1, // TX_64X64
+#endif // CONFIG_TX64X64
+ 8 - 1, // TX_4X8
+ 4 - 1, // TX_8X4
+ 16 - 1, // TX_8X16
+ 8 - 1, // TX_16X8
+ 32 - 1, // TX_16X32
+ 16 - 1, // TX_32X16
+ 16 - 1, // TX_4X16
+ 4 - 1, // TX_16X4
+ 32 - 1, // TX_8X32
+ 8 - 1 // TX_32X8
+ }
+};
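+
+// These masks turn edge detection into a single bit test: a pixel coordinate
+// lies on a block or transform edge exactly when (coord & mask) == 0. For
+// example, the vertical-edge mask for TX_16X16 is 16 - 1 == 15, so x values
+// of 0, 16, 32, ... mark transform-unit boundaries. The masks are consumed
+// by set_lpf_parameters() below.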
+
+static TX_SIZE av1_get_transform_size(const MODE_INFO *const pCurr,
+ const EDGE_DIR edgeDir,
+ const uint32_t scaleHorz,
+ const uint32_t scaleVert) {
+ const BLOCK_SIZE bs = pCurr->mbmi.sb_type;
+ TX_SIZE txSize;
+  // For chroma or non-square transforms, the transform size must be
+  // converted to the transform size along the direction being filtered.
+ txSize = uv_txsize_lookup[bs][pCurr->mbmi.tx_size][scaleHorz][scaleVert];
+ if (VERT_EDGE == edgeDir) {
+ txSize = txsize_horz_map[txSize];
+ } else {
+ txSize = txsize_vert_map[txSize];
+ }
+ return txSize;
+}
+
+typedef struct AV1_DEBLOCKING_PARAMETERS {
+ // length of the filter applied to the outer edge
+ uint32_t filterLength;
+ // length of the filter applied to the inner edge
+ uint32_t filterLengthInternal;
+ // deblocking limits
+ const uint8_t *lim;
+ const uint8_t *mblim;
+ const uint8_t *hev_thr;
+} AV1_DEBLOCKING_PARAMETERS;
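+
+// A filterLength of 4, 8, or 16 selects the corresponding aom_lpf_* kernel
+// in av1_filter_block_plane_vert()/_horz() below; 0 leaves the edge
+// unfiltered.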
+
+static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams,
+ const MODE_INFO **const ppCurr,
+ const ptrdiff_t modeStep,
+ const AV1_COMMON *const cm,
+ const EDGE_DIR edgeDir, const uint32_t x,
+ const uint32_t y, const uint32_t width,
+ const uint32_t height, const uint32_t scaleHorz,
+ const uint32_t scaleVert) {
+ // reset to initial values
+ pParams->filterLength = 0;
+ pParams->filterLengthInternal = 0;
+  // No deblocking is required when the position is outside the plane.
+ if ((width <= x) || (height <= y)) {
+ return;
+ }
+#if CONFIG_EXT_PARTITION
+  // Unclear whether changes are required here for extended partitions.
+ assert(0 && "Not yet updated");
+#endif // CONFIG_EXT_PARTITION
+ {
+ const TX_SIZE ts =
+ av1_get_transform_size(ppCurr[0], edgeDir, scaleHorz, scaleVert);
+ const uint32_t currLevel = get_filter_level(&cm->lf_info, &ppCurr[0]->mbmi);
+ const int currSkipped =
+ ppCurr[0]->mbmi.skip && is_inter_block(&ppCurr[0]->mbmi);
+ const uint32_t coord = (VERT_EDGE == edgeDir) ? (x) : (y);
+ uint32_t level = currLevel;
+    // Prepare outer-edge parameters: deblock the edge only if it lies on a
+    // transform-unit (TU) boundary.
+ if (coord) {
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ if (!av1_disable_loopfilter_on_tile_boundary(cm) ||
+ ((VERT_EDGE == edgeDir) &&
+ (0 == (ppCurr[0]->mbmi.boundary_info & TILE_LEFT_BOUNDARY))) ||
+ ((HORZ_EDGE == edgeDir) &&
+ (0 == (ppCurr[0]->mbmi.boundary_info & TILE_ABOVE_BOUNDARY))))
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+ {
+ const int32_t tuEdge =
+ (coord & av1_transform_masks[edgeDir][ts]) ? (0) : (1);
+ if (tuEdge) {
+ const MODE_INFO *const pPrev = *(ppCurr - modeStep);
+ const TX_SIZE pvTs =
+ av1_get_transform_size(pPrev, edgeDir, scaleHorz, scaleVert);
+ const uint32_t pvLvl = get_filter_level(&cm->lf_info, &pPrev->mbmi);
+ const int pvSkip = pPrev->mbmi.skip && is_inter_block(&pPrev->mbmi);
+ const int32_t puEdge =
+ (coord &
+ av1_prediction_masks[edgeDir]
+ [ss_size_lookup[ppCurr[0]->mbmi.sb_type]
+ [scaleHorz][scaleVert]])
+ ? (0)
+ : (1);
+          // If both the current and the previous block are skipped, deblock
+          // the edge only if it is also a prediction-unit (PU) edge.
+ if ((currLevel || pvLvl) && (!pvSkip || !currSkipped || puEdge)) {
+#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+ const TX_SIZE minTs = AOMMIN(ts, pvTs);
+ if (TX_4X4 >= minTs) {
+ pParams->filterLength = 4;
+ } else if (TX_8X8 == minTs) {
+ pParams->filterLength = 8;
+ } else {
+ pParams->filterLength = 16;
+#if CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+ // No wide filtering for chroma plane
+ if (scaleHorz || scaleVert) {
+ pParams->filterLength = 8;
+ }
+#endif
+ }
+#else
+ pParams->filterLength = (TX_4X4 >= AOMMIN(ts, pvTs)) ? (4) : (8);
+
+#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+
+          // Use the current block's filter level; fall back to the previous
+          // block's level when the current block's level is zero.
+ level = (currLevel) ? (currLevel) : (pvLvl);
+ }
+ }
+ }
+ // prepare internal edge parameters
+ if (currLevel && !currSkipped) {
+ pParams->filterLengthInternal = (TX_4X4 >= ts) ? (4) : (0);
+ }
+ // prepare common parameters
+ if (pParams->filterLength || pParams->filterLengthInternal) {
+ const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
+ pParams->lim = limits->lim;
+ pParams->mblim = limits->mblim;
+ pParams->hev_thr = limits->hev_thr;
+ }
+ }
+ }
+}
+
+static void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
+ const MACROBLOCKD_PLANE *const pPlane,
+ const MODE_INFO **ppModeInfo,
+ const ptrdiff_t modeStride,
+ const uint32_t cuX,
+ const uint32_t cuY) {
+ const uint32_t scaleHorz = pPlane->subsampling_x;
+ const uint32_t scaleVert = pPlane->subsampling_y;
+ const uint32_t width = pPlane->dst.width;
+ const uint32_t height = pPlane->dst.height;
+ uint8_t *const pDst = pPlane->dst.buf;
+ const int dstStride = pPlane->dst.stride;
+ for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
+ uint8_t *p = pDst + y * MI_SIZE * dstStride;
+ for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
+ const MODE_INFO **const pCurr =
+ ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);
+ AV1_DEBLOCKING_PARAMETERS params;
+ memset(&params, 0, sizeof(params));
+ set_lpf_parameters(&params, pCurr, ((ptrdiff_t)1 << scaleHorz), cm,
+ VERT_EDGE, cuX + x * MI_SIZE, cuY + y * MI_SIZE, width,
+ height, scaleHorz, scaleVert);
+ switch (params.filterLength) {
+        // apply length-4 filtering
+ case 4:
+ aom_lpf_vertical_4(p, dstStride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+        // apply length-8 filtering
+ case 8:
+ aom_lpf_vertical_8(p, dstStride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+        // apply length-16 filtering
+ case 16:
+ aom_lpf_vertical_16(p, dstStride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+ // no filtering
+ default: break;
+ }
+ // process the internal edge
+ if (params.filterLengthInternal) {
+ aom_lpf_vertical_4(p + 4, dstStride, params.mblim, params.lim,
+ params.hev_thr);
+ }
+      // advance the destination pointer by one 8-pixel (MI_SIZE) column
+ p += 8;
+ }
+ }
+}
+
+static void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
+ const MACROBLOCKD_PLANE *const pPlane,
+ const MODE_INFO **ppModeInfo,
+ const ptrdiff_t modeStride,
+ const uint32_t cuX,
+ const uint32_t cuY) {
+ const uint32_t scaleHorz = pPlane->subsampling_x;
+ const uint32_t scaleVert = pPlane->subsampling_y;
+ const uint32_t width = pPlane->dst.width;
+ const uint32_t height = pPlane->dst.height;
+ uint8_t *const pDst = pPlane->dst.buf;
+ const int dstStride = pPlane->dst.stride;
+ for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
+ uint8_t *p = pDst + y * MI_SIZE * dstStride;
+ for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
+ const MODE_INFO **const pCurr =
+ ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);
+ AV1_DEBLOCKING_PARAMETERS params;
+ memset(&params, 0, sizeof(params));
+ set_lpf_parameters(&params, pCurr, (modeStride << scaleVert), cm,
+ HORZ_EDGE, cuX + x * MI_SIZE, cuY + y * MI_SIZE, width,
+ height, scaleHorz, scaleVert);
+ switch (params.filterLength) {
+        // apply length-4 filtering
+ case 4:
+ aom_lpf_horizontal_4(p, dstStride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+        // apply length-8 filtering
+ case 8:
+ aom_lpf_horizontal_8(p, dstStride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+        // apply length-16 filtering
+ case 16:
+ aom_lpf_horizontal_edge_16(p, dstStride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+ // no filtering
+ default: break;
+ }
+ // process the internal edge
+ if (params.filterLengthInternal) {
+ aom_lpf_horizontal_4(p + 4 * dstStride, dstStride, params.mblim,
+ params.lim, params.hev_thr);
+ }
+      // advance the destination pointer by one 8-pixel (MI_SIZE) column
+ p += 8;
+ }
+ }
+}
+#endif // CONFIG_PARALLEL_DEBLOCKING
+#endif
+
+void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int start, int stop, int y_only) {
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \
+ CONFIG_CB4X4
+ const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+ int mi_row, mi_col;
+
+#if CONFIG_VAR_TX
+ memset(cm->above_txfm_context, TX_SIZES, cm->mi_cols);
+#endif // CONFIG_VAR_TX
+ for (mi_row = start; mi_row < stop; mi_row += cm->mib_size) {
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+#if CONFIG_VAR_TX
+ memset(cm->left_txfm_context, TX_SIZES, MAX_MIB_SIZE);
+#endif // CONFIG_VAR_TX
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) {
+ int plane;
+
+ av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+
+ for (plane = 0; plane < num_planes; ++plane) {
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ }
+ }
+ }
+#else // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+ const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+ int mi_row, mi_col;
+#if !CONFIG_PARALLEL_DEBLOCKING
+ enum lf_path path;
+ LOOP_FILTER_MASK lfm;
+
+ if (y_only)
+ path = LF_PATH_444;
+ else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+ path = LF_PATH_420;
+ else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+ path = LF_PATH_444;
+ else
+ path = LF_PATH_SLOW;
+#endif
+#if CONFIG_PARALLEL_DEBLOCKING
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+ for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
+ const int32_t scaleHorz = planes[planeIdx].subsampling_x;
+ const int32_t scaleVert = planes[planeIdx].subsampling_y;
+ av1_filter_block_plane_vert(
+ cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
+ cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
+ (mi_row * MI_SIZE) >> scaleVert);
+ }
+ }
+ }
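+  // All vertical edges in this row range are now filtered, so the horizontal
+  // pass below has no per-superblock ordering dependency on the vertical
+  // pass.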
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+ for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
+ const int32_t scaleHorz = planes[planeIdx].subsampling_x;
+ const int32_t scaleVert = planes[planeIdx].subsampling_y;
+ av1_filter_block_plane_horz(
+ cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
+ cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
+ (mi_row * MI_SIZE) >> scaleVert);
+ }
+ }
+ }
+#else // CONFIG_PARALLEL_DEBLOCKING
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ int plane;
+
+ av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+
+      // TODO(JBB): Make setup_mask work for non-420 subsampling.
+ av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+ av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
+ av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
+ for (plane = 1; plane < num_planes; ++plane) {
+ switch (path) {
+ case LF_PATH_420:
+ av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
+ av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_444:
+ av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
+ av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_SLOW:
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+
+ break;
+ }
+ }
+ }
+ }
+#endif // CONFIG_PARALLEL_DEBLOCKING
+#endif // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+}
+
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ MACROBLOCKD *xd, int frame_filter_level, int y_only,
+ int partial_frame) {
+ int start_mi_row, end_mi_row, mi_rows_to_filter;
+#if CONFIG_EXT_DELTA_Q
+ int orig_filter_level = cm->lf.filter_level;
+#endif
+ if (!frame_filter_level) return;
+ start_mi_row = 0;
+ mi_rows_to_filter = cm->mi_rows;
+ if (partial_frame && cm->mi_rows > 8) {
+ start_mi_row = cm->mi_rows >> 1;
+ start_mi_row &= 0xfffffff8;
+ mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
+ }
+ end_mi_row = start_mi_row + mi_rows_to_filter;
+ av1_loop_filter_frame_init(cm, frame_filter_level);
+#if CONFIG_EXT_DELTA_Q
+ cm->lf.filter_level = frame_filter_level;
+#endif
+ av1_loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only);
+#if CONFIG_EXT_DELTA_Q
+ cm->lf.filter_level = orig_filter_level;
+#endif
+}
+
+void av1_loop_filter_data_reset(
+ LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+ struct AV1Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) {
+ lf_data->frame_buffer = frame_buffer;
+ lf_data->cm = cm;
+ lf_data->start = 0;
+ lf_data->stop = 0;
+ lf_data->y_only = 0;
+ memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
+}
+
+int av1_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
+ (void)unused;
+ av1_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+ lf_data->start, lf_data->stop, lf_data->y_only);
+ return 1;
+}
diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h
new file mode 100644
index 000000000..8ac5d99e6
--- /dev/null
+++ b/third_party/aom/av1/common/av1_loopfilter.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_LOOPFILTER_H_
+#define AV1_COMMON_LOOPFILTER_H_
+
+#include "aom_ports/mem.h"
+#include "./aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/seg_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LOOP_FILTER 63
+#define MAX_SHARPNESS 7
+
+#define SIMD_WIDTH 16
+
+#define MAX_MODE_LF_DELTAS 2
+
+enum lf_path {
+ LF_PATH_420,
+ LF_PATH_444,
+ LF_PATH_SLOW,
+};
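+
+// LF_PATH_420 and LF_PATH_444 select the fast mask-based filters for 4:2:0
+// and 4:4:4 subsampling respectively; LF_PATH_SLOW falls back to the generic
+// non-420 path (see the path selection in av1_loop_filter_rows()).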
+
+struct loopfilter {
+ int filter_level;
+
+ int sharpness_level;
+ int last_sharpness_level;
+
+ uint8_t mode_ref_delta_enabled;
+ uint8_t mode_ref_delta_update;
+
+ // 0 = Intra, Last, Last2+Last3(CONFIG_EXT_REFS),
+ // GF, BRF(CONFIG_EXT_REFS), ARF
+ signed char ref_deltas[TOTAL_REFS_PER_FRAME];
+ signed char last_ref_deltas[TOTAL_REFS_PER_FRAME];
+
+ // 0 = ZERO_MV, MV
+ signed char mode_deltas[MAX_MODE_LF_DELTAS];
+ signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
+};
+
+// This structure needs to be aligned so that, when it is declared and
+// passed, it can be loaded into vector registers.
+typedef struct {
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+
+typedef struct {
+ loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
+ uint8_t lvl[MAX_SEGMENTS][TOTAL_REFS_PER_FRAME][MAX_MODE_LF_DELTAS];
+} loop_filter_info_n;
+
+// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
+// Each set bit marks a position at which the loop filter should be applied.
+// left_ entries indicate whether to apply a filter on the border to the
+// left of the block; above_ entries indicate whether to apply one on the
+// border above it. int_ entries indicate whether to filter the 4x4 edges
+// internal to the 8x8 block that each bit represents.
+// Since each transform is accompanied by a potentially different type of
+// loop filter, there is a separate entry in the array for each transform
+// size.
+typedef struct {
+ uint64_t left_y[TX_SIZES];
+ uint64_t above_y[TX_SIZES];
+ uint64_t int_4x4_y;
+ uint16_t left_uv[TX_SIZES];
+ uint16_t above_uv[TX_SIZES];
+ uint16_t left_int_4x4_uv;
+ uint16_t above_int_4x4_uv;
+ uint8_t lfl_y[MAX_MIB_SIZE][MAX_MIB_SIZE];
+ uint8_t lfl_uv[MAX_MIB_SIZE / 2][MAX_MIB_SIZE / 2];
+} LOOP_FILTER_MASK;
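+
+// The 64-bit y masks hold one bit per 8x8 block in row-major order, so bit
+// (r * 8 + c) covers the block at row r, column c of the 64x64 region. The
+// 16-bit uv masks cover the corresponding 32x32 subsampled chroma region at
+// one bit per 8x8 chroma block in the same order (a layout implied by the
+// mask shifts in av1_loopfilter.c).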
+
+/* Assorted loop filter functions that are used elsewhere. */
+struct AV1Common;
+struct macroblockd;
+struct AV1LfSyncData;
+
+// This function sets up the bit masks for the entire 64x64 region represented
+// by mi_row, mi_col.
+void av1_setup_mask(struct AV1Common *const cm, const int mi_row,
+ const int mi_col, MODE_INFO **mi_8x8,
+ const int mode_info_stride, LOOP_FILTER_MASK *lfm);
+
+void av1_filter_block_plane_ss00_ver(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss00_hor(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss11_ver(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss11_hor(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
+
+void av1_filter_block_plane_non420_ver(struct AV1Common *const cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mi_8x8, int mi_row,
+ int mi_col);
+void av1_filter_block_plane_non420_hor(struct AV1Common *const cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mi_8x8, int mi_row,
+ int mi_col);
+
+void av1_loop_filter_init(struct AV1Common *cm);
+
+// Update the loop filter for the current frame.
+// This should be called before av1_loop_filter_rows();
+// av1_loop_filter_frame() calls this function directly.
+void av1_loop_filter_frame_init(struct AV1Common *cm, int default_filt_lvl);
+
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ struct macroblockd *mbd, int filter_level,
+ int y_only, int partial_frame);
+
+// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
+void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
+ struct AV1Common *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int start, int stop, int y_only);
+
+typedef struct LoopFilterWorkerData {
+ YV12_BUFFER_CONFIG *frame_buffer;
+ struct AV1Common *cm;
+ struct macroblockd_plane planes[MAX_MB_PLANE];
+
+ int start;
+ int stop;
+ int y_only;
+} LFWorkerData;
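+
+// A minimal usage sketch for a single worker (error handling omitted; the
+// start/stop values are illustrative):
+//   LFWorkerData lf_data;
+//   av1_loop_filter_data_reset(&lf_data, frame_buffer, cm, planes);
+//   lf_data.start = 0;
+//   lf_data.stop = cm->mi_rows;
+//   av1_loop_filter_worker(&lf_data, NULL);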
+
+void av1_loop_filter_data_reset(
+ LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+ struct AV1Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
+
+// Operates on the rows described by 'lf_data'.
+int av1_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_LOOPFILTER_H_
diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c
new file mode 100644
index 000000000..f9ccd1979
--- /dev/null
+++ b/third_party/aom/av1/common/av1_rtcd.c
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "./aom_config.h"
+#define RTCD_C
+#include "./av1_rtcd.h"
+#include "aom_ports/aom_once.h"
+
+void av1_rtcd() {
+  // TODO(JBB): Remove this once() call by ensuring that both the encoder and
+  // decoder setup functions are protected by once();
+ once(setup_rtcd_internal);
+}
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
new file mode 100755
index 000000000..1dca10c52
--- /dev/null
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -0,0 +1,644 @@
+sub av1_common_forward_decls() {
+print <<EOF
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct aom_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+typedef uint16_t od_dering_in;
+EOF
+}
+forward_decls qw/av1_common_forward_decls/;
+
+# Functions that are 64-bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64 = 'mmx';
+ $sse2_x86_64 = 'sse2';
+ $ssse3_x86_64 = 'ssse3';
+ $avx_x86_64 = 'avx';
+ $avx2_x86_64 = 'avx2';
+}
+
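+# (add_proto declares a function's C prototype for the runtime CPU dispatch
+# table; specialize lists the SIMD variants that setup_rtcd_internal() may
+# select at run time.)
+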
+#
+# 10/12-tap convolution filters
+#
+add_proto qw/void av1_lowbd_convolve_init/, "void";
+specialize qw/av1_lowbd_convolve_init ssse3/;
+
+add_proto qw/void av1_convolve_horiz/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params";
+specialize qw/av1_convolve_horiz ssse3/;
+
+add_proto qw/void av1_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params";
+specialize qw/av1_convolve_vert ssse3/;
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_convolve_init/, "void";
+ specialize qw/av1_highbd_convolve_init sse4_1/;
+ add_proto qw/void av1_highbd_convolve_horiz/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
+ specialize qw/av1_highbd_convolve_horiz sse4_1/;
+ add_proto qw/void av1_highbd_convolve_vert/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
+ specialize qw/av1_highbd_convolve_vert sse4_1/;
+}
+
+#
+# Inverse dct
+#
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ {
+ add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht4x4_16_add sse2/;
+
+ add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht4x8_32_add sse2/;
+
+ add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x4_32_add sse2/;
+
+ add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x16_128_add sse2/;
+
+ add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x8_128_add sse2/;
+
+ add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x32_512_add sse2/;
+
+ add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht32x16_512_add sse2/;
+
+ add_proto qw/void av1_iht4x16_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+
+ add_proto qw/void av1_iht16x4_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+
+ add_proto qw/void av1_iht8x32_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+
+ add_proto qw/void av1_iht32x8_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+
+ add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x8_64_add sse2/;
+
+ add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/av1_iht16x16_256_add sse2 avx2/;
+
+ add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ }
+} else {
+ {
+ add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht4x4_16_add sse2 neon dspr2/;
+
+ add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht4x8_32_add sse2/;
+
+ add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x4_32_add sse2/;
+
+ add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x16_128_add sse2/;
+
+ add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x8_128_add sse2/;
+
+ add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x32_512_add sse2/;
+
+ add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht32x16_512_add sse2/;
+
+ add_proto qw/void av1_iht4x16_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+
+ add_proto qw/void av1_iht16x4_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+
+ add_proto qw/void av1_iht8x32_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+
+ add_proto qw/void av1_iht32x8_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+
+ add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;
+
+ add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/;
+
+ add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+
+ if (aom_config("CONFIG_EXT_TX") ne "yes") {
+ specialize qw/av1_iht4x4_16_add msa/;
+ specialize qw/av1_iht8x8_64_add msa/;
+ specialize qw/av1_iht16x16_256_add msa/;
+ }
+ }
+}
+
+add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+
+if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_iht64x64_4096_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+}
+
+if (aom_config("CONFIG_NEW_QUANT") eq "yes") {
+ add_proto qw/void quantize_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+
+ add_proto qw/void quantize_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+
+ add_proto qw/void quantize_32x32_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+
+ add_proto qw/void quantize_32x32_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+
+ if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void quantize_64x64_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+
+ add_proto qw/void quantize_64x64_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+ }
+}
+
+# FILTER_INTRA predictor functions
+if (aom_config("CONFIG_FILTER_INTRA") eq "yes") {
+ add_proto qw/void av1_dc_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ add_proto qw/void av1_v_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ add_proto qw/void av1_h_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ add_proto qw/void av1_d45_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ add_proto qw/void av1_d135_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ add_proto qw/void av1_d117_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ add_proto qw/void av1_d153_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ add_proto qw/void av1_d207_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ add_proto qw/void av1_d63_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ add_proto qw/void av1_tm_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ # High bitdepth functions
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_dc_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void av1_highbd_v_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void av1_highbd_h_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void av1_highbd_d45_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void av1_highbd_d135_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void av1_highbd_d117_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void av1_highbd_d153_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void av1_highbd_d207_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void av1_highbd_d63_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void av1_highbd_tm_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ }
+}
+
+# High bitdepth functions
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ #
+ # Sub Pixel Filters
+ #
+ add_proto qw/void av1_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+
+ add_proto qw/void av1_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+
+ add_proto qw/void av1_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/av1_highbd_convolve8/, "$sse2_x86_64";
+
+ add_proto qw/void av1_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/av1_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+ add_proto qw/void av1_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
+
+ add_proto qw/void av1_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/av1_highbd_convolve8_avg/, "$sse2_x86_64";
+
+ add_proto qw/void av1_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/av1_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
+
+ add_proto qw/void av1_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/av1_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+
+ #
+ # dct
+ #
+ add_proto qw/void av1_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht4x16_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht16x4_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht8x32_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht32x8_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+
+ add_proto qw/void av1_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+}
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ #inv txfm
+ add_proto qw/void av1_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
+ add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
+ add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/;
+ add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_inv_txfm2d_add_32x32 avx2/;
+ add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+}
+
+#
+# Encoder functions below this point.
+#
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+
+# ENCODEMB INVOKE
+
+if (aom_config("CONFIG_AOM_QM") eq "yes") {
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ # the transform coefficients are held in 32-bit
+ # values, so the assembler code for av1_block_error can no longer be used.
+ add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+
+ add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+
+ add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+
+ if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+ }
+
+ add_proto qw/void av1_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+ } else {
+ add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/av1_block_error avx2 msa/, "$sse2_x86inc";
+
+ add_proto qw/int64_t av1_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+ specialize qw/av1_block_error_fp neon/, "$sse2_x86inc";
+
+ add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+
+ add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+
+ if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+ }
+
+ add_proto qw/void av1_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+ }
+} else {
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ # the transform coefficients are held in 32-bit
+ # values, so the assembler code for av1_block_error can no longer be used.
+ add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+
+ add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+
+ add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+
+ if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ }
+
+ add_proto qw/void av1_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ } else {
+ add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/av1_block_error sse2 avx2 msa/;
+
+ add_proto qw/int64_t av1_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+ specialize qw/av1_block_error_fp neon sse2/;
+
+ add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_fp neon sse2/, "$ssse3_x86_64";
+
+ add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_fp_32x32/, "$ssse3_x86_64";
+
+ if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ }
+
+ add_proto qw/void av1_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_fdct8x8_quant sse2 ssse3 neon/;
+ }
+
+}
+
+# fdct functions
+
+add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht4x4 sse2/;
+
+add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+
+add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht8x8 sse2/;
+
+add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht16x16 sse2 avx2/;
+
+add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht32x32 sse2 avx2/;
+
+if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+}
+
+add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht4x8 sse2/;
+
+add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht8x4 sse2/;
+
+add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht8x16 sse2/;
+
+add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht16x8 sse2/;
+
+add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht16x32 sse2/;
+
+add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht32x16 sse2/;
+
+add_proto qw/void av1_fht4x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+add_proto qw/void av1_fht16x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+add_proto qw/void av1_fht8x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+add_proto qw/void av1_fht32x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+if (aom_config("CONFIG_HIGHBITDEPTH") ne "yes") {
+ if (aom_config("CONFIG_EXT_TX") ne "yes") {
+ specialize qw/av1_fht4x4 msa/;
+ specialize qw/av1_fht8x8 msa/;
+ specialize qw/av1_fht16x16 msa/;
+ }
+}
+
+add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+  # fwd txfm
+ add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
+}
+
+#
+# Motion search
+#
+add_proto qw/int av1_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv";
+specialize qw/av1_full_search_sad sse3 sse4_1/;
+$av1_full_search_sad_sse3=av1_full_search_sadx3;
+$av1_full_search_sad_sse4_1=av1_full_search_sadx8;
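+# The two assignments above re-point the sse3/sse4_1 specializations at
+# implementations whose names do not follow the default <func>_<isa>
+# pattern; the x3/x8 suffixes apparently denote SAD helpers that evaluate
+# 3 or 8 candidates per call.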
+
+add_proto qw/int av1_diamond_search_sad/, "struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv";
+
+add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv";
+
+add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+specialize qw/av1_temporal_filter_apply sse2 msa/;
+
+if (aom_config("CONFIG_AOM_QM") eq "yes") {
+ add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+} else {
+ add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+}
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+
+ # ENCODEMB INVOKE
+ if (aom_config("CONFIG_NEW_QUANT") eq "yes") {
+ add_proto qw/void highbd_quantize_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+
+ add_proto qw/void highbd_quantize_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+
+ add_proto qw/void highbd_quantize_32x32_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+
+ add_proto qw/void highbd_quantize_32x32_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+
+ if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void highbd_quantize_64x64_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+
+ add_proto qw/void highbd_quantize_64x64_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+ }
+ }
+
+ add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+ specialize qw/av1_highbd_block_error sse2/;
+
+ if (aom_config("CONFIG_AOM_QM") eq "yes") {
+ add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+
+ add_proto qw/void av1_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+
+ if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_highbd_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+ }
+
+ add_proto qw/void av1_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+ } else {
+ add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+ specialize qw/av1_highbd_quantize_fp sse4_1/;
+
+ add_proto qw/void av1_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+ }
+
+ # fdct functions
+ add_proto qw/void av1_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_highbd_fht4x4 sse4_1/;
+
+ add_proto qw/void av1_highbd_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht4x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht16x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht8x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht32x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ add_proto qw/void av1_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+
+ if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_highbd_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ }
+
+ add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+
+ add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+
+}
+# End high bitdepth encoder functions
+
+if (aom_config("CONFIG_EXT_INTER") eq "yes") {
+ add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
+ specialize qw/av1_wedge_sse_from_residuals sse2/;
+ add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
+ specialize qw/av1_wedge_sign_from_residuals sse2/;
+ add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
+ specialize qw/av1_wedge_compute_delta_squares sse2/;
+}
+
+}
+# end encoder functions
+
+# If PVQ is enabled, fwd transforms are required by the decoder
+if (aom_config("CONFIG_PVQ") eq "yes") {
+# fdct functions
+
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_fht4x4 sse2/;
+
+ add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_fht8x8 sse2/;
+
+ add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_fht16x16 sse2/;
+
+ add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/av1_fwht4x4 sse2/;
+} else {
+ add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_fht4x4 sse2 msa/;
+
+ add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_fht8x8 sse2 msa/;
+
+ add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_fht16x16 sse2 msa/;
+
+ add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/av1_fwht4x4 msa sse2/;
+}
+
+}
+
+# Deringing Functions
+
+if (aom_config("CONFIG_CDEF") eq "yes") {
+ add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+ add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+ add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+ add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+ add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
+ add_proto qw/void od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
+ add_proto qw/void od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
+
+ add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
+ add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
+ add_proto qw/void copy_8x8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
+ add_proto qw/void copy_4x4_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
+ add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
+ add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
+
+  # VS compiling for 32 bit targets does not support vector types in
+ # structs as arguments, which makes the v256 type of the intrinsics
+ # hard to support, so optimizations for this target are disabled.
+ if ($opts{config} !~ /libs-x86-win32-vs.*/) {
+ specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
+ specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
+ specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
+ specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
+ specialize qw/od_dir_find8 sse2 ssse3 sse4_1 neon/;
+ specialize qw/od_filter_dering_direction_4x4 sse2 ssse3 sse4_1 neon/;
+ specialize qw/od_filter_dering_direction_8x8 sse2 ssse3 sse4_1 neon/;
+
+ specialize qw/copy_8x8_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
+ specialize qw/copy_4x4_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
+ specialize qw/copy_8x8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+ specialize qw/copy_4x4_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+ specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 neon/;
+ specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+ }
+}
+
+# PVQ Functions
+
+if (aom_config("CONFIG_PVQ") eq "yes") {
+ add_proto qw/double pvq_search_rdo_double/, "const od_val16 *xcoeff, int n, int k, int *ypulse, double g2, double pvq_norm_lambda, int prev_k";
+ specialize qw/pvq_search_rdo_double sse4_1/;
+}
+
+# WARPED_MOTION / GLOBAL_MOTION functions
+
+if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") ||
+ (aom_config("CONFIG_GLOBAL_MOTION") eq "yes")) {
+ add_proto qw/void av1_warp_affine/, "int32_t *mat, uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int ref_frm, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+ specialize qw/av1_warp_affine sse2/;
+
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_warp_affine/, "int32_t *mat, uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, int ref_frm, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+ specialize qw/av1_highbd_warp_affine ssse3/;
+ }
+}
+
+# LOOP_RESTORATION functions
+
+if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
+ add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
+ specialize qw/apply_selfguided_restoration sse4_1/;
+
+ add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps, int32_t *tmpbuf";
+ specialize qw/av1_selfguided_restoration sse4_1/;
+
+ add_proto qw/void av1_highpass_filter/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
+ specialize qw/av1_highpass_filter sse4_1/;
+
+ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
+ specialize qw/apply_selfguided_restoration_highbd sse4_1/;
+
+ add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
+ specialize qw/av1_selfguided_restoration_highbd sse4_1/;
+
+ add_proto qw/void av1_highpass_filter_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
+ specialize qw/av1_highpass_filter_highbd sse4_1/;
+ }
+}
+
+1;
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
new file mode 100644
index 000000000..6987317ae
--- /dev/null
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_TXFM_H_
+#define AV1_TXFM_H_
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "av1/common/enums.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static const int cos_bit_min = 10;
+static const int cos_bit_max = 16;
+
+// cospi_arr[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i)));
+static const int32_t cospi_arr[7][64] = {
+ { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980,
+ 972, 964, 955, 946, 936, 926, 915, 903, 891, 878, 865, 851, 837,
+ 822, 807, 792, 775, 759, 742, 724, 706, 688, 669, 650, 630, 610,
+ 590, 569, 548, 526, 505, 483, 460, 438, 415, 392, 369, 345, 321,
+ 297, 273, 249, 224, 200, 175, 150, 125, 100, 75, 50, 25 },
+ { 2048, 2047, 2046, 2042, 2038, 2033, 2026, 2018, 2009, 1998, 1987,
+ 1974, 1960, 1945, 1928, 1911, 1892, 1872, 1851, 1829, 1806, 1782,
+ 1757, 1730, 1703, 1674, 1645, 1615, 1583, 1551, 1517, 1483, 1448,
+ 1412, 1375, 1338, 1299, 1260, 1220, 1179, 1138, 1096, 1053, 1009,
+ 965, 921, 876, 830, 784, 737, 690, 642, 595, 546, 498,
+ 449, 400, 350, 301, 251, 201, 151, 100, 50 },
+ { 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
+ 3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
+ 3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
+ 2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
+ 1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
+ 897, 799, 700, 601, 501, 401, 301, 201, 101 },
+ { 8192, 8190, 8182, 8170, 8153, 8130, 8103, 8071, 8035, 7993, 7946,
+ 7895, 7839, 7779, 7713, 7643, 7568, 7489, 7405, 7317, 7225, 7128,
+ 7027, 6921, 6811, 6698, 6580, 6458, 6333, 6203, 6070, 5933, 5793,
+ 5649, 5501, 5351, 5197, 5040, 4880, 4717, 4551, 4383, 4212, 4038,
+ 3862, 3683, 3503, 3320, 3135, 2948, 2760, 2570, 2378, 2185, 1990,
+ 1795, 1598, 1401, 1202, 1003, 803, 603, 402, 201 },
+ { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143, 16069, 15986, 15893,
+ 15791, 15679, 15557, 15426, 15286, 15137, 14978, 14811, 14635, 14449, 14256,
+ 14053, 13842, 13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866, 11585,
+ 11297, 11003, 10702, 10394, 10080, 9760, 9434, 9102, 8765, 8423, 8076,
+ 7723, 7366, 7005, 6639, 6270, 5897, 5520, 5139, 4756, 4370, 3981,
+ 3590, 3196, 2801, 2404, 2006, 1606, 1205, 804, 402 },
+ { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286, 32138, 31972, 31786,
+ 31581, 31357, 31114, 30853, 30572, 30274, 29957, 29622, 29269, 28899, 28511,
+ 28106, 27684, 27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732, 23170,
+ 22595, 22006, 21403, 20788, 20160, 19520, 18868, 18205, 17531, 16846, 16151,
+ 15447, 14733, 14010, 13279, 12540, 11793, 11039, 10279, 9512, 8740, 7962,
+ 7180, 6393, 5602, 4808, 4011, 3212, 2411, 1608, 804 },
+ { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571, 64277, 63944, 63572,
+ 63162, 62714, 62228, 61705, 61145, 60547, 59914, 59244, 58538, 57798, 57022,
+ 56212, 55368, 54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464, 46341,
+ 45190, 44011, 42806, 41576, 40320, 39040, 37736, 36410, 35062, 33692, 32303,
+ 30893, 29466, 28020, 26558, 25080, 23586, 22078, 20557, 19024, 17479, 15924,
+ 14359, 12785, 11204, 9616, 8022, 6424, 4821, 3216, 1608 }
+};
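+// Spot check of the generator formula above: cospi_arr[0][32] =
+// (int)round(cos(M_PI * 32 / 128) * (1 << 10)) = (int)round(724.077) = 724,
+// and cospi_arr[6][0] = 1 << 16 = 65536, both matching the table entries.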
+
+static INLINE int32_t round_shift(int32_t value, int bit) {
+ assert(bit >= 1);
+ return (value + (1 << (bit - 1))) >> bit;
+}
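+// round_shift() rounds to the nearest integer, with ties rounding toward
+// +infinity: e.g. round_shift(10, 2) == 3 and round_shift(-10, 2) == -2
+// (assuming the usual arithmetic right shift of negative values).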
+
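+// Rounds an array in place: bit > 0 applies round_shift() to each element,
+// while a negative bit instead scales each element up by (1 << -bit).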
+static INLINE void round_shift_array(int32_t *arr, int size, int bit) {
+  int i;
+  if (bit == 0) return;
+  if (bit > 0) {
+    for (i = 0; i < size; i++) {
+      arr[i] = round_shift(arr[i], bit);
+    }
+  } else {
+    for (i = 0; i < size; i++) {
+      arr[i] = arr[i] * (1 << (-bit));
+    }
+  }
+}
+
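+// Half butterfly: computes round_shift(w0 * in0 + w1 * in1, bit), the basic
+// step of the 1-D transform stages. The overflow check is compiled in only
+// when CONFIG_COEFFICIENT_RANGE_CHECKING is enabled.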
+static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
+ int bit) {
+ int32_t result_32 = w0 * in0 + w1 * in1;
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ int64_t result_64 = (int64_t)w0 * (int64_t)in0 + (int64_t)w1 * (int64_t)in1;
+ if (result_64 < INT32_MIN || result_64 > INT32_MAX) {
+ printf("%s %d overflow result_32: %d result_64: %" PRId64
+ " w0: %d in0: %d w1: %d in1: "
+ "%d\n",
+ __FILE__, __LINE__, result_32, result_64, w0, in0, w1, in1);
+ assert(0 && "half_btf overflow");
+ }
+#endif
+ return round_shift(result_32, bit);
+}
+
+static INLINE int get_max_bit(int x) {
+ int max_bit = -1;
+ while (x) {
+ x = x >> 1;
+ max_bit++;
+ }
+ return max_bit;
+}
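+// get_max_bit() returns the index of the highest set bit (floor(log2(x))
+// for x > 0) and -1 for x == 0; e.g. get_max_bit(16) == 4 and
+// get_max_bit(7) == 2.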
+
+// TODO(angiebird): implement SSE
+static INLINE void clamp_block(int16_t *block, int block_size, int stride,
+ int low, int high) {
+ int i, j;
+ for (i = 0; i < block_size; ++i) {
+ for (j = 0; j < block_size; ++j) {
+ block[i * stride + j] = clamp(block[i * stride + j], low, high);
+ }
+ }
+}
+
+typedef void (*TxfmFunc)(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+typedef enum TXFM_TYPE {
+ TXFM_TYPE_DCT4,
+ TXFM_TYPE_DCT8,
+ TXFM_TYPE_DCT16,
+ TXFM_TYPE_DCT32,
+ TXFM_TYPE_DCT64,
+ TXFM_TYPE_ADST4,
+ TXFM_TYPE_ADST8,
+ TXFM_TYPE_ADST16,
+ TXFM_TYPE_ADST32,
+} TXFM_TYPE;
+
+typedef struct TXFM_2D_CFG {
+ const int txfm_size;
+ const int stage_num_col;
+ const int stage_num_row;
+
+ const int8_t *shift;
+ const int8_t *stage_range_col;
+ const int8_t *stage_range_row;
+ const int8_t *cos_bit_col;
+ const int8_t *cos_bit_row;
+ const TXFM_TYPE txfm_type_col;
+ const TXFM_TYPE txfm_type_row;
+} TXFM_2D_CFG;
+
+typedef struct TXFM_2D_FLIP_CFG {
+ int ud_flip; // flip upside down
+ int lr_flip; // flip left to right
+ const TXFM_2D_CFG *cfg;
+} TXFM_2D_FLIP_CFG;
+
+static INLINE void set_flip_cfg(int tx_type, TXFM_2D_FLIP_CFG *cfg) {
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ cfg->ud_flip = 0;
+ cfg->lr_flip = 0;
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg->ud_flip = 1;
+ cfg->lr_flip = 0;
+ break;
+ case DCT_FLIPADST:
+ cfg->ud_flip = 0;
+ cfg->lr_flip = 1;
+ break;
+ case FLIPADST_FLIPADST:
+ cfg->ud_flip = 1;
+ cfg->lr_flip = 1;
+ break;
+ case ADST_FLIPADST:
+ cfg->ud_flip = 0;
+ cfg->lr_flip = 1;
+ break;
+ case FLIPADST_ADST:
+ cfg->ud_flip = 1;
+ cfg->lr_flip = 0;
+ break;
+#endif // CONFIG_EXT_TX
+ default:
+ cfg->ud_flip = 0;
+ cfg->lr_flip = 0;
+ assert(0);
+ }
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+TXFM_2D_FLIP_CFG av1_get_fwd_txfm_cfg(int tx_type, int tx_size);
+TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x64_cfg(int tx_type);
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // AV1_TXFM_H_
diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c
new file mode 100644
index 000000000..4eb6f01ea
--- /dev/null
+++ b/third_party/aom/av1/common/blockd.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+
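+// For the two helpers below, b indexes the four 4x4 sub-blocks of an 8x8
+// block in raster order, so b == 0 or 2 selects the left column and
+// b == 0 or 1 the top row.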
+PREDICTION_MODE av1_left_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *left_mi, int b) {
+ if (b == 0 || b == 2) {
+ if (!left_mi || is_inter_block(&left_mi->mbmi)) return DC_PRED;
+
+ return get_y_mode(left_mi, b + 1);
+ } else {
+ assert(b == 1 || b == 3);
+ return cur_mi->bmi[b - 1].as_mode;
+ }
+}
+
+PREDICTION_MODE av1_above_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *above_mi, int b) {
+ if (b == 0 || b == 1) {
+ if (!above_mi || is_inter_block(&above_mi->mbmi)) return DC_PRED;
+
+ return get_y_mode(above_mi, b + 2);
+ } else {
+ assert(b == 2 || b == 3);
+ return cur_mi->bmi[b - 2].as_mode;
+ }
+}
+
+#if CONFIG_COEF_INTERLEAVE
+void av1_foreach_transformed_block_interleave(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
+ foreach_transformed_block_visitor visit, void *arg) {
+ const struct macroblockd_plane *const pd_y = &xd->plane[0];
+ const struct macroblockd_plane *const pd_c = &xd->plane[1];
+ const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+
+ const TX_SIZE tx_log2_y = mbmi->tx_size;
+ const TX_SIZE tx_log2_c = get_uv_tx_size(mbmi, pd_c);
+ const int tx_sz_y = (1 << tx_log2_y);
+ const int tx_sz_c = (1 << tx_log2_c);
+
+ const BLOCK_SIZE plane_bsize_y = get_plane_block_size(bsize, pd_y);
+ const BLOCK_SIZE plane_bsize_c = get_plane_block_size(bsize, pd_c);
+
+ const int num_4x4_w_y = num_4x4_blocks_wide_lookup[plane_bsize_y];
+ const int num_4x4_w_c = num_4x4_blocks_wide_lookup[plane_bsize_c];
+ const int num_4x4_h_y = num_4x4_blocks_high_lookup[plane_bsize_y];
+ const int num_4x4_h_c = num_4x4_blocks_high_lookup[plane_bsize_c];
+
+ const int step_y = 1 << (tx_log2_y << 1);
+ const int step_c = 1 << (tx_log2_c << 1);
+
+ const int max_4x4_w_y =
+ get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge, pd_y->subsampling_x);
+ const int max_4x4_h_y =
+ get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge, pd_y->subsampling_y);
+
+ const int extra_step_y = ((num_4x4_w_y - max_4x4_w_y) >> tx_log2_y) * step_y;
+
+ const int max_4x4_w_c =
+ get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge, pd_c->subsampling_x);
+ const int max_4x4_h_c =
+ get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge, pd_c->subsampling_y);
+
+ const int extra_step_c = ((num_4x4_w_c - max_4x4_w_c) >> tx_log2_c) * step_c;
+
+  // The max_4x4_w/h may be smaller than tx_sz in some corner cases,
+  // e.g. when the SB is split by tile boundaries.
+ const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y;
+ const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y;
+ const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c;
+ const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c;
+ const int tu_num_c = tu_num_w_c * tu_num_h_c;
+
+ int tu_idx_c = 0;
+ int offset_y, row_y, col_y;
+ int offset_c, row_c, col_c;
+
+ for (row_y = 0; row_y < tu_num_h_y; row_y++) {
+ for (col_y = 0; col_y < tu_num_w_y; col_y++) {
+ // luma
+ offset_y = (row_y * tu_num_w_y + col_y) * step_y + row_y * extra_step_y;
+ visit(0, offset_y, row_y * tx_sz_y, col_y * tx_sz_y, plane_bsize_y,
+ tx_log2_y, arg);
+ // chroma
+ if (tu_idx_c < tu_num_c) {
+ row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+ col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+ offset_c = tu_idx_c * step_c + (tu_idx_c / tu_num_w_c) * extra_step_c;
+ visit(1, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
+ visit(2, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
+ tu_idx_c++;
+ }
+ }
+ }
+
+  // In the 4:2:2 case, chroma may have more TUs than luma.
+ while (tu_idx_c < tu_num_c) {
+ row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+ col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+    offset_c = tu_idx_c * step_c + (tu_idx_c / tu_num_w_c) * extra_step_c;
+ visit(1, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
+ visit(2, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
+ tu_idx_c++;
+ }
+}
+#endif
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+ // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+ // transform size varies per plane, look it up in a common way.
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+ const BLOCK_SIZE plane_bsize =
+ AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#else
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#endif
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
+ int i = 0, r, c;
+
+  // If mb_to_right_edge is < 0, the current block extends into the UMV
+  // border, and we won't visit the sub blocks that are wholly within the
+  // UMV.
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += txh_unit) {
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (c = 0; c < max_blocks_wide; c += txw_unit) {
+ visit(plane, i, r, c, plane_bsize, tx_size, arg);
+ i += step;
+ }
+ }
+}
+
+#if CONFIG_LV_MAP
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ foreach_transformed_block_visitor visit,
+ void *arg) {
+ int plane;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y))
+ continue;
+#else
+ (void)mi_row;
+ (void)mi_col;
+#endif
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
+ }
+}
+#endif
+
+#if CONFIG_DAALA_DIST
+void av1_foreach_8x8_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+ foreach_transformed_block_visitor visit,
+ foreach_transformed_block_visitor mi_visit, void *arg) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+ // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+ // transform size varies per plane, look it up in a common way.
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
+ int i = 0, r, c;
+
+  // If mb_to_right_edge is < 0, the current block extends into the UMV
+  // border, and we won't visit the sub blocks that are wholly within the
+  // UMV.
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += txh_unit) {
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (c = 0; c < max_blocks_wide; c += txw_unit) {
+ visit(plane, i, r, c, plane_bsize, tx_size, arg);
+      // Call mi_visit whenever an 8x8 block is completed
+ if ((r & 1) && (c & 1))
+ mi_visit(plane, i, r - 1, c - 1, plane_bsize, TX_8X8, arg);
+ i += step;
+ }
+ }
+}
+#endif
+
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+ int plane, TX_SIZE tx_size, int has_eob, int aoff,
+ int loff) {
+ ENTROPY_CONTEXT *const a = pd->above_context + aoff;
+ ENTROPY_CONTEXT *const l = pd->left_context + loff;
+ const int txs_wide = tx_size_wide_unit[tx_size];
+ const int txs_high = tx_size_high_unit[tx_size];
+#if CONFIG_CB4X4
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+#else
+ const BLOCK_SIZE bsize = AOMMAX(xd->mi[0]->mbmi.sb_type, BLOCK_8X8);
+#endif
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+
+ // above
+ if (has_eob && xd->mb_to_right_edge < 0) {
+ int i;
+ const int blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ int above_contexts = txs_wide;
+ if (above_contexts + aoff > blocks_wide)
+ above_contexts = blocks_wide - aoff;
+
+ for (i = 0; i < above_contexts; ++i) a[i] = has_eob;
+ for (i = above_contexts; i < txs_wide; ++i) a[i] = 0;
+ } else {
+ memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * txs_wide);
+ }
+
+ // left
+ if (has_eob && xd->mb_to_bottom_edge < 0) {
+ int i;
+ const int blocks_high = max_block_high(xd, plane_bsize, plane);
+ int left_contexts = txs_high;
+ if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff;
+
+ for (i = 0; i < left_contexts; ++i) l[i] = has_eob;
+ for (i = left_contexts; i < txs_high; ++i) l[i] = 0;
+ } else {
+ memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * txs_high);
+ }
+}
+#endif
+
+void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) {
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].plane_type = get_plane_type(i);
+ xd->plane[i].subsampling_x = i ? ss_x : 0;
+ xd->plane[i].subsampling_y = i ? ss_y : 0;
+ }
+}
+
+#if CONFIG_EXT_INTRA
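+// Each entry appears to be approximately 256 / tan(angle in degrees),
+// suggesting the horizontal step, in 1/256-pel units, taken per row when
+// projecting along the prediction angle. Entry 45 is exactly 256; entry 0
+// is a placeholder, as angle 0 is never used.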
+const int16_t dr_intra_derivative[90] = {
+ 1, 14666, 7330, 4884, 3660, 2926, 2435, 2084, 1821, 1616, 1451, 1317, 1204,
+ 1108, 1026, 955, 892, 837, 787, 743, 703, 666, 633, 603, 574, 548,
+ 524, 502, 481, 461, 443, 426, 409, 394, 379, 365, 352, 339, 327,
+ 316, 305, 294, 284, 274, 265, 256, 247, 238, 230, 222, 214, 207,
+ 200, 192, 185, 179, 172, 166, 159, 153, 147, 141, 136, 130, 124,
+ 119, 113, 108, 103, 98, 93, 88, 83, 78, 73, 68, 63, 59,
+ 54, 49, 45, 40, 35, 31, 26, 22, 17, 13, 8, 4,
+};
+
+#if CONFIG_INTRA_INTERP
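+// The filter is never switchable at exact multiples of 45 degrees, and is
+// always switchable for angles strictly between 90 and 180. For the
+// remaining angles the low byte of the derivative is checked; presumably a
+// zero low byte means the projection lands on whole pixels, so no
+// interpolation filter is needed.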
+int av1_is_intra_filter_switchable(int angle) {
+ assert(angle > 0 && angle < 270);
+ if (angle % 45 == 0) return 0;
+ if (angle > 90 && angle < 180) {
+ return 1;
+ } else {
+ return ((angle < 90 ? dr_intra_derivative[angle]
+ : dr_intra_derivative[270 - angle]) &
+ 0xFF) > 0;
+ }
+}
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
new file mode 100644
index 000000000..0acab965d
--- /dev/null
+++ b/third_party/aom/av1/common/blockd.h
@@ -0,0 +1,1371 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_BLOCKD_H_
+#define AV1_COMMON_BLOCKD_H_
+
+#include "./aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/common_data.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/mv.h"
+#include "av1/common/scale.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+#if CONFIG_PVQ
+#include "av1/common/pvq.h"
+#include "av1/common/pvq_state.h"
+#include "av1/decoder/decint.h"
+#endif
+#if CONFIG_CFL
+#include "av1/common/cfl.h"
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SUB8X8_COMP_REF (!(CONFIG_CB4X4 && CONFIG_CHROMA_2X2))
+
+#define MAX_MB_PLANE 3
+
+#if CONFIG_EXT_INTER
+
+#if CONFIG_COMPOUND_SEGMENT
+// Set COMPOUND_SEGMENT_TYPE to one of the two:
+// 0: Uniform
+// 1: Difference weighted
+#define COMPOUND_SEGMENT_TYPE 1
+
+#if COMPOUND_SEGMENT_TYPE == 0
+#define MAX_SEG_MASK_BITS 1
+// SEG_MASK_TYPES should not surpass 1 << MAX_SEG_MASK_BITS
+typedef enum {
+ UNIFORM_45 = 0,
+ UNIFORM_45_INV,
+ SEG_MASK_TYPES,
+} SEG_MASK_TYPE;
+
+#elif COMPOUND_SEGMENT_TYPE == 1
+#define MAX_SEG_MASK_BITS 1
+// SEG_MASK_TYPES should not surpass 1 << MAX_SEG_MASK_BITS
+typedef enum {
+ DIFFWTD_42 = 0,
+ DIFFWTD_42_INV,
+ SEG_MASK_TYPES,
+} SEG_MASK_TYPE;
+
+#endif // COMPOUND_SEGMENT_TYPE
+#endif // CONFIG_COMPOUND_SEGMENT
+#endif // CONFIG_EXT_INTER
+
+typedef enum {
+ KEY_FRAME = 0,
+ INTER_FRAME = 1,
+ FRAME_TYPES,
+} FRAME_TYPE;
+
+static INLINE int is_inter_mode(PREDICTION_MODE mode) {
+#if CONFIG_EXT_INTER
+ return mode >= NEARESTMV && mode <= NEW_NEWMV;
+#else
+ return mode >= NEARESTMV && mode <= NEWMV;
+#endif // CONFIG_EXT_INTER
+}
+
+#if CONFIG_PVQ
+typedef struct PVQ_INFO {
+ int theta[PVQ_MAX_PARTITIONS];
+ int qg[PVQ_MAX_PARTITIONS];
+ int k[PVQ_MAX_PARTITIONS];
+ od_coeff y[OD_TXSIZE_MAX * OD_TXSIZE_MAX];
+ int nb_bands;
+ int off[PVQ_MAX_PARTITIONS];
+ int size[PVQ_MAX_PARTITIONS];
+ int skip_rest;
+ int skip_dir;
+ int bs; // log of the block size minus two,
+ // i.e. equivalent to aom's TX_SIZE
+  // Block skip info, indicating whether DC/AC is coded.
+ PVQ_SKIP_TYPE ac_dc_coded; // bit0: DC coded, bit1 : AC coded (1 means coded)
+ tran_low_t dq_dc_residue;
+} PVQ_INFO;
+
+typedef struct PVQ_QUEUE {
+ PVQ_INFO *buf; // buffer for pvq info, stored in encoding order
+ int curr_pos; // curr position to write PVQ_INFO
+ int buf_len; // allocated buffer length
+ int last_pos; // last written position of PVQ_INFO in a tile
+} PVQ_QUEUE;
+#endif
+
+typedef struct {
+ uint8_t *plane[MAX_MB_PLANE];
+ int stride[MAX_MB_PLANE];
+} BUFFER_SET;
+
+#if CONFIG_EXT_INTER
+static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
+ return mode >= NEARESTMV && mode <= NEWMV;
+}
+#if CONFIG_COMPOUND_SINGLEREF
+static INLINE int is_inter_singleref_comp_mode(PREDICTION_MODE mode) {
+ return mode >= SR_NEAREST_NEARMV && mode <= SR_NEW_NEWMV;
+}
+#endif // CONFIG_COMPOUND_SINGLEREF
+static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
+ return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV;
+}
+
+static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
+ static PREDICTION_MODE lut[MB_MODE_COUNT] = {
+ MB_MODE_COUNT, // DC_PRED
+ MB_MODE_COUNT, // V_PRED
+ MB_MODE_COUNT, // H_PRED
+ MB_MODE_COUNT, // D45_PRED
+ MB_MODE_COUNT, // D135_PRED
+ MB_MODE_COUNT, // D117_PRED
+ MB_MODE_COUNT, // D153_PRED
+ MB_MODE_COUNT, // D207_PRED
+ MB_MODE_COUNT, // D63_PRED
+#if CONFIG_ALT_INTRA
+ MB_MODE_COUNT, // SMOOTH_PRED
+#endif // CONFIG_ALT_INTRA
+ MB_MODE_COUNT, // TM_PRED
+ MB_MODE_COUNT, // NEARESTMV
+ MB_MODE_COUNT, // NEARMV
+ MB_MODE_COUNT, // ZEROMV
+ MB_MODE_COUNT, // NEWMV
+#if CONFIG_COMPOUND_SINGLEREF
+ NEARESTMV, // SR_NEAREST_NEARMV
+ NEARESTMV, // SR_NEAREST_NEWMV
+ NEARMV, // SR_NEAR_NEWMV
+ ZEROMV, // SR_ZERO_NEWMV
+ NEWMV, // SR_NEW_NEWMV
+#endif // CONFIG_COMPOUND_SINGLEREF
+ NEARESTMV, // NEAREST_NEARESTMV
+ NEARESTMV, // NEAREST_NEARMV
+ NEARMV, // NEAR_NEARESTMV
+ NEARMV, // NEAR_NEARMV
+ NEARESTMV, // NEAREST_NEWMV
+ NEWMV, // NEW_NEARESTMV
+ NEARMV, // NEAR_NEWMV
+ NEWMV, // NEW_NEARMV
+ ZEROMV, // ZERO_ZEROMV
+ NEWMV, // NEW_NEWMV
+ };
+ assert(is_inter_compound_mode(mode));
+ return lut[mode];
+}
+
+static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) {
+ static PREDICTION_MODE lut[MB_MODE_COUNT] = {
+ MB_MODE_COUNT, // DC_PRED
+ MB_MODE_COUNT, // V_PRED
+ MB_MODE_COUNT, // H_PRED
+ MB_MODE_COUNT, // D45_PRED
+ MB_MODE_COUNT, // D135_PRED
+ MB_MODE_COUNT, // D117_PRED
+ MB_MODE_COUNT, // D153_PRED
+ MB_MODE_COUNT, // D207_PRED
+ MB_MODE_COUNT, // D63_PRED
+#if CONFIG_ALT_INTRA
+ MB_MODE_COUNT, // SMOOTH_PRED
+#endif // CONFIG_ALT_INTRA
+ MB_MODE_COUNT, // TM_PRED
+ MB_MODE_COUNT, // NEARESTMV
+ MB_MODE_COUNT, // NEARMV
+ MB_MODE_COUNT, // ZEROMV
+ MB_MODE_COUNT, // NEWMV
+#if CONFIG_COMPOUND_SINGLEREF
+ NEARMV, // SR_NEAREST_NEARMV
+ NEWMV, // SR_NEAREST_NEWMV
+ NEWMV, // SR_NEAR_NEWMV
+ NEWMV, // SR_ZERO_NEWMV
+ NEWMV, // SR_NEW_NEWMV
+#endif // CONFIG_COMPOUND_SINGLEREF
+ NEARESTMV, // NEAREST_NEARESTMV
+ NEARMV, // NEAREST_NEARMV
+ NEARESTMV, // NEAR_NEARESTMV
+ NEARMV, // NEAR_NEARMV
+ NEWMV, // NEAREST_NEWMV
+ NEARESTMV, // NEW_NEARESTMV
+ NEWMV, // NEAR_NEWMV
+ NEARMV, // NEW_NEARMV
+ ZEROMV, // ZERO_ZEROMV
+ NEWMV, // NEW_NEWMV
+ };
+ assert(is_inter_compound_mode(mode));
+ return lut[mode];
+}
+
+static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) {
+ return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAREST_NEARMV ||
+ mode == NEAR_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV);
+}
+
+static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
+ return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV ||
+ mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV);
+}
+
+static INLINE int use_masked_motion_search(COMPOUND_TYPE type) {
+#if CONFIG_WEDGE
+ return (type == COMPOUND_WEDGE);
+#else
+ (void)type;
+ return 0;
+#endif
+}
+
+static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
+#if CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
+ return (type == COMPOUND_WEDGE || type == COMPOUND_SEG);
+#elif !CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
+ return (type == COMPOUND_WEDGE);
+#elif CONFIG_COMPOUND_SEGMENT && !CONFIG_WEDGE
+ return (type == COMPOUND_SEG);
+#endif // CONFIG_COMPOUND_SEGMENT
+ (void)type;
+ return 0;
+}
+#else
+
+static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) {
+ return (mode == NEARMV);
+}
+
+static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
+ return (mode == NEWMV);
+}
+#endif // CONFIG_EXT_INTER
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+ modes for the Y blocks to the left and above us; for interframes, there
+ is a single probability table. */
+
+typedef struct {
+ PREDICTION_MODE as_mode;
+ int_mv as_mv[2]; // first, second inter predictor motion vectors
+#if CONFIG_REF_MV
+ int_mv pred_mv[2];
+#endif
+#if CONFIG_EXT_INTER
+ int_mv ref_mv[2];
+#endif // CONFIG_EXT_INTER
+} b_mode_info;
+
+typedef int8_t MV_REFERENCE_FRAME;
+
+#if CONFIG_PALETTE
+typedef struct {
+ // Number of base colors for Y (0) and UV (1)
+ uint8_t palette_size[2];
+// Value of base colors for Y, U, and V
+#if CONFIG_HIGHBITDEPTH
+ uint16_t palette_colors[3 * PALETTE_MAX_SIZE];
+#else
+ uint8_t palette_colors[3 * PALETTE_MAX_SIZE];
+#endif // CONFIG_HIGHBITDEPTH
+ // Only used by encoder to store the color index of the top left pixel.
+ // TODO(huisu): move this to encoder
+ uint8_t palette_first_color_idx[2];
+} PALETTE_MODE_INFO;
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+#define USE_3TAP_INTRA_FILTER 1 // 0: 4-tap; 1: 3-tap
+typedef struct {
+  // 1: a filter intra mode is used; 0: otherwise.
+ uint8_t use_filter_intra_mode[PLANE_TYPES];
+ FILTER_INTRA_MODE filter_intra_mode[PLANE_TYPES];
+} FILTER_INTRA_MODE_INFO;
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_VAR_TX
+#if CONFIG_RD_DEBUG
+#define TXB_COEFF_COST_MAP_SIZE (2 * MAX_MIB_SIZE)
+#endif
+#endif
+
+typedef struct RD_STATS {
+ int rate;
+ int64_t dist;
+  // Be careful when using rdcost: it is not guaranteed to be set all the
+  // time.
+ // TODO(angiebird): Create a set of functions to manipulate the RD_STATS. In
+ // these functions, make sure rdcost is always up-to-date according to
+ // rate/dist.
+ int64_t rdcost;
+ int64_t sse;
+  int skip;  // sse should equal dist when skip == 1
+#if CONFIG_RD_DEBUG
+ int txb_coeff_cost[MAX_MB_PLANE];
+#if CONFIG_VAR_TX
+ int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
+ [TXB_COEFF_COST_MAP_SIZE];
+#endif // CONFIG_VAR_TX
+#endif // CONFIG_RD_DEBUG
+} RD_STATS;
+
+#if CONFIG_EXT_INTER
+// This struct is used to group function args that are commonly
+// sent together in functions related to interinter compound modes
+typedef struct {
+#if CONFIG_WEDGE
+ int wedge_index;
+ int wedge_sign;
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ SEG_MASK_TYPE mask_type;
+ uint8_t *seg_mask;
+#endif // CONFIG_COMPOUND_SEGMENT
+ COMPOUND_TYPE interinter_compound_type;
+} INTERINTER_COMPOUND_DATA;
+#endif // CONFIG_EXT_INTER
+
+// This structure now relates to 8x8 block regions.
+typedef struct {
+ // Common for both INTER and INTRA blocks
+ BLOCK_SIZE sb_type;
+ PREDICTION_MODE mode;
+ TX_SIZE tx_size;
+#if CONFIG_VAR_TX
+  // TODO(jingning): This effectively assigns a separate entry for each
+  // 8x8 block, which takes much more space than needed.
+ TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
+ TX_SIZE min_tx_size;
+#endif
+ int8_t skip;
+ int8_t segment_id;
+#if CONFIG_SUPERTX
+ // Minimum of all segment IDs under the current supertx block.
+ int8_t segment_id_supertx;
+#endif // CONFIG_SUPERTX
+ int8_t seg_id_predicted; // valid only when temporal_update is enabled
+
+ // Only for INTRA blocks
+ PREDICTION_MODE uv_mode;
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO palette_mode_info;
+#endif // CONFIG_PALETTE
+#if CONFIG_INTRABC
+ uint8_t use_intrabc;
+#endif // CONFIG_INTRABC
+
+// Only for INTER blocks
+#if CONFIG_DUAL_FILTER
+ InterpFilter interp_filter[4];
+#else
+ InterpFilter interp_filter;
+#endif
+ MV_REFERENCE_FRAME ref_frame[2];
+ TX_TYPE tx_type;
+#if CONFIG_TXK_SEL
+ TX_TYPE txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+#endif
+
+#if CONFIG_FILTER_INTRA
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
+ // The actual prediction angle is the base angle + (angle_delta * step).
+ int8_t angle_delta[2];
+#if CONFIG_INTRA_INTERP
+  // TODO(huisu): this may be replaced by interp_filter
+ INTRA_FILTER intra_filter;
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_INTER
+ // interintra members
+ INTERINTRA_MODE interintra_mode;
+ // TODO(debargha): Consolidate these flags
+ int use_wedge_interintra;
+ int interintra_wedge_index;
+ int interintra_wedge_sign;
+ // interinter members
+ COMPOUND_TYPE interinter_compound_type;
+#if CONFIG_WEDGE
+ int wedge_index;
+ int wedge_sign;
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ SEG_MASK_TYPE mask_type;
+#endif // CONFIG_COMPOUND_SEGMENT
+#endif // CONFIG_EXT_INTER
+ MOTION_MODE motion_mode;
+#if CONFIG_MOTION_VAR
+ int overlappable_neighbors[2];
+#endif // CONFIG_MOTION_VAR
+ int_mv mv[2];
+ int_mv pred_mv[2];
+#if CONFIG_REF_MV
+ uint8_t ref_mv_idx;
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition;
+#endif
+#if CONFIG_NEW_QUANT
+ int dq_off_index;
+ int send_dq_bit;
+#endif // CONFIG_NEW_QUANT
+ /* deringing gain *per-superblock* */
+ int8_t cdef_strength;
+#if CONFIG_DELTA_Q
+ int current_q_index;
+#if CONFIG_EXT_DELTA_Q
+ int current_delta_lf_from_base;
+#endif
+#endif
+#if CONFIG_RD_DEBUG
+ RD_STATS rd_stats;
+ int mi_row;
+ int mi_col;
+#endif
+#if CONFIG_WARPED_MOTION
+ int num_proj_ref[2];
+ WarpedMotionParams wm_params[2];
+#endif // CONFIG_WARPED_MOTION
+
+ BOUNDARY_TYPE boundary_info;
+} MB_MODE_INFO;
+
+typedef struct MODE_INFO {
+ MB_MODE_INFO mbmi;
+ b_mode_info bmi[4];
+} MODE_INFO;
+
+#if CONFIG_INTRABC
+static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
+ return mbmi->use_intrabc;
+}
+#endif
+
+static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
+#if CONFIG_CB4X4
+ (void)block;
+ return mi->mbmi.mode;
+#else
+ return mi->mbmi.sb_type < BLOCK_8X8 ? mi->bmi[block].as_mode : mi->mbmi.mode;
+#endif
+}
+
+static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
+#if CONFIG_INTRABC
+ if (is_intrabc_block(mbmi)) return 1;
+#endif
+ return mbmi->ref_frame[0] > INTRA_FRAME;
+}
+
+static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
+ return mbmi->ref_frame[1] > INTRA_FRAME;
+}
+
+PREDICTION_MODE av1_left_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *left_mi, int b);
+
+PREDICTION_MODE av1_above_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *above_mi, int b);
+
+#if CONFIG_GLOBAL_MOTION
+static INLINE int is_global_mv_block(const MODE_INFO *mi, int block,
+ TransformationType type) {
+ PREDICTION_MODE mode = get_y_mode(mi, block);
+#if GLOBAL_SUB8X8_USED
+ const int block_size_allowed = 1;
+#else
+ const BLOCK_SIZE bsize = mi->mbmi.sb_type;
+ const int block_size_allowed = (bsize >= BLOCK_8X8);
+#endif // GLOBAL_SUB8X8_USED
+#if CONFIG_EXT_INTER
+ return (mode == ZEROMV || mode == ZERO_ZEROMV) && type > TRANSLATION &&
+ block_size_allowed;
+#else
+ return mode == ZEROMV && type > TRANSLATION && block_size_allowed;
+#endif // CONFIG_EXT_INTER
+}
+#endif // CONFIG_GLOBAL_MOTION
+
+enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
+
+struct buf_2d {
+ uint8_t *buf;
+ uint8_t *buf0;
+ int width;
+ int height;
+ int stride;
+};
+
+typedef struct macroblockd_plane {
+ tran_low_t *dqcoeff;
+ PLANE_TYPE plane_type;
+ int subsampling_x;
+ int subsampling_y;
+ struct buf_2d dst;
+ struct buf_2d pre[2];
+ ENTROPY_CONTEXT *above_context;
+ ENTROPY_CONTEXT *left_context;
+ int16_t seg_dequant[MAX_SEGMENTS][2];
+#if CONFIG_NEW_QUANT
+ dequant_val_type_nuq seg_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES]
+ [COEF_BANDS];
+#endif
+#if CONFIG_PALETTE
+ uint8_t *color_index_map;
+#endif // CONFIG_PALETTE
+
+ // number of 4x4s in current block
+ uint16_t n4_w, n4_h;
+ // log2 of n4_w, n4_h
+ uint8_t n4_wl, n4_hl;
+ // block size in pixels
+ uint8_t width, height;
+
+#if CONFIG_AOM_QM
+ const qm_val_t *seg_iqmatrix[MAX_SEGMENTS][2][TX_SIZES];
+#endif
+ // encoder
+ const int16_t *dequant;
+#if CONFIG_NEW_QUANT
+ const dequant_val_type_nuq *dequant_val_nuq[QUANT_PROFILES];
+#endif // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+ const qm_val_t *seg_qmatrix[MAX_SEGMENTS][2][TX_SIZES];
+#endif
+
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+ DECLARE_ALIGNED(16, int16_t, pred[MAX_SB_SQUARE]);
+ // PVQ: forward transformed predicted image, a reference for PVQ.
+ tran_low_t *pvq_ref_coeff;
+#endif
+} MACROBLOCKD_PLANE;
+
+#define BLOCK_OFFSET(x, i) \
+ ((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0])))
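+// BLOCK_OFFSET(x, i) advances a coefficient pointer by i minimum-size
+// transform blocks; with a 4x4 minimum transform, that is i * 16
+// coefficients.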
+
+typedef struct RefBuffer {
+  // TODO(dkovalev): idx is not really required and should be removed;
+  // currently it is used in av1_onyxd_if.c
+ int idx;
+ YV12_BUFFER_CONFIG *buf;
+ struct scale_factors sf;
+} RefBuffer;
+
+typedef int16_t EobThresholdMD[TX_SIZES_ALL][TX_TYPES];
+
+typedef struct macroblockd {
+ struct macroblockd_plane plane[MAX_MB_PLANE];
+ uint8_t bmode_blocks_wl;
+ uint8_t bmode_blocks_hl;
+
+ FRAME_COUNTS *counts;
+ TileInfo tile;
+
+ int mi_stride;
+
+ MODE_INFO **mi;
+ MODE_INFO *left_mi;
+ MODE_INFO *above_mi;
+ MB_MODE_INFO *left_mbmi;
+ MB_MODE_INFO *above_mbmi;
+
+ int up_available;
+ int left_available;
+#if CONFIG_CHROMA_SUB8X8
+ int chroma_up_available;
+ int chroma_left_available;
+#endif
+
+ const aom_prob (*partition_probs)[PARTITION_TYPES - 1];
+
+ /* Distance of MB away from frame edges */
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+
+ FRAME_CONTEXT *fc;
+
+ /* pointers to reference frames */
+ const RefBuffer *block_refs[2];
+
+ /* pointer to current frame */
+ const YV12_BUFFER_CONFIG *cur_buf;
+
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE];
+
+#if CONFIG_VAR_TX
+ TXFM_CONTEXT *above_txfm_context;
+ TXFM_CONTEXT *left_txfm_context;
+ TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
+
+ TX_SIZE max_tx_size;
+#if CONFIG_SUPERTX
+ TX_SIZE supertx_size;
+#endif
+#endif
+
+  // Block dimensions in units of mode_info.
+ uint8_t n8_w, n8_h;
+
+#if CONFIG_REF_MV
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+ uint8_t is_sec_rect;
+#endif
+
+#if CONFIG_PVQ
+ daala_dec_ctx daala_dec;
+#endif
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *tile_ctx;
+#endif
+#if CONFIG_HIGHBITDEPTH
+ /* Bit depth: 8, 10, 12 */
+ int bd;
+#endif
+
+ int qindex[MAX_SEGMENTS];
+ int lossless[MAX_SEGMENTS];
+ int corrupted;
+
+ struct aom_internal_error_info *error_info;
+#if CONFIG_GLOBAL_MOTION
+ WarpedMotionParams *global_motion;
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_DELTA_Q
+ int prev_qindex;
+ int delta_qindex;
+ int current_qindex;
+#if CONFIG_EXT_DELTA_Q
+  // The frame-level loop filter level is not available at the beginning of
+  // a tile on the encoder side (it is only known during actual filtering),
+  // so we record delta_lf (relative to the frame-level loop filter level)
+  // and code the delta between the previous superblock's delta_lf and the
+  // current one. This is equivalent to the delta between the previous
+  // superblock's actual loop filter level and the current one.
+ int prev_delta_lf_from_base;
+ int current_delta_lf_from_base;
+#endif
+#endif
+#if CONFIG_ADAPT_SCAN
+ const EobThresholdMD *eob_threshold_md;
+#endif
+
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SEGMENT
+ DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SEGMENT
+
+#if CONFIG_CFL
+ CFL_CTX *cfl;
+#endif
+} MACROBLOCKD;
+
+static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
+ PARTITION_TYPE partition) {
+ if (partition == PARTITION_INVALID)
+ return BLOCK_INVALID;
+ else
+ return subsize_lookup[partition][bsize];
+}
+
+static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = {
+ DCT_DCT, // DC
+ ADST_DCT, // V
+ DCT_ADST, // H
+ DCT_DCT, // D45
+ ADST_ADST, // D135
+ ADST_DCT, // D117
+ DCT_ADST, // D153
+ DCT_ADST, // D207
+ ADST_DCT, // D63
+#if CONFIG_ALT_INTRA
+ ADST_ADST, // SMOOTH
+#endif // CONFIG_ALT_INTRA
+ ADST_ADST, // TM
+};
+
+#if CONFIG_SUPERTX
+static INLINE int supertx_enabled(const MB_MODE_INFO *mbmi) {
+ TX_SIZE max_tx_size = txsize_sqr_map[mbmi->tx_size];
+ return tx_size_wide[max_tx_size] >
+ AOMMIN(block_size_wide[mbmi->sb_type], block_size_high[mbmi->sb_type]);
+}
+#endif // CONFIG_SUPERTX
+
+#define USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 1
+
+#if CONFIG_RECT_TX
+static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; }
+#endif // CONFIG_RECT_TX
+
+#if CONFIG_EXT_TX
+#define ALLOW_INTRA_EXT_TX 1
+
+typedef enum {
+ // DCT only
+ EXT_TX_SET_DCTONLY = 0,
+ // DCT + Identity only
+ EXT_TX_SET_DCT_IDTX = 1,
+ // Discrete Trig transforms w/o flip (4) + Identity (1)
+ EXT_TX_SET_DTT4_IDTX = 2,
+ // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2)
+ EXT_TX_SET_DTT4_IDTX_1DDCT = 3,
+ // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2)
+ EXT_TX_SET_DTT9_IDTX_1DDCT = 4,
+ // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
+ EXT_TX_SET_ALL16 = 5,
+ EXT_TX_SET_TYPES
+} TxSetType;
+
+// Number of transform types in each set type
+static const int num_ext_tx_set[EXT_TX_SET_TYPES] = { 1, 2, 5, 7, 12, 16 };
+
+// Maps intra set index to the set type
+static const int ext_tx_set_type_intra[EXT_TX_SETS_INTRA] = {
+ EXT_TX_SET_DCTONLY, EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX
+};
+
+// Maps inter set index to the set type
+static const int ext_tx_set_type_inter[EXT_TX_SETS_INTER] = {
+ EXT_TX_SET_DCTONLY, EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT,
+ EXT_TX_SET_DCT_IDTX
+};
+
+// Maps set types above to the indices used for intra
+static const int ext_tx_set_index_intra[EXT_TX_SET_TYPES] = { 0, -1, 2,
+ 1, -1, -1 };
+
+// Maps set types above to the indices used for inter
+static const int ext_tx_set_index_inter[EXT_TX_SET_TYPES] = {
+ 0, 3, -1, -1, 2, 1
+};
+
+static INLINE TxSetType get_ext_tx_set_type(TX_SIZE tx_size, BLOCK_SIZE bs,
+ int is_inter, int use_reduced_set) {
+ const TX_SIZE tx_size2 = txsize_sqr_up_map[tx_size];
+ tx_size = txsize_sqr_map[tx_size];
+#if CONFIG_CB4X4 && USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
+ (void)bs;
+ if (tx_size > TX_32X32) return EXT_TX_SET_DCTONLY;
+#else
+ if (tx_size > TX_32X32 || bs < BLOCK_8X8) return EXT_TX_SET_DCTONLY;
+#endif
+ if (use_reduced_set)
+ return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
+ if (tx_size2 == TX_32X32)
+ return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
+ if (is_inter)
+ return (tx_size == TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT
+ : EXT_TX_SET_ALL16);
+ else
+ return (tx_size == TX_16X16 ? EXT_TX_SET_DTT4_IDTX
+ : EXT_TX_SET_DTT4_IDTX_1DDCT);
+}
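+// Worked example (derived from the logic above):
+// get_ext_tx_set_type(TX_16X16, BLOCK_16X16, /*is_inter=*/1,
+// /*use_reduced_set=*/0) returns EXT_TX_SET_DTT9_IDTX_1DDCT, i.e. 12
+// candidate transform types according to num_ext_tx_set[].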
+
+static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs, int is_inter,
+ int use_reduced_set) {
+ const TxSetType set_type =
+ get_ext_tx_set_type(tx_size, bs, is_inter, use_reduced_set);
+ return is_inter ? ext_tx_set_index_inter[set_type]
+ : ext_tx_set_index_intra[set_type];
+}
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][EXT_TX_SIZES] =
+ {
+#if CONFIG_CB4X4
+ { 1, 1, 1, 1, 1 }, // unused
+ { 0, 1, 1, 0, 0 },
+ { 0, 0, 0, 1, 0 },
+#else
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+#endif // CONFIG_CB4X4
+ };
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][EXT_TX_SIZES] =
+ {
+#if CONFIG_CB4X4
+ { 1, 1, 1, 1, 1 }, // unused
+ { 0, 1, 1, 0, 0 },
+ { 0, 0, 0, 1, 0 },
+ { 0, 0, 0, 0, 1 },
+#else
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ { 0, 0, 0, 1 },
+#endif // CONFIG_CB4X4
+ };
+
+// Transform types used in each intra set
+static const int ext_tx_used_intra[EXT_TX_SETS_INTRA][TX_TYPES] = {
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+};
+
+// Numbers of transform types used in each intra set
+static const int ext_tx_cnt_intra[EXT_TX_SETS_INTRA] = { 1, 7, 5 };
+
+// Transform types used in each inter set
+static const int ext_tx_used_inter[EXT_TX_SETS_INTER][TX_TYPES] = {
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 },
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+};
+
+// Numbers of transform types used in each inter set
+static const int ext_tx_cnt_inter[EXT_TX_SETS_INTER] = { 1, 16, 12, 2 };
+
+// 1D transforms used in each inter set; this must be kept in sync with
+// ext_tx_used_inter.
+static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = {
+ { 1, 0, 0, 0 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 0, 0, 1 },
+};
+
+static INLINE int get_ext_tx_types(TX_SIZE tx_size, BLOCK_SIZE bs, int is_inter,
+ int use_reduced_set) {
+ const int set_type =
+ get_ext_tx_set_type(tx_size, bs, is_inter, use_reduced_set);
+ return num_ext_tx_set[set_type];
+}
+
+#if CONFIG_RECT_TX
+static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
+ static const char LUT[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0, // BLOCK_2X2
+ 0, // BLOCK_2X4
+ 0, // BLOCK_4X2
+#endif
+ 0, // BLOCK_4X4
+ 1, // BLOCK_4X8
+ 1, // BLOCK_8X4
+ 0, // BLOCK_8X8
+ 1, // BLOCK_8X16
+ 1, // BLOCK_16X8
+ 0, // BLOCK_16X16
+ 1, // BLOCK_16X32
+ 1, // BLOCK_32X16
+ 0, // BLOCK_32X32
+ 0, // BLOCK_32X64
+ 0, // BLOCK_64X32
+ 0, // BLOCK_64X64
+#if CONFIG_EXT_PARTITION
+ 0, // BLOCK_64X128
+ 0, // BLOCK_128X64
+ 0, // BLOCK_128X128
+#endif // CONFIG_EXT_PARTITION
+ };
+
+ return LUT[bsize];
+}
+
+static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ return is_rect_tx_allowed_bsize(mbmi->sb_type) &&
+ !xd->lossless[mbmi->segment_id];
+}
+#endif // CONFIG_RECT_TX
+#endif // CONFIG_EXT_TX
+
+static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode,
+ int is_inter) {
+ const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+#if (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
+ const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize];
+#else
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+#endif // (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
+ (void)is_inter;
+#if CONFIG_VAR_TX && CONFIG_RECT_TX
+#if CONFIG_CB4X4
+ if (bsize == BLOCK_4X4)
+ return AOMMIN(max_txsize_lookup[bsize], largest_tx_size);
+#else
+ if (bsize < BLOCK_8X8)
+ return AOMMIN(max_txsize_lookup[bsize], largest_tx_size);
+#endif
+ if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size)
+ return max_rect_tx_size;
+ else
+ return largest_tx_size;
+#elif CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (txsize_sqr_up_map[max_rect_tx_size] <= largest_tx_size) {
+ return max_rect_tx_size;
+ } else {
+ return largest_tx_size;
+ }
+#else
+ return AOMMIN(max_tx_size, largest_tx_size);
+#endif // CONFIG_VAR_TX && CONFIG_RECT_TX
+}
+
+#if CONFIG_EXT_INTRA
+#define MAX_ANGLE_DELTA 3
+#define ANGLE_STEP 3
+extern const int16_t dr_intra_derivative[90];
+static const uint8_t mode_to_angle_map[INTRA_MODES] = {
+ 0, 90, 180, 45, 135, 111, 157, 203, 67, 0,
+};
+#if CONFIG_INTRA_INTERP
+// Returns whether filter selection is needed for a given
+// intra prediction angle.
+int av1_is_intra_filter_switchable(int angle);
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+
+#define FIXED_TX_TYPE 0
+
+// Converts the block index for a given transform size to the index of the
+// block in raster order.
+static INLINE int av1_block_index_to_raster_order(TX_SIZE tx_size,
+ int block_idx) {
+ // For transform size 4x8, the possible block_idx values are 0 & 2, because
+ // block_idx values are incremented in steps of size 'tx_width_unit x
+ // tx_height_unit'. But, for this transform size, block_idx = 2 corresponds to
+ // block number 1 in raster order, inside an 8x8 MI block.
+ // For any other transform size, the two indices are equivalent.
+ return (tx_size == TX_4X8 && block_idx == 2) ? 1 : block_idx;
+}
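+// Worked example (derived from the mapping above): an 8x8 MI block coded with
+// two 4x8 transforms visits block_idx values {0, 2};
+// av1_block_index_to_raster_order(TX_4X8, 0) == 0 and
+// av1_block_index_to_raster_order(TX_4X8, 2) == 1. The inverse below maps the
+// raster indices {0, 1} back to {0, 2}.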
+
+// Inverse of above function.
+// Note: only implemented for transform sizes 4x4, 4x8 and 8x4 right now.
+static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size,
+ int raster_order) {
+ assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4);
+ // We ensure that block indices are 0 & 2 if tx size is 4x8 or 8x4.
+ return (tx_size == TX_4X4) ? raster_order : (raster_order > 0) ? 2 : 0;
+}
+
+static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
+ const MACROBLOCKD *xd, int block_idx,
+ TX_SIZE tx_size) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+ if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
+ xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+ return DCT_DCT;
+
+ return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y
+ ? get_y_mode(xd->mi[0], block_idx)
+ : mbmi->uv_mode];
+}
+
+static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd,
+ int block, TX_SIZE tx_size) {
+ const MODE_INFO *const mi = xd->mi[0];
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+#if CONFIG_INTRABC
+ // TODO(aconverse@google.com): Revisit this decision
+ if (is_intrabc_block(mbmi)) return DCT_DCT;
+#endif // CONFIG_INTRABC
+#if !CONFIG_TXK_SEL
+#if FIXED_TX_TYPE
+ const int block_raster_idx = av1_block_index_to_raster_order(tx_size, block);
+ return get_default_tx_type(plane_type, xd, block_raster_idx, tx_size);
+#elif CONFIG_EXT_TX
+#if !CONFIG_CB4X4
+ const int block_raster_idx = av1_block_index_to_raster_order(tx_size, block);
+#endif // !CONFIG_CB4X4
+ if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] > TX_32X32 ||
+ (txsize_sqr_map[tx_size] >= TX_32X32 && !is_inter_block(mbmi)))
+ return DCT_DCT;
+ if (mbmi->sb_type >= BLOCK_8X8 || CONFIG_CB4X4) {
+ if (plane_type == PLANE_TYPE_Y) {
+#if !ALLOW_INTRA_EXT_TX
+ if (is_inter_block(mbmi))
+#endif // ALLOW_INTRA_EXT_TX
+ return mbmi->tx_type;
+ }
+
+ if (is_inter_block(mbmi)) {
+// UV Inter only
+#if CONFIG_CB4X4
+ if (tx_size < TX_4X4) return DCT_DCT;
+#endif
+ return (mbmi->tx_type == IDTX && txsize_sqr_map[tx_size] >= TX_32X32)
+ ? DCT_DCT
+ : mbmi->tx_type;
+ }
+ }
+
+#if CONFIG_CB4X4
+ (void)block;
+ if (tx_size < TX_4X4)
+ return DCT_DCT;
+ else
+ return intra_mode_to_tx_type_context[mbmi->uv_mode];
+#else
+
+ // Sub8x8-Inter/Intra OR UV-Intra
+ if (is_inter_block(mbmi)) // Sub8x8-Inter
+ return DCT_DCT;
+ else // Sub8x8 Intra OR UV-Intra
+ return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y
+ ? get_y_mode(mi, block_raster_idx)
+ : mbmi->uv_mode];
+#endif // CONFIG_CB4X4
+#else // CONFIG_EXT_TX
+ (void)block;
+ if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] ||
+ txsize_sqr_map[tx_size] >= TX_32X32)
+ return DCT_DCT;
+ return mbmi->tx_type;
+#endif // CONFIG_EXT_TX
+#else // !CONFIG_TXK_SEL
+ (void)tx_size;
+ TX_TYPE tx_type;
+ if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] ||
+ mbmi->tx_size >= TX_32X32) {
+ tx_type = DCT_DCT;
+ } else {
+ tx_type = mbmi->txk_type[block];
+ }
+ assert(tx_type >= DCT_DCT && tx_type < TX_TYPES);
+ return tx_type;
+#endif // !CONFIG_TXK_SEL
+}
+
+void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
+
+static INLINE int tx_size_to_depth(TX_SIZE tx_size) {
+ return (int)(tx_size - TX_4X4);
+}
+
+static INLINE TX_SIZE depth_to_tx_size(int depth) {
+ return (TX_SIZE)(depth + TX_4X4);
+}
+
+static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
+ const struct macroblockd_plane *pd) {
+ TX_SIZE uv_txsize;
+#if CONFIG_CB4X4
+ assert(mbmi->tx_size > TX_2X2);
+#endif
+
+#if CONFIG_SUPERTX
+ if (supertx_enabled(mbmi))
+ return uvsupertx_size_lookup[txsize_sqr_map[mbmi->tx_size]]
+ [pd->subsampling_x][pd->subsampling_y];
+#endif // CONFIG_SUPERTX
+
+ uv_txsize = uv_txsize_lookup[mbmi->sb_type][mbmi->tx_size][pd->subsampling_x]
+ [pd->subsampling_y];
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+ uv_txsize = AOMMAX(uv_txsize, TX_4X4);
+#endif
+ assert(uv_txsize != TX_INVALID);
+ return uv_txsize;
+}
+
+static INLINE TX_SIZE get_tx_size(int plane, const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+ const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
+ return tx_size;
+}
+
+static INLINE BLOCK_SIZE
+get_plane_block_size(BLOCK_SIZE bsize, const struct macroblockd_plane *pd) {
+ return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
+}
+
+static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int txs_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int txs_high = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide);
+ memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high);
+ }
+}
+
+typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg);
+
+#if CONFIG_LV_MAP
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ foreach_transformed_block_visitor visit,
+ void *arg);
+#endif
+
+#if CONFIG_DAALA_DIST
+void av1_foreach_8x8_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+ foreach_transformed_block_visitor visit,
+ foreach_transformed_block_visitor mi_visit, void *arg);
+#endif
+
+#if CONFIG_COEF_INTERLEAVE
+static INLINE int get_max_4x4_size(int num_4x4, int mb_to_edge,
+ int subsampling) {
+ return num_4x4 + (mb_to_edge >= 0 ? 0 : mb_to_edge >> (5 + subsampling));
+}
+
+void av1_foreach_transformed_block_interleave(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
+ foreach_transformed_block_visitor visit, void *arg);
+#endif
+
+void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+ int plane, TX_SIZE tx_size, int has_eob, int aoff,
+ int loff);
+
+#if CONFIG_EXT_INTER
+static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
+#if CONFIG_INTERINTRA
+ // TODO(debargha): Should this be bsize < BLOCK_LARGEST?
+ return (bsize >= BLOCK_8X8) && (bsize < BLOCK_64X64);
+#else
+ (void)bsize;
+ return 0;
+#endif // CONFIG_INTERINTRA
+}
+
+static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
+#if CONFIG_INTERINTRA
+ return (mode >= NEARESTMV) && (mode <= NEWMV);
+#else
+ (void)mode;
+ return 0;
+#endif // CONFIG_INTERINTRA
+}
+
+static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
+#if CONFIG_INTERINTRA
+ return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
+#else
+ (void)rf;
+ return 0;
+#endif // CONFIG_INTERINTRA
+}
+
+static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
+ return is_interintra_allowed_bsize(mbmi->sb_type) &&
+ is_interintra_allowed_mode(mbmi->mode) &&
+ is_interintra_allowed_ref(mbmi->ref_frame);
+}
+
+static INLINE int is_interintra_allowed_bsize_group(int group) {
+ int i;
+ for (i = 0; i < BLOCK_SIZES; i++) {
+ if (size_group_lookup[i] == group &&
+ is_interintra_allowed_bsize((BLOCK_SIZE)i)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) {
+ return (mbmi->ref_frame[1] == INTRA_FRAME) && is_interintra_allowed(mbmi);
+}
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_VAR_TX
+static INLINE int get_vartx_max_txsize(const MB_MODE_INFO *const mbmi,
+ BLOCK_SIZE bsize) {
+#if CONFIG_CB4X4
+ (void)mbmi;
+ return max_txsize_rect_lookup[bsize];
+#endif // CONFIG_CB4X4
+ return mbmi->sb_type < BLOCK_8X8 ? max_txsize_rect_lookup[mbmi->sb_type]
+ : max_txsize_rect_lookup[bsize];
+}
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
+ return (bsize >= BLOCK_8X8);
+}
+
+static INLINE int is_motion_variation_allowed_compound(
+ const MB_MODE_INFO *mbmi) {
+ if (!has_second_ref(mbmi))
+ return 1;
+ else
+ return 0;
+}
+
+#if CONFIG_MOTION_VAR
+// Input: log2 of the block edge length, where index 0 corresponds to a length
+// of 4, index 1 to 8, and so on.
+static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 };
+
+static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) {
+ return !(mbmi->overlappable_neighbors[0] == 0 &&
+ mbmi->overlappable_neighbors[1] == 0);
+}
+#endif
+
+static INLINE MOTION_MODE motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ int block, const WarpedMotionParams *gm_params,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ const MODE_INFO *mi) {
+ const MB_MODE_INFO *mbmi = &mi->mbmi;
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype;
+ if (is_global_mv_block(mi, block, gm_type)) return SIMPLE_TRANSLATION;
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+#if CONFIG_EXT_INTER
+ if (is_motion_variation_allowed_bsize(mbmi->sb_type) &&
+ is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME &&
+ is_motion_variation_allowed_compound(mbmi)) {
+#else
+ if (is_motion_variation_allowed_bsize(mbmi->sb_type) &&
+ is_inter_mode(mbmi->mode) && is_motion_variation_allowed_compound(mbmi)) {
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR
+ if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
+#endif
+#if CONFIG_WARPED_MOTION
+ if (!has_second_ref(mbmi) && mbmi->num_proj_ref[0] >= 1)
+ return WARPED_CAUSAL;
+ else
+#endif // CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR
+ return OBMC_CAUSAL;
+#else
+ return SIMPLE_TRANSLATION;
+#endif // CONFIG_MOTION_VAR
+ } else {
+ return SIMPLE_TRANSLATION;
+ }
+}
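+// Summary of the precedence above (informational): when the corresponding
+// experiments are enabled, WARPED_CAUSAL (single reference with at least one
+// projection sample) takes priority over OBMC_CAUSAL, which takes priority
+// over SIMPLE_TRANSLATION; global-motion blocks and blocks with no
+// overlappable neighbors always fall back to SIMPLE_TRANSLATION.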
+
+static INLINE void assert_motion_mode_valid(MOTION_MODE mode,
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ int block,
+ const WarpedMotionParams *gm_params,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ const MODE_INFO *mi) {
+ const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ block, gm_params,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ mi);
+ // Check that the input mode is not illegal
+ if (last_motion_mode_allowed < mode)
+ assert(0 && "Illegal motion mode selected");
+}
+
+#if CONFIG_MOTION_VAR
+static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
+ return (is_inter_block(mbmi));
+}
+#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+// Returns sub-sampled dimensions of the given block.
+// The output values for 'rows_within_bounds' and 'cols_within_bounds' will
+// differ from 'height' and 'width' when part of the block is outside the right
+// and/or bottom image boundary.
+static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane,
+ const MACROBLOCKD *xd, int *width,
+ int *height,
+ int *rows_within_bounds,
+ int *cols_within_bounds) {
+ const int block_height = block_size_high[bsize];
+ const int block_width = block_size_wide[bsize];
+ const int block_rows = (xd->mb_to_bottom_edge >= 0)
+ ? block_height
+ : (xd->mb_to_bottom_edge >> 3) + block_height;
+ const int block_cols = (xd->mb_to_right_edge >= 0)
+ ? block_width
+ : (xd->mb_to_right_edge >> 3) + block_width;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0));
+ assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0));
+ assert(block_width >= block_cols);
+ assert(block_height >= block_rows);
+ if (width) *width = block_width >> pd->subsampling_x;
+ if (height) *height = block_height >> pd->subsampling_y;
+ if (rows_within_bounds) *rows_within_bounds = block_rows >> pd->subsampling_y;
+ if (cols_within_bounds) *cols_within_bounds = block_cols >> pd->subsampling_x;
+}
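+// Worked example (derived from the code above): for a 64x64 block whose last
+// 16 pixel columns lie beyond the right frame edge, mb_to_right_edge (kept in
+// 1/8-pel units) is -16 * 8 = -128, so block_cols = (-128 >> 3) + 64 = 48; a
+// 4:2:0 chroma plane then reports *width == 32 and *cols_within_bounds == 24.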
+
+#if CONFIG_GLOBAL_MOTION
+static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd) {
+ const MODE_INFO *mi = xd->mi[0];
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ int ref;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+
+ // First check if all modes are ZEROMV
+ if (mbmi->sb_type >= BLOCK_8X8 || unify_bsize) {
+#if CONFIG_EXT_INTER
+ if (mbmi->mode != ZEROMV && mbmi->mode != ZERO_ZEROMV) return 0;
+#else
+ if (mbmi->mode != ZEROMV) return 0;
+#endif // CONFIG_EXT_INTER
+ } else {
+#if CONFIG_EXT_INTER
+    // Each sub-block mode must be either ZEROMV or ZERO_ZEROMV.
+    if ((mi->bmi[0].as_mode != ZEROMV &&
+         mi->bmi[0].as_mode != ZERO_ZEROMV) ||
+        (mi->bmi[1].as_mode != ZEROMV &&
+         mi->bmi[1].as_mode != ZERO_ZEROMV) ||
+        (mi->bmi[2].as_mode != ZEROMV &&
+         mi->bmi[2].as_mode != ZERO_ZEROMV) ||
+        (mi->bmi[3].as_mode != ZEROMV && mi->bmi[3].as_mode != ZERO_ZEROMV))
+      return 0;
+#else
+ if (mi->bmi[0].as_mode != ZEROMV || mi->bmi[1].as_mode != ZEROMV ||
+ mi->bmi[2].as_mode != ZEROMV || mi->bmi[3].as_mode != ZEROMV)
+ return 0;
+#endif // CONFIG_EXT_INTER
+ }
+
+#if !GLOBAL_SUB8X8_USED
+ if (mbmi->sb_type < BLOCK_8X8) return 0;
+#endif
+
+  // Now check that all global motion is non-translational
+ for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ if (xd->global_motion[mbmi->ref_frame[ref]].wmtype <= TRANSLATION) return 0;
+ }
+ return 1;
+}
+#endif // CONFIG_GLOBAL_MOTION
+
+static INLINE PLANE_TYPE get_plane_type(int plane) {
+ return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_BLOCKD_H_
diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c
new file mode 100644
index 000000000..53dff98b7
--- /dev/null
+++ b/third_party/aom/av1/common/cdef.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "./aom_scale_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/common/cdef.h"
+#include "av1/common/od_dering.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+
+int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) {
+ int r, c;
+ int maxc, maxr;
+ int skip = 1;
+ maxc = cm->mi_cols - mi_col;
+ maxr = cm->mi_rows - mi_row;
+#if CONFIG_EXT_PARTITION
+ if (maxr > cm->mib_size_log2) maxr = cm->mib_size_log2;
+ if (maxc > cm->mib_size_log2) maxc = cm->mib_size_log2;
+#else
+ if (maxr > MAX_MIB_SIZE) maxr = MAX_MIB_SIZE;
+ if (maxc > MAX_MIB_SIZE) maxc = MAX_MIB_SIZE;
+#endif
+
+ for (r = 0; r < maxr; r++) {
+ for (c = 0; c < maxc; c++) {
+ skip = skip &&
+ cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]
+ ->mbmi.skip;
+ }
+ }
+ return skip;
+}
+
+static int is_8x8_block_skip(MODE_INFO **grid, int mi_row, int mi_col,
+ int mi_stride) {
+ int is_skip = 1;
+ for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r)
+ for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c)
+ is_skip &= grid[(mi_row + r) * mi_stride + (mi_col + c)]->mbmi.skip;
+
+ return is_skip;
+}
+
+int sb_compute_dering_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
+ dering_list *dlist, int filter_skip) {
+ int r, c;
+ int maxc, maxr;
+ MODE_INFO **grid;
+ int count = 0;
+ grid = cm->mi_grid_visible;
+ maxc = cm->mi_cols - mi_col;
+ maxr = cm->mi_rows - mi_row;
+#if CONFIG_EXT_PARTITION
+ if (maxr > cm->mib_size_log2) maxr = cm->mib_size_log2;
+ if (maxc > cm->mib_size_log2) maxc = cm->mib_size_log2;
+#else
+ if (maxr > MAX_MIB_SIZE) maxr = MAX_MIB_SIZE;
+ if (maxc > MAX_MIB_SIZE) maxc = MAX_MIB_SIZE;
+#endif
+
+ const int r_step = mi_size_high[BLOCK_8X8];
+ const int c_step = mi_size_wide[BLOCK_8X8];
+ const int r_shift = (r_step == 2);
+ const int c_shift = (c_step == 2);
+
+ assert(r_step == 1 || r_step == 2);
+ assert(c_step == 1 || c_step == 2);
+
+ if (filter_skip) {
+ for (r = 0; r < maxr; r += r_step) {
+ for (c = 0; c < maxc; c += c_step) {
+ dlist[count].by = r >> r_shift;
+ dlist[count].bx = c >> c_shift;
+ dlist[count].skip =
+ is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride);
+ count++;
+ }
+ }
+ } else {
+ for (r = 0; r < maxr; r += r_step) {
+ for (c = 0; c < maxc; c += c_step) {
+ if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) {
+ dlist[count].by = r >> r_shift;
+ dlist[count].bx = c >> c_shift;
+ dlist[count].skip = 0;
+ count++;
+ }
+ }
+ }
+ }
+ return count;
+}
+
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
+ int sstride, int v, int h) {
+ int i, j;
+ for (i = 0; i < v; i++) {
+ for (j = 0; j < h; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
+
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
+ const uint16_t *src, int sstride, int v,
+ int h) {
+ int i, j;
+ for (i = 0; i < v; i++) {
+ for (j = 0; j < h; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
+
+void copy_sb8_16(UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
+ const uint8_t *src, int src_voffset, int src_hoffset,
+ int sstride, int vsize, int hsize) {
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ const uint16_t *base =
+ &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
+ copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
+ } else {
+#endif
+ const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
+ copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+}
+
+static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h,
+ uint16_t x) {
+ int i, j;
+ for (i = 0; i < v; i++) {
+ for (j = 0; j < h; j++) {
+ dst[i * dstride + j] = x;
+ }
+ }
+}
+
+static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src,
+ int sstride, int v, int h) {
+ int i, j;
+ for (i = 0; i < v; i++) {
+ for (j = 0; j < h; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
+
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ MACROBLOCKD *xd) {
+ int sbr, sbc;
+ int nhsb, nvsb;
+ uint16_t src[OD_DERING_INBUF_SIZE];
+ uint16_t *linebuf[3];
+ uint16_t *colbuf[3];
+ dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ unsigned char *row_dering, *prev_row_dering, *curr_row_dering;
+ int dering_count;
+ int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+ int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+ int stride;
+ int mi_wide_l2[3];
+ int mi_high_l2[3];
+ int xdec[3];
+ int ydec[3];
+ int pli;
+ int dering_left;
+ int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+ int nplanes = 3;
+ int chroma_dering =
+ xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
+ xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
+ nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
+ nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
+ av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0);
+ row_dering = aom_malloc(sizeof(*row_dering) * (nhsb + 2) * 2);
+ memset(row_dering, 1, sizeof(*row_dering) * (nhsb + 2) * 2);
+ prev_row_dering = row_dering + 1;
+ curr_row_dering = prev_row_dering + nhsb + 2;
+ for (pli = 0; pli < nplanes; pli++) {
+ xdec[pli] = xd->plane[pli].subsampling_x;
+ ydec[pli] = xd->plane[pli].subsampling_y;
+ mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+ mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+ }
+ stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * OD_FILT_HBORDER;
+ for (pli = 0; pli < nplanes; pli++) {
+    linebuf[pli] = aom_malloc(sizeof(**linebuf) * OD_FILT_VBORDER * stride);
+    colbuf[pli] =
+        aom_malloc(sizeof(**colbuf) *
+                   ((MAX_SB_SIZE << mi_high_l2[pli]) + 2 * OD_FILT_VBORDER) *
+                   OD_FILT_HBORDER);
+ }
+ for (sbr = 0; sbr < nvsb; sbr++) {
+ for (pli = 0; pli < nplanes; pli++) {
+ const int block_height =
+ (MAX_MIB_SIZE << mi_high_l2[pli]) + 2 * OD_FILT_VBORDER;
+ fill_rect(colbuf[pli], OD_FILT_HBORDER, block_height, OD_FILT_HBORDER,
+ OD_DERING_VERY_LARGE);
+ }
+ dering_left = 1;
+ for (sbc = 0; sbc < nhsb; sbc++) {
+ int level, clpf_strength;
+ int uv_level, uv_clpf_strength;
+ int nhb, nvb;
+ int cstart = 0;
+ curr_row_dering[sbc] = 0;
+ if (cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
+ MAX_MIB_SIZE * sbc] == NULL ||
+ cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
+ MAX_MIB_SIZE * sbc]
+ ->mbmi.cdef_strength == -1) {
+ dering_left = 0;
+ continue;
+ }
+ if (!dering_left) cstart = -OD_FILT_HBORDER;
+ nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
+ nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
+ int tile_top, tile_left, tile_bottom, tile_right;
+ int mi_idx = MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc;
+ BOUNDARY_TYPE boundary_tl =
+ cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
+ MAX_MIB_SIZE * sbc]
+ ->mbmi.boundary_info;
+ tile_top = boundary_tl & TILE_ABOVE_BOUNDARY;
+ tile_left = boundary_tl & TILE_LEFT_BOUNDARY;
+ /* Right and bottom information appear unreliable, so we use the top
+ and left flags for the next superblocks. */
+ if (sbr != nvsb - 1 &&
+ cm->mi_grid_visible[mi_idx + MAX_MIB_SIZE * cm->mi_stride])
+ tile_bottom = cm->mi_grid_visible[mi_idx + MAX_MIB_SIZE * cm->mi_stride]
+ ->mbmi.boundary_info &
+ TILE_ABOVE_BOUNDARY;
+ else
+ tile_bottom = 1;
+ if (sbc != nhsb - 1 && cm->mi_grid_visible[mi_idx + MAX_MIB_SIZE])
+ tile_right =
+ cm->mi_grid_visible[mi_idx + MAX_MIB_SIZE]->mbmi.boundary_info &
+ TILE_LEFT_BOUNDARY;
+ else
+ tile_right = 1;
+ const int mbmi_cdef_strength =
+ cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
+ MAX_MIB_SIZE * sbc]
+ ->mbmi.cdef_strength;
+ level = cm->cdef_strengths[mbmi_cdef_strength] / CLPF_STRENGTHS;
+ clpf_strength = cm->cdef_strengths[mbmi_cdef_strength] % CLPF_STRENGTHS;
+ clpf_strength += clpf_strength == 3;
+ uv_level = cm->cdef_uv_strengths[mbmi_cdef_strength] / CLPF_STRENGTHS;
+ uv_clpf_strength =
+ cm->cdef_uv_strengths[mbmi_cdef_strength] % CLPF_STRENGTHS;
+ uv_clpf_strength += uv_clpf_strength == 3;
+ if ((level == 0 && clpf_strength == 0 && uv_level == 0 &&
+ uv_clpf_strength == 0) ||
+ (dering_count = sb_compute_dering_list(
+ cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, dlist,
+ get_filter_skip(level) || get_filter_skip(uv_level))) == 0) {
+ dering_left = 0;
+ continue;
+ }
+
+ curr_row_dering[sbc] = 1;
+ for (pli = 0; pli < nplanes; pli++) {
+ uint16_t dst[MAX_SB_SIZE * MAX_SB_SIZE];
+ int coffset;
+ int rend, cend;
+ int clpf_damping = cm->cdef_clpf_damping;
+ int dering_damping = cm->cdef_dering_damping;
+ int hsize = nhb << mi_wide_l2[pli];
+ int vsize = nvb << mi_high_l2[pli];
+
+ if (pli) {
+ if (chroma_dering)
+ level = uv_level;
+ else
+ level = 0;
+ clpf_strength = uv_clpf_strength;
+ }
+
+ if (sbc == nhsb - 1)
+ cend = hsize;
+ else
+ cend = hsize + OD_FILT_HBORDER;
+
+ if (sbr == nvsb - 1)
+ rend = vsize;
+ else
+ rend = vsize + OD_FILT_VBORDER;
+
+ coffset = sbc * MAX_MIB_SIZE << mi_wide_l2[pli];
+ if (sbc == nhsb - 1) {
+ /* On the last superblock column, fill in the right border with
+ OD_DERING_VERY_LARGE to avoid filtering with the outside. */
+ fill_rect(&src[cend + OD_FILT_HBORDER], OD_FILT_BSTRIDE,
+ rend + OD_FILT_VBORDER, hsize + OD_FILT_HBORDER - cend,
+ OD_DERING_VERY_LARGE);
+ }
+ if (sbr == nvsb - 1) {
+ /* On the last superblock row, fill in the bottom border with
+ OD_DERING_VERY_LARGE to avoid filtering with the outside. */
+ fill_rect(&src[(rend + OD_FILT_VBORDER) * OD_FILT_BSTRIDE],
+ OD_FILT_BSTRIDE, OD_FILT_VBORDER,
+ hsize + 2 * OD_FILT_HBORDER, OD_DERING_VERY_LARGE);
+ }
+        /* Copy in the pixels we need from the current superblock for
+           deringing. */
+ copy_sb8_16(
+ cm,
+ &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER + cstart],
+ OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
+ (MAX_MIB_SIZE << mi_high_l2[pli]) * sbr, coffset + cstart,
+ xd->plane[pli].dst.stride, rend, cend - cstart);
+ if (!prev_row_dering[sbc]) {
+ copy_sb8_16(cm, &src[OD_FILT_HBORDER], OD_FILT_BSTRIDE,
+ xd->plane[pli].dst.buf,
+ (MAX_MIB_SIZE << mi_high_l2[pli]) * sbr - OD_FILT_VBORDER,
+ coffset, xd->plane[pli].dst.stride, OD_FILT_VBORDER,
+ hsize);
+ } else if (sbr > 0) {
+ copy_rect(&src[OD_FILT_HBORDER], OD_FILT_BSTRIDE,
+ &linebuf[pli][coffset], stride, OD_FILT_VBORDER, hsize);
+ } else {
+ fill_rect(&src[OD_FILT_HBORDER], OD_FILT_BSTRIDE, OD_FILT_VBORDER,
+ hsize, OD_DERING_VERY_LARGE);
+ }
+ if (!prev_row_dering[sbc - 1]) {
+ copy_sb8_16(cm, src, OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
+ (MAX_MIB_SIZE << mi_high_l2[pli]) * sbr - OD_FILT_VBORDER,
+ coffset - OD_FILT_HBORDER, xd->plane[pli].dst.stride,
+ OD_FILT_VBORDER, OD_FILT_HBORDER);
+ } else if (sbr > 0 && sbc > 0) {
+ copy_rect(src, OD_FILT_BSTRIDE,
+ &linebuf[pli][coffset - OD_FILT_HBORDER], stride,
+ OD_FILT_VBORDER, OD_FILT_HBORDER);
+ } else {
+ fill_rect(src, OD_FILT_BSTRIDE, OD_FILT_VBORDER, OD_FILT_HBORDER,
+ OD_DERING_VERY_LARGE);
+ }
+ if (!prev_row_dering[sbc + 1]) {
+ copy_sb8_16(cm, &src[OD_FILT_HBORDER + (nhb << mi_wide_l2[pli])],
+ OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
+ (MAX_MIB_SIZE << mi_high_l2[pli]) * sbr - OD_FILT_VBORDER,
+ coffset + hsize, xd->plane[pli].dst.stride,
+ OD_FILT_VBORDER, OD_FILT_HBORDER);
+ } else if (sbr > 0 && sbc < nhsb - 1) {
+ copy_rect(&src[hsize + OD_FILT_HBORDER], OD_FILT_BSTRIDE,
+ &linebuf[pli][coffset + hsize], stride, OD_FILT_VBORDER,
+ OD_FILT_HBORDER);
+ } else {
+ fill_rect(&src[hsize + OD_FILT_HBORDER], OD_FILT_BSTRIDE,
+ OD_FILT_VBORDER, OD_FILT_HBORDER, OD_DERING_VERY_LARGE);
+ }
+ if (dering_left) {
+ /* If we deringed the superblock on the left then we need to copy in
+ saved pixels. */
+ copy_rect(src, OD_FILT_BSTRIDE, colbuf[pli], OD_FILT_HBORDER,
+ rend + OD_FILT_VBORDER, OD_FILT_HBORDER);
+ }
+ /* Saving pixels in case we need to dering the superblock on the
+ right. */
+ copy_rect(colbuf[pli], OD_FILT_HBORDER, src + hsize, OD_FILT_BSTRIDE,
+ rend + OD_FILT_VBORDER, OD_FILT_HBORDER);
+ copy_sb8_16(
+ cm, &linebuf[pli][coffset], stride, xd->plane[pli].dst.buf,
+ (MAX_MIB_SIZE << mi_high_l2[pli]) * (sbr + 1) - OD_FILT_VBORDER,
+ coffset, xd->plane[pli].dst.stride, OD_FILT_VBORDER, hsize);
+
+ if (tile_top) {
+ fill_rect(src, OD_FILT_BSTRIDE, OD_FILT_VBORDER,
+ hsize + 2 * OD_FILT_HBORDER, OD_DERING_VERY_LARGE);
+ }
+ if (tile_left) {
+ fill_rect(src, OD_FILT_BSTRIDE, vsize + 2 * OD_FILT_VBORDER,
+ OD_FILT_HBORDER, OD_DERING_VERY_LARGE);
+ }
+ if (tile_bottom) {
+ fill_rect(&src[(vsize + OD_FILT_VBORDER) * OD_FILT_BSTRIDE],
+ OD_FILT_BSTRIDE, OD_FILT_VBORDER,
+ hsize + 2 * OD_FILT_HBORDER, OD_DERING_VERY_LARGE);
+ }
+ if (tile_right) {
+ fill_rect(&src[hsize + OD_FILT_HBORDER], OD_FILT_BSTRIDE,
+ vsize + 2 * OD_FILT_VBORDER, OD_FILT_HBORDER,
+ OD_DERING_VERY_LARGE);
+ }
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ od_dering(
+ (uint8_t *)&CONVERT_TO_SHORTPTR(
+ xd->plane[pli]
+ .dst.buf)[xd->plane[pli].dst.stride *
+ (MAX_MIB_SIZE * sbr << mi_high_l2[pli]) +
+ (sbc * MAX_MIB_SIZE << mi_wide_l2[pli])],
+ xd->plane[pli].dst.stride, dst,
+ &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
+ xdec[pli], ydec[pli], dir, NULL, var, pli, dlist, dering_count,
+ level, clpf_strength, clpf_damping, dering_damping, coeff_shift,
+ 0, 1);
+ } else {
+#endif
+ od_dering(&xd->plane[pli]
+ .dst.buf[xd->plane[pli].dst.stride *
+ (MAX_MIB_SIZE * sbr << mi_high_l2[pli]) +
+ (sbc * MAX_MIB_SIZE << mi_wide_l2[pli])],
+ xd->plane[pli].dst.stride, dst,
+ &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
+ xdec[pli], ydec[pli], dir, NULL, var, pli, dlist,
+ dering_count, level, clpf_strength, clpf_damping,
+ dering_damping, coeff_shift, 0, 0);
+
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+ }
+ dering_left = 1;
+ }
+ {
+ unsigned char *tmp;
+ tmp = prev_row_dering;
+ prev_row_dering = curr_row_dering;
+ curr_row_dering = tmp;
+ }
+ }
+ aom_free(row_dering);
+ for (pli = 0; pli < nplanes; pli++) {
+ aom_free(linebuf[pli]);
+ aom_free(colbuf[pli]);
+ }
+}
diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h
new file mode 100644
index 000000000..08c438de6
--- /dev/null
+++ b/third_party/aom/av1/common/cdef.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_DERING_H_
+#define AV1_COMMON_DERING_H_
+
+#define CDEF_STRENGTH_BITS 7
+
+#define DERING_STRENGTHS 32
+#define CLPF_STRENGTHS 4
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/od_dering.h"
+#include "av1/common/onyxc_int.h"
+#include "./od_dering.h"
+
+static INLINE int sign(int i) { return i < 0 ? -1 : 1; }
+
+static INLINE int constrain(int diff, int threshold, unsigned int damping) {
+ return threshold
+ ? sign(diff) *
+ AOMMIN(
+ abs(diff),
+ AOMMAX(0, threshold - (abs(diff) >>
+ (damping - get_msb(threshold)))))
+ : 0;
+}
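+// Worked example (derived from the expression above): with threshold = 4 and
+// damping = 5, constrain(10, 4, 5) computes get_msb(4) = 2,
+// abs(10) >> (5 - 2) = 1 and max(0, 4 - 1) = 3, so it returns
+// sign(10) * min(10, 3) = 3: large differences are attenuated instead of
+// being passed through unchanged.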
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col);
+int sb_compute_dering_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
+ dering_list *dlist, int filter_skip);
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd);
+
+void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+ AV1_COMMON *cm, MACROBLOCKD *xd);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AV1_COMMON_DERING_H_
diff --git a/third_party/aom/av1/common/cdef_simd.h b/third_party/aom/av1/common/cdef_simd.h
new file mode 100644
index 000000000..2649099a2
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_simd.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_CDEF_SIMD_H_
+#define AV1_COMMON_CDEF_SIMD_H_
+
+#include "aom_dsp/aom_simd.h"
+
+// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
+SIMD_INLINE v128 constrain16(v128 a, v128 b, unsigned int threshold,
+ unsigned int adjdamp) {
+ v128 diff = v128_sub_16(a, b);
+ const v128 sign = v128_shr_n_s16(diff, 15);
+ diff = v128_abs_s16(diff);
+ const v128 s =
+ v128_ssub_u16(v128_dup_16(threshold), v128_shr_u16(diff, adjdamp));
+ return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);
+}
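+// Note on the closing lines above (informational): v128_xor(v128_add_16(sign,
+// v), sign) is the branchless two's-complement identity (v + s) ^ s, which
+// equals -v when s == -1 and v when s == 0, re-applying the saved sign to the
+// clamped magnitude without a vector select.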
+
+#endif // AV1_COMMON_CDEF_SIMD_H_
diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c
new file mode 100644
index 000000000..d66a989ad
--- /dev/null
+++ b/third_party/aom/av1/common/cfl.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/cfl.h"
+#include "av1/common/common_data.h"
+#include "av1/common/onyxc_int.h"
+
+#include "aom/internal/aom_codec_internal.h"
+
+void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm, int subsampling_x,
+ int subsampling_y) {
+ if (!((subsampling_x == 0 && subsampling_y == 0) ||
+ (subsampling_x == 1 && subsampling_y == 1))) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Only 4:4:4 and 4:2:0 are currently supported by CfL");
+ }
+ memset(&cfl->y_pix, 0, sizeof(uint8_t) * MAX_SB_SQUARE);
+ cfl->subsampling_x = subsampling_x;
+ cfl->subsampling_y = subsampling_y;
+}
+
+// CfL computes its own block-level DC_PRED. This is required so that both
+// alpha_cb and alpha_cr can be computed before the predictions are computed.
+void cfl_dc_pred(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+ const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U];
+ const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V];
+
+ const uint8_t *const dst_u = pd_u->dst.buf;
+ const uint8_t *const dst_v = pd_v->dst.buf;
+
+ const int dst_u_stride = pd_u->dst.stride;
+ const int dst_v_stride = pd_v->dst.stride;
+
+ const int block_width = (plane_bsize != BLOCK_INVALID)
+ ? block_size_wide[plane_bsize]
+ : tx_size_wide[tx_size];
+ const int block_height = (plane_bsize != BLOCK_INVALID)
+ ? block_size_high[plane_bsize]
+ : tx_size_high[tx_size];
+
+  // Number of pixels on the top and left borders.
+ const int num_pel = block_width + block_height;
+
+ int sum_u = 0;
+ int sum_v = 0;
+
+ // Match behavior of build_intra_predictors (reconintra.c) at superblock
+ // boundaries:
+ //
+ // 127 127 127 .. 127 127 127 127 127 127
+ // 129 A B .. Y Z
+ // 129 C D .. W X
+ // 129 E F .. U V
+ // 129 G H .. S T T T T T
+ // ..
+
+ // TODO(ltrudeau) replace this with DC_PRED assembly
+ if (xd->up_available && xd->mb_to_right_edge >= 0) {
+ for (int i = 0; i < block_width; i++) {
+ sum_u += dst_u[-dst_u_stride + i];
+ sum_v += dst_v[-dst_v_stride + i];
+ }
+ } else {
+ sum_u = block_width * 127;
+ sum_v = block_width * 127;
+ }
+
+ if (xd->left_available && xd->mb_to_bottom_edge >= 0) {
+ for (int i = 0; i < block_height; i++) {
+ sum_u += dst_u[i * dst_u_stride - 1];
+ sum_v += dst_v[i * dst_v_stride - 1];
+ }
+ } else {
+ sum_u += block_height * 129;
+ sum_v += block_height * 129;
+ }
+
+ xd->cfl->dc_pred[CFL_PRED_U] = (sum_u + (num_pel >> 1)) / num_pel;
+ xd->cfl->dc_pred[CFL_PRED_V] = (sum_v + (num_pel >> 1)) / num_pel;
+}
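+// Worked example (derived from the rounding above): for an 8x8 chroma block,
+// num_pel = 8 + 8 = 16; a border sum of 1000 yields
+// dc_pred = (1000 + 8) / 16 = 63, a rounded average of the border pixels.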
+
+// Predict the current transform block using CfL.
+// It is assumed that dst points to the start of the transform block.
+void cfl_predict_block(const CFL_CTX *cfl, uint8_t *dst, int dst_stride,
+ int row, int col, TX_SIZE tx_size, int dc_pred) {
+ const int tx_block_width = tx_size_wide[tx_size];
+ const int tx_block_height = tx_size_high[tx_size];
+
+ // TODO(ltrudeau) implement alpha
+  // Placeholder for alpha
+ const double alpha = 0;
+ const double y_avg = cfl_load(cfl, dst, dst_stride, row, col, tx_size);
+
+ for (int j = 0; j < tx_block_height; j++) {
+ for (int i = 0; i < tx_block_width; i++) {
+ dst[i] = (uint8_t)(alpha * y_avg + dc_pred + 0.5);
+ }
+ dst += dst_stride;
+ }
+}
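+// Note (informational): with the placeholder alpha of 0 above, the loop
+// reduces to dst[i] = (uint8_t)(dc_pred + 0.5), i.e. plain DC prediction;
+// cfl_load() is still invoked for the luma average that a real alpha would
+// scale, with dst serving as scratch space for the loaded luma.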
+
+void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row,
+ int col, TX_SIZE tx_size) {
+ const int tx_width = tx_size_wide[tx_size];
+ const int tx_height = tx_size_high[tx_size];
+ const int tx_off_log2 = tx_size_wide_log2[0];
+
+ // Store the input into the CfL pixel buffer
+ uint8_t *y_pix = &cfl->y_pix[(row * MAX_SB_SIZE + col) << tx_off_log2];
+
+ // Check that we remain inside the pixel buffer.
+ assert(MAX_SB_SIZE * (row + tx_height - 1) + col + tx_width - 1 <
+ MAX_SB_SQUARE);
+
+ for (int j = 0; j < tx_height; j++) {
+ for (int i = 0; i < tx_width; i++) {
+ y_pix[i] = input[i];
+ }
+ y_pix += MAX_SB_SIZE;
+ input += input_stride;
+ }
+
+  // Record the extent of the pixel-buffer surface that was written to, so
+  // that chroma overrun can be managed (e.g. when the chroma surface goes
+  // beyond the frame boundary).
+ if (col == 0 && row == 0) {
+ cfl->y_width = tx_width;
+ cfl->y_height = tx_height;
+ } else {
+ cfl->y_width = OD_MAXI((col << tx_off_log2) + tx_width, cfl->y_width);
+ cfl->y_height = OD_MAXI((row << tx_off_log2) + tx_height, cfl->y_height);
+ }
+}
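+// Worked example (derived from the bookkeeping above): storing a TX_8X8 block
+// at (row, col) = (0, 2) with tx_off_log2 == 2 covers luma columns up to
+// (2 << 2) + 8 = 16, so cfl->y_width is raised to at least 16 pixels.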
+
+// Load from the CfL pixel buffer into output
+double cfl_load(const CFL_CTX *cfl, uint8_t *output, int output_stride, int row,
+ int col, TX_SIZE tx_size) {
+ const int tx_width = tx_size_wide[tx_size];
+ const int tx_height = tx_size_high[tx_size];
+ const int sub_x = cfl->subsampling_x;
+ const int sub_y = cfl->subsampling_y;
+ const int tx_off_log2 = tx_size_wide_log2[0];
+
+ const uint8_t *y_pix;
+
+ int diff_width = 0;
+ int diff_height = 0;
+
+ int pred_row_offset = 0;
+ int output_row_offset = 0;
+ int top_left, bot_left;
+
+ // TODO(ltrudeau) add support for 4:2:2
+ if (sub_y == 0 && sub_x == 0) {
+ y_pix = &cfl->y_pix[(row * MAX_SB_SIZE + col) << tx_off_log2];
+ int uv_width = (col << tx_off_log2) + tx_width;
+ diff_width = uv_width - cfl->y_width;
+    int uv_height = (row << tx_off_log2) + tx_height;
+ diff_height = uv_height - cfl->y_height;
+ for (int j = 0; j < tx_height; j++) {
+ for (int i = 0; i < tx_width; i++) {
+ // In 4:4:4, pixels match 1 to 1
+ output[output_row_offset + i] = y_pix[pred_row_offset + i];
+ }
+ pred_row_offset += MAX_SB_SIZE;
+ output_row_offset += output_stride;
+ }
+ } else if (sub_y == 1 && sub_x == 1) {
+ y_pix = &cfl->y_pix[(row * MAX_SB_SIZE + col) << (tx_off_log2 + sub_y)];
+ int uv_width = ((col << tx_off_log2) + tx_width) << sub_x;
+ diff_width = (uv_width - cfl->y_width) >> sub_x;
+    int uv_height = ((row << tx_off_log2) + tx_height) << sub_y;
+ diff_height = (uv_height - cfl->y_height) >> sub_y;
+ for (int j = 0; j < tx_height; j++) {
+ for (int i = 0; i < tx_width; i++) {
+ top_left = (pred_row_offset + i) << sub_y;
+ bot_left = top_left + MAX_SB_SIZE;
+ // In 4:2:0, average pixels in 2x2 grid
+ output[output_row_offset + i] = OD_SHR_ROUND(
+ y_pix[top_left] + y_pix[top_left + 1] // Top row
+ + y_pix[bot_left] + y_pix[bot_left + 1] // Bottom row
+ ,
+ 2);
+ }
+ pred_row_offset += MAX_SB_SIZE;
+ output_row_offset += output_stride;
+ }
+ } else {
+ assert(0); // Unsupported chroma subsampling
+ }
+  // Due to frame boundary issues, it is possible that the total area covered
+  // by chroma exceeds that of luma. When this happens, we write over the
+  // broken data by repeating the last valid columns and/or rows.
+  //
+  // Note that to handle the case where both rows and columns overrun, the
+  // column repair runs first; the row repair below then copies rows that
+  // already contain the repaired columns.
+ if (diff_width > 0) {
+ int last_pixel;
+ output_row_offset = tx_width - diff_width;
+
+ for (int j = 0; j < tx_height; j++) {
+ last_pixel = output_row_offset - 1;
+ for (int i = 0; i < diff_width; i++) {
+ output[output_row_offset + i] = output[last_pixel];
+ }
+ output_row_offset += output_stride;
+ }
+ }
+
+ if (diff_height > 0) {
+ output_row_offset = diff_height * output_stride;
+ const int last_row_offset = output_row_offset - output_stride;
+ for (int j = 0; j < diff_height; j++) {
+ for (int i = 0; i < tx_width; i++) {
+ output[output_row_offset + i] = output[last_row_offset + i];
+ }
+ output_row_offset += output_stride;
+ }
+ }
+
+ int avg = 0;
+ output_row_offset = 0;
+ for (int j = 0; j < tx_height; j++) {
+ for (int i = 0; i < tx_width; i++) {
+ avg += output[output_row_offset + i];
+ }
+ output_row_offset += output_stride;
+ }
+ return avg / (double)(tx_width * tx_height);
+}
diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h
new file mode 100644
index 000000000..371df70be
--- /dev/null
+++ b/third_party/aom/av1/common/cfl.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_CFL_H_
+#define AV1_COMMON_CFL_H_
+
+#include "av1/common/enums.h"
+
+// Forward declaration of AV1_COMMON, in order to avoid creating a cyclic
+// dependency by importing av1/common/onyxc_int.h
+typedef struct AV1Common AV1_COMMON;
+
+// Forward declaration of MACROBLOCKD, in order to avoid creating a cyclic
+// dependency by importing av1/common/blockd.h
+typedef struct macroblockd MACROBLOCKD;
+
+typedef struct {
+ // Pixel buffer containing the luma pixels used as prediction for chroma
+ uint8_t y_pix[MAX_SB_SQUARE];
+
+ // Height and width of the luma prediction block currently in the pixel buffer
+ int y_height, y_width;
+
+ // Chroma subsampling
+ int subsampling_x, subsampling_y;
+
+  // CfL performs its own block-level DC_PRED for each chroma plane.
+ int dc_pred[CFL_PRED_PLANES];
+} CFL_CTX;
+
+void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm, int subsampling_x,
+ int subsampling_y);
+
+void cfl_dc_pred(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+
+void cfl_predict_block(const CFL_CTX *cfl, uint8_t *dst, int dst_stride,
+ int row, int col, TX_SIZE tx_size, int dc_pred);
+
+void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row,
+ int col, TX_SIZE tx_size);
+
+double cfl_load(const CFL_CTX *cfl, uint8_t *output, int output_stride, int row,
+ int col, TX_SIZE tx_size);
+#endif // AV1_COMMON_CFL_H_
diff --git a/third_party/aom/av1/common/clpf.c b/third_party/aom/av1/common/clpf.c
new file mode 100644
index 000000000..3637deeea
--- /dev/null
+++ b/third_party/aom/av1/common/clpf.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./clpf.h"
+#include "./av1_rtcd.h"
+#include "./cdef.h"
+#include "aom/aom_image.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
+ int H, int s, unsigned int dmp) {
+ int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) +
+ 1 * constrain(C - X, s, dmp) + 3 * constrain(D - X, s, dmp) +
+ 3 * constrain(E - X, s, dmp) + 1 * constrain(F - X, s, dmp) +
+ 3 * constrain(G - X, s, dmp) + 1 * constrain(H - X, s, dmp);
+ return (8 + delta - (delta < 0)) >> 4;
+}
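+// Worked example (derived from the weights above): if all eight neighbors
+// equal X + 2, with s = 4 and ample damping every constrain() term is 2, so
+// delta = (1 + 3 + 1 + 3 + 3 + 1 + 3 + 1) * 2 = 32 and the sample moves by
+// (8 + 32) >> 4 = 2 toward its neighborhood.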
+
+int av1_clpf_hsample(int X, int A, int B, int C, int D, int s,
+ unsigned int dmp) {
+ int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) +
+ 3 * constrain(C - X, s, dmp) + 1 * constrain(D - X, s, dmp);
+ return (4 + delta - (delta < 0)) >> 3;
+}
+
+void aom_clpf_block_c(uint8_t *dst, const uint16_t *src, int dstride,
+ int sstride, int sizex, int sizey, unsigned int strength,
+ unsigned int damping) {
+ int x, y;
+
+ for (y = 0; y < sizey; y++) {
+ for (x = 0; x < sizex; x++) {
+ const int X = src[y * sstride + x];
+ const int A = src[(y - 2) * sstride + x];
+ const int B = src[(y - 1) * sstride + x];
+ const int C = src[y * sstride + x - 2];
+ const int D = src[y * sstride + x - 1];
+ const int E = src[y * sstride + x + 1];
+ const int F = src[y * sstride + x + 2];
+ const int G = src[(y + 1) * sstride + x];
+ const int H = src[(y + 2) * sstride + x];
+ const int delta =
+ av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping);
+ dst[y * dstride + x] = X + delta;
+ }
+ }
+}
+
+// Identical to aom_clpf_block_c() apart from "dst".
+void aom_clpf_block_hbd_c(uint16_t *dst, const uint16_t *src, int dstride,
+ int sstride, int sizex, int sizey,
+ unsigned int strength, unsigned int damping) {
+ int x, y;
+
+ for (y = 0; y < sizey; y++) {
+ for (x = 0; x < sizex; x++) {
+ const int X = src[y * sstride + x];
+ const int A = src[(y - 2) * sstride + x];
+ const int B = src[(y - 1) * sstride + x];
+ const int C = src[y * sstride + x - 2];
+ const int D = src[y * sstride + x - 1];
+ const int E = src[y * sstride + x + 1];
+ const int F = src[y * sstride + x + 2];
+ const int G = src[(y + 1) * sstride + x];
+ const int H = src[(y + 2) * sstride + x];
+ const int delta =
+ av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping);
+ dst[y * dstride + x] = X + delta;
+ }
+ }
+}
+
+// Vertically restricted filter
+void aom_clpf_hblock_c(uint8_t *dst, const uint16_t *src, int dstride,
+ int sstride, int sizex, int sizey, unsigned int strength,
+ unsigned int damping) {
+ int x, y;
+
+ for (y = 0; y < sizey; y++) {
+ for (x = 0; x < sizex; x++) {
+ const int X = src[y * sstride + x];
+ const int A = src[y * sstride + x - 2];
+ const int B = src[y * sstride + x - 1];
+ const int C = src[y * sstride + x + 1];
+ const int D = src[y * sstride + x + 2];
+ const int delta = av1_clpf_hsample(X, A, B, C, D, strength, damping);
+ dst[y * dstride + x] = X + delta;
+ }
+ }
+}
+
+void aom_clpf_hblock_hbd_c(uint16_t *dst, const uint16_t *src, int dstride,
+ int sstride, int sizex, int sizey,
+ unsigned int strength, unsigned int damping) {
+ int x, y;
+
+ for (y = 0; y < sizey; y++) {
+ for (x = 0; x < sizex; x++) {
+ const int X = src[y * sstride + x];
+ const int A = src[y * sstride + x - 2];
+ const int B = src[y * sstride + x - 1];
+ const int C = src[y * sstride + x + 1];
+ const int D = src[y * sstride + x + 2];
+ const int delta = av1_clpf_hsample(X, A, B, C, D, strength, damping);
+ dst[y * dstride + x] = X + delta;
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/clpf.h b/third_party/aom/av1/common/clpf.h
new file mode 100644
index 000000000..d6348deb0
--- /dev/null
+++ b/third_party/aom/av1/common/clpf.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_CLPF_H_
+#define AV1_COMMON_CLPF_H_
+
+#include "av1/common/reconinter.h"
+
+int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
+                    int H, int s, unsigned int dmp);
+#endif
diff --git a/third_party/aom/av1/common/clpf_neon.c b/third_party/aom/av1/common/clpf_neon.c
new file mode 100644
index 000000000..f1a004c2c
--- /dev/null
+++ b/third_party/aom/av1/common/clpf_neon.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_neon
+#include "./clpf_simd.h"
diff --git a/third_party/aom/av1/common/clpf_simd.h b/third_party/aom/av1/common/clpf_simd.h
new file mode 100644
index 000000000..a615b5ed3
--- /dev/null
+++ b/third_party/aom/av1/common/clpf_simd.h
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "./cdef_simd.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+
+// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
+SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
+ unsigned int adjdamp) {
+ const v256 diff16 = v256_sub_16(a, b);
+ v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
+ const v128 sign = v128_cmplt_s8(diff, v128_zero());
+ diff = v128_abs_s8(diff);
+ return v128_xor(
+ v128_add_8(sign,
+ v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
+ v128_shr_u8(diff, adjdamp)))),
+ sign);
+}
+
+// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
+// 1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
+// 3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
+// 3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
+SIMD_INLINE v128 calc_delta(v256 x, v256 a, v256 b, v256 c, v256 d, v256 e,
+ v256 f, v256 g, v256 h, unsigned int s,
+ unsigned int dmp) {
+ const v128 bdeg =
+ v128_add_8(v128_add_8(constrain(b, x, s, dmp), constrain(d, x, s, dmp)),
+ v128_add_8(constrain(e, x, s, dmp), constrain(g, x, s, dmp)));
+ const v128 delta = v128_add_8(
+ v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(c, x, s, dmp)),
+ v128_add_8(constrain(f, x, s, dmp), constrain(h, x, s, dmp))),
+ v128_add_8(v128_add_8(bdeg, bdeg), bdeg));
+ return v128_add_8(
+ v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
+ v128_shr_s8(
+ v128_add_8(v128_dup_8(8),
+ v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
+ 4));
+}
+
+// delta = 1/8 * constrain(a, x, s, d) + 3/8 * constrain(b, x, s, d) +
+// 3/8 * constrain(c, x, s, d) + 1/8 * constrain(d, x, s, d)
+SIMD_INLINE v128 calc_hdelta(v256 x, v256 a, v256 b, v256 c, v256 d,
+ unsigned int s, unsigned int dmp) {
+ const v128 bc = v128_add_8(constrain(b, x, s, dmp), constrain(c, x, s, dmp));
+ const v128 delta =
+ v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(d, x, s, dmp)),
+ v128_add_8(v128_add_8(bc, bc), bc));
+ return v128_add_8(
+ v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
+ v128_shr_s8(
+ v128_add_8(v128_dup_8(4),
+ v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
+ 3));
+}
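+
+// Note: in the two helpers above, v128_cmplt_s8(delta, v128_zero()) is an
+// all-ones mask (-1) in negative lanes, so the shifts compute
+// (delta + 8 - (delta < 0)) >> 4 and (delta + 4 - (delta < 0)) >> 3, i.e.
+// division by 16 (resp. 8) rounded to nearest with ties away from zero.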
+
+// Process blocks of width 8, two lines at a time, 8 bit.
+static void SIMD_FUNC(clpf_block8)(uint8_t *dst, const uint16_t *src,
+ int dstride, int sstride, int sizey,
+ unsigned int strength,
+ unsigned int adjdamp) {
+ int y;
+
+ for (y = 0; y < sizey; y += 2) {
+ const v128 l1 = v128_load_aligned(src);
+ const v128 l2 = v128_load_aligned(src + sstride);
+ const v128 l3 = v128_load_aligned(src - sstride);
+ const v128 l4 = v128_load_aligned(src + 2 * sstride);
+ const v256 a = v256_from_v128(v128_load_aligned(src - 2 * sstride), l3);
+ const v256 b = v256_from_v128(l3, l1);
+ const v256 g = v256_from_v128(l2, l4);
+ const v256 h = v256_from_v128(l4, v128_load_aligned(src + 3 * sstride));
+ const v256 c = v256_from_v128(v128_load_unaligned(src - 2),
+ v128_load_unaligned(src - 2 + sstride));
+ const v256 d = v256_from_v128(v128_load_unaligned(src - 1),
+ v128_load_unaligned(src - 1 + sstride));
+ const v256 e = v256_from_v128(v128_load_unaligned(src + 1),
+ v128_load_unaligned(src + 1 + sstride));
+ const v256 f = v256_from_v128(v128_load_unaligned(src + 2),
+ v128_load_unaligned(src + 2 + sstride));
+ const v128 o = calc_delta(v256_from_v128(l1, l2), a, b, c, d, e, f, g, h,
+ strength, adjdamp);
+
+ v64_store_aligned(dst, v128_high_v64(o));
+ v64_store_aligned(dst + dstride, v128_low_v64(o));
+ src += sstride * 2;
+ dst += dstride * 2;
+ }
+}
+
+// Process blocks of width 4, four lines at a time, 8 bit.
+static void SIMD_FUNC(clpf_block4)(uint8_t *dst, const uint16_t *src,
+ int dstride, int sstride, int sizey,
+ unsigned int strength,
+ unsigned int adjdamp) {
+ int y;
+
+ for (y = 0; y < sizey; y += 4) {
+ const v64 l0 = v64_load_aligned(src - 2 * sstride);
+ const v64 l1 = v64_load_aligned(src - sstride);
+ const v64 l2 = v64_load_aligned(src);
+ const v64 l3 = v64_load_aligned(src + sstride);
+ const v64 l4 = v64_load_aligned(src + 2 * sstride);
+ const v64 l5 = v64_load_aligned(src + 3 * sstride);
+ const v64 l6 = v64_load_aligned(src + 4 * sstride);
+ const v64 l7 = v64_load_aligned(src + 5 * sstride);
+ const v128 o =
+ calc_delta(v256_from_v64(l2, l3, l4, l5), v256_from_v64(l0, l1, l2, l3),
+ v256_from_v64(l1, l2, l3, l4),
+ v256_from_v64(v64_load_unaligned(src - 2),
+ v64_load_unaligned(src + sstride - 2),
+ v64_load_unaligned(src + 2 * sstride - 2),
+ v64_load_unaligned(src + 3 * sstride - 2)),
+ v256_from_v64(v64_load_unaligned(src - 1),
+ v64_load_unaligned(src + sstride - 1),
+ v64_load_unaligned(src + 2 * sstride - 1),
+ v64_load_unaligned(src + 3 * sstride - 1)),
+ v256_from_v64(v64_load_unaligned(src + 1),
+ v64_load_unaligned(src + sstride + 1),
+ v64_load_unaligned(src + 2 * sstride + 1),
+ v64_load_unaligned(src + 3 * sstride + 1)),
+ v256_from_v64(v64_load_unaligned(src + 2),
+ v64_load_unaligned(src + sstride + 2),
+ v64_load_unaligned(src + 2 * sstride + 2),
+ v64_load_unaligned(src + 3 * sstride + 2)),
+ v256_from_v64(l3, l4, l5, l6), v256_from_v64(l4, l5, l6, l7),
+ strength, adjdamp);
+
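+ // The four filtered 4-pixel rows are packed into o with the first row in
+ // the most significant four bytes, so peel off one row per store by
+ // shifting right 12, 8, 4 and 0 bytes.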
+ u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
+ u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
+ u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
+ u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
+ }
+}
+
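+// Process blocks of width 8, horizontal filter, two lines at a time, 8 bit.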
+static void SIMD_FUNC(clpf_hblock8)(uint8_t *dst, const uint16_t *src,
+ int dstride, int sstride, int sizey,
+ unsigned int strength,
+ unsigned int adjdamp) {
+ int y;
+
+ for (y = 0; y < sizey; y += 2) {
+ const v256 x = v256_from_v128(v128_load_aligned(src),
+ v128_load_aligned(src + sstride));
+ const v256 a = v256_from_v128(v128_load_unaligned(src - 2),
+ v128_load_unaligned(src - 2 + sstride));
+ const v256 b = v256_from_v128(v128_load_unaligned(src - 1),
+ v128_load_unaligned(src - 1 + sstride));
+ const v256 c = v256_from_v128(v128_load_unaligned(src + 1),
+ v128_load_unaligned(src + 1 + sstride));
+ const v256 d = v256_from_v128(v128_load_unaligned(src + 2),
+ v128_load_unaligned(src + 2 + sstride));
+ const v128 o = calc_hdelta(x, a, b, c, d, strength, adjdamp);
+
+ v64_store_aligned(dst, v128_high_v64(o));
+ v64_store_aligned(dst + dstride, v128_low_v64(o));
+ src += sstride * 2;
+ dst += dstride * 2;
+ }
+}
+
+// Process blocks of width 4, horizontal filter, four lines at a time, 8 bit.
+static void SIMD_FUNC(clpf_hblock4)(uint8_t *dst, const uint16_t *src,
+ int dstride, int sstride, int sizey,
+ unsigned int strength,
+ unsigned int adjdamp) {
+ int y;
+
+ for (y = 0; y < sizey; y += 4) {
+ const v256 a = v256_from_v64(v64_load_unaligned(src - 2),
+ v64_load_unaligned(src + sstride - 2),
+ v64_load_unaligned(src + 2 * sstride - 2),
+ v64_load_unaligned(src + 3 * sstride - 2));
+ const v256 b = v256_from_v64(v64_load_unaligned(src - 1),
+ v64_load_unaligned(src + sstride - 1),
+ v64_load_unaligned(src + 2 * sstride - 1),
+ v64_load_unaligned(src + 3 * sstride - 1));
+ const v256 c = v256_from_v64(v64_load_unaligned(src + 1),
+ v64_load_unaligned(src + sstride + 1),
+ v64_load_unaligned(src + 2 * sstride + 1),
+ v64_load_unaligned(src + 3 * sstride + 1));
+ const v256 d = v256_from_v64(v64_load_unaligned(src + 2),
+ v64_load_unaligned(src + sstride + 2),
+ v64_load_unaligned(src + 2 * sstride + 2),
+ v64_load_unaligned(src + 3 * sstride + 2));
+
+ const v128 o = calc_hdelta(
+ v256_from_v64(v64_load_aligned(src), v64_load_aligned(src + sstride),
+ v64_load_aligned(src + 2 * sstride),
+ v64_load_aligned(src + 3 * sstride)),
+ a, b, c, d, strength, adjdamp);
+
+ u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
+ u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
+ u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
+ u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
+ }
+}
+
+void SIMD_FUNC(aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride,
+ int sstride, int sizex, int sizey,
+ unsigned int strength, unsigned int dmp) {
+ if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
+ // Fallback to C for odd sizes:
+ // * block widths not 4 or 8
+ // * block heights not a multiple of 4 if the block width is 4
+ aom_clpf_block_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
+ } else {
+ (sizex == 4 ? SIMD_FUNC(clpf_block4) : SIMD_FUNC(clpf_block8))(
+ dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
+ }
+}
+
+void SIMD_FUNC(aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride,
+ int sstride, int sizex, int sizey,
+ unsigned int strength, unsigned int dmp) {
+ if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
+ // Fallback to C for odd sizes:
+ // * block widths not 4 or 8
+ // * block heights not a multiple of 4 if the block width is 4
+ aom_clpf_hblock_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
+ } else {
+ (sizex == 4 ? SIMD_FUNC(clpf_hblock4) : SIMD_FUNC(clpf_hblock8))(
+ dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
+ }
+}
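+
+// Note: both dispatchers pass dmp - get_msb(strength) to the kernels, so
+// the damping used inside constrain() is relative to log2(strength); e.g.
+// strength 4 (get_msb == 2) with dmp 5 yields an adjusted damping of 3.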
+
+// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
+// 1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
+// 3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
+// 3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
+SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
+ v128 f, v128 g, v128 h, unsigned int s,
+ unsigned int dmp) {
+ const v128 bdeg = v128_add_16(
+ v128_add_16(constrain16(b, x, s, dmp), constrain16(d, x, s, dmp)),
+ v128_add_16(constrain16(e, x, s, dmp), constrain16(g, x, s, dmp)));
+ const v128 delta = v128_add_16(
+ v128_add_16(
+ v128_add_16(constrain16(a, x, s, dmp), constrain16(c, x, s, dmp)),
+ v128_add_16(constrain16(f, x, s, dmp), constrain16(h, x, s, dmp))),
+ v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
+ return v128_add_16(
+ x,
+ v128_shr_s16(
+ v128_add_16(v128_dup_16(8),
+ v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
+ 4));
+}
+
+static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+ v128 f, v128 g, v128 h, uint16_t *dst,
+ unsigned int s, unsigned int dmp, int dstride) {
+ o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);
+ v64_store_aligned(dst, v128_high_v64(o));
+ v64_store_aligned(dst + dstride, v128_low_v64(o));
+}
+
+static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+ v128 f, v128 g, v128 h, uint16_t *dst,
+ unsigned int s, unsigned int adjdamp) {
+ v128_store_aligned(dst,
+ calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, adjdamp));
+}
+
+// delta = 1/8 * constrain(a, x, s, dmp) + 3/8 * constrain(b, x, s, dmp) +
+// 3/8 * constrain(c, x, s, dmp) + 1/8 * constrain(d, x, s, dmp)
+SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
+ unsigned int s, unsigned int dmp) {
+ const v128 bc =
+ v128_add_16(constrain16(b, x, s, dmp), constrain16(c, x, s, dmp));
+ const v128 delta = v128_add_16(
+ v128_add_16(constrain16(a, x, s, dmp), constrain16(d, x, s, dmp)),
+ v128_add_16(v128_add_16(bc, bc), bc));
+ return v128_add_16(
+ x,
+ v128_shr_s16(
+ v128_add_16(v128_dup_16(4),
+ v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
+ 3));
+}
+
+static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d,
+ uint16_t *dst, unsigned int s,
+ unsigned int adjdamp, int dstride) {
+ o = calc_hdelta_hbd(o, a, b, c, d, s, adjdamp);
+ v64_store_aligned(dst, v128_high_v64(o));
+ v64_store_aligned(dst + dstride, v128_low_v64(o));
+}
+
+static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d,
+ uint16_t *dst, unsigned int s,
+ unsigned int adjdamp) {
+ v128_store_aligned(dst, calc_hdelta_hbd(o, a, b, c, d, s, adjdamp));
+}
+
+// Process blocks of width 4, two lines at a time.
+static void SIMD_FUNC(clpf_block_hbd4)(uint16_t *dst, const uint16_t *src,
+ int dstride, int sstride, int sizey,
+ unsigned int strength,
+ unsigned int adjdamp) {
+ int y;
+
+ for (y = 0; y < sizey; y += 2) {
+ const v64 l1 = v64_load_aligned(src);
+ const v64 l2 = v64_load_aligned(src + sstride);
+ const v64 l3 = v64_load_aligned(src - sstride);
+ const v64 l4 = v64_load_aligned(src + 2 * sstride);
+ const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3);
+ const v128 b = v128_from_v64(l3, l1);
+ const v128 g = v128_from_v64(l2, l4);
+ const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride));
+ const v128 c = v128_from_v64(v64_load_unaligned(src - 2),
+ v64_load_unaligned(src - 2 + sstride));
+ const v128 d = v128_from_v64(v64_load_unaligned(src - 1),
+ v64_load_unaligned(src - 1 + sstride));
+ const v128 e = v128_from_v64(v64_load_unaligned(src + 1),
+ v64_load_unaligned(src + 1 + sstride));
+ const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
+ v64_load_unaligned(src + 2 + sstride));
+
+ calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst,
+ strength, adjdamp, dstride);
+ src += sstride * 2;
+ dst += dstride * 2;
+ }
+}
+
+// The simplest case. Start here if you need to understand the functions.
+static void SIMD_FUNC(clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
+ int dstride, int sstride, int sizey,
+ unsigned int strength,
+ unsigned int adjdamp) {
+ int y;
+
+ for (y = 0; y < sizey; y++) {
+ const v128 o = v128_load_aligned(src);
+ const v128 a = v128_load_aligned(src - 2 * sstride);
+ const v128 b = v128_load_aligned(src - 1 * sstride);
+ const v128 g = v128_load_aligned(src + sstride);
+ const v128 h = v128_load_aligned(src + 2 * sstride);
+ const v128 c = v128_load_unaligned(src - 2);
+ const v128 d = v128_load_unaligned(src - 1);
+ const v128 e = v128_load_unaligned(src + 1);
+ const v128 f = v128_load_unaligned(src + 2);
+
+ calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, adjdamp);
+ src += sstride;
+ dst += dstride;
+ }
+}
+
+// Process blocks of width 4, horizontal filter, two lines at a time.
+static void SIMD_FUNC(clpf_hblock_hbd4)(uint16_t *dst, const uint16_t *src,
+ int dstride, int sstride, int sizey,
+ unsigned int strength,
+ unsigned int adjdamp) {
+ int y;
+
+ for (y = 0; y < sizey; y += 2) {
+ const v128 a = v128_from_v64(v64_load_unaligned(src - 2),
+ v64_load_unaligned(src - 2 + sstride));
+ const v128 b = v128_from_v64(v64_load_unaligned(src - 1),
+ v64_load_unaligned(src - 1 + sstride));
+ const v128 c = v128_from_v64(v64_load_unaligned(src + 1),
+ v64_load_unaligned(src + 1 + sstride));
+ const v128 d = v128_from_v64(v64_load_unaligned(src + 2),
+ v64_load_unaligned(src + 2 + sstride));
+
+ calc_hdelta_hbd4(v128_from_v64(v64_load_unaligned(src),
+ v64_load_unaligned(src + sstride)),
+ a, b, c, d, dst, strength, adjdamp, dstride);
+ src += sstride * 2;
+ dst += dstride * 2;
+ }
+}
+
+// Process blocks of width 8, horizontal filter, one line at a time.
+static void SIMD_FUNC(clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
+ int dstride, int sstride, int sizey,
+ unsigned int strength,
+ unsigned int adjdamp) {
+ int y;
+
+ for (y = 0; y < sizey; y++) {
+ const v128 o = v128_load_aligned(src);
+ const v128 a = v128_load_unaligned(src - 2);
+ const v128 b = v128_load_unaligned(src - 1);
+ const v128 c = v128_load_unaligned(src + 1);
+ const v128 d = v128_load_unaligned(src + 2);
+
+ calc_hdelta_hbd8(o, a, b, c, d, dst, strength, adjdamp);
+ src += sstride;
+ dst += dstride;
+ }
+}
+
+void SIMD_FUNC(aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
+ int dstride, int sstride, int sizex,
+ int sizey, unsigned int strength,
+ unsigned int dmp) {
+ if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
+ // Fallback to C for odd sizes:
+ // * block width not 4 or 8
+ // * block heights not a multiple of 2 if the block width is 4
+ aom_clpf_block_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
+ dmp);
+ } else {
+ (sizex == 4 ? SIMD_FUNC(clpf_block_hbd4) : SIMD_FUNC(clpf_block_hbd))(
+ dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
+ }
+}
+
+void SIMD_FUNC(aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
+ int dstride, int sstride, int sizex,
+ int sizey, unsigned int strength,
+ unsigned int dmp) {
+ if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
+ // Fallback to C for odd sizes:
+ // * block width not 4 or 8
+ // * block heights not a multiple of 2 if the block width is 4
+ aom_clpf_hblock_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
+ dmp);
+ } else {
+ (sizex == 4 ? SIMD_FUNC(clpf_hblock_hbd4) : SIMD_FUNC(clpf_hblock_hbd))(
+ dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
+ }
+}
diff --git a/third_party/aom/av1/common/clpf_sse2.c b/third_party/aom/av1/common/clpf_sse2.c
new file mode 100644
index 000000000..e29c2ab7e
--- /dev/null
+++ b/third_party/aom/av1/common/clpf_sse2.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse2
+#include "./clpf_simd.h"
diff --git a/third_party/aom/av1/common/clpf_sse4.c b/third_party/aom/av1/common/clpf_sse4.c
new file mode 100644
index 000000000..537139f17
--- /dev/null
+++ b/third_party/aom/av1/common/clpf_sse4.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse4_1
+#include "./clpf_simd.h"
diff --git a/third_party/aom/av1/common/clpf_ssse3.c b/third_party/aom/av1/common/clpf_ssse3.c
new file mode 100644
index 000000000..d7ed8dec5
--- /dev/null
+++ b/third_party/aom/av1/common/clpf_ssse3.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_ssse3
+#include "./clpf_simd.h"
diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h
new file mode 100644
index 000000000..551055a76
--- /dev/null
+++ b/third_party/aom/av1/common/common.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_COMMON_H_
+#define AV1_COMMON_COMMON_H_
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include <assert.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/bitops.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PI 3.141592653589793238462643383279502884
+
+// Only needed for fixed-size arrays; for structs, just assign.
+#define av1_copy(dest, src) \
+ { \
+ assert(sizeof(dest) == sizeof(src)); \
+ memcpy(dest, src, sizeof(src)); \
+ }
+
+// Use this for variably-sized arrays.
+#define av1_copy_array(dest, src, n) \
+ { \
+ assert(sizeof(*(dest)) == sizeof(*(src))); \
+ memcpy(dest, src, n * sizeof(*(src))); \
+ }
+
+#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
+#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
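+// Illustrative use of the helpers above (hypothetical values):
+//   int dst[4], src[4] = { 1, 2, 3, 4 };
+//   av1_copy(dst, src);  // arrays must have the same total size
+//   av1_zero(dst);       // zero the whole array again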
+
+static INLINE int get_unsigned_bits(unsigned int num_values) {
+ return num_values > 0 ? get_msb(num_values) + 1 : 0;
+}
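+
+// Note: this appears to return the number of bits needed to code values in
+// [0, num_values], e.g. get_unsigned_bits(8) == get_msb(8) + 1 == 4.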
+
+#define CHECK_MEM_ERROR(cm, lval, expr) \
+ AOM_CHECK_MEM_ERROR(&cm->error, lval, expr)
+// TODO(yaowu): validate the usage of these codes or develop new ones.
+#define AV1_SYNC_CODE_0 0x49
+#define AV1_SYNC_CODE_1 0x83
+#define AV1_SYNC_CODE_2 0x43
+
+#define AOM_FRAME_MARKER 0x2
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_COMMON_H_
diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h
new file mode 100644
index 000000000..415d5cf73
--- /dev/null
+++ b/third_party/aom/av1/common/common_data.h
@@ -0,0 +1,1405 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_COMMON_DATA_H_
+#define AV1_COMMON_COMMON_DATA_H_
+
+#include "av1/common/enums.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_EXT_PARTITION
+#define IF_EXT_PARTITION(...) __VA_ARGS__
+#else
+#define IF_EXT_PARTITION(...)
+#endif
+
+// Log 2 conversion lookup tables for block width and height
+static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0, 0, 0,
+#endif
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, IF_EXT_PARTITION(4, 5, 5)
+};
+static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0, 0, 0,
+#endif
+ 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, IF_EXT_PARTITION(5, 4, 5)
+};
+// Log 2 conversion lookup tables for modeinfo width and height
+static const uint8_t mi_width_log2_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, IF_EXT_PARTITION(4, 5, 5)
+#else
+ 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, IF_EXT_PARTITION(3, 4, 4)
+#endif
+};
+static const uint8_t mi_height_log2_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, IF_EXT_PARTITION(5, 4, 5)
+#else
+ 0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, IF_EXT_PARTITION(4, 3, 4)
+#endif
+};
+
+static const uint8_t mi_size_wide[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, IF_EXT_PARTITION(16, 32, 32)
+#else
+ 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, IF_EXT_PARTITION(8, 16, 16)
+#endif
+};
+static const uint8_t mi_size_high[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, IF_EXT_PARTITION(32, 16, 32)
+#else
+ 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, IF_EXT_PARTITION(16, 8, 16)
+#endif
+};
+
+// Width/height lookup tables in units of various block sizes
+static const uint8_t block_size_wide[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 2, 2, 4,
+#endif
+ 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, IF_EXT_PARTITION(64, 128, 128)
+};
+
+static const uint8_t block_size_high[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 2, 4, 2,
+#endif
+ 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64, IF_EXT_PARTITION(128, 64, 128)
+};
+
+static const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 1, 1, 1,
+#endif
+ 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, IF_EXT_PARTITION(16, 32, 32)
+};
+static const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 1, 1, 1,
+#endif
+ 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, IF_EXT_PARTITION(32, 16, 32)
+};
+static const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 1, 1, 1,
+#endif
+ 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, IF_EXT_PARTITION(8, 16, 16)
+};
+static const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 1, 1, 1,
+#endif
+ 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, IF_EXT_PARTITION(16, 8, 16)
+};
+static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 1, 1, 1,
+#endif
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, IF_EXT_PARTITION(4, 8, 8)
+};
+static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 1, 1, 1,
+#endif
+ 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, IF_EXT_PARTITION(8, 4, 8)
+};
+
+// AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize)))
+static const uint8_t size_group_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 0, 0, 0,
+#endif
+ 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, IF_EXT_PARTITION(3, 3, 3)
+};
+
+static const uint8_t num_pels_log2_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 2, 3, 3,
+#endif
+ 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, IF_EXT_PARTITION(13, 13, 14)
+};
+
+/* clang-format off */
+static const PARTITION_TYPE
+ partition_lookup[MAX_SB_SIZE_LOG2 - 1][BLOCK_SIZES] = {
+ { // 4X4 ->
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif
+ // 4X4
+ PARTITION_NONE,
+ // 4X8, 8X4, 8X8
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 8X16, 16X8, 16X16
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 16X32, 32X16, 32X32
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 32X64, 64X32, 64X64
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif // CONFIG_EXT_PARTITION
+ }, { // 8X8 ->
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif
+ // 4X4
+ PARTITION_SPLIT,
+ // 4X8, 8X4, 8X8
+ PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+ // 8X16, 16X8, 16X16
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 16X32, 32X16, 32X32
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 32X64, 64X32, 64X64
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif // CONFIG_EXT_PARTITION
+ }, { // 16X16 ->
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif
+ // 4X4
+ PARTITION_SPLIT,
+ // 4X8, 8X4, 8X8
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 8X16, 16X8, 16X16
+ PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+ // 16X32, 32X16, 32X32
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 32X64, 64X32, 64X64
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif // CONFIG_EXT_PARTITION
+ }, { // 32X32 ->
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif
+ // 4X4
+ PARTITION_SPLIT,
+ // 4X8, 8X4, 8X8
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 8X16, 16X8, 16X16
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 16X32, 32X16, 32X32
+ PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+ // 32X64, 64X32, 64X64
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif // CONFIG_EXT_PARTITION
+ }, { // 64X64 ->
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif
+ // 4X4
+ PARTITION_SPLIT,
+ // 4X8, 8X4, 8X8
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 8X16, 16X8, 16X16
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 16X32, 32X16, 32X32
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 32X64, 64X32, 64X64
+ PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ }, { // 128x128 ->
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif
+ // 4X4
+ PARTITION_SPLIT,
+ // 4X8, 8X4, 8X8
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 8X16, 16X8, 16X16
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 16X32, 32X16, 32X32
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 32X64, 64X32, 64X64
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 64x128, 128x64, 128x128
+ PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+#endif // CONFIG_EXT_PARTITION
+ }
+};
+
+#if CONFIG_EXT_PARTITION_TYPES
+static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][BLOCK_SIZES] =
+#else
+static const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] =
+#endif // CONFIG_EXT_PARTITION_TYPES
+{
+ { // PARTITION_NONE
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ BLOCK_2X2, BLOCK_2X4, BLOCK_4X2,
+#endif
+ // 4X4
+ BLOCK_4X4,
+ // 4X8, 8X4, 8X8
+ BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+ // 8X16, 16X8, 16X16
+ BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+ // 16X32, 32X16, 32X32
+ BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+ // 32X64, 64X32, 64X64
+ BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_64X128, BLOCK_128X64, BLOCK_128X128,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_HORZ
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ // 4X4
+ BLOCK_4X2,
+#else
+ // 4X4
+ BLOCK_INVALID,
+#endif
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_VERT
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ // 4X4
+ BLOCK_2X4,
+#else
+ // 4X4
+ BLOCK_INVALID,
+#endif
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_SPLIT
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+#endif
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
+#endif // CONFIG_EXT_PARTITION
+#if CONFIG_EXT_PARTITION_TYPES
+ }, { // PARTITION_HORZ_A
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+#endif
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_HORZ_B
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+#endif
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_VERT_A
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+#endif
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_VERT_B
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+#endif
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_EXT_PARTITION_TYPES
+ }
+};
+
+static const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ TX_2X2, TX_2X2, TX_2X2,
+#endif
+ // 4X4
+ TX_4X4,
+ // 4X8, 8X4, 8X8
+ TX_4X4, TX_4X4, TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X8, TX_8X8, TX_16X16,
+ // 16X32, 32X16, 32X32
+ TX_16X16, TX_16X16, TX_32X32,
+ // 32X64, 64X32,
+ TX_32X32, TX_32X32,
+#if CONFIG_TX64X64
+ // 64X64
+ TX_64X64,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_64X64, TX_64X64, TX_64X64,
+#endif // CONFIG_EXT_PARTITION
+#else
+ // 64X64
+ TX_32X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_32X32, TX_32X32, TX_32X32,
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_TX64X64
+};
+
+#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ TX_2X2, TX_2X2, TX_2X2,
+#endif // CONFIG_CB4X4
+ // 4X4
+ TX_4X4,
+ // 4X8, 8X4, 8X8
+ TX_4X8, TX_8X4, TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X16, TX_16X8, TX_16X16,
+ // 16X32, 32X16, 32X32
+ TX_16X32, TX_32X16, TX_32X32,
+ // 32X64, 64X32,
+ TX_32X32, TX_32X32,
+#if CONFIG_TX64X64
+ // 64X64
+ TX_64X64,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_64X64, TX_64X64, TX_64X64,
+#endif // CONFIG_EXT_PARTITION
+#else
+ // 64X64
+ TX_32X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_32X32, TX_32X32, TX_32X32,
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_TX64X64
+};
+#else
+#define max_txsize_rect_lookup max_txsize_lookup
+#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+
+#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+// Same as "max_txsize_lookup[bsize] - TX_8X8", except for rectangular
+// blocks, which may use a rectangular transform, in which case it is
+// "(max_txsize_lookup[bsize] + 1) - TX_8X8". Invalid for bsize < 8X8.
+static const int32_t intra_tx_size_cat_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ INT32_MIN, INT32_MIN, INT32_MIN,
+ // 4X4,
+ INT32_MIN,
+ // 4X8, 8X4, 8X8,
+ TX_8X8 - TX_8X8, TX_8X8 - TX_8X8, TX_8X8 - TX_8X8,
+#else
+ // 4X4
+ INT32_MIN,
+ // 4X8, 8X4, 8X8
+ INT32_MIN, INT32_MIN, TX_8X8 - TX_8X8,
+#endif // CONFIG_CB4X4
+ // 8X16, 16X8, 16X16
+ TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, TX_16X16 - TX_8X8,
+ // 16X32, 32X16, 32X32
+ TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8,
+ // 32X64, 64X32,
+ TX_32X32 - TX_8X8, TX_32X32 - TX_8X8,
+#if CONFIG_TX64X64
+ // 64X64
+ TX_64X64 - TX_8X8,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_64X64 - TX_8X8, TX_64X64 - TX_8X8, TX_64X64 - TX_8X8,
+#endif // CONFIG_EXT_PARTITION
+#else
+ // 64X64
+ TX_32X32 - TX_8X8,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8,
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_TX64X64
+};
+#else
+// Same as "max_txsize_lookup[bsize] - TX_8X8", invalid for bsize < 8X8
+static const int32_t intra_tx_size_cat_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2,
+ INT32_MIN, INT32_MIN, INT32_MIN,
+#endif
+ // 4X4
+ INT32_MIN,
+ // 4X8, 8X4, 8X8
+ INT32_MIN, INT32_MIN, TX_8X8 - TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X8 - TX_8X8, TX_8X8 - TX_8X8, TX_16X16 - TX_8X8,
+ // 16X32, 32X16, 32X32
+ TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, TX_32X32 - TX_8X8,
+ // 32X64, 64X32,
+ TX_32X32 - TX_8X8, TX_32X32 - TX_8X8,
+#if CONFIG_TX64X64
+ // 64X64
+ TX_64X64 - TX_8X8,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_64X64 - TX_8X8, TX_64X64 - TX_8X8, TX_64X64 - TX_8X8,
+#endif // CONFIG_EXT_PARTITION
+#else
+ // 64X64
+ TX_32X32 - TX_8X8,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8,
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_TX64X64
+};
+#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+
+#define inter_tx_size_cat_lookup intra_tx_size_cat_lookup
+
+/* clang-format on */
+
+static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
+ TX_4X4, // TX_4X4
+ TX_4X4, // TX_8X8
+ TX_8X8, // TX_16X16
+ TX_16X16, // TX_32X32
+#if CONFIG_TX64X64
+ TX_32X32, // TX_64X64
+#endif // CONFIG_TX64X64
+ TX_4X4, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_4X4, // TX_4X16
+ TX_4X4, // TX_16X4
+ TX_8X8, // TX_8X32
+ TX_8X8, // TX_32X8
+};
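+
+// Note: sub_tx_size_map gives the transform size used after one split of
+// the given transform, e.g. TX_32X32 -> TX_16X16 and TX_16X32 -> TX_16X16.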
+
+static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+#if CONFIG_TX64X64
+ TX_64X64, // TX_64X64
+#endif // CONFIG_TX64X64
+ TX_4X4, // TX_4X8
+ TX_8X8, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_16X16, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_32X32, // TX_32X16
+ TX_4X4, // TX_4X16
+ TX_16X16, // TX_16X4
+ TX_8X8, // TX_8X32
+ TX_32X32, // TX_32X8
+};
+
+static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+#if CONFIG_TX64X64
+ TX_64X64, // TX_64X64
+#endif // CONFIG_TX64X64
+ TX_8X8, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_16X16, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_32X32, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_16X16, // TX_4X16
+ TX_4X4, // TX_16X4
+ TX_32X32, // TX_8X32
+ TX_8X8, // TX_32X8
+};
+
+#if CONFIG_CB4X4
+#define TX_SIZE_W_MIN 2
+#else
+#define TX_SIZE_W_MIN 4
+#endif
+
+// Transform block width in pixels
+static const int tx_size_wide[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 2,
+#endif
+ 4, 8, 16, 32,
+#if CONFIG_TX64X64
+ 64,
+#endif // CONFIG_TX64X64
+ 4, 8, 8, 16, 16, 32, 4, 16, 8, 32
+};
+
+#if CONFIG_CB4X4
+#define TX_SIZE_H_MIN 2
+#else
+#define TX_SIZE_H_MIN 4
+#endif
+
+// Transform block height in pixels
+static const int tx_size_high[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 2,
+#endif
+ 4, 8, 16, 32,
+#if CONFIG_TX64X64
+ 64,
+#endif // CONFIG_TX64X64
+ 8, 4, 16, 8, 32, 16, 16, 4, 32, 8
+};
+
+// Transform block width in units of the minimum transform width
+// (TX_SIZE_W_MIN)
+static const int tx_size_wide_unit[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 1, 2, 4, 8, 16,
+#if CONFIG_TX64X64
+ 32,
+#endif // CONFIG_TX64X64
+ 2, 4, 4, 8, 8, 16, 2, 8, 4, 16
+#else // CONFIG_CB4X4
+ 1, 2, 4, 8,
+#if CONFIG_TX64X64
+ 16,
+#endif // CONFIG_TX64X64
+ 1, 2, 2, 4, 4, 8, 1, 4, 2, 8
+#endif // CONFIG_CB4X4
+};
+
+// Transform block height in units of the minimum transform height
+// (TX_SIZE_H_MIN)
+static const int tx_size_high_unit[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 1, 2, 4, 8, 16,
+#if CONFIG_TX64X64
+ 32,
+#endif // CONFIG_TX64X64
+ 4, 2, 8, 4, 16, 8, 8, 2, 16, 4
+#else // CONFIG_CB4X4
+ 1, 2, 4, 8,
+#if CONFIG_TX64X64
+ 16,
+#endif // CONFIG_TX64X64
+ 2, 1, 4, 2, 8, 4, 4, 1, 8, 2
+#endif // CONFIG_CB4X4
+};
+
+// Log2 of transform block width in pixels
+static const int tx_size_wide_log2[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 1,
+#endif
+ 2, 3, 4, 5,
+#if CONFIG_TX64X64
+ 6,
+#endif // CONFIG_TX64X64
+ 2, 3, 3, 4, 4, 5, 2, 4, 3, 5
+};
+
+// Log2 of transform block height in pixels
+static const int tx_size_high_log2[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 1,
+#endif
+ 2, 3, 4, 5,
+#if CONFIG_TX64X64
+ 6,
+#endif // CONFIG_TX64X64
+ 3, 2, 4, 3, 5, 4, 4, 2, 5, 3
+};
+
+static const int tx_size_2d[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 4,
+#endif
+ 16, 64, 256, 1024,
+#if CONFIG_TX64X64
+ 4096,
+#endif // CONFIG_TX64X64
+ 32, 32, 128, 128, 512, 512, 64, 64, 256, 256
+};
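+
+// Note: each entry is the pixel count tx_size_wide[t] * tx_size_high[t],
+// e.g. TX_8X32 -> 8 * 32 == 256.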
+
+static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ BLOCK_2X2, // TX_2X2
+#endif
+ BLOCK_4X4, // TX_4X4
+ BLOCK_8X8, // TX_8X8
+ BLOCK_16X16, // TX_16X16
+ BLOCK_32X32, // TX_32X32
+#if CONFIG_TX64X64
+ BLOCK_64X64, // TX_64X64
+#endif // CONFIG_TX64X64
+ BLOCK_4X8, // TX_4X8
+ BLOCK_8X4, // TX_8X4
+ BLOCK_8X16, // TX_8X16
+ BLOCK_16X8, // TX_16X8
+ BLOCK_16X32, // TX_16X32
+ BLOCK_32X16, // TX_32X16
+ BLOCK_INVALID, // TX_4X16
+ BLOCK_INVALID, // TX_16X4
+ BLOCK_INVALID, // TX_8X32
+ BLOCK_INVALID, // TX_32X8
+};
+
+static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+#if CONFIG_TX64X64
+ TX_64X64, // TX_64X64
+#endif // CONFIG_TX64X64
+ TX_4X4, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_4X4, // TX_4X16
+ TX_4X4, // TX_16X4
+ TX_8X8, // TX_8X32
+ TX_8X8, // TX_32X8
+};
+
+static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+#if CONFIG_TX64X64
+ TX_64X64, // TX_64X64
+#endif // CONFIG_TX64X64
+ TX_8X8, // TX_4X8
+ TX_8X8, // TX_8X4
+ TX_16X16, // TX_8X16
+ TX_16X16, // TX_16X8
+ TX_32X32, // TX_16X32
+ TX_32X32, // TX_32X16
+ TX_16X16, // TX_4X16
+ TX_16X16, // TX_16X4
+ TX_32X32, // TX_8X32
+ TX_32X32, // TX_32X8
+};
+
+/* clang-format off */
+static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
+ TX_4X4, // ONLY_4X4
+ TX_8X8, // ALLOW_8X8
+ TX_16X16, // ALLOW_16X16
+ TX_32X32, // ALLOW_32X32
+#if CONFIG_TX64X64
+ TX_64X64, // ALLOW_64X64
+ TX_64X64, // TX_MODE_SELECT
+#else
+ TX_32X32, // TX_MODE_SELECT
+#endif // CONFIG_TX64X64
+};
+/* clang-format on */
+
+static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
+// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+#if CONFIG_CB4X4
+ { { BLOCK_2X2, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } },
+ { { BLOCK_2X4, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } },
+ { { BLOCK_4X2, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } },
+ { { BLOCK_4X4, BLOCK_4X2 }, { BLOCK_2X4, BLOCK_2X2 } },
+ { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_2X4 } },
+ { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X2 } },
+#else
+ { { BLOCK_4X4, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } },
+ { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_INVALID } },
+ { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_INVALID } },
+#endif
+ { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
+ { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } },
+ { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
+ { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } },
+ { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } },
+ { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
+ { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } },
+ { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } },
+ { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
+#if CONFIG_EXT_PARTITION
+ { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
+ { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
+ { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
+#endif // CONFIG_EXT_PARTITION
+};
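+
+// Note: ss_size_lookup[bsize][ss_x][ss_y] is the chroma block size for the
+// given luma block size and subsampling; e.g. BLOCK_8X8 with 4:2:0
+// subsampling (ss_x == 1, ss_y == 1) maps to BLOCK_4X4.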
+
+static const TX_SIZE uv_txsize_lookup[BLOCK_SIZES][TX_SIZES_ALL][2][2] = {
+// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+#if CONFIG_CB4X4
+ {
+ // BLOCK_2X2
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+#if CONFIG_TX64X64
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+#endif // CONFIG_TX64X64
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+ // BLOCK_2X4
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+#if CONFIG_TX64X64
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+#endif // CONFIG_TX64X64
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+ // BLOCK_4X2
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+#if CONFIG_TX64X64
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+#endif // CONFIG_TX64X64
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+#endif
+ {
+// BLOCK_4X4
+#if CONFIG_CB4X4
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_4X4, TX_2X2 }, { TX_2X2, TX_2X2 } },
+#else
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif // CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#if CONFIG_TX64X64
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_4X8
+#if CONFIG_CB4X4
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_4X4, TX_2X2 }, { TX_2X2, TX_2X2 } },
+#else
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#if CONFIG_TX64X64
+ { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif // CONFIG_TX64X64
+#if CONFIG_CB4X4
+ { { TX_4X8, TX_4X4 }, { TX_2X2, TX_2X2 } }, // used
+#else
+ { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, // used
+#endif
+ { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_8X4
+#if CONFIG_CB4X4
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+ { { TX_4X4, TX_2X2 }, { TX_2X2, TX_2X2 } },
+#else
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#if CONFIG_TX64X64
+ { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif // CONFIG_TX64X64
+ { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#if CONFIG_CB4X4
+ { { TX_8X4, TX_2X2 }, { TX_4X4, TX_2X2 } }, // used
+#else
+ { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, // used
+#endif
+ { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_8X8
+#if CONFIG_CB4X4
+ { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#if CONFIG_TX64X64
+ { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } },
+ { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
+ { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
+ { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
+ { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_8X16
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
+#if CONFIG_TX64X64
+ { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } }, // used
+ { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_16X8
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
+ { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
+ { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
+#if CONFIG_TX64X64
+ { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } },
+ { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } }, // used
+ { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } },
+ { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_16X16
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
+#if CONFIG_TX64X64
+ { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
+ { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X8 }, { TX_8X16, TX_8X8 } },
+ { { TX_16X16, TX_16X8 }, { TX_8X16, TX_8X8 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_16X32
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
+#if CONFIG_TX64X64
+ { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X32, TX_16X16 }, { TX_8X16, TX_8X16 } }, // used
+ { { TX_16X32, TX_16X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_32X16
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
+ { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
+#if CONFIG_TX64X64
+ { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_32X16, TX_16X8 }, { TX_16X16, TX_16X8 } },
+ { { TX_32X16, TX_16X8 }, { TX_16X16, TX_16X8 } }, // used
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_32X32
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } },
+#if CONFIG_TX64X64
+ { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X16 }, { TX_16X32, TX_16X16 } },
+ { { TX_32X16, TX_32X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_32X64
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } },
+#if CONFIG_TX64X64
+ { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X32 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X16, TX_32X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_64X32
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
+#if CONFIG_TX64X64
+ { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X16 }, { TX_16X32, TX_16X16 } },
+ { { TX_32X16, TX_16X16 }, { TX_32X16, TX_16X16 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_64X64
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
+#if CONFIG_TX64X64
+ { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
+ { { TX_32X16, TX_32X16 }, { TX_32X16, TX_16X16 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+#if CONFIG_EXT_PARTITION
+ {
+// BLOCK_64X128
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
+#if CONFIG_TX64X64
+ { { TX_64X64, TX_64X64 }, { TX_32X32, TX_32X32 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
+ { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_128X64
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
+#if CONFIG_TX64X64
+ { { TX_64X64, TX_32X32 }, { TX_64X64, TX_32X32 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
+ { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_128X128
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
+#if CONFIG_TX64X64
+ { { TX_64X64, TX_64X64 }, { TX_64X64, TX_64X64 } },
+#endif // CONFIG_TX64X64
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
+ { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+#endif // CONFIG_EXT_PARTITION
+};
+
+// Generates a 4-bit field in which each bit set to 1 represents a blocksize
+// partition: 1111 means we split 64x64, 32x32, 16x16 and 8x8; 1000 means we
+// just split the 64x64 to 32x32.
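+// For example, with CONFIG_EXT_PARTITION the fields widen to 5 bits, so the
+// 16X32 entry { 28, 24 } below encodes above = 0b11100 and left = 0b11000.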
+/* clang-format off */
+static const struct {
+ PARTITION_CONTEXT above;
+ PARTITION_CONTEXT left;
+} partition_context_lookup[BLOCK_SIZES] = {
+#if CONFIG_EXT_PARTITION
+#if CONFIG_CB4X4
+ { 31, 31 }, // 2X2 - {0b11111, 0b11111}
+ { 31, 31 }, // 2X4 - {0b11111, 0b11111}
+ { 31, 31 }, // 4X2 - {0b11111, 0b11111}
+#endif
+ { 31, 31 }, // 4X4 - {0b11111, 0b11111}
+ { 31, 30 }, // 4X8 - {0b11111, 0b11110}
+ { 30, 31 }, // 8X4 - {0b11110, 0b11111}
+ { 30, 30 }, // 8X8 - {0b11110, 0b11110}
+ { 30, 28 }, // 8X16 - {0b11110, 0b11100}
+ { 28, 30 }, // 16X8 - {0b11100, 0b11110}
+ { 28, 28 }, // 16X16 - {0b11100, 0b11100}
+ { 28, 24 }, // 16X32 - {0b11100, 0b11000}
+ { 24, 28 }, // 32X16 - {0b11000, 0b11100}
+ { 24, 24 }, // 32X32 - {0b11000, 0b11000}
+ { 24, 16 }, // 32X64 - {0b11000, 0b10000}
+ { 16, 24 }, // 64X32 - {0b10000, 0b11000}
+ { 16, 16 }, // 64X64 - {0b10000, 0b10000}
+ { 16, 0 }, // 64X128- {0b10000, 0b00000}
+ { 0, 16 }, // 128X64- {0b00000, 0b10000}
+ { 0, 0 }, // 128X128-{0b00000, 0b00000}
+#else
+#if CONFIG_CB4X4
+ { 15, 15 }, // 2X2 - {0b1111, 0b1111}
+ { 15, 15 }, // 2X4 - {0b1111, 0b1111}
+ { 15, 15 }, // 4X2 - {0b1111, 0b1111}
+#endif
+ { 15, 15 }, // 4X4 - {0b1111, 0b1111}
+ { 15, 14 }, // 4X8 - {0b1111, 0b1110}
+ { 14, 15 }, // 8X4 - {0b1110, 0b1111}
+ { 14, 14 }, // 8X8 - {0b1110, 0b1110}
+ { 14, 12 }, // 8X16 - {0b1110, 0b1100}
+ { 12, 14 }, // 16X8 - {0b1100, 0b1110}
+ { 12, 12 }, // 16X16 - {0b1100, 0b1100}
+ { 12, 8 }, // 16X32 - {0b1100, 0b1000}
+ { 8, 12 }, // 32X16 - {0b1000, 0b1100}
+ { 8, 8 }, // 32X32 - {0b1000, 0b1000}
+ { 8, 0 }, // 32X64 - {0b1000, 0b0000}
+ { 0, 8 }, // 64X32 - {0b0000, 0b1000}
+ { 0, 0 }, // 64X64 - {0b0000, 0b0000}
+#endif // CONFIG_EXT_PARTITION
+};
+/* clang-format on */
+
+#if CONFIG_SUPERTX
+static const TX_SIZE uvsupertx_size_lookup[TX_SIZES][2][2] = {
+// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } },
+#if CONFIG_TX64X64
+ { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } },
+#endif // CONFIG_TX64X64
+};
+
+#if CONFIG_EXT_PARTITION_TYPES
+static const int partition_supertx_context_lookup[EXT_PARTITION_TYPES] = {
+ -1, 0, 0, 1, 0, 0, 0, 0
+};
+
+#else
+static const int partition_supertx_context_lookup[PARTITION_TYPES] = { -1, 0, 0,
+ 1 };
+#endif // CONFIG_EXT_PARTITION_TYPES
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_ADAPT_SCAN
+#define EOB_THRESHOLD_NUM 2
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_COMMON_DATA_H_
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
new file mode 100644
index 000000000..eab6fe7a3
--- /dev/null
+++ b/third_party/aom/av1/common/convolve.c
@@ -0,0 +1,775 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+#define MAX_BLOCK_WIDTH (MAX_SB_SIZE)
+#define MAX_BLOCK_HEIGHT (MAX_SB_SIZE)
+#define MAX_STEP (32)
+
+void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_x_q4, int x_step_q4,
+ ConvolveParams *conv_params) {
+ int x, y;
+ int filter_size = filter_params.taps;
+ assert(conv_params->round == CONVOLVE_OPT_ROUND);
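+  // Step src back so the filter window starts (taps / 2 - 1) pixels to the
+  // left of the output position.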
+ src -= filter_size / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = subpel_x_q4;
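+    // x_q4 tracks the source position with SUBPEL_BITS (4) fractional bits,
+    // i.e. in 1/16-pel units: the integer part indexes src and the
+    // fractional part selects the filter kernel.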
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, x_q4 & SUBPEL_MASK);
+ int k, sum = 0;
+ for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
+
+ sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
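+      // A nonzero ref marks the second prediction of a compound block, which
+      // is averaged with the first prediction already stored in dst.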
+ if (conv_params->ref)
+ dst[x] = ROUND_POWER_OF_TWO(dst[x] + sum, 1);
+ else
+ dst[x] = sum;
+
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_y_q4, int y_step_q4,
+ ConvolveParams *conv_params) {
+ int x, y;
+ int filter_size = filter_params.taps;
+ assert(conv_params->round == CONVOLVE_OPT_ROUND);
+ src -= src_stride * (filter_size / 2 - 1);
+ for (x = 0; x < w; ++x) {
+ int y_q4 = subpel_y_q4;
+ for (y = 0; y < h; ++y) {
+ const uint8_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, y_q4 & SUBPEL_MASK);
+ int k, sum = 0;
+ for (k = 0; k < filter_size; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+
+ sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ if (conv_params->ref)
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + sum, 1);
+ else
+ dst[y * dst_stride] = sum;
+
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve_copy(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ ConvolveParams *conv_params) {
+ assert(conv_params->round == CONVOLVE_OPT_ROUND);
+ if (conv_params->ref == 0) {
+ int r;
+ for (r = 0; r < h; ++r) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else {
+ int r, c;
+ for (r = 0; r < h; ++r) {
+ for (c = 0; c < w; ++c) {
+ dst[c] = clip_pixel(ROUND_POWER_OF_TWO(dst[c] + src[c], 1));
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+}
+
+void av1_convolve_horiz_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_x_q4, int x_step_q4,
+ ConvolveParams *conv_params) {
+ assert(conv_params->round == CONVOLVE_OPT_ROUND);
+ if (filter_params.taps == SUBPEL_TAPS) {
+ const int16_t *filter_x =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
+ if (conv_params->ref == 0)
+ aom_convolve8_horiz(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+ NULL, -1, w, h);
+ else
+ aom_convolve8_avg_horiz(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, NULL, -1, w, h);
+ } else {
+ av1_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_x_q4, x_step_q4, conv_params);
+ }
+}
+
+void av1_convolve_horiz_facade_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_x_q4, int x_step_q4,
+ ConvolveParams *conv_params) {
+ assert(conv_params->round == CONVOLVE_OPT_ROUND);
+ if (filter_params.taps == SUBPEL_TAPS) {
+ const int16_t *filter_x =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
+ if (conv_params->ref == 0)
+ aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, NULL, -1, w, h);
+ else
+ aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, NULL, -1, w, h);
+ } else {
+ av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_x_q4, x_step_q4, conv_params);
+ }
+}
+
+void av1_convolve_vert_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_y_q4, int y_step_q4,
+ ConvolveParams *conv_params) {
+ assert(conv_params->round == CONVOLVE_OPT_ROUND);
+ if (filter_params.taps == SUBPEL_TAPS) {
+ const int16_t *filter_y =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
+ if (conv_params->ref == 0) {
+ aom_convolve8_vert(src, src_stride, dst, dst_stride, NULL, -1, filter_y,
+ y_step_q4, w, h);
+ } else {
+ aom_convolve8_avg_vert(src, src_stride, dst, dst_stride, NULL, -1,
+ filter_y, y_step_q4, w, h);
+ }
+ } else {
+ av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_y_q4, y_step_q4, conv_params);
+ }
+}
+
+void av1_convolve_vert_facade_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_y_q4, int y_step_q4,
+ ConvolveParams *conv_params) {
+ assert(conv_params->round == CONVOLVE_OPT_ROUND);
+ if (filter_params.taps == SUBPEL_TAPS) {
+ const int16_t *filter_y =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
+ if (conv_params->ref == 0) {
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, NULL, -1, filter_y,
+ y_step_q4, w, h);
+ } else {
+ aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, NULL, -1,
+ filter_y, y_step_q4, w, h);
+ }
+ } else {
+ av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_y_q4, y_step_q4, conv_params);
+ }
+}
+
+#if CONFIG_CONVOLVE_ROUND
+void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h, int bits) {
+ int r, c;
+ for (r = 0; r < h; ++r) {
+ for (c = 0; c < w; ++c) {
+ dst[r * dst_stride + c] =
+ clip_pixel(ROUND_POWER_OF_TWO_SIGNED(src[r * src_stride + c], bits));
+ }
+ }
+}
+
+void av1_convolve_2d(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
+ int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params) {
+ int x, y, k;
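+  // im_block buffers the horizontally filtered rows; the extra
+  // filter_params_y->taps - 1 rows provide context for the vertical pass.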
+ CONV_BUF_TYPE im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ (void)conv_params;
+ // horizontal filter
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (y = 0; y < im_h; ++y) {
+ for (x = 0; x < w; ++x) {
+ CONV_BUF_TYPE sum = 0;
+ for (k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+#if CONFIG_COMPOUND_ROUND
+ im_block[y * im_stride + x] =
+ clip_pixel(ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0));
+#else
+ im_block[y * im_stride + x] =
+ ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0);
+#endif
+ }
+ }
+
+ // vertical filter
+ CONV_BUF_TYPE *src_vert = im_block + fo_vert * im_stride;
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ CONV_BUF_TYPE sum = 0;
+ for (k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ dst[y * dst_stride + x] +=
+ ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_1);
+ }
+ }
+}
+
+static INLINE void transpose_uint8(uint8_t *dst, int dst_stride,
+ const uint8_t *src, int src_stride, int w,
+ int h) {
+ int r, c;
+ for (r = 0; r < h; ++r)
+ for (c = 0; c < w; ++c)
+ dst[c * (dst_stride) + r] = src[r * (src_stride) + c];
+}
+
+static INLINE void transpose_int32(int32_t *dst, int dst_stride,
+ const int32_t *src, int src_stride, int w,
+ int h) {
+ int r, c;
+ for (r = 0; r < h; ++r)
+ for (c = 0; c < w; ++c)
+ dst[c * (dst_stride) + r] = src[r * (src_stride) + c];
+}
+
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilter *interp_filter,
+ const int subpel_x_q4, int x_step_q4,
+ const int subpel_y_q4, int y_step_q4,
+ ConvolveParams *conv_params) {
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)dst;
+ (void)dst_stride;
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams filter_params_x =
+ av1_get_interp_filter_params(interp_filter[1 + 2 * conv_params->ref]);
+ InterpFilterParams filter_params_y =
+ av1_get_interp_filter_params(interp_filter[0 + 2 * conv_params->ref]);
+
+ if (filter_params_x.interp_filter == MULTITAP_SHARP &&
+ filter_params_y.interp_filter == MULTITAP_SHARP) {
+    // Avoid using the 12-tap filter in both directions; this reduces the
+    // hardware implementation cost.
+ filter_params_y = av1_get_interp_filter_params(EIGHTTAP_SHARP);
+ }
+#else
+ InterpFilterParams filter_params_x =
+ av1_get_interp_filter_params(*interp_filter);
+ InterpFilterParams filter_params_y =
+ av1_get_interp_filter_params(*interp_filter);
+#endif
+
+ if (filter_params_y.taps < filter_params_x.taps) {
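+    // Transpose src and dst so that the shorter vertical filter runs as the
+    // first (horizontal) pass of av1_convolve_2d.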
+ uint8_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
+ (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
+ int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
+ CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
+ int tr_dst_stride = MAX_SB_SIZE;
+ int fo_vert = filter_params_y.taps / 2 - 1;
+ int fo_horiz = filter_params_x.taps / 2 - 1;
+
+ transpose_uint8(tr_src, tr_src_stride,
+ src - fo_vert * src_stride - fo_horiz, src_stride,
+ w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
+ transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
+ conv_params->dst_stride, w, h);
+
+ // horizontal and vertical parameters are swapped because of the transpose
+ av1_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride,
+ tr_dst, tr_dst_stride, h, w, &filter_params_y,
+ &filter_params_x, subpel_y_q4, subpel_x_q4, conv_params);
+ transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
+ tr_dst_stride, h, w);
+ } else {
+ av1_convolve_2d(src, src_stride, conv_params->dst, conv_params->dst_stride,
+ w, h, &filter_params_x, &filter_params_y, subpel_x_q4,
+ subpel_y_q4, conv_params);
+ }
+}
+
+#endif // CONFIG_CONVOLVE_ROUND
+
+typedef void (*ConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_q4, int step_q4,
+ ConvolveParams *conv_params);
+
+static void convolve_helper(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ const int subpel_x_q4, int x_step_q4,
+ const int subpel_y_q4, int y_step_q4,
+ ConvolveParams *conv_params,
+ ConvolveFunc convolve_horiz,
+ ConvolveFunc convolve_vert) {
+ int ignore_horiz = x_step_q4 == 16 && subpel_x_q4 == 0;
+ int ignore_vert = y_step_q4 == 16 && subpel_y_q4 == 0;
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams filter_params_x =
+ av1_get_interp_filter_params(interp_filter[1 + 2 * conv_params->ref]);
+ InterpFilterParams filter_params_y =
+ av1_get_interp_filter_params(interp_filter[0 + 2 * conv_params->ref]);
+ InterpFilterParams filter_params;
+#else
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
+ assert(conv_params->round == CONVOLVE_OPT_ROUND);
+
+ assert(w <= MAX_BLOCK_WIDTH);
+ assert(h <= MAX_BLOCK_HEIGHT);
+ assert(y_step_q4 <= MAX_STEP);
+ assert(x_step_q4 <= MAX_STEP);
+
+ if (ignore_horiz && ignore_vert) {
+ convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params);
+ } else if (ignore_vert) {
+#if CONFIG_DUAL_FILTER
+ filter_params = filter_params_x;
+#endif
+ assert(filter_params.taps <= MAX_FILTER_TAP);
+ convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_x_q4, x_step_q4, conv_params);
+ } else if (ignore_horiz) {
+#if CONFIG_DUAL_FILTER
+ filter_params = filter_params_y;
+#endif
+ assert(filter_params.taps <= MAX_FILTER_TAP);
+ convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_y_q4, y_step_q4, conv_params);
+ } else {
+    // temp's size is set to a 256-aligned value to facilitate SIMD
+    // implementation. The value is greater than (maximum possible
+    // intermediate height or width) * MAX_SB_SIZE.
+ DECLARE_ALIGNED(16, uint8_t,
+ temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
+ int filter_size;
+#if CONFIG_DUAL_FILTER
+ if (interp_filter[0 + 2 * conv_params->ref] == MULTITAP_SHARP &&
+ interp_filter[1 + 2 * conv_params->ref] == MULTITAP_SHARP) {
+      // Avoid using the 12-tap filter in both directions; this reduces the
+      // hardware implementation cost.
+ filter_params_y = av1_get_interp_filter_params(EIGHTTAP_SHARP);
+ }
+
+    // We apply the filter with fewer taps first to reduce the hardware
+    // implementation complexity.
+ if (filter_params_y.taps < filter_params_x.taps) {
+ int intermediate_width;
+ int temp_stride = max_intermediate_size;
+ ConvolveParams temp_conv_params;
+ temp_conv_params.ref = 0;
+ temp_conv_params.round = CONVOLVE_OPT_ROUND;
+ filter_params = filter_params_y;
+ filter_size = filter_params_x.taps;
+ intermediate_width =
+ (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size;
+ assert(intermediate_width <= max_intermediate_size);
+
+ assert(filter_params.taps <= MAX_FILTER_TAP);
+
+ convolve_vert(src - (filter_size / 2 - 1), src_stride, temp, temp_stride,
+ intermediate_width, h, filter_params, subpel_y_q4,
+ y_step_q4, &temp_conv_params);
+
+ filter_params = filter_params_x;
+ assert(filter_params.taps <= MAX_FILTER_TAP);
+ convolve_horiz(temp + (filter_size / 2 - 1), temp_stride, dst, dst_stride,
+ w, h, filter_params, subpel_x_q4, x_step_q4, conv_params);
+ } else
+#endif // CONFIG_DUAL_FILTER
+ {
+ int intermediate_height;
+ int temp_stride = MAX_SB_SIZE;
+ ConvolveParams temp_conv_params;
+ temp_conv_params.ref = 0;
+ temp_conv_params.round = CONVOLVE_OPT_ROUND;
+#if CONFIG_DUAL_FILTER
+ filter_params = filter_params_x;
+ filter_size = filter_params_y.taps;
+#else
+ filter_size = filter_params.taps;
+#endif
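+      // Source rows consumed by the vertical pass: the last output row reads
+      // from row ((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS, plus
+      // filter_size taps of context.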
+ intermediate_height =
+ (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
+ assert(intermediate_height <= max_intermediate_size);
+ (void)max_intermediate_size;
+
+ assert(filter_params.taps <= MAX_FILTER_TAP);
+
+ convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, temp,
+ temp_stride, w, intermediate_height, filter_params,
+ subpel_x_q4, x_step_q4, &temp_conv_params);
+
+#if CONFIG_DUAL_FILTER
+ filter_params = filter_params_y;
+#endif
+ assert(filter_params.taps <= MAX_FILTER_TAP);
+
+ convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
+ dst, dst_stride, w, h, filter_params, subpel_y_q4,
+ y_step_q4, conv_params);
+ }
+ }
+}
+
+void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
+ int y_step_q4, ConvolveParams *conv_params) {
+ convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filter,
+ subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params,
+ av1_convolve_horiz_facade, av1_convolve_vert_facade);
+}
+
+void av1_convolve_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
+ int y_step_q4, ConvolveParams *conv_params) {
+ convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filter,
+ subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params,
+ av1_convolve_horiz_facade_c, av1_convolve_vert_facade_c);
+}
+
+void av1_lowbd_convolve_init_c(void) {
+ // A placeholder for SIMD initialization
+ return;
+}
+
+void av1_highbd_convolve_init_c(void) {
+ // A placeholder for SIMD initialization
+ return;
+}
+
+void av1_convolve_init(AV1_COMMON *cm) {
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ av1_highbd_convolve_init();
+ else
+ av1_lowbd_convolve_init();
+#else
+ (void)cm;
+ av1_lowbd_convolve_init();
+#endif
+ return;
+}
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_x_q4, int x_step_q4, int avg,
+ int bd) {
+ int x, y;
+ int filter_size = filter_params.taps;
+ src -= filter_size / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = subpel_x_q4;
+ for (x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, x_q4 & SUBPEL_MASK);
+ int k, sum = 0;
+ for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
+ if (avg)
+ dst[x] = ROUND_POWER_OF_TWO(
+ dst[x] +
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+ 1);
+ else
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_y_q4, int y_step_q4, int avg,
+ int bd) {
+ int x, y;
+ int filter_size = filter_params.taps;
+ src -= src_stride * (filter_size / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = subpel_y_q4;
+ for (y = 0; y < h; ++y) {
+ const uint16_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, y_q4 & SUBPEL_MASK);
+ int k, sum = 0;
+ for (k = 0; k < filter_size; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ if (avg) {
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+ dst[y * dst_stride] +
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+ 1);
+ } else {
+ dst[y * dst_stride] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void highbd_convolve_copy(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ int avg, int bd) {
+ if (avg == 0) {
+ int r;
+ for (r = 0; r < h; ++r) {
+ memcpy(dst, src, w * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else {
+ int r, c;
+ for (r = 0; r < h; ++r) {
+ for (c = 0; c < w; ++c) {
+ dst[c] = clip_pixel_highbd(ROUND_POWER_OF_TWO(dst[c] + src[c], 1), bd);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+}
+
+void av1_highbd_convolve_horiz_facade(const uint8_t *src8, int src_stride,
+ uint8_t *dst8, int dst_stride, int w,
+ int h,
+ const InterpFilterParams filter_params,
+ const int subpel_x_q4, int x_step_q4,
+ int avg, int bd) {
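+  // In high-bitdepth builds the uint8_t pointers actually reference uint16_t
+  // samples; CONVERT_TO_SHORTPTR recovers the real pointers.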
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ if (filter_params.taps == SUBPEL_TAPS) {
+ const int16_t *filter_x =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
+ if (avg == 0)
+ aom_highbd_convolve8_horiz(src8, src_stride, dst8, dst_stride, filter_x,
+ x_step_q4, NULL, -1, w, h, bd);
+ else
+ aom_highbd_convolve8_avg_horiz(src8, src_stride, dst8, dst_stride,
+ filter_x, x_step_q4, NULL, -1, w, h, bd);
+ } else {
+ av1_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_x_q4, x_step_q4, avg, bd);
+ }
+}
+
+void av1_highbd_convolve_vert_facade(const uint8_t *src8, int src_stride,
+ uint8_t *dst8, int dst_stride, int w,
+ int h,
+ const InterpFilterParams filter_params,
+ const int subpel_y_q4, int y_step_q4,
+ int avg, int bd) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ if (filter_params.taps == SUBPEL_TAPS) {
+ const int16_t *filter_y =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
+ if (avg == 0) {
+ aom_highbd_convolve8_vert(src8, src_stride, dst8, dst_stride, NULL, -1,
+ filter_y, y_step_q4, w, h, bd);
+ } else {
+ aom_highbd_convolve8_avg_vert(src8, src_stride, dst8, dst_stride, NULL,
+ -1, filter_y, y_step_q4, w, h, bd);
+ }
+ } else {
+ av1_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_y_q4, y_step_q4, avg, bd);
+ }
+}
+
+void av1_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
+ int dst_stride, int w, int h,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ const int subpel_x_q4, int x_step_q4,
+ const int subpel_y_q4, int y_step_q4, int ref_idx,
+ int bd) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ int ignore_horiz = x_step_q4 == 16 && subpel_x_q4 == 0;
+ int ignore_vert = y_step_q4 == 16 && subpel_y_q4 == 0;
+
+ assert(w <= MAX_BLOCK_WIDTH);
+ assert(h <= MAX_BLOCK_HEIGHT);
+ assert(y_step_q4 <= MAX_STEP);
+ assert(x_step_q4 <= MAX_STEP);
+
+ if (ignore_horiz && ignore_vert) {
+ highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx, bd);
+ } else if (ignore_vert) {
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+#else
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
+ av1_highbd_convolve_horiz_facade(src8, src_stride, dst8, dst_stride, w, h,
+ filter_params, subpel_x_q4, x_step_q4,
+ ref_idx, bd);
+ } else if (ignore_horiz) {
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+#else
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
+ av1_highbd_convolve_vert_facade(src8, src_stride, dst8, dst_stride, w, h,
+ filter_params, subpel_y_q4, y_step_q4,
+ ref_idx, bd);
+ } else {
+    // temp's size is set to a 256-aligned value to facilitate SIMD
+    // implementation. The value is greater than (maximum possible
+    // intermediate height or width) * MAX_SB_SIZE.
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ uint8_t *temp8 = CONVERT_TO_BYTEPTR(temp);
+ int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
+ int filter_size;
+ InterpFilterParams filter_params;
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams filter_params_x =
+ av1_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+ InterpFilterParams filter_params_y =
+ av1_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+ if (interp_filter[0 + 2 * ref_idx] == MULTITAP_SHARP &&
+ interp_filter[1 + 2 * ref_idx] == MULTITAP_SHARP) {
+      // Avoid using the 12-tap filter in both directions; this reduces the
+      // hardware implementation cost.
+ filter_params_y = av1_get_interp_filter_params(EIGHTTAP_SHARP);
+ }
+#endif
+
+#if CONFIG_DUAL_FILTER
+ if (filter_params_y.taps < filter_params_x.taps) {
+ int intermediate_width;
+ int temp_stride = max_intermediate_size;
+ filter_params = filter_params_y;
+ filter_size = filter_params_x.taps;
+ intermediate_width =
+ (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size;
+ assert(intermediate_width <= max_intermediate_size);
+
+ assert(filter_params.taps <= MAX_FILTER_TAP);
+
+ av1_highbd_convolve_vert_facade(
+ src8 - (filter_size / 2 - 1), src_stride, temp8, temp_stride,
+ intermediate_width, h, filter_params, subpel_y_q4, y_step_q4, 0, bd);
+
+ filter_params = filter_params_x;
+ assert(filter_params.taps <= MAX_FILTER_TAP);
+
+ av1_highbd_convolve_horiz_facade(
+ temp8 + (filter_size / 2 - 1), temp_stride, dst8, dst_stride, w, h,
+ filter_params, subpel_x_q4, x_step_q4, ref_idx, bd);
+ } else
+#endif // CONFIG_DUAL_FILTER
+ {
+ int intermediate_height;
+ int temp_stride = MAX_SB_SIZE;
+#if CONFIG_DUAL_FILTER
+ filter_params = filter_params_x;
+ filter_size = filter_params_y.taps;
+#else
+ filter_params = av1_get_interp_filter_params(interp_filter);
+ filter_size = filter_params.taps;
+#endif
+ intermediate_height =
+ (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
+ assert(intermediate_height <= max_intermediate_size);
+ (void)max_intermediate_size;
+
+ av1_highbd_convolve_horiz_facade(
+ src8 - src_stride * (filter_size / 2 - 1), src_stride, temp8,
+ temp_stride, w, intermediate_height, filter_params, subpel_x_q4,
+ x_step_q4, 0, bd);
+
+#if CONFIG_DUAL_FILTER
+ filter_params = filter_params_y;
+#endif
+ filter_size = filter_params.taps;
+ assert(filter_params.taps <= MAX_FILTER_TAP);
+
+ av1_highbd_convolve_vert_facade(
+ temp8 + temp_stride * (filter_size / 2 - 1), temp_stride, dst8,
+ dst_stride, w, h, filter_params, subpel_y_q4, y_step_q4, ref_idx, bd);
+ }
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
new file mode 100644
index 000000000..4a4dd8cdb
--- /dev/null
+++ b/third_party/aom/av1/common/convolve.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_AV1_CONVOLVE_H_
+#define AV1_COMMON_AV1_CONVOLVE_H_
+#include "av1/common/filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum CONVOLVE_OPT {
+  // indicates whether the result in the dst buffer has been rounded by
+  // FILTER_BITS or not
+ CONVOLVE_OPT_ROUND,
+ CONVOLVE_OPT_NO_ROUND,
+} CONVOLVE_OPT;
+
+typedef int32_t CONV_BUF_TYPE;
+
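+// State shared by the convolve paths: ref selects averaging with the
+// destination for the second prediction of a compound block; dst, dst_stride,
+// round_0 and round_1 describe the 32-bit intermediate buffer and its two
+// rounding shifts used by the CONVOLVE_OPT_NO_ROUND path.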
+typedef struct ConvolveParams {
+ int ref;
+ CONVOLVE_OPT round;
+ CONV_BUF_TYPE *dst;
+ int dst_stride;
+ int round_0;
+ int round_1;
+ int plane;
+} ConvolveParams;
+
+static INLINE ConvolveParams get_conv_params(int ref, int plane) {
+ ConvolveParams conv_params;
+ conv_params.ref = ref;
+ conv_params.round = CONVOLVE_OPT_ROUND;
+ conv_params.plane = plane;
+ return conv_params;
+}
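+
+// A minimal usage sketch (the surrounding buffer setup is hypothetical):
+//   ConvolveParams conv_params = get_conv_params(0 /* ref */, 0 /* plane */);
+//   av1_convolve(src, src_stride, dst, dst_stride, w, h, interp_filter,
+//                subpel_x, 16, subpel_y, 16, &conv_params);
+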
+struct AV1Common;
+void av1_convolve_init(struct AV1Common *cm);
+#if CONFIG_CONVOLVE_ROUND
+void av1_convolve_2d(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
+ int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilter *interp_filter,
+ const int subpel_x_q4, int x_step_q4,
+ const int subpel_y_q4, int y_step_q4,
+ ConvolveParams *conv_params);
+
+static INLINE ConvolveParams get_conv_params_no_round(int ref, int plane,
+ int32_t *dst,
+ int dst_stride) {
+ ConvolveParams conv_params;
+ conv_params.ref = ref;
+ conv_params.round = CONVOLVE_OPT_NO_ROUND;
+#if CONFIG_COMPOUND_ROUND
+ conv_params.round_0 = FILTER_BITS;
+#else
+ conv_params.round_0 = 5;
+#endif
+ conv_params.round_1 = 0;
+ conv_params.dst = dst;
+ conv_params.dst_stride = dst_stride;
+ conv_params.plane = plane;
+ return conv_params;
+}
+
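+// In the CONVOLVE_OPT_NO_ROUND path the two filter passes keep extra
+// precision: round_0 is applied after the first (horizontal) pass, round_1
+// after the second, and av1_convolve_rounding() below shifts the 32-bit
+// result back to pixels.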
+void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h, int bits);
+#endif // CONFIG_CONVOLVE_ROUND
+
+void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ const int subpel_x, int xstep, const int subpel_y, int ystep,
+ ConvolveParams *conv_params);
+
+void av1_convolve_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ const int subpel_x, int xstep, const int subpel_y,
+ int ystep, ConvolveParams *conv_params);
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ const int subpel_x, int xstep, const int subpel_y,
+ int ystep, int avg, int bd);
+#endif // CONFIG_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_AV1_CONVOLVE_H_
diff --git a/third_party/aom/av1/common/debugmodes.c b/third_party/aom/av1/common/debugmodes.c
new file mode 100644
index 000000000..d7b31c1e4
--- /dev/null
+++ b/third_party/aom/av1/common/debugmodes.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+
+static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
+ fprintf(f, "%s", str);
+ fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame,
+ cm->show_frame, cm->base_qindex);
+}
+/* Dereferences the mbmi structure inside each MODE_INFO unit and uses the
+ * passed-in member offset to print the value of that integer member for
+ * every mi unit in the grid.
+ */
+static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
+ size_t member_offset) {
+ int mi_row, mi_col;
+ MODE_INFO **mi = cm->mi_grid_visible;
+ int rows = cm->mi_rows;
+ int cols = cm->mi_cols;
+ char prefix = descriptor[0];
+
+ log_frame_info(cm, descriptor, file);
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(file, "%c ", prefix);
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ fprintf(file, "%2d ", *((int *)((char *)(&mi[0]->mbmi) + member_offset)));
+ mi++;
+ }
+ fprintf(file, "\n");
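+    // The mi grid is allocated with extra border columns; step over them to
+    // reach the next row.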
+ mi += 8;
+ }
+ fprintf(file, "\n");
+}
+
+void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
+ int mi_row;
+ int mi_col;
+ FILE *mvs = fopen(file, "a");
+ MODE_INFO **mi = cm->mi_grid_visible;
+ int rows = cm->mi_rows;
+ int cols = cm->mi_cols;
+
+ print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
+ print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
+ print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
+ print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
+ print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+
+  // output skip information.
+ log_frame_info(cm, "Skips:", mvs);
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(mvs, "S ");
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ fprintf(mvs, "%2d ", mi[0]->mbmi.skip);
+ mi++;
+ }
+ fprintf(mvs, "\n");
+ mi += 8;
+ }
+ fprintf(mvs, "\n");
+
+ // output motion vectors.
+ log_frame_info(cm, "Vectors ", mvs);
+ mi = cm->mi_grid_visible;
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(mvs, "V ");
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ fprintf(mvs, "%4d:%4d ", mi[0]->mbmi.mv[0].as_mv.row,
+ mi[0]->mbmi.mv[0].as_mv.col);
+ mi++;
+ }
+ fprintf(mvs, "\n");
+ mi += 8;
+ }
+ fprintf(mvs, "\n");
+
+ fclose(mvs);
+}
diff --git a/third_party/aom/av1/common/entropy.c b/third_party/aom/av1/common/entropy.c
new file mode 100644
index 000000000..14ab53ca0
--- /dev/null
+++ b/third_party/aom/av1/common/entropy.c
@@ -0,0 +1,6438 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/scan.h"
+#if CONFIG_LV_MAP
+#include "av1/common/txb_common.h"
+#endif
+
+// Unconstrained Node Tree
+/* clang-format off */
+const aom_tree_index av1_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+ 2, 6, // 0 = LOW_VAL
+ -TWO_TOKEN, 4, // 1 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE
+ 8, 10, // 3 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE
+ 12, 14, // 5 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
+};
+/* clang-format on */
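+// In an aom_tree_index table, nonnegative entries give the index of the next
+// node pair while negated token values mark the leaves.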
+
+#if CONFIG_NEW_MULTISYMBOL
+/* Extra bits coded from LSB to MSB */
+const aom_cdf_prob av1_cat1_cdf0[CDF_SIZE(2)] = { AOM_ICDF(20352),
+ AOM_ICDF(32768), 0 };
+const aom_cdf_prob *av1_cat1_cdf[] = { av1_cat1_cdf0 };
+
+const aom_cdf_prob av1_cat2_cdf0[CDF_SIZE(4)] = {
+ AOM_ICDF(11963), AOM_ICDF(21121), AOM_ICDF(27719), AOM_ICDF(32768), 0
+};
+const aom_cdf_prob *av1_cat2_cdf[] = { av1_cat2_cdf0 };
+const aom_cdf_prob av1_cat3_cdf0[CDF_SIZE(8)] = {
+ AOM_ICDF(7001), AOM_ICDF(12802), AOM_ICDF(17911),
+ AOM_ICDF(22144), AOM_ICDF(25503), AOM_ICDF(28286),
+ AOM_ICDF(30737), AOM_ICDF(32768), 0
+};
+const aom_cdf_prob *av1_cat3_cdf[] = { av1_cat3_cdf0 };
+
+const aom_cdf_prob av1_cat4_cdf0[CDF_SIZE(16)] = { AOM_ICDF(3934),
+ AOM_ICDF(7460),
+ AOM_ICDF(10719),
+ AOM_ICDF(13640),
+ AOM_ICDF(16203),
+ AOM_ICDF(18500),
+ AOM_ICDF(20624),
+ AOM_ICDF(22528),
+ AOM_ICDF(24316),
+ AOM_ICDF(25919),
+ AOM_ICDF(27401),
+ AOM_ICDF(28729),
+ AOM_ICDF(29894),
+ AOM_ICDF(30938),
+ AOM_ICDF(31903),
+ AOM_ICDF(32768),
+ 0 };
+const aom_cdf_prob *av1_cat4_cdf[] = { av1_cat4_cdf0 };
+
+const aom_cdf_prob av1_cat5_cdf0[CDF_SIZE(16)] = { AOM_ICDF(2942),
+ AOM_ICDF(5794),
+ AOM_ICDF(8473),
+ AOM_ICDF(11069),
+ AOM_ICDF(13469),
+ AOM_ICDF(15795),
+ AOM_ICDF(17980),
+ AOM_ICDF(20097),
+ AOM_ICDF(21952),
+ AOM_ICDF(23750),
+ AOM_ICDF(25439),
+ AOM_ICDF(27076),
+ AOM_ICDF(28589),
+ AOM_ICDF(30056),
+ AOM_ICDF(31434),
+ AOM_ICDF(32768),
+ 0 };
+const aom_cdf_prob av1_cat5_cdf1[CDF_SIZE(2)] = { AOM_ICDF(23040),
+ AOM_ICDF(32768), 0 };
+const aom_cdf_prob *av1_cat5_cdf[] = { av1_cat5_cdf0, av1_cat5_cdf1 };
+
+const aom_cdf_prob av1_cat6_cdf0[CDF_SIZE(16)] = {
+ AOM_ICDF(2382), AOM_ICDF(4727), AOM_ICDF(7036), AOM_ICDF(9309),
+ AOM_ICDF(11512), AOM_ICDF(13681), AOM_ICDF(15816), AOM_ICDF(17918),
+ AOM_ICDF(19892), AOM_ICDF(21835), AOM_ICDF(23748), AOM_ICDF(25632),
+ AOM_ICDF(27458), AOM_ICDF(29255), AOM_ICDF(31024), AOM_ICDF(32768)
+};
+const aom_cdf_prob av1_cat6_cdf1[CDF_SIZE(16)] = {
+ AOM_ICDF(9314), AOM_ICDF(15584), AOM_ICDF(19741), AOM_ICDF(22540),
+ AOM_ICDF(25391), AOM_ICDF(27310), AOM_ICDF(28583), AOM_ICDF(29440),
+ AOM_ICDF(30493), AOM_ICDF(31202), AOM_ICDF(31672), AOM_ICDF(31988),
+ AOM_ICDF(32310), AOM_ICDF(32527), AOM_ICDF(32671), AOM_ICDF(32768)
+};
+const aom_cdf_prob av1_cat6_cdf2[CDF_SIZE(16)] = {
+ AOM_ICDF(29548), AOM_ICDF(31129), AOM_ICDF(31960), AOM_ICDF(32004),
+ AOM_ICDF(32473), AOM_ICDF(32498), AOM_ICDF(32511), AOM_ICDF(32512),
+ AOM_ICDF(32745), AOM_ICDF(32757), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)
+};
+const aom_cdf_prob av1_cat6_cdf3[CDF_SIZE(16)] = {
+ AOM_ICDF(32006), AOM_ICDF(32258), AOM_ICDF(32510), AOM_ICDF(32512),
+ AOM_ICDF(32638), AOM_ICDF(32639), AOM_ICDF(32640), AOM_ICDF(32641),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)
+};
+const aom_cdf_prob av1_cat6_cdf4[CDF_SIZE(4)] = {
+ AOM_ICDF(32513), AOM_ICDF(32641), AOM_ICDF(32767), AOM_ICDF(32768)
+};
+const aom_cdf_prob *av1_cat6_cdf[] = {
+ av1_cat6_cdf0, av1_cat6_cdf1, av1_cat6_cdf2, av1_cat6_cdf3, av1_cat6_cdf4
+};
+#endif
+/* Extra bits coded from MSB to LSB */
+const aom_prob av1_cat1_prob[] = { 159 };
+const aom_prob av1_cat2_prob[] = { 165, 145 };
+const aom_prob av1_cat3_prob[] = { 173, 148, 140 };
+const aom_prob av1_cat4_prob[] = { 176, 155, 140, 135 };
+const aom_prob av1_cat5_prob[] = { 180, 157, 141, 134, 130 };
+const aom_prob av1_cat6_prob[] = {
+ 255, 255, 255, 255, 254, 254, 254, 252, 249,
+ 243, 230, 196, 177, 153, 140, 133, 130, 129
+};
+
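+// Number of coefficients assigned to each coefficient band, per transform
+// size; the entries in each row sum to the number of coefficients in the
+// block.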
+const uint16_t band_count_table[TX_SIZES_ALL][8] = {
+#if CONFIG_CB4X4
+ { 1, 2, 2, 3, 0, 0, 0 },
+#endif
+ { 1, 2, 3, 4, 3, 16 - 13, 0 }, { 1, 2, 3, 4, 11, 64 - 21, 0 },
+ { 1, 2, 3, 4, 11, 256 - 21, 0 }, { 1, 2, 3, 4, 11, 1024 - 21, 0 },
+#if CONFIG_TX64X64
+ { 1, 2, 3, 4, 11, 4096 - 21, 0 },
+#endif // CONFIG_TX64X64
+ { 1, 2, 3, 4, 8, 32 - 18, 0 }, { 1, 2, 3, 4, 8, 32 - 18, 0 },
+ { 1, 2, 3, 4, 11, 128 - 21, 0 }, { 1, 2, 3, 4, 11, 128 - 21, 0 },
+ { 1, 2, 3, 4, 11, 512 - 21, 0 }, { 1, 2, 3, 4, 11, 512 - 21, 0 },
+ { 1, 2, 3, 4, 11, 64 - 21, 0 }, { 1, 2, 3, 4, 11, 64 - 21, 0 },
+ { 1, 2, 3, 4, 11, 256 - 21, 0 }, { 1, 2, 3, 4, 11, 256 - 21, 0 },
+};
+
+const uint16_t band_cum_count_table[TX_SIZES_ALL][8] = {
+#if CONFIG_CB4X4
+ { 0, 1, 3, 6, 10, 13, 16, 0 },
+#endif
+ { 0, 1, 3, 6, 10, 13, 16, 0 }, { 0, 1, 3, 6, 10, 21, 64, 0 },
+ { 0, 1, 3, 6, 10, 21, 256, 0 }, { 0, 1, 3, 6, 10, 21, 1024, 0 },
+#if CONFIG_TX64X64
+ { 0, 1, 3, 6, 10, 21, 4096, 0 },
+#endif // CONFIG_TX64X64
+ { 0, 1, 3, 6, 10, 18, 32, 0 }, { 0, 1, 3, 6, 10, 18, 32, 0 },
+ { 0, 1, 3, 6, 10, 21, 128, 0 }, { 0, 1, 3, 6, 10, 21, 128, 0 },
+ { 0, 1, 3, 6, 10, 21, 512, 0 }, { 0, 1, 3, 6, 10, 21, 512, 0 },
+ { 0, 1, 3, 6, 10, 21, 64, 0 }, { 0, 1, 3, 6, 10, 21, 64, 0 },
+ { 0, 1, 3, 6, 10, 21, 256, 0 }, { 0, 1, 3, 6, 10, 21, 256, 0 },
+};
+
+const uint8_t av1_coefband_trans_8x8plus[MAX_TX_SQUARE] = {
+ 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
+  // beyond MAXBAND_INDEX + 1, all values are filled with 5
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+#if CONFIG_TX64X64
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5
+#endif // CONFIG_TX64X64
+};
+
+const uint8_t av1_coefband_trans_4x8_8x4[32] = {
+ 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+ 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+};
+
+const uint8_t av1_coefband_trans_4x4[16] = {
+ 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
+};
+
+const uint8_t av1_pt_energy_class[ENTROPY_TOKENS] = { 0, 1, 2, 3, 3, 4,
+ 4, 5, 5, 5, 5, 5 };
+
+// Model obtained from a 2-sided zero-centered distribution derived
+// from a Pareto distribution. The cdf of the distribution is:
+// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+//
+// For a given beta and a given probability of the 1-node, the alpha
+// is first solved, and then the {alpha, beta} pair is used to generate
+// the probabilities for the rest of the nodes.
+
+// beta = 8
+
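+// Editorial sketch (not part of the aom sources): from the two-sided cdf
+// above, the tail mass is P(|x| > t) = {alpha/(alpha + t)} ^ beta. If the
+// 1-node probability is read as p = P(|x| > 1), alpha has the closed form
+// below; the exact mapping from a model index to p is an assumption here,
+// not something this file defines.
+#if 0 /* illustrative only, never compiled */
+#include <math.h>
+/* Solve (alpha / (alpha + 1)) ^ beta == p for alpha. */
+static double pareto8_alpha_from_p1(double p, double beta) {
+  return 1.0 / (pow(p, -1.0 / beta) - 1.0);
+}
+#endif
+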
+// Every odd line in this table can be generated from the even lines
+// by averaging (see the sketch after the table):
+// av1_pareto8_full[l][node] = (av1_pareto8_full[l-1][node] +
+//                              av1_pareto8_full[l+1][node]) >> 1;
+// Values for tokens ONE_TOKEN through CATEGORY6_TOKEN are included here.
+const aom_prob av1_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
+ { 3, 86, 128, 6, 86, 23, 88, 29 },
+ { 6, 86, 128, 11, 87, 42, 91, 52 },
+ { 9, 86, 129, 17, 88, 61, 94, 76 },
+ { 12, 86, 129, 22, 88, 77, 97, 93 },
+ { 15, 87, 129, 28, 89, 93, 100, 110 },
+ { 17, 87, 129, 33, 90, 105, 103, 123 },
+ { 20, 88, 130, 38, 91, 118, 106, 136 },
+ { 23, 88, 130, 43, 91, 128, 108, 146 },
+ { 26, 89, 131, 48, 92, 139, 111, 156 },
+ { 28, 89, 131, 53, 93, 147, 114, 163 },
+ { 31, 90, 131, 58, 94, 156, 117, 171 },
+ { 34, 90, 131, 62, 94, 163, 119, 177 },
+ { 37, 90, 132, 66, 95, 171, 122, 184 },
+ { 39, 90, 132, 70, 96, 177, 124, 189 },
+ { 42, 91, 132, 75, 97, 183, 127, 194 },
+ { 44, 91, 132, 79, 97, 188, 129, 198 },
+ { 47, 92, 133, 83, 98, 193, 132, 202 },
+ { 49, 92, 133, 86, 99, 197, 134, 205 },
+ { 52, 93, 133, 90, 100, 201, 137, 208 },
+ { 54, 93, 133, 94, 100, 204, 139, 211 },
+ { 57, 94, 134, 98, 101, 208, 142, 214 },
+ { 59, 94, 134, 101, 102, 211, 144, 216 },
+ { 62, 94, 135, 105, 103, 214, 146, 218 },
+ { 64, 94, 135, 108, 103, 216, 148, 220 },
+ { 66, 95, 135, 111, 104, 219, 151, 222 },
+ { 68, 95, 135, 114, 105, 221, 153, 223 },
+ { 71, 96, 136, 117, 106, 224, 155, 225 },
+ { 73, 96, 136, 120, 106, 225, 157, 226 },
+ { 76, 97, 136, 123, 107, 227, 159, 228 },
+ { 78, 97, 136, 126, 108, 229, 160, 229 },
+ { 80, 98, 137, 129, 109, 231, 162, 231 },
+ { 82, 98, 137, 131, 109, 232, 164, 232 },
+ { 84, 98, 138, 134, 110, 234, 166, 233 },
+ { 86, 98, 138, 137, 111, 235, 168, 234 },
+ { 89, 99, 138, 140, 112, 236, 170, 235 },
+ { 91, 99, 138, 142, 112, 237, 171, 235 },
+ { 93, 100, 139, 145, 113, 238, 173, 236 },
+ { 95, 100, 139, 147, 114, 239, 174, 237 },
+ { 97, 101, 140, 149, 115, 240, 176, 238 },
+ { 99, 101, 140, 151, 115, 241, 177, 238 },
+ { 101, 102, 140, 154, 116, 242, 179, 239 },
+ { 103, 102, 140, 156, 117, 242, 180, 239 },
+ { 105, 103, 141, 158, 118, 243, 182, 240 },
+ { 107, 103, 141, 160, 118, 243, 183, 240 },
+ { 109, 104, 141, 162, 119, 244, 185, 241 },
+ { 111, 104, 141, 164, 119, 244, 186, 241 },
+ { 113, 104, 142, 166, 120, 245, 187, 242 },
+ { 114, 104, 142, 168, 121, 245, 188, 242 },
+ { 116, 105, 143, 170, 122, 246, 190, 243 },
+ { 118, 105, 143, 171, 122, 246, 191, 243 },
+ { 120, 106, 143, 173, 123, 247, 192, 244 },
+ { 121, 106, 143, 175, 124, 247, 193, 244 },
+ { 123, 107, 144, 177, 125, 248, 195, 244 },
+ { 125, 107, 144, 178, 125, 248, 196, 244 },
+ { 127, 108, 145, 180, 126, 249, 197, 245 },
+ { 128, 108, 145, 181, 127, 249, 198, 245 },
+ { 130, 109, 145, 183, 128, 249, 199, 245 },
+ { 132, 109, 145, 184, 128, 249, 200, 245 },
+ { 134, 110, 146, 186, 129, 250, 201, 246 },
+ { 135, 110, 146, 187, 130, 250, 202, 246 },
+ { 137, 111, 147, 189, 131, 251, 203, 246 },
+ { 138, 111, 147, 190, 131, 251, 204, 246 },
+ { 140, 112, 147, 192, 132, 251, 205, 247 },
+ { 141, 112, 147, 193, 132, 251, 206, 247 },
+ { 143, 113, 148, 194, 133, 251, 207, 247 },
+ { 144, 113, 148, 195, 134, 251, 207, 247 },
+ { 146, 114, 149, 197, 135, 252, 208, 248 },
+ { 147, 114, 149, 198, 135, 252, 209, 248 },
+ { 149, 115, 149, 199, 136, 252, 210, 248 },
+ { 150, 115, 149, 200, 137, 252, 210, 248 },
+ { 152, 115, 150, 201, 138, 252, 211, 248 },
+ { 153, 115, 150, 202, 138, 252, 212, 248 },
+ { 155, 116, 151, 204, 139, 253, 213, 249 },
+ { 156, 116, 151, 205, 139, 253, 213, 249 },
+ { 158, 117, 151, 206, 140, 253, 214, 249 },
+ { 159, 117, 151, 207, 141, 253, 215, 249 },
+ { 161, 118, 152, 208, 142, 253, 216, 249 },
+ { 162, 118, 152, 209, 142, 253, 216, 249 },
+ { 163, 119, 153, 210, 143, 253, 217, 249 },
+ { 164, 119, 153, 211, 143, 253, 217, 249 },
+ { 166, 120, 153, 212, 144, 254, 218, 250 },
+ { 167, 120, 153, 212, 145, 254, 219, 250 },
+ { 168, 121, 154, 213, 146, 254, 220, 250 },
+ { 169, 121, 154, 214, 146, 254, 220, 250 },
+ { 171, 122, 155, 215, 147, 254, 221, 250 },
+ { 172, 122, 155, 216, 147, 254, 221, 250 },
+ { 173, 123, 155, 217, 148, 254, 222, 250 },
+ { 174, 123, 155, 217, 149, 254, 222, 250 },
+ { 176, 124, 156, 218, 150, 254, 223, 250 },
+ { 177, 124, 156, 219, 150, 254, 223, 250 },
+ { 178, 125, 157, 220, 151, 254, 224, 251 },
+ { 179, 125, 157, 220, 151, 254, 224, 251 },
+ { 180, 126, 157, 221, 152, 254, 225, 251 },
+ { 181, 126, 157, 221, 152, 254, 225, 251 },
+ { 183, 127, 158, 222, 153, 254, 226, 251 },
+ { 184, 127, 158, 223, 154, 254, 226, 251 },
+ { 185, 128, 159, 224, 155, 255, 227, 251 },
+ { 186, 128, 159, 224, 155, 255, 227, 251 },
+ { 187, 129, 160, 225, 156, 255, 228, 251 },
+ { 188, 130, 160, 225, 156, 255, 228, 251 },
+ { 189, 131, 160, 226, 157, 255, 228, 251 },
+ { 190, 131, 160, 226, 158, 255, 228, 251 },
+ { 191, 132, 161, 227, 159, 255, 229, 251 },
+ { 192, 132, 161, 227, 159, 255, 229, 251 },
+ { 193, 133, 162, 228, 160, 255, 230, 252 },
+ { 194, 133, 162, 229, 160, 255, 230, 252 },
+ { 195, 134, 163, 230, 161, 255, 231, 252 },
+ { 196, 134, 163, 230, 161, 255, 231, 252 },
+ { 197, 135, 163, 231, 162, 255, 231, 252 },
+ { 198, 135, 163, 231, 162, 255, 231, 252 },
+ { 199, 136, 164, 232, 163, 255, 232, 252 },
+ { 200, 136, 164, 232, 164, 255, 232, 252 },
+ { 201, 137, 165, 233, 165, 255, 233, 252 },
+ { 201, 137, 165, 233, 165, 255, 233, 252 },
+ { 202, 138, 166, 233, 166, 255, 233, 252 },
+ { 203, 138, 166, 233, 166, 255, 233, 252 },
+ { 204, 139, 166, 234, 167, 255, 234, 252 },
+ { 205, 139, 166, 234, 167, 255, 234, 252 },
+ { 206, 140, 167, 235, 168, 255, 235, 252 },
+ { 206, 140, 167, 235, 168, 255, 235, 252 },
+ { 207, 141, 168, 236, 169, 255, 235, 252 },
+ { 208, 141, 168, 236, 170, 255, 235, 252 },
+ { 209, 142, 169, 237, 171, 255, 236, 252 },
+ { 209, 143, 169, 237, 171, 255, 236, 252 },
+ { 210, 144, 169, 237, 172, 255, 236, 252 },
+ { 211, 144, 169, 237, 172, 255, 236, 252 },
+ { 212, 145, 170, 238, 173, 255, 237, 252 },
+ { 213, 145, 170, 238, 173, 255, 237, 252 },
+ { 214, 146, 171, 239, 174, 255, 237, 253 },
+ { 214, 146, 171, 239, 174, 255, 237, 253 },
+ { 215, 147, 172, 240, 175, 255, 238, 253 },
+ { 215, 147, 172, 240, 175, 255, 238, 253 },
+ { 216, 148, 173, 240, 176, 255, 238, 253 },
+ { 217, 148, 173, 240, 176, 255, 238, 253 },
+ { 218, 149, 173, 241, 177, 255, 239, 253 },
+ { 218, 149, 173, 241, 178, 255, 239, 253 },
+ { 219, 150, 174, 241, 179, 255, 239, 253 },
+ { 219, 151, 174, 241, 179, 255, 239, 253 },
+ { 220, 152, 175, 242, 180, 255, 240, 253 },
+ { 221, 152, 175, 242, 180, 255, 240, 253 },
+ { 222, 153, 176, 242, 181, 255, 240, 253 },
+ { 222, 153, 176, 242, 181, 255, 240, 253 },
+ { 223, 154, 177, 243, 182, 255, 240, 253 },
+ { 223, 154, 177, 243, 182, 255, 240, 253 },
+ { 224, 155, 178, 244, 183, 255, 241, 253 },
+ { 224, 155, 178, 244, 183, 255, 241, 253 },
+ { 225, 156, 178, 244, 184, 255, 241, 253 },
+ { 225, 157, 178, 244, 184, 255, 241, 253 },
+ { 226, 158, 179, 244, 185, 255, 242, 253 },
+ { 227, 158, 179, 244, 185, 255, 242, 253 },
+ { 228, 159, 180, 245, 186, 255, 242, 253 },
+ { 228, 159, 180, 245, 186, 255, 242, 253 },
+ { 229, 160, 181, 245, 187, 255, 242, 253 },
+ { 229, 160, 181, 245, 187, 255, 242, 253 },
+ { 230, 161, 182, 246, 188, 255, 243, 253 },
+ { 230, 162, 182, 246, 188, 255, 243, 253 },
+ { 231, 163, 183, 246, 189, 255, 243, 253 },
+ { 231, 163, 183, 246, 189, 255, 243, 253 },
+ { 232, 164, 184, 247, 190, 255, 243, 253 },
+ { 232, 164, 184, 247, 190, 255, 243, 253 },
+ { 233, 165, 185, 247, 191, 255, 244, 253 },
+ { 233, 165, 185, 247, 191, 255, 244, 253 },
+ { 234, 166, 185, 247, 192, 255, 244, 253 },
+ { 234, 167, 185, 247, 192, 255, 244, 253 },
+ { 235, 168, 186, 248, 193, 255, 244, 253 },
+ { 235, 168, 186, 248, 193, 255, 244, 253 },
+ { 236, 169, 187, 248, 194, 255, 244, 253 },
+ { 236, 169, 187, 248, 194, 255, 244, 253 },
+ { 236, 170, 188, 248, 195, 255, 245, 253 },
+ { 236, 170, 188, 248, 195, 255, 245, 253 },
+ { 237, 171, 189, 249, 196, 255, 245, 254 },
+ { 237, 172, 189, 249, 196, 255, 245, 254 },
+ { 238, 173, 190, 249, 197, 255, 245, 254 },
+ { 238, 173, 190, 249, 197, 255, 245, 254 },
+ { 239, 174, 191, 249, 198, 255, 245, 254 },
+ { 239, 174, 191, 249, 198, 255, 245, 254 },
+ { 240, 175, 192, 249, 199, 255, 246, 254 },
+ { 240, 176, 192, 249, 199, 255, 246, 254 },
+ { 240, 177, 193, 250, 200, 255, 246, 254 },
+ { 240, 177, 193, 250, 200, 255, 246, 254 },
+ { 241, 178, 194, 250, 201, 255, 246, 254 },
+ { 241, 178, 194, 250, 201, 255, 246, 254 },
+ { 242, 179, 195, 250, 202, 255, 246, 254 },
+ { 242, 180, 195, 250, 202, 255, 246, 254 },
+ { 242, 181, 196, 250, 203, 255, 247, 254 },
+ { 242, 181, 196, 250, 203, 255, 247, 254 },
+ { 243, 182, 197, 251, 204, 255, 247, 254 },
+ { 243, 183, 197, 251, 204, 255, 247, 254 },
+ { 244, 184, 198, 251, 205, 255, 247, 254 },
+ { 244, 184, 198, 251, 205, 255, 247, 254 },
+ { 244, 185, 199, 251, 206, 255, 247, 254 },
+ { 244, 185, 199, 251, 206, 255, 247, 254 },
+ { 245, 186, 200, 251, 207, 255, 247, 254 },
+ { 245, 187, 200, 251, 207, 255, 247, 254 },
+ { 246, 188, 201, 252, 207, 255, 248, 254 },
+ { 246, 188, 201, 252, 207, 255, 248, 254 },
+ { 246, 189, 202, 252, 208, 255, 248, 254 },
+ { 246, 190, 202, 252, 208, 255, 248, 254 },
+ { 247, 191, 203, 252, 209, 255, 248, 254 },
+ { 247, 191, 203, 252, 209, 255, 248, 254 },
+ { 247, 192, 204, 252, 210, 255, 248, 254 },
+ { 247, 193, 204, 252, 210, 255, 248, 254 },
+ { 248, 194, 205, 252, 211, 255, 248, 254 },
+ { 248, 194, 205, 252, 211, 255, 248, 254 },
+ { 248, 195, 206, 252, 212, 255, 249, 254 },
+ { 248, 196, 206, 252, 212, 255, 249, 254 },
+ { 249, 197, 207, 253, 213, 255, 249, 254 },
+ { 249, 197, 207, 253, 213, 255, 249, 254 },
+ { 249, 198, 208, 253, 214, 255, 249, 254 },
+ { 249, 199, 209, 253, 214, 255, 249, 254 },
+ { 250, 200, 210, 253, 215, 255, 249, 254 },
+ { 250, 200, 210, 253, 215, 255, 249, 254 },
+ { 250, 201, 211, 253, 215, 255, 249, 254 },
+ { 250, 202, 211, 253, 215, 255, 249, 254 },
+ { 250, 203, 212, 253, 216, 255, 249, 254 },
+ { 250, 203, 212, 253, 216, 255, 249, 254 },
+ { 251, 204, 213, 253, 217, 255, 250, 254 },
+ { 251, 205, 213, 253, 217, 255, 250, 254 },
+ { 251, 206, 214, 254, 218, 255, 250, 254 },
+ { 251, 206, 215, 254, 218, 255, 250, 254 },
+ { 252, 207, 216, 254, 219, 255, 250, 254 },
+ { 252, 208, 216, 254, 219, 255, 250, 254 },
+ { 252, 209, 217, 254, 220, 255, 250, 254 },
+ { 252, 210, 217, 254, 220, 255, 250, 254 },
+ { 252, 211, 218, 254, 221, 255, 250, 254 },
+ { 252, 212, 218, 254, 221, 255, 250, 254 },
+ { 253, 213, 219, 254, 222, 255, 250, 254 },
+ { 253, 213, 220, 254, 222, 255, 250, 254 },
+ { 253, 214, 221, 254, 223, 255, 250, 254 },
+ { 253, 215, 221, 254, 223, 255, 250, 254 },
+ { 253, 216, 222, 254, 224, 255, 251, 254 },
+ { 253, 217, 223, 254, 224, 255, 251, 254 },
+ { 253, 218, 224, 254, 225, 255, 251, 254 },
+ { 253, 219, 224, 254, 225, 255, 251, 254 },
+ { 254, 220, 225, 254, 225, 255, 251, 254 },
+ { 254, 221, 226, 254, 225, 255, 251, 254 },
+ { 254, 222, 227, 255, 226, 255, 251, 254 },
+ { 254, 223, 227, 255, 226, 255, 251, 254 },
+ { 254, 224, 228, 255, 227, 255, 251, 254 },
+ { 254, 225, 229, 255, 227, 255, 251, 254 },
+ { 254, 226, 230, 255, 228, 255, 251, 254 },
+ { 254, 227, 230, 255, 229, 255, 251, 254 },
+ { 255, 228, 231, 255, 230, 255, 251, 254 },
+ { 255, 229, 232, 255, 230, 255, 251, 254 },
+ { 255, 230, 233, 255, 231, 255, 252, 254 },
+ { 255, 231, 234, 255, 231, 255, 252, 254 },
+ { 255, 232, 235, 255, 232, 255, 252, 254 },
+ { 255, 233, 236, 255, 232, 255, 252, 254 },
+ { 255, 235, 237, 255, 233, 255, 252, 254 },
+ { 255, 236, 238, 255, 234, 255, 252, 254 },
+ { 255, 238, 240, 255, 235, 255, 252, 255 },
+ { 255, 239, 241, 255, 235, 255, 252, 254 },
+ { 255, 241, 243, 255, 236, 255, 252, 254 },
+ { 255, 243, 245, 255, 237, 255, 252, 254 },
+ { 255, 246, 247, 255, 239, 255, 253, 255 },
+};
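+
+// Editorial sketch (not part of the aom sources): the interpolation noted
+// above the table means the odd-indexed rows carry no information beyond
+// their even neighbours; a minimal reconstruction of odd row l:
+#if 0 /* illustrative only, never compiled */
+static void pareto8_full_odd_row(int l, aom_prob row[MODEL_NODES]) {
+  for (int node = 0; node < MODEL_NODES; ++node)
+    row[node] = (aom_prob)((av1_pareto8_full[l - 1][node] +
+                            av1_pareto8_full[l + 1][node]) >> 1);
+}
+#endif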
+
+// Model obtained from a 2-sided zero-centered distribution derived
+// from a Pareto distribution. The cdf of the distribution is:
+// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+//
+// For a given beta and a given probability of the 1-node, the alpha
+// is first solved, and then the {alpha, beta} pair is used to generate
+// the probabilities for the rest of the nodes.
+//
+// The full source code of the generating program is available in:
+// tools/gen_constrained_tokenset.py
+//
+#if CONFIG_NEW_TOKENSET
+// Values for tokens TWO_TOKEN through CATEGORY6_TOKEN are included
+// in the table here: the ONE_TOKEN probability is removed and the
+// remaining probabilities are rescaled (see the sketch after this table).
+//
+// ZERO_TOKEN and ONE_TOKEN are coded as one CDF,
+// and EOB_TOKEN is coded with flags outside this coder.
+const aom_cdf_prob av1_pareto8_tail_probs[COEFF_PROB_MODELS][TAIL_NODES] = {
+ { 128, 127, 127, 252, 497, 969, 1839, 3318, 25511 },
+ { 256, 254, 251, 496, 966, 1834, 3308, 5408, 19995 },
+ { 383, 378, 373, 732, 1408, 2605, 4470, 6646, 15773 },
+ { 511, 502, 493, 961, 1824, 3289, 5373, 7298, 12517 },
+ { 638, 625, 611, 1182, 2215, 3894, 6064, 7548, 9991 },
+ { 766, 746, 726, 1396, 2582, 4428, 6578, 7529, 8017 },
+ { 893, 866, 839, 1603, 2927, 4896, 6945, 7332, 6467 },
+ { 1020, 984, 950, 1803, 3250, 5305, 7191, 7022, 5243 },
+ { 1147, 1102, 1059, 1996, 3552, 5659, 7338, 6646, 4269 },
+ { 1274, 1218, 1166, 2183, 3835, 5963, 7403, 6234, 3492 },
+ { 1400, 1334, 1270, 2363, 4099, 6223, 7401, 5809, 2869 },
+ { 1527, 1447, 1372, 2537, 4345, 6442, 7346, 5386, 2366 },
+ { 1654, 1560, 1473, 2704, 4574, 6624, 7247, 4973, 1959 },
+ { 1780, 1672, 1571, 2866, 4787, 6771, 7114, 4579, 1628 },
+ { 1906, 1782, 1667, 3022, 4984, 6889, 6954, 4206, 1358 },
+ { 2032, 1891, 1762, 3172, 5167, 6979, 6773, 3856, 1136 },
+ { 2158, 2000, 1854, 3316, 5335, 7044, 6577, 3530, 954 },
+ { 2284, 2106, 1944, 3455, 5490, 7087, 6370, 3229, 803 },
+ { 2410, 2212, 2032, 3588, 5632, 7109, 6155, 2951, 679 },
+ { 2535, 2317, 2119, 3717, 5761, 7113, 5936, 2695, 575 },
+ { 2661, 2420, 2203, 3840, 5880, 7101, 5714, 2461, 488 },
+ { 2786, 2522, 2286, 3958, 5987, 7074, 5493, 2246, 416 },
+ { 2911, 2624, 2367, 4072, 6083, 7033, 5273, 2050, 355 },
+ { 3037, 2724, 2446, 4180, 6170, 6981, 5055, 1871, 304 },
+ { 3162, 2822, 2523, 4284, 6247, 6919, 4842, 1708, 261 },
+ { 3286, 2920, 2599, 4384, 6315, 6848, 4633, 1559, 224 },
+ { 3411, 3017, 2672, 4478, 6374, 6768, 4430, 1424, 194 },
+ { 3536, 3112, 2745, 4569, 6426, 6681, 4232, 1300, 167 },
+ { 3660, 3207, 2815, 4656, 6469, 6588, 4040, 1188, 145 },
+ { 3785, 3300, 2883, 4738, 6505, 6490, 3855, 1086, 126 },
+ { 3909, 3392, 2950, 4817, 6534, 6387, 3677, 993, 109 },
+ { 4033, 3483, 3015, 4891, 6557, 6281, 3505, 908, 95 },
+ { 4157, 3573, 3079, 4962, 6573, 6170, 3340, 831, 83 },
+ { 4281, 3662, 3141, 5029, 6584, 6058, 3181, 760, 72 },
+ { 4405, 3750, 3201, 5093, 6588, 5943, 3029, 696, 63 },
+ { 4529, 3837, 3260, 5152, 6587, 5826, 2883, 638, 56 },
+ { 4652, 3922, 3317, 5209, 6582, 5709, 2744, 584, 49 },
+ { 4775, 4007, 3373, 5262, 6572, 5590, 2610, 536, 43 },
+ { 4899, 4090, 3427, 5312, 6557, 5470, 2483, 492, 38 },
+ { 5022, 4173, 3480, 5359, 6538, 5351, 2361, 451, 33 },
+ { 5145, 4254, 3531, 5403, 6515, 5231, 2246, 414, 29 },
+ { 5268, 4334, 3581, 5443, 6489, 5112, 2135, 380, 26 },
+ { 5391, 4414, 3629, 5481, 6458, 4993, 2029, 350, 23 },
+ { 5514, 4492, 3676, 5515, 6425, 4875, 1929, 321, 21 },
+ { 5637, 4569, 3721, 5548, 6388, 4758, 1833, 296, 18 },
+ { 5759, 4645, 3766, 5577, 6349, 4642, 1742, 272, 16 },
+ { 5881, 4720, 3808, 5604, 6307, 4528, 1656, 250, 14 },
+ { 6004, 4794, 3849, 5628, 6262, 4414, 1573, 231, 13 },
+ { 6126, 4867, 3890, 5649, 6215, 4302, 1495, 213, 11 },
+ { 6248, 4939, 3928, 5669, 6166, 4192, 1420, 196, 10 },
+ { 6370, 5010, 3966, 5686, 6114, 4083, 1349, 181, 9 },
+ { 6492, 5080, 4002, 5700, 6061, 3976, 1282, 167, 8 },
+ { 6614, 5149, 4037, 5712, 6006, 3871, 1218, 154, 7 },
+ { 6735, 5217, 4070, 5723, 5950, 3767, 1157, 142, 7 },
+ { 6857, 5284, 4103, 5731, 5891, 3666, 1099, 131, 6 },
+ { 6978, 5351, 4134, 5737, 5832, 3566, 1044, 121, 5 },
+ { 7099, 5415, 4164, 5741, 5771, 3469, 992, 112, 5 },
+ { 7221, 5479, 4192, 5743, 5709, 3373, 943, 104, 4 },
+ { 7342, 5542, 4220, 5743, 5646, 3279, 896, 96, 4 },
+ { 7462, 5604, 4246, 5742, 5583, 3187, 851, 89, 4 },
+ { 7584, 5665, 4272, 5739, 5518, 3097, 808, 82, 3 },
+ { 7704, 5725, 4296, 5734, 5453, 3009, 768, 76, 3 },
+ { 7825, 5784, 4318, 5727, 5386, 2924, 730, 71, 3 },
+ { 7945, 5843, 4341, 5719, 5320, 2840, 693, 65, 2 },
+ { 8066, 5900, 4361, 5709, 5252, 2758, 659, 61, 2 },
+ { 8186, 5956, 4381, 5698, 5185, 2678, 626, 56, 2 },
+ { 8306, 6011, 4400, 5685, 5117, 2600, 595, 52, 2 },
+ { 8426, 6066, 4418, 5671, 5049, 2523, 565, 48, 2 },
+ { 8547, 6119, 4434, 5655, 4981, 2449, 537, 45, 1 },
+ { 8666, 6171, 4450, 5638, 4912, 2377, 511, 42, 1 },
+ { 8786, 6223, 4465, 5620, 4843, 2306, 485, 39, 1 },
+ { 8906, 6274, 4478, 5600, 4775, 2237, 461, 36, 1 },
+ { 9025, 6323, 4491, 5580, 4706, 2170, 438, 34, 1 },
+ { 9144, 6372, 4503, 5558, 4637, 2105, 417, 31, 1 },
+ { 9264, 6420, 4514, 5535, 4568, 2041, 396, 29, 1 },
+ { 9383, 6467, 4524, 5511, 4500, 1979, 376, 27, 1 },
+ { 9502, 6513, 4532, 5486, 4432, 1919, 358, 25, 1 },
+ { 9621, 6558, 4541, 5460, 4364, 1860, 340, 23, 1 },
+ { 9740, 6602, 4548, 5433, 4296, 1803, 323, 22, 1 },
+ { 9859, 6645, 4554, 5405, 4229, 1748, 307, 20, 1 },
+ { 9978, 6688, 4559, 5376, 4161, 1694, 292, 19, 1 },
+ { 10096, 6729, 4564, 5347, 4094, 1641, 278, 18, 1 },
+ { 10215, 6770, 4568, 5316, 4028, 1590, 264, 16, 1 },
+ { 10333, 6809, 4571, 5285, 3962, 1541, 251, 15, 1 },
+ { 10452, 6848, 4573, 5253, 3896, 1492, 239, 14, 1 },
+ { 10570, 6886, 4574, 5220, 3831, 1446, 227, 13, 1 },
+ { 10688, 6923, 4575, 5186, 3767, 1400, 216, 12, 1 },
+ { 10806, 6959, 4575, 5152, 3702, 1356, 205, 12, 1 },
+ { 10924, 6994, 4574, 5117, 3639, 1313, 195, 11, 1 },
+ { 11041, 7029, 4572, 5082, 3576, 1271, 186, 10, 1 },
+ { 11159, 7062, 4570, 5046, 3513, 1231, 177, 9, 1 },
+ { 11277, 7095, 4566, 5009, 3451, 1192, 168, 9, 1 },
+ { 11394, 7127, 4563, 4972, 3390, 1153, 160, 8, 1 },
+ { 11512, 7158, 4558, 4934, 3329, 1116, 152, 8, 1 },
+ { 11629, 7188, 4553, 4896, 3269, 1080, 145, 7, 1 },
+ { 11746, 7217, 4547, 4857, 3210, 1045, 138, 7, 1 },
+ { 11864, 7245, 4540, 4818, 3151, 1012, 131, 6, 1 },
+ { 11980, 7273, 4533, 4779, 3093, 979, 124, 6, 1 },
+ { 12097, 7300, 4525, 4739, 3035, 947, 118, 6, 1 },
+ { 12215, 7326, 4516, 4698, 2978, 916, 113, 5, 1 },
+ { 12331, 7351, 4507, 4658, 2922, 886, 107, 5, 1 },
+ { 12448, 7375, 4497, 4617, 2866, 857, 102, 5, 1 },
+ { 12564, 7398, 4487, 4576, 2812, 829, 97, 4, 1 },
+ { 12681, 7421, 4476, 4534, 2757, 802, 92, 4, 1 },
+ { 12797, 7443, 4464, 4492, 2704, 775, 88, 4, 1 },
+ { 12914, 7464, 4452, 4450, 2651, 749, 84, 3, 1 },
+ { 13030, 7484, 4439, 4408, 2599, 725, 79, 3, 1 },
+ { 13147, 7503, 4426, 4365, 2547, 700, 76, 3, 1 },
+ { 13262, 7522, 4412, 4322, 2497, 677, 72, 3, 1 },
+ { 13378, 7539, 4398, 4280, 2447, 654, 68, 3, 1 },
+ { 13494, 7556, 4383, 4237, 2397, 632, 65, 3, 1 },
+ { 13610, 7573, 4368, 4193, 2348, 611, 62, 2, 1 },
+ { 13726, 7588, 4352, 4150, 2300, 590, 59, 2, 1 },
+ { 13841, 7602, 4335, 4107, 2253, 571, 56, 2, 1 },
+ { 13957, 7616, 4318, 4063, 2207, 551, 53, 2, 1 },
+ { 14072, 7629, 4301, 4019, 2161, 532, 51, 2, 1 },
+ { 14188, 7641, 4283, 3976, 2115, 514, 48, 2, 1 },
+ { 14302, 7652, 4265, 3932, 2071, 497, 46, 2, 1 },
+ { 14418, 7663, 4246, 3888, 2027, 480, 44, 1, 1 },
+ { 14533, 7673, 4227, 3844, 1984, 463, 42, 1, 1 },
+ { 14649, 7682, 4207, 3800, 1941, 447, 40, 1, 1 },
+ { 14763, 7690, 4187, 3757, 1899, 432, 38, 1, 1 },
+ { 14878, 7698, 4166, 3713, 1858, 417, 36, 1, 1 },
+ { 14993, 7705, 4146, 3669, 1817, 402, 34, 1, 1 },
+ { 15109, 7711, 4124, 3625, 1777, 388, 32, 1, 1 },
+ { 15223, 7715, 4103, 3581, 1738, 375, 31, 1, 1 },
+ { 15337, 7720, 4081, 3538, 1699, 362, 29, 1, 1 },
+ { 15452, 7724, 4058, 3494, 1661, 349, 28, 1, 1 },
+ { 15567, 7727, 4035, 3450, 1624, 337, 26, 1, 1 },
+ { 15681, 7729, 4012, 3407, 1587, 325, 25, 1, 1 },
+ { 15795, 7730, 3989, 3364, 1551, 313, 24, 1, 1 },
+ { 15909, 7731, 3965, 3320, 1516, 302, 23, 1, 1 },
+ { 16024, 7731, 3940, 3277, 1481, 291, 22, 1, 1 },
+ { 16138, 7730, 3916, 3234, 1446, 281, 21, 1, 1 },
+ { 16252, 7728, 3891, 3191, 1413, 271, 20, 1, 1 },
+ { 16366, 7726, 3866, 3148, 1380, 261, 19, 1, 1 },
+ { 16480, 7723, 3840, 3106, 1347, 252, 18, 1, 1 },
+ { 16594, 7720, 3814, 3063, 1315, 243, 17, 1, 1 },
+ { 16708, 7715, 3788, 3021, 1284, 234, 16, 1, 1 },
+ { 16822, 7710, 3762, 2979, 1253, 225, 15, 1, 1 },
+ { 16936, 7704, 3735, 2937, 1223, 217, 14, 1, 1 },
+ { 17050, 7697, 3708, 2895, 1193, 209, 14, 1, 1 },
+ { 17162, 7690, 3681, 2854, 1164, 202, 13, 1, 1 },
+ { 17276, 7682, 3654, 2812, 1136, 194, 12, 1, 1 },
+ { 17389, 7673, 3626, 2771, 1108, 187, 12, 1, 1 },
+ { 17504, 7663, 3598, 2730, 1080, 180, 11, 1, 1 },
+ { 17617, 7653, 3570, 2689, 1053, 173, 11, 1, 1 },
+ { 17730, 7642, 3541, 2649, 1027, 167, 10, 1, 1 },
+ { 17843, 7630, 3513, 2608, 1001, 161, 10, 1, 1 },
+ { 17957, 7618, 3484, 2569, 975, 154, 9, 1, 1 },
+ { 18069, 7605, 3455, 2529, 950, 149, 9, 1, 1 },
+ { 18183, 7591, 3426, 2489, 926, 143, 8, 1, 1 },
+ { 18296, 7576, 3396, 2450, 902, 138, 8, 1, 1 },
+ { 18410, 7562, 3366, 2411, 878, 132, 7, 1, 1 },
+ { 18523, 7545, 3337, 2372, 855, 127, 7, 1, 1 },
+ { 18636, 7529, 3306, 2333, 833, 122, 7, 1, 1 },
+ { 18749, 7511, 3276, 2295, 811, 118, 6, 1, 1 },
+ { 18862, 7493, 3246, 2257, 789, 113, 6, 1, 1 },
+ { 18975, 7474, 3215, 2219, 768, 109, 6, 1, 1 },
+ { 19088, 7455, 3185, 2182, 747, 104, 5, 1, 1 },
+ { 19201, 7435, 3154, 2144, 727, 100, 5, 1, 1 },
+ { 19314, 7414, 3123, 2107, 707, 96, 5, 1, 1 },
+ { 19427, 7392, 3092, 2071, 687, 92, 5, 1, 1 },
+ { 19541, 7370, 3060, 2034, 668, 89, 4, 1, 1 },
+ { 19654, 7347, 3029, 1998, 649, 85, 4, 1, 1 },
+ { 19766, 7323, 2997, 1963, 631, 82, 4, 1, 1 },
+ { 19878, 7299, 2966, 1927, 613, 79, 4, 1, 1 },
+ { 19991, 7274, 2934, 1892, 596, 75, 4, 1, 1 },
+ { 20105, 7248, 2902, 1857, 579, 72, 3, 1, 1 },
+ { 20218, 7222, 2870, 1822, 562, 69, 3, 1, 1 },
+ { 20331, 7195, 2838, 1788, 545, 66, 3, 1, 1 },
+ { 20443, 7167, 2806, 1754, 529, 64, 3, 1, 1 },
+ { 20556, 7138, 2774, 1720, 514, 61, 3, 1, 1 },
+ { 20670, 7109, 2741, 1687, 498, 58, 3, 1, 1 },
+ { 20783, 7079, 2709, 1654, 483, 56, 2, 1, 1 },
+ { 20895, 7049, 2676, 1621, 469, 54, 2, 1, 1 },
+ { 21008, 7017, 2644, 1589, 455, 51, 2, 1, 1 },
+ { 21121, 6985, 2611, 1557, 441, 49, 2, 1, 1 },
+ { 21234, 6953, 2578, 1525, 427, 47, 2, 1, 1 },
+ { 21347, 6919, 2545, 1494, 414, 45, 2, 1, 1 },
+ { 21460, 6885, 2513, 1462, 401, 43, 2, 1, 1 },
+ { 21573, 6850, 2480, 1432, 388, 41, 2, 1, 1 },
+ { 21687, 6815, 2447, 1401, 375, 39, 2, 1, 1 },
+ { 21801, 6778, 2414, 1371, 363, 38, 1, 1, 1 },
+ { 21914, 6741, 2381, 1341, 352, 36, 1, 1, 1 },
+ { 22028, 6704, 2348, 1311, 340, 34, 1, 1, 1 },
+ { 22141, 6665, 2315, 1282, 329, 33, 1, 1, 1 },
+ { 22255, 6626, 2282, 1253, 318, 31, 1, 1, 1 },
+ { 22368, 6586, 2249, 1225, 307, 30, 1, 1, 1 },
+ { 22482, 6546, 2216, 1196, 297, 28, 1, 1, 1 },
+ { 22595, 6505, 2183, 1169, 286, 27, 1, 1, 1 },
+ { 22709, 6463, 2149, 1141, 277, 26, 1, 1, 1 },
+ { 22823, 6420, 2116, 1114, 267, 25, 1, 1, 1 },
+ { 22938, 6377, 2083, 1087, 257, 23, 1, 1, 1 },
+ { 23053, 6332, 2050, 1060, 248, 22, 1, 1, 1 },
+ { 23167, 6287, 2017, 1034, 239, 21, 1, 1, 1 },
+ { 23280, 6242, 1984, 1008, 231, 20, 1, 1, 1 },
+ { 23396, 6195, 1951, 982, 222, 19, 1, 1, 1 },
+ { 23510, 6148, 1918, 957, 214, 18, 1, 1, 1 },
+ { 23625, 6100, 1885, 932, 206, 17, 1, 1, 1 },
+ { 23741, 6051, 1852, 907, 198, 16, 1, 1, 1 },
+ { 23855, 6002, 1819, 883, 190, 16, 1, 1, 1 },
+ { 23971, 5951, 1786, 859, 183, 15, 1, 1, 1 },
+ { 24087, 5900, 1753, 835, 176, 14, 1, 1, 1 },
+ { 24203, 5848, 1720, 812, 169, 13, 1, 1, 1 },
+ { 24318, 5796, 1687, 789, 162, 13, 1, 1, 1 },
+ { 24435, 5742, 1655, 766, 155, 12, 1, 1, 1 },
+ { 24552, 5688, 1622, 743, 149, 11, 1, 1, 1 },
+ { 24669, 5632, 1589, 721, 143, 11, 1, 1, 1 },
+ { 24786, 5576, 1557, 699, 137, 10, 1, 1, 1 },
+ { 24903, 5519, 1524, 678, 131, 10, 1, 1, 1 },
+ { 25021, 5462, 1491, 657, 125, 9, 1, 1, 1 },
+ { 25139, 5403, 1459, 636, 120, 8, 1, 1, 1 },
+ { 25258, 5343, 1427, 615, 114, 8, 1, 1, 1 },
+ { 25376, 5283, 1394, 595, 109, 8, 1, 1, 1 },
+ { 25496, 5221, 1362, 575, 104, 7, 1, 1, 1 },
+ { 25614, 5159, 1330, 556, 99, 7, 1, 1, 1 },
+ { 25735, 5096, 1298, 536, 94, 6, 1, 1, 1 },
+ { 25856, 5031, 1265, 517, 90, 6, 1, 1, 1 },
+ { 25977, 4966, 1233, 499, 85, 5, 1, 1, 1 },
+ { 26098, 4899, 1202, 480, 81, 5, 1, 1, 1 },
+ { 26220, 4831, 1170, 462, 77, 5, 1, 1, 1 },
+ { 26343, 4763, 1138, 444, 73, 4, 1, 1, 1 },
+ { 26466, 4693, 1106, 427, 69, 4, 1, 1, 1 },
+ { 26589, 4622, 1075, 410, 65, 4, 1, 1, 1 },
+ { 26713, 4550, 1043, 393, 62, 4, 1, 1, 1 },
+ { 26840, 4476, 1012, 376, 58, 3, 1, 1, 1 },
+ { 26966, 4401, 980, 360, 55, 3, 1, 1, 1 },
+ { 27092, 4325, 949, 344, 52, 3, 1, 1, 1 },
+ { 27220, 4248, 918, 328, 48, 3, 1, 1, 1 },
+ { 27350, 4169, 886, 313, 45, 2, 1, 1, 1 },
+ { 27480, 4088, 855, 298, 42, 2, 1, 1, 1 },
+ { 27610, 4006, 824, 283, 40, 2, 1, 1, 1 },
+ { 27743, 3922, 793, 268, 37, 2, 1, 1, 1 },
+ { 27876, 3837, 762, 254, 34, 2, 1, 1, 1 },
+ { 28011, 3749, 731, 240, 32, 2, 1, 1, 1 },
+ { 28147, 3659, 701, 227, 30, 1, 1, 1, 1 },
+ { 28286, 3568, 670, 213, 27, 1, 1, 1, 1 },
+ { 28426, 3474, 639, 200, 25, 1, 1, 1, 1 },
+ { 28569, 3377, 608, 187, 23, 1, 1, 1, 1 },
+ { 28714, 3278, 577, 174, 21, 1, 1, 1, 1 },
+ { 28860, 3176, 547, 162, 19, 1, 1, 1, 1 },
+ { 29010, 3071, 516, 150, 17, 1, 1, 1, 1 },
+ { 29163, 2962, 485, 138, 16, 1, 1, 1, 1 },
+ { 29320, 2849, 454, 127, 14, 1, 1, 1, 1 },
+ { 29483, 2731, 423, 115, 12, 1, 1, 1, 1 },
+ { 29650, 2608, 391, 104, 11, 1, 1, 1, 1 },
+ { 29823, 2479, 360, 93, 9, 1, 1, 1, 1 },
+ { 30002, 2343, 328, 83, 8, 1, 1, 1, 1 },
+ { 30192, 2198, 295, 72, 7, 1, 1, 1, 1 },
+ { 30393, 2041, 262, 62, 6, 1, 1, 1, 1 },
+ { 30612, 1869, 227, 52, 4, 1, 1, 1, 1 },
+ { 30853, 1676, 191, 41, 3, 1, 1, 1, 1 },
+ { 31131, 1448, 152, 31, 2, 1, 1, 1, 1 },
+ { 31486, 1150, 107, 20, 1, 1, 1, 1, 1 },
+};
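+
+// Editorial sketch (not part of the aom sources): per the comment above
+// this table, the tail rows are approximately the full token pmf (the
+// av1_pareto8_token_probs rows in the CONFIG_EC_MULTISYMBOL branch below)
+// with the ONE_TOKEN entry dropped and the rest renormalized to 32768.
+// The generator, tools/gen_constrained_tokenset.py, applies its own
+// rounding, so this rough equivalent will not match every entry exactly.
+#if 0 /* illustrative only, never compiled */
+#include <stdint.h>
+static void drop_one_and_rescale(const aom_cdf_prob full[ENTROPY_TOKENS - 2],
+                                 aom_cdf_prob tail[TAIL_NODES]) {
+  const int remaining = 32768 - full[0];  // mass left after ONE_TOKEN
+  for (int i = 0; i < TAIL_NODES; ++i)
+    tail[i] = (aom_cdf_prob)(((int64_t)full[i + 1] * 32768 + remaining / 2) /
+                             remaining);
+}
+#endif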
+#elif CONFIG_EC_MULTISYMBOL
+// Values for tokens ONE_TOKEN through CATEGORY6_TOKEN are included here.
+// ZERO_TOKEN and EOB_TOKEN are coded as flags outside this coder.
+const aom_cdf_prob
+ av1_pareto8_token_probs[COEFF_PROB_MODELS][ENTROPY_TOKENS - 2] = {
+ { 128, 127, 127, 126, 251, 495, 965, 1832, 3305, 25412 },
+ { 256, 254, 252, 249, 492, 959, 1820, 3283, 5365, 19838 },
+ { 384, 379, 374, 369, 724, 1392, 2574, 4417, 6568, 15587 },
+ { 512, 503, 494, 486, 946, 1795, 3238, 5289, 7184, 12321 },
+ { 640, 626, 612, 599, 1159, 2172, 3818, 5946, 7401, 9795 },
+ { 768, 748, 728, 709, 1363, 2522, 4324, 6424, 7352, 7830 },
+ { 896, 869, 842, 816, 1559, 2847, 4762, 6755, 7131, 6291 },
+ { 1024, 988, 954, 921, 1747, 3148, 5139, 6966, 6803, 5078 },
+ { 1152, 1107, 1063, 1022, 1926, 3427, 5460, 7080, 6412, 4119 },
+ { 1280, 1224, 1171, 1120, 2098, 3685, 5730, 7113, 5991, 3356 },
+ { 1408, 1340, 1276, 1216, 2261, 3923, 5955, 7083, 5560, 2746 },
+ { 1536, 1455, 1380, 1308, 2418, 4142, 6140, 7001, 5133, 2255 },
+ { 1664, 1569, 1481, 1398, 2567, 4342, 6287, 6879, 4721, 1860 },
+ { 1792, 1683, 1580, 1485, 2709, 4525, 6401, 6725, 4329, 1539 },
+ { 1920, 1794, 1678, 1570, 2845, 4692, 6486, 6546, 3959, 1278 },
+ { 2048, 1905, 1773, 1651, 2974, 4844, 6543, 6350, 3615, 1065 },
+ { 2176, 2015, 1867, 1731, 3096, 4980, 6576, 6140, 3296, 891 },
+ { 2304, 2123, 1958, 1807, 3212, 5104, 6589, 5922, 3002, 747 },
+ { 2432, 2231, 2048, 1882, 3322, 5214, 6581, 5698, 2732, 628 },
+ { 2560, 2337, 2136, 1953, 3427, 5311, 6557, 5472, 2485, 530 },
+ { 2688, 2442, 2222, 2023, 3525, 5397, 6518, 5246, 2259, 448 },
+ { 2816, 2547, 2306, 2090, 3618, 5472, 6465, 5021, 2053, 380 },
+ { 2944, 2650, 2388, 2154, 3706, 5537, 6401, 4799, 1866, 323 },
+ { 3072, 2752, 2468, 2217, 3788, 5591, 6327, 4581, 1696, 276 },
+ { 3200, 2853, 2547, 2277, 3866, 5637, 6243, 4369, 1541, 235 },
+ { 3328, 2952, 2624, 2335, 3938, 5673, 6152, 4163, 1401, 202 },
+ { 3456, 3051, 2699, 2391, 4006, 5702, 6054, 3962, 1274, 173 },
+ { 3584, 3149, 2772, 2444, 4070, 5723, 5950, 3769, 1158, 149 },
+ { 3712, 3246, 2843, 2496, 4128, 5736, 5842, 3583, 1054, 128 },
+ { 3840, 3341, 2913, 2545, 4183, 5743, 5729, 3404, 959, 111 },
+ { 3968, 3436, 2981, 2593, 4233, 5743, 5614, 3232, 872, 96 },
+ { 4096, 3529, 3048, 2638, 4280, 5737, 5496, 3067, 794, 83 },
+ { 4224, 3621, 3113, 2682, 4322, 5726, 5375, 2909, 724, 72 },
+ { 4352, 3712, 3176, 2724, 4361, 5709, 5253, 2759, 659, 63 },
+ { 4480, 3803, 3237, 2764, 4396, 5687, 5130, 2615, 601, 55 },
+ { 4608, 3892, 3297, 2801, 4428, 5661, 5007, 2478, 548, 48 },
+ { 4736, 3980, 3355, 2838, 4456, 5631, 4883, 2347, 500, 42 },
+ { 4864, 4067, 3412, 2872, 4481, 5596, 4760, 2223, 456, 37 },
+ { 4992, 4152, 3467, 2905, 4503, 5558, 4637, 2105, 417, 32 },
+ { 5120, 4237, 3521, 2936, 4521, 5516, 4515, 1993, 381, 28 },
+ { 5248, 4321, 3573, 2966, 4537, 5471, 4393, 1886, 348, 25 },
+ { 5376, 4404, 3623, 2993, 4550, 5424, 4273, 1785, 318, 22 },
+ { 5504, 4486, 3672, 3020, 4560, 5373, 4155, 1688, 291, 19 },
+ { 5632, 4566, 3720, 3044, 4568, 5321, 4037, 1597, 266, 17 },
+ { 5760, 4646, 3766, 3067, 4572, 5265, 3922, 1511, 244, 15 },
+ { 5888, 4724, 3811, 3089, 4575, 5208, 3808, 1429, 223, 13 },
+ { 6016, 4802, 3854, 3109, 4575, 5148, 3696, 1352, 204, 12 },
+ { 6144, 4878, 3895, 3128, 4573, 5088, 3587, 1278, 187, 10 },
+ { 6272, 4953, 3936, 3145, 4568, 5025, 3479, 1209, 172, 9 },
+ { 6400, 5028, 3975, 3161, 4561, 4961, 3373, 1143, 158, 8 },
+ { 6528, 5101, 4012, 3175, 4553, 4896, 3270, 1081, 145, 7 },
+ { 6656, 5173, 4048, 3189, 4542, 4830, 3168, 1022, 133, 7 },
+ { 6784, 5244, 4083, 3201, 4530, 4763, 3069, 966, 122, 6 },
+ { 6912, 5314, 4117, 3212, 4516, 4694, 2973, 913, 112, 5 },
+ { 7040, 5383, 4149, 3221, 4500, 4626, 2878, 863, 103, 5 },
+ { 7168, 5452, 4180, 3229, 4482, 4556, 2786, 816, 95, 4 },
+ { 7296, 5519, 4210, 3236, 4463, 4486, 2696, 771, 87, 4 },
+ { 7424, 5585, 4238, 3242, 4442, 4416, 2609, 729, 80, 3 },
+ { 7552, 5650, 4265, 3247, 4420, 4345, 2523, 689, 74, 3 },
+ { 7680, 5714, 4291, 3251, 4396, 4274, 2440, 651, 68, 3 },
+ { 7808, 5777, 4315, 3254, 4371, 4203, 2359, 616, 63, 2 },
+ { 7936, 5838, 4339, 3255, 4345, 4132, 2281, 582, 58, 2 },
+ { 8064, 5899, 4361, 3256, 4318, 4061, 2204, 550, 53, 2 },
+ { 8192, 5959, 4382, 3255, 4289, 3990, 2130, 520, 49, 2 },
+ { 8320, 6018, 4402, 3254, 4259, 3919, 2057, 492, 45, 2 },
+ { 8448, 6075, 4421, 3252, 4229, 3848, 1987, 465, 42, 1 },
+ { 8576, 6133, 4438, 3248, 4197, 3778, 1919, 439, 39, 1 },
+ { 8704, 6188, 4455, 3244, 4164, 3708, 1853, 415, 36, 1 },
+ { 8832, 6243, 4470, 3239, 4131, 3638, 1789, 392, 33, 1 },
+ { 8960, 6297, 4484, 3233, 4096, 3569, 1727, 371, 30, 1 },
+ { 9088, 6349, 4497, 3226, 4061, 3500, 1667, 351, 28, 1 },
+ { 9216, 6401, 4509, 3219, 4025, 3432, 1608, 331, 26, 1 },
+ { 9344, 6452, 4520, 3210, 3989, 3364, 1551, 313, 24, 1 },
+ { 9472, 6501, 4530, 3201, 3952, 3297, 1496, 296, 22, 1 },
+ { 9600, 6550, 4539, 3191, 3914, 3230, 1443, 280, 20, 1 },
+ { 9728, 6597, 4547, 3180, 3875, 3164, 1392, 265, 19, 1 },
+ { 9856, 6644, 4554, 3169, 3836, 3098, 1342, 250, 18, 1 },
+ { 9984, 6690, 4560, 3157, 3796, 3034, 1293, 237, 16, 1 },
+ { 10112, 6734, 4565, 3144, 3756, 2970, 1247, 224, 15, 1 },
+ { 10240, 6778, 4568, 3131, 3716, 2907, 1202, 211, 14, 1 },
+ { 10368, 6821, 4571, 3117, 3675, 2844, 1158, 200, 13, 1 },
+ { 10496, 6862, 4573, 3102, 3634, 2783, 1116, 189, 12, 1 },
+ { 10624, 6903, 4574, 3087, 3592, 2722, 1075, 179, 11, 1 },
+ { 10752, 6942, 4575, 3071, 3551, 2662, 1035, 169, 10, 1 },
+ { 10880, 6981, 4574, 3054, 3508, 2603, 997, 160, 10, 1 },
+ { 11008, 7019, 4572, 3038, 3466, 2544, 960, 151, 9, 1 },
+ { 11136, 7055, 4570, 3020, 3424, 2487, 924, 143, 8, 1 },
+ { 11264, 7091, 4566, 3002, 3381, 2430, 890, 135, 8, 1 },
+ { 11392, 7126, 4563, 2984, 3338, 2374, 856, 127, 7, 1 },
+ { 11520, 7159, 4557, 2965, 3295, 2319, 824, 121, 7, 1 },
+ { 11648, 7193, 4552, 2945, 3252, 2264, 793, 114, 6, 1 },
+ { 11776, 7224, 4545, 2925, 3209, 2211, 763, 108, 6, 1 },
+ { 11904, 7255, 4538, 2905, 3165, 2159, 734, 102, 5, 1 },
+ { 12032, 7285, 4530, 2884, 3122, 2107, 706, 96, 5, 1 },
+ { 12160, 7314, 4520, 2863, 3079, 2056, 679, 91, 5, 1 },
+ { 12288, 7341, 4511, 2842, 3036, 2006, 653, 86, 4, 1 },
+ { 12416, 7368, 4500, 2820, 2993, 1957, 628, 81, 4, 1 },
+ { 12544, 7394, 4489, 2797, 2949, 1909, 604, 77, 4, 1 },
+ { 12672, 7419, 4477, 2775, 2906, 1861, 581, 73, 3, 1 },
+ { 12800, 7443, 4464, 2752, 2863, 1815, 558, 69, 3, 1 },
+ { 12928, 7466, 4451, 2729, 2820, 1769, 536, 65, 3, 1 },
+ { 13056, 7488, 4437, 2705, 2777, 1724, 516, 61, 3, 1 },
+ { 13184, 7509, 4422, 2682, 2734, 1680, 495, 58, 3, 1 },
+ { 13312, 7529, 4406, 2658, 2692, 1637, 476, 55, 2, 1 },
+ { 13440, 7548, 4390, 2633, 2650, 1595, 457, 52, 2, 1 },
+ { 13568, 7567, 4373, 2609, 2607, 1553, 439, 49, 2, 1 },
+ { 13696, 7583, 4356, 2584, 2565, 1513, 422, 46, 2, 1 },
+ { 13824, 7600, 4337, 2559, 2523, 1473, 405, 44, 2, 1 },
+ { 13952, 7615, 4319, 2533, 2482, 1434, 389, 41, 2, 1 },
+ { 14080, 7629, 4300, 2508, 2441, 1395, 373, 39, 2, 1 },
+ { 14208, 7643, 4280, 2482, 2400, 1358, 358, 37, 1, 1 },
+ { 14336, 7655, 4259, 2457, 2359, 1321, 344, 35, 1, 1 },
+ { 14464, 7667, 4238, 2431, 2318, 1285, 330, 33, 1, 1 },
+ { 14592, 7677, 4217, 2405, 2278, 1250, 316, 31, 1, 1 },
+ { 14720, 7687, 4195, 2378, 2238, 1215, 304, 29, 1, 1 },
+ { 14848, 7696, 4172, 2352, 2198, 1181, 291, 28, 1, 1 },
+ { 14976, 7703, 4149, 2326, 2159, 1148, 279, 26, 1, 1 },
+ { 15104, 7710, 4125, 2299, 2119, 1116, 268, 25, 1, 1 },
+ { 15232, 7715, 4101, 2272, 2081, 1085, 257, 23, 1, 1 },
+ { 15360, 7721, 4076, 2245, 2042, 1054, 246, 22, 1, 1 },
+ { 15488, 7724, 4051, 2219, 2004, 1023, 236, 21, 1, 1 },
+ { 15616, 7727, 4025, 2192, 1966, 994, 226, 20, 1, 1 },
+ { 15744, 7729, 3999, 2164, 1929, 965, 217, 19, 1, 1 },
+ { 15872, 7731, 3972, 2137, 1892, 937, 207, 18, 1, 1 },
+ { 16000, 7731, 3945, 2110, 1855, 909, 199, 17, 1, 1 },
+ { 16128, 7730, 3918, 2083, 1819, 882, 190, 16, 1, 1 },
+ { 16256, 7728, 3890, 2056, 1783, 856, 182, 15, 1, 1 },
+ { 16384, 7725, 3862, 2029, 1747, 831, 174, 14, 1, 1 },
+ { 16512, 7721, 3833, 2002, 1712, 806, 167, 13, 1, 1 },
+ { 16640, 7717, 3804, 1975, 1677, 781, 160, 12, 1, 1 },
+ { 16768, 7712, 3775, 1947, 1642, 757, 153, 12, 1, 1 },
+ { 16896, 7706, 3745, 1920, 1608, 734, 146, 11, 1, 1 },
+ { 17024, 7699, 3714, 1893, 1575, 711, 140, 10, 1, 1 },
+ { 17152, 7690, 3684, 1866, 1541, 689, 134, 10, 1, 1 },
+ { 17280, 7681, 3653, 1839, 1508, 668, 128, 9, 1, 1 },
+ { 17408, 7671, 3621, 1812, 1476, 647, 122, 9, 1, 1 },
+ { 17536, 7660, 3590, 1785, 1444, 626, 117, 8, 1, 1 },
+ { 17664, 7648, 3558, 1758, 1412, 606, 112, 8, 1, 1 },
+ { 17792, 7635, 3526, 1731, 1381, 587, 107, 7, 1, 1 },
+ { 17920, 7622, 3493, 1704, 1350, 568, 102, 7, 1, 1 },
+ { 18048, 7607, 3461, 1678, 1319, 549, 98, 6, 1, 1 },
+ { 18176, 7592, 3428, 1651, 1289, 531, 93, 6, 1, 1 },
+ { 18304, 7575, 3394, 1625, 1259, 514, 89, 6, 1, 1 },
+ { 18432, 7558, 3361, 1598, 1230, 497, 85, 5, 1, 1 },
+ { 18560, 7540, 3327, 1572, 1201, 480, 81, 5, 1, 1 },
+ { 18688, 7520, 3293, 1546, 1173, 464, 77, 5, 1, 1 },
+ { 18816, 7500, 3258, 1520, 1145, 448, 74, 5, 1, 1 },
+ { 18944, 7480, 3224, 1494, 1117, 433, 70, 4, 1, 1 },
+ { 19072, 7458, 3189, 1468, 1090, 418, 67, 4, 1, 1 },
+ { 19200, 7435, 3154, 1442, 1063, 404, 64, 4, 1, 1 },
+ { 19328, 7410, 3119, 1417, 1037, 390, 61, 4, 1, 1 },
+ { 19456, 7386, 3084, 1392, 1011, 376, 58, 3, 1, 1 },
+ { 19584, 7361, 3048, 1366, 986, 363, 55, 3, 1, 1 },
+ { 19712, 7335, 3012, 1341, 960, 350, 53, 3, 1, 1 },
+ { 19840, 7307, 2977, 1316, 936, 337, 50, 3, 1, 1 },
+ { 19968, 7279, 2941, 1291, 911, 325, 48, 3, 1, 1 },
+ { 20096, 7251, 2905, 1267, 887, 313, 45, 2, 1, 1 },
+ { 20224, 7220, 2868, 1243, 864, 302, 43, 2, 1, 1 },
+ { 20352, 7189, 2832, 1218, 841, 291, 41, 2, 1, 1 },
+ { 20480, 7158, 2795, 1194, 818, 280, 39, 2, 1, 1 },
+ { 20608, 7124, 2759, 1170, 796, 270, 37, 2, 1, 1 },
+ { 20736, 7091, 2722, 1147, 774, 259, 35, 2, 1, 1 },
+ { 20864, 7056, 2685, 1123, 752, 250, 34, 2, 1, 1 },
+ { 20992, 7021, 2648, 1100, 731, 240, 32, 2, 1, 1 },
+ { 21120, 6985, 2612, 1077, 710, 231, 30, 1, 1, 1 },
+ { 21248, 6948, 2574, 1054, 690, 222, 29, 1, 1, 1 },
+ { 21376, 6911, 2537, 1031, 670, 213, 27, 1, 1, 1 },
+ { 21504, 6872, 2500, 1008, 650, 205, 26, 1, 1, 1 },
+ { 21632, 6831, 2463, 986, 631, 197, 25, 1, 1, 1 },
+ { 21760, 6791, 2426, 964, 612, 189, 23, 1, 1, 1 },
+ { 21888, 6749, 2389, 942, 594, 181, 22, 1, 1, 1 },
+ { 22016, 6707, 2351, 921, 575, 174, 21, 1, 1, 1 },
+ { 22144, 6663, 2314, 899, 558, 167, 20, 1, 1, 1 },
+ { 22272, 6619, 2277, 878, 540, 160, 19, 1, 1, 1 },
+ { 22400, 6574, 2240, 857, 523, 153, 18, 1, 1, 1 },
+ { 22528, 6529, 2202, 836, 507, 146, 17, 1, 1, 1 },
+ { 22656, 6482, 2165, 816, 490, 140, 16, 1, 1, 1 },
+ { 22784, 6435, 2128, 795, 474, 134, 15, 1, 1, 1 },
+ { 22912, 6386, 2091, 775, 459, 128, 14, 1, 1, 1 },
+ { 23040, 6336, 2054, 756, 443, 123, 13, 1, 1, 1 },
+ { 23168, 6286, 2017, 736, 428, 117, 13, 1, 1, 1 },
+ { 23296, 6234, 1980, 717, 414, 112, 12, 1, 1, 1 },
+ { 23424, 6183, 1943, 698, 399, 107, 11, 1, 1, 1 },
+ { 23552, 6130, 1906, 679, 385, 102, 11, 1, 1, 1 },
+ { 23680, 6077, 1869, 660, 372, 97, 10, 1, 1, 1 },
+ { 23808, 6022, 1833, 642, 358, 93, 9, 1, 1, 1 },
+ { 23936, 5966, 1796, 624, 345, 89, 9, 1, 1, 1 },
+ { 24064, 5910, 1760, 606, 333, 84, 8, 1, 1, 1 },
+ { 24192, 5853, 1724, 588, 320, 80, 8, 1, 1, 1 },
+ { 24320, 5796, 1687, 571, 308, 76, 7, 1, 1, 1 },
+ { 24448, 5735, 1651, 554, 297, 73, 7, 1, 1, 1 },
+ { 24576, 5677, 1615, 537, 285, 69, 6, 1, 1, 1 },
+ { 24704, 5615, 1579, 521, 274, 66, 6, 1, 1, 1 },
+ { 24832, 5554, 1544, 504, 263, 62, 6, 1, 1, 1 },
+ { 24960, 5492, 1508, 488, 253, 59, 5, 1, 1, 1 },
+ { 25088, 5428, 1473, 473, 242, 56, 5, 1, 1, 1 },
+ { 25216, 5364, 1438, 457, 232, 53, 5, 1, 1, 1 },
+ { 25344, 5300, 1403, 442, 222, 50, 4, 1, 1, 1 },
+ { 25472, 5233, 1368, 427, 213, 48, 4, 1, 1, 1 },
+ { 25600, 5166, 1334, 412, 204, 45, 4, 1, 1, 1 },
+ { 25728, 5098, 1299, 398, 195, 43, 4, 1, 1, 1 },
+ { 25856, 5030, 1266, 384, 186, 40, 3, 1, 1, 1 },
+ { 25984, 4960, 1232, 370, 178, 38, 3, 1, 1, 1 },
+ { 26112, 4890, 1198, 356, 170, 36, 3, 1, 1, 1 },
+ { 26240, 4819, 1164, 343, 162, 34, 3, 1, 1, 1 },
+ { 26368, 4748, 1132, 329, 154, 32, 2, 1, 1, 1 },
+ { 26496, 4675, 1098, 317, 147, 30, 2, 1, 1, 1 },
+ { 26624, 4602, 1066, 304, 139, 28, 2, 1, 1, 1 },
+ { 26752, 4527, 1034, 292, 132, 26, 2, 1, 1, 1 },
+ { 26880, 4451, 1001, 280, 126, 25, 2, 1, 1, 1 },
+ { 27008, 4375, 970, 268, 119, 23, 2, 1, 1, 1 },
+ { 27136, 4299, 938, 256, 113, 21, 2, 1, 1, 1 },
+ { 27264, 4221, 907, 245, 107, 20, 1, 1, 1, 1 },
+ { 27392, 4142, 876, 234, 101, 19, 1, 1, 1, 1 },
+ { 27520, 4063, 846, 223, 95, 17, 1, 1, 1, 1 },
+ { 27648, 3982, 815, 213, 90, 16, 1, 1, 1, 1 },
+ { 27776, 3900, 786, 202, 85, 15, 1, 1, 1, 1 },
+ { 27904, 3818, 756, 192, 80, 14, 1, 1, 1, 1 },
+ { 28032, 3734, 727, 183, 75, 13, 1, 1, 1, 1 },
+ { 28160, 3651, 698, 173, 70, 12, 1, 1, 1, 1 },
+ { 28288, 3566, 669, 164, 66, 11, 1, 1, 1, 1 },
+ { 28416, 3481, 641, 155, 61, 10, 1, 1, 1, 1 },
+ { 28544, 3393, 614, 147, 57, 9, 1, 1, 1, 1 },
+ { 28672, 3306, 586, 138, 53, 9, 1, 1, 1, 1 },
+ { 28800, 3217, 559, 130, 50, 8, 1, 1, 1, 1 },
+ { 28928, 3128, 533, 122, 46, 7, 1, 1, 1, 1 },
+ { 29056, 3037, 507, 114, 43, 7, 1, 1, 1, 1 },
+ { 29184, 2947, 481, 107, 39, 6, 1, 1, 1, 1 },
+ { 29312, 2855, 456, 100, 36, 5, 1, 1, 1, 1 },
+ { 29440, 2762, 431, 93, 33, 5, 1, 1, 1, 1 },
+ { 29568, 2668, 407, 86, 31, 4, 1, 1, 1, 1 },
+ { 29696, 2573, 383, 80, 28, 4, 1, 1, 1, 1 },
+ { 29824, 2478, 359, 74, 25, 4, 1, 1, 1, 1 },
+ { 29952, 2381, 337, 68, 23, 3, 1, 1, 1, 1 },
+ { 30080, 2284, 314, 62, 21, 3, 1, 1, 1, 1 },
+ { 30208, 2185, 293, 57, 19, 2, 1, 1, 1, 1 },
+ { 30336, 2086, 271, 52, 17, 2, 1, 1, 1, 1 },
+ { 30464, 1986, 250, 47, 15, 2, 1, 1, 1, 1 },
+ { 30592, 1885, 230, 42, 13, 2, 1, 1, 1, 1 },
+ { 30720, 1782, 211, 38, 12, 1, 1, 1, 1, 1 },
+ { 30848, 1679, 192, 34, 10, 1, 1, 1, 1, 1 },
+ { 30976, 1575, 173, 30, 9, 1, 1, 1, 1, 1 },
+ { 31104, 1469, 156, 26, 8, 1, 1, 1, 1, 1 },
+ { 31232, 1364, 138, 23, 6, 1, 1, 1, 1, 1 },
+ { 31360, 1257, 122, 19, 5, 1, 1, 1, 1, 1 },
+ { 31488, 1149, 106, 16, 4, 1, 1, 1, 1, 1 },
+ { 31616, 1038, 91, 14, 4, 1, 1, 1, 1, 1 },
+ { 31744, 928, 77, 11, 3, 1, 1, 1, 1, 1 },
+ { 31872, 816, 64, 9, 2, 1, 1, 1, 1, 1 },
+ { 32000, 703, 51, 7, 2, 1, 1, 1, 1, 1 },
+ { 32128, 589, 40, 5, 1, 1, 1, 1, 1, 1 },
+ { 32256, 473, 29, 4, 1, 1, 1, 1, 1, 1 },
+ { 32384, 357, 19, 2, 1, 1, 1, 1, 1, 1 },
+ { 32512, 238, 11, 1, 1, 1, 1, 1, 1, 1 },
+ { 32640, 117, 4, 1, 1, 1, 1, 1, 1, 1 },
+ };
+#endif // CONFIG_NEW_TOKENSET
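+
+// Editorial sketch (not part of the aom sources): each row of the token
+// probability tables above (whichever branch is compiled) is a probability
+// mass function that sums to 32768 (1 << 15). One plausible way a
+// multisymbol coder consumes such a row is as a cumulative table; the
+// exact CDF convention of the aom entropy coder is not shown in this file.
+#if 0 /* illustrative only, never compiled */
+static void pmf_to_cdf(const aom_cdf_prob *pmf, aom_cdf_prob *cdf, int n) {
+  aom_cdf_prob sum = 0;
+  for (int i = 0; i < n; ++i) {
+    sum += pmf[i];  // running total; cdf[n - 1] ends at 32768
+    cdf[i] = sum;
+  }
+}
+#endif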
+
+/* clang-format off */
+#if CONFIG_Q_ADAPT_PROBS
+const av1_coeff_probs_model
+default_qctx_coef_probs[QCTX_BINS][TX_SIZES][PLANE_TYPES] = {
+ { // Q_Index 0
+#if CONFIG_CB4X4
+ { // TX_SIZE 0 (extra entry when CONFIG_CB4X4 is enabled; same values as below)
+ { // Y plane
+ { // Intra
+ { // band 0
+ {182, 34, 137}, { 79, 39, 103}, { 10, 28, 51},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 45, 88, 147}, { 46, 80, 140}, { 25, 69, 119},
+ { 12, 57, 96}, { 4, 41, 65}, { 1, 20, 31},
+ },
+ { // band 2
+ { 58, 124, 190}, { 39, 106, 178}, { 16, 86, 147},
+ { 7, 69, 114}, { 3, 50, 80}, { 1, 25, 42},
+ },
+ { // band 3
+ { 90, 138, 215}, { 54, 116, 198}, { 18, 86, 155},
+ { 5, 62, 112}, { 1, 38, 68}, { 1, 17, 30},
+ },
+ { // band 4
+ {126, 149, 231}, { 82, 114, 211}, { 21, 80, 157},
+ { 6, 56, 105}, { 1, 36, 64}, { 1, 17, 31},
+ },
+ { // band 5
+ {171, 56, 236}, {140, 54, 219}, { 57, 45, 167},
+ { 26, 36, 113}, { 11, 29, 72}, { 3, 18, 39},
+ },
+ },
+ { // Inter
+ { // band 0
+ {153, 122, 186}, {106, 109, 171}, { 36, 84, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 27, 151, 201}, { 34, 131, 199}, { 23, 102, 161},
+ { 10, 80, 120}, { 4, 52, 78}, { 1, 24, 37},
+ },
+ { // band 2
+ { 43, 158, 213}, { 35, 133, 203}, { 8, 92, 151},
+ { 2, 64, 106}, { 1, 36, 60}, { 1, 13, 24},
+ },
+ { // band 3
+ { 68, 167, 223}, { 36, 135, 211}, { 9, 94, 157},
+ { 2, 67, 112}, { 1, 40, 68}, { 1, 17, 31},
+ },
+ { // band 4
+ {131, 146, 237}, { 72, 119, 223}, { 17, 82, 164},
+ { 4, 55, 107}, { 1, 34, 63}, { 1, 16, 29},
+ },
+ { // band 5
+ {184, 68, 244}, {153, 59, 232}, { 68, 51, 179},
+ { 31, 40, 123}, { 13, 29, 77}, { 4, 17, 37},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {203, 41, 203}, {127, 56, 174}, { 49, 56, 127},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {110, 121, 217}, {119, 113, 213}, { 64, 95, 185},
+ { 30, 72, 144}, { 8, 42, 76}, { 2, 17, 25},
+ },
+ { // band 2
+ {127, 159, 229}, {115, 134, 223}, { 36, 100, 189},
+ { 11, 75, 142}, { 3, 48, 83}, { 1, 19, 33},
+ },
+ { // band 3
+ {150, 172, 241}, { 90, 133, 231}, { 28, 102, 192},
+ { 7, 81, 147}, { 1, 53, 91}, { 1, 25, 42},
+ },
+ { // band 4
+ {184, 144, 248}, {114, 117, 237}, { 37, 89, 192},
+ { 10, 63, 130}, { 4, 42, 76}, { 1, 19, 38},
+ },
+ { // band 5
+ {207, 79, 250}, {179, 74, 241}, { 83, 67, 199},
+ { 38, 51, 142}, { 17, 37, 97}, { 10, 14, 55},
+ },
+ },
+ { // Inter
+ { // band 0
+ {220, 82, 232}, {150, 93, 214}, { 66, 95, 177},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {116, 160, 227}, {136, 141, 227}, { 67, 114, 190},
+ { 40, 94, 148}, { 21, 70, 107}, { 10, 43, 63},
+ },
+ { // band 2
+ {124, 173, 235}, {105, 147, 226}, { 27, 107, 184},
+ { 10, 80, 142}, { 3, 50, 86}, { 1, 16, 32},
+ },
+ { // band 3
+ {149, 179, 243}, { 89, 147, 234}, { 29, 112, 193},
+ { 9, 94, 157}, { 1, 64, 111}, { 1, 25, 43},
+ },
+ { // band 4
+ {187, 153, 248}, {127, 130, 241}, { 52, 99, 202},
+ { 20, 79, 152}, { 4, 50, 93}, { 1, 19, 32},
+ },
+ { // band 5
+ {215, 82, 251}, {195, 80, 246}, { 93, 70, 204},
+ { 39, 54, 147}, { 14, 33, 88}, { 6, 14, 39},
+ },
+ },
+ },
+ },
+#endif  // CONFIG_CB4X4
+ { // TX_SIZE 0
+ { // Y plane
+ { // Intra
+ { // band 0
+ {182, 34, 137}, { 79, 39, 103}, { 10, 28, 51},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 45, 88, 147}, { 46, 80, 140}, { 25, 69, 119},
+ { 12, 57, 96}, { 4, 41, 65}, { 1, 20, 31},
+ },
+ { // band 2
+ { 58, 124, 190}, { 39, 106, 178}, { 16, 86, 147},
+ { 7, 69, 114}, { 3, 50, 80}, { 1, 25, 42},
+ },
+ { // band 3
+ { 90, 138, 215}, { 54, 116, 198}, { 18, 86, 155},
+ { 5, 62, 112}, { 1, 38, 68}, { 1, 17, 30},
+ },
+ { // band 4
+ {126, 149, 231}, { 82, 114, 211}, { 21, 80, 157},
+ { 6, 56, 105}, { 1, 36, 64}, { 1, 17, 31},
+ },
+ { // band 5
+ {171, 56, 236}, {140, 54, 219}, { 57, 45, 167},
+ { 26, 36, 113}, { 11, 29, 72}, { 3, 18, 39},
+ },
+ },
+ { // Inter
+ { // band 0
+ {153, 122, 186}, {106, 109, 171}, { 36, 84, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 27, 151, 201}, { 34, 131, 199}, { 23, 102, 161},
+ { 10, 80, 120}, { 4, 52, 78}, { 1, 24, 37},
+ },
+ { // band 2
+ { 43, 158, 213}, { 35, 133, 203}, { 8, 92, 151},
+ { 2, 64, 106}, { 1, 36, 60}, { 1, 13, 24},
+ },
+ { // band 3
+ { 68, 167, 223}, { 36, 135, 211}, { 9, 94, 157},
+ { 2, 67, 112}, { 1, 40, 68}, { 1, 17, 31},
+ },
+ { // band 4
+ {131, 146, 237}, { 72, 119, 223}, { 17, 82, 164},
+ { 4, 55, 107}, { 1, 34, 63}, { 1, 16, 29},
+ },
+ { // band 5
+ {184, 68, 244}, {153, 59, 232}, { 68, 51, 179},
+ { 31, 40, 123}, { 13, 29, 77}, { 4, 17, 37},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {203, 41, 203}, {127, 56, 174}, { 49, 56, 127},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {110, 121, 217}, {119, 113, 213}, { 64, 95, 185},
+ { 30, 72, 144}, { 8, 42, 76}, { 2, 17, 25},
+ },
+ { // band 2
+ {127, 159, 229}, {115, 134, 223}, { 36, 100, 189},
+ { 11, 75, 142}, { 3, 48, 83}, { 1, 19, 33},
+ },
+ { // band 3
+ {150, 172, 241}, { 90, 133, 231}, { 28, 102, 192},
+ { 7, 81, 147}, { 1, 53, 91}, { 1, 25, 42},
+ },
+ { // band 4
+ {184, 144, 248}, {114, 117, 237}, { 37, 89, 192},
+ { 10, 63, 130}, { 4, 42, 76}, { 1, 19, 38},
+ },
+ { // band 5
+ {207, 79, 250}, {179, 74, 241}, { 83, 67, 199},
+ { 38, 51, 142}, { 17, 37, 97}, { 10, 14, 55},
+ },
+ },
+ { // Inter
+ { // band 0
+ {220, 82, 232}, {150, 93, 214}, { 66, 95, 177},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {116, 160, 227}, {136, 141, 227}, { 67, 114, 190},
+ { 40, 94, 148}, { 21, 70, 107}, { 10, 43, 63},
+ },
+ { // band 2
+ {124, 173, 235}, {105, 147, 226}, { 27, 107, 184},
+ { 10, 80, 142}, { 3, 50, 86}, { 1, 16, 32},
+ },
+ { // band 3
+ {149, 179, 243}, { 89, 147, 234}, { 29, 112, 193},
+ { 9, 94, 157}, { 1, 64, 111}, { 1, 25, 43},
+ },
+ { // band 4
+ {187, 153, 248}, {127, 130, 241}, { 52, 99, 202},
+ { 20, 79, 152}, { 4, 50, 93}, { 1, 19, 32},
+ },
+ { // band 5
+ {215, 82, 251}, {195, 80, 246}, { 93, 70, 204},
+ { 39, 54, 147}, { 14, 33, 88}, { 6, 14, 39},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 1
+ { // Y plane
+ { // Intra
+ { // band 0
+ {116, 43, 131}, { 39, 41, 94}, { 4, 28, 47},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 28, 101, 141}, { 27, 95, 140}, { 18, 80, 121},
+ { 10, 61, 95}, { 4, 39, 60}, { 1, 19, 26},
+ },
+ { // band 2
+ { 29, 150, 183}, { 19, 127, 175}, { 8, 98, 147},
+ { 3, 76, 115}, { 1, 55, 84}, { 1, 29, 43},
+ },
+ { // band 3
+ { 26, 168, 202}, { 12, 138, 188}, { 2, 98, 149},
+ { 1, 69, 110}, { 1, 40, 65}, { 1, 17, 25},
+ },
+ { // band 4
+ { 33, 188, 225}, { 12, 155, 207}, { 2, 101, 155},
+ { 1, 65, 106}, { 1, 36, 60}, { 1, 18, 26},
+ },
+ { // band 5
+ { 79, 205, 242}, { 30, 168, 224}, { 5, 106, 164},
+ { 1, 68, 110}, { 1, 39, 65}, { 1, 18, 28},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 96, 80, 201}, { 51, 88, 168}, { 14, 78, 116},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 6, 167, 216}, { 32, 152, 211}, { 24, 121, 182},
+ { 13, 98, 149}, { 12, 76, 108}, { 8, 48, 62},
+ },
+ { // band 2
+ { 17, 176, 225}, { 13, 147, 209}, { 3, 96, 155},
+ { 1, 65, 108}, { 2, 43, 63}, { 2, 23, 25},
+ },
+ { // band 3
+ { 18, 183, 232}, { 10, 153, 214}, { 1, 96, 154},
+ { 1, 63, 105}, { 1, 39, 59}, { 1, 21, 24},
+ },
+ { // band 4
+ { 23, 191, 239}, { 8, 159, 221}, { 1, 97, 158},
+ { 1, 61, 105}, { 1, 37, 60}, { 1, 20, 26},
+ },
+ { // band 5
+ { 70, 201, 243}, { 29, 163, 228}, { 4, 102, 169},
+ { 1, 67, 114}, { 1, 39, 66}, { 1, 17, 29},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {181, 38, 192}, { 95, 47, 151}, { 29, 49, 102},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 72, 131, 202}, { 93, 120, 205}, { 50, 103, 179},
+ { 24, 79, 143}, { 11, 47, 78}, { 7, 19, 25},
+ },
+ { // band 2
+ { 84, 176, 221}, { 56, 144, 214}, { 21, 108, 182},
+ { 8, 83, 139}, { 3, 55, 90}, { 2, 27, 41},
+ },
+ { // band 3
+ { 84, 195, 234}, { 42, 156, 222}, { 10, 109, 180},
+ { 4, 77, 133}, { 1, 48, 80}, { 1, 23, 35},
+ },
+ { // band 4
+ { 89, 210, 238}, { 35, 165, 221}, { 6, 106, 172},
+ { 2, 70, 123}, { 1, 44, 74}, { 1, 21, 30},
+ },
+ { // band 5
+ {114, 221, 247}, { 49, 170, 234}, { 7, 113, 184},
+ { 2, 77, 132}, { 1, 48, 79}, { 1, 25, 33},
+ },
+ },
+ { // Inter
+ { // band 0
+ {192, 66, 237}, {113, 84, 211}, { 35, 84, 154},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 81, 180, 234}, {127, 165, 229}, { 58, 137, 204},
+ { 41, 114, 174}, { 44, 94, 136}, { 29, 66, 86},
+ },
+ { // band 2
+ { 82, 193, 240}, { 39, 162, 223}, { 8, 113, 179},
+ { 3, 83, 136}, { 6, 62, 84}, { 5, 45, 45},
+ },
+ { // band 3
+ { 78, 203, 242}, { 31, 170, 227}, { 4, 115, 181},
+ { 1, 82, 135}, { 2, 59, 82}, { 1, 45, 47},
+ },
+ { // band 4
+ { 76, 210, 239}, { 25, 170, 213}, { 2, 99, 152},
+ { 1, 69, 115}, { 1, 49, 80}, { 1, 47, 57},
+ },
+ { // band 5
+ {103, 217, 250}, { 42, 180, 237}, { 3, 124, 191},
+ { 1, 90, 150}, { 1, 69, 116}, { 1, 52, 46},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 2
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 58, 38, 99}, { 9, 26, 51}, { 1, 14, 22},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 14, 78, 109}, { 16, 73, 105}, { 11, 62, 92},
+ { 6, 47, 72}, { 2, 29, 45}, { 1, 12, 18},
+ },
+ { // band 2
+ { 17, 131, 148}, { 11, 112, 140}, { 5, 87, 118},
+ { 2, 63, 90}, { 1, 42, 63}, { 1, 19, 31},
+ },
+ { // band 3
+ { 12, 151, 168}, { 6, 116, 152}, { 1, 76, 115},
+ { 1, 50, 81}, { 1, 32, 52}, { 1, 14, 23},
+ },
+ { // band 4
+ { 10, 174, 191}, { 3, 130, 172}, { 1, 80, 126},
+ { 1, 53, 88}, { 1, 32, 55}, { 1, 14, 24},
+ },
+ { // band 5
+ { 19, 219, 237}, { 3, 168, 211}, { 1, 90, 142},
+ { 1, 53, 91}, { 1, 29, 51}, { 1, 12, 21},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 21, 46, 184}, { 10, 53, 130}, { 2, 49, 78},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 3, 169, 198}, { 37, 165, 196}, { 26, 134, 176},
+ { 11, 108, 149}, { 5, 81, 112}, { 3, 47, 64},
+ },
+ { // band 2
+ { 11, 183, 215}, { 8, 142, 192}, { 2, 91, 141},
+ { 1, 62, 100}, { 1, 38, 62}, { 1, 17, 28},
+ },
+ { // band 3
+ { 12, 190, 223}, { 6, 149, 199}, { 1, 88, 139},
+ { 1, 56, 93}, { 1, 31, 54}, { 1, 13, 21},
+ },
+ { // band 4
+ { 11, 197, 230}, { 3, 154, 204}, { 1, 83, 134},
+ { 1, 50, 86}, { 1, 28, 49}, { 1, 12, 21},
+ },
+ { // band 5
+ { 17, 211, 240}, { 2, 167, 217}, { 1, 88, 143},
+ { 1, 53, 91}, { 1, 30, 53}, { 1, 14, 24},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {151, 30, 151}, { 50, 36, 105}, { 8, 34, 66},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 39, 111, 160}, { 62, 111, 165}, { 37, 99, 147},
+ { 15, 77, 118}, { 3, 47, 73}, { 1, 17, 27},
+ },
+ { // band 2
+ { 48, 170, 190}, { 32, 135, 180}, { 11, 100, 149},
+ { 4, 76, 116}, { 1, 51, 80}, { 1, 22, 36},
+ },
+ { // band 3
+ { 39, 191, 208}, { 18, 141, 191}, { 3, 96, 150},
+ { 1, 66, 110}, { 1, 41, 69}, { 1, 17, 28},
+ },
+ { // band 4
+ { 32, 209, 219}, { 8, 152, 201}, { 1, 96, 153},
+ { 1, 63, 106}, { 1, 38, 66}, { 1, 17, 29},
+ },
+ { // band 5
+ { 33, 230, 237}, { 5, 173, 214}, { 1, 100, 155},
+ { 1, 62, 105}, { 1, 38, 66}, { 1, 18, 32},
+ },
+ },
+ { // Inter
+ { // band 0
+ {149, 38, 231}, { 59, 51, 186}, { 12, 54, 117},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 53, 179, 226}, {126, 176, 223}, { 58, 147, 202},
+ { 28, 118, 174}, { 15, 94, 138}, { 14, 63, 87},
+ },
+ { // band 2
+ { 58, 196, 232}, { 26, 158, 213}, { 5, 106, 166},
+ { 1, 75, 124}, { 1, 46, 79}, { 1, 23, 39},
+ },
+ { // band 3
+ { 46, 203, 235}, { 17, 162, 213}, { 2, 104, 165},
+ { 1, 72, 120}, { 1, 44, 74}, { 1, 22, 33},
+ },
+ { // band 4
+ { 37, 213, 238}, { 8, 167, 216}, { 1, 104, 168},
+ { 1, 68, 119}, { 1, 40, 67}, { 1, 17, 29},
+ },
+ { // band 5
+ { 30, 228, 239}, { 4, 181, 213}, { 1, 103, 153},
+ { 1, 65, 110}, { 1, 43, 79}, { 1, 27, 56},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 3
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 76, 25, 53}, { 9, 18, 32}, { 1, 12, 18},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 29, 55, 91}, { 19, 58, 95}, { 15, 57, 89},
+ { 12, 49, 77}, { 3, 29, 44}, { 1, 8, 12},
+ },
+ { // band 2
+ { 32, 160, 148}, { 33, 143, 146}, { 19, 122, 132},
+ { 6, 90, 102}, { 1, 58, 70}, { 1, 17, 24},
+ },
+ { // band 3
+ { 16, 181, 181}, { 6, 142, 165}, { 1, 90, 120},
+ { 1, 50, 71}, { 1, 25, 38}, { 1, 9, 14},
+ },
+ { // band 4
+ { 13, 203, 203}, { 3, 154, 176}, { 1, 80, 108},
+ { 1, 41, 61}, { 1, 24, 37}, { 1, 11, 17},
+ },
+ { // band 5
+ { 6, 234, 240}, { 1, 178, 204}, { 1, 80, 119},
+ { 1, 45, 71}, { 1, 26, 42}, { 1, 12, 19},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 78, 20, 135}, { 25, 18, 101}, { 5, 19, 57},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 7, 144, 183}, {117, 151, 195}, {109, 151, 187},
+ { 39, 130, 168}, { 11, 100, 125}, { 4, 59, 64},
+ },
+ { // band 2
+ { 20, 184, 212}, { 12, 148, 191}, { 2, 98, 141},
+ { 1, 65, 100}, { 1, 39, 61}, { 1, 14, 22},
+ },
+ { // band 3
+ { 15, 194, 222}, { 6, 153, 198}, { 1, 92, 138},
+ { 1, 58, 91}, { 1, 32, 52}, { 1, 12, 18},
+ },
+ { // band 4
+ { 14, 206, 232}, { 3, 162, 206}, { 1, 89, 134},
+ { 1, 52, 83}, { 1, 28, 46}, { 1, 11, 17},
+ },
+ { // band 5
+ { 6, 225, 241}, { 1, 175, 210}, { 1, 81, 125},
+ { 1, 48, 78}, { 1, 28, 46}, { 1, 13, 21},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {124, 23, 93}, { 31, 24, 63}, { 6, 24, 46},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 23, 86, 126}, { 45, 90, 145}, { 31, 91, 133},
+ { 19, 80, 114}, { 7, 53, 72}, { 1, 20, 27},
+ },
+ { // band 2
+ { 51, 186, 189}, { 48, 159, 182}, { 33, 128, 156},
+ { 15, 92, 124}, { 2, 62, 83}, { 1, 29, 43},
+ },
+ { // band 3
+ { 36, 198, 211}, { 15, 156, 187}, { 3, 97, 137},
+ { 1, 61, 93}, { 1, 35, 57}, { 1, 15, 23},
+ },
+ { // band 4
+ { 34, 219, 223}, { 9, 162, 193}, { 1, 91, 136},
+ { 1, 58, 92}, { 1, 35, 54}, { 1, 14, 23},
+ },
+ { // band 5
+ { 19, 243, 243}, { 3, 191, 208}, { 1, 91, 137},
+ { 1, 56, 90}, { 1, 34, 55}, { 1, 16, 24},
+ },
+ },
+ { // Inter
+ { // band 0
+ {119, 20, 197}, { 19, 29, 156}, { 3, 30, 107},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 24, 192, 226}, {161, 193, 227}, { 97, 185, 222},
+ { 31, 158, 204}, { 16, 122, 165}, { 17, 84, 112},
+ },
+ { // band 2
+ { 26, 202, 229}, { 11, 165, 210}, { 2, 103, 152},
+ { 1, 68, 104}, { 1, 42, 70}, { 1, 16, 36},
+ },
+ { // band 3
+ { 24, 209, 237}, { 6, 169, 214}, { 1, 102, 154},
+ { 1, 65, 107}, { 1, 45, 68}, { 1, 17, 24},
+ },
+ { // band 4
+ { 19, 219, 243}, { 4, 183, 226}, { 1, 115, 172},
+ { 1, 73, 119}, { 1, 43, 77}, { 1, 15, 37},
+ },
+ { // band 5
+ { 11, 237, 241}, { 2, 190, 216}, { 1, 108, 146},
+ { 1, 59, 94}, { 1, 40, 67}, { 1, 30, 53},
+ },
+ },
+ },
+ },
+#if CONFIG_TX64X64
+ { // TX_SIZE 4
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 76, 25, 53}, { 9, 18, 32}, { 1, 12, 18},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 29, 55, 91}, { 19, 58, 95}, { 15, 57, 89},
+ { 12, 49, 77}, { 3, 29, 44}, { 1, 8, 12},
+ },
+ { // band 2
+ { 32, 160, 148}, { 33, 143, 146}, { 19, 122, 132},
+ { 6, 90, 102}, { 1, 58, 70}, { 1, 17, 24},
+ },
+ { // band 3
+ { 16, 181, 181}, { 6, 142, 165}, { 1, 90, 120},
+ { 1, 50, 71}, { 1, 25, 38}, { 1, 9, 14},
+ },
+ { // band 4
+ { 13, 203, 203}, { 3, 154, 176}, { 1, 80, 108},
+ { 1, 41, 61}, { 1, 24, 37}, { 1, 11, 17},
+ },
+ { // band 5
+ { 6, 234, 240}, { 1, 178, 204}, { 1, 80, 119},
+ { 1, 45, 71}, { 1, 26, 42}, { 1, 12, 19},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 78, 20, 135}, { 25, 18, 101}, { 5, 19, 57},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 7, 144, 183}, {117, 151, 195}, {109, 151, 187},
+ { 39, 130, 168}, { 11, 100, 125}, { 4, 59, 64},
+ },
+ { // band 2
+ { 20, 184, 212}, { 12, 148, 191}, { 2, 98, 141},
+ { 1, 65, 100}, { 1, 39, 61}, { 1, 14, 22},
+ },
+ { // band 3
+ { 15, 194, 222}, { 6, 153, 198}, { 1, 92, 138},
+ { 1, 58, 91}, { 1, 32, 52}, { 1, 12, 18},
+ },
+ { // band 4
+ { 14, 206, 232}, { 3, 162, 206}, { 1, 89, 134},
+ { 1, 52, 83}, { 1, 28, 46}, { 1, 11, 17},
+ },
+ { // band 5
+ { 6, 225, 241}, { 1, 175, 210}, { 1, 81, 125},
+ { 1, 48, 78}, { 1, 28, 46}, { 1, 13, 21},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {124, 23, 93}, { 31, 24, 63}, { 6, 24, 46},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 23, 86, 126}, { 45, 90, 145}, { 31, 91, 133},
+ { 19, 80, 114}, { 7, 53, 72}, { 1, 20, 27},
+ },
+ { // band 2
+ { 51, 186, 189}, { 48, 159, 182}, { 33, 128, 156},
+ { 15, 92, 124}, { 2, 62, 83}, { 1, 29, 43},
+ },
+ { // band 3
+ { 36, 198, 211}, { 15, 156, 187}, { 3, 97, 137},
+ { 1, 61, 93}, { 1, 35, 57}, { 1, 15, 23},
+ },
+ { // band 4
+ { 34, 219, 223}, { 9, 162, 193}, { 1, 91, 136},
+ { 1, 58, 92}, { 1, 35, 54}, { 1, 14, 23},
+ },
+ { // band 5
+ { 19, 243, 243}, { 3, 191, 208}, { 1, 91, 137},
+ { 1, 56, 90}, { 1, 34, 55}, { 1, 16, 24},
+ },
+ },
+ { // Inter
+ { // band 0
+ {119, 20, 197}, { 19, 29, 156}, { 3, 30, 107},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 24, 192, 226}, {161, 193, 227}, { 97, 185, 222},
+ { 31, 158, 204}, { 16, 122, 165}, { 17, 84, 112},
+ },
+ { // band 2
+ { 26, 202, 229}, { 11, 165, 210}, { 2, 103, 152},
+ { 1, 68, 104}, { 1, 42, 70}, { 1, 16, 36},
+ },
+ { // band 3
+ { 24, 209, 237}, { 6, 169, 214}, { 1, 102, 154},
+ { 1, 65, 107}, { 1, 45, 68}, { 1, 17, 24},
+ },
+ { // band 4
+ { 19, 219, 243}, { 4, 183, 226}, { 1, 115, 172},
+ { 1, 73, 119}, { 1, 43, 77}, { 1, 15, 37},
+ },
+ { // band 5
+ { 11, 237, 241}, { 2, 190, 216}, { 1, 108, 146},
+ { 1, 59, 94}, { 1, 40, 67}, { 1, 30, 53},
+ },
+ },
+ },
+ },
+#endif // CONFIG_TX64X64
+ },
+ { // Q_Index 1
+#if CONFIG_CB4X4
+ { // TX_SIZE 0
+ { // Y plane
+ { // Intra
+ { // band 0
+ {174, 30, 159}, { 76, 38, 115}, { 15, 33, 65},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 60, 80, 153}, { 72, 75, 147}, { 36, 68, 129},
+ { 15, 59, 104}, { 4, 45, 74}, { 1, 28, 45},
+ },
+ { // band 2
+ { 70, 122, 186}, { 55, 104, 175}, { 21, 83, 144},
+ { 8, 67, 112}, { 2, 51, 82}, { 1, 34, 57},
+ },
+ { // band 3
+ { 97, 144, 207}, { 52, 109, 195}, { 16, 77, 153},
+ { 4, 58, 113}, { 1, 43, 77}, { 1, 27, 48},
+ },
+ { // band 4
+ {128, 148, 229}, { 76, 104, 210}, { 18, 77, 159},
+ { 4, 65, 110}, { 1, 52, 82}, { 1, 31, 55},
+ },
+ { // band 5
+ {165, 51, 238}, {128, 50, 230}, { 57, 49, 185},
+ { 28, 47, 130}, { 12, 44, 96}, { 3, 36, 60},
+ },
+ },
+      { // Inter
+ { // band 0
+ {169, 103, 203}, {117, 96, 176}, { 56, 81, 137},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 31, 150, 224}, { 49, 128, 212}, { 19, 92, 165},
+ { 6, 67, 116}, { 2, 43, 71}, { 1, 21, 36},
+ },
+ { // band 2
+ { 58, 156, 230}, { 47, 130, 215}, { 7, 87, 158},
+ { 2, 63, 114}, { 1, 39, 71}, { 1, 18, 36},
+ },
+ { // band 3
+ { 85, 176, 240}, { 43, 138, 226}, { 8, 93, 172},
+ { 2, 70, 127}, { 1, 46, 81}, { 1, 26, 47},
+ },
+ { // band 4
+ {155, 144, 248}, { 93, 116, 235}, { 21, 83, 180},
+ { 4, 59, 119}, { 1, 43, 80}, { 1, 25, 50},
+ },
+ { // band 5
+ {203, 61, 250}, {171, 57, 243}, { 71, 57, 199},
+ { 31, 49, 144}, { 13, 42, 96}, { 7, 30, 52},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {204, 44, 204}, {137, 57, 184}, { 72, 62, 152},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {145, 117, 236}, {151, 112, 231}, { 87, 95, 208},
+ { 31, 77, 165}, { 5, 49, 98}, { 1, 24, 39},
+ },
+ { // band 2
+ {146, 152, 241}, {140, 132, 236}, { 41, 103, 209},
+ { 10, 86, 165}, { 2, 55, 106}, { 1, 25, 58},
+ },
+ { // band 3
+ {154, 181, 249}, { 84, 143, 240}, { 23, 114, 210},
+ { 6, 102, 182}, { 2, 71, 137}, { 1, 35, 90},
+ },
+ { // band 4
+ {184, 150, 251}, {115, 130, 244}, { 34, 105, 215},
+ { 15, 89, 173}, { 1, 51, 141}, {128, 128, 128},
+ },
+ { // band 5
+ {211, 71, 253}, {193, 78, 249}, {106, 91, 232},
+ { 61, 87, 198}, { 85, 153, 254}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {232, 104, 242}, {165, 114, 227}, { 96, 120, 206},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {137, 178, 250}, {146, 153, 245}, { 74, 108, 205},
+ { 41, 81, 149}, { 24, 55, 104}, { 13, 36, 68},
+ },
+ { // band 2
+ {147, 185, 252}, {127, 161, 246}, { 30, 104, 208},
+ { 11, 74, 154}, { 6, 54, 100}, { 2, 29, 63},
+ },
+ { // band 3
+ {163, 191, 254}, {101, 161, 249}, { 22, 114, 215},
+ { 6, 89, 173}, { 1, 65, 120}, { 1, 1, 170},
+ },
+ { // band 4
+ {197, 160, 254}, {142, 141, 251}, { 39, 102, 218},
+ { 10, 76, 158}, { 1, 56, 122}, {128, 128, 128},
+ },
+ { // band 5
+ {224, 76, 254}, {215, 84, 253}, {107, 85, 232},
+ { 43, 71, 177}, { 1, 1, 254}, {128, 128, 128},
+ },
+ },
+ },
+ },
+#endif // CONFIG_CB4X4
+ { // TX_SIZE 0
+ { // Y plane
+ { // Intra
+ { // band 0
+ {174, 30, 159}, { 76, 38, 115}, { 15, 33, 65},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 60, 80, 153}, { 72, 75, 147}, { 36, 68, 129},
+ { 15, 59, 104}, { 4, 45, 74}, { 1, 28, 45},
+ },
+ { // band 2
+ { 70, 122, 186}, { 55, 104, 175}, { 21, 83, 144},
+ { 8, 67, 112}, { 2, 51, 82}, { 1, 34, 57},
+ },
+ { // band 3
+ { 97, 144, 207}, { 52, 109, 195}, { 16, 77, 153},
+ { 4, 58, 113}, { 1, 43, 77}, { 1, 27, 48},
+ },
+ { // band 4
+ {128, 148, 229}, { 76, 104, 210}, { 18, 77, 159},
+ { 4, 65, 110}, { 1, 52, 82}, { 1, 31, 55},
+ },
+ { // band 5
+ {165, 51, 238}, {128, 50, 230}, { 57, 49, 185},
+ { 28, 47, 130}, { 12, 44, 96}, { 3, 36, 60},
+ },
+ },
+      { // Inter
+ { // band 0
+ {169, 103, 203}, {117, 96, 176}, { 56, 81, 137},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 31, 150, 224}, { 49, 128, 212}, { 19, 92, 165},
+ { 6, 67, 116}, { 2, 43, 71}, { 1, 21, 36},
+ },
+ { // band 2
+ { 58, 156, 230}, { 47, 130, 215}, { 7, 87, 158},
+ { 2, 63, 114}, { 1, 39, 71}, { 1, 18, 36},
+ },
+ { // band 3
+ { 85, 176, 240}, { 43, 138, 226}, { 8, 93, 172},
+ { 2, 70, 127}, { 1, 46, 81}, { 1, 26, 47},
+ },
+ { // band 4
+ {155, 144, 248}, { 93, 116, 235}, { 21, 83, 180},
+ { 4, 59, 119}, { 1, 43, 80}, { 1, 25, 50},
+ },
+ { // band 5
+ {203, 61, 250}, {171, 57, 243}, { 71, 57, 199},
+ { 31, 49, 144}, { 13, 42, 96}, { 7, 30, 52},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {204, 44, 204}, {137, 57, 184}, { 72, 62, 152},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {145, 117, 236}, {151, 112, 231}, { 87, 95, 208},
+ { 31, 77, 165}, { 5, 49, 98}, { 1, 24, 39},
+ },
+ { // band 2
+ {146, 152, 241}, {140, 132, 236}, { 41, 103, 209},
+ { 10, 86, 165}, { 2, 55, 106}, { 1, 25, 58},
+ },
+ { // band 3
+ {154, 181, 249}, { 84, 143, 240}, { 23, 114, 210},
+ { 6, 102, 182}, { 2, 71, 137}, { 1, 35, 90},
+ },
+ { // band 4
+ {184, 150, 251}, {115, 130, 244}, { 34, 105, 215},
+ { 15, 89, 173}, { 1, 51, 141}, {128, 128, 128},
+ },
+ { // band 5
+ {211, 71, 253}, {193, 78, 249}, {106, 91, 232},
+ { 61, 87, 198}, { 85, 153, 254}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {232, 104, 242}, {165, 114, 227}, { 96, 120, 206},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {137, 178, 250}, {146, 153, 245}, { 74, 108, 205},
+ { 41, 81, 149}, { 24, 55, 104}, { 13, 36, 68},
+ },
+ { // band 2
+ {147, 185, 252}, {127, 161, 246}, { 30, 104, 208},
+ { 11, 74, 154}, { 6, 54, 100}, { 2, 29, 63},
+ },
+ { // band 3
+ {163, 191, 254}, {101, 161, 249}, { 22, 114, 215},
+ { 6, 89, 173}, { 1, 65, 120}, { 1, 1, 170},
+ },
+ { // band 4
+ {197, 160, 254}, {142, 141, 251}, { 39, 102, 218},
+ { 10, 76, 158}, { 1, 56, 122}, {128, 128, 128},
+ },
+ { // band 5
+ {224, 76, 254}, {215, 84, 253}, {107, 85, 232},
+ { 43, 71, 177}, { 1, 1, 254}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 1
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 68, 37, 120}, { 21, 34, 82}, { 5, 26, 49},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 41, 89, 138}, { 56, 83, 132}, { 31, 73, 115},
+ { 16, 62, 92}, { 5, 45, 62}, { 1, 24, 32},
+ },
+ { // band 2
+ { 48, 139, 165}, { 30, 114, 160}, { 13, 92, 132},
+ { 6, 72, 103}, { 3, 49, 72}, { 1, 26, 41},
+ },
+ { // band 3
+ { 44, 162, 191}, { 20, 127, 175}, { 5, 90, 137},
+ { 1, 62, 100}, { 1, 38, 63}, { 1, 20, 32},
+ },
+ { // band 4
+ { 51, 184, 213}, { 16, 137, 193}, { 2, 89, 143},
+ { 1, 60, 102}, { 1, 39, 66}, { 1, 23, 37},
+ },
+ { // band 5
+ { 76, 200, 235}, { 27, 150, 216}, { 3, 99, 164},
+ { 1, 70, 119}, { 1, 45, 77}, { 1, 22, 38},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 81, 112, 199}, { 49, 101, 164}, { 19, 80, 119},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 12, 181, 217}, { 48, 151, 212}, { 38, 118, 180},
+ { 22, 95, 140}, { 11, 67, 92}, { 13, 46, 44},
+ },
+ { // band 2
+ { 29, 188, 226}, { 19, 147, 210}, { 5, 95, 154},
+ { 4, 68, 106}, { 3, 44, 60}, { 1, 24, 27},
+ },
+ { // band 3
+ { 30, 195, 234}, { 15, 153, 216}, { 3, 95, 156},
+ { 2, 66, 108}, { 2, 44, 62}, { 1, 24, 29},
+ },
+ { // band 4
+ { 36, 203, 243}, { 12, 162, 225}, { 2, 98, 163},
+ { 2, 67, 113}, { 2, 45, 68}, { 1, 24, 34},
+ },
+ { // band 5
+ { 86, 207, 248}, { 35, 165, 236}, { 3, 107, 180},
+ { 1, 73, 128}, { 1, 45, 78}, { 1, 20, 34},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {188, 37, 205}, {118, 51, 172}, { 56, 57, 135},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {116, 135, 225}, {144, 123, 221}, { 72, 103, 197},
+ { 35, 77, 153}, { 15, 47, 82}, { 6, 25, 34},
+ },
+ { // band 2
+ {128, 171, 233}, { 82, 142, 226}, { 31, 106, 191},
+ { 16, 82, 146}, { 9, 59, 98}, { 4, 33, 54},
+ },
+ { // band 3
+ {126, 197, 241}, { 66, 155, 230}, { 18, 108, 190},
+ { 7, 82, 148}, { 3, 58, 98}, { 1, 25, 50},
+ },
+ { // band 4
+ {117, 207, 244}, { 44, 163, 233}, { 9, 112, 191},
+ { 5, 84, 148}, { 3, 61, 87}, { 1, 28, 38},
+ },
+ { // band 5
+ {112, 214, 249}, { 39, 174, 240}, { 6, 125, 205},
+ { 4, 96, 163}, { 5, 66, 100}, { 1, 128, 254},
+ },
+ },
+ { // Inter
+ { // band 0
+ {227, 70, 234}, {145, 91, 213}, { 61, 100, 173},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {108, 198, 243}, {171, 172, 240}, {118, 130, 210},
+ {104, 107, 165}, { 64, 85, 114}, { 55, 64, 60},
+ },
+ { // band 2
+ {110, 208, 247}, { 64, 175, 237}, { 24, 112, 187},
+ { 24, 81, 133}, { 24, 63, 83}, { 21, 47, 53},
+ },
+ { // band 3
+ { 91, 218, 249}, { 46, 188, 238}, { 8, 113, 184},
+ { 5, 83, 137}, { 6, 62, 95}, { 17, 44, 94},
+ },
+ { // band 4
+ { 84, 216, 248}, { 30, 187, 237}, { 2, 117, 188},
+ { 1, 88, 141}, { 3, 63, 98}, { 1, 1, 1},
+ },
+ { // band 5
+ {116, 218, 252}, { 47, 186, 242}, { 2, 132, 204},
+ { 1, 106, 175}, { 1, 88, 104}, { 1, 254, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 2
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 35, 41, 129}, { 12, 30, 70}, { 2, 19, 32},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 30, 77, 116}, { 39, 70, 110}, { 20, 58, 96},
+ { 8, 47, 77}, { 2, 33, 52}, { 1, 17, 26},
+ },
+ { // band 2
+ { 31, 123, 146}, { 18, 103, 140}, { 7, 81, 119},
+ { 2, 62, 95}, { 1, 44, 70}, { 1, 26, 42},
+ },
+ { // band 3
+ { 21, 149, 170}, { 9, 114, 158}, { 2, 80, 126},
+ { 1, 57, 94}, { 1, 36, 61}, { 1, 18, 31},
+ },
+ { // band 4
+ { 20, 178, 199}, { 6, 134, 183}, { 1, 87, 139},
+ { 1, 60, 100}, { 1, 37, 64}, { 1, 18, 31},
+ },
+ { // band 5
+ { 36, 218, 233}, { 6, 160, 207}, { 1, 92, 147},
+ { 1, 59, 101}, { 1, 35, 62}, { 1, 18, 31},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 17, 62, 211}, { 14, 62, 153}, { 5, 50, 84},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 11, 180, 205}, { 87, 160, 205}, { 53, 128, 184},
+ { 27, 106, 156}, { 13, 79, 115}, { 6, 46, 67},
+ },
+ { // band 2
+ { 32, 194, 220}, { 20, 145, 202}, { 4, 96, 152},
+ { 1, 67, 111}, { 1, 42, 70}, { 1, 21, 37},
+ },
+ { // band 3
+ { 30, 204, 228}, { 14, 152, 207}, { 1, 92, 149},
+ { 1, 61, 103}, { 1, 34, 59}, { 1, 16, 28},
+ },
+ { // band 4
+ { 27, 213, 235}, { 7, 159, 210}, { 1, 88, 143},
+ { 1, 55, 94}, { 1, 31, 53}, { 1, 16, 27},
+ },
+ { // band 5
+ { 28, 223, 243}, { 4, 173, 217}, { 1, 91, 146},
+ { 1, 58, 98}, { 1, 35, 60}, { 1, 19, 33},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {172, 37, 202}, { 83, 51, 156}, { 24, 53, 110},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 76, 134, 206}, {110, 124, 200}, { 47, 106, 180},
+ { 15, 82, 145}, { 3, 48, 83}, { 1, 19, 32},
+ },
+ { // band 2
+ { 80, 176, 220}, { 49, 145, 212}, { 17, 112, 180},
+ { 7, 84, 140}, { 1, 53, 89}, { 1, 27, 43},
+ },
+ { // band 3
+ { 74, 201, 232}, { 38, 158, 221}, { 8, 112, 179},
+ { 2, 79, 132}, { 1, 47, 82}, { 1, 26, 42},
+ },
+ { // band 4
+ { 73, 215, 239}, { 28, 169, 227}, { 3, 112, 176},
+ { 1, 74, 126}, { 1, 48, 79}, { 1, 27, 44},
+ },
+ { // band 5
+ { 71, 233, 244}, { 18, 180, 230}, { 1, 114, 180},
+ { 1, 80, 134}, { 1, 51, 85}, { 1, 26, 36},
+ },
+ },
+ { // Inter
+ { // band 0
+ {213, 34, 244}, {126, 57, 212}, { 46, 67, 151},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {120, 202, 245}, {198, 173, 241}, {119, 146, 224},
+ { 76, 126, 195}, { 44, 102, 159}, { 40, 76, 115},
+ },
+ { // band 2
+ {120, 215, 248}, { 69, 171, 237}, { 23, 119, 194},
+ { 10, 86, 147}, { 2, 56, 94}, { 1, 25, 44},
+ },
+ { // band 3
+ {102, 226, 250}, { 53, 183, 239}, { 9, 118, 188},
+ { 2, 78, 131}, { 1, 48, 89}, { 1, 17, 36},
+ },
+ { // band 4
+ { 86, 235, 252}, { 34, 194, 240}, { 2, 109, 173},
+ { 1, 68, 118}, { 1, 44, 79}, { 1, 1, 38},
+ },
+ { // band 5
+ { 59, 236, 243}, { 11, 189, 228}, { 1, 112, 187},
+ { 1, 88, 145}, { 1, 55, 92}, { 1, 1, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 3
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 41, 40, 104}, { 12, 31, 64}, { 2, 16, 28},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 65, 58, 132}, { 50, 61, 130}, { 40, 57, 116},
+ { 22, 46, 87}, { 2, 28, 44}, { 1, 11, 17},
+ },
+ { // band 2
+ { 55, 139, 135}, { 46, 122, 132}, { 21, 89, 110},
+ { 6, 60, 78}, { 1, 38, 54}, { 1, 17, 27},
+ },
+ { // band 3
+ { 29, 167, 161}, { 10, 120, 141}, { 1, 69, 98},
+ { 1, 42, 66}, { 1, 28, 44}, { 1, 15, 24},
+ },
+ { // band 4
+ { 19, 191, 180}, { 4, 125, 154}, { 1, 70, 107},
+ { 1, 48, 77}, { 1, 33, 53}, { 1, 17, 28},
+ },
+ { // band 5
+ { 16, 238, 231}, { 2, 163, 198}, { 1, 85, 134},
+ { 1, 54, 90}, { 1, 34, 57}, { 1, 17, 29},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 70, 15, 216}, { 40, 18, 164}, { 14, 17, 83},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 25, 150, 200}, {185, 154, 211}, {123, 137, 199},
+ { 67, 119, 177}, { 31, 96, 137}, { 18, 63, 86},
+ },
+ { // band 2
+ { 57, 187, 223}, { 35, 148, 207}, { 7, 104, 159},
+ { 2, 72, 113}, { 1, 44, 71}, { 1, 20, 34},
+ },
+ { // band 3
+ { 44, 203, 233}, { 18, 157, 212}, { 1, 98, 150},
+ { 1, 61, 102}, { 1, 38, 62}, { 1, 19, 31},
+ },
+ { // band 4
+ { 41, 215, 238}, { 11, 166, 215}, { 1, 94, 146},
+ { 1, 60, 101}, { 1, 37, 63}, { 1, 17, 28},
+ },
+ { // band 5
+ { 19, 236, 246}, { 3, 188, 223}, { 1, 95, 146},
+ { 1, 58, 95}, { 1, 34, 56}, { 1, 17, 27},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {146, 27, 156}, { 49, 32, 116}, { 10, 39, 77},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 47, 101, 172}, { 93, 100, 178}, { 58, 91, 165},
+ { 26, 75, 134}, { 4, 49, 82}, { 2, 22, 33},
+ },
+ { // band 2
+ { 60, 158, 196}, { 44, 135, 186}, { 25, 106, 157},
+ { 8, 81, 124}, { 2, 56, 86}, { 1, 28, 45},
+ },
+ { // band 3
+ { 44, 169, 212}, { 15, 138, 196}, { 2, 100, 157},
+ { 1, 74, 119}, { 1, 49, 76}, { 1, 20, 34},
+ },
+ { // band 4
+ { 38, 199, 231}, { 11, 158, 214}, { 1, 111, 167},
+ { 1, 76, 122}, { 1, 44, 76}, { 1, 17, 39},
+ },
+ { // band 5
+ { 40, 236, 246}, { 10, 187, 230}, { 1, 115, 175},
+ { 1, 74, 122}, { 1, 42, 71}, { 1, 14, 59},
+ },
+ },
+ { // Inter
+ { // band 0
+ {161, 26, 237}, { 65, 46, 209}, { 21, 46, 161},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 87, 229, 245}, {206, 214, 244}, {148, 186, 236},
+ { 89, 165, 221}, { 41, 132, 186}, { 37, 93, 141},
+ },
+ { // band 2
+ { 93, 231, 246}, { 47, 181, 231}, { 8, 117, 188},
+ { 2, 84, 138}, { 1, 43, 87}, { 1, 27, 41},
+ },
+ { // band 3
+ { 80, 239, 250}, { 28, 190, 236}, { 1, 119, 183},
+ { 1, 84, 135}, { 1, 81, 69}, { 1, 102, 1},
+ },
+ { // band 4
+ { 67, 245, 252}, { 22, 206, 242}, { 1, 130, 195},
+ { 1, 77, 136}, { 1, 35, 88}, {128, 128, 128},
+ },
+ { // band 5
+ { 43, 250, 228}, { 31, 185, 204}, { 6, 101, 183},
+ { 1, 92, 151}, { 1, 84, 137}, {128, 128, 128},
+ },
+ },
+ },
+ },
+#if CONFIG_TX64X64
+ { // TX_SIZE 4
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 41, 40, 104}, { 12, 31, 64}, { 2, 16, 28},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 65, 58, 132}, { 50, 61, 130}, { 40, 57, 116},
+ { 22, 46, 87}, { 2, 28, 44}, { 1, 11, 17},
+ },
+ { // band 2
+ { 55, 139, 135}, { 46, 122, 132}, { 21, 89, 110},
+ { 6, 60, 78}, { 1, 38, 54}, { 1, 17, 27},
+ },
+ { // band 3
+ { 29, 167, 161}, { 10, 120, 141}, { 1, 69, 98},
+ { 1, 42, 66}, { 1, 28, 44}, { 1, 15, 24},
+ },
+ { // band 4
+ { 19, 191, 180}, { 4, 125, 154}, { 1, 70, 107},
+ { 1, 48, 77}, { 1, 33, 53}, { 1, 17, 28},
+ },
+ { // band 5
+ { 16, 238, 231}, { 2, 163, 198}, { 1, 85, 134},
+ { 1, 54, 90}, { 1, 34, 57}, { 1, 17, 29},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 70, 15, 216}, { 40, 18, 164}, { 14, 17, 83},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 25, 150, 200}, {185, 154, 211}, {123, 137, 199},
+ { 67, 119, 177}, { 31, 96, 137}, { 18, 63, 86},
+ },
+ { // band 2
+ { 57, 187, 223}, { 35, 148, 207}, { 7, 104, 159},
+ { 2, 72, 113}, { 1, 44, 71}, { 1, 20, 34},
+ },
+ { // band 3
+ { 44, 203, 233}, { 18, 157, 212}, { 1, 98, 150},
+ { 1, 61, 102}, { 1, 38, 62}, { 1, 19, 31},
+ },
+ { // band 4
+ { 41, 215, 238}, { 11, 166, 215}, { 1, 94, 146},
+ { 1, 60, 101}, { 1, 37, 63}, { 1, 17, 28},
+ },
+ { // band 5
+ { 19, 236, 246}, { 3, 188, 223}, { 1, 95, 146},
+ { 1, 58, 95}, { 1, 34, 56}, { 1, 17, 27},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {146, 27, 156}, { 49, 32, 116}, { 10, 39, 77},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 47, 101, 172}, { 93, 100, 178}, { 58, 91, 165},
+ { 26, 75, 134}, { 4, 49, 82}, { 2, 22, 33},
+ },
+ { // band 2
+ { 60, 158, 196}, { 44, 135, 186}, { 25, 106, 157},
+ { 8, 81, 124}, { 2, 56, 86}, { 1, 28, 45},
+ },
+ { // band 3
+ { 44, 169, 212}, { 15, 138, 196}, { 2, 100, 157},
+ { 1, 74, 119}, { 1, 49, 76}, { 1, 20, 34},
+ },
+ { // band 4
+ { 38, 199, 231}, { 11, 158, 214}, { 1, 111, 167},
+ { 1, 76, 122}, { 1, 44, 76}, { 1, 17, 39},
+ },
+ { // band 5
+ { 40, 236, 246}, { 10, 187, 230}, { 1, 115, 175},
+ { 1, 74, 122}, { 1, 42, 71}, { 1, 14, 59},
+ },
+ },
+ { // Inter
+ { // band 0
+ {161, 26, 237}, { 65, 46, 209}, { 21, 46, 161},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 87, 229, 245}, {206, 214, 244}, {148, 186, 236},
+ { 89, 165, 221}, { 41, 132, 186}, { 37, 93, 141},
+ },
+ { // band 2
+ { 93, 231, 246}, { 47, 181, 231}, { 8, 117, 188},
+ { 2, 84, 138}, { 1, 43, 87}, { 1, 27, 41},
+ },
+ { // band 3
+ { 80, 239, 250}, { 28, 190, 236}, { 1, 119, 183},
+ { 1, 84, 135}, { 1, 81, 69}, { 1, 102, 1},
+ },
+ { // band 4
+ { 67, 245, 252}, { 22, 206, 242}, { 1, 130, 195},
+ { 1, 77, 136}, { 1, 35, 88}, {128, 128, 128},
+ },
+ { // band 5
+ { 43, 250, 228}, { 31, 185, 204}, { 6, 101, 183},
+ { 1, 92, 151}, { 1, 84, 137}, {128, 128, 128},
+ },
+ },
+ },
+ },
+#endif // CONFIG_TX64X64
+ },
+ { // Q_Index 2
+#if CONFIG_CB4X4
+ { // TX_SIZE 0
+ { // Y plane
+ { // Intra
+ { // band 0
+ {181, 22, 175}, { 96, 37, 147}, { 35, 41, 105},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 80, 95, 197}, {111, 92, 193}, { 59, 87, 175},
+ { 29, 79, 150}, { 10, 65, 118}, { 2, 47, 82},
+ },
+ { // band 2
+ { 90, 141, 216}, { 77, 120, 210}, { 23, 95, 184},
+ { 11, 81, 151}, { 6, 75, 130}, { 2, 58, 113},
+ },
+ { // band 3
+ {122, 167, 231}, { 66, 119, 225}, { 26, 87, 189},
+ { 7, 76, 151}, { 2, 63, 125}, { 1, 59, 77},
+ },
+ { // band 4
+ {162, 147, 244}, {110, 97, 236}, { 32, 88, 204},
+ { 11, 89, 174}, { 5, 78, 151}, {128, 128, 128},
+ },
+ { // band 5
+ {205, 59, 251}, {176, 68, 248}, { 90, 71, 223},
+ { 49, 72, 188}, { 17, 74, 203}, {128, 128, 128},
+ },
+ },
+      { // Inter
+ { // band 0
+ {188, 70, 207}, {140, 73, 189}, { 85, 73, 163},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 59, 144, 239}, { 79, 126, 237}, { 31, 102, 202},
+ { 10, 81, 153}, { 3, 56, 102}, { 2, 33, 59},
+ },
+ { // band 2
+ {100, 152, 243}, { 80, 129, 236}, { 14, 94, 194},
+ { 4, 72, 150}, { 1, 50, 103}, { 1, 35, 60},
+ },
+ { // band 3
+ {130, 183, 247}, { 70, 139, 242}, { 19, 100, 203},
+ { 4, 83, 159}, { 1, 59, 119}, { 1, 44, 72},
+ },
+ { // band 4
+ {197, 138, 252}, {135, 107, 247}, { 31, 86, 210},
+ { 7, 74, 160}, { 1, 53, 107}, {128, 128, 128},
+ },
+ { // band 5
+ {229, 54, 254}, {200, 51, 251}, { 83, 61, 226},
+ { 33, 55, 177}, { 12, 74, 145}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {229, 20, 235}, {183, 37, 221}, {127, 47, 198},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {188, 115, 251}, {208, 110, 250}, {101, 99, 235},
+ { 38, 81, 197}, { 9, 56, 132}, { 9, 52, 63},
+ },
+ { // band 2
+ {189, 150, 252}, {186, 137, 251}, { 54, 107, 236},
+ { 14, 90, 195}, { 1, 89, 104}, {128, 128, 128},
+ },
+ { // band 3
+ {209, 180, 254}, {142, 145, 253}, { 51, 130, 236},
+ { 6, 128, 214}, { 1, 128, 254}, {128, 128, 128},
+ },
+ { // band 4
+ {231, 140, 254}, {194, 128, 254}, { 75, 119, 233},
+ {128, 23, 230}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {244, 59, 254}, {239, 81, 254}, {128, 85, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {246, 55, 247}, {197, 64, 235}, {141, 74, 218},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {178, 163, 254}, {192, 138, 252}, { 85, 103, 231},
+ { 49, 81, 179}, { 32, 54, 133}, { 12, 26, 98},
+ },
+ { // band 2
+ {189, 173, 254}, {179, 150, 253}, { 60, 94, 237},
+ { 34, 81, 198}, { 20, 53, 187}, {128, 128, 128},
+ },
+ { // band 3
+ {202, 191, 254}, {157, 160, 254}, { 57, 117, 240},
+ { 28, 105, 211}, { 1, 128, 1}, {128, 128, 128},
+ },
+ { // band 4
+ {231, 146, 254}, {208, 133, 254}, { 66, 78, 233},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {246, 49, 254}, {246, 63, 254}, { 85, 142, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+#endif // CONFIG_CB4X4
+ { // TX_SIZE 0
+ { // Y plane
+ { // Intra
+ { // band 0
+ {181, 22, 175}, { 96, 37, 147}, { 35, 41, 105},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 80, 95, 197}, {111, 92, 193}, { 59, 87, 175},
+ { 29, 79, 150}, { 10, 65, 118}, { 2, 47, 82},
+ },
+ { // band 2
+ { 90, 141, 216}, { 77, 120, 210}, { 23, 95, 184},
+ { 11, 81, 151}, { 6, 75, 130}, { 2, 58, 113},
+ },
+ { // band 3
+ {122, 167, 231}, { 66, 119, 225}, { 26, 87, 189},
+ { 7, 76, 151}, { 2, 63, 125}, { 1, 59, 77},
+ },
+ { // band 4
+ {162, 147, 244}, {110, 97, 236}, { 32, 88, 204},
+ { 11, 89, 174}, { 5, 78, 151}, {128, 128, 128},
+ },
+ { // band 5
+ {205, 59, 251}, {176, 68, 248}, { 90, 71, 223},
+ { 49, 72, 188}, { 17, 74, 203}, {128, 128, 128},
+ },
+ },
+      { // Inter
+ { // band 0
+ {188, 70, 207}, {140, 73, 189}, { 85, 73, 163},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 59, 144, 239}, { 79, 126, 237}, { 31, 102, 202},
+ { 10, 81, 153}, { 3, 56, 102}, { 2, 33, 59},
+ },
+ { // band 2
+ {100, 152, 243}, { 80, 129, 236}, { 14, 94, 194},
+ { 4, 72, 150}, { 1, 50, 103}, { 1, 35, 60},
+ },
+ { // band 3
+ {130, 183, 247}, { 70, 139, 242}, { 19, 100, 203},
+ { 4, 83, 159}, { 1, 59, 119}, { 1, 44, 72},
+ },
+ { // band 4
+ {197, 138, 252}, {135, 107, 247}, { 31, 86, 210},
+ { 7, 74, 160}, { 1, 53, 107}, {128, 128, 128},
+ },
+ { // band 5
+ {229, 54, 254}, {200, 51, 251}, { 83, 61, 226},
+ { 33, 55, 177}, { 12, 74, 145}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {229, 20, 235}, {183, 37, 221}, {127, 47, 198},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {188, 115, 251}, {208, 110, 250}, {101, 99, 235},
+ { 38, 81, 197}, { 9, 56, 132}, { 9, 52, 63},
+ },
+ { // band 2
+ {189, 150, 252}, {186, 137, 251}, { 54, 107, 236},
+ { 14, 90, 195}, { 1, 89, 104}, {128, 128, 128},
+ },
+ { // band 3
+ {209, 180, 254}, {142, 145, 253}, { 51, 130, 236},
+ { 6, 128, 214}, { 1, 128, 254}, {128, 128, 128},
+ },
+ { // band 4
+ {231, 140, 254}, {194, 128, 254}, { 75, 119, 233},
+ {128, 23, 230}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {244, 59, 254}, {239, 81, 254}, {128, 85, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {246, 55, 247}, {197, 64, 235}, {141, 74, 218},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {178, 163, 254}, {192, 138, 252}, { 85, 103, 231},
+ { 49, 81, 179}, { 32, 54, 133}, { 12, 26, 98},
+ },
+ { // band 2
+ {189, 173, 254}, {179, 150, 253}, { 60, 94, 237},
+ { 34, 81, 198}, { 20, 53, 187}, {128, 128, 128},
+ },
+ { // band 3
+ {202, 191, 254}, {157, 160, 254}, { 57, 117, 240},
+ { 28, 105, 211}, { 1, 128, 1}, {128, 128, 128},
+ },
+ { // band 4
+ {231, 146, 254}, {208, 133, 254}, { 66, 78, 233},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {246, 49, 254}, {246, 63, 254}, { 85, 142, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 1
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 45, 28, 124}, { 23, 35, 107}, { 10, 34, 78},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 53, 99, 177}, { 82, 96, 174}, { 46, 89, 158},
+ { 21, 76, 133}, { 6, 56, 94}, { 1, 33, 54},
+ },
+ { // band 2
+ { 68, 147, 201}, { 42, 124, 195}, { 17, 98, 166},
+ { 7, 75, 131}, { 2, 53, 93}, { 1, 33, 59},
+ },
+ { // band 3
+ { 65, 176, 217}, { 30, 137, 206}, { 6, 97, 167},
+ { 2, 70, 128}, { 1, 47, 88}, { 1, 29, 46},
+ },
+ { // band 4
+ { 69, 195, 232}, { 24, 146, 218}, { 4, 100, 175},
+ { 2, 72, 134}, { 1, 51, 93}, { 1, 29, 52},
+ },
+ { // band 5
+ { 96, 212, 246}, { 39, 158, 234}, { 6, 109, 192},
+ { 2, 77, 144}, { 1, 50, 95}, { 1, 20, 45},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 71, 80, 213}, { 53, 73, 181}, { 25, 66, 141},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 35, 168, 231}, { 91, 150, 229}, { 49, 122, 202},
+ { 22, 97, 162}, { 10, 68, 108}, { 9, 48, 57},
+ },
+ { // band 2
+ { 56, 178, 236}, { 32, 148, 225}, { 9, 99, 176},
+ { 4, 69, 127}, { 2, 44, 78}, { 1, 25, 41},
+ },
+ { // band 3
+ { 57, 191, 242}, { 27, 155, 230}, { 5, 102, 180},
+ { 2, 71, 133}, { 1, 44, 78}, { 1, 27, 41},
+ },
+ { // band 4
+ { 67, 201, 247}, { 24, 162, 237}, { 3, 106, 188},
+ { 3, 74, 137}, { 1, 46, 85}, { 1, 34, 48},
+ },
+ { // band 5
+ {111, 210, 251}, { 47, 166, 244}, { 3, 113, 199},
+ { 2, 77, 146}, { 1, 48, 93}, { 1, 38, 22},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {206, 21, 221}, {150, 36, 195}, { 94, 44, 164},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {147, 128, 239}, {194, 122, 238}, { 95, 104, 220},
+ { 39, 81, 183}, { 13, 53, 111}, { 3, 24, 49},
+ },
+ { // band 2
+ {164, 163, 244}, {106, 142, 239}, { 50, 112, 215},
+ { 26, 90, 177}, { 12, 67, 130}, { 1, 1, 64},
+ },
+ { // band 3
+ {155, 193, 249}, { 88, 158, 244}, { 26, 124, 220},
+ { 10, 98, 173}, { 1, 77, 126}, {128, 128, 128},
+ },
+ { // band 4
+ {141, 205, 252}, { 64, 174, 248}, { 17, 124, 221},
+ { 12, 92, 176}, { 1, 29, 148}, {128, 128, 128},
+ },
+ { // band 5
+ {150, 217, 254}, { 74, 191, 252}, { 30, 144, 215},
+ { 1, 106, 137}, {128, 1, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {241, 37, 242}, {175, 48, 223}, { 99, 53, 189},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {153, 183, 248}, {212, 156, 247}, {134, 124, 221},
+ { 88, 103, 184}, { 59, 86, 132}, { 29, 61, 67},
+ },
+ { // band 2
+ {162, 199, 250}, {106, 167, 247}, { 56, 110, 207},
+ { 32, 85, 165}, { 16, 71, 130}, { 1, 93, 254},
+ },
+ { // band 3
+ {143, 213, 252}, { 86, 187, 250}, { 23, 124, 220},
+ { 7, 95, 176}, { 1, 109, 102}, {128, 128, 128},
+ },
+ { // band 4
+ {130, 219, 254}, { 70, 201, 253}, { 15, 128, 215},
+ { 1, 101, 201}, { 1, 64, 170}, {128, 128, 128},
+ },
+ { // band 5
+ {155, 219, 254}, {105, 207, 254}, { 28, 155, 229},
+ { 1, 153, 191}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 2
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 18, 26, 117}, { 10, 29, 82}, { 3, 25, 52},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 35, 88, 152}, { 62, 85, 150}, { 36, 77, 137},
+ { 16, 66, 116}, { 4, 47, 81}, { 1, 26, 44},
+ },
+ { // band 2
+ { 55, 141, 182}, { 32, 119, 177}, { 12, 93, 154},
+ { 4, 71, 123}, { 1, 51, 89}, { 1, 32, 56},
+ },
+ { // band 3
+ { 46, 171, 202}, { 21, 130, 191}, { 5, 91, 154},
+ { 1, 64, 115}, { 1, 42, 77}, { 1, 25, 41},
+ },
+ { // band 4
+ { 43, 195, 219}, { 12, 142, 203}, { 1, 91, 156},
+ { 1, 63, 115}, { 1, 41, 77}, { 1, 22, 43},
+ },
+ { // band 5
+ { 42, 221, 238}, { 8, 162, 219}, { 1, 98, 167},
+ { 1, 67, 123}, { 1, 43, 83}, { 1, 25, 38},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 16, 51, 216}, { 20, 48, 168}, { 9, 44, 109},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 34, 164, 226}, {124, 148, 226}, { 72, 127, 207},
+ { 36, 107, 175}, { 15, 81, 129}, { 6, 51, 79},
+ },
+ { // band 2
+ { 61, 182, 234}, { 35, 148, 220}, { 9, 101, 178},
+ { 4, 71, 134}, { 1, 46, 90}, { 1, 24, 51},
+ },
+ { // band 3
+ { 54, 198, 239}, { 25, 156, 224}, { 3, 98, 173},
+ { 1, 66, 124}, { 1, 41, 78}, { 1, 15, 37},
+ },
+ { // band 4
+ { 48, 209, 242}, { 12, 162, 226}, { 1, 96, 169},
+ { 1, 63, 119}, { 1, 40, 78}, { 1, 18, 45},
+ },
+ { // band 5
+ { 44, 223, 247}, { 6, 173, 232}, { 1, 105, 178},
+ { 1, 71, 131}, { 1, 44, 84}, { 1, 13, 46},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {188, 26, 214}, {121, 42, 181}, { 66, 49, 149},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {136, 128, 233}, {172, 124, 230}, { 80, 106, 211},
+ { 27, 81, 174}, { 6, 49, 98}, { 8, 28, 49},
+ },
+ { // band 2
+ {145, 166, 239}, { 92, 141, 229}, { 28, 108, 196},
+ { 8, 87, 154}, { 1, 58, 105}, { 1, 27, 59},
+ },
+ { // band 3
+ {131, 193, 242}, { 66, 151, 231}, { 13, 112, 192},
+ { 2, 81, 152}, { 1, 66, 121}, { 1, 23, 64},
+ },
+ { // band 4
+ {112, 211, 246}, { 41, 164, 235}, { 5, 117, 202},
+ { 1, 83, 162}, { 1, 64, 111}, {128, 128, 128},
+ },
+ { // band 5
+ { 96, 230, 250}, { 28, 185, 243}, { 2, 132, 204},
+ { 1, 91, 166}, { 1, 85, 46}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {238, 23, 242}, {157, 29, 215}, { 73, 27, 162},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {165, 173, 250}, {222, 151, 247}, {152, 134, 235},
+ {114, 120, 210}, { 86, 109, 176}, { 53, 88, 145},
+ },
+ { // band 2
+ {164, 194, 249}, {100, 158, 241}, { 35, 111, 212},
+ { 17, 85, 167}, { 1, 52, 112}, { 1, 73, 1},
+ },
+ { // band 3
+ {151, 215, 252}, { 83, 172, 245}, { 16, 122, 208},
+ { 6, 101, 165}, { 1, 74, 113}, { 1, 1, 1},
+ },
+ { // band 4
+ {138, 230, 253}, { 65, 184, 248}, { 8, 128, 212},
+ { 1, 111, 182}, {128, 1, 1}, {128, 128, 128},
+ },
+ { // band 5
+ {123, 240, 253}, { 36, 201, 250}, { 3, 127, 211},
+ { 1, 68, 204}, {128, 1, 1}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 3
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 51, 21, 156}, { 30, 23, 86}, { 4, 18, 37},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 38, 77, 129}, { 79, 76, 129}, { 40, 66, 117},
+ { 12, 54, 95}, { 1, 36, 60}, { 1, 17, 29},
+ },
+ { // band 2
+ { 44, 133, 149}, { 24, 107, 143}, { 8, 78, 121},
+ { 3, 59, 97}, { 1, 42, 71}, { 1, 22, 37},
+ },
+ { // band 3
+ { 29, 160, 171}, { 9, 114, 158}, { 1, 76, 125},
+ { 1, 54, 93}, { 1, 36, 63}, { 1, 20, 35},
+ },
+ { // band 4
+ { 22, 188, 205}, { 6, 132, 186}, { 1, 87, 144},
+ { 1, 62, 107}, { 1, 41, 72}, { 1, 23, 41},
+ },
+ { // band 5
+ { 25, 233, 236}, { 5, 165, 214}, { 1, 96, 158},
+ { 1, 63, 112}, { 1, 40, 73}, { 1, 23, 40},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 48, 20, 231}, { 37, 21, 179}, { 15, 18, 109},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 41, 154, 216}, {196, 142, 221}, {131, 125, 207},
+ { 84, 111, 181}, { 45, 91, 142}, { 27, 62, 89},
+ },
+ { // band 2
+ { 72, 181, 230}, { 41, 147, 215}, { 10, 102, 173},
+ { 3, 73, 132}, { 1, 47, 89}, { 1, 23, 50},
+ },
+ { // band 3
+ { 60, 201, 236}, { 23, 157, 219}, { 2, 99, 167},
+ { 1, 69, 124}, { 1, 43, 80}, { 1, 22, 39},
+ },
+ { // band 4
+ { 53, 214, 242}, { 15, 165, 224}, { 1, 101, 173},
+ { 1, 70, 131}, { 1, 44, 83}, { 1, 23, 49},
+ },
+ { // band 5
+ { 39, 239, 248}, { 7, 186, 233}, { 1, 108, 174},
+ { 1, 70, 123}, { 1, 43, 77}, { 1, 16, 42},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {161, 26, 204}, { 77, 40, 160}, { 26, 50, 117},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 80, 140, 218}, {136, 133, 215}, { 63, 117, 197},
+ { 20, 93, 170}, { 7, 55, 102}, { 13, 32, 52},
+ },
+ { // band 2
+ { 86, 173, 231}, { 46, 150, 220}, { 18, 118, 190},
+ { 8, 90, 150}, { 2, 60, 95}, { 1, 39, 41},
+ },
+ { // band 3
+ { 80, 183, 242}, { 37, 160, 231}, { 6, 120, 182},
+ { 1, 86, 137}, { 1, 46, 78}, { 1, 15, 24},
+ },
+ { // band 4
+ { 88, 215, 247}, { 42, 179, 235}, { 4, 116, 182},
+ { 2, 80, 133}, { 1, 46, 85}, { 1, 64, 43},
+ },
+ { // band 5
+ {100, 236, 250}, { 31, 186, 234}, { 1, 114, 181},
+ { 1, 85, 135}, { 1, 78, 64}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {213, 13, 245}, {106, 16, 211}, { 32, 11, 156},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {140, 214, 247}, {241, 186, 243}, {177, 172, 235},
+ {128, 156, 219}, {106, 130, 191}, { 99, 105, 152},
+ },
+ { // band 2
+ {125, 218, 248}, { 75, 167, 239}, { 29, 111, 212},
+ { 6, 66, 152}, { 1, 42, 96}, { 1, 85, 128},
+ },
+ { // band 3
+ {120, 232, 252}, { 60, 189, 247}, { 8, 141, 200},
+ { 1, 89, 134}, { 1, 32, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {111, 238, 253}, { 56, 198, 245}, { 1, 123, 208},
+ { 1, 93, 176}, { 1, 1, 73}, {128, 128, 128},
+ },
+ { // band 5
+ { 98, 251, 249}, { 56, 189, 244}, { 17, 113, 220},
+ { 1, 109, 179}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+#if CONFIG_TX64X64
+ { // TX_SIZE 4
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 51, 21, 156}, { 30, 23, 86}, { 4, 18, 37},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 38, 77, 129}, { 79, 76, 129}, { 40, 66, 117},
+ { 12, 54, 95}, { 1, 36, 60}, { 1, 17, 29},
+ },
+ { // band 2
+ { 44, 133, 149}, { 24, 107, 143}, { 8, 78, 121},
+ { 3, 59, 97}, { 1, 42, 71}, { 1, 22, 37},
+ },
+ { // band 3
+ { 29, 160, 171}, { 9, 114, 158}, { 1, 76, 125},
+ { 1, 54, 93}, { 1, 36, 63}, { 1, 20, 35},
+ },
+ { // band 4
+ { 22, 188, 205}, { 6, 132, 186}, { 1, 87, 144},
+ { 1, 62, 107}, { 1, 41, 72}, { 1, 23, 41},
+ },
+ { // band 5
+ { 25, 233, 236}, { 5, 165, 214}, { 1, 96, 158},
+ { 1, 63, 112}, { 1, 40, 73}, { 1, 23, 40},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 48, 20, 231}, { 37, 21, 179}, { 15, 18, 109},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 41, 154, 216}, {196, 142, 221}, {131, 125, 207},
+ { 84, 111, 181}, { 45, 91, 142}, { 27, 62, 89},
+ },
+ { // band 2
+ { 72, 181, 230}, { 41, 147, 215}, { 10, 102, 173},
+ { 3, 73, 132}, { 1, 47, 89}, { 1, 23, 50},
+ },
+ { // band 3
+ { 60, 201, 236}, { 23, 157, 219}, { 2, 99, 167},
+ { 1, 69, 124}, { 1, 43, 80}, { 1, 22, 39},
+ },
+ { // band 4
+ { 53, 214, 242}, { 15, 165, 224}, { 1, 101, 173},
+ { 1, 70, 131}, { 1, 44, 83}, { 1, 23, 49},
+ },
+ { // band 5
+ { 39, 239, 248}, { 7, 186, 233}, { 1, 108, 174},
+ { 1, 70, 123}, { 1, 43, 77}, { 1, 16, 42},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {161, 26, 204}, { 77, 40, 160}, { 26, 50, 117},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 80, 140, 218}, {136, 133, 215}, { 63, 117, 197},
+ { 20, 93, 170}, { 7, 55, 102}, { 13, 32, 52},
+ },
+ { // band 2
+ { 86, 173, 231}, { 46, 150, 220}, { 18, 118, 190},
+ { 8, 90, 150}, { 2, 60, 95}, { 1, 39, 41},
+ },
+ { // band 3
+ { 80, 183, 242}, { 37, 160, 231}, { 6, 120, 182},
+ { 1, 86, 137}, { 1, 46, 78}, { 1, 15, 24},
+ },
+ { // band 4
+ { 88, 215, 247}, { 42, 179, 235}, { 4, 116, 182},
+ { 2, 80, 133}, { 1, 46, 85}, { 1, 64, 43},
+ },
+ { // band 5
+ {100, 236, 250}, { 31, 186, 234}, { 1, 114, 181},
+ { 1, 85, 135}, { 1, 78, 64}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {213, 13, 245}, {106, 16, 211}, { 32, 11, 156},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {140, 214, 247}, {241, 186, 243}, {177, 172, 235},
+ {128, 156, 219}, {106, 130, 191}, { 99, 105, 152},
+ },
+ { // band 2
+ {125, 218, 248}, { 75, 167, 239}, { 29, 111, 212},
+ { 6, 66, 152}, { 1, 42, 96}, { 1, 85, 128},
+ },
+ { // band 3
+ {120, 232, 252}, { 60, 189, 247}, { 8, 141, 200},
+ { 1, 89, 134}, { 1, 32, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {111, 238, 253}, { 56, 198, 245}, { 1, 123, 208},
+ { 1, 93, 176}, { 1, 1, 73}, {128, 128, 128},
+ },
+ { // band 5
+ { 98, 251, 249}, { 56, 189, 244}, { 17, 113, 220},
+ { 1, 109, 179}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+#endif // CONFIG_TX64X64
+ },
+ { // Q_Index 3
+#if CONFIG_CB4X4
+ { // TX_SIZE 0
+ { // Y plane
+ { // Intra
+ { // band 0
+ {186, 16, 200}, {122, 31, 187}, { 78, 40, 161},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {146, 119, 245}, {182, 115, 244}, {130, 113, 238},
+ { 88, 110, 225}, { 47, 103, 208}, { 5, 102, 188},
+ },
+ { // band 2
+ {164, 157, 248}, {155, 141, 250}, { 71, 116, 243},
+ { 88, 129, 233}, { 50, 99, 228}, { 26, 148, 191},
+ },
+ { // band 3
+ {200, 158, 253}, {177, 118, 252}, { 99, 113, 245},
+ { 77, 120, 210}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {231, 104, 254}, {209, 82, 254}, {143, 112, 252},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {250, 36, 254}, {243, 55, 254}, {223, 170, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+      { // Inter
+ { // band 0
+ {207, 37, 226}, {164, 46, 218}, {122, 58, 201},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {149, 154, 253}, {170, 137, 253}, { 94, 123, 247},
+ { 42, 113, 222}, { 16, 97, 174}, { 49, 98, 159},
+ },
+ { // band 2
+ {177, 162, 253}, {165, 142, 252}, { 51, 108, 243},
+ { 18, 108, 213}, { 1, 98, 254}, {128, 128, 128},
+ },
+ { // band 3
+ {211, 152, 254}, {184, 116, 254}, { 70, 110, 244},
+ { 8, 108, 237}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {236, 89, 254}, {210, 67, 254}, {112, 111, 248},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {246, 26, 254}, {233, 35, 254}, {128, 1, 254},
+ {254, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {247, 2, 247}, {226, 8, 242}, {191, 14, 235},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {231, 94, 254}, {248, 91, 254}, {186, 89, 252},
+ {128, 92, 244}, { 79, 112, 254}, {128, 128, 128},
+ },
+ { // band 2
+ {228, 145, 253}, {240, 130, 254}, {223, 105, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {245, 153, 253}, {240, 120, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {254, 128, 254}, {204, 128, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {253, 7, 249}, {224, 9, 244}, {182, 13, 231},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {234, 109, 254}, {242, 104, 254}, {160, 98, 254},
+ {123, 85, 243}, { 82, 43, 217}, {128, 128, 128},
+ },
+ { // band 2
+ {243, 137, 254}, {240, 118, 254}, {136, 53, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {251, 173, 254}, {229, 129, 250}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {254, 119, 254}, {254, 128, 128}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+#endif // CONFIG_CB4X4
+ { // TX_SIZE 0
+ { // Y plane
+ { // Intra
+ { // band 0
+ {186, 16, 200}, {122, 31, 187}, { 78, 40, 161},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {146, 119, 245}, {182, 115, 244}, {130, 113, 238},
+ { 88, 110, 225}, { 47, 103, 208}, { 5, 102, 188},
+ },
+ { // band 2
+ {164, 157, 248}, {155, 141, 250}, { 71, 116, 243},
+ { 88, 129, 233}, { 50, 99, 228}, { 26, 148, 191},
+ },
+ { // band 3
+ {200, 158, 253}, {177, 118, 252}, { 99, 113, 245},
+ { 77, 120, 210}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {231, 104, 254}, {209, 82, 254}, {143, 112, 252},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {250, 36, 254}, {243, 55, 254}, {223, 170, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+      { // Inter
+ { // band 0
+ {207, 37, 226}, {164, 46, 218}, {122, 58, 201},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {149, 154, 253}, {170, 137, 253}, { 94, 123, 247},
+ { 42, 113, 222}, { 16, 97, 174}, { 49, 98, 159},
+ },
+ { // band 2
+ {177, 162, 253}, {165, 142, 252}, { 51, 108, 243},
+ { 18, 108, 213}, { 1, 98, 254}, {128, 128, 128},
+ },
+ { // band 3
+ {211, 152, 254}, {184, 116, 254}, { 70, 110, 244},
+ { 8, 108, 237}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {236, 89, 254}, {210, 67, 254}, {112, 111, 248},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {246, 26, 254}, {233, 35, 254}, {128, 1, 254},
+ {254, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {247, 2, 247}, {226, 8, 242}, {191, 14, 235},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {231, 94, 254}, {248, 91, 254}, {186, 89, 252},
+ {128, 92, 244}, { 79, 112, 254}, {128, 128, 128},
+ },
+ { // band 2
+ {228, 145, 253}, {240, 130, 254}, {223, 105, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {245, 153, 253}, {240, 120, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {254, 128, 254}, {204, 128, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {253, 7, 249}, {224, 9, 244}, {182, 13, 231},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {234, 109, 254}, {242, 104, 254}, {160, 98, 254},
+ {123, 85, 243}, { 82, 43, 217}, {128, 128, 128},
+ },
+ { // band 2
+ {243, 137, 254}, {240, 118, 254}, {136, 53, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {251, 173, 254}, {229, 129, 250}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {254, 119, 254}, {254, 128, 128}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 1
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 49, 26, 159}, { 36, 34, 150}, { 26, 38, 124},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 99, 122, 226}, {143, 119, 225}, { 90, 113, 213},
+ { 46, 102, 193}, { 14, 84, 157}, { 3, 59, 107},
+ },
+ { // band 2
+ {109, 164, 237}, { 74, 142, 233}, { 29, 112, 216},
+ { 14, 92, 184}, { 10, 80, 156}, { 1, 52, 137},
+ },
+ { // band 3
+ {110, 191, 245}, { 59, 156, 240}, { 18, 121, 220},
+ { 8, 97, 184}, { 3, 84, 150}, {128, 128, 128},
+ },
+ { // band 4
+ {115, 203, 250}, { 59, 167, 246}, { 16, 130, 226},
+ { 7, 97, 192}, { 1, 71, 99}, {128, 128, 128},
+ },
+ { // band 5
+ {149, 218, 253}, { 93, 171, 251}, { 28, 125, 233},
+ { 28, 99, 192}, {128, 85, 85}, {128, 128, 128},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 97, 45, 229}, { 79, 52, 205}, { 46, 58, 171},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 99, 180, 249}, {156, 165, 249}, { 73, 141, 237},
+ { 31, 116, 208}, { 13, 81, 153}, { 5, 42, 86},
+ },
+ { // band 2
+ {113, 188, 251}, { 68, 161, 244}, { 16, 108, 216},
+ { 6, 81, 168}, { 2, 65, 118}, {128, 1, 1},
+ },
+ { // band 3
+ {117, 201, 252}, { 62, 171, 248}, { 12, 119, 221},
+ { 5, 90, 182}, { 4, 66, 116}, {128, 128, 128},
+ },
+ { // band 4
+ {128, 207, 253}, { 70, 176, 251}, { 11, 126, 228},
+ { 6, 89, 189}, { 1, 44, 148}, {128, 128, 128},
+ },
+ { // band 5
+ {162, 218, 254}, {107, 170, 253}, { 22, 131, 238},
+ { 1, 77, 182}, { 1, 254, 128}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {235, 5, 238}, {194, 14, 223}, {152, 22, 205},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {200, 121, 251}, {241, 115, 252}, {167, 108, 248},
+ { 93, 93, 233}, { 36, 66, 189}, {128, 128, 128},
+ },
+ { // band 2
+ {220, 151, 253}, {176, 135, 252}, { 95, 124, 254},
+ { 64, 105, 217}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {225, 189, 254}, {175, 155, 254}, {102, 119, 254},
+ { 1, 1, 1}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {218, 195, 254}, {125, 157, 253}, {128, 128, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {221, 197, 254}, { 85, 210, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {250, 9, 246}, {204, 13, 234}, {144, 18, 211},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {213, 157, 253}, {243, 138, 253}, {170, 117, 250},
+ {109, 91, 233}, { 66, 77, 163}, { 64, 85, 254},
+ },
+ { // band 2
+ {221, 169, 254}, {182, 141, 253}, {112, 120, 239},
+ { 85, 165, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {226, 192, 254}, {189, 174, 251}, {153, 128, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {232, 192, 254}, {195, 187, 247}, { 1, 191, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {247, 185, 254}, {254, 93, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 2
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 14, 30, 136}, { 15, 33, 120}, { 10, 33, 90},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 92, 109, 209}, {113, 108, 207}, { 77, 102, 193},
+ { 39, 91, 171}, { 11, 70, 129}, { 2, 44, 77},
+ },
+ { // band 2
+ { 99, 158, 223}, { 66, 135, 217}, { 23, 109, 194},
+ { 9, 85, 160}, { 3, 66, 124}, { 1, 51, 100},
+ },
+ { // band 3
+ { 89, 189, 234}, { 46, 149, 225}, { 10, 110, 194},
+ { 2, 83, 156}, { 1, 57, 113}, { 1, 47, 73},
+ },
+ { // band 4
+ { 78, 206, 242}, { 28, 161, 232}, { 3, 114, 200},
+ { 1, 86, 161}, { 1, 62, 118}, { 1, 1, 1},
+ },
+ { // band 5
+ { 72, 227, 250}, { 20, 182, 242}, { 3, 126, 210},
+ { 2, 91, 166}, { 1, 64, 126}, {128, 128, 128},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 23, 42, 227}, { 41, 43, 195}, { 25, 45, 146},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {100, 172, 245}, {165, 158, 246}, { 88, 137, 234},
+ { 44, 116, 203}, { 18, 85, 149}, { 7, 56, 92},
+ },
+ { // band 2
+ {117, 188, 247}, { 70, 155, 239}, { 18, 105, 204},
+ { 7, 78, 158}, { 2, 50, 111}, { 1, 38, 77},
+ },
+ { // band 3
+ {104, 207, 250}, { 54, 166, 241}, { 6, 110, 199},
+ { 1, 78, 155}, { 1, 45, 100}, { 1, 1, 1},
+ },
+ { // band 4
+ { 87, 216, 251}, { 30, 177, 243}, { 1, 114, 203},
+ { 1, 85, 157}, { 1, 53, 108}, {128, 128, 128},
+ },
+ { // band 5
+ { 80, 230, 253}, { 23, 193, 248}, { 1, 127, 215},
+ { 1, 94, 170}, { 1, 71, 59}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {222, 9, 234}, {161, 20, 210}, {113, 30, 185},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {195, 120, 248}, {231, 124, 247}, {148, 116, 238},
+ { 64, 98, 207}, { 20, 70, 147}, { 87, 68, 100},
+ },
+ { // band 2
+ {186, 161, 250}, {124, 148, 245}, { 44, 123, 230},
+ { 23, 107, 205}, { 1, 80, 131}, {128, 128, 128},
+ },
+ { // band 3
+ {172, 196, 252}, {110, 160, 248}, { 37, 134, 235},
+ { 23, 125, 200}, {128, 254, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {173, 209, 253}, {103, 175, 250}, { 1, 120, 240},
+ { 1, 146, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {184, 235, 254}, { 81, 186, 251}, {128, 109, 254},
+ {128, 254, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {248, 8, 243}, {185, 11, 225}, {108, 11, 189},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {208, 158, 254}, {244, 147, 252}, {195, 132, 248},
+ {161, 122, 224}, {129, 114, 188}, { 59, 119, 159},
+ },
+ { // band 2
+ {202, 182, 253}, {143, 161, 251}, { 73, 115, 247},
+ {146, 175, 204}, {128, 1, 254}, {128, 128, 128},
+ },
+ { // band 3
+ {202, 204, 254}, {131, 174, 251}, { 18, 153, 207},
+ {128, 254, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {192, 221, 254}, {114, 190, 254}, {128, 170, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {166, 236, 254}, {119, 200, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 3
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 30, 32, 144}, { 21, 35, 96}, { 4, 27, 55},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 35, 107, 172}, { 61, 104, 170}, { 33, 94, 160},
+ { 13, 80, 139}, { 2, 55, 97}, { 1, 28, 49},
+ },
+ { // band 2
+ { 51, 153, 195}, { 29, 129, 189}, { 9, 99, 163},
+ { 3, 75, 129}, { 1, 49, 88}, { 1, 29, 50},
+ },
+ { // band 3
+ { 53, 164, 210}, { 21, 134, 201}, { 3, 97, 164},
+ { 1, 69, 124}, { 1, 45, 82}, { 1, 31, 58},
+ },
+ { // band 4
+ { 47, 205, 234}, { 18, 158, 220}, { 2, 109, 177},
+ { 1, 78, 137}, { 1, 53, 101}, { 1, 34, 70},
+ },
+ { // band 5
+ { 55, 233, 245}, { 16, 179, 233}, { 1, 116, 191},
+ { 1, 79, 145}, { 1, 53, 101}, { 1, 37, 58},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 36, 33, 227}, { 39, 28, 190}, { 18, 27, 134},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 76, 156, 235}, {184, 147, 235}, {114, 130, 220},
+ { 72, 112, 191}, { 42, 87, 144}, { 21, 65, 93},
+ },
+ { // band 2
+ { 96, 179, 240}, { 51, 149, 228}, { 12, 105, 191},
+ { 6, 74, 148}, { 1, 47, 100}, { 1, 29, 53},
+ },
+ { // band 3
+ { 88, 191, 242}, { 35, 154, 231}, { 3, 106, 187},
+ { 1, 74, 140}, { 1, 41, 84}, { 1, 25, 38},
+ },
+ { // band 4
+ { 77, 212, 249}, { 28, 171, 239}, { 2, 117, 199},
+ { 1, 79, 151}, { 1, 45, 99}, { 1, 1, 1},
+ },
+ { // band 5
+ { 77, 236, 252}, { 27, 190, 246}, { 2, 120, 203},
+ { 1, 78, 147}, { 1, 42, 72}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {185, 11, 227}, {113, 30, 182}, { 57, 44, 144},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {151, 139, 244}, {212, 139, 241}, {124, 126, 231},
+ { 59, 104, 213}, { 26, 73, 158}, { 20, 45, 95},
+ },
+ { // band 2
+ {155, 163, 247}, {108, 152, 239}, { 39, 124, 214},
+ { 7, 109, 162}, { 29, 57, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {158, 176, 250}, { 89, 164, 243}, { 11, 114, 196},
+ { 1, 96, 141}, { 1, 81, 118}, {128, 1, 1},
+ },
+ { // band 4
+ {148, 212, 251}, { 59, 174, 240}, { 2, 130, 203},
+ { 1, 70, 168}, { 1, 51, 106}, {128, 128, 128},
+ },
+ { // band 5
+ {104, 237, 252}, { 39, 190, 246}, { 1, 154, 220},
+ {128, 102, 1}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {236, 6, 242}, {111, 6, 206}, { 36, 5, 161},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {193, 193, 252}, {248, 182, 251}, {218, 150, 246},
+ {182, 134, 244}, {151, 137, 227}, { 45, 102, 195},
+ },
+ { // band 2
+ {188, 202, 251}, {125, 165, 249}, { 64, 75, 218},
+ { 1, 128, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {178, 225, 254}, {107, 188, 231}, { 21, 135, 233},
+ {128, 1, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {164, 227, 253}, { 55, 193, 251}, { 1, 111, 225},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {151, 243, 254}, { 50, 203, 254}, {128, 179, 254},
+ {128, 1, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+#if CONFIG_TX64X64
+ { // TX_SIZE 4
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 30, 32, 144}, { 21, 35, 96}, { 4, 27, 55},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 35, 107, 172}, { 61, 104, 170}, { 33, 94, 160},
+ { 13, 80, 139}, { 2, 55, 97}, { 1, 28, 49},
+ },
+ { // band 2
+ { 51, 153, 195}, { 29, 129, 189}, { 9, 99, 163},
+ { 3, 75, 129}, { 1, 49, 88}, { 1, 29, 50},
+ },
+ { // band 3
+ { 53, 164, 210}, { 21, 134, 201}, { 3, 97, 164},
+ { 1, 69, 124}, { 1, 45, 82}, { 1, 31, 58},
+ },
+ { // band 4
+ { 47, 205, 234}, { 18, 158, 220}, { 2, 109, 177},
+ { 1, 78, 137}, { 1, 53, 101}, { 1, 34, 70},
+ },
+ { // band 5
+ { 55, 233, 245}, { 16, 179, 233}, { 1, 116, 191},
+ { 1, 79, 145}, { 1, 53, 101}, { 1, 37, 58},
+ },
+ },
+      { // Inter
+ { // band 0
+ { 36, 33, 227}, { 39, 28, 190}, { 18, 27, 134},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 76, 156, 235}, {184, 147, 235}, {114, 130, 220},
+ { 72, 112, 191}, { 42, 87, 144}, { 21, 65, 93},
+ },
+ { // band 2
+ { 96, 179, 240}, { 51, 149, 228}, { 12, 105, 191},
+ { 6, 74, 148}, { 1, 47, 100}, { 1, 29, 53},
+ },
+ { // band 3
+ { 88, 191, 242}, { 35, 154, 231}, { 3, 106, 187},
+ { 1, 74, 140}, { 1, 41, 84}, { 1, 25, 38},
+ },
+ { // band 4
+ { 77, 212, 249}, { 28, 171, 239}, { 2, 117, 199},
+ { 1, 79, 151}, { 1, 45, 99}, { 1, 1, 1},
+ },
+ { // band 5
+ { 77, 236, 252}, { 27, 190, 246}, { 2, 120, 203},
+ { 1, 78, 147}, { 1, 42, 72}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+      { // Intra
+ { // band 0
+ {185, 11, 227}, {113, 30, 182}, { 57, 44, 144},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {151, 139, 244}, {212, 139, 241}, {124, 126, 231},
+ { 59, 104, 213}, { 26, 73, 158}, { 20, 45, 95},
+ },
+ { // band 2
+ {155, 163, 247}, {108, 152, 239}, { 39, 124, 214},
+ { 7, 109, 162}, { 29, 57, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {158, 176, 250}, { 89, 164, 243}, { 11, 114, 196},
+ { 1, 96, 141}, { 1, 81, 118}, {128, 1, 1},
+ },
+ { // band 4
+ {148, 212, 251}, { 59, 174, 240}, { 2, 130, 203},
+ { 1, 70, 168}, { 1, 51, 106}, {128, 128, 128},
+ },
+ { // band 5
+ {104, 237, 252}, { 39, 190, 246}, { 1, 154, 220},
+ {128, 102, 1}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {236, 6, 242}, {111, 6, 206}, { 36, 5, 161},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {193, 193, 252}, {248, 182, 251}, {218, 150, 246},
+ {182, 134, 244}, {151, 137, 227}, { 45, 102, 195},
+ },
+ { // band 2
+ {188, 202, 251}, {125, 165, 249}, { 64, 75, 218},
+ { 1, 128, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {178, 225, 254}, {107, 188, 231}, { 21, 135, 233},
+ {128, 1, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {164, 227, 253}, { 55, 193, 251}, { 1, 111, 225},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {151, 243, 254}, { 50, 203, 254}, {128, 179, 254},
+ {128, 1, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+#endif // CONFIG_TX64X64
+ },
+};
+#else // CONFIG_Q_ADAPT_PROBS
+#if CONFIG_NEW_TOKENSET
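+// Default (non-adaptive) coefficient model probabilities, indexed as
+// [plane type][intra/inter][coefficient band][context].  Each triple holds
+// three 8-bit node probabilities (a value p codes a probability of p/256,
+// so 128 is an even split); in the adaptive tables above, {128, 128, 128}
+// appears to fill context slots that are never reached.  This reading
+// assumes the av1_coeff_probs_model layout declared in entropy.h.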
+static const av1_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ {97, 27, 144}, {81, 38, 128}, {51, 42, 99}
+ },
+ { // Band 1
+ {74, 113, 204}, {68, 101, 199}, {50, 87, 173},
+ {31, 76, 133}, {13, 55, 86}, {3, 30, 39}
+ },
+ { // Band 2
+ {83, 156, 222}, {74, 127, 215}, {46, 101, 179},
+ {30, 80, 129}, {14, 57, 81}, {3, 27, 37}
+ },
+ { // Band 3
+ {105, 164, 233}, {84, 128, 224}, {49, 92, 175},
+ {28, 60, 114}, {12, 34, 53}, {20, 59, 98}
+ },
+ { // Band 4
+ {131, 159, 243}, {98, 123, 228}, {40, 78, 151},
+ {19, 46, 97}, {13, 47, 19}, {19, 16, 19}
+ },
+ { // Band 5
+ {192, 71, 241}, {174, 70, 226}, {125, 46, 153},
+ {108, 49, 116}, {82, 24, 46}, {60, 14, 30}
+ }
+ },
+ { // Inter
+ { // Band 0
+ {111, 66, 218}, {107, 87, 211}, {93, 99, 207}
+ },
+ { // Band 1
+ {107, 166, 250}, {107, 143, 247}, {73, 119, 221},
+ {43, 91, 166}, {17, 74, 102}, {3, 70, 53}
+ },
+ { // Band 2
+ {126, 177, 251}, {109, 148, 246}, {64, 99, 204},
+ {42, 68, 140}, {28, 52, 84}, {20, 34, 1}
+ },
+ { // Band 3
+ {143, 178, 252}, {114, 144, 245}, {46, 92, 188},
+ {45, 65, 104}, {40, 44, 76}, {1, 1, 1}
+ },
+ { // Band 4
+ {163, 159, 251}, {120, 131, 243}, {47, 81, 182},
+ {32, 39, 128}, {33, 44, 56}, {1, 17, 34}
+ },
+ { // Band 5
+ {209, 94, 251}, {190, 81, 241}, {139, 45, 147},
+ {123, 35, 73}, {118, 1, 118}, {3, 16, 42}
+ }
+ }
+ },
+ { // UV plane
+ { // Intra
+ { // Band 0
+ {189, 37, 229}, {145, 68, 205}, {99, 74, 171}
+ },
+ { // Band 1
+ {153, 139, 242}, {135, 125, 235}, {84, 100, 200},
+ {49, 75, 162}, {9, 21, 84}, {3, 31, 69}
+ },
+ { // Band 2
+ {165, 165, 244}, {128, 144, 240}, {68, 94, 204},
+ {39, 72, 132}, {22, 44, 93}, {26, 73, 26}
+ },
+ { // Band 3
+ {181, 174, 246}, {142, 132, 241}, {81, 96, 212},
+ {41, 70, 166}, {9, 48, 92}, {1, 19, 38}
+ },
+ { // Band 4
+ {197, 159, 251}, {168, 121, 245}, {107, 75, 218},
+ {70, 43, 158}, {1, 128, 1}, {1, 18, 37}
+ },
+ { // Band 5
+ {231, 79, 255}, {211, 74, 249}, {157, 104, 210},
+ {128, 102, 213}, {12, 34, 96}, {2, 20, 47}
+ }
+ },
+ { // Inter
+ { // Band 0
+ {220, 53, 252}, {191, 80, 248}, {154, 100, 245}
+ },
+ { // Band 1
+ {205, 153, 255}, {182, 147, 254}, {110, 131, 231},
+ {68, 114, 161}, {50, 114, 140}, {1, 33, 57}
+ },
+ { // Band 2
+ {213, 171, 255}, {184, 163, 254}, {116, 104, 235},
+ {79, 71, 207}, {1, 41, 79}, {1, 20, 39}
+ },
+ { // Band 3
+ {223, 158, 255}, {203, 137, 255}, {111, 142, 244},
+ {2, 255, 133}, {1, 44, 85}, {1, 22, 47}
+ },
+ { // Band 4
+ {232, 148, 255}, {222, 123, 255}, {255, 128, 255},
+ {3, 61, 124}, {1, 41, 84}, {1, 21, 52}
+ },
+ { // Band 5
+ {248, 92, 255}, {248, 96, 255}, {69, 58, 184},
+ {31, 44, 137}, {14, 38, 105}, {8, 23, 61}
+ }
+ }
+ }
+};
+static const av1_coeff_probs_model default_coef_probs_8x8[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ {112, 31, 159}, {72, 37, 119}, {22, 35, 68}
+ },
+ { // Band 1
+ {42, 109, 174}, {45, 99, 172}, {32, 84, 149},
+ {18, 69, 119}, {6, 46, 76}, {1, 19, 31}
+ },
+ { // Band 2
+ {40, 154, 202}, {35, 126, 191}, {19, 98, 160},
+ {10, 75, 122}, {5, 53, 82}, {1, 23, 39}
+ },
+ { // Band 3
+ {39, 176, 215}, {28, 135, 200}, {11, 93, 156},
+ {5, 63, 109}, {1, 36, 64}, {1, 14, 26}
+ },
+ { // Band 4
+ {41, 191, 230}, {25, 147, 212}, {9, 97, 160},
+ {3, 65, 109}, {1, 33, 58}, {1, 14, 20}
+ },
+ { // Band 5
+ {68, 203, 242}, {40, 159, 220}, {12, 97, 153},
+ {5, 58, 97}, {1, 29, 55}, {1, 11, 18}
+ }
+ },
+ { // Inter
+ { // Band 0
+ {99, 67, 221}, {86, 80, 204}, {60, 87, 184}
+ },
+ { // Band 1
+ {73, 169, 246}, {79, 158, 242}, {50, 135, 220},
+ {30, 113, 181}, {18, 76, 126}, {5, 54, 85}
+ },
+ { // Band 2
+ {90, 184, 250}, {78, 162, 243}, {47, 118, 214},
+ {35, 85, 171}, {32, 53, 115}, {20, 28, 76}
+ },
+ { // Band 3
+ {109, 197, 252}, {89, 172, 247}, {52, 119, 217},
+ {37, 80, 161}, {23, 44, 100}, {1, 18, 34}
+ },
+ { // Band 4
+ {132, 202, 254}, {110, 175, 251}, {63, 128, 228},
+ {37, 86, 168}, {64, 91, 102}, {1, 17, 34}
+ },
+ { // Band 5
+ {126, 204, 253}, {100, 174, 250}, {50, 148, 237},
+ {25, 90, 133}, {1, 64, 85}, {3, 16, 42}
+ }
+ }
+ },
+ { // UV plane
+ { // Intra
+ { // Band 0
+ {195, 35, 235}, {137, 63, 201}, {62, 70, 145}
+ },
+ { // Band 1
+ {110, 158, 233}, {102, 143, 227}, {60, 120, 199},
+ {30, 85, 156}, {9, 50, 90}, {1, 16, 33}
+ },
+ { // Band 2
+ {102, 185, 233}, {71, 152, 224}, {29, 111, 187},
+ {18, 74, 138}, {4, 56, 87}, {1, 18, 46}
+ },
+ { // Band 3
+ {101, 205, 239}, {66, 161, 229}, {23, 109, 183},
+ {9, 85, 135}, {5, 71, 142}, {1, 1, 102}
+ },
+ { // Band 4
+ {109, 216, 243}, {69, 168, 233}, {23, 119, 191},
+ {8, 137, 115}, {1, 54, 98}, {1, 1, 255}
+ },
+ { // Band 5
+ {139, 224, 249}, {98, 176, 238}, {55, 129, 187},
+ {25, 101, 131}, {26, 59, 154}, {2, 20, 47}
+ }
+ },
+ { // Inter
+ { // Band 0
+ {220, 72, 254}, {176, 108, 251}, {114, 132, 247}
+ },
+ { // Band 1
+ {161, 185, 255}, {141, 185, 254}, {131, 180, 249},
+ {111, 164, 186}, {50, 98, 142}, {1, 128, 1}
+ },
+ { // Band 2
+ {171, 195, 255}, {133, 184, 254}, {68, 140, 231},
+ {102, 96, 205}, {1, 1, 128}, {1, 20, 39}
+ },
+ { // Band 3
+ {180, 206, 255}, {148, 191, 254}, {83, 157, 241},
+ {128, 171, 128}, {1, 44, 85}, {1, 22, 47}
+ },
+ { // Band 4
+ {194, 214, 255}, {159, 188, 255}, {122, 148, 250},
+ {3, 255, 124}, {1, 41, 84}, {1, 21, 52}
+ },
+ { // Band 5
+ {231, 217, 255}, {209, 149, 255}, {205, 145, 205},
+ {31, 44, 137}, {14, 38, 105}, {8, 23, 61}
+ }
+ }
+ }
+};
+static const av1_coeff_probs_model default_coef_probs_16x16[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ {91, 31, 117}, {49, 31, 89}, {14, 25, 48}
+ },
+ { // Band 1
+ {31, 97, 151}, {33, 89, 148}, {28, 76, 133},
+ {17, 60, 106}, {7, 42, 72}, {1, 19, 32}
+ },
+ { // Band 2
+ {28, 152, 182}, {28, 120, 174}, {15, 93, 146},
+ {9, 72, 116}, {5, 47, 82}, {1, 21, 37}
+ },
+ { // Band 3
+ {29, 174, 203}, {23, 127, 187}, {9, 89, 145},
+ {2, 56, 100}, {1, 31, 56}, {1, 12, 25}
+ },
+ { // Band 4
+ {28, 193, 220}, {17, 141, 197}, {4, 87, 142},
+ {1, 54, 95}, {1, 31, 56}, {1, 12, 26}
+ },
+ { // Band 5
+ {29, 221, 240}, {11, 167, 215}, {2, 93, 149},
+ {1, 58, 100}, {1, 35, 61}, {1, 16, 28}
+ }
+ },
+ { // Inter
+ { // Band 0
+ {108, 52, 214}, {84, 60, 186}, {45, 69, 161}
+ },
+ { // Band 1
+ {43, 164, 236}, {57, 161, 233}, {38, 146, 214},
+ {24, 120, 182}, {15, 80, 126}, {5, 28, 66}
+ },
+ { // Band 2
+ {58, 187, 242}, {47, 163, 234}, {28, 118, 204},
+ {26, 82, 165}, {21, 54, 112}, {4, 28, 55}
+ },
+ { // Band 3
+ {65, 201, 248}, {51, 170, 239}, {22, 117, 204},
+ {11, 81, 159}, {10, 43, 102}, {1, 1, 1}
+ },
+ { // Band 4
+ {80, 206, 252}, {57, 179, 245}, {25, 129, 214},
+ {16, 97, 170}, {6, 60, 130}, {1, 128, 1}
+ },
+ { // Band 5
+ {97, 217, 253}, {68, 186, 250}, {26, 138, 216},
+ {20, 105, 166}, {11, 78, 111}, {3, 16, 42}
+ }
+ }
+ },
+ { // UV plane
+ { // Intra
+ { // Band 0
+ {181, 37, 233}, {121, 55, 192}, {46, 52, 124}
+ },
+ { // Band 1
+ {108, 157, 221}, {98, 140, 215}, {59, 124, 187},
+ {34, 92, 158}, {9, 68, 112}, {1, 41, 70}
+ },
+ { // Band 2
+ {80, 188, 223}, {46, 153, 204}, {25, 91, 173},
+ {11, 73, 131}, {5, 43, 82}, {1, 17, 91}
+ },
+ { // Band 3
+ {63, 209, 228}, {31, 157, 206}, {8, 104, 167},
+ {3, 63, 122}, {1, 44, 87}, {1, 43, 51}
+ },
+ { // Band 4
+ {52, 220, 234}, {22, 165, 216}, {4, 104, 163},
+ {2, 62, 129}, {1, 33, 50}, {1, 26, 28}
+ },
+ { // Band 5
+ {58, 238, 242}, {24, 183, 224}, {4, 109, 172},
+ {2, 87, 141}, {1, 52, 79}, {1, 51, 64}
+ }
+ },
+ { // Inter
+ { // Band 0
+ {224, 52, 250}, {188, 81, 239}, {138, 114, 228}
+ },
+ { // Band 1
+ {131, 206, 255}, {128, 193, 254}, {119, 173, 247},
+ {106, 127, 187}, {50, 100, 124}, {1, 96, 1}
+ },
+ { // Band 2
+ {123, 214, 254}, {86, 194, 254}, {64, 119, 221},
+ {43, 51, 128}, {1, 32, 110}, {1, 20, 39}
+ },
+ { // Band 3
+ {115, 223, 255}, {78, 200, 254}, {75, 164, 203},
+ {128, 85, 255}, {1, 44, 85}, {1, 22, 47}
+ },
+ { // Band 4
+ {132, 226, 255}, {88, 207, 254}, {20, 140, 225},
+ {3, 61, 124}, {1, 41, 84}, {1, 21, 52}
+ },
+ { // Band 5
+ {180, 236, 255}, {138, 223, 254}, {73, 166, 238},
+ {31, 255, 137}, {14, 38, 105}, {8, 23, 61}
+ }
+ }
+ }
+};
+static const av1_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ {163, 26, 188}, {78, 29, 105}, {22, 22, 48}
+ },
+ { // Band 1
+ {72, 93, 168}, {74, 91, 170}, {62, 72, 151},
+ {37, 55, 112}, {10, 33, 63}, {1, 14, 23}
+ },
+ { // Band 2
+ {41, 163, 182}, {36, 136, 177}, {20, 102, 153},
+ {10, 76, 114}, {5, 45, 71}, {1, 17, 27}
+ },
+ { // Band 3
+ {43, 202, 213}, {28, 142, 193}, {10, 90, 141},
+ {2, 51, 93}, {1, 24, 48}, {1, 10, 19}
+ },
+ { // Band 4
+ {46, 216, 220}, {26, 150, 199}, {7, 87, 136},
+ {2, 49, 86}, {1, 28, 47}, {1, 12, 24}
+ },
+ { // Band 5
+ {19, 241, 237}, {5, 172, 200}, {1, 82, 126},
+ {1, 47, 79}, {1, 29, 47}, {1, 14, 25}
+ }
+ },
+ { // Inter
+ { // Band 0
+ {185, 20, 226}, {151, 26, 187}, {109, 34, 144}
+ },
+ { // Band 1
+ {56, 151, 227}, {76, 165, 232}, {62, 161, 222},
+ {47, 139, 201}, {29, 106, 150}, {14, 61, 98}
+ },
+ { // Band 2
+ {57, 200, 237}, {43, 164, 227}, {22, 106, 190},
+ {14, 68, 140}, {10, 48, 90}, {1, 15, 40}
+ },
+ { // Band 3
+ {46, 209, 238}, {28, 165, 225}, {7, 107, 180},
+ {2, 69, 125}, {2, 36, 94}, {1, 1, 1}
+ },
+ { // Band 4
+ {55, 225, 248}, {28, 181, 237}, {7, 117, 198},
+ {6, 77, 144}, {3, 60, 90}, {1, 1, 1}
+ },
+ { // Band 5
+ {63, 243, 251}, {27, 193, 242}, {4, 124, 200},
+ {1, 58, 153}, {1, 59, 124}, {3, 16, 42}
+ }
+ }
+ },
+ { // UV plane
+ { // Intra
+ { // Band 0
+ {208, 28, 218}, {183, 32, 188}, {169, 21, 189}
+ },
+ { // Band 1
+ {205, 124, 247}, {190, 96, 240}, {233, 89, 233},
+ {177, 44, 212}, {59, 58, 59}, {32, 33, 38}
+ },
+ { // Band 2
+ {194, 195, 250}, {179, 190, 226}, {32, 174, 128},
+ {32, 85, 128}, {12, 64, 122}, {1, 85, 90}
+ },
+ { // Band 3
+ {149, 232, 249}, {95, 159, 227}, {28, 91, 171},
+ {28, 102, 114}, {1, 1, 73}, {1, 19, 38}
+ },
+ { // Band 4
+ {154, 239, 246}, {138, 151, 235}, {1, 123, 138},
+ {128, 183, 255}, {1, 128, 1}, {1, 18, 37}
+ },
+ { // Band 5
+ {157, 255, 253}, {75, 171, 241}, {43, 102, 171},
+ {30, 44, 136}, {12, 34, 96}, {2, 20, 47}
+ }
+ },
+ { // Inter
+ { // Band 0
+ {249, 13, 248}, {238, 14, 220}, {225, 16, 174}
+ },
+ { // Band 1
+ {190, 189, 254}, {169, 134, 253}, {124, 179, 248},
+ {138, 131, 223}, {64, 133, 192}, {1, 85, 128}
+ },
+ { // Band 2
+ {139, 212, 254}, {126, 177, 255}, {93, 39, 186},
+ {1, 1, 171}, {1, 41, 79}, {1, 20, 39}
+ },
+ { // Band 3
+ {153, 216, 255}, {165, 204, 255}, {1, 1, 255},
+ {2, 73, 133}, {1, 1, 1}, {1, 22, 47}
+ },
+ { // Band 4
+ {147, 226, 254}, {119, 196, 255}, {1, 128, 255},
+ {1, 1, 171}, {1, 1, 1}, {1, 21, 52}
+ },
+ { // Band 5
+ {168, 240, 255}, {95, 179, 255}, {1, 171, 1},
+ {31, 44, 137}, {14, 38, 105}, {8, 23, 61}
+ }
+ }
+ }
+};
+#else // CONFIG_NEW_TOKENSET
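+// Legacy (pre-NEW_TOKENSET) tables in the VP9-style model: the three
+// values per context are the unconstrained node probabilities for the
+// EOB, zero and one decisions of the coefficient token tree, with the
+// rest of the tree derived from them via the Pareto tail model (cf.
+// av1_model_to_full_probs).  For example, the Y/intra/band-0 entry
+// { 195, 29, 183 } below codes node probabilities of 195/256, 29/256
+// and 183/256.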
+static const av1_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 195, 29, 183 }, { 84, 49, 136 }, { 8, 42, 71 }
+ }, { // Band 1
+ { 31, 107, 169 }, { 35, 99, 159 }, { 17, 82, 140 },
+ { 8, 66, 114 }, { 2, 44, 76 }, { 1, 19, 32 }
+ }, { // Band 2
+ { 40, 132, 201 }, { 29, 114, 187 }, { 13, 91, 157 },
+ { 7, 75, 127 }, { 3, 58, 95 }, { 1, 28, 47 }
+ }, { // Band 3
+ { 69, 142, 221 }, { 42, 122, 201 }, { 15, 91, 159 },
+ { 6, 67, 121 }, { 1, 42, 77 }, { 1, 17, 31 }
+ }, { // Band 4
+ { 102, 148, 228 }, { 67, 117, 204 }, { 17, 82, 154 },
+ { 6, 59, 114 }, { 2, 39, 75 }, { 1, 15, 29 }
+ }, { // Band 5
+ { 156, 57, 233 }, { 119, 57, 212 }, { 58, 48, 163 },
+ { 29, 40, 124 }, { 12, 30, 81 }, { 3, 12, 31 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 191, 107, 226 }, { 124, 117, 204 }, { 25, 99, 155 }
+ }, { // Band 1
+ { 29, 148, 210 }, { 37, 126, 194 }, { 8, 93, 157 },
+ { 2, 68, 118 }, { 1, 39, 69 }, { 1, 17, 33 }
+ }, { // Band 2
+ { 41, 151, 213 }, { 27, 123, 193 }, { 3, 82, 144 },
+ { 1, 58, 105 }, { 1, 32, 60 }, { 1, 13, 26 }
+ }, { // Band 3
+ { 59, 159, 220 }, { 23, 126, 198 }, { 4, 88, 151 },
+ { 1, 66, 114 }, { 1, 38, 71 }, { 1, 18, 34 }
+ }, { // Band 4
+ { 114, 136, 232 }, { 51, 114, 207 }, { 11, 83, 155 },
+ { 3, 56, 105 }, { 1, 33, 65 }, { 1, 17, 34 }
+ }, { // Band 5
+ { 149, 65, 234 }, { 121, 57, 215 }, { 61, 49, 166 },
+ { 28, 36, 114 }, { 12, 25, 76 }, { 3, 16, 42 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 214, 49, 220 }, { 132, 63, 188 }, { 42, 65, 137 }
+ }, { // Band 1
+ { 85, 137, 221 }, { 104, 131, 216 }, { 49, 111, 192 },
+ { 21, 87, 155 }, { 2, 49, 87 }, { 1, 16, 28 }
+ }, { // Band 2
+ { 89, 163, 230 }, { 90, 137, 220 }, { 29, 100, 183 },
+ { 10, 70, 135 }, { 2, 42, 81 }, { 1, 17, 33 }
+ }, { // Band 3
+ { 108, 167, 237 }, { 55, 133, 222 }, { 15, 97, 179 },
+ { 4, 72, 135 }, { 1, 45, 85 }, { 1, 19, 38 }
+ }, { // Band 4
+ { 124, 146, 240 }, { 66, 124, 224 }, { 17, 88, 175 },
+ { 4, 58, 122 }, { 1, 36, 75 }, { 1, 18, 37 }
+ }, { // Band 5
+ { 141, 79, 241 }, { 126, 70, 227 }, { 66, 58, 182 },
+ { 30, 44, 136 }, { 12, 34, 96 }, { 2, 20, 47 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 229, 99, 249 }, { 143, 111, 235 }, { 46, 109, 192 }
+ }, { // Band 1
+ { 82, 158, 236 }, { 94, 146, 224 }, { 25, 117, 191 },
+ { 9, 87, 149 }, { 3, 56, 99 }, { 1, 33, 57 }
+ }, { // Band 2
+ { 83, 167, 237 }, { 68, 145, 222 }, { 10, 103, 177 },
+ { 2, 72, 131 }, { 1, 41, 79 }, { 1, 20, 39 }
+ }, { // Band 3
+ { 99, 167, 239 }, { 47, 141, 224 }, { 10, 104, 178 },
+ { 2, 73, 133 }, { 1, 44, 85 }, { 1, 22, 47 }
+ }, { // Band 4
+ { 127, 145, 243 }, { 71, 129, 228 }, { 17, 93, 177 },
+ { 3, 61, 124 }, { 1, 41, 84 }, { 1, 21, 52 }
+ }, { // Band 5
+ { 157, 78, 244 }, { 140, 72, 231 }, { 69, 58, 184 },
+ { 31, 44, 137 }, { 14, 38, 105 }, { 8, 23, 61 }
+ }
+ }
+ }
+};
+
+static const av1_coeff_probs_model default_coef_probs_8x8[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 125, 34, 187 }, { 52, 41, 133 }, { 6, 31, 56 }
+ }, { // Band 1
+ { 37, 109, 153 }, { 51, 102, 147 }, { 23, 87, 128 },
+ { 8, 67, 101 }, { 1, 41, 63 }, { 1, 19, 29 }
+ }, { // Band 2
+ { 31, 154, 185 }, { 17, 127, 175 }, { 6, 96, 145 },
+ { 2, 73, 114 }, { 1, 51, 82 }, { 1, 28, 45 }
+ }, { // Band 3
+ { 23, 163, 200 }, { 10, 131, 185 }, { 2, 93, 148 },
+ { 1, 67, 111 }, { 1, 41, 69 }, { 1, 14, 24 }
+ }, { // Band 4
+ { 29, 176, 217 }, { 12, 145, 201 }, { 3, 101, 156 },
+ { 1, 69, 111 }, { 1, 39, 63 }, { 1, 14, 23 }
+ }, { // Band 5
+ { 57, 192, 233 }, { 25, 154, 215 }, { 6, 109, 167 },
+ { 3, 78, 118 }, { 1, 48, 69 }, { 1, 21, 29 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 202, 105, 245 }, { 108, 106, 216 }, { 18, 90, 144 }
+ }, { // Band 1
+ { 33, 172, 219 }, { 64, 149, 206 }, { 14, 117, 177 },
+ { 5, 90, 141 }, { 2, 61, 95 }, { 1, 37, 57 }
+ }, { // Band 2
+ { 33, 179, 220 }, { 11, 140, 198 }, { 1, 89, 148 },
+ { 1, 60, 104 }, { 1, 33, 57 }, { 1, 12, 21 }
+ }, { // Band 3
+ { 30, 181, 221 }, { 8, 141, 198 }, { 1, 87, 145 },
+ { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 20 }
+ }, { // Band 4
+ { 32, 186, 224 }, { 7, 142, 198 }, { 1, 86, 143 },
+ { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 22 }
+ }, { // Band 5
+ { 57, 192, 227 }, { 20, 143, 204 }, { 3, 96, 154 },
+ { 1, 68, 112 }, { 1, 42, 69 }, { 1, 19, 32 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 212, 35, 215 }, { 113, 47, 169 }, { 29, 48, 105 }
+ }, { // Band 1
+ { 74, 129, 203 }, { 106, 120, 203 }, { 49, 107, 178 },
+ { 19, 84, 144 }, { 4, 50, 84 }, { 1, 15, 25 }
+ }, { // Band 2
+ { 71, 172, 217 }, { 44, 141, 209 }, { 15, 102, 173 },
+ { 6, 76, 133 }, { 2, 51, 89 }, { 1, 24, 42 }
+ }, { // Band 3
+ { 64, 185, 231 }, { 31, 148, 216 }, { 8, 103, 175 },
+ { 3, 74, 131 }, { 1, 46, 81 }, { 1, 18, 30 }
+ }, { // Band 4
+ { 65, 196, 235 }, { 25, 157, 221 }, { 5, 105, 174 },
+ { 1, 67, 120 }, { 1, 38, 69 }, { 1, 15, 30 }
+ }, { // Band 5
+ { 65, 204, 238 }, { 30, 156, 224 }, { 7, 107, 177 },
+ { 2, 70, 124 }, { 1, 42, 73 }, { 1, 18, 34 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 225, 86, 251 }, { 144, 104, 235 }, { 42, 99, 181 }
+ }, { // Band 1
+ { 85, 175, 239 }, { 112, 165, 229 }, { 29, 136, 200 },
+ { 12, 103, 162 }, { 6, 77, 123 }, { 2, 53, 84 }
+ }, { // Band 2
+ { 75, 183, 239 }, { 30, 155, 221 }, { 3, 106, 171 },
+ { 1, 74, 128 }, { 1, 44, 76 }, { 1, 17, 28 }
+ }, { // Band 3
+ { 73, 185, 240 }, { 27, 159, 222 }, { 2, 107, 172 },
+ { 1, 75, 127 }, { 1, 42, 73 }, { 1, 17, 29 }
+ }, { // Band 4
+ { 62, 190, 238 }, { 21, 159, 222 }, { 2, 107, 172 },
+ { 1, 72, 122 }, { 1, 40, 71 }, { 1, 18, 32 }
+ }, { // Band 5
+ { 61, 199, 240 }, { 27, 161, 226 }, { 4, 113, 180 },
+ { 1, 76, 129 }, { 1, 46, 80 }, { 1, 23, 41 }
+ }
+ }
+ }
+};
+
+static const av1_coeff_probs_model default_coef_probs_16x16[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 7, 27, 153 }, { 5, 30, 95 }, { 1, 16, 30 }
+ }, { // Band 1
+ { 50, 75, 127 }, { 57, 75, 124 }, { 27, 67, 108 },
+ { 10, 54, 86 }, { 1, 33, 52 }, { 1, 12, 18 }
+ }, { // Band 2
+ { 43, 125, 151 }, { 26, 108, 148 }, { 7, 83, 122 },
+ { 2, 59, 89 }, { 1, 38, 60 }, { 1, 17, 27 }
+ }, { // Band 3
+ { 23, 144, 163 }, { 13, 112, 154 }, { 2, 75, 117 },
+ { 1, 50, 81 }, { 1, 31, 51 }, { 1, 14, 23 }
+ }, { // Band 4
+ { 18, 162, 185 }, { 6, 123, 171 }, { 1, 78, 125 },
+ { 1, 51, 86 }, { 1, 31, 54 }, { 1, 14, 23 }
+ }, { // Band 5
+ { 15, 199, 227 }, { 3, 150, 204 }, { 1, 91, 146 },
+ { 1, 55, 95 }, { 1, 30, 53 }, { 1, 11, 20 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 19, 55, 240 }, { 19, 59, 196 }, { 3, 52, 105 }
+ }, { // Band 1
+ { 41, 166, 207 }, { 104, 153, 199 }, { 31, 123, 181 },
+ { 14, 101, 152 }, { 5, 72, 106 }, { 1, 36, 52 }
+ }, { // Band 2
+ { 35, 176, 211 }, { 12, 131, 190 }, { 2, 88, 144 },
+ { 1, 60, 101 }, { 1, 36, 60 }, { 1, 16, 28 }
+ }, { // Band 3
+ { 28, 183, 213 }, { 8, 134, 191 }, { 1, 86, 142 },
+ { 1, 56, 96 }, { 1, 30, 53 }, { 1, 12, 20 }
+ }, { // Band 4
+ { 20, 190, 215 }, { 4, 135, 192 }, { 1, 84, 139 },
+ { 1, 53, 91 }, { 1, 28, 49 }, { 1, 11, 20 }
+ }, { // Band 5
+ { 13, 196, 216 }, { 2, 137, 192 }, { 1, 86, 143 },
+ { 1, 57, 99 }, { 1, 32, 56 }, { 1, 13, 24 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 211, 29, 217 }, { 96, 47, 156 }, { 22, 43, 87 }
+ }, { // Band 1
+ { 78, 120, 193 }, { 111, 116, 186 }, { 46, 102, 164 },
+ { 15, 80, 128 }, { 2, 49, 76 }, { 1, 18, 28 }
+ }, { // Band 2
+ { 71, 161, 203 }, { 42, 132, 192 }, { 10, 98, 150 },
+ { 3, 69, 109 }, { 1, 44, 70 }, { 1, 18, 29 }
+ }, { // Band 3
+ { 57, 186, 211 }, { 30, 140, 196 }, { 4, 93, 146 },
+ { 1, 62, 102 }, { 1, 38, 65 }, { 1, 16, 27 }
+ }, { // Band 4
+ { 47, 199, 217 }, { 14, 145, 196 }, { 1, 88, 142 },
+ { 1, 57, 98 }, { 1, 36, 62 }, { 1, 15, 26 }
+ }, { // Band 5
+ { 26, 219, 229 }, { 5, 155, 207 }, { 1, 94, 151 },
+ { 1, 60, 104 }, { 1, 36, 62 }, { 1, 16, 28 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 233, 29, 248 }, { 146, 47, 220 }, { 43, 52, 140 }
+ }, { // Band 1
+ { 100, 163, 232 }, { 179, 161, 222 }, { 63, 142, 204 },
+ { 37, 113, 174 }, { 26, 89, 137 }, { 18, 68, 97 }
+ }, { // Band 2
+ { 85, 181, 230 }, { 32, 146, 209 }, { 7, 100, 164 },
+ { 3, 71, 121 }, { 1, 45, 77 }, { 1, 18, 30 }
+ }, { // Band 3
+ { 65, 187, 230 }, { 20, 148, 207 }, { 2, 97, 159 },
+ { 1, 68, 116 }, { 1, 40, 70 }, { 1, 14, 29 }
+ }, { // Band 4
+ { 40, 194, 227 }, { 8, 147, 204 }, { 1, 94, 155 },
+ { 1, 65, 112 }, { 1, 39, 66 }, { 1, 14, 26 }
+ }, { // Band 5
+ { 16, 208, 228 }, { 3, 151, 207 }, { 1, 98, 160 },
+ { 1, 67, 117 }, { 1, 41, 74 }, { 1, 17, 31 }
+ }
+ }
+ }
+};
+
+static const av1_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 17, 38, 140 }, { 7, 34, 80 }, { 1, 17, 29 }
+ }, { // Band 1
+ { 37, 75, 128 }, { 41, 76, 128 }, { 26, 66, 116 },
+ { 12, 52, 94 }, { 2, 32, 55 }, { 1, 10, 16 }
+ }, { // Band 2
+ { 50, 127, 154 }, { 37, 109, 152 }, { 16, 82, 121 },
+ { 5, 59, 85 }, { 1, 35, 54 }, { 1, 13, 20 }
+ }, { // Band 3
+ { 40, 142, 167 }, { 17, 110, 157 }, { 2, 71, 112 },
+ { 1, 44, 72 }, { 1, 27, 45 }, { 1, 11, 17 }
+ }, { // Band 4
+ { 30, 175, 188 }, { 9, 124, 169 }, { 1, 74, 116 },
+ { 1, 48, 78 }, { 1, 30, 49 }, { 1, 11, 18 }
+ }, { // Band 5
+ { 10, 222, 223 }, { 2, 150, 194 }, { 1, 83, 128 },
+ { 1, 48, 79 }, { 1, 27, 45 }, { 1, 11, 17 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 36, 41, 235 }, { 29, 36, 193 }, { 10, 27, 111 }
+ }, { // Band 1
+ { 85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 },
+ { 57, 113, 168 }, { 23, 83, 120 }, { 10, 49, 61 }
+ }, { // Band 2
+ { 85, 190, 223 }, { 36, 139, 200 }, { 5, 90, 146 },
+ { 1, 60, 103 }, { 1, 38, 65 }, { 1, 18, 30 }
+ }, { // Band 3
+ { 72, 202, 223 }, { 23, 141, 199 }, { 2, 86, 140 },
+ { 1, 56, 97 }, { 1, 36, 61 }, { 1, 16, 27 }
+ }, { // Band 4
+ { 55, 218, 225 }, { 13, 145, 200 }, { 1, 86, 141 },
+ { 1, 57, 99 }, { 1, 35, 61 }, { 1, 13, 22 }
+ }, { // Band 5
+ { 15, 235, 212 }, { 1, 132, 184 }, { 1, 84, 139 },
+ { 1, 57, 97 }, { 1, 34, 56 }, { 1, 14, 23 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 181, 21, 201 }, { 61, 37, 123 }, { 10, 38, 71 }
+ }, { // Band 1
+ { 47, 106, 172 }, { 95, 104, 173 }, { 42, 93, 159 },
+ { 18, 77, 131 }, { 4, 50, 81 }, { 1, 17, 23 }
+ }, { // Band 2
+ { 62, 147, 199 }, { 44, 130, 189 }, { 28, 102, 154 },
+ { 18, 75, 115 }, { 2, 44, 65 }, { 1, 12, 19 }
+ }, { // Band 3
+ { 55, 153, 210 }, { 24, 130, 194 }, { 3, 93, 146 },
+ { 1, 61, 97 }, { 1, 31, 50 }, { 1, 10, 16 }
+ }, { // Band 4
+ { 49, 186, 223 }, { 17, 148, 204 }, { 1, 96, 142 },
+ { 1, 53, 83 }, { 1, 26, 44 }, { 1, 11, 17 }
+ }, { // Band 5
+ { 13, 217, 212 }, { 2, 136, 180 }, { 1, 78, 124 },
+ { 1, 50, 83 }, { 1, 29, 49 }, { 1, 14, 23 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 197, 13, 247 }, { 82, 17, 222 }, { 25, 17, 162 }
+ }, { // Band 1
+ { 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 },
+ { 104, 158, 220 }, { 66, 128, 186 }, { 55, 90, 137 }
+ }, { // Band 2
+ { 111, 197, 242 }, { 46, 158, 219 }, { 9, 104, 171 },
+ { 2, 65, 125 }, { 1, 44, 80 }, { 1, 17, 91 }
+ }, { // Band 3
+ { 104, 208, 245 }, { 39, 168, 224 }, { 3, 109, 162 },
+ { 1, 79, 124 }, { 1, 50, 102 }, { 1, 43, 102 }
+ }, { // Band 4
+ { 84, 220, 246 }, { 31, 177, 231 }, { 2, 115, 180 },
+ { 1, 79, 134 }, { 1, 55, 77 }, { 1, 60, 79 }
+ }, { // Band 5
+ { 43, 243, 240 }, { 8, 180, 217 }, { 1, 115, 166 },
+ { 1, 84, 121 }, { 1, 51, 67 }, { 1, 16, 6 }
+ }
+ }
+ }
+};
+#endif // CONFIG_NEW_TOKENSET
+
+#if CONFIG_TX64X64
+// FIXME: Optimize for EC_MULTISYMBOL.
+static const av1_coeff_probs_model default_coef_probs_64x64[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 17, 38, 140 }, { 7, 34, 80 }, { 1, 17, 29 }
+ }, { // Band 1
+ { 37, 75, 128 }, { 41, 76, 128 }, { 26, 66, 116 },
+ { 12, 52, 94 }, { 2, 32, 55 }, { 1, 10, 16 }
+ }, { // Band 2
+ { 50, 127, 154 }, { 37, 109, 152 }, { 16, 82, 121 },
+ { 5, 59, 85 }, { 1, 35, 54 }, { 1, 13, 20 }
+ }, { // Band 3
+ { 40, 142, 167 }, { 17, 110, 157 }, { 2, 71, 112 },
+ { 1, 44, 72 }, { 1, 27, 45 }, { 1, 11, 17 }
+ }, { // Band 4
+ { 30, 175, 188 }, { 9, 124, 169 }, { 1, 74, 116 },
+ { 1, 48, 78 }, { 1, 30, 49 }, { 1, 11, 18 }
+ }, { // Band 5
+ { 10, 222, 223 }, { 2, 150, 194 }, { 1, 83, 128 },
+ { 1, 48, 79 }, { 1, 27, 45 }, { 1, 11, 17 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 36, 41, 235 }, { 29, 36, 193 }, { 10, 27, 111 }
+ }, { // Band 1
+ { 85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 },
+ { 57, 113, 168 }, { 23, 83, 120 }, { 10, 49, 61 }
+ }, { // Band 2
+ { 85, 190, 223 }, { 36, 139, 200 }, { 5, 90, 146 },
+ { 1, 60, 103 }, { 1, 38, 65 }, { 1, 18, 30 }
+ }, { // Band 3
+ { 72, 202, 223 }, { 23, 141, 199 }, { 2, 86, 140 },
+ { 1, 56, 97 }, { 1, 36, 61 }, { 1, 16, 27 }
+ }, { // Band 4
+ { 55, 218, 225 }, { 13, 145, 200 }, { 1, 86, 141 },
+ { 1, 57, 99 }, { 1, 35, 61 }, { 1, 13, 22 }
+ }, { // Band 5
+ { 15, 235, 212 }, { 1, 132, 184 }, { 1, 84, 139 },
+ { 1, 57, 97 }, { 1, 34, 56 }, { 1, 14, 23 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 181, 21, 201 }, { 61, 37, 123 }, { 10, 38, 71 }
+ }, { // Band 1
+ { 47, 106, 172 }, { 95, 104, 173 }, { 42, 93, 159 },
+ { 18, 77, 131 }, { 4, 50, 81 }, { 1, 17, 23 }
+ }, { // Band 2
+ { 62, 147, 199 }, { 44, 130, 189 }, { 28, 102, 154 },
+ { 18, 75, 115 }, { 2, 44, 65 }, { 1, 12, 19 }
+ }, { // Band 3
+ { 55, 153, 210 }, { 24, 130, 194 }, { 3, 93, 146 },
+ { 1, 61, 97 }, { 1, 31, 50 }, { 1, 10, 16 }
+ }, { // Band 4
+ { 49, 186, 223 }, { 17, 148, 204 }, { 1, 96, 142 },
+ { 1, 53, 83 }, { 1, 26, 44 }, { 1, 11, 17 }
+ }, { // Band 5
+ { 13, 217, 212 }, { 2, 136, 180 }, { 1, 78, 124 },
+ { 1, 50, 83 }, { 1, 29, 49 }, { 1, 14, 23 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 197, 13, 247 }, { 82, 17, 222 }, { 25, 17, 162 }
+ }, { // Band 1
+ { 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 },
+ { 104, 158, 220 }, { 66, 128, 186 }, { 55, 90, 137 }
+ }, { // Band 2
+ { 111, 197, 242 }, { 46, 158, 219 }, { 9, 104, 171 },
+ { 2, 65, 125 }, { 1, 44, 80 }, { 1, 17, 91 }
+ }, { // Band 3
+ { 104, 208, 245 }, { 39, 168, 224 }, { 3, 109, 162 },
+ { 1, 79, 124 }, { 1, 50, 102 }, { 1, 43, 102 }
+ }, { // Band 4
+ { 84, 220, 246 }, { 31, 177, 231 }, { 2, 115, 180 },
+ { 1, 79, 134 }, { 1, 55, 77 }, { 1, 60, 79 }
+ }, { // Band 5
+ { 43, 243, 240 }, { 8, 180, 217 }, { 1, 115, 166 },
+ { 1, 84, 121 }, { 1, 51, 67 }, { 1, 16, 6 }
+ }
+ }
+ }
+};
+#endif // CONFIG_TX64X64
+#endif // CONFIG_Q_ADAPT_PROBS
+#if CONFIG_NEW_TOKENSET
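+// Probability, in 1/256 units, that a transform block contains no nonzero
+// coefficients, per [tx size][plane][intra/inter][context].  The three
+// BLOCKZ_CONTEXTS plausibly correspond to 0, 1 or 2 coded neighbouring
+// blocks, matching the band-0 context derivation used elsewhere in this
+// file; note how the values fall as more neighbours are nonzero.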
+static const aom_prob av1_default_blockzero_probs[TX_SIZES][PLANE_TYPES]
+ [REF_TYPES][BLOCKZ_CONTEXTS] = {
+ { // TX_4x4
+ { // Y plane
+ { 195, 84, 8, }, // Intra
+ { 191, 124, 25, }, // Inter
+ },
+ { // UV plane
+ { 214, 132, 42, }, // Intra
+ { 229, 143, 46, }, // Inter
+ },
+ },
+ { // TX_8x8
+ { // Y plane
+ { 125, 52, 6, }, // Intra
+ { 202, 108, 18, }, // Inter
+ },
+ { // UV plane
+ { 212, 113, 29, }, // Intra
+ { 225, 144, 42, }, // Inter
+ },
+ },
+ { // TX_16x16
+ { // Y plane
+ { 7, 5, 1, }, // Intra
+ { 19, 19, 3, }, // Inter
+ },
+ { // UV plane
+ { 211, 96, 22, }, // Intra
+ { 233, 146, 43, }, // Inter
+ },
+ },
+ { // TX_32x32
+ { // Y plane
+ { 17, 7, 1, }, // Intra
+ { 36, 29, 10, }, // Inter
+ },
+ { // UV plane
+ { 181, 61, 10, }, // Intra
+ { 197, 82, 25, }, // Inter
+ },
+ },
+#if CONFIG_TX64X64
+ { // TX_64x64 FIXME: currently the same as 32x32
+ { // Y plane
+ { 17, 7, 1, }, // Intra
+ { 36, 29, 10, }, // Inter
+ },
+ { // UV plane
+ { 181, 61, 10, }, // Intra
+ { 197, 82, 25, }, // Inter
+ },
+ },
+#endif // CONFIG_TX64X64
+};
+
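+// Head-token CDFs for the multi-symbol entropy coder.  Each row is a
+// cumulative distribution in 1/32768 units terminated by AOM_ICDF(32768);
+// e.g. the first Y/intra/band-0 row below assigns the leading symbol a
+// probability of 25024/32768.  Band-0 rows have six entries where the
+// other bands have five, presumably to carry the extra block-zero symbol.
+// (AOM_ICDF itself is expected to store either x or 32768 - x depending
+// on CONFIG_EC_SMALLMUL; see aom_dsp/prob.h.)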
+static const coeff_cdf_model default_coef_head_cdf_4x4[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { AOM_ICDF(25024), AOM_ICDF(25863), AOM_ICDF(27361), AOM_ICDF(29796),
+ AOM_ICDF(30374), AOM_ICDF(32768) },
+ { AOM_ICDF(10816), AOM_ICDF(14127), AOM_ICDF(17116), AOM_ICDF(23516),
+ AOM_ICDF(24999), AOM_ICDF(32768) },
+ { AOM_ICDF(1088), AOM_ICDF(6358), AOM_ICDF(8428), AOM_ICDF(16648),
+ AOM_ICDF(18276), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(14529), AOM_ICDF(18769), AOM_ICDF(29100), AOM_ICDF(29634),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12993), AOM_ICDF(17117), AOM_ICDF(28404), AOM_ICDF(28988),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11201), AOM_ICDF(14084), AOM_ICDF(25818), AOM_ICDF(26504),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9793), AOM_ICDF(11267), AOM_ICDF(21775), AOM_ICDF(22451),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7105), AOM_ICDF(7562), AOM_ICDF(15777), AOM_ICDF(16225),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3905), AOM_ICDF(3966), AOM_ICDF(8359), AOM_ICDF(8526),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(20033), AOM_ICDF(23643), AOM_ICDF(31102), AOM_ICDF(31374),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16321), AOM_ICDF(20350), AOM_ICDF(30167), AOM_ICDF(30546),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12993), AOM_ICDF(15512), AOM_ICDF(26859), AOM_ICDF(27396),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10305), AOM_ICDF(11659), AOM_ICDF(21669), AOM_ICDF(22330),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7361), AOM_ICDF(7819), AOM_ICDF(15450), AOM_ICDF(15940),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3521), AOM_ICDF(3580), AOM_ICDF(7805), AOM_ICDF(7976),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(21057), AOM_ICDF(25460), AOM_ICDF(31740), AOM_ICDF(31952),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16449), AOM_ICDF(21173), AOM_ICDF(30761), AOM_ICDF(31092),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11841), AOM_ICDF(14615), AOM_ICDF(26188), AOM_ICDF(26824),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7745), AOM_ICDF(8991), AOM_ICDF(18937), AOM_ICDF(19707),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4417), AOM_ICDF(4706), AOM_ICDF(10342), AOM_ICDF(10890),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7617), AOM_ICDF(8392), AOM_ICDF(17295), AOM_ICDF(17915),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(20417), AOM_ICDF(26452), AOM_ICDF(32166), AOM_ICDF(32321),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15809), AOM_ICDF(21634), AOM_ICDF(30947), AOM_ICDF(31298),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10049), AOM_ICDF(12176), AOM_ICDF(23495), AOM_ICDF(24229),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5953), AOM_ICDF(6731), AOM_ICDF(16166), AOM_ICDF(16798),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6081), AOM_ICDF(6188), AOM_ICDF(8114), AOM_ICDF(8764),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2113), AOM_ICDF(2291), AOM_ICDF(4448), AOM_ICDF(5527),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(9153), AOM_ICDF(25905), AOM_ICDF(31431), AOM_ICDF(31934),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9025), AOM_ICDF(23345), AOM_ICDF(30033), AOM_ICDF(30965),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5953), AOM_ICDF(13835), AOM_ICDF(22032), AOM_ICDF(24664),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6337), AOM_ICDF(11435), AOM_ICDF(18366), AOM_ICDF(21418),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3137), AOM_ICDF(4871), AOM_ICDF(8519), AOM_ICDF(12426),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(1857), AOM_ICDF(2727), AOM_ICDF(5540), AOM_ICDF(8757),
+ AOM_ICDF(32768) } } },
+    { // Inter
+ { // Band 0
+ { AOM_ICDF(24512), AOM_ICDF(26673), AOM_ICDF(28962), AOM_ICDF(31929),
+ AOM_ICDF(32126), AOM_ICDF(32768) },
+ { AOM_ICDF(15936), AOM_ICDF(21711), AOM_ICDF(25569), AOM_ICDF(30899),
+ AOM_ICDF(31305), AOM_ICDF(32768) },
+ { AOM_ICDF(3264), AOM_ICDF(14756), AOM_ICDF(20107), AOM_ICDF(29407),
+ AOM_ICDF(30032), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(21313), AOM_ICDF(26020), AOM_ICDF(32523), AOM_ICDF(32575),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(18369), AOM_ICDF(24215), AOM_ICDF(32291), AOM_ICDF(32391),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15297), AOM_ICDF(19637), AOM_ICDF(30414), AOM_ICDF(30752),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11713), AOM_ICDF(14040), AOM_ICDF(25408), AOM_ICDF(26033),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9537), AOM_ICDF(10173), AOM_ICDF(18839), AOM_ICDF(19315),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9025), AOM_ICDF(9093), AOM_ICDF(13987), AOM_ICDF(14115),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(22721), AOM_ICDF(27599), AOM_ICDF(32592), AOM_ICDF(32636),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(19009), AOM_ICDF(24676), AOM_ICDF(32258), AOM_ICDF(32367),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12737), AOM_ICDF(16769), AOM_ICDF(28739), AOM_ICDF(29247),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8769), AOM_ICDF(10956), AOM_ICDF(21941), AOM_ICDF(22840),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6721), AOM_ICDF(7678), AOM_ICDF(15319), AOM_ICDF(16290),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4417), AOM_ICDF(4430), AOM_ICDF(4583), AOM_ICDF(5712),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(22849), AOM_ICDF(28333), AOM_ICDF(32633), AOM_ICDF(32671),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(18497), AOM_ICDF(24619), AOM_ICDF(32184), AOM_ICDF(32315),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11841), AOM_ICDF(14640), AOM_ICDF(27251), AOM_ICDF(27752),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8385), AOM_ICDF(10154), AOM_ICDF(18339), AOM_ICDF(19621),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(6977), AOM_ICDF(13787), AOM_ICDF(15289),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(20417), AOM_ICDF(28167), AOM_ICDF(32552), AOM_ICDF(32621),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16833), AOM_ICDF(23968), AOM_ICDF(31991), AOM_ICDF(32174),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10433), AOM_ICDF(13387), AOM_ICDF(26356), AOM_ICDF(26951),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5057), AOM_ICDF(6823), AOM_ICDF(18967), AOM_ICDF(19843),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(6479), AOM_ICDF(11672), AOM_ICDF(13052),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2241), AOM_ICDF(2265), AOM_ICDF(6355), AOM_ICDF(6432),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(12097), AOM_ICDF(28717), AOM_ICDF(32406), AOM_ICDF(32555),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10433), AOM_ICDF(26113), AOM_ICDF(31504), AOM_ICDF(31975),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5825), AOM_ICDF(14284), AOM_ICDF(21349), AOM_ICDF(24461),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4545), AOM_ICDF(8454), AOM_ICDF(12648), AOM_ICDF(17501),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(7173), AOM_ICDF(15272), AOM_ICDF(19322),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377),
+ AOM_ICDF(32768) } } } },
+ { // UV plane
+    { // Intra
+ { // Band 0
+ { AOM_ICDF(27456), AOM_ICDF(28244), AOM_ICDF(31289), AOM_ICDF(32358),
+ AOM_ICDF(32534), AOM_ICDF(32768) },
+ { AOM_ICDF(16960), AOM_ICDF(21207), AOM_ICDF(26511), AOM_ICDF(30539),
+ AOM_ICDF(31190), AOM_ICDF(32768) },
+ { AOM_ICDF(5440), AOM_ICDF(13412), AOM_ICDF(18469), AOM_ICDF(26423),
+ AOM_ICDF(27669), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(17857), AOM_ICDF(26327), AOM_ICDF(31983), AOM_ICDF(32219),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16065), AOM_ICDF(24198), AOM_ICDF(31431), AOM_ICDF(31785),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12865), AOM_ICDF(18011), AOM_ICDF(28454), AOM_ICDF(29166),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9665), AOM_ICDF(12501), AOM_ICDF(24331), AOM_ICDF(25147),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2753), AOM_ICDF(3121), AOM_ICDF(12661), AOM_ICDF(13034),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4033), AOM_ICDF(4140), AOM_ICDF(11834), AOM_ICDF(11977),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(21185), AOM_ICDF(28338), AOM_ICDF(32249), AOM_ICDF(32417),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(18497), AOM_ICDF(25227), AOM_ICDF(31905), AOM_ICDF(32122),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12097), AOM_ICDF(16516), AOM_ICDF(28610), AOM_ICDF(29166),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9281), AOM_ICDF(11157), AOM_ICDF(21438), AOM_ICDF(22312),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(6566), AOM_ICDF(15585), AOM_ICDF(16340),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9409), AOM_ICDF(9659), AOM_ICDF(11827), AOM_ICDF(12911),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(22337), AOM_ICDF(29459), AOM_ICDF(32382), AOM_ICDF(32519),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16961), AOM_ICDF(25262), AOM_ICDF(31874), AOM_ICDF(32123),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12353), AOM_ICDF(17748), AOM_ICDF(29300), AOM_ICDF(29852),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9025), AOM_ICDF(11528), AOM_ICDF(24468), AOM_ICDF(25141),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6209), AOM_ICDF(6565), AOM_ICDF(15806), AOM_ICDF(16121),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2497), AOM_ICDF(2524), AOM_ICDF(7050), AOM_ICDF(7125),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(20417), AOM_ICDF(29779), AOM_ICDF(32552), AOM_ICDF(32636),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15553), AOM_ICDF(26420), AOM_ICDF(32063), AOM_ICDF(32295),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9665), AOM_ICDF(17946), AOM_ICDF(29385), AOM_ICDF(30096),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5569), AOM_ICDF(10207), AOM_ICDF(22410), AOM_ICDF(23836),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2369), AOM_ICDF(2395), AOM_ICDF(6822), AOM_ICDF(6898),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(10177), AOM_ICDF(30567), AOM_ICDF(32725), AOM_ICDF(32745),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9537), AOM_ICDF(28243), AOM_ICDF(32179), AOM_ICDF(32423),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13377), AOM_ICDF(23187), AOM_ICDF(29322), AOM_ICDF(30382),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13121), AOM_ICDF(21346), AOM_ICDF(29507), AOM_ICDF(30326),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4417), AOM_ICDF(4939), AOM_ICDF(15104), AOM_ICDF(15535),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2625), AOM_ICDF(2680), AOM_ICDF(8218), AOM_ICDF(8338),
+ AOM_ICDF(32768) } } },
+ { // Inter
+ { // Band 0
+ { AOM_ICDF(29376), AOM_ICDF(30098), AOM_ICDF(32421), AOM_ICDF(32766),
+ AOM_ICDF(32767), AOM_ICDF(32768) },
+ { AOM_ICDF(18368), AOM_ICDF(22916), AOM_ICDF(30116), AOM_ICDF(32541),
+ AOM_ICDF(32650), AOM_ICDF(32768) },
+ { AOM_ICDF(5952), AOM_ICDF(16505), AOM_ICDF(25955), AOM_ICDF(32163),
+ AOM_ICDF(32365), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(19649), AOM_ICDF(30160), AOM_ICDF(32743), AOM_ICDF(32753),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(18881), AOM_ICDF(28724), AOM_ICDF(32688), AOM_ICDF(32717),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16833), AOM_ICDF(23053), AOM_ICDF(31244), AOM_ICDF(31573),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(14657), AOM_ICDF(17714), AOM_ICDF(26083), AOM_ICDF(26978),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(14657), AOM_ICDF(16618), AOM_ICDF(24597), AOM_ICDF(25403),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4289), AOM_ICDF(4326), AOM_ICDF(10686), AOM_ICDF(10751),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(21953), AOM_ICDF(30956), AOM_ICDF(32748), AOM_ICDF(32757),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(20929), AOM_ICDF(29412), AOM_ICDF(32700), AOM_ICDF(32725),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13377), AOM_ICDF(21495), AOM_ICDF(31216), AOM_ICDF(31569),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9153), AOM_ICDF(15097), AOM_ICDF(28295), AOM_ICDF(28990),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5313), AOM_ICDF(5363), AOM_ICDF(13839), AOM_ICDF(13894),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(20289), AOM_ICDF(31164), AOM_ICDF(32745), AOM_ICDF(32755),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17601), AOM_ICDF(29635), AOM_ICDF(32739), AOM_ICDF(32751),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(18241), AOM_ICDF(24284), AOM_ICDF(32116), AOM_ICDF(32258),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(32705), AOM_ICDF(32706), AOM_ICDF(32739), AOM_ICDF(32740),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(5750), AOM_ICDF(14739), AOM_ICDF(14792),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2881), AOM_ICDF(2913), AOM_ICDF(8427), AOM_ICDF(8498),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(19009), AOM_ICDF(31481), AOM_ICDF(32742), AOM_ICDF(32754),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15809), AOM_ICDF(30521), AOM_ICDF(32736), AOM_ICDF(32750),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16449), AOM_ICDF(32705), AOM_ICDF(32737), AOM_ICDF(32753),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7873), AOM_ICDF(8039), AOM_ICDF(19981), AOM_ICDF(20068),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5313), AOM_ICDF(5366), AOM_ICDF(14376), AOM_ICDF(14430),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(11841), AOM_ICDF(32116), AOM_ICDF(32728), AOM_ICDF(32748),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12353), AOM_ICDF(32132), AOM_ICDF(32729), AOM_ICDF(32748),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7489), AOM_ICDF(12435), AOM_ICDF(25708), AOM_ICDF(26666),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(7486), AOM_ICDF(20238), AOM_ICDF(21009),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4929), AOM_ICDF(5579), AOM_ICDF(16402), AOM_ICDF(16866),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533),
+ AOM_ICDF(32768) } } } }
+};
+static const coeff_cdf_model default_coef_head_cdf_8x8[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { AOM_ICDF(16064), AOM_ICDF(18127), AOM_ICDF(22153), AOM_ICDF(27289),
+ AOM_ICDF(28507), AOM_ICDF(32768) },
+ { AOM_ICDF(6720), AOM_ICDF(10545), AOM_ICDF(13491), AOM_ICDF(20948),
+ AOM_ICDF(22631), AOM_ICDF(32768) },
+ { AOM_ICDF(832), AOM_ICDF(5270), AOM_ICDF(5918), AOM_ICDF(12645),
+ AOM_ICDF(13532), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(14017), AOM_ICDF(16139), AOM_ICDF(26799), AOM_ICDF(27295),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12737), AOM_ICDF(15136), AOM_ICDF(26235), AOM_ICDF(26816),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10817), AOM_ICDF(12445), AOM_ICDF(23637), AOM_ICDF(24217),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8897), AOM_ICDF(9702), AOM_ICDF(20040), AOM_ICDF(20500),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5953), AOM_ICDF(6156), AOM_ICDF(13966), AOM_ICDF(14205),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2497), AOM_ICDF(2519), AOM_ICDF(6222), AOM_ICDF(6300),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(19777), AOM_ICDF(21403), AOM_ICDF(30054), AOM_ICDF(30269),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16193), AOM_ICDF(17913), AOM_ICDF(28593), AOM_ICDF(28883),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12609), AOM_ICDF(13572), AOM_ICDF(25248), AOM_ICDF(25534),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9665), AOM_ICDF(10118), AOM_ICDF(20721), AOM_ICDF(20968),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6849), AOM_ICDF(7028), AOM_ICDF(15202), AOM_ICDF(15391),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3009), AOM_ICDF(3036), AOM_ICDF(7601), AOM_ICDF(7675),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(22593), AOM_ICDF(23915), AOM_ICDF(31159), AOM_ICDF(31283),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17345), AOM_ICDF(18690), AOM_ICDF(29425), AOM_ICDF(29611),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11969), AOM_ICDF(12540), AOM_ICDF(24685), AOM_ICDF(24867),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8129), AOM_ICDF(8355), AOM_ICDF(18668), AOM_ICDF(18819),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4673), AOM_ICDF(4714), AOM_ICDF(11752), AOM_ICDF(11814),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(1857), AOM_ICDF(1876), AOM_ICDF(5057), AOM_ICDF(5138),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(24513), AOM_ICDF(25718), AOM_ICDF(31947), AOM_ICDF(32014),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(18881), AOM_ICDF(20029), AOM_ICDF(30409), AOM_ICDF(30527),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12481), AOM_ICDF(12953), AOM_ICDF(25201), AOM_ICDF(25341),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8385), AOM_ICDF(8528), AOM_ICDF(18815), AOM_ICDF(18910),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4289), AOM_ICDF(4327), AOM_ICDF(10797), AOM_ICDF(10861),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(1857), AOM_ICDF(1872), AOM_ICDF(4332), AOM_ICDF(4415),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(26049), AOM_ICDF(27752), AOM_ICDF(32415), AOM_ICDF(32462),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(20417), AOM_ICDF(22100), AOM_ICDF(31056), AOM_ICDF(31192),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12481), AOM_ICDF(13075), AOM_ICDF(24646), AOM_ICDF(24844),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7489), AOM_ICDF(7696), AOM_ICDF(17117), AOM_ICDF(17285),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3777), AOM_ICDF(3814), AOM_ICDF(10062), AOM_ICDF(10129),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(1473), AOM_ICDF(1486), AOM_ICDF(3735), AOM_ICDF(3820),
+ AOM_ICDF(32768) } } },
+    { // Inter
+ { // Band 0
+ { AOM_ICDF(25920), AOM_ICDF(27743), AOM_ICDF(29455), AOM_ICDF(32147),
+ AOM_ICDF(32280), AOM_ICDF(32768) },
+ { AOM_ICDF(13888), AOM_ICDF(19845), AOM_ICDF(23350), AOM_ICDF(30219),
+ AOM_ICDF(30660), AOM_ICDF(32768) },
+ { AOM_ICDF(2368), AOM_ICDF(12781), AOM_ICDF(16196), AOM_ICDF(27232),
+ AOM_ICDF(27894), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(21697), AOM_ICDF(24758), AOM_ICDF(32358), AOM_ICDF(32417),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(20289), AOM_ICDF(23960), AOM_ICDF(32111), AOM_ICDF(32213),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17345), AOM_ICDF(19966), AOM_ICDF(30630), AOM_ICDF(30841),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(14529), AOM_ICDF(16070), AOM_ICDF(27461), AOM_ICDF(27777),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9793), AOM_ICDF(10613), AOM_ICDF(21146), AOM_ICDF(21566),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6977), AOM_ICDF(7162), AOM_ICDF(15591), AOM_ICDF(15776),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(23617), AOM_ICDF(26783), AOM_ICDF(32572), AOM_ICDF(32607),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(20801), AOM_ICDF(24292), AOM_ICDF(32185), AOM_ICDF(32275),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15169), AOM_ICDF(17905), AOM_ICDF(29916), AOM_ICDF(30181),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10945), AOM_ICDF(12972), AOM_ICDF(25565), AOM_ICDF(26064),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6849), AOM_ICDF(8334), AOM_ICDF(18543), AOM_ICDF(19446),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3649), AOM_ICDF(4346), AOM_ICDF(12351), AOM_ICDF(13169),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(25281), AOM_ICDF(28440), AOM_ICDF(32667), AOM_ICDF(32689),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(22081), AOM_ICDF(25694), AOM_ICDF(32414), AOM_ICDF(32476),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15297), AOM_ICDF(18341), AOM_ICDF(30141), AOM_ICDF(30410),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10305), AOM_ICDF(12381), AOM_ICDF(24477), AOM_ICDF(25084),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(6673), AOM_ICDF(16325), AOM_ICDF(17080),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2369), AOM_ICDF(2393), AOM_ICDF(6466), AOM_ICDF(6543),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(25921), AOM_ICDF(29445), AOM_ICDF(32729), AOM_ICDF(32739),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(22465), AOM_ICDF(26834), AOM_ICDF(32588), AOM_ICDF(32627),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16449), AOM_ICDF(20062), AOM_ICDF(31016), AOM_ICDF(31233),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11073), AOM_ICDF(13165), AOM_ICDF(25353), AOM_ICDF(25896),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11713), AOM_ICDF(13837), AOM_ICDF(20144), AOM_ICDF(21734),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2241), AOM_ICDF(2265), AOM_ICDF(6355), AOM_ICDF(6432),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(26177), AOM_ICDF(29403), AOM_ICDF(32705), AOM_ICDF(32721),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(22337), AOM_ICDF(26344), AOM_ICDF(32545), AOM_ICDF(32589),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(19009), AOM_ICDF(21527), AOM_ICDF(31775), AOM_ICDF(31873),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11585), AOM_ICDF(12685), AOM_ICDF(22632), AOM_ICDF(23137),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8257), AOM_ICDF(8305), AOM_ICDF(16444), AOM_ICDF(16492),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377),
+ AOM_ICDF(32768) } } } },
+ { // UV plane
+    { // Intra
+ { // Band 0
+ { AOM_ICDF(27200), AOM_ICDF(27981), AOM_ICDF(31389), AOM_ICDF(32444),
+ AOM_ICDF(32592), AOM_ICDF(32768) },
+ { AOM_ICDF(14528), AOM_ICDF(19068), AOM_ICDF(24887), AOM_ICDF(29901),
+ AOM_ICDF(30688), AOM_ICDF(32768) },
+ { AOM_ICDF(3776), AOM_ICDF(11778), AOM_ICDF(14700), AOM_ICDF(23745),
+ AOM_ICDF(24854), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(20289), AOM_ICDF(25202), AOM_ICDF(31672), AOM_ICDF(31909),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(18369), AOM_ICDF(23493), AOM_ICDF(31166), AOM_ICDF(31487),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15425), AOM_ICDF(18619), AOM_ICDF(28941), AOM_ICDF(29393),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10945), AOM_ICDF(12535), AOM_ICDF(24287), AOM_ICDF(24792),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6465), AOM_ICDF(6810), AOM_ICDF(15764), AOM_ICDF(16080),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2113), AOM_ICDF(2137), AOM_ICDF(6125), AOM_ICDF(6203),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(23745), AOM_ICDF(27041), AOM_ICDF(31976), AOM_ICDF(32135),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(19521), AOM_ICDF(22766), AOM_ICDF(31139), AOM_ICDF(31367),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(14273), AOM_ICDF(15834), AOM_ICDF(27820), AOM_ICDF(28105),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9537), AOM_ICDF(10445), AOM_ICDF(22106), AOM_ICDF(22491),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7233), AOM_ICDF(7386), AOM_ICDF(15961), AOM_ICDF(16109),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2369), AOM_ICDF(2401), AOM_ICDF(7891), AOM_ICDF(7964),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(26305), AOM_ICDF(28703), AOM_ICDF(32352), AOM_ICDF(32435),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(20673), AOM_ICDF(23490), AOM_ICDF(31517), AOM_ICDF(31680),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(14017), AOM_ICDF(15251), AOM_ICDF(27458), AOM_ICDF(27702),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10945), AOM_ICDF(11374), AOM_ICDF(22496), AOM_ICDF(22687),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9153), AOM_ICDF(9435), AOM_ICDF(22299), AOM_ICDF(22411),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(269), AOM_ICDF(13236), AOM_ICDF(13293),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(27713), AOM_ICDF(29770), AOM_ICDF(32522), AOM_ICDF(32575),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21569), AOM_ICDF(24342), AOM_ICDF(31785), AOM_ICDF(31919),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15297), AOM_ICDF(16497), AOM_ICDF(28367), AOM_ICDF(28569),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17601), AOM_ICDF(17828), AOM_ICDF(24444), AOM_ICDF(24582),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6977), AOM_ICDF(7035), AOM_ICDF(16901), AOM_ICDF(16947),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(384), AOM_ICDF(32706), AOM_ICDF(32707),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(28737), AOM_ICDF(30879), AOM_ICDF(32667), AOM_ICDF(32695),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(22593), AOM_ICDF(26241), AOM_ICDF(32073), AOM_ICDF(32207),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16577), AOM_ICDF(19148), AOM_ICDF(28436), AOM_ICDF(28906),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12993), AOM_ICDF(14005), AOM_ICDF(23151), AOM_ICDF(23630),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7617), AOM_ICDF(9188), AOM_ICDF(22797), AOM_ICDF(23313),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2625), AOM_ICDF(2680), AOM_ICDF(8218), AOM_ICDF(8338),
+ AOM_ICDF(32768) } } },
+ { // Inter
+ { // Band 0
+ { AOM_ICDF(28864), AOM_ICDF(29988), AOM_ICDF(32423), AOM_ICDF(32766),
+ AOM_ICDF(32767), AOM_ICDF(32768) },
+ { AOM_ICDF(18496), AOM_ICDF(24572), AOM_ICDF(30167), AOM_ICDF(32687),
+ AOM_ICDF(32737), AOM_ICDF(32768) },
+ { AOM_ICDF(5440), AOM_ICDF(19618), AOM_ICDF(25332), AOM_ICDF(32393),
+ AOM_ICDF(32491), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(23745), AOM_ICDF(29427), AOM_ICDF(32751), AOM_ICDF(32757),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(23745), AOM_ICDF(28704), AOM_ICDF(32716), AOM_ICDF(32731),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(23105), AOM_ICDF(27943), AOM_ICDF(32524), AOM_ICDF(32587),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21057), AOM_ICDF(24773), AOM_ICDF(29589), AOM_ICDF(30282),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12609), AOM_ICDF(14823), AOM_ICDF(23831), AOM_ICDF(24713),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(25025), AOM_ICDF(30203), AOM_ICDF(32754), AOM_ICDF(32759),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(23617), AOM_ICDF(28361), AOM_ICDF(32715), AOM_ICDF(32729),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17985), AOM_ICDF(21562), AOM_ICDF(31354), AOM_ICDF(31543),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12353), AOM_ICDF(18915), AOM_ICDF(28742), AOM_ICDF(29548),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(289), AOM_ICDF(16545), AOM_ICDF(16593),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(26433), AOM_ICDF(30892), AOM_ICDF(32757), AOM_ICDF(32761),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(24513), AOM_ICDF(29274), AOM_ICDF(32721), AOM_ICDF(32735),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(20161), AOM_ICDF(24040), AOM_ICDF(32055), AOM_ICDF(32171),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21953), AOM_ICDF(24678), AOM_ICDF(27382), AOM_ICDF(28734),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(5750), AOM_ICDF(14739), AOM_ICDF(14792),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2881), AOM_ICDF(2913), AOM_ICDF(8427), AOM_ICDF(8498),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(27457), AOM_ICDF(31485), AOM_ICDF(32759), AOM_ICDF(32763),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(24129), AOM_ICDF(29502), AOM_ICDF(32752), AOM_ICDF(32757),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(19009), AOM_ICDF(25452), AOM_ICDF(32473), AOM_ICDF(32544),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(32705), AOM_ICDF(32706), AOM_ICDF(32737), AOM_ICDF(32738),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5313), AOM_ICDF(5366), AOM_ICDF(14376), AOM_ICDF(14430),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(27841), AOM_ICDF(32288), AOM_ICDF(32759), AOM_ICDF(32764),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(19137), AOM_ICDF(30271), AOM_ICDF(32742), AOM_ICDF(32753),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(18625), AOM_ICDF(27739), AOM_ICDF(29979), AOM_ICDF(31099),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(7486), AOM_ICDF(20238), AOM_ICDF(21009),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4929), AOM_ICDF(5579), AOM_ICDF(16402), AOM_ICDF(16866),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533),
+ AOM_ICDF(32768) } } } }
+};
+static const coeff_cdf_model default_coef_head_cdf_16x16[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { AOM_ICDF(960), AOM_ICDF(4882), AOM_ICDF(9467), AOM_ICDF(17710),
+ AOM_ICDF(20412), AOM_ICDF(32768) },
+ { AOM_ICDF(704), AOM_ICDF(4657), AOM_ICDF(6561), AOM_ICDF(14507),
+ AOM_ICDF(16279), AOM_ICDF(32768) },
+ { AOM_ICDF(192), AOM_ICDF(3443), AOM_ICDF(3759), AOM_ICDF(9011),
+ AOM_ICDF(9685), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(12481), AOM_ICDF(13958), AOM_ICDF(24487), AOM_ICDF(24997),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11457), AOM_ICDF(13075), AOM_ICDF(23820), AOM_ICDF(24406),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9793), AOM_ICDF(11127), AOM_ICDF(21775), AOM_ICDF(22387),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7745), AOM_ICDF(8457), AOM_ICDF(18155), AOM_ICDF(18655),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5441), AOM_ICDF(5668), AOM_ICDF(13180), AOM_ICDF(13467),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2497), AOM_ICDF(2520), AOM_ICDF(6340), AOM_ICDF(6417),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(19521), AOM_ICDF(20572), AOM_ICDF(28965), AOM_ICDF(29177),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15425), AOM_ICDF(16741), AOM_ICDF(27247), AOM_ICDF(27554),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11969), AOM_ICDF(12690), AOM_ICDF(23872), AOM_ICDF(24141),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9281), AOM_ICDF(9678), AOM_ICDF(19970), AOM_ICDF(20207),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6081), AOM_ICDF(6266), AOM_ICDF(14682), AOM_ICDF(14876),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2753), AOM_ICDF(2779), AOM_ICDF(7150), AOM_ICDF(7225),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(22337), AOM_ICDF(23293), AOM_ICDF(30630), AOM_ICDF(30753),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16321), AOM_ICDF(17427), AOM_ICDF(28368), AOM_ICDF(28570),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11457), AOM_ICDF(11907), AOM_ICDF(23570), AOM_ICDF(23741),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7233), AOM_ICDF(7331), AOM_ICDF(17258), AOM_ICDF(17334),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4033), AOM_ICDF(4070), AOM_ICDF(10375), AOM_ICDF(10441),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(1601), AOM_ICDF(1619), AOM_ICDF(4706), AOM_ICDF(4788),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(24769), AOM_ICDF(25536), AOM_ICDF(31660), AOM_ICDF(31722),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(18113), AOM_ICDF(18886), AOM_ICDF(29420), AOM_ICDF(29534),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11201), AOM_ICDF(11412), AOM_ICDF(23207), AOM_ICDF(23291),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6977), AOM_ICDF(7033), AOM_ICDF(16599), AOM_ICDF(16646),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4033), AOM_ICDF(4070), AOM_ICDF(10375), AOM_ICDF(10441),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(1601), AOM_ICDF(1620), AOM_ICDF(4827), AOM_ICDF(4909),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(28353), AOM_ICDF(28831), AOM_ICDF(32502), AOM_ICDF(32517),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21441), AOM_ICDF(21869), AOM_ICDF(30977), AOM_ICDF(31017),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11969), AOM_ICDF(12088), AOM_ICDF(24116), AOM_ICDF(24158),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7489), AOM_ICDF(7547), AOM_ICDF(17413), AOM_ICDF(17458),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4545), AOM_ICDF(4585), AOM_ICDF(11325), AOM_ICDF(11388),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2113), AOM_ICDF(2133), AOM_ICDF(5526), AOM_ICDF(5606),
+ AOM_ICDF(32768) } } },
+    { // Inter
+ { // Band 0
+ { AOM_ICDF(2496), AOM_ICDF(8717), AOM_ICDF(17280), AOM_ICDF(28922),
+ AOM_ICDF(29751), AOM_ICDF(32768) },
+ { AOM_ICDF(2496), AOM_ICDF(9665), AOM_ICDF(15235), AOM_ICDF(26542),
+ AOM_ICDF(27580), AOM_ICDF(32768) },
+ { AOM_ICDF(448), AOM_ICDF(9240), AOM_ICDF(11886), AOM_ICDF(24124),
+ AOM_ICDF(24898), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(21057), AOM_ICDF(22896), AOM_ICDF(31877), AOM_ICDF(31953),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(20673), AOM_ICDF(23151), AOM_ICDF(31706), AOM_ICDF(31825),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(18753), AOM_ICDF(20519), AOM_ICDF(30497), AOM_ICDF(30668),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15425), AOM_ICDF(16608), AOM_ICDF(27789), AOM_ICDF(28027),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10305), AOM_ICDF(10977), AOM_ICDF(21405), AOM_ICDF(21749),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3649), AOM_ICDF(3812), AOM_ICDF(11213), AOM_ICDF(11445),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(24001), AOM_ICDF(25899), AOM_ICDF(32307), AOM_ICDF(32360),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(20929), AOM_ICDF(22941), AOM_ICDF(31775), AOM_ICDF(31867),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15169), AOM_ICDF(16734), AOM_ICDF(29228), AOM_ICDF(29425),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10561), AOM_ICDF(12047), AOM_ICDF(24918), AOM_ICDF(25324),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6977), AOM_ICDF(7929), AOM_ICDF(18311), AOM_ICDF(18918),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3649), AOM_ICDF(3760), AOM_ICDF(9962), AOM_ICDF(10162),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(25793), AOM_ICDF(27526), AOM_ICDF(32565), AOM_ICDF(32591),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21825), AOM_ICDF(23885), AOM_ICDF(32064), AOM_ICDF(32135),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15041), AOM_ICDF(16286), AOM_ICDF(29203), AOM_ICDF(29360),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10433), AOM_ICDF(11058), AOM_ICDF(24349), AOM_ICDF(24538),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5569), AOM_ICDF(6016), AOM_ICDF(16460), AOM_ICDF(16794),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(26433), AOM_ICDF(28398), AOM_ICDF(32682), AOM_ICDF(32696),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(22977), AOM_ICDF(25086), AOM_ICDF(32367), AOM_ICDF(32412),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16577), AOM_ICDF(17928), AOM_ICDF(30144), AOM_ICDF(30275),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12481), AOM_ICDF(13352), AOM_ICDF(25993), AOM_ICDF(26211),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7745), AOM_ICDF(8069), AOM_ICDF(20501), AOM_ICDF(20657),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(27841), AOM_ICDF(29700), AOM_ICDF(32721), AOM_ICDF(32730),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(23873), AOM_ICDF(26202), AOM_ICDF(32578), AOM_ICDF(32604),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17729), AOM_ICDF(19046), AOM_ICDF(30448), AOM_ICDF(30568),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13505), AOM_ICDF(14508), AOM_ICDF(26034), AOM_ICDF(26304),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10049), AOM_ICDF(10494), AOM_ICDF(19945), AOM_ICDF(20233),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377),
+ AOM_ICDF(32768) } } } },
+ { // UV plane
+    { // Intra
+ { // Band 0
+ { AOM_ICDF(27072), AOM_ICDF(27916), AOM_ICDF(31095), AOM_ICDF(32400),
+ AOM_ICDF(32553), AOM_ICDF(32768) },
+ { AOM_ICDF(12352), AOM_ICDF(16792), AOM_ICDF(22516), AOM_ICDF(28853),
+ AOM_ICDF(29797), AOM_ICDF(32768) },
+ { AOM_ICDF(2880), AOM_ICDF(9023), AOM_ICDF(11126), AOM_ICDF(20602),
+ AOM_ICDF(21713), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(20161), AOM_ICDF(24785), AOM_ICDF(31070), AOM_ICDF(31430),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17985), AOM_ICDF(22773), AOM_ICDF(30430), AOM_ICDF(30880),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15937), AOM_ICDF(18802), AOM_ICDF(28265), AOM_ICDF(28788),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11841), AOM_ICDF(13587), AOM_ICDF(24798), AOM_ICDF(25335),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8769), AOM_ICDF(9160), AOM_ICDF(19316), AOM_ICDF(19566),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5313), AOM_ICDF(5357), AOM_ICDF(12874), AOM_ICDF(12932),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(24129), AOM_ICDF(26501), AOM_ICDF(31672), AOM_ICDF(31844),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(19649), AOM_ICDF(21553), AOM_ICDF(30130), AOM_ICDF(30370),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11713), AOM_ICDF(13134), AOM_ICDF(25983), AOM_ICDF(26321),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9409), AOM_ICDF(9948), AOM_ICDF(21408), AOM_ICDF(21663),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5569), AOM_ICDF(5757), AOM_ICDF(14335), AOM_ICDF(14533),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2241), AOM_ICDF(2305), AOM_ICDF(13152), AOM_ICDF(13209),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(26817), AOM_ICDF(28135), AOM_ICDF(32130), AOM_ICDF(32209),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(20161), AOM_ICDF(21412), AOM_ICDF(30331), AOM_ICDF(30481),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13377), AOM_ICDF(13798), AOM_ICDF(26065), AOM_ICDF(26176),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8129), AOM_ICDF(8290), AOM_ICDF(19920), AOM_ICDF(20008),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(5751), AOM_ICDF(14950), AOM_ICDF(15002),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5569), AOM_ICDF(5601), AOM_ICDF(11041), AOM_ICDF(11105),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(28225), AOM_ICDF(29079), AOM_ICDF(32387), AOM_ICDF(32426),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21185), AOM_ICDF(22046), AOM_ICDF(30982), AOM_ICDF(31061),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13377), AOM_ICDF(13595), AOM_ICDF(25762), AOM_ICDF(25824),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8001), AOM_ICDF(8123), AOM_ICDF(20530), AOM_ICDF(20590),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4289), AOM_ICDF(4322), AOM_ICDF(9907), AOM_ICDF(9974),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3393), AOM_ICDF(3412), AOM_ICDF(6663), AOM_ICDF(6739),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(30529), AOM_ICDF(31014), AOM_ICDF(32651), AOM_ICDF(32664),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(23489), AOM_ICDF(24268), AOM_ICDF(31627), AOM_ICDF(31682),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(14017), AOM_ICDF(14239), AOM_ICDF(26653), AOM_ICDF(26707),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11201), AOM_ICDF(11317), AOM_ICDF(23122), AOM_ICDF(23169),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6721), AOM_ICDF(6768), AOM_ICDF(14810), AOM_ICDF(14863),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6593), AOM_ICDF(6632), AOM_ICDF(13188), AOM_ICDF(13245),
+ AOM_ICDF(32768) } } },
+ { // Inter
+ { // Band 0
+ { AOM_ICDF(29888), AOM_ICDF(30492), AOM_ICDF(32500), AOM_ICDF(32766),
+ AOM_ICDF(32767), AOM_ICDF(32768) },
+ { AOM_ICDF(18752), AOM_ICDF(23235), AOM_ICDF(29846), AOM_ICDF(32214),
+ AOM_ICDF(32442), AOM_ICDF(32768) },
+ { AOM_ICDF(5568), AOM_ICDF(17762), AOM_ICDF(25039), AOM_ICDF(31213),
+ AOM_ICDF(31651), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(26433), AOM_ICDF(29681), AOM_ICDF(32757), AOM_ICDF(32760),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(24769), AOM_ICDF(28761), AOM_ICDF(32722), AOM_ICDF(32734),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(22209), AOM_ICDF(26975), AOM_ICDF(32418), AOM_ICDF(32500),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16321), AOM_ICDF(21333), AOM_ICDF(28368), AOM_ICDF(29283),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12865), AOM_ICDF(14775), AOM_ICDF(22545), AOM_ICDF(23553),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12353), AOM_ICDF(12354), AOM_ICDF(12473), AOM_ICDF(12532),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(27457), AOM_ICDF(30005), AOM_ICDF(32738), AOM_ICDF(32745),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(24897), AOM_ICDF(27541), AOM_ICDF(32723), AOM_ICDF(32731),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15297), AOM_ICDF(19106), AOM_ICDF(30414), AOM_ICDF(30711),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6593), AOM_ICDF(8826), AOM_ICDF(19732), AOM_ICDF(20840),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4161), AOM_ICDF(4233), AOM_ICDF(16509), AOM_ICDF(16557),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(28609), AOM_ICDF(30482), AOM_ICDF(32761), AOM_ICDF(32763),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(25665), AOM_ICDF(27830), AOM_ICDF(32727), AOM_ICDF(32733),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21057), AOM_ICDF(23803), AOM_ICDF(30367), AOM_ICDF(30721),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10945), AOM_ICDF(21878), AOM_ICDF(32726), AOM_ICDF(32737),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(5750), AOM_ICDF(14739), AOM_ICDF(14792),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2881), AOM_ICDF(2913), AOM_ICDF(8427), AOM_ICDF(8498),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(28993), AOM_ICDF(30944), AOM_ICDF(32762), AOM_ICDF(32764),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(26561), AOM_ICDF(28695), AOM_ICDF(32733), AOM_ICDF(32739),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17985), AOM_ICDF(19028), AOM_ICDF(31008), AOM_ICDF(31079),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7873), AOM_ICDF(8039), AOM_ICDF(19981), AOM_ICDF(20068),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5313), AOM_ICDF(5366), AOM_ICDF(14376), AOM_ICDF(14430),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(30273), AOM_ICDF(32029), AOM_ICDF(32764), AOM_ICDF(32766),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(28609), AOM_ICDF(30847), AOM_ICDF(32745), AOM_ICDF(32751),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21313), AOM_ICDF(24377), AOM_ICDF(31986), AOM_ICDF(32098),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(32705), AOM_ICDF(32709), AOM_ICDF(32739), AOM_ICDF(32741),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4929), AOM_ICDF(5579), AOM_ICDF(16402), AOM_ICDF(16866),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533),
+ AOM_ICDF(32768) } } } }
+};
+static const coeff_cdf_model default_coef_head_cdf_32x32[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { AOM_ICDF(2240), AOM_ICDF(5407), AOM_ICDF(18304), AOM_ICDF(25601),
+ AOM_ICDF(27911), AOM_ICDF(32768) },
+ { AOM_ICDF(960), AOM_ICDF(4633), AOM_ICDF(8197), AOM_ICDF(16254),
+ AOM_ICDF(18796), AOM_ICDF(32768) },
+ { AOM_ICDF(192), AOM_ICDF(3061), AOM_ICDF(3557), AOM_ICDF(8701),
+ AOM_ICDF(9762), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(11969), AOM_ICDF(15846), AOM_ICDF(25660), AOM_ICDF(26667),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11713), AOM_ICDF(15794), AOM_ICDF(25737), AOM_ICDF(26760),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9281), AOM_ICDF(12675), AOM_ICDF(23181), AOM_ICDF(24351),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7105), AOM_ICDF(8757), AOM_ICDF(18383), AOM_ICDF(19437),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4289), AOM_ICDF(4579), AOM_ICDF(11353), AOM_ICDF(11792),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(1857), AOM_ICDF(1874), AOM_ICDF(4695), AOM_ICDF(4777),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(20929), AOM_ICDF(22297), AOM_ICDF(29370), AOM_ICDF(29646),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17473), AOM_ICDF(18985), AOM_ICDF(28079), AOM_ICDF(28413),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13121), AOM_ICDF(14064), AOM_ICDF(24902), AOM_ICDF(25217),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9793), AOM_ICDF(10214), AOM_ICDF(20069), AOM_ICDF(20329),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5825), AOM_ICDF(5987), AOM_ICDF(13350), AOM_ICDF(13559),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2241), AOM_ICDF(2260), AOM_ICDF(5520), AOM_ICDF(5600),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(25921), AOM_ICDF(26891), AOM_ICDF(31632), AOM_ICDF(31729),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(18241), AOM_ICDF(19463), AOM_ICDF(29222), AOM_ICDF(29419),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11585), AOM_ICDF(12065), AOM_ICDF(23294), AOM_ICDF(23488),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6593), AOM_ICDF(6686), AOM_ICDF(16153), AOM_ICDF(16234),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3137), AOM_ICDF(3170), AOM_ICDF(8751), AOM_ICDF(8821),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(1345), AOM_ICDF(1359), AOM_ICDF(3739), AOM_ICDF(3824),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(27713), AOM_ICDF(28504), AOM_ICDF(32068), AOM_ICDF(32132),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(19265), AOM_ICDF(20354), AOM_ICDF(29789), AOM_ICDF(29943),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11201), AOM_ICDF(11538), AOM_ICDF(22701), AOM_ICDF(22848),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6337), AOM_ICDF(6424), AOM_ICDF(15268), AOM_ICDF(15353),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3649), AOM_ICDF(3681), AOM_ICDF(9052), AOM_ICDF(9121),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(1601), AOM_ICDF(1618), AOM_ICDF(4584), AOM_ICDF(4667),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(30913), AOM_ICDF(31044), AOM_ICDF(32635), AOM_ICDF(32640),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(22081), AOM_ICDF(22261), AOM_ICDF(30452), AOM_ICDF(30477),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10561), AOM_ICDF(10625), AOM_ICDF(21535), AOM_ICDF(21568),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6081), AOM_ICDF(6130), AOM_ICDF(14369), AOM_ICDF(14423),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3777), AOM_ICDF(3809), AOM_ICDF(9156), AOM_ICDF(9225),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(1857), AOM_ICDF(1875), AOM_ICDF(4936), AOM_ICDF(5018),
+ AOM_ICDF(32768) } } },
+    { // Inter
+ { // Band 0
+ { AOM_ICDF(4672), AOM_ICDF(6927), AOM_ICDF(23534), AOM_ICDF(29846),
+ AOM_ICDF(30928), AOM_ICDF(32768) },
+ { AOM_ICDF(3776), AOM_ICDF(6784), AOM_ICDF(18075), AOM_ICDF(25863),
+ AOM_ICDF(27926), AOM_ICDF(32768) },
+ { AOM_ICDF(1344), AOM_ICDF(5588), AOM_ICDF(12166), AOM_ICDF(20966),
+ AOM_ICDF(23504), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(19393), AOM_ICDF(22016), AOM_ICDF(31280), AOM_ICDF(31444),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21185), AOM_ICDF(24329), AOM_ICDF(31706), AOM_ICDF(31865),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(20673), AOM_ICDF(23240), AOM_ICDF(31186), AOM_ICDF(31379),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17857), AOM_ICDF(20035), AOM_ICDF(29594), AOM_ICDF(29889),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13633), AOM_ICDF(14929), AOM_ICDF(24883), AOM_ICDF(25337),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7873), AOM_ICDF(8416), AOM_ICDF(17452), AOM_ICDF(17886),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(25665), AOM_ICDF(27145), AOM_ICDF(32256), AOM_ICDF(32314),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21057), AOM_ICDF(22826), AOM_ICDF(31465), AOM_ICDF(31576),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13633), AOM_ICDF(14885), AOM_ICDF(27873), AOM_ICDF(28088),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8769), AOM_ICDF(9515), AOM_ICDF(21941), AOM_ICDF(22248),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(6209), AOM_ICDF(6594), AOM_ICDF(15598), AOM_ICDF(15950),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(1985), AOM_ICDF(2014), AOM_ICDF(6855), AOM_ICDF(6931),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(26817), AOM_ICDF(27824), AOM_ICDF(32362), AOM_ICDF(32399),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21185), AOM_ICDF(22321), AOM_ICDF(31389), AOM_ICDF(31466),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13761), AOM_ICDF(14154), AOM_ICDF(27163), AOM_ICDF(27245),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8897), AOM_ICDF(9011), AOM_ICDF(20600), AOM_ICDF(20659),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4673), AOM_ICDF(4774), AOM_ICDF(15044), AOM_ICDF(15131),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(28865), AOM_ICDF(29687), AOM_ICDF(32655), AOM_ICDF(32667),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(23233), AOM_ICDF(24218), AOM_ICDF(32080), AOM_ICDF(32118),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15041), AOM_ICDF(15444), AOM_ICDF(28787), AOM_ICDF(28845),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9921), AOM_ICDF(10248), AOM_ICDF(22818), AOM_ICDF(22944),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7745), AOM_ICDF(7866), AOM_ICDF(16591), AOM_ICDF(16702),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(31169), AOM_ICDF(31559), AOM_ICDF(32741), AOM_ICDF(32744),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(24769), AOM_ICDF(25583), AOM_ICDF(32347), AOM_ICDF(32370),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15937), AOM_ICDF(16169), AOM_ICDF(29120), AOM_ICDF(29152),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7489), AOM_ICDF(7578), AOM_ICDF(22647), AOM_ICDF(22677),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7617), AOM_ICDF(7689), AOM_ICDF(19849), AOM_ICDF(19887),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377),
+ AOM_ICDF(32768) } } } },
+ { // UV plane
+    { // Intra
+ { // Band 0
+ { AOM_ICDF(23232), AOM_ICDF(24301), AOM_ICDF(30231), AOM_ICDF(31582),
+ AOM_ICDF(32091), AOM_ICDF(32768) },
+ { AOM_ICDF(7872), AOM_ICDF(11041), AOM_ICDF(22542), AOM_ICDF(27086),
+ AOM_ICDF(29145), AOM_ICDF(32768) },
+ { AOM_ICDF(1344), AOM_ICDF(3989), AOM_ICDF(18125), AOM_ICDF(25340),
+ AOM_ICDF(27820), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(15937), AOM_ICDF(29000), AOM_ICDF(32210), AOM_ICDF(32434),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(12353), AOM_ICDF(26626), AOM_ICDF(31533), AOM_ICDF(31993),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11457), AOM_ICDF(29187), AOM_ICDF(30896), AOM_ICDF(31750),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(21278), AOM_ICDF(28169), AOM_ICDF(29764),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(7489), AOM_ICDF(8855), AOM_ICDF(13365), AOM_ICDF(15620),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4289), AOM_ICDF(4833), AOM_ICDF(8572), AOM_ICDF(10108),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(25025), AOM_ICDF(30783), AOM_ICDF(32603), AOM_ICDF(32666),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(24385), AOM_ICDF(29586), AOM_ICDF(31803), AOM_ICDF(32142),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(22337), AOM_ICDF(23002), AOM_ICDF(27573), AOM_ICDF(27903),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10945), AOM_ICDF(12336), AOM_ICDF(21900), AOM_ICDF(22590),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(8257), AOM_ICDF(8830), AOM_ICDF(19986), AOM_ICDF(20298),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10945), AOM_ICDF(10990), AOM_ICDF(18660), AOM_ICDF(18701),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(29761), AOM_ICDF(31473), AOM_ICDF(32693), AOM_ICDF(32715),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(20417), AOM_ICDF(24512), AOM_ICDF(31394), AOM_ICDF(31650),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(11713), AOM_ICDF(13283), AOM_ICDF(25819), AOM_ICDF(26206),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13121), AOM_ICDF(14099), AOM_ICDF(21909), AOM_ICDF(22514),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(248), AOM_ICDF(9546), AOM_ICDF(9614),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2497), AOM_ICDF(2524), AOM_ICDF(7050), AOM_ICDF(7125),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(30657), AOM_ICDF(31885), AOM_ICDF(32691), AOM_ICDF(32715),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(19393), AOM_ICDF(26050), AOM_ICDF(31698), AOM_ICDF(31988),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(15809), AOM_ICDF(15863), AOM_ICDF(24985), AOM_ICDF(25008),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(23489), AOM_ICDF(28138), AOM_ICDF(32751), AOM_ICDF(32756),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2369), AOM_ICDF(2395), AOM_ICDF(6822), AOM_ICDF(6898),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(32705), AOM_ICDF(32744), AOM_ICDF(32766), AOM_ICDF(32767),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21953), AOM_ICDF(24962), AOM_ICDF(32156), AOM_ICDF(32246),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(13121), AOM_ICDF(15358), AOM_ICDF(26284), AOM_ICDF(26835),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(7417), AOM_ICDF(20132), AOM_ICDF(20885),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4417), AOM_ICDF(4939), AOM_ICDF(15104), AOM_ICDF(15535),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2625), AOM_ICDF(2680), AOM_ICDF(8218), AOM_ICDF(8338),
+ AOM_ICDF(32768) } } },
+ { // Inter
+ { // Band 0
+ { AOM_ICDF(25280), AOM_ICDF(25678), AOM_ICDF(32446), AOM_ICDF(32622),
+ AOM_ICDF(32724), AOM_ICDF(32768) },
+ { AOM_ICDF(10560), AOM_ICDF(11822), AOM_ICDF(28682), AOM_ICDF(29919),
+ AOM_ICDF(31276), AOM_ICDF(32768) },
+ { AOM_ICDF(3264), AOM_ICDF(5170), AOM_ICDF(21779), AOM_ICDF(24026),
+ AOM_ICDF(27905), AOM_ICDF(32768) } },
+ { // Band 1
+ {AOM_ICDF(24257), AOM_ICDF(30554), AOM_ICDF(32719), AOM_ICDF(32738),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17217), AOM_ICDF(27413), AOM_ICDF(32617), AOM_ICDF(32667),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(22977), AOM_ICDF(27600), AOM_ICDF(32482), AOM_ICDF(32552),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16833), AOM_ICDF(24360), AOM_ICDF(30746), AOM_ICDF(31293),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(17089), AOM_ICDF(20060), AOM_ICDF(28880), AOM_ICDF(29370),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(10945), AOM_ICDF(11009), AOM_ICDF(21900), AOM_ICDF(21932),
+ AOM_ICDF(32768) } },
+ { // Band 2
+ {AOM_ICDF(27201), AOM_ICDF(30217), AOM_ICDF(32736), AOM_ICDF(32745),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(22721), AOM_ICDF(27676), AOM_ICDF(32749), AOM_ICDF(32754),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5057), AOM_ICDF(12431), AOM_ICDF(25246), AOM_ICDF(26620),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(321), AOM_ICDF(22016), AOM_ICDF(22048),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5313), AOM_ICDF(5363), AOM_ICDF(13839), AOM_ICDF(13894),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351),
+ AOM_ICDF(32768) } },
+ { // Band 3
+ {AOM_ICDF(27713), AOM_ICDF(30739), AOM_ICDF(32759), AOM_ICDF(32762),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(26177), AOM_ICDF(30430), AOM_ICDF(32756), AOM_ICDF(32760),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(384), AOM_ICDF(32706), AOM_ICDF(32707),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(9409), AOM_ICDF(9528), AOM_ICDF(21591), AOM_ICDF(21646),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2881), AOM_ICDF(2913), AOM_ICDF(8427), AOM_ICDF(8498),
+ AOM_ICDF(32768) } },
+ { // Band 4
+ {AOM_ICDF(28993), AOM_ICDF(31156), AOM_ICDF(32747), AOM_ICDF(32753),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(25153), AOM_ICDF(28701), AOM_ICDF(32754), AOM_ICDF(32758),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(16449), AOM_ICDF(16544), AOM_ICDF(32737), AOM_ICDF(32738),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(321), AOM_ICDF(22016), AOM_ICDF(22048),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979),
+ AOM_ICDF(32768) } },
+ { // Band 5
+ {AOM_ICDF(30785), AOM_ICDF(32088), AOM_ICDF(32765), AOM_ICDF(32766),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(22977), AOM_ICDF(26623), AOM_ICDF(32750), AOM_ICDF(32754),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(21953), AOM_ICDF(21954), AOM_ICDF(22017), AOM_ICDF(22049),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(5697), AOM_ICDF(7486), AOM_ICDF(20238), AOM_ICDF(21009),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(4929), AOM_ICDF(5579), AOM_ICDF(16402), AOM_ICDF(16866),
+ AOM_ICDF(32768) },
+ {AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533),
+ AOM_ICDF(32768) } } } }
+};
+#endif // CONFIG_NEW_TOKENSET
+
+/* clang-format on */
+
+static void extend_to_full_distribution(aom_prob *probs, aom_prob p) {
+ assert(p != 0);
+ memcpy(probs, av1_pareto8_full[p - 1], MODEL_NODES * sizeof(aom_prob));
+}
+
+void av1_model_to_full_probs(const aom_prob *model, aom_prob *full) {
+ if (full != model)
+ memcpy(full, model, sizeof(aom_prob) * UNCONSTRAINED_NODES);
+ extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
+}
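+
+/* Illustrative sketch (hypothetical values; excluded from the build): the
+   three model entries are the EOB, zero and pivot node probabilities; the
+   remaining MODEL_NODES entries come straight from the av1_pareto8_full row
+   selected by the pivot probability. */
+#if 0
+static void example_model_to_full(void) {
+  const aom_prob model[UNCONSTRAINED_NODES] = { 128, 160, 96 };
+  aom_prob full[ENTROPY_NODES];
+  av1_model_to_full_probs(model, full);
+  /* full[0..2] == model[0..2], and full[3..10] == av1_pareto8_full[96 - 1] */
+}
+#endif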
+
+#if CONFIG_NEW_TOKENSET
+
+static void build_tail_cdfs(aom_cdf_prob cdf_tail[CDF_SIZE(ENTROPY_TOKENS)],
+ aom_cdf_prob cdf_head[CDF_SIZE(ENTROPY_TOKENS)],
+ int band_zero) {
+ int probNZ, prob1, prob_idx, i;
+ int phead[HEAD_TOKENS + 1], sum;
+ const int is_dc = !!band_zero;
+ aom_cdf_prob prev_cdf;
+ prev_cdf = 0;
+ for (i = 0; i < HEAD_TOKENS + is_dc; ++i) {
+ phead[i] = AOM_ICDF(cdf_head[i]) - prev_cdf;
+ prev_cdf = AOM_ICDF(cdf_head[i]);
+ }
+ // Do the tail
+ probNZ = CDF_PROB_TOP - phead[ZERO_TOKEN + is_dc] - (is_dc ? phead[0] : 0);
+ prob1 = phead[is_dc + ONE_TOKEN_EOB] + phead[is_dc + ONE_TOKEN_NEOB];
+ prob_idx =
+ AOMMIN(COEFF_PROB_MODELS - 1, AOMMAX(0, ((256 * prob1) / probNZ) - 1));
+
+ sum = 0;
+ for (i = 0; i < TAIL_TOKENS; ++i) {
+ sum += av1_pareto8_tail_probs[prob_idx][i];
+ cdf_tail[i] = AOM_ICDF(sum);
+ }
+}
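+
+/* Worked example for the pivot index above (hypothetical numbers): with
+   probNZ = 16384 and prob1 = 8192, P(ONE | non-zero) is 1/2, so
+   prob_idx = (256 * 8192) / 16384 - 1 = 127 and the tail CDF is built from
+   the partial sums of av1_pareto8_tail_probs[127]. */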
+
+static void build_head_cdfs(const aom_prob *pdf_model,
+ const aom_prob *blockz_model,
+ aom_cdf_prob cdf_head[ENTROPY_TOKENS + 1]) {
+ int i, p, p1, p2, phead[6], prob_NZ, prob_EOB_1, prob_EOB_2p, prob_NEOB_1,
+ prob_NEOB_2p;
+ int prob8_blocknz;
+  // This is the first coefficient position (band 0), so the CDF is
+  // extended with a block-zero symbol
+ const int is_dc = blockz_model != NULL;
+ const int last_head_val = HEAD_TOKENS - 1 + is_dc;
+
+ assert(pdf_model != NULL);
+ assert(pdf_model[2] != 0);
+
+ /* FIXME: maintain true CDF counts. */
+
+  /* Values are 0=BLOCK_ZERO, 1=ZERO_TOKEN, 2=ONE_TOKEN_EOB,
+     3=ONE_TOKEN_NEOB, 4=TWO_TOKEN_PLUS_EOB, 5=TWO_TOKEN_PLUS_NEOB
+   */
+ // Block zero probability
+ if (is_dc) {
+ phead[0] =
+ ((*blockz_model) << (CDF_PROB_BITS - 8)) + (1 << (CDF_PROB_BITS - 9));
+ phead[0] = AOMMIN(CDF_PROB_TOP - (HEAD_TOKENS + 1), AOMMAX(1, phead[0]));
+ }
+
+ // Will scale the remaining probabilities by the probability of the block
+ // being non-zero
+ prob8_blocknz = is_dc ? (256 - *blockz_model) : 256;
+
+ // Probability of zero
+ phead[is_dc + ZERO_TOKEN] =
+ (pdf_model[1] << (CDF_PROB_BITS - 8)) + (1 << (CDF_PROB_BITS - 9));
+
+ // Will scale the non-zero values
+ prob_NZ = CDF_PROB_TOP - phead[is_dc + ZERO_TOKEN];
+
+  // Will scale the EOBs by the probability of an EOB_TOKEN ..
+ prob_EOB_1 =
+ (pdf_model[0] << (CDF_PROB_BITS - 8)) + (1 << (CDF_PROB_BITS - 9));
+ // .. use a lower probability of EOB for larger values
+ prob_EOB_2p = prob_EOB_1 / 2;
+
+ prob_NEOB_1 = CDF_PROB_TOP - prob_EOB_1;
+ prob_NEOB_2p = CDF_PROB_TOP - prob_EOB_2p;
+ if (prob_NZ == 0 || prob_NZ == CDF_PROB_TOP) abort();
+ if (prob_EOB_1 == 0 || prob_EOB_1 == CDF_PROB_TOP) abort();
+ if (prob_EOB_2p == 0 || prob_EOB_2p == CDF_PROB_TOP) abort();
+
+ // ONE_CONTEXT_NODE prob
+ p = (pdf_model[2] << (CDF_PROB_BITS - 8)) + (1 << (CDF_PROB_BITS - 9));
+ // Scale by the non-zero factor to get the probability of token = 1
+ p1 = ROUND_POWER_OF_TWO(prob_NZ * p, 15);
+
+ // Scale by the EOB factors
+ phead[is_dc + ONE_TOKEN_EOB] = ROUND_POWER_OF_TWO(p1 * prob_EOB_1, 15);
+ phead[is_dc + ONE_TOKEN_NEOB] = ROUND_POWER_OF_TWO(p1 * prob_NEOB_1, 15);
+
+ // Probability token is 2 or more
+ p2 = CDF_PROB_TOP - p1 - phead[is_dc + ZERO_TOKEN];
+
+ phead[is_dc + TWO_TOKEN_PLUS_EOB] = ROUND_POWER_OF_TWO(p2 * prob_EOB_2p, 15);
+ phead[is_dc + TWO_TOKEN_PLUS_NEOB] =
+ ROUND_POWER_OF_TWO(p2 * prob_NEOB_2p, 15);
+
+  // Now use the block non-zero prob to scale the values
+ for (i = is_dc; i < last_head_val; ++i) {
+ phead[i] = (prob8_blocknz * phead[i] + 128) >> 8;
+ }
+
+ for (i = 0; i < last_head_val; ++i) {
+ int c0;
+ c0 = i > 0 ? AOM_ICDF(cdf_head[i - 1]) : 0;
+ p = AOMMAX(1, AOMMIN(CDF_PROB_TOP - (last_head_val - i) - c0, phead[i]));
+ cdf_head[i] = AOM_ICDF(c0 + p);
+ }
+ cdf_head[last_head_val] = AOM_ICDF(CDF_PROB_TOP);
+}
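+
+/* Head alphabet produced above, for reference (band 0 prepends BLOCK_ZERO):
+   [BLOCK_ZERO,] ZERO, ONE+EOB, ONE+no-EOB, TWO_PLUS+EOB, TWO_PLUS+no-EOB.
+   The final loop clamps each probability to at least 1 (leaving room for
+   the symbols still to come) and re-forms the running sum, so the CDF is
+   strictly increasing and terminates at CDF_PROB_TOP. */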
+
+static void av1_default_coef_cdfs(FRAME_CONTEXT *fc) {
+ int i, j, k, l;
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+#if CONFIG_CB4X4
+ av1_copy(fc->coef_head_cdfs[TX_2X2][i][j][k][l],
+ default_coef_head_cdf_4x4[i][j][k][l]);
+#endif
+ av1_copy(fc->coef_head_cdfs[TX_4X4][i][j][k][l],
+ default_coef_head_cdf_4x4[i][j][k][l]);
+ av1_copy(fc->coef_head_cdfs[TX_8X8][i][j][k][l],
+ default_coef_head_cdf_8x8[i][j][k][l]);
+ av1_copy(fc->coef_head_cdfs[TX_16X16][i][j][k][l],
+ default_coef_head_cdf_16x16[i][j][k][l]);
+ av1_copy(fc->coef_head_cdfs[TX_32X32][i][j][k][l],
+ default_coef_head_cdf_32x32[i][j][k][l]);
+#if CONFIG_TX64X64
+ av1_copy(fc->coef_head_cdfs[TX_64X64][i][j][k][l],
+ default_coef_head_cdf_32x32[i][j][k][l]);
+#endif
+ }
+}
+
+void av1_coef_head_cdfs(FRAME_CONTEXT *fc) {
+ TX_SIZE t;
+ int i, j, k, l;
+ for (t = 0; t < TX_SIZES; ++t)
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ build_head_cdfs(fc->coef_probs[t][i][j][k][l],
+ k == 0 ? &fc->blockzero_probs[t][i][j][l] : NULL,
+ fc->coef_head_cdfs[t][i][j][k][l]);
+ }
+}
+
+#elif CONFIG_EC_MULTISYMBOL
+static void build_token_cdfs(const aom_prob *pdf_model,
+ aom_cdf_prob cdf[ENTROPY_TOKENS + 1]) {
+ int i, sum = 0;
+ assert(pdf_model[2] != 0);
+ for (i = 0; i < ENTROPY_TOKENS - 2; ++i) {
+ sum += av1_pareto8_token_probs[pdf_model[2] - 1][i];
+ cdf[i] = AOM_ICDF(sum);
+ }
+}
+#endif // CONFIG_NEW_TOKENSET
+
+#if CONFIG_EC_MULTISYMBOL
+void av1_coef_pareto_cdfs(FRAME_CONTEXT *fc) {
+ /* Build the tail based on a Pareto distribution */
+ TX_SIZE t;
+ int i, j, k, l;
+ for (t = 0; t < TX_SIZES; ++t)
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
+#if CONFIG_NEW_TOKENSET
+ build_tail_cdfs(fc->coef_tail_cdfs[t][i][j][k][l],
+ fc->coef_head_cdfs[t][i][j][k][l], k == 0);
+#else
+ build_token_cdfs(fc->coef_probs[t][i][j][k][l],
+ fc->coef_cdfs[t][i][j][k][l]);
+#endif
+}
+#endif
+
+void av1_default_coef_probs(AV1_COMMON *cm) {
+#if CONFIG_Q_ADAPT_PROBS
+ const int index = AOMMIN(
+ ROUND_POWER_OF_TWO(cm->base_qindex, 8 - QCTX_BIN_BITS), QCTX_BINS - 1);
+ av1_copy(cm->fc->coef_probs, default_qctx_coef_probs[index]);
+#else
+#if CONFIG_CB4X4
+ av1_copy(cm->fc->coef_probs[TX_2X2], default_coef_probs_4x4);
+#endif
+ av1_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4);
+ av1_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8);
+ av1_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16);
+ av1_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32);
+#if CONFIG_TX64X64
+ av1_copy(cm->fc->coef_probs[TX_64X64], default_coef_probs_64x64);
+#endif // CONFIG_TX64X64
+#endif // CONFIG_Q_ADAPT_PROBS
+#if CONFIG_NEW_TOKENSET
+  av1_copy(cm->fc->blockzero_probs, av1_default_blockzero_probs);
+  /* Load the default head-token CDFs */
+  av1_default_coef_cdfs(cm->fc);
+#endif
+#if CONFIG_EC_MULTISYMBOL
+ av1_coef_pareto_cdfs(cm->fc);
+#endif // CONFIG_EC_MULTISYMBOL
+}
+
+#if !CONFIG_LV_MAP
+static void adapt_coef_probs(AV1_COMMON *cm, TX_SIZE tx_size,
+ unsigned int count_sat,
+ unsigned int update_factor) {
+ const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ av1_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size];
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ const av1_coeff_probs_model *const pre_probs =
+ cm->partial_prob_update
+ ? (const av1_coeff_probs_model *)cm->starting_coef_probs[tx_size]
+ : pre_fc->coef_probs[tx_size];
+#else
+ const av1_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size];
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+ const av1_coeff_count_model *const counts =
+ (const av1_coeff_count_model *)cm->counts.coef[tx_size];
+ const unsigned int(*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
+ (const unsigned int(*)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS])
+ cm->counts.eob_branch[tx_size];
+#if CONFIG_NEW_TOKENSET
+ const av1_blockz_probs_model *const pre_blockz_probs =
+ pre_fc->blockzero_probs[tx_size];
+ av1_blockz_probs_model *const blockz_probs = cm->fc->blockzero_probs[tx_size];
+ const av1_blockz_count_model *const blockz_counts =
+ (const av1_blockz_count_model *)&cm->counts.blockz_count[tx_size][0];
+#endif
+ int i, j, k, l, m;
+#if CONFIG_RECT_TX
+ assert(!is_rect_tx(tx_size));
+#endif // CONFIG_RECT_TX
+
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ const int n0 = counts[i][j][k][l][ZERO_TOKEN];
+ const int n1 = counts[i][j][k][l][ONE_TOKEN];
+ const int n2 = counts[i][j][k][l][TWO_TOKEN];
+ const int neob = counts[i][j][k][l][EOB_MODEL_TOKEN];
+ const unsigned int branch_ct[UNCONSTRAINED_NODES][2] = {
+ { neob, eob_counts[i][j][k][l] - neob }, { n0, n1 + n2 }, { n1, n2 }
+ };
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ probs[i][j][k][l][m] =
+ av1_merge_probs(pre_probs[i][j][k][l][m], branch_ct[m],
+ count_sat, update_factor);
+ }
+
+#if CONFIG_NEW_TOKENSET
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < BLOCKZ_CONTEXTS; ++k) {
+ const int n0 = blockz_counts[i][j][k][0];
+ const int n1 = blockz_counts[i][j][k][1];
+ const unsigned int branch_ct[2] = { n0, n1 };
+ blockz_probs[i][j][k] = av1_merge_probs(
+ pre_blockz_probs[i][j][k], branch_ct, count_sat, update_factor);
+ }
+ }
+ }
+#endif
+}
+#endif // !CONFIG_LV_MAP
+
+void av1_adapt_coef_probs(AV1_COMMON *cm) {
+ unsigned int count_sat, update_factor;
+
+ if (!frame_is_intra_only(cm) && cm->last_frame_type == KEY_FRAME) {
+ update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */
+ count_sat = COEF_COUNT_SAT_AFTER_KEY;
+ } else {
+ update_factor = COEF_MAX_UPDATE_FACTOR;
+ count_sat = COEF_COUNT_SAT;
+ }
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ if (cm->partial_prob_update == 1) update_factor = COEF_MAX_UPDATE_FACTOR;
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+#if CONFIG_LV_MAP
+ av1_adapt_txb_probs(cm, count_sat, update_factor);
+#else
+ TX_SIZE tx_size;
+ for (tx_size = 0; tx_size < TX_SIZES; tx_size++)
+ adapt_coef_probs(cm, tx_size, count_sat, update_factor);
+#endif
+}
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+void av1_partial_adapt_probs(AV1_COMMON *cm, int mi_row, int mi_col) {
+ (void)mi_row;
+ (void)mi_col;
+
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ cm->partial_prob_update = 1;
+ av1_adapt_coef_probs(cm);
+ }
+}
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+#if CONFIG_EC_ADAPT
+static void av1_average_cdf(aom_cdf_prob *cdf_ptr[], aom_cdf_prob *fc_cdf_ptr,
+ int cdf_size, const int num_tiles) {
+ int i;
+ for (i = 0; i < cdf_size;) {
+ do {
+ int sum = 0;
+ int j;
+ assert(i < cdf_size);
+ for (j = 0; j < num_tiles; ++j) sum += AOM_ICDF(cdf_ptr[j][i]);
+ fc_cdf_ptr[i] = AOM_ICDF(sum / num_tiles);
+ } while (fc_cdf_ptr[i++] != AOM_ICDF(CDF_PROB_TOP));
+ // Zero symbol counts for the next frame
+ assert(i < cdf_size);
+ fc_cdf_ptr[i++] = 0;
+ // Skip trailing zeros until the start of the next CDF.
+ for (; i < cdf_size && fc_cdf_ptr[i] == 0; ++i) {
+ }
+ }
+}
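+
+/* Illustrative sketch (hypothetical values; excluded from the build): two
+   tiles coding a 3-symbol alphabet are averaged element-wise, and the
+   trailing counter slot is zeroed for the next frame. */
+#if 0
+static void example_average_cdf(void) {
+  aom_cdf_prob t0[4] = { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768),
+                         0 };
+  aom_cdf_prob t1[4] = { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(32768),
+                         0 };
+  aom_cdf_prob *tiles[2] = { t0, t1 };
+  aom_cdf_prob avg[4];
+  av1_average_cdf(tiles, avg, 4, 2);
+  /* avg == { AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(32768), 0 } */
+}
+#endif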
+
+#define AVERAGE_TILE_CDFS(cname) \
+ for (i = 0; i < num_tiles; ++i) \
+ cdf_ptr[i] = (aom_cdf_prob *)&ec_ctxs[i]->cname; \
+ fc_cdf_ptr = (aom_cdf_prob *)&fc->cname; \
+ cdf_size = (int)sizeof(fc->cname) / sizeof(aom_cdf_prob); \
+ av1_average_cdf(cdf_ptr, fc_cdf_ptr, cdf_size, num_tiles);
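+
+// The cast-and-sizeof idiom above flattens an arbitrarily nested CDF member
+// into a single aom_cdf_prob span; av1_average_cdf then recovers the
+// individual CDF boundaries by scanning for the AOM_ICDF(CDF_PROB_TOP)
+// terminators.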
+
+void av1_average_tile_coef_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[],
+ aom_cdf_prob *cdf_ptr[], const int num_tiles) {
+ int i, cdf_size;
+
+ aom_cdf_prob *fc_cdf_ptr;
+
+#if CONFIG_NEW_TOKENSET
+ AVERAGE_TILE_CDFS(coef_head_cdfs)
+ AVERAGE_TILE_CDFS(coef_tail_cdfs)
+#else
+ AVERAGE_TILE_CDFS(coef_cdfs)
+#endif
+}
+
+void av1_average_tile_mv_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[],
+ aom_cdf_prob *cdf_ptr[], const int num_tiles) {
+ int i, k, cdf_size;
+
+ aom_cdf_prob *fc_cdf_ptr;
+
+#if CONFIG_REF_MV
+ int j;
+ for (j = 0; j < NMV_CONTEXTS; ++j) {
+ AVERAGE_TILE_CDFS(nmvc[j].joint_cdf)
+
+ for (k = 0; k < 2; ++k) {
+ AVERAGE_TILE_CDFS(nmvc[j].comps[k].class_cdf);
+ AVERAGE_TILE_CDFS(nmvc[j].comps[k].class0_fp_cdf);
+ AVERAGE_TILE_CDFS(nmvc[j].comps[k].fp_cdf);
+ }
+ }
+#else
+ AVERAGE_TILE_CDFS(nmvc.joint_cdf)
+
+ for (k = 0; k < 2; ++k) {
+ AVERAGE_TILE_CDFS(nmvc.comps[k].class_cdf)
+ AVERAGE_TILE_CDFS(nmvc.comps[k].class0_fp_cdf)
+ AVERAGE_TILE_CDFS(nmvc.comps[k].fp_cdf)
+ }
+#endif
+}
+
+void av1_average_tile_intra_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[],
+ aom_cdf_prob *cdf_ptr[], const int num_tiles) {
+ int i, cdf_size;
+
+ aom_cdf_prob *fc_cdf_ptr;
+
+ AVERAGE_TILE_CDFS(tx_size_cdf);
+
+#if CONFIG_VAR_TX
+// FIXME: txfm_partition probs
+#endif
+
+ // FIXME: skip probs
+
+ AVERAGE_TILE_CDFS(intra_ext_tx_cdf)
+ AVERAGE_TILE_CDFS(inter_ext_tx_cdf);
+
+ AVERAGE_TILE_CDFS(seg.tree_cdf)
+ AVERAGE_TILE_CDFS(uv_mode_cdf)
+
+ AVERAGE_TILE_CDFS(partition_cdf)
+
+#if CONFIG_DELTA_Q
+ AVERAGE_TILE_CDFS(delta_q_cdf)
+#if CONFIG_EXT_DELTA_Q
+ AVERAGE_TILE_CDFS(delta_lf_cdf)
+#endif
+#endif
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ AVERAGE_TILE_CDFS(intra_filter_cdf)
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+#if CONFIG_FILTER_INTRA
+#endif // CONFIG_FILTER_INTRA
+}
+
+void av1_average_tile_inter_cdfs(AV1_COMMON *cm, FRAME_CONTEXT *fc,
+ FRAME_CONTEXT *ec_ctxs[],
+ aom_cdf_prob *cdf_ptr[], const int num_tiles) {
+ int i, cdf_size;
+
+ aom_cdf_prob *fc_cdf_ptr;
+
+// FIXME: comp_inter_cdf not defined
+
+// FIXME: comp_ref_cdf and comp_bwd_ref not defined
+
+// FIXME: single_ref_cdf not defined
+
+#if CONFIG_REF_MV
+// FIXME: cdfs not defined for newmv_mode, zeromv_mode, drl_mode, new2mv_mode
+#else
+ AVERAGE_TILE_CDFS(inter_mode_cdf)
+#endif
+
+ // FIXME: cdfs not defined for motion_mode_prob, obmc_prob
+
+ // FIXME: cdfs not defined for super_tx
+
+ // FIXME: CONFIG_EXT_INTER cdfs not defined for inter_compound_mode,
+ // interintra_mode etc
+
+ AVERAGE_TILE_CDFS(y_mode_cdf)
+
+ if (cm->interp_filter == SWITCHABLE) {
+ AVERAGE_TILE_CDFS(switchable_interp_cdf)
+ }
+}
+
+#if CONFIG_PVQ
+// Averaging PVQ's expected values for symbol coding
+static void av1_average_pvq_ex(int *cxt_ptr[], int *fc_cxt_ptr, int cxt_size,
+ const int num_tiles) {
+ int i, j;
+ for (i = 0; i < cxt_size; ++i) {
+ int sum = 0;
+ for (j = 0; j < num_tiles; ++j) sum += cxt_ptr[j][i];
+ fc_cxt_ptr[i] = sum / num_tiles;
+ }
+}
+
+#define AVERAGE_TILE_PVQ_EX(cname) \
+ for (i = 0; i < num_tiles; ++i) cxt_ptr[i] = (int *)&ec_ctxs[i]->cname; \
+ fc_cxt_ptr = (int *)&fc->cname; \
+ cxt_size = (int)sizeof(fc->cname) / sizeof(int); \
+ av1_average_pvq_ex(cxt_ptr, fc_cxt_ptr, cxt_size, num_tiles);
+
+void av1_default_pvq_probs(AV1_COMMON *cm) {
+ od_adapt_ctx *adapt = &cm->fc->pvq_context;
+
+ // Init with flat probabilities.
+ od_adapt_ctx_reset(adapt, 0);
+
+  // TODO(yushin): Prepare offline cdf and context tables for PVQ,
+  // i.e. od_adapt_ctx, then load them from a table,
+  // for example od_adapt_ctx default_pvq_context.
+  // Then do something like this:
+ // av1_copy(cm->fc->pvq_context, default_pvq_context);
+}
+
+void av1_average_tile_pvq_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[],
+ const int num_tiles) {
+ int i, j, cdf_size, cxt_size;
+
+ aom_cdf_prob *cdf_ptr[MAX_TILE_ROWS * MAX_TILE_COLS];
+ aom_cdf_prob *fc_cdf_ptr;
+ int *cxt_ptr[MAX_TILE_ROWS * MAX_TILE_COLS];
+ int *fc_cxt_ptr;
+
+ AVERAGE_TILE_PVQ_EX(pvq_context.ex_dc)
+ AVERAGE_TILE_PVQ_EX(pvq_context.ex_g)
+
+ for (j = 0; j < OD_NPLANES_MAX; j++) {
+ AVERAGE_TILE_CDFS(pvq_context.model_dc[j].cdf)
+ }
+
+ AVERAGE_TILE_CDFS(pvq_context.skip_cdf)
+
+ AVERAGE_TILE_PVQ_EX(pvq_context.pvq.pvq_codeword_ctx.pvq_adapt)
+ AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_codeword_ctx.pvq_k1_cdf)
+ AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_codeword_ctx.pvq_split_cdf)
+
+ for (j = 0; j < 3; j++) {
+ AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_param_model[j].cdf)
+ }
+
+ AVERAGE_TILE_PVQ_EX(pvq_context.pvq.pvq_ext)
+ AVERAGE_TILE_PVQ_EX(pvq_context.pvq.pvq_exg)
+ AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_gaintheta_cdf)
+ AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_skip_dir_cdf)
+}
+#endif // CONFIG_PVQ
+#endif // CONFIG_EC_ADAPT
diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h
new file mode 100644
index 000000000..b02d41bff
--- /dev/null
+++ b/third_party/aom/av1/common/entropy.h
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_ENTROPY_H_
+#define AV1_COMMON_ENTROPY_H_
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/prob.h"
+
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DIFF_UPDATE_PROB 252
+#define GROUP_DIFF_UPDATE_PROB 252
+
+#if CONFIG_Q_ADAPT_PROBS
+#define QCTX_BIN_BITS 2
+#define QCTX_BINS (1 << QCTX_BIN_BITS)
+#endif // CONFIG_Q_ADAPT_PROBS
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+#define COEF_PROBS_BUFS 16
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+// Coefficient token alphabet
+#define ZERO_TOKEN 0 // 0 Extra Bits 0+0
+#define ONE_TOKEN 1 // 1 Extra Bits 0+1
+#define TWO_TOKEN 2 // 2 Extra Bits 0+1
+#define THREE_TOKEN 3 // 3 Extra Bits 0+1
+#define FOUR_TOKEN 4 // 4 Extra Bits 0+1
+#define CATEGORY1_TOKEN 5 // 5-6 Extra Bits 1+1
+#define CATEGORY2_TOKEN 6 // 7-10 Extra Bits 2+1
+#define CATEGORY3_TOKEN 7 // 11-18 Extra Bits 3+1
+#define CATEGORY4_TOKEN 8 // 19-34 Extra Bits 4+1
+#define CATEGORY5_TOKEN 9 // 35-66 Extra Bits 5+1
+#define CATEGORY6_TOKEN 10 // 67+ Extra Bits 14+1
+#define EOB_TOKEN 11 // EOB Extra Bits 0+0
+#if CONFIG_NEW_TOKENSET
+#define NO_EOB 0 // Not an end-of-block
+#define EARLY_EOB 1 // End of block before the last position
+#define LAST_EOB 2 // End of block in the last position (implicit)
+#define BLOCK_Z_TOKEN 255 // block zero
+#define HEAD_TOKENS 5
+#define TAIL_TOKENS 9
+#define ONE_TOKEN_EOB 1
+#define ONE_TOKEN_NEOB 2
+#define TWO_TOKEN_PLUS_EOB 3
+#define TWO_TOKEN_PLUS_NEOB 4
+#endif
+#define ENTROPY_TOKENS 12
+
+#define ENTROPY_NODES 11
+
+#if CONFIG_LV_MAP
+#define TXB_SKIP_CONTEXTS 13
+#define SIG_COEF_CONTEXTS 20
+#define EOB_COEF_CONTEXTS 25
+#define COEFF_BASE_CONTEXTS 42
+#define DC_SIGN_CONTEXTS 3
+
+#define BR_TMP_OFFSET 12
+#define BR_REF_CAT 4
+#define LEVEL_CONTEXTS (BR_TMP_OFFSET * BR_REF_CAT)
+
+#define NUM_BASE_LEVELS 2
+#define COEFF_BASE_RANGE (15 - NUM_BASE_LEVELS)
+
+#define COEFF_CONTEXT_BITS 6
+#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
+#endif
+
+DECLARE_ALIGNED(16, extern const uint8_t, av1_pt_energy_class[ENTROPY_TOKENS]);
+
+#define CAT1_MIN_VAL 5
+#define CAT2_MIN_VAL 7
+#define CAT3_MIN_VAL 11
+#define CAT4_MIN_VAL 19
+#define CAT5_MIN_VAL 35
+#define CAT6_MIN_VAL 67
+
+// Extra bit probabilities.
+DECLARE_ALIGNED(16, extern const uint8_t, av1_cat1_prob[1]);
+DECLARE_ALIGNED(16, extern const uint8_t, av1_cat2_prob[2]);
+DECLARE_ALIGNED(16, extern const uint8_t, av1_cat3_prob[3]);
+DECLARE_ALIGNED(16, extern const uint8_t, av1_cat4_prob[4]);
+DECLARE_ALIGNED(16, extern const uint8_t, av1_cat5_prob[5]);
+DECLARE_ALIGNED(16, extern const uint8_t, av1_cat6_prob[18]);
+#if CONFIG_NEW_MULTISYMBOL
+extern const aom_cdf_prob *av1_cat1_cdf[];
+extern const aom_cdf_prob *av1_cat2_cdf[];
+extern const aom_cdf_prob *av1_cat3_cdf[];
+extern const aom_cdf_prob *av1_cat4_cdf[];
+extern const aom_cdf_prob *av1_cat5_cdf[];
+extern const aom_cdf_prob *av1_cat6_cdf[];
+#endif
+
+#define EOB_MODEL_TOKEN 3
+
+typedef struct {
+#if CONFIG_NEW_MULTISYMBOL
+ const aom_cdf_prob **cdf;
+#else
+ const aom_prob *prob;
+#endif
+ int len;
+ int base_val;
+ const int16_t *cost;
+} av1_extra_bit;
+
+// indexed by token value
+extern const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS];
+
+static INLINE int av1_get_cat6_extrabits_size(TX_SIZE tx_size,
+ aom_bit_depth_t bit_depth) {
+ tx_size = txsize_sqr_up_map[tx_size];
+#if CONFIG_TX64X64
+ // TODO(debargha): Does TX_64X64 require an additional extrabit?
+ if (tx_size > TX_32X32) tx_size = TX_32X32;
+#endif
+#if CONFIG_CB4X4
+ int tx_offset = (tx_size < TX_4X4) ? 0 : (int)(tx_size - TX_4X4);
+#else
+ int tx_offset = (int)(tx_size - TX_4X4);
+#endif
+ int bits = (int)bit_depth + 3 + tx_offset;
+#if CONFIG_NEW_MULTISYMBOL
+ // Round up
+ bits = AOMMIN((int)sizeof(av1_cat6_prob), ((bits + 3) & ~3));
+#endif
+ assert(bits <= (int)sizeof(av1_cat6_prob));
+ return bits;
+}
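+
+// Worked example: with 8-bit input and TX_16X16, tx_offset is 2, so
+// bits = 8 + 3 + 2 = 13; under CONFIG_NEW_MULTISYMBOL this is rounded up to
+// the next multiple of four (16), still within av1_cat6_prob's 18 entries.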
+
+#define DCT_MAX_VALUE 16384
+#if CONFIG_HIGHBITDEPTH
+#define DCT_MAX_VALUE_HIGH10 65536
+#define DCT_MAX_VALUE_HIGH12 262144
+#endif // CONFIG_HIGHBITDEPTH
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+#define REF_TYPES 2 // intra=0, inter=1
+
+/* Middle dimension reflects the coefficient position within the transform. */
+#define COEF_BANDS 6
+
+/* The inside dimension is a measure of nearby complexity, reflecting how
+   many nearby coefficients are nonzero. For the first coefficient (DC,
+   unless block type is 0), we look at the (already encoded) blocks above
+   and to the left of the current block. The context index is then the
+   number (0, 1, or 2) of these blocks having nonzero coefficients.
+   After decoding a coefficient, the measure is determined by the size of
+   the most recently decoded coefficient.
+   Note that the intuitive meaning of this measure changes as coefficients
+   are decoded: prior to the first token, a zero means the neighboring
+   blocks are empty, while after the first token, because of the use of
+   end-of-block, a zero means we just decoded a zero and hence guarantees
+   that a nonzero coefficient will appear later in this block. This shift
+   in meaning is harmless because the context also depends on the
+   coefficient band (and zigzag positions 0, 1, and 2 are in distinct
+   bands). */
+
+#define COEFF_CONTEXTS 6
+#if CONFIG_EC_MULTISYMBOL
+#define BLOCKZ_CONTEXTS 3
+#endif
+#define COEFF_CONTEXTS0 3 // for band 0
+#define BAND_COEFF_CONTEXTS(band) \
+ ((band) == 0 ? COEFF_CONTEXTS0 : COEFF_CONTEXTS)
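+
+// For example, each (plane type, ref type) pair therefore has
+// COEFF_CONTEXTS0 + (COEF_BANDS - 1) * COEFF_CONTEXTS = 3 + 5 * 6 = 33
+// coefficient contexts per transform size.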
+
+// #define ENTROPY_STATS
+
+typedef unsigned int av1_coeff_count[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+ [ENTROPY_TOKENS];
+typedef unsigned int av1_coeff_stats[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+ [ENTROPY_NODES][2];
+
+#define SUBEXP_PARAM 4 /* Subexponential code parameter */
+#define MODULUS_PARAM 13 /* Modulus parameter */
+
+struct AV1Common;
+struct frame_contexts;
+void av1_default_coef_probs(struct AV1Common *cm);
+void av1_adapt_coef_probs(struct AV1Common *cm);
+#if CONFIG_EC_ADAPT
+void av1_adapt_coef_cdfs(struct AV1Common *cm, struct frame_contexts *pre_fc);
+#endif
+#if CONFIG_SUBFRAME_PROB_UPDATE
+void av1_partial_adapt_probs(struct AV1Common *cm, int mi_row, int mi_col);
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+// This is the index in the scan order beyond which all coefficients for
+// 8x8 transform and above are in the top band.
+// This macro is currently unused but may be used by certain implementations
+#define MAXBAND_INDEX 21
+
+DECLARE_ALIGNED(16, extern const uint8_t,
+ av1_coefband_trans_8x8plus[MAX_TX_SQUARE]);
+DECLARE_ALIGNED(16, extern const uint8_t, av1_coefband_trans_4x8_8x4[32]);
+DECLARE_ALIGNED(16, extern const uint8_t, av1_coefband_trans_4x4[16]);
+
+DECLARE_ALIGNED(16, extern const uint16_t, band_count_table[TX_SIZES_ALL][8]);
+DECLARE_ALIGNED(16, extern const uint16_t,
+ band_cum_count_table[TX_SIZES_ALL][8]);
+
+static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
+ switch (tx_size) {
+ case TX_4X4: return av1_coefband_trans_4x4;
+ case TX_8X4:
+ case TX_4X8: return av1_coefband_trans_4x8_8x4;
+ default: return av1_coefband_trans_8x8plus;
+ }
+}
+
+// COEFF_PROB_MODELS (255) probability lists are stored, one for each
+// possible pivot-node probability 1, 2, 3, ..., 255; the list for pivot
+// probability p lives at av1_pareto8_full[p - 1].
+
+#define COEFF_PROB_MODELS 255
+
+#define UNCONSTRAINED_NODES 3
+
+#define PIVOT_NODE 2 // which node is pivot
+
+#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
+#define TAIL_NODES (MODEL_NODES + 1)
+extern const aom_tree_index av1_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
+extern const aom_prob av1_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
+
+typedef aom_prob av1_coeff_probs_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+ [UNCONSTRAINED_NODES];
+
+typedef unsigned int av1_coeff_count_model[REF_TYPES][COEF_BANDS]
+ [COEFF_CONTEXTS]
+ [UNCONSTRAINED_NODES + 1];
+
+void av1_model_to_full_probs(const aom_prob *model, aom_prob *full);
+
+#if CONFIG_EC_MULTISYMBOL
+typedef aom_cdf_prob coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+ [CDF_SIZE(ENTROPY_TOKENS)];
+typedef aom_prob av1_blockz_probs_model[REF_TYPES][BLOCKZ_CONTEXTS];
+typedef unsigned int av1_blockz_count_model[REF_TYPES][BLOCKZ_CONTEXTS][2];
+extern const aom_cdf_prob av1_pareto8_token_probs[COEFF_PROB_MODELS]
+ [ENTROPY_TOKENS - 2];
+extern const aom_cdf_prob av1_pareto8_tail_probs[COEFF_PROB_MODELS]
+ [ENTROPY_TOKENS - 3];
+struct frame_contexts;
+#if CONFIG_NEW_TOKENSET
+void av1_coef_head_cdfs(struct frame_contexts *fc);
+#endif
+void av1_coef_pareto_cdfs(struct frame_contexts *fc);
+#endif // CONFIG_EC_MULTISYMBOL
+
+typedef char ENTROPY_CONTEXT;
+
+static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
+ ENTROPY_CONTEXT b) {
+ return (a != 0) + (b != 0);
+}
+
+static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l) {
+ ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
+
+#if CONFIG_CB4X4
+ switch (tx_size) {
+ case TX_2X2:
+ above_ec = a[0] != 0;
+ left_ec = l[0] != 0;
+ break;
+ case TX_4X4:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_4X8:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_8X4:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_8X8:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_8X16:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_16X8:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_16X16:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_16X32:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
+ break;
+ case TX_32X16:
+ above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_32X32:
+ above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
+ left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
+ break;
+ default: assert(0 && "Invalid transform size."); break;
+ }
+ return combine_entropy_contexts(above_ec, left_ec);
+#endif
+
+ switch (tx_size) {
+ case TX_4X4:
+ above_ec = a[0] != 0;
+ left_ec = l[0] != 0;
+ break;
+ case TX_4X8:
+ above_ec = a[0] != 0;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_8X4:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = l[0] != 0;
+ break;
+ case TX_8X16:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_16X8:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_16X32:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_32X16:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_8X8:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_16X16:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_32X32:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint64_t *)l;
+ break;
+#if CONFIG_TX64X64
+ case TX_64X64:
+ above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
+ left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0 && "Invalid transform size."); break;
+ }
+ return combine_entropy_contexts(above_ec, left_ec);
+}
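+
+// Note: each ENTROPY_CONTEXT byte covers one minimal transform unit along
+// the block edge, so the wider loads above (uint16_t/uint32_t/uint64_t)
+// test 2, 4 or 8 of those bytes with a single non-zero check.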
+
+#define COEF_COUNT_SAT 24
+#define COEF_MAX_UPDATE_FACTOR 112
+#define COEF_COUNT_SAT_AFTER_KEY 24
+#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
+
+#if CONFIG_ADAPT_SCAN
+#define ADAPT_SCAN_UPDATE_RATE_16 (1 << 13)
+#endif
+
+static INLINE aom_prob av1_merge_probs(aom_prob pre_prob,
+ const unsigned int ct[2],
+ unsigned int count_sat,
+ unsigned int max_update_factor) {
+ return merge_probs(pre_prob, ct, count_sat, max_update_factor);
+}
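+
+// Sketch of the update performed by merge_probs (see aom_dsp/prob.h for the
+// exact rounding): the counts yield an empirical probability
+//   p ~= 256 * ct[0] / (ct[0] + ct[1])
+// which is blended with pre_prob using a weight that saturates once the
+// total count reaches count_sat:
+//   factor = max_update_factor * min(ct[0] + ct[1], count_sat) / count_sat
+//   result ~= (pre_prob * (256 - factor) + p * factor) / 256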
+
+static INLINE aom_prob av1_mode_mv_merge_probs(aom_prob pre_prob,
+ const unsigned int ct[2]) {
+ return mode_mv_merge_probs(pre_prob, ct);
+}
+
+#if CONFIG_EC_ADAPT
+void av1_average_tile_coef_cdfs(struct frame_contexts *fc,
+ struct frame_contexts *ec_ctxs[],
+ aom_cdf_prob *cdf_ptrs[], int num_tiles);
+void av1_average_tile_mv_cdfs(struct frame_contexts *fc,
+ struct frame_contexts *ec_ctxs[],
+ aom_cdf_prob *cdf_ptrs[], int num_tiles);
+void av1_average_tile_intra_cdfs(struct frame_contexts *fc,
+ struct frame_contexts *ec_ctxs[],
+ aom_cdf_prob *cdf_ptrs[], int num_tiles);
+void av1_average_tile_inter_cdfs(struct AV1Common *cm,
+ struct frame_contexts *fc,
+ struct frame_contexts *ec_ctxs[],
+ aom_cdf_prob *cdf_ptrs[], int num_tiles);
+#if CONFIG_PVQ
+void av1_default_pvq_probs(struct AV1Common *cm);
+void av1_average_tile_pvq_cdfs(struct frame_contexts *fc,
+ struct frame_contexts *ec_ctxs[], int num_tiles);
+#endif // CONFIG_PVQ
+#endif // CONFIG_EC_ADAPT
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_ENTROPY_H_
diff --git a/third_party/aom/av1/common/entropymode.c b/third_party/aom/av1/common/entropymode.c
new file mode 100644
index 000000000..0fcf762d1
--- /dev/null
+++ b/third_party/aom/av1/common/entropymode.c
@@ -0,0 +1,3792 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/scan.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/seg_common.h"
+
+#if CONFIG_LV_MAP
+const aom_prob default_txb_skip[TX_SIZES][TXB_SKIP_CONTEXTS] = {
+#if CONFIG_CB4X4
+ { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 },
+#endif
+ { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 },
+ { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 },
+ { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 },
+ { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 },
+};
+const aom_prob default_dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS] = {
+ { 125, 102, 147 }, { 119, 101, 135 },
+};
+
+const aom_prob default_coeff_base
+ [TX_SIZES][PLANE_TYPES][NUM_BASE_LEVELS][COEFF_BASE_CONTEXTS] = {
+#if CONFIG_CB4X4
+ { // TX_2X2
+ {
+ { 73, 128, 131, 204, 165, 226, 169, 236, 18, 128, 51,
+ 153, 97, 179, 123, 201, 145, 226, 20, 128, 59, 153,
+ 107, 181, 129, 201, 142, 226, 3, 128, 19, 99, 46,
+ 135, 92, 166, 129, 190, 157, 217, 128, 128 },
+
+ { 128, 128, 178, 218, 192, 236, 186, 243, 55, 128, 110,
+ 183, 151, 205, 168, 221, 180, 238, 65, 128, 116, 178,
+ 157, 206, 172, 222, 183, 238, 24, 128, 65, 127, 104,
+ 164, 154, 195, 187, 216, 205, 230, 128, 128 },
+ },
+ {
+ { 73, 128, 131, 204, 165, 226, 169, 236, 18, 128, 51,
+ 153, 97, 179, 123, 201, 145, 226, 20, 128, 59, 153,
+ 107, 181, 129, 201, 142, 226, 3, 128, 19, 99, 46,
+ 135, 92, 166, 129, 190, 157, 217, 128, 128 },
+
+ { 128, 128, 178, 218, 192, 236, 186, 243, 55, 128, 110,
+ 183, 151, 205, 168, 221, 180, 238, 65, 128, 116, 178,
+ 157, 206, 172, 222, 183, 238, 24, 128, 65, 127, 104,
+ 164, 154, 195, 187, 216, 205, 230, 128, 128 },
+ } },
+#endif
+ { // TX_4X4
+ {
+ // PLANE_Y
+ { 73, 128, 131, 204, 165, 226, 169, 236, 18, 128, 51,
+ 153, 97, 179, 123, 201, 145, 226, 20, 128, 59, 153,
+ 107, 181, 129, 201, 142, 226, 3, 128, 19, 99, 46,
+ 135, 92, 166, 129, 190, 157, 217, 128, 128 },
+
+ { 128, 128, 178, 218, 192, 236, 186, 243, 55, 128, 110,
+ 183, 151, 205, 168, 221, 180, 238, 65, 128, 116, 178,
+ 157, 206, 172, 222, 183, 238, 24, 128, 65, 127, 104,
+ 164, 154, 195, 187, 216, 205, 230, 128, 128 },
+ },
+ {
+ // PLANE_UV
+ { 47, 128, 100, 176, 140, 207, 150, 223, 11, 128, 35,
+ 133, 79, 165, 115, 186, 129, 210, 8, 128, 30, 114,
+ 80, 159, 116, 187, 146, 214, 2, 128, 9, 59, 28,
+ 86, 71, 131, 117, 165, 149, 188, 128, 128 },
+
+ { 83, 128, 152, 205, 168, 227, 192, 238, 42, 128, 92,
+ 169, 138, 193, 165, 209, 128, 206, 36, 128, 86, 159,
+ 141, 198, 181, 213, 102, 223, 18, 128, 50, 132, 90,
+ 144, 141, 169, 180, 191, 128, 217, 128, 128 },
+ } },
+ {
+ // TX_8X8
+ {
+ // PLANE_Y
+ { 82, 128, 143, 203, 177, 225, 186, 237, 7, 128, 37,
+ 109, 78, 151, 110, 182, 139, 213, 25, 128, 51, 115,
+ 86, 146, 111, 175, 125, 205, 3, 128, 12, 55, 32,
+ 78, 63, 111, 96, 148, 123, 185, 146, 206 },
+
+ { 136, 128, 182, 220, 201, 236, 205, 243, 46, 128, 101,
+ 164, 147, 194, 170, 218, 177, 234, 62, 128, 104, 146,
+ 143, 183, 165, 207, 183, 228, 30, 128, 60, 95, 95,
+ 128, 135, 163, 166, 196, 175, 219, 192, 231 },
+ },
+ {
+ // PLANE_UV
+ { 47, 128, 112, 189, 164, 202, 163, 218, 8, 128, 32,
+ 110, 68, 151, 102, 179, 134, 195, 5, 128, 22, 76,
+ 54, 103, 80, 146, 101, 182, 1, 128, 5, 39, 17,
+ 53, 46, 93, 79, 127, 112, 161, 64, 195 },
+
+ { 90, 128, 156, 210, 183, 225, 128, 236, 39, 128, 98,
+ 164, 146, 201, 209, 219, 171, 208, 32, 128, 68, 123,
+ 119, 169, 154, 184, 128, 213, 15, 128, 38, 111, 83,
+ 112, 120, 163, 180, 170, 154, 213, 128, 205 },
+ },
+ },
+
+ {
+ // TX_16X16
+ {
+ // PLANE_Y
+ { 96, 128, 169, 218, 208, 233, 187, 244, 10, 128, 34,
+ 101, 82, 153, 113, 184, 137, 212, 6, 128, 34, 104,
+ 81, 145, 109, 176, 147, 202, 1, 128, 3, 43, 15,
+ 53, 43, 89, 79, 129, 108, 168, 110, 194 },
+
+ { 156, 128, 206, 232, 218, 240, 128, 251, 39, 128, 108,
+ 161, 156, 202, 187, 216, 179, 234, 40, 128, 103, 152,
+ 144, 185, 159, 208, 205, 227, 14, 128, 39, 84, 76,
+ 110, 121, 151, 157, 187, 201, 206, 64, 216 },
+ },
+ {
+ // PLANE_UV
+ { 42, 128, 139, 211, 180, 230, 199, 238, 3, 128, 32,
+ 96, 69, 145, 102, 186, 117, 212, 4, 128, 25, 72,
+ 55, 111, 81, 159, 116, 198, 1, 128, 4, 22, 16,
+ 34, 35, 68, 63, 116, 89, 165, 102, 199 },
+
+ { 135, 128, 193, 227, 182, 239, 128, 246, 42, 128, 115,
+ 156, 146, 203, 188, 216, 128, 229, 32, 128, 82, 127,
+ 120, 178, 165, 203, 213, 229, 11, 128, 32, 73, 79,
+ 111, 129, 158, 162, 187, 156, 209, 85, 222 },
+ },
+ },
+
+ {
+ // TX_32X32
+ {
+ // PLANE_Y
+ { 97, 128, 163, 232, 191, 246, 219, 252, 3, 128, 41,
+ 108, 91, 147, 104, 183, 118, 225, 6, 128, 45, 91,
+ 83, 125, 92, 160, 99, 215, 1, 128, 11, 36, 28,
+ 46, 43, 59, 57, 86, 73, 145, 91, 210 },
+
+ { 127, 128, 201, 239, 247, 248, 128, 254, 40, 128, 103,
+ 152, 158, 199, 186, 225, 181, 242, 38, 128, 92, 112,
+ 146, 189, 162, 217, 112, 239, 17, 128, 30, 47, 63,
+ 89, 113, 146, 147, 187, 168, 217, 150, 233 },
+ },
+ {
+ // PLANE_UV
+ { 65, 128, 155, 223, 166, 235, 154, 244, 15, 128, 57,
+ 154, 110, 199, 159, 224, 149, 239, 9, 128, 57, 140,
+ 97, 185, 148, 218, 176, 236, 1, 128, 3, 43, 19,
+ 42, 64, 98, 117, 167, 154, 199, 128, 158 },
+
+ { 130, 128, 189, 231, 171, 247, 128, 246, 63, 128, 132,
+ 222, 186, 224, 199, 244, 128, 247, 55, 128, 113, 211,
+ 164, 230, 225, 243, 128, 239, 7, 128, 31, 102, 106,
+ 138, 147, 183, 171, 223, 171, 224, 128, 128 },
+ },
+ },
+ };
+
+const aom_prob default_nz_map[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] = {
+#if CONFIG_CB4X4
+ {
+ { 34, 103, 61, 106, 62, 160, 112, 54, 173, 121,
+ 75, 157, 92, 75, 157, 129, 94, 65, 52, 37 },
+ { 52, 124, 84, 136, 107, 197, 161, 82, 183, 151,
+ 109, 153, 140, 103, 152, 134, 109, 81, 69, 50 },
+ },
+#endif
+ {
+ { 34, 103, 61, 106, 62, 160, 112, 54, 173, 121,
+ 75, 157, 92, 75, 157, 129, 94, 65, 52, 37 },
+ { 52, 124, 84, 136, 107, 197, 161, 82, 183, 151,
+ 109, 153, 140, 103, 152, 134, 109, 81, 69, 50 },
+ },
+ {
+ { 34, 127, 74, 124, 74, 204, 153, 76, 226, 162,
+ 92, 207, 126, 91, 227, 192, 149, 108, 85, 55 },
+ { 43, 136, 115, 158, 130, 212, 187, 112, 231, 180,
+ 130, 202, 164, 130, 236, 204, 168, 139, 112, 114 },
+ },
+ {
+ { 25, 117, 70, 120, 77, 215, 171, 102, 234, 156,
+ 105, 235, 155, 109, 247, 220, 176, 127, 92, 72 },
+ { 24, 88, 49, 100, 62, 202, 148, 62, 237, 178,
+ 102, 233, 168, 105, 244, 198, 162, 127, 103, 71 },
+ },
+ {
+ { 11, 54, 17, 69, 26, 128, 125, 56, 232, 130,
+ 60, 237, 121, 66, 250, 168, 134, 114, 93, 53 },
+ { 21, 52, 32, 95, 64, 171, 152, 70, 247, 159,
+ 81, 252, 177, 100, 252, 221, 192, 143, 195, 146 },
+ },
+};
+
+const aom_prob default_eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] = {
+#if CONFIG_CB4X4
+ {
+ { 229, 236, 231, 222, 239, 236, 214, 201, 236, 226, 195, 134, 228,
+ 210, 150, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 182, 186, 172, 176, 207, 213, 152, 122, 187, 171, 131, 65, 170,
+ 134, 101, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ },
+#endif
+ {
+ { 229, 236, 231, 222, 239, 236, 214, 201, 236, 226, 195, 134, 228,
+ 210, 150, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 182, 186, 172, 176, 207, 213, 152, 122, 187, 171, 131, 65, 170,
+ 134, 101, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ { 225, 234, 244, 236, 205, 242, 246, 247, 246, 234, 191, 242, 237,
+ 215, 142, 224, 206, 142, 73, 128, 128, 128, 128, 128, 128 },
+ { 154, 171, 187, 175, 62, 199, 202, 206, 215, 200, 111, 197, 199,
+ 174, 100, 135, 105, 104, 45, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ { 180, 213, 216, 229, 233, 232, 240, 235, 220, 178, 239, 238, 225,
+ 187, 229, 214, 226, 200, 183, 141, 158, 179, 128, 128, 128 },
+ { 190, 225, 234, 248, 249, 248, 253, 251, 232, 110, 254, 252, 236,
+ 57, 253, 248, 232, 85, 244, 189, 112, 64, 128, 128, 128 },
+ },
+ {
+ { 248, 224, 246, 244, 239, 245, 251, 246, 251, 255, 255, 255, 249,
+ 255, 255, 255, 229, 255, 255, 255, 228, 255, 255, 247, 137 },
+ { 204, 207, 233, 215, 193, 228, 239, 221, 227, 250, 236, 207, 135,
+ 236, 186, 182, 57, 209, 140, 128, 85, 184, 110, 128, 128 },
+ },
+};
+
+const aom_prob default_coeff_lps[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] = {
+#if CONFIG_CB4X4
+ {
+ { 164, 128, 134, 165, 128, 137, 168, 128, 97, 136, 167, 128,
+ 182, 205, 143, 172, 200, 145, 173, 193, 103, 137, 170, 191,
+ 198, 214, 162, 187, 209, 162, 187, 207, 128, 156, 183, 201,
+ 219, 230, 204, 210, 225, 201, 209, 225, 187, 190, 203, 214 },
+ { 106, 128, 98, 126, 128, 87, 122, 128, 54, 89, 131, 128,
+ 142, 180, 123, 154, 189, 115, 149, 175, 79, 115, 157, 182,
+ 175, 197, 147, 174, 199, 145, 174, 201, 89, 135, 173, 194,
+ 212, 222, 206, 203, 223, 188, 201, 220, 128, 144, 202, 206 },
+ },
+#endif
+ {
+ { 164, 128, 134, 165, 128, 137, 168, 128, 97, 136, 167, 128,
+ 182, 205, 143, 172, 200, 145, 173, 193, 103, 137, 170, 191,
+ 198, 214, 162, 187, 209, 162, 187, 207, 128, 156, 183, 201,
+ 219, 230, 204, 210, 225, 201, 209, 225, 187, 190, 203, 214 },
+ { 106, 128, 98, 126, 128, 87, 122, 128, 54, 89, 131, 128,
+ 142, 180, 123, 154, 189, 115, 149, 175, 79, 115, 157, 182,
+ 175, 197, 147, 174, 199, 145, 174, 201, 89, 135, 173, 194,
+ 212, 222, 206, 203, 223, 188, 201, 220, 128, 144, 202, 206 },
+ },
+ {
+ { 171, 128, 123, 169, 128, 121, 165, 128, 82, 125, 168, 128,
+ 191, 213, 143, 177, 199, 136, 170, 194, 95, 135, 171, 195,
+ 206, 222, 166, 191, 212, 154, 184, 207, 115, 149, 180, 204,
+ 223, 237, 196, 215, 231, 186, 209, 228, 158, 178, 201, 222 },
+ { 115, 128, 115, 146, 128, 91, 147, 128, 55, 93, 139, 128,
+ 147, 190, 141, 176, 201, 123, 156, 173, 68, 114, 156, 195,
+ 186, 205, 153, 191, 214, 141, 179, 205, 107, 132, 166, 184,
+ 215, 225, 200, 212, 230, 102, 207, 222, 128, 119, 200, 212 },
+ },
+ {
+ { 185, 128, 134, 198, 128, 128, 195, 128, 58, 110, 162, 128,
+ 208, 227, 154, 196, 206, 144, 188, 209, 83, 130, 168, 198,
+ 219, 232, 167, 205, 222, 158, 196, 216, 107, 143, 178, 204,
+ 233, 244, 202, 226, 238, 191, 217, 234, 153, 178, 200, 223 },
+ { 160, 128, 154, 197, 128, 129, 178, 128, 53, 112, 157, 128,
+ 185, 214, 169, 196, 221, 134, 179, 186, 82, 131, 168, 194,
+ 204, 220, 176, 209, 221, 173, 194, 209, 107, 154, 181, 203,
+ 230, 241, 202, 226, 237, 185, 223, 234, 162, 187, 203, 222 },
+ },
+ {
+ { 177, 128, 165, 226, 128, 152, 219, 128, 45, 129, 188, 128,
+ 198, 218, 179, 220, 228, 163, 214, 220, 72, 134, 181, 206,
+ 216, 225, 177, 218, 231, 158, 213, 223, 112, 150, 185, 210,
+ 245, 251, 204, 234, 247, 195, 231, 243, 163, 186, 213, 235 },
+ { 161, 128, 174, 205, 128, 146, 182, 128, 59, 125, 179, 128,
+ 183, 208, 199, 220, 239, 184, 213, 217, 71, 141, 196, 217,
+ 213, 219, 215, 230, 237, 171, 224, 238, 112, 173, 193, 221,
+ 239, 246, 168, 243, 249, 93, 241, 247, 128, 195, 216, 233 },
+ },
+};
+#endif // CONFIG_LV_MAP
+
+#if CONFIG_ALT_INTRA
+
+const aom_prob av1_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
+ {
+ // above = dc
+ { 121, 30, 54, 128, 164, 158, 45, 41, 57, 91 }, // left = dc
+ { 91, 38, 101, 102, 124, 141, 49, 48, 45, 73 }, // left = v
+ { 66, 28, 27, 177, 225, 178, 32, 27, 52, 114 }, // left = h
+ { 106, 23, 50, 101, 134, 148, 64, 50, 49, 107 }, // left = d45
+ { 75, 24, 32, 118, 66, 143, 42, 28, 57, 74 }, // left = d135
+ { 95, 24, 40, 142, 56, 141, 72, 121, 129, 255 }, // left = d117
+ { 71, 14, 25, 126, 117, 201, 28, 21, 117, 89 }, // left = d153
+ { 85, 16, 37, 110, 163, 178, 41, 28, 48, 134 }, // left = d207
+ { 86, 25, 32, 83, 105, 133, 58, 81, 46, 95 }, // left = d63
+ { 79, 25, 38, 75, 150, 255, 30, 49, 34, 51 }, // left = smooth
+ { 68, 59, 48, 122, 193, 158, 43, 46, 46, 112 }, // left = paeth
+ },
+ {
+ // above = v
+ { 66, 21, 118, 111, 145, 107, 27, 50, 27, 54 }, // left = dc
+ { 52, 25, 167, 81, 120, 101, 34, 55, 19, 32 }, // left = v
+ { 56, 18, 72, 134, 208, 139, 31, 34, 27, 89 }, // left = h
+ { 75, 21, 94, 88, 134, 123, 49, 57, 30, 68 }, // left = d45
+ { 54, 18, 95, 96, 78, 107, 33, 49, 28, 65 }, // left = d135
+ { 61, 19, 121, 131, 58, 101, 56, 143, 120, 255 }, // left = d117
+ { 53, 13, 78, 103, 110, 147, 31, 41, 64, 77 }, // left = d153
+ { 69, 14, 78, 93, 167, 121, 31, 39, 25, 113 }, // left = d207
+ { 64, 18, 103, 79, 90, 108, 34, 73, 27, 69 }, // left = d63
+ { 52, 20, 103, 61, 161, 255, 22, 42, 16, 35 }, // left = smooth
+ { 50, 31, 124, 92, 161, 120, 50, 53, 23, 60 }, // left = paeth
+ },
+ {
+ // above = h
+ { 94, 29, 31, 158, 214, 178, 35, 31, 72, 111 }, // left = dc
+ { 72, 37, 72, 149, 184, 177, 43, 40, 53, 105 }, // left = v
+ { 53, 21, 14, 196, 242, 209, 29, 19, 55, 145 }, // left = h
+ { 93, 36, 36, 104, 176, 166, 56, 37, 49, 141 }, // left = d45
+ { 84, 32, 27, 124, 108, 143, 38, 36, 76, 134 }, // left = d135
+ { 82, 31, 47, 142, 122, 161, 83, 73, 126, 255 }, // left = d117
+ { 66, 16, 20, 133, 148, 210, 30, 17, 113, 104 }, // left = d153
+ { 76, 16, 17, 129, 207, 181, 41, 20, 46, 163 }, // left = d207
+ { 72, 38, 21, 100, 142, 171, 37, 70, 49, 111 }, // left = d63
+ { 61, 30, 27, 115, 208, 255, 27, 31, 44, 63 }, // left = smooth
+ { 53, 45, 29, 157, 222, 185, 49, 37, 55, 102 }, // left = paeth
+ },
+ {
+ // above = d45
+ { 96, 18, 37, 98, 138, 154, 68, 56, 59, 96 }, // left = dc
+ { 73, 18, 92, 81, 125, 132, 75, 64, 27, 67 }, // left = v
+ { 73, 17, 27, 128, 213, 154, 56, 44, 32, 105 }, // left = h
+ { 101, 20, 21, 75, 138, 138, 82, 56, 23, 154 }, // left = d45
+ { 71, 15, 33, 91, 70, 150, 62, 55, 38, 118 }, // left = d135
+ { 80, 19, 38, 116, 69, 122, 88, 132, 92, 255 }, // left = d117
+ { 68, 11, 22, 101, 116, 179, 52, 44, 85, 96 }, // left = d153
+ { 101, 8, 59, 77, 151, 170, 53, 41, 35, 172 }, // left = d207
+ { 82, 19, 24, 81, 172, 129, 82, 128, 43, 108 }, // left = d63
+ { 66, 18, 42, 64, 143, 255, 52, 52, 25, 83 }, // left = smooth
+ { 57, 24, 42, 85, 169, 145, 104, 71, 34, 86 }, // left = paeth
+ },
+ {
+ // above = d135
+ { 85, 15, 29, 113, 83, 176, 26, 29, 70, 110 }, // left = dc
+ { 78, 28, 49, 111, 91, 141, 30, 42, 48, 75 }, // left = v
+ { 56, 21, 16, 146, 190, 178, 23, 31, 49, 92 }, // left = h
+ { 70, 19, 20, 65, 90, 173, 97, 36, 57, 98 }, // left = d45
+ { 77, 14, 26, 110, 51, 156, 34, 35, 54, 74 }, // left = d135
+ { 78, 18, 36, 153, 47, 131, 62, 102, 155, 255 }, // left = d117
+ { 56, 11, 15, 115, 85, 196, 32, 45, 81, 96 }, // left = d153
+ { 90, 18, 24, 95, 126, 159, 34, 31, 46, 136 }, // left = d207
+ { 80, 23, 28, 90, 75, 141, 39, 50, 46, 87 }, // left = d63
+ { 63, 22, 31, 91, 110, 255, 26, 43, 51, 51 }, // left = smooth
+ { 66, 32, 31, 122, 145, 165, 40, 43, 56, 79 }, // left = paeth
+ },
+ {
+ // above = d117
+ { 81, 16, 61, 170, 74, 105, 54, 105, 113, 255 }, // left = dc
+ { 74, 20, 86, 163, 64, 97, 65, 129, 101, 255 }, // left = v
+ { 63, 15, 47, 168, 141, 176, 69, 77, 77, 255 }, // left = h
+ { 70, 17, 59, 97, 78, 114, 74, 122, 80, 255 }, // left = d45
+ { 78, 13, 50, 153, 34, 126, 75, 114, 120, 255 }, // left = d135
+ { 72, 16, 69, 159, 28, 108, 63, 134, 107, 255 }, // left = d117
+ { 66, 9, 47, 131, 79, 148, 41, 88, 105, 255 }, // left = d153
+ { 78, 12, 60, 119, 105, 133, 47, 95, 63, 255 }, // left = d207
+ { 82, 21, 58, 128, 61, 98, 64, 136, 91, 255 }, // left = d63
+ { 23, 26, 28, 96, 85, 128, 51, 64, 85, 128 }, // left = smooth
+ { 58, 27, 62, 162, 109, 151, 75, 106, 78, 255 }, // left = paeth
+ },
+ {
+ // above = d153
+ { 91, 18, 25, 121, 166, 173, 25, 25, 128, 102 }, // left = dc
+ { 80, 27, 51, 111, 141, 147, 45, 38, 70, 85 }, // left = v
+ { 53, 12, 11, 154, 197, 225, 17, 17, 74, 145 }, // left = h
+ { 93, 27, 23, 111, 143, 188, 43, 39, 69, 112 }, // left = d45
+ { 83, 15, 21, 118, 67, 178, 40, 33, 73, 92 }, // left = d135
+ { 94, 13, 31, 132, 66, 110, 61, 82, 148, 255 }, // left = d117
+ { 76, 9, 11, 96, 105, 201, 16, 13, 157, 97 }, // left = d153
+ { 70, 10, 12, 100, 172, 201, 23, 17, 53, 158 }, // left = d207
+ { 114, 25, 21, 104, 108, 163, 30, 47, 53, 111 }, // left = d63
+ { 70, 16, 21, 80, 157, 255, 25, 30, 81, 69 }, // left = smooth
+ { 87, 32, 26, 120, 191, 168, 32, 33, 70, 118 }, // left = paeth
+ },
+ {
+ // above = d207
+ { 98, 20, 39, 122, 168, 188, 38, 36, 54, 132 }, // left = dc
+ { 81, 37, 62, 97, 122, 153, 38, 43, 36, 118 }, // left = v
+ { 71, 21, 22, 154, 227, 183, 37, 31, 46, 140 }, // left = h
+ { 90, 34, 19, 93, 144, 194, 65, 47, 41, 163 }, // left = d45
+ { 78, 20, 27, 91, 93, 173, 57, 52, 49, 113 }, // left = d135
+ { 79, 25, 45, 121, 101, 147, 69, 56, 122, 255 }, // left = d117
+ { 73, 13, 19, 105, 122, 206, 40, 28, 91, 126 }, // left = d153
+ { 101, 14, 22, 87, 153, 169, 33, 25, 26, 175 }, // left = d207
+ { 81, 28, 23, 86, 115, 169, 48, 56, 41, 111 }, // left = d63
+ { 70, 24, 30, 90, 180, 255, 38, 26, 36, 82 }, // left = smooth
+ { 61, 37, 30, 94, 189, 163, 76, 50, 36, 127 }, // left = paeth
+ },
+ {
+ // above = d63
+ { 77, 13, 46, 86, 138, 117, 55, 88, 34, 68 }, // left = dc
+ { 68, 17, 80, 64, 105, 108, 66, 115, 32, 45 }, // left = v
+ { 62, 13, 37, 124, 210, 131, 46, 57, 28, 103 }, // left = h
+ { 88, 15, 45, 73, 134, 145, 73, 101, 37, 87 }, // left = d45
+ { 68, 16, 35, 78, 81, 133, 54, 71, 33, 67 }, // left = d135
+ { 71, 16, 57, 108, 61, 135, 71, 184, 113, 255 }, // left = d117
+ { 55, 10, 27, 69, 107, 158, 39, 76, 82, 95 }, // left = d153
+ { 80, 9, 38, 78, 153, 145, 50, 63, 28, 123 }, // left = d207
+ { 86, 12, 33, 49, 107, 135, 64, 134, 57, 89 }, // left = d63
+ { 56, 19, 55, 60, 163, 255, 38, 84, 22, 36 }, // left = smooth
+ { 53, 17, 60, 69, 151, 126, 73, 113, 26, 80 }, // left = paeth
+ },
+ {
+ // above = smooth
+ { 79, 16, 46, 89, 167, 255, 22, 36, 29, 42 }, // left = dc
+ { 63, 22, 88, 71, 131, 255, 26, 41, 21, 35 }, // left = v
+ { 51, 18, 28, 142, 232, 255, 26, 25, 25, 75 }, // left = h
+ { 75, 18, 43, 70, 140, 255, 37, 49, 34, 89 }, // left = d45
+ { 70, 14, 35, 87, 83, 255, 30, 36, 34, 50 }, // left = d135
+ { 23, 26, 28, 96, 85, 128, 51, 64, 85, 128 }, // left = d117
+ { 74, 12, 33, 83, 128, 255, 27, 33, 58, 68 }, // left = d153
+ { 66, 11, 30, 77, 179, 255, 21, 27, 23, 113 }, // left = d207
+ { 68, 22, 40, 65, 118, 255, 28, 61, 30, 50 }, // left = d63
+ { 60, 18, 44, 69, 141, 255, 18, 32, 22, 40 }, // left = smooth
+ { 52, 32, 54, 96, 194, 255, 33, 37, 25, 53 }, // left = paeth
+ },
+ {
+ // above = paeth
+ { 76, 47, 67, 123, 182, 150, 41, 52, 55, 97 }, // left = dc
+ { 69, 40, 125, 102, 138, 138, 42, 55, 32, 70 }, // left = v
+ { 46, 28, 27, 160, 232, 169, 34, 21, 32, 122 }, // left = h
+ { 78, 35, 41, 99, 128, 124, 49, 43, 35, 111 }, // left = d45
+ { 66, 28, 47, 100, 113, 145, 37, 40, 72, 93 }, // left = d135
+ { 77, 37, 76, 134, 124, 124, 65, 122, 88, 255 }, // left = d117
+ { 53, 23, 38, 108, 128, 204, 26, 32, 115, 114 }, // left = d153
+ { 65, 20, 29, 101, 202, 186, 29, 24, 29, 188 }, // left = d207
+ { 71, 24, 49, 81, 126, 151, 36, 65, 28, 93 }, // left = d63
+ { 54, 36, 53, 94, 193, 255, 25, 38, 20, 64 }, // left = smooth
+ { 52, 54, 60, 108, 176, 168, 47, 44, 50, 105 }, // left = paeth
+ },
+};
+
+static const aom_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
+ { 88, 16, 47, 133, 143, 150, 70, 48, 84, 122 }, // block_size < 8x8
+ { 75, 26, 51, 120, 158, 157, 44, 45, 56, 102 }, // block_size < 16x16
+ { 73, 24, 60, 115, 184, 164, 26, 36, 32, 63 }, // block_size < 32x32
+ { 96, 27, 50, 107, 221, 148, 16, 22, 14, 39 }, // block_size >= 32x32
+};
+
+static const aom_prob default_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
+ { 199, 3, 79, 179, 220, 109, 38, 50, 68, 138 }, // y = dc
+ { 17, 2, 219, 136, 131, 58, 21, 106, 23, 41 }, // y = v
+ { 26, 1, 5, 244, 253, 138, 16, 21, 68, 205 }, // y = h
+ { 183, 3, 66, 94, 195, 97, 101, 104, 41, 178 }, // y = d45
+ { 178, 2, 36, 158, 99, 175, 21, 29, 105, 77 }, // y = d135
+ { 154, 3, 65, 219, 40, 48, 45, 95, 146, 255 }, // y = d117
+ { 167, 1, 16, 160, 214, 187, 10, 10, 200, 155 }, // y = d153
+ { 154, 2, 18, 178, 238, 132, 25, 21, 34, 221 }, // y = d207
+ { 153, 4, 76, 85, 157, 90, 38, 165, 46, 104 }, // y = d63
+ { 163, 3, 68, 87, 190, 255, 19, 27, 25, 46 }, // y = smooth
+ { 185, 7, 113, 171, 203, 57, 18, 69, 49, 104 }, // y = paeth
+};
+
+#else
+
+const aom_prob av1_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
+ {
+ // above = dc
+ { 137, 30, 42, 148, 151, 207, 70, 52, 91 }, // left = dc
+ { 92, 45, 102, 136, 116, 180, 74, 90, 100 }, // left = v
+ { 73, 32, 19, 187, 222, 215, 46, 34, 100 }, // left = h
+ { 91, 30, 32, 116, 121, 186, 93, 86, 94 }, // left = d45
+ { 72, 35, 36, 149, 68, 206, 68, 63, 105 }, // left = d135
+ { 73, 31, 28, 138, 57, 124, 55, 122, 151 }, // left = d117
+ { 67, 23, 21, 140, 126, 197, 40, 37, 171 }, // left = d153
+ { 86, 27, 28, 128, 154, 212, 45, 43, 53 }, // left = d207
+ { 74, 32, 27, 107, 86, 160, 63, 134, 102 }, // left = d63
+ { 59, 67, 44, 140, 161, 202, 78, 67, 119 } // left = tm
+ },
+ {
+ // above = v
+ { 63, 36, 126, 146, 123, 158, 60, 90, 96 }, // left = dc
+ { 43, 46, 168, 134, 107, 128, 69, 142, 92 }, // left = v
+ { 44, 29, 68, 159, 201, 177, 50, 57, 77 }, // left = h
+ { 58, 38, 76, 114, 97, 172, 78, 133, 92 }, // left = d45
+ { 46, 41, 76, 140, 63, 184, 69, 112, 57 }, // left = d135
+ { 38, 32, 85, 140, 46, 112, 54, 151, 133 }, // left = d117
+ { 39, 27, 61, 131, 110, 175, 44, 75, 136 }, // left = d153
+ { 52, 30, 74, 113, 130, 175, 51, 64, 58 }, // left = d207
+ { 47, 35, 80, 100, 74, 143, 64, 163, 74 }, // left = d63
+ { 36, 61, 116, 114, 128, 162, 80, 125, 82 } // left = tm
+ },
+ {
+ // above = h
+ { 82, 26, 26, 171, 208, 204, 44, 32, 105 }, // left = dc
+ { 55, 44, 68, 166, 179, 192, 57, 57, 108 }, // left = v
+ { 42, 26, 11, 199, 241, 228, 23, 15, 85 }, // left = h
+ { 68, 42, 19, 131, 160, 199, 55, 52, 83 }, // left = d45
+ { 58, 50, 25, 139, 115, 232, 39, 52, 118 }, // left = d135
+ { 50, 35, 33, 153, 104, 162, 64, 59, 131 }, // left = d117
+ { 44, 24, 16, 150, 177, 202, 33, 19, 156 }, // left = d153
+ { 55, 27, 12, 153, 203, 218, 26, 27, 49 }, // left = d207
+ { 53, 49, 21, 110, 116, 168, 59, 80, 76 }, // left = d63
+ { 38, 72, 19, 168, 203, 212, 50, 50, 107 } // left = tm
+ },
+ {
+ // above = d45
+ { 103, 26, 36, 129, 132, 201, 83, 80, 93 }, // left = dc
+ { 59, 38, 83, 112, 103, 162, 98, 136, 90 }, // left = v
+ { 62, 30, 23, 158, 200, 207, 59, 57, 50 }, // left = h
+ { 67, 30, 29, 84, 86, 191, 102, 91, 59 }, // left = d45
+ { 60, 32, 33, 112, 71, 220, 64, 89, 104 }, // left = d135
+ { 53, 26, 34, 130, 56, 149, 84, 120, 103 }, // left = d117
+ { 53, 21, 23, 133, 109, 210, 56, 77, 172 }, // left = d153
+ { 77, 19, 29, 112, 142, 228, 55, 66, 36 }, // left = d207
+ { 61, 29, 29, 93, 97, 165, 83, 175, 162 }, // left = d63
+ { 47, 47, 43, 114, 137, 181, 100, 99, 95 } // left = tm
+ },
+ {
+ // above = d135
+ { 69, 23, 29, 128, 83, 199, 46, 44, 101 }, // left = dc
+ { 53, 40, 55, 139, 69, 183, 61, 80, 110 }, // left = v
+ { 40, 29, 19, 161, 180, 207, 43, 24, 91 }, // left = h
+ { 60, 34, 19, 105, 61, 198, 53, 64, 89 }, // left = d45
+ { 52, 31, 22, 158, 40, 209, 58, 62, 89 }, // left = d135
+ { 44, 31, 29, 147, 46, 158, 56, 102, 198 }, // left = d117
+ { 35, 19, 12, 135, 87, 209, 41, 45, 167 }, // left = d153
+ { 55, 25, 21, 118, 95, 215, 38, 39, 66 }, // left = d207
+ { 51, 38, 25, 113, 58, 164, 70, 93, 97 }, // left = d63
+ { 47, 54, 34, 146, 108, 203, 72, 103, 151 } // left = tm
+ },
+ {
+ // above = d117
+ { 64, 19, 37, 156, 66, 138, 49, 95, 133 }, // left = dc
+ { 46, 27, 80, 150, 55, 124, 55, 121, 135 }, // left = v
+ { 36, 23, 27, 165, 149, 166, 54, 64, 118 }, // left = h
+ { 53, 21, 36, 131, 63, 163, 60, 109, 81 }, // left = d45
+ { 40, 26, 35, 154, 40, 185, 51, 97, 123 }, // left = d135
+ { 35, 19, 34, 179, 19, 97, 48, 129, 124 }, // left = d117
+ { 36, 20, 26, 136, 62, 164, 33, 77, 154 }, // left = d153
+ { 45, 18, 32, 130, 90, 157, 40, 79, 91 }, // left = d207
+ { 45, 26, 28, 129, 45, 129, 49, 147, 123 }, // left = d63
+ { 38, 44, 51, 136, 74, 162, 57, 97, 121 } // left = tm
+ },
+ {
+ // above = d153
+ { 75, 17, 22, 136, 138, 185, 32, 34, 166 }, // left = dc
+ { 56, 39, 58, 133, 117, 173, 48, 53, 187 }, // left = v
+ { 35, 21, 12, 161, 212, 207, 20, 23, 145 }, // left = h
+ { 56, 29, 19, 117, 109, 181, 55, 68, 112 }, // left = d45
+ { 47, 29, 17, 153, 64, 220, 59, 51, 114 }, // left = d135
+ { 46, 16, 24, 136, 76, 147, 41, 64, 172 }, // left = d117
+ { 34, 17, 11, 108, 152, 187, 13, 15, 209 }, // left = d153
+ { 51, 24, 14, 115, 133, 209, 32, 26, 104 }, // left = d207
+ { 55, 30, 18, 122, 79, 179, 44, 88, 116 }, // left = d63
+ { 37, 49, 25, 129, 168, 164, 41, 54, 148 } // left = tm
+ },
+ {
+ // above = d207
+ { 82, 22, 32, 127, 143, 213, 39, 41, 70 }, // left = dc
+ { 62, 44, 61, 123, 105, 189, 48, 57, 64 }, // left = v
+ { 47, 25, 17, 175, 222, 220, 24, 30, 86 }, // left = h
+ { 68, 36, 17, 106, 102, 206, 59, 74, 74 }, // left = d45
+ { 57, 39, 23, 151, 68, 216, 55, 63, 58 }, // left = d135
+ { 49, 30, 35, 141, 70, 168, 82, 40, 115 }, // left = d117
+ { 51, 25, 15, 136, 129, 202, 38, 35, 139 }, // left = d153
+ { 68, 26, 16, 111, 141, 215, 29, 28, 28 }, // left = d207
+ { 59, 39, 19, 114, 75, 180, 77, 104, 42 }, // left = d63
+ { 40, 61, 26, 126, 152, 206, 61, 59, 93 } // left = tm
+ },
+ {
+ // above = d63
+ { 78, 23, 39, 111, 117, 170, 74, 124, 94 }, // left = dc
+ { 48, 34, 86, 101, 92, 146, 78, 179, 134 }, // left = v
+ { 47, 22, 24, 138, 187, 178, 68, 69, 59 }, // left = h
+ { 56, 25, 33, 105, 112, 187, 95, 177, 129 }, // left = d45
+ { 48, 31, 27, 114, 63, 183, 82, 116, 56 }, // left = d135
+ { 43, 28, 37, 121, 63, 123, 61, 192, 169 }, // left = d117
+ { 42, 17, 24, 109, 97, 177, 56, 76, 122 }, // left = d153
+ { 58, 18, 28, 105, 139, 182, 70, 92, 63 }, // left = d207
+ { 46, 23, 32, 74, 86, 150, 67, 183, 88 }, // left = d63
+ { 36, 38, 48, 92, 122, 165, 88, 137, 91 } // left = tm
+ },
+ {
+ // above = tm
+ { 65, 70, 60, 155, 159, 199, 61, 60, 81 }, // left = dc
+ { 44, 78, 115, 132, 119, 173, 71, 112, 93 }, // left = v
+ { 39, 38, 21, 184, 227, 206, 42, 32, 64 }, // left = h
+ { 58, 47, 36, 124, 137, 193, 80, 82, 78 }, // left = d45
+ { 49, 50, 35, 144, 95, 205, 63, 78, 59 }, // left = d135
+ { 41, 53, 52, 148, 71, 142, 65, 128, 51 }, // left = d117
+ { 40, 36, 28, 143, 143, 202, 40, 55, 137 }, // left = d153
+ { 52, 34, 29, 129, 183, 227, 42, 35, 43 }, // left = d207
+ { 42, 44, 44, 104, 105, 164, 64, 130, 80 }, // left = d63
+ { 43, 81, 53, 140, 169, 204, 68, 84, 72 } // left = tm
+ }
+};
+
+// Default probabilities for signaling Intra mode for Y plane -- used only for
+// inter frames. ('av1_kf_y_mode_prob' is used for intra-only frames).
+// Context used: block size group.
+static const aom_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
+ { 65, 32, 18, 144, 162, 194, 41, 51, 98 }, // block_size < 8x8
+ { 132, 68, 18, 165, 217, 196, 45, 40, 78 }, // block_size < 16x16
+ { 173, 80, 19, 176, 240, 193, 64, 35, 46 }, // block_size < 32x32
+ { 221, 135, 38, 194, 248, 121, 96, 85, 29 } // block_size >= 32x32
+};
+
+// Default probabilities for signaling Intra mode for UV plane -- common for
+// both intra and inter frames.
+// Context used: Intra mode used by Y plane of the same block.
+static const aom_prob default_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
+ { 120, 7, 76, 176, 208, 126, 28, 54, 103 }, // y = dc
+ { 48, 12, 154, 155, 139, 90, 34, 117, 119 }, // y = v
+ { 67, 6, 25, 204, 243, 158, 13, 21, 96 }, // y = h
+ { 97, 5, 44, 131, 176, 139, 48, 68, 97 }, // y = d45
+ { 83, 5, 42, 156, 111, 152, 26, 49, 152 }, // y = d135
+ { 80, 5, 58, 178, 74, 83, 33, 62, 145 }, // y = d117
+ { 86, 5, 32, 154, 192, 168, 14, 22, 163 }, // y = d153
+ { 85, 5, 32, 156, 216, 148, 19, 29, 73 }, // y = d207
+ { 77, 7, 64, 116, 132, 122, 37, 126, 120 }, // y = d63
+ { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm
+};
+
+#endif // CONFIG_ALT_INTRA
+
+#if CONFIG_EXT_PARTITION_TYPES
+static const aom_prob
+ default_partition_probs[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1] = {
+ // 8x8 -> 4x4
+ { 199, 122, 141, 128, 128, 128, 128 }, // a/l both not split
+ { 147, 63, 159, 128, 128, 128, 128 }, // a split, l not split
+ { 148, 133, 118, 128, 128, 128, 128 }, // l split, a not split
+ { 121, 104, 114, 128, 128, 128, 128 }, // a/l both split
+ // 16x16 -> 8x8
+ { 174, 73, 87, 128, 128, 128, 128 }, // a/l both not split
+ { 92, 41, 83, 128, 128, 128, 128 }, // a split, l not split
+ { 82, 99, 50, 128, 128, 128, 128 }, // l split, a not split
+ { 53, 39, 39, 128, 128, 128, 128 }, // a/l both split
+ // 32x32 -> 16x16
+ { 177, 58, 59, 128, 128, 128, 128 }, // a/l both not split
+ { 68, 26, 63, 128, 128, 128, 128 }, // a split, l not split
+ { 52, 79, 25, 128, 128, 128, 128 }, // l split, a not split
+ { 17, 14, 12, 128, 128, 128, 128 }, // a/l both split
+ // 64x64 -> 32x32
+ { 222, 34, 30, 128, 128, 128, 128 }, // a/l both not split
+ { 72, 16, 44, 128, 128, 128, 128 }, // a split, l not split
+ { 58, 32, 12, 128, 128, 128, 128 }, // l split, a not split
+ { 10, 7, 6, 128, 128, 128, 128 }, // a/l both split
+#if CONFIG_EXT_PARTITION
+ // 128x128 -> 64x64
+ { 222, 34, 30, 128, 128, 128, 128 }, // a/l both not split
+ { 72, 16, 44, 128, 128, 128, 128 }, // a split, l not split
+ { 58, 32, 12, 128, 128, 128, 128 }, // l split, a not split
+ { 10, 7, 6, 128, 128, 128, 128 }, // a/l both split
+#endif // CONFIG_EXT_PARTITION
+#if CONFIG_UNPOISON_PARTITION_CTX
+ { 0, 0, 141, 0, 0, 0, 0 }, // 8x8 -> 4x4
+ { 0, 0, 87, 0, 0, 0, 0 }, // 16x16 -> 8x8
+ { 0, 0, 59, 0, 0, 0, 0 }, // 32x32 -> 16x16
+ { 0, 0, 30, 0, 0, 0, 0 }, // 64x64 -> 32x32
+#if CONFIG_EXT_PARTITION
+ { 0, 0, 30, 0, 0, 0, 0 }, // 128x128 -> 64x64
+#endif // CONFIG_EXT_PARTITION
+ { 0, 122, 0, 0, 0, 0, 0 }, // 8x8 -> 4x4
+ { 0, 73, 0, 0, 0, 0, 0 }, // 16x16 -> 8x8
+ { 0, 58, 0, 0, 0, 0, 0 }, // 32x32 -> 16x16
+ { 0, 34, 0, 0, 0, 0, 0 }, // 64x64 -> 32x32
+#if CONFIG_EXT_PARTITION
+ { 0, 34, 0, 0, 0, 0, 0 }, // 128x128 -> 64x64
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_UNPOISON_PARTITION_CTX
+ };
+#else
+static const aom_prob
+ default_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = {
+ // 8x8 -> 4x4
+ { 199, 122, 141 }, // a/l both not split
+ { 147, 63, 159 }, // a split, l not split
+ { 148, 133, 118 }, // l split, a not split
+ { 121, 104, 114 }, // a/l both split
+ // 16x16 -> 8x8
+ { 174, 73, 87 }, // a/l both not split
+ { 92, 41, 83 }, // a split, l not split
+ { 82, 99, 50 }, // l split, a not split
+ { 53, 39, 39 }, // a/l both split
+ // 32x32 -> 16x16
+ { 177, 58, 59 }, // a/l both not split
+ { 68, 26, 63 }, // a split, l not split
+ { 52, 79, 25 }, // l split, a not split
+ { 17, 14, 12 }, // a/l both split
+ // 64x64 -> 32x32
+ { 222, 34, 30 }, // a/l both not split
+ { 72, 16, 44 }, // a split, l not split
+ { 58, 32, 12 }, // l split, a not split
+ { 10, 7, 6 }, // a/l both split
+#if CONFIG_EXT_PARTITION
+ // 128x128 -> 64x64
+ { 222, 34, 30 }, // a/l both not split
+ { 72, 16, 44 }, // a split, l not split
+ { 58, 32, 12 }, // l split, a not split
+ { 10, 7, 6 }, // a/l both split
+#endif // CONFIG_EXT_PARTITION
+#if CONFIG_UNPOISON_PARTITION_CTX
+ { 0, 0, 141 }, // 8x8 -> 4x4
+ { 0, 0, 87 }, // 16x16 -> 8x8
+ { 0, 0, 59 }, // 32x32 -> 16x16
+ { 0, 0, 30 }, // 64x64 -> 32x32
+#if CONFIG_EXT_PARTITION
+ { 0, 0, 30 }, // 128x128 -> 64x64
+#endif // CONFIG_EXT_PARTITION
+ { 0, 122, 0 }, // 8x8 -> 4x4
+ { 0, 73, 0 }, // 16x16 -> 8x8
+ { 0, 58, 0 }, // 32x32 -> 16x16
+ { 0, 34, 0 }, // 64x64 -> 32x32
+#if CONFIG_EXT_PARTITION
+ { 0, 34, 0 }, // 128x128 -> 64x64
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_UNPOISON_PARTITION_CTX
+ };
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_REF_MV
+static const aom_prob default_newmv_prob[NEWMV_MODE_CONTEXTS] = {
+ 200, 180, 150, 150, 110, 70, 60,
+};
+
+static const aom_prob default_zeromv_prob[ZEROMV_MODE_CONTEXTS] = {
+ 192, 64,
+};
+
+static const aom_prob default_refmv_prob[REFMV_MODE_CONTEXTS] = {
+ 220, 220, 200, 200, 180, 128, 30, 220, 30,
+};
+
+static const aom_prob default_drl_prob[DRL_MODE_CONTEXTS] = { 128, 160, 180,
+ 128, 160 };
+#endif // CONFIG_REF_MV
+
+static const aom_prob
+ default_inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1] = {
+ { 2, 173, 34 }, // 0 = both zero mv
+ { 7, 145, 85 }, // 1 = one zero mv + one a predicted mv
+ { 7, 166, 63 }, // 2 = two predicted mvs
+ { 7, 94, 66 }, // 3 = one predicted/zero and one new mv
+ { 8, 64, 46 }, // 4 = two new mvs
+ { 17, 81, 31 }, // 5 = one intra neighbour + x
+ { 25, 29, 30 }, // 6 = two intra neighbours
+ };
+
+#if CONFIG_EXT_INTER
+static const aom_prob default_inter_compound_mode_probs
+ [INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES - 1] = {
+ { 2, 173, 68, 192, 64, 192, 128, 180, 180 }, // 0 = both zero mv
+ { 7, 145, 160, 192, 64, 192, 128, 180, 180 }, // 1 = 1 zero + 1 predicted
+ { 7, 166, 126, 192, 64, 192, 128, 180, 180 }, // 2 = two predicted mvs
+ { 7, 94, 132, 192, 64, 192, 128, 180, 180 }, // 3 = 1 pred/zero, 1 new
+ { 8, 64, 64, 192, 64, 192, 128, 180, 180 }, // 4 = two new mvs
+ { 17, 81, 52, 192, 64, 192, 128, 180, 180 }, // 5 = one intra neighbour
+ { 25, 29, 50, 192, 64, 192, 128, 180, 180 }, // 6 = two intra neighbours
+ };
+
+#if CONFIG_COMPOUND_SINGLEREF
+// TODO(zoeliu): Default values to be further adjusted based on the collected
+// stats.
+static const aom_prob default_inter_singleref_comp_mode_probs
+ [INTER_MODE_CONTEXTS][INTER_SINGLEREF_COMP_MODES - 1] = {
+ { 2, 173, 68, 180 }, // 0 = both zero mv
+ { 7, 145, 160, 180 }, // 1 = 1 zero + 1 predicted
+ { 7, 166, 126, 180 }, // 2 = two predicted mvs
+ { 7, 94, 132, 180 }, // 3 = 1 pred/zero, 1 new
+ { 8, 64, 64, 180 }, // 4 = two new mvs
+ { 17, 81, 52, 180 }, // 5 = one intra neighbour
+ { 25, 29, 50, 180 }, // 6 = two intra neighbours
+ };
+#endif // CONFIG_COMPOUND_SINGLEREF
+
+#if CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
+static const aom_prob
+ default_compound_type_probs[BLOCK_SIZES][COMPOUND_TYPES - 1] = {
+#if CONFIG_CB4X4
+ { 255, 255 }, { 255, 255 }, { 255, 255 },
+#endif
+ { 208, 200 }, { 208, 200 }, { 208, 200 }, { 208, 200 }, { 208, 200 },
+ { 208, 200 }, { 216, 200 }, { 216, 200 }, { 216, 200 }, { 224, 200 },
+ { 224, 200 }, { 240, 200 }, { 240, 200 },
+#if CONFIG_EXT_PARTITION
+ { 255, 200 }, { 255, 200 }, { 255, 200 },
+#endif // CONFIG_EXT_PARTITION
+ };
+#elif !CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
+static const aom_prob
+ default_compound_type_probs[BLOCK_SIZES][COMPOUND_TYPES - 1] = {
+#if CONFIG_CB4X4
+ { 208 }, { 208 }, { 208 },
+#endif
+ { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 216 },
+ { 216 }, { 216 }, { 224 }, { 224 }, { 240 }, { 240 },
+#if CONFIG_EXT_PARTITION
+ { 255 }, { 255 }, { 255 },
+#endif // CONFIG_EXT_PARTITION
+ };
+#elif CONFIG_COMPOUND_SEGMENT && !CONFIG_WEDGE
+static const aom_prob
+ default_compound_type_probs[BLOCK_SIZES][COMPOUND_TYPES - 1] = {
+#if CONFIG_CB4X4
+ { 208 }, { 208 }, { 208 },
+#endif
+ { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 216 },
+ { 216 }, { 216 }, { 224 }, { 224 }, { 240 }, { 240 },
+#if CONFIG_EXT_PARTITION
+ { 255 }, { 255 }, { 255 },
+#endif // CONFIG_EXT_PARTITION
+ };
+#else
+static const aom_prob default_compound_type_probs[BLOCK_SIZES]
+ [COMPOUND_TYPES - 1];
+#endif // CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
+
+static const aom_prob default_interintra_prob[BLOCK_SIZE_GROUPS] = {
+ 208, 208, 208, 208,
+};
+
+static const aom_prob
+ default_interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1] = {
+ { 65, 32, 18, 144, 162, 194, 41, 51, 98 }, // block_size < 8x8
+ { 132, 68, 18, 165, 217, 196, 45, 40, 78 }, // block_size < 16x16
+ { 173, 80, 19, 176, 240, 193, 64, 35, 46 }, // block_size < 32x32
+ { 221, 135, 38, 194, 248, 121, 96, 85, 29 } // block_size >= 32x32
+ };
+
+static const aom_prob default_wedge_interintra_prob[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 208, 208, 208,
+#endif
+ 208, 208, 208, 208, 208, 208, 216, 216, 216, 224, 224, 224, 240,
+#if CONFIG_EXT_PARTITION
+ 208, 208, 208
+#endif // CONFIG_EXT_PARTITION
+};
+#endif // CONFIG_EXT_INTER
+
+// Change this section appropriately once warped motion is supported
+#if CONFIG_MOTION_VAR && !CONFIG_WARPED_MOTION
+const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
+ -SIMPLE_TRANSLATION, -OBMC_CAUSAL
+};
+static const aom_prob default_motion_mode_prob[BLOCK_SIZES][MOTION_MODES - 1] =
+ {
+#if CONFIG_CB4X4
+ { 255 }, { 255 }, { 255 },
+#endif
+ { 255 }, { 255 }, { 255 }, { 151 }, { 153 }, { 144 }, { 178 },
+ { 165 }, { 160 }, { 207 }, { 195 }, { 168 }, { 244 },
+#if CONFIG_EXT_PARTITION
+ { 252 }, { 252 }, { 252 },
+#endif // CONFIG_EXT_PARTITION
+ };
+
+#elif !CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+
+const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
+ -SIMPLE_TRANSLATION, -WARPED_CAUSAL
+};
+
+static const aom_prob default_motion_mode_prob[BLOCK_SIZES][MOTION_MODES - 1] =
+ {
+#if CONFIG_CB4X4
+ { 255 }, { 255 }, { 255 },
+#endif
+ { 255 }, { 255 }, { 255 }, { 151 }, { 153 }, { 144 }, { 178 },
+ { 165 }, { 160 }, { 207 }, { 195 }, { 168 }, { 244 },
+#if CONFIG_EXT_PARTITION
+ { 252 }, { 252 }, { 252 },
+#endif // CONFIG_EXT_PARTITION
+ };
+
+#elif CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+
+const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
+ -SIMPLE_TRANSLATION, 2, -OBMC_CAUSAL, -WARPED_CAUSAL,
+};
+static const aom_prob default_motion_mode_prob[BLOCK_SIZES][MOTION_MODES - 1] =
+ {
+#if CONFIG_CB4X4
+ { 255, 200 }, { 255, 200 }, { 255, 200 },
+#endif
+ { 255, 200 }, { 255, 200 }, { 255, 200 }, { 151, 200 }, { 153, 200 },
+ { 144, 200 }, { 178, 200 }, { 165, 200 }, { 160, 200 }, { 207, 200 },
+ { 195, 200 }, { 168, 200 }, { 244, 200 },
+#if CONFIG_EXT_PARTITION
+ { 252, 200 }, { 252, 200 }, { 252, 200 },
+#endif // CONFIG_EXT_PARTITION
+ };
+
+// Probability for the case that only 1 additional motion mode is allowed
+static const aom_prob default_obmc_prob[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 255, 255, 255,
+#endif
+ 255, 255, 255, 151, 153, 144, 178, 165, 160, 207, 195, 168, 244,
+#if CONFIG_EXT_PARTITION
+ 252, 252, 252,
+#endif // CONFIG_EXT_PARTITION
+};
+#endif
+
+#if CONFIG_DELTA_Q
+static const aom_prob default_delta_q_probs[DELTA_Q_PROBS] = { 220, 220, 220 };
+#if CONFIG_EC_MULTISYMBOL
+static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = {
+ AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0
+};
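+
+// The CDF entries are cumulative probabilities in units of 1/32768. AOM_ICDF
+// maps each value into the entropy coder's internal representation (identity
+// or 32768 - x, depending on build configuration), and the trailing 0 fills
+// the extra slot that CDF_SIZE() reserves for the adaptation counter.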
+#endif
+#if CONFIG_EXT_DELTA_Q
+static const aom_prob default_delta_lf_probs[DELTA_LF_PROBS] = { 220, 220,
+ 220 };
+#if CONFIG_EC_MULTISYMBOL
+static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = {
+ 28160, 32120, 32677, 32768, 0
+};
+#endif
+#endif
+#endif
+#if CONFIG_EC_MULTISYMBOL
+int av1_intra_mode_ind[INTRA_MODES];
+int av1_intra_mode_inv[INTRA_MODES];
+int av1_inter_mode_ind[INTER_MODES];
+int av1_inter_mode_inv[INTER_MODES];
+#if CONFIG_EXT_TX
+int av1_ext_tx_intra_ind[EXT_TX_SETS_INTRA][TX_TYPES];
+int av1_ext_tx_intra_inv[EXT_TX_SETS_INTRA][TX_TYPES];
+int av1_ext_tx_inter_ind[EXT_TX_SETS_INTER][TX_TYPES];
+int av1_ext_tx_inter_inv[EXT_TX_SETS_INTER][TX_TYPES];
+#endif
+#endif
+
+#if CONFIG_ALT_INTRA
+const aom_tree_index av1_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
+ -DC_PRED, 2, /* 0 = DC_NODE */
+ -TM_PRED, 4, /* 1 = TM_NODE */
+ -V_PRED, 6, /* 2 = V_NODE */
+ 8, 12, /* 3 = COM_NODE */
+ -H_PRED, 10, /* 4 = H_NODE */
+ -D135_PRED, -D117_PRED, /* 5 = D135_NODE */
+ -D45_PRED, 14, /* 6 = D45_NODE */
+ -D63_PRED, 16, /* 7 = D63_NODE */
+ -D153_PRED, 18, /* 8 = D153_NODE */
+ -D207_PRED, -SMOOTH_PRED, /* 9 = D207_NODE */
+};
+#else
+const aom_tree_index av1_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
+ -DC_PRED, 2, /* 0 = DC_NODE */
+ -TM_PRED, 4, /* 1 = TM_NODE */
+ -V_PRED, 6, /* 2 = V_NODE */
+ 8, 12, /* 3 = COM_NODE */
+ -H_PRED, 10, /* 4 = H_NODE */
+ -D135_PRED, -D117_PRED, /* 5 = D135_NODE */
+ -D45_PRED, 14, /* 6 = D45_NODE */
+ -D63_PRED, 16, /* 7 = D63_NODE */
+ -D153_PRED, -D207_PRED /* 8 = D153_NODE */
+};
+#endif // CONFIG_ALT_INTRA
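+
+// For reference: an aom_tree_index array stores a binary tree as pairs of
+// entries, where tree[n] and tree[n + 1] are the two children of the node
+// reached at index n. A negative entry is a leaf holding the negated symbol;
+// a positive entry is the index of the next pair. Decoding the intra mode
+// tree above thus reads: bit 0 at index 0 yields DC_PRED, bit 1 moves to
+// index 2 where bit 0 yields TM_PRED, and so on down to the deepest leaves.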
+
+const aom_tree_index av1_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
+ -INTER_OFFSET(ZEROMV), 2, -INTER_OFFSET(NEARESTMV), 4, -INTER_OFFSET(NEARMV),
+ -INTER_OFFSET(NEWMV)
+};
+
+#if CONFIG_EXT_INTER
+/* clang-format off */
+const aom_tree_index av1_interintra_mode_tree[TREE_SIZE(INTERINTRA_MODES)] = {
+ -II_DC_PRED, 2, /* 0 = II_DC_NODE */
+ -II_TM_PRED, 4, /* 1 = II_TM_NODE */
+ -II_V_PRED, 6, /* 2 = II_V_NODE */
+ 8, 12, /* 3 = II_COM_NODE */
+ -II_H_PRED, 10, /* 4 = II_H_NODE */
+ -II_D135_PRED, -II_D117_PRED, /* 5 = II_D135_NODE */
+ -II_D45_PRED, 14, /* 6 = II_D45_NODE */
+ -II_D63_PRED, 16, /* 7 = II_D63_NODE */
+ -II_D153_PRED, -II_D207_PRED /* 8 = II_D153_NODE */
+};
+
+const aom_tree_index av1_inter_compound_mode_tree
+ [TREE_SIZE(INTER_COMPOUND_MODES)] = {
+ -INTER_COMPOUND_OFFSET(ZERO_ZEROMV), 2,
+ -INTER_COMPOUND_OFFSET(NEAREST_NEARESTMV), 4,
+ 6, -INTER_COMPOUND_OFFSET(NEW_NEWMV),
+ 8, 12,
+ -INTER_COMPOUND_OFFSET(NEAR_NEARMV), 10,
+ -INTER_COMPOUND_OFFSET(NEAREST_NEARMV),
+ -INTER_COMPOUND_OFFSET(NEAR_NEARESTMV),
+ 14, 16,
+ -INTER_COMPOUND_OFFSET(NEAREST_NEWMV), -INTER_COMPOUND_OFFSET(NEW_NEARESTMV),
+ -INTER_COMPOUND_OFFSET(NEAR_NEWMV), -INTER_COMPOUND_OFFSET(NEW_NEARMV)
+};
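+
+// The code lengths above mirror expected frequency: ZERO_ZEROMV takes one
+// bit, NEAREST_NEARESTMV two, NEW_NEWMV three, NEAR_NEARMV five, and each of
+// the mixed nearest/near/new combinations six.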
+
+#if CONFIG_COMPOUND_SINGLEREF
+const aom_tree_index av1_inter_singleref_comp_mode_tree
+ [TREE_SIZE(INTER_SINGLEREF_COMP_MODES)] = {
+ -INTER_SINGLEREF_COMP_OFFSET(SR_ZERO_NEWMV), 2,
+ -INTER_SINGLEREF_COMP_OFFSET(SR_NEAREST_NEARMV), 4,
+ 6, -INTER_SINGLEREF_COMP_OFFSET(SR_NEW_NEWMV),
+ -INTER_SINGLEREF_COMP_OFFSET(SR_NEAREST_NEWMV),
+ -INTER_SINGLEREF_COMP_OFFSET(SR_NEAR_NEWMV)
+};
+#endif // CONFIG_COMPOUND_SINGLEREF
+
+#if CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
+const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = {
+ -COMPOUND_AVERAGE, 2, -COMPOUND_WEDGE, -COMPOUND_SEG
+};
+#elif !CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
+const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = {
+ -COMPOUND_AVERAGE, -COMPOUND_WEDGE
+};
+#elif CONFIG_COMPOUND_SEGMENT && !CONFIG_WEDGE
+const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = {
+ -COMPOUND_AVERAGE, -COMPOUND_SEG
+};
+#else
+const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = {};
+#endif // CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
+/* clang-format on */
+#endif // CONFIG_EXT_INTER
+
+const aom_tree_index av1_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
+ -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT
+};
+
+#if CONFIG_EXT_PARTITION_TYPES
+/* clang-format off */
+const aom_tree_index av1_ext_partition_tree[TREE_SIZE(EXT_PARTITION_TYPES)] = {
+ -PARTITION_NONE, 2,
+ 6, 4,
+ 8, -PARTITION_SPLIT,
+ -PARTITION_HORZ, 10,
+ -PARTITION_VERT, 12,
+ -PARTITION_HORZ_A, -PARTITION_HORZ_B,
+ -PARTITION_VERT_A, -PARTITION_VERT_B
+};
+/* clang-format on */
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+static const aom_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
+ 9, 102, 187, 225
+};
+
+static const aom_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
+ 239, 183, 119, 96, 41
+};
+
+#if CONFIG_EXT_REFS
+static const aom_prob default_comp_ref_p[REF_CONTEXTS][FWD_REFS - 1] = {
+ // TODO(zoeliu): Adjust the initial prob values.
+ { 33, 16, 16 },
+ { 77, 74, 74 },
+ { 142, 142, 142 },
+ { 172, 170, 170 },
+ { 238, 247, 247 }
+};
+static const aom_prob default_comp_bwdref_p[REF_CONTEXTS][BWD_REFS - 1] = {
+ { 16 }, { 74 }, { 142 }, { 170 }, { 247 }
+};
+#else
+static const aom_prob default_comp_ref_p[REF_CONTEXTS][COMP_REFS - 1] = {
+ { 50 }, { 126 }, { 123 }, { 221 }, { 226 }
+};
+#endif // CONFIG_EXT_REFS
+
+static const aom_prob default_single_ref_p[REF_CONTEXTS][SINGLE_REFS - 1] = {
+#if CONFIG_EXT_REFS
+ { 33, 16, 16, 16, 16 },
+ { 77, 74, 74, 74, 74 },
+ { 142, 142, 142, 142, 142 },
+ { 172, 170, 170, 170, 170 },
+ { 238, 247, 247, 247, 247 }
+#else
+ { 33, 16 }, { 77, 74 }, { 142, 142 }, { 172, 170 }, { 238, 247 }
+#endif // CONFIG_EXT_REFS
+};
+
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+// TODO(zoeliu): Default values to be further adjusted based on the collected
+// stats.
+static const aom_prob default_comp_inter_mode_p[COMP_INTER_MODE_CONTEXTS] = {
+ 41, 119, 187, 225
+};
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+
+#if CONFIG_PALETTE
+
+// Tree to code palette size (number of colors in a palette) and the
+// corresponding probabilities for Y and UV planes.
+const aom_tree_index av1_palette_size_tree[TREE_SIZE(PALETTE_SIZES)] = {
+ -TWO_COLORS, 2, -THREE_COLORS, 4, -FOUR_COLORS, 6,
+ -FIVE_COLORS, 8, -SIX_COLORS, 10, -SEVEN_COLORS, -EIGHT_COLORS,
+};
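+
+// This is a unary code over sizes 2..8: a palette of k colors costs
+// min(k - 1, 6) bits, so smaller palettes get the shorter codes.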
+
+// TODO(huisu): tune these probs
+const aom_prob
+ av1_default_palette_y_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1] = {
+ { 96, 89, 100, 64, 77, 130 }, { 22, 15, 44, 16, 34, 82 },
+ { 30, 19, 57, 18, 38, 86 }, { 94, 36, 104, 23, 43, 92 },
+ { 116, 76, 107, 46, 65, 105 }, { 112, 82, 94, 40, 70, 112 },
+ { 147, 124, 123, 58, 69, 103 }, { 180, 113, 136, 49, 45, 114 },
+ { 107, 70, 87, 49, 154, 156 }, { 98, 105, 142, 63, 64, 152 },
+#if CONFIG_EXT_PARTITION
+ { 98, 105, 142, 63, 64, 152 }, { 98, 105, 142, 63, 64, 152 },
+ { 98, 105, 142, 63, 64, 152 },
+#endif // CONFIG_EXT_PARTITION
+ };
+
+const aom_prob
+ av1_default_palette_uv_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1] = {
+ { 160, 196, 228, 213, 175, 230 }, { 87, 148, 208, 141, 166, 163 },
+ { 72, 151, 204, 139, 155, 161 }, { 78, 135, 171, 104, 120, 173 },
+ { 59, 92, 131, 78, 92, 142 }, { 75, 118, 149, 84, 90, 128 },
+ { 89, 87, 92, 66, 66, 128 }, { 67, 53, 54, 55, 66, 93 },
+ { 120, 130, 83, 171, 75, 214 }, { 72, 55, 66, 68, 79, 107 },
+#if CONFIG_EXT_PARTITION
+ { 72, 55, 66, 68, 79, 107 }, { 72, 55, 66, 68, 79, 107 },
+ { 72, 55, 66, 68, 79, 107 },
+#endif // CONFIG_EXT_PARTITION
+ };
+
+// When palette mode is enabled, the following probability tables are used to
+// code the "is_palette" bit (i.e. the bit that indicates whether this block
+// uses palette mode or DC_PRED mode).
+const aom_prob av1_default_palette_y_mode_prob
+ [PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS] = {
+ { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 },
+ { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 },
+ { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 },
+ { 240, 180, 100 },
+#if CONFIG_EXT_PARTITION
+ { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 },
+#endif // CONFIG_EXT_PARTITION
+ };
+
+const aom_prob av1_default_palette_uv_mode_prob[PALETTE_UV_MODE_CONTEXTS] = {
+ 253, 229
+};
+
+// Trees to code palette color indices (for various palette sizes), and the
+// corresponding probability tables for Y and UV planes.
+const aom_tree_index
+ av1_palette_color_index_tree[PALETTE_SIZES][TREE_SIZE(PALETTE_COLORS)] = {
+ { // 2 colors
+ -PALETTE_COLOR_ONE, -PALETTE_COLOR_TWO },
+ { // 3 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, -PALETTE_COLOR_THREE },
+ { // 4 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE,
+ -PALETTE_COLOR_FOUR },
+ { // 5 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
+ -PALETTE_COLOR_FOUR, -PALETTE_COLOR_FIVE },
+ { // 6 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
+ -PALETTE_COLOR_FOUR, 8, -PALETTE_COLOR_FIVE, -PALETTE_COLOR_SIX },
+ { // 7 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
+ -PALETTE_COLOR_FOUR, 8, -PALETTE_COLOR_FIVE, 10, -PALETTE_COLOR_SIX,
+ -PALETTE_COLOR_SEVEN },
+ { // 8 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
+ -PALETTE_COLOR_FOUR, 8, -PALETTE_COLOR_FIVE, 10, -PALETTE_COLOR_SIX, 12,
+ -PALETTE_COLOR_SEVEN, -PALETTE_COLOR_EIGHT },
+ };
+
+// Note: Must be non-zero to avoid triggering asserts.
+#define UNUSED_PROB 128
+
+const aom_prob av1_default_palette_y_color_index_prob
+ [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS - 1] = {
+ {
+ // 2 colors
+ { 231, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 69, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 224, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 249, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 3 colors
+ { 219, 124, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 91, 191, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 34, 237, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 184, 118, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 252, 124, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ },
+ {
+ // 4 colors
+ { 204, 87, 97, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 74, 144, 129, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 52, 191, 134, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 151, 85, 147, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 248, 60, 115, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 5 colors
+ { 218, 69, 62, 106, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 76, 143, 89, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 21, 233, 94, 131, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 172, 72, 89, 112, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 253, 66, 65, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 6 colors
+ { 190, 60, 47, 54, 74, UNUSED_PROB, UNUSED_PROB },
+ { 62, 106, 51, 95, 110, UNUSED_PROB, UNUSED_PROB },
+ { 52, 180, 69, 72, 107, UNUSED_PROB, UNUSED_PROB },
+ { 156, 83, 72, 83, 101, UNUSED_PROB, UNUSED_PROB },
+ { 245, 45, 37, 52, 91, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 7 colors
+ { 206, 56, 42, 42, 53, 85, UNUSED_PROB },
+ { 70, 100, 45, 68, 77, 94, UNUSED_PROB },
+ { 57, 169, 51, 62, 74, 119, UNUSED_PROB },
+ { 172, 76, 71, 40, 59, 76, UNUSED_PROB },
+ { 248, 47, 36, 53, 61, 110, UNUSED_PROB },
+ },
+ {
+ // 8 colors
+ { 208, 52, 38, 34, 34, 44, 66 },
+ { 52, 107, 34, 73, 69, 82, 87 },
+ { 28, 208, 53, 43, 62, 70, 102 },
+ { 184, 64, 45, 37, 37, 69, 105 },
+ { 251, 18, 31, 45, 47, 61, 104 },
+ },
+ };
+
+const aom_prob av1_default_palette_uv_color_index_prob
+ [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS - 1] = {
+ {
+ // 2 colors
+ { 233, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 69, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 240, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 248, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 3 colors
+ { 216, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 110, 171, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 40, 239, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 191, 104, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 247, 134, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ },
+ {
+ // 4 colors
+ { 202, 89, 132, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 90, 132, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 63, 195, 149, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 152, 84, 152, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 241, 87, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 5 colors
+ { 209, 54, 82, 134, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 94, 173, 180, 93, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 10, 251, 127, 84, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 183, 20, 150, 47, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 252, 73, 111, 150, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 6 colors
+ { 192, 67, 59, 46, 184, UNUSED_PROB, UNUSED_PROB },
+ { 59, 92, 61, 100, 130, UNUSED_PROB, UNUSED_PROB },
+ { 49, 162, 68, 91, 150, UNUSED_PROB, UNUSED_PROB },
+ { 133, 29, 36, 153, 101, UNUSED_PROB, UNUSED_PROB },
+ { 247, 71, 44, 90, 129, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 7 colors
+ { 182, 62, 80, 78, 46, 116, UNUSED_PROB },
+ { 59, 62, 39, 81, 65, 99, UNUSED_PROB },
+ { 54, 177, 48, 58, 93, 104, UNUSED_PROB },
+ { 137, 79, 54, 55, 44, 134, UNUSED_PROB },
+ { 239, 82, 79, 44, 69, 71, UNUSED_PROB },
+ },
+ {
+ // 8 colors
+ { 172, 53, 27, 67, 30, 79, 113 },
+ { 63, 57, 45, 81, 62, 35, 47 },
+ { 51, 200, 36, 47, 82, 165, 129 },
+ { 141, 100, 47, 29, 33, 37, 129 },
+ { 236, 42, 50, 91, 24, 154, 65 },
+ },
+ };
+
+#undef UNUSED_PROB
+
+#define MAX_COLOR_CONTEXT_HASH 8
+// Negative values are invalid
+static const int palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH +
+ 1] = { -1, -1, 0, -1, -1,
+ 4, 3, 2, 1 };
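+
+// The hash is computed below as score[0] * 1 + score[1] * 2 + score[2] * 2
+// over the sorted neighbor scores (weights 2, 1 and 2 for left, top-left and
+// top). Only five hash values are reachable: 2 (a single neighbor, in the
+// first row or column) or, with all three neighbors present, 5 (all equal),
+// 6 or 7 (exactly two equal) and 8 (all different). The table maps these to
+// contexts 0..4; the -1 entries can never be hit.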
+
+#endif // CONFIG_PALETTE
+
+// The transform size is coded as an offset to the smallest transform
+// block size.
+const aom_tree_index av1_tx_size_tree[MAX_TX_DEPTH][TREE_SIZE(TX_SIZES)] = {
+ {
+ // Max tx_size is 8X8
+ -0, -1,
+ },
+ {
+ // Max tx_size is 16X16
+ -0, 2, -1, -2,
+ },
+ {
+ // Max tx_size is 32X32
+ -0, 2, -1, 4, -2, -3,
+ },
+#if CONFIG_TX64X64
+ {
+ // Max tx_size is 64X64
+ -0, 2, -1, 4, -2, 6, -3, -4,
+ },
+#endif // CONFIG_TX64X64
+};
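+
+// E.g., with a 32X32 maximum the tree codes offsets 0..3 from the smallest
+// size: a first bit of 0 selects offset 0, otherwise a second bit of 0
+// selects offset 1, and a final bit chooses between offsets 2 and 3.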
+
+static const aom_prob default_tx_size_prob[MAX_TX_DEPTH][TX_SIZE_CONTEXTS]
+ [MAX_TX_DEPTH] = {
+ {
+ // Max tx_size is 8X8
+ { 100 },
+ { 66 },
+ },
+ {
+ // Max tx_size is 16X16
+ { 20, 152 },
+ { 15, 101 },
+ },
+ {
+ // Max tx_size is 32X32
+ { 3, 136, 37 },
+ { 5, 52, 13 },
+ },
+#if CONFIG_TX64X64
+ {
+ // Max tx_size is 64X64
+ { 1, 64, 136, 127 },
+ { 1, 32, 52, 67 },
+ },
+#endif // CONFIG_TX64X64
+ };
+
+#if CONFIG_LOOP_RESTORATION
+const aom_tree_index
+ av1_switchable_restore_tree[TREE_SIZE(RESTORE_SWITCHABLE_TYPES)] = {
+ -RESTORE_NONE, 2, -RESTORE_WIENER, -RESTORE_SGRPROJ,
+ };
+
+static const aom_prob
+ default_switchable_restore_prob[RESTORE_SWITCHABLE_TYPES - 1] = {
+ 32, 128,
+ };
+#endif // CONFIG_LOOP_RESTORATION
+
+#if CONFIG_PALETTE
+#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top.
+int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
+ int r, int c, int palette_size,
+ uint8_t *color_order, int *color_idx) {
+ int i;
+ // The +10 below should not be needed. But we get a warning "array subscript
+ // is above array bounds [-Werror=array-bounds]" without it, possibly due to
+ // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
+ int scores[PALETTE_MAX_SIZE + 10];
+ const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 };
+ const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
+ int color_index_ctx_hash;
+ int color_index_ctx;
+ int color_neighbors[NUM_PALETTE_NEIGHBORS];
+ int inverse_color_order[PALETTE_MAX_SIZE];
+ assert(palette_size <= PALETTE_MAX_SIZE);
+ assert(r > 0 || c > 0);
+
+ // Get color indices of neighbors.
+ color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1;
+ color_neighbors[1] =
+ (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1;
+ color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1;
+
+ for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
+ color_order[i] = i;
+ inverse_color_order[i] = i;
+ }
+ memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0]));
+ for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
+ if (color_neighbors[i] >= 0) {
+ scores[color_neighbors[i]] += weights[i];
+ }
+ }
+
+ // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small).
+ for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
+ int max = scores[i];
+ int max_idx = i;
+ int j;
+ for (j = i + 1; j < palette_size; ++j) {
+ if (scores[j] > max) {
+ max = scores[j];
+ max_idx = j;
+ }
+ }
+ if (max_idx != i) {
+ // Move the score at index 'max_idx' to index 'i', and shift the scores
+ // from 'i' to 'max_idx - 1' by 1.
+ const int max_score = scores[max_idx];
+ const uint8_t max_color_order = color_order[max_idx];
+ int k;
+ for (k = max_idx; k > i; --k) {
+ scores[k] = scores[k - 1];
+ color_order[k] = color_order[k - 1];
+ inverse_color_order[color_order[k]] = k;
+ }
+ scores[i] = max_score;
+ color_order[i] = max_color_order;
+ inverse_color_order[color_order[i]] = i;
+ }
+ }
+
+ // Get hash value of context.
+ color_index_ctx_hash = 0;
+ for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
+ color_index_ctx_hash += scores[i] * hash_multipliers[i];
+ }
+ assert(color_index_ctx_hash > 0);
+ assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
+
+ // Lookup context from hash.
+ color_index_ctx = palette_color_index_context_lookup[color_index_ctx_hash];
+ assert(color_index_ctx >= 0);
+ assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
+
+ if (color_idx != NULL) {
+ *color_idx = inverse_color_order[color_map[r * stride + c]];
+ }
+ return color_index_ctx;
+}
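+
+// Sketch of a typical call site (names such as 'plane_block_width' are
+// illustrative, not taken from this file): for each map position except
+// (0, 0), in raster order,
+//
+//   uint8_t color_order[PALETTE_MAX_SIZE];
+//   int color_idx;
+//   const int ctx = av1_get_palette_color_index_context(
+//       color_map, plane_block_width, r, c, palette_size, color_order,
+//       &color_idx);
+//
+// where 'ctx' selects a row of the probability tables above and 'color_idx'
+// is the symbol to code after the neighbor-based re-ordering.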
+#undef NUM_PALETTE_NEIGHBORS
+#undef MAX_COLOR_CONTEXT_HASH
+
+#endif // CONFIG_PALETTE
+
+#if CONFIG_VAR_TX
+static const aom_prob default_txfm_partition_probs[TXFM_PARTITION_CONTEXTS] = {
+ 250, 231, 212, 241, 166, 66, 241, 230, 135, 243, 154, 64, 248, 161, 63, 128,
+};
+#endif
+
+static const aom_prob default_skip_probs[SKIP_CONTEXTS] = { 192, 128, 64 };
+
+#if CONFIG_DUAL_FILTER
+static const aom_prob default_switchable_interp_prob
+ [SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1] = {
+ { 235, 192, 128 }, { 36, 243, 48 }, { 34, 16, 128 },
+ { 34, 16, 128 }, { 149, 160, 128 }, { 235, 192, 128 },
+ { 36, 243, 48 }, { 34, 16, 128 }, { 34, 16, 128 },
+ { 149, 160, 128 }, { 235, 192, 128 }, { 36, 243, 48 },
+ { 34, 16, 128 }, { 34, 16, 128 }, { 149, 160, 128 },
+ { 235, 192, 128 }, { 36, 243, 48 }, { 34, 16, 128 },
+ { 34, 16, 128 }, { 149, 160, 128 },
+ };
+#else // CONFIG_DUAL_FILTER
+static const aom_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS - 1] = {
+ { 235, 162 },
+ { 36, 255 },
+ { 34, 3 },
+ { 149, 144 },
+ };
+#endif // CONFIG_DUAL_FILTER
+
+#if CONFIG_EXT_TX
+/* clang-format off */
+const aom_tree_index av1_ext_tx_inter_tree[EXT_TX_SETS_INTER]
+ [TREE_SIZE(TX_TYPES)] = {
+ { // TODO(yaowu): remove unused entry 0.
+ 0
+ }, {
+ -IDTX, 2,
+ 4, 14,
+ 6, 8,
+ -V_DCT, -H_DCT,
+ 10, 12,
+ -V_ADST, -H_ADST,
+ -V_FLIPADST, -H_FLIPADST,
+ -DCT_DCT, 16,
+ 18, 24,
+ 20, 22,
+ -ADST_DCT, -DCT_ADST,
+ -FLIPADST_DCT, -DCT_FLIPADST,
+ 26, 28,
+ -ADST_ADST, -FLIPADST_FLIPADST,
+ -ADST_FLIPADST, -FLIPADST_ADST
+ }, {
+ -IDTX, 2,
+ 4, 6,
+ -V_DCT, -H_DCT,
+ -DCT_DCT, 8,
+ 10, 16,
+ 12, 14,
+ -ADST_DCT, -DCT_ADST,
+ -FLIPADST_DCT, -DCT_FLIPADST,
+ 18, 20,
+ -ADST_ADST, -FLIPADST_FLIPADST,
+ -ADST_FLIPADST, -FLIPADST_ADST
+ }, {
+ -IDTX, -DCT_DCT,
+ }
+};
+
+const aom_tree_index av1_ext_tx_intra_tree[EXT_TX_SETS_INTRA]
+ [TREE_SIZE(TX_TYPES)] = {
+ { // TODO(yaowu): remove unused entry 0.
+ 0
+ }, {
+ -IDTX, 2,
+ -DCT_DCT, 4,
+ 6, 8,
+ -V_DCT, -H_DCT,
+ -ADST_ADST, 10,
+ -ADST_DCT, -DCT_ADST,
+ }, {
+ -IDTX, 2,
+ -DCT_DCT, 4,
+ -ADST_ADST, 6,
+ -ADST_DCT, -DCT_ADST,
+ }
+};
+/* clang-format on */
+
+static const aom_prob
+ default_inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1] = {
+ {
+// TODO(yaowu): remove unused entry 0.
+#if CONFIG_CB4X4
+ { 0 },
+#endif
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+#if CONFIG_CB4X4
+ { 0 },
+#endif
+ { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128,
+ 128 },
+ { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128,
+ 128 },
+ { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128,
+ 128 },
+ { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128,
+ 128 },
+ },
+ {
+#if CONFIG_CB4X4
+ { 0 },
+#endif
+ { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+ { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+ { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+ { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+#if CONFIG_CB4X4
+ { 0 },
+#endif
+ { 12 },
+ { 12 },
+ { 12 },
+ { 12 },
+ }
+ };
+
+// TODO(urvang): 3rd context should be tx_type instead of intra mode just like
+// the baseline.
+static const aom_prob
+ default_intra_ext_tx_prob[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES - 1] = {
+ {
+// TODO(yaowu): remove unused entry 0.
+#if CONFIG_CB4X4
+ {
+ { 0 },
+ },
+#endif
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+#if CONFIG_ALT_INTRA
+ { 0 },
+#endif // CONFIG_ALT_INTRA
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+#if CONFIG_ALT_INTRA
+ { 0 },
+#endif // CONFIG_ALT_INTRA
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+#if CONFIG_ALT_INTRA
+ { 0 },
+#endif // CONFIG_ALT_INTRA
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+#if CONFIG_ALT_INTRA
+ { 0 },
+#endif // CONFIG_ALT_INTRA
+ },
+ },
+ {
+#if CONFIG_CB4X4
+ {
+ { 0 },
+ },
+#endif
+ {
+ { 8, 224, 32, 128, 64, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 9, 200, 32, 128, 64, 128 },
+ { 8, 8, 32, 128, 224, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 32, 32, 128, 16, 64 },
+#if CONFIG_ALT_INTRA
+ { 10, 32, 32, 128, 16, 64 },
+#endif // CONFIG_ALT_INTRA
+ },
+ {
+ { 8, 224, 32, 128, 64, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 9, 200, 32, 128, 64, 128 },
+ { 8, 8, 32, 128, 224, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 32, 32, 128, 16, 64 },
+#if CONFIG_ALT_INTRA
+ { 10, 32, 32, 128, 16, 64 },
+#endif // CONFIG_ALT_INTRA
+ },
+ {
+ { 8, 224, 32, 128, 64, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 9, 200, 32, 128, 64, 128 },
+ { 8, 8, 32, 128, 224, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 32, 32, 128, 16, 64 },
+#if CONFIG_ALT_INTRA
+ { 10, 32, 32, 128, 16, 64 },
+#endif // CONFIG_ALT_INTRA
+ },
+ {
+ { 8, 224, 32, 128, 64, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 9, 200, 32, 128, 64, 128 },
+ { 8, 8, 32, 128, 224, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 32, 32, 128, 16, 64 },
+#if CONFIG_ALT_INTRA
+ { 10, 32, 32, 128, 16, 64 },
+#endif // CONFIG_ALT_INTRA
+ },
+ },
+ {
+#if CONFIG_CB4X4
+ {
+ { 0 },
+ },
+#endif
+ {
+ { 8, 224, 64, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 9, 200, 64, 128 },
+ { 8, 8, 224, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 10, 23, 80, 176 },
+ { 10, 23, 80, 176 },
+ { 10, 32, 16, 64 },
+#if CONFIG_ALT_INTRA
+ { 10, 32, 16, 64 },
+#endif // CONFIG_ALT_INTRA
+ },
+ {
+ { 8, 224, 64, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 9, 200, 64, 128 },
+ { 8, 8, 224, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 10, 23, 80, 176 },
+ { 10, 23, 80, 176 },
+ { 10, 32, 16, 64 },
+#if CONFIG_ALT_INTRA
+ { 10, 32, 16, 64 },
+#endif // CONFIG_ALT_INTRA
+ },
+ {
+ { 8, 224, 64, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 9, 200, 64, 128 },
+ { 8, 8, 224, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 10, 23, 80, 176 },
+ { 10, 23, 80, 176 },
+ { 10, 32, 16, 64 },
+#if CONFIG_ALT_INTRA
+ { 10, 32, 16, 64 },
+#endif // CONFIG_ALT_INTRA
+ },
+ {
+ { 8, 224, 64, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 9, 200, 64, 128 },
+ { 8, 8, 224, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 10, 23, 80, 176 },
+ { 10, 23, 80, 176 },
+ { 10, 32, 16, 64 },
+#if CONFIG_ALT_INTRA
+ { 10, 32, 16, 64 },
+#endif // CONFIG_ALT_INTRA
+ },
+ },
+ };
+#else // !CONFIG_EXT_TX
+
+/* clang-format off */
+const aom_tree_index av1_ext_tx_tree[TREE_SIZE(TX_TYPES)] = {
+ -DCT_DCT, 2,
+ -ADST_ADST, 4,
+ -ADST_DCT, -DCT_ADST
+};
+/* clang-format on */
+
+int av1_ext_tx_ind[TX_TYPES];
+int av1_ext_tx_inv[TX_TYPES];
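+// Note (added commentary, not in the original source): these appear to be
+// the forward and inverse mappings between tx_type values and their token
+// order in av1_ext_tx_tree, presumably filled in at initialization time.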
+
+static const aom_prob
+ default_intra_ext_tx_prob[EXT_TX_SIZES][TX_TYPES][TX_TYPES - 1] = {
+#if CONFIG_CB4X4
+ { { 240, 85, 128 }, { 4, 1, 248 }, { 4, 1, 8 }, { 4, 248, 128 } },
+#endif
+ { { 240, 85, 128 }, { 4, 1, 248 }, { 4, 1, 8 }, { 4, 248, 128 } },
+ { { 244, 85, 128 }, { 8, 2, 248 }, { 8, 2, 8 }, { 8, 248, 128 } },
+ { { 248, 85, 128 }, { 16, 4, 248 }, { 16, 4, 8 }, { 16, 248, 128 } },
+ };
+
+static const aom_prob default_inter_ext_tx_prob[EXT_TX_SIZES][TX_TYPES - 1] = {
+#if CONFIG_CB4X4
+ { 160, 85, 128 },
+#endif
+ { 160, 85, 128 },
+ { 176, 85, 128 },
+ { 192, 85, 128 },
+};
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+static const aom_prob
+ default_intra_filter_probs[INTRA_FILTERS + 1][INTRA_FILTERS - 1] = {
+ { 98, 63, 60 }, { 98, 82, 80 }, { 94, 65, 103 },
+ { 49, 25, 24 }, { 72, 38, 50 },
+ };
+const aom_tree_index av1_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)] = {
+ -INTRA_FILTER_LINEAR, 2, -INTRA_FILTER_8TAP, 4, -INTRA_FILTER_8TAP_SHARP,
+ -INTRA_FILTER_8TAP_SMOOTH,
+};
+#if CONFIG_EC_MULTISYMBOL
+int av1_intra_filter_ind[INTRA_FILTERS];
+int av1_intra_filter_inv[INTRA_FILTERS];
+#endif // CONFIG_EC_MULTISYMBOL
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+
+#if CONFIG_FILTER_INTRA
+static const aom_prob default_filter_intra_probs[2] = { 230, 230 };
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_SUPERTX
+static const aom_prob
+ default_supertx_prob[PARTITION_SUPERTX_CONTEXTS][TX_SIZES] = {
+#if CONFIG_CB4X4
+#if CONFIG_TX64X64
+ { 1, 1, 160, 160, 170, 180 }, { 1, 1, 200, 200, 210, 220 },
+#else
+ { 1, 1, 160, 160, 170 }, { 1, 1, 200, 200, 210 },
+#endif // CONFIG_TX64X64
+#else
+#if CONFIG_TX64X64
+ { 1, 160, 160, 170, 180 }, { 1, 200, 200, 210, 220 },
+#else
+ { 1, 160, 160, 170 }, { 1, 200, 200, 210 },
+#endif // CONFIG_TX64X64
+#endif // CONFIG_CB4X4
+ };
+#endif // CONFIG_SUPERTX
+
+// FIXME(someone) need real defaults here
+static const aom_prob default_segment_tree_probs[SEG_TREE_PROBS] = {
+ 128, 128, 128, 128, 128, 128, 128
+};
+// clang-format off
+static const aom_prob default_segment_pred_probs[PREDICTION_PROBS] = {
+ 128, 128, 128
+};
+// clang-format on
+
+#if CONFIG_EC_MULTISYMBOL
+#if CONFIG_DUAL_FILTER
+static const aom_cdf_prob
+ default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE(
+ SWITCHABLE_FILTERS)] = {
+ { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(32096), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(31338), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(6128), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(6128), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(27632), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(32096), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(31338), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(6128), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(6128), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(27632), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(32096), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(31338), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(6128), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(6128), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(27632), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(32096), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(31338), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(6128), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(6128), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(27632), AOM_ICDF(32768), 0 }
+ };
+#else
+static const aom_cdf_prob
+ default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE(
+ SWITCHABLE_FILTERS)] = {
+ { AOM_ICDF(30080), AOM_ICDF(31781), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4608), AOM_ICDF(32658), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4352), AOM_ICDF(4685), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(19072), AOM_ICDF(26776), AOM_ICDF(32768), 0 },
+ };
+#endif
+
+static const aom_cdf_prob default_seg_tree_cdf[CDF_SIZE(MAX_SEGMENTS)] = {
+ AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(12288),
+ AOM_ICDF(16384), AOM_ICDF(20480), AOM_ICDF(24576),
+ AOM_ICDF(28672), AOM_ICDF(32768), 0
+};
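+
+// Reader's note (added commentary, not in the original source): each CDF row
+// stores cumulative probabilities scaled to 1 << 15 and ends at
+// AOM_ICDF(32768); CDF_SIZE() reserves one extra trailing slot (the 0 above)
+// that the adaptive coder can use for bookkeeping. The segment tree CDF is
+// simply the uniform distribution: equal steps of 4096 = 32768 /
+// MAX_SEGMENTS. A minimal lookup sketch over plain (non-inverted) cumulative
+// values:
+//
+//   static int cdf_pick(const aom_cdf_prob *cdf, int nsymbs, unsigned u) {
+//     int s = 0;
+//     while (s < nsymbs - 1 && u >= cdf[s]) ++s;  // first bucket, u < cdf[s]
+//     return s;  // u drawn uniformly from [0, 1 << 15) selects symbol s
+//   }
+//
+// Whether AOM_ICDF stores x or 32768 - x depends on the build configuration,
+// so a real decoder would normalize first.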
+
+static const aom_cdf_prob
+ default_tx_size_cdf[MAX_TX_DEPTH][TX_SIZE_CONTEXTS][CDF_SIZE(MAX_TX_DEPTH +
+ 1)] = {
+ { { AOM_ICDF(12800), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8448), AOM_ICDF(32768), 0 } },
+ { { AOM_ICDF(2560), AOM_ICDF(20496), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1920), AOM_ICDF(14091), AOM_ICDF(32768), 0 } },
+ { { AOM_ICDF(384), AOM_ICDF(17588), AOM_ICDF(19782), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(640), AOM_ICDF(7166), AOM_ICDF(8466), AOM_ICDF(32768), 0 } },
+#if CONFIG_TX64X64
+ { { AOM_ICDF(128), AOM_ICDF(8288), AOM_ICDF(21293), AOM_ICDF(26986),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(128), AOM_ICDF(4208), AOM_ICDF(10009), AOM_ICDF(15965),
+ AOM_ICDF(32768), 0 } },
+#endif
+ };
+
+#if CONFIG_ALT_INTRA
+static const aom_cdf_prob
+ default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)] = {
+ { AOM_ICDF(11264), AOM_ICDF(12608), AOM_ICDF(16309), AOM_ICDF(21086),
+ AOM_ICDF(23297), AOM_ICDF(24860), AOM_ICDF(27022), AOM_ICDF(28099),
+ AOM_ICDF(29631), AOM_ICDF(31126), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9600), AOM_ICDF(11953), AOM_ICDF(16100), AOM_ICDF(20922),
+ AOM_ICDF(22756), AOM_ICDF(23913), AOM_ICDF(25435), AOM_ICDF(26724),
+ AOM_ICDF(28046), AOM_ICDF(29927), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9344), AOM_ICDF(11540), AOM_ICDF(16515), AOM_ICDF(21763),
+ AOM_ICDF(23078), AOM_ICDF(23816), AOM_ICDF(24725), AOM_ICDF(25856),
+ AOM_ICDF(26720), AOM_ICDF(28208), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(12288), AOM_ICDF(14448), AOM_ICDF(18026), AOM_ICDF(23346),
+ AOM_ICDF(23833), AOM_ICDF(24188), AOM_ICDF(24724), AOM_ICDF(25415),
+ AOM_ICDF(25817), AOM_ICDF(26876), AOM_ICDF(32768), 0 },
+ };
+
+static const aom_cdf_prob
+ default_uv_mode_cdf[INTRA_MODES][CDF_SIZE(INTRA_MODES)] = {
+ { AOM_ICDF(25472), AOM_ICDF(25558), AOM_ICDF(27783), AOM_ICDF(30779),
+ AOM_ICDF(30988), AOM_ICDF(31269), AOM_ICDF(31492), AOM_ICDF(31741),
+ AOM_ICDF(32014), AOM_ICDF(32420), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(2176), AOM_ICDF(2415), AOM_ICDF(28381), AOM_ICDF(29574),
+ AOM_ICDF(29832), AOM_ICDF(30712), AOM_ICDF(30881), AOM_ICDF(31662),
+ AOM_ICDF(31761), AOM_ICDF(31922), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(3328), AOM_ICDF(3443), AOM_ICDF(4016), AOM_ICDF(31099),
+ AOM_ICDF(31272), AOM_ICDF(31420), AOM_ICDF(31504), AOM_ICDF(31608),
+ AOM_ICDF(31916), AOM_ICDF(32598), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(23424), AOM_ICDF(23534), AOM_ICDF(25915), AOM_ICDF(27831),
+ AOM_ICDF(28058), AOM_ICDF(28431), AOM_ICDF(30142), AOM_ICDF(31209),
+ AOM_ICDF(31459), AOM_ICDF(32369), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(22784), AOM_ICDF(22862), AOM_ICDF(24255), AOM_ICDF(26287),
+ AOM_ICDF(28490), AOM_ICDF(29509), AOM_ICDF(29776), AOM_ICDF(30115),
+ AOM_ICDF(31203), AOM_ICDF(31674), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(19712), AOM_ICDF(19865), AOM_ICDF(23141), AOM_ICDF(24428),
+ AOM_ICDF(25731), AOM_ICDF(31377), AOM_ICDF(31622), AOM_ICDF(32047),
+ AOM_ICDF(32458), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(21376), AOM_ICDF(21421), AOM_ICDF(22130), AOM_ICDF(27688),
+ AOM_ICDF(28485), AOM_ICDF(28779), AOM_ICDF(28935), AOM_ICDF(29085),
+ AOM_ICDF(31962), AOM_ICDF(32450), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(19712), AOM_ICDF(19814), AOM_ICDF(20725), AOM_ICDF(28510),
+ AOM_ICDF(28814), AOM_ICDF(29099), AOM_ICDF(29457), AOM_ICDF(29729),
+ AOM_ICDF(30133), AOM_ICDF(32408), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(19584), AOM_ICDF(19790), AOM_ICDF(23643), AOM_ICDF(25501),
+ AOM_ICDF(25913), AOM_ICDF(26673), AOM_ICDF(27578), AOM_ICDF(30923),
+ AOM_ICDF(31255), AOM_ICDF(31870), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(20864), AOM_ICDF(21004), AOM_ICDF(24129), AOM_ICDF(26308),
+ AOM_ICDF(27062), AOM_ICDF(27065), AOM_ICDF(27488), AOM_ICDF(28045),
+ AOM_ICDF(28506), AOM_ICDF(29272), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(23680), AOM_ICDF(23929), AOM_ICDF(27831), AOM_ICDF(30446),
+ AOM_ICDF(30598), AOM_ICDF(31129), AOM_ICDF(31244), AOM_ICDF(31655),
+ AOM_ICDF(31868), AOM_ICDF(32234), AOM_ICDF(32768), 0 },
+ };
+#else // !CONFIG_ALT_INTRA
+static const aom_cdf_prob
+ default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)] = {
+ { AOM_ICDF(8320), AOM_ICDF(11376), AOM_ICDF(12880), AOM_ICDF(19959),
+ AOM_ICDF(23072), AOM_ICDF(24067), AOM_ICDF(25461), AOM_ICDF(26917),
+ AOM_ICDF(29157), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(16896), AOM_ICDF(21112), AOM_ICDF(21932), AOM_ICDF(27852),
+ AOM_ICDF(28667), AOM_ICDF(28916), AOM_ICDF(29593), AOM_ICDF(30089),
+ AOM_ICDF(30905), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(22144), AOM_ICDF(25464), AOM_ICDF(26006), AOM_ICDF(30364),
+ AOM_ICDF(30583), AOM_ICDF(30655), AOM_ICDF(31183), AOM_ICDF(31400),
+ AOM_ICDF(31646), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(28288), AOM_ICDF(30650), AOM_ICDF(30964), AOM_ICDF(32288),
+ AOM_ICDF(32308), AOM_ICDF(32331), AOM_ICDF(32495), AOM_ICDF(32586),
+ AOM_ICDF(32607), AOM_ICDF(32768), 0 },
+ };
+
+static const aom_cdf_prob
+ default_uv_mode_cdf[INTRA_MODES][CDF_SIZE(INTRA_MODES)] = {
+ { AOM_ICDF(15360), AOM_ICDF(15836), AOM_ICDF(20863), AOM_ICDF(27513),
+ AOM_ICDF(28269), AOM_ICDF(29048), AOM_ICDF(29455), AOM_ICDF(30154),
+ AOM_ICDF(31206), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6144), AOM_ICDF(7392), AOM_ICDF(22657), AOM_ICDF(25981),
+ AOM_ICDF(26965), AOM_ICDF(28779), AOM_ICDF(29309), AOM_ICDF(30890),
+ AOM_ICDF(31763), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8576), AOM_ICDF(9143), AOM_ICDF(11450), AOM_ICDF(27575),
+ AOM_ICDF(28108), AOM_ICDF(28438), AOM_ICDF(28658), AOM_ICDF(28995),
+ AOM_ICDF(30410), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(12416), AOM_ICDF(12814), AOM_ICDF(16244), AOM_ICDF(22057),
+ AOM_ICDF(23492), AOM_ICDF(24700), AOM_ICDF(26213), AOM_ICDF(27954),
+ AOM_ICDF(29778), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10624), AOM_ICDF(11057), AOM_ICDF(14619), AOM_ICDF(19415),
+ AOM_ICDF(23134), AOM_ICDF(25679), AOM_ICDF(26399), AOM_ICDF(27618),
+ AOM_ICDF(30676), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10240), AOM_ICDF(10680), AOM_ICDF(15684), AOM_ICDF(19118),
+ AOM_ICDF(21856), AOM_ICDF(27563), AOM_ICDF(28234), AOM_ICDF(29332),
+ AOM_ICDF(31278), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11008), AOM_ICDF(11433), AOM_ICDF(14100), AOM_ICDF(22522),
+ AOM_ICDF(24365), AOM_ICDF(25330), AOM_ICDF(25737), AOM_ICDF(26341),
+ AOM_ICDF(30433), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10880), AOM_ICDF(11308), AOM_ICDF(13991), AOM_ICDF(23645),
+ AOM_ICDF(24679), AOM_ICDF(25433), AOM_ICDF(25977), AOM_ICDF(26746),
+ AOM_ICDF(28463), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9856), AOM_ICDF(10483), AOM_ICDF(16054), AOM_ICDF(19959),
+ AOM_ICDF(21708), AOM_ICDF(23628), AOM_ICDF(24949), AOM_ICDF(28797),
+ AOM_ICDF(30658), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(12928), AOM_ICDF(14556), AOM_ICDF(22168), AOM_ICDF(27789),
+ AOM_ICDF(28543), AOM_ICDF(29663), AOM_ICDF(29893), AOM_ICDF(30645),
+ AOM_ICDF(31682), AOM_ICDF(32768), 0 },
+ };
+#endif // CONFIG_ALT_INTRA
+
+#if CONFIG_EXT_PARTITION_TYPES
+static const aom_cdf_prob
+ default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)] = {
+ // 8x8 -> 4x4 only supports the four legacy partition types
+ { AOM_ICDF(25472), AOM_ICDF(28949), AOM_ICDF(31052), AOM_ICDF(32768), 0,
+ 0, 0, 0, 0 },
+ { AOM_ICDF(18816), AOM_ICDF(22250), AOM_ICDF(28783), AOM_ICDF(32768), 0,
+ 0, 0, 0, 0 },
+ { AOM_ICDF(18944), AOM_ICDF(26126), AOM_ICDF(29188), AOM_ICDF(32768), 0,
+ 0, 0, 0, 0 },
+ { AOM_ICDF(15488), AOM_ICDF(22508), AOM_ICDF(27077), AOM_ICDF(32768), 0,
+ 0, 0, 0, 0 },
+ { AOM_ICDF(22272), AOM_ICDF(23768), AOM_ICDF(25043), AOM_ICDF(29996),
+ AOM_ICDF(30744), AOM_ICDF(31493), AOM_ICDF(32130), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11776), AOM_ICDF(13457), AOM_ICDF(16315), AOM_ICDF(28229),
+ AOM_ICDF(29069), AOM_ICDF(29910), AOM_ICDF(31339), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10496), AOM_ICDF(14802), AOM_ICDF(16136), AOM_ICDF(27127),
+ AOM_ICDF(29280), AOM_ICDF(31434), AOM_ICDF(32101), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(8763), AOM_ICDF(10440), AOM_ICDF(29110),
+ AOM_ICDF(30100), AOM_ICDF(31090), AOM_ICDF(31929), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(22656), AOM_ICDF(23801), AOM_ICDF(24702), AOM_ICDF(30721),
+ AOM_ICDF(31294), AOM_ICDF(31867), AOM_ICDF(32317), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8704), AOM_ICDF(9926), AOM_ICDF(12586), AOM_ICDF(28885),
+ AOM_ICDF(29496), AOM_ICDF(30107), AOM_ICDF(31437), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6656), AOM_ICDF(10685), AOM_ICDF(11566), AOM_ICDF(27857),
+ AOM_ICDF(29871), AOM_ICDF(31886), AOM_ICDF(32327), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(2176), AOM_ICDF(3012), AOM_ICDF(3690), AOM_ICDF(31253),
+ AOM_ICDF(31671), AOM_ICDF(32090), AOM_ICDF(32429), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(28416), AOM_ICDF(28705), AOM_ICDF(28926), AOM_ICDF(32258),
+ AOM_ICDF(32402), AOM_ICDF(32547), AOM_ICDF(32657), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9216), AOM_ICDF(9952), AOM_ICDF(11849), AOM_ICDF(30134),
+ AOM_ICDF(30502), AOM_ICDF(30870), AOM_ICDF(31819), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7424), AOM_ICDF(9008), AOM_ICDF(9528), AOM_ICDF(30664),
+ AOM_ICDF(31456), AOM_ICDF(32248), AOM_ICDF(32508), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(1710), AOM_ICDF(2069), AOM_ICDF(31978),
+ AOM_ICDF(32193), AOM_ICDF(32409), AOM_ICDF(32588), AOM_ICDF(32768), 0 },
+#if CONFIG_EXT_PARTITION
+ { AOM_ICDF(28416), AOM_ICDF(28705), AOM_ICDF(28926), AOM_ICDF(32258),
+ AOM_ICDF(32402), AOM_ICDF(32547), AOM_ICDF(32657), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9216), AOM_ICDF(9952), AOM_ICDF(11849), AOM_ICDF(30134),
+ AOM_ICDF(30502), AOM_ICDF(30870), AOM_ICDF(31819), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7424), AOM_ICDF(9008), AOM_ICDF(9528), AOM_ICDF(30664),
+ AOM_ICDF(31456), AOM_ICDF(32248), AOM_ICDF(32508), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(1710), AOM_ICDF(2069), AOM_ICDF(31978),
+ AOM_ICDF(32193), AOM_ICDF(32409), AOM_ICDF(32588), AOM_ICDF(32768), 0 },
+#endif
+ };
+#else
+static const aom_cdf_prob
+ default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(PARTITION_TYPES)] = {
+ { AOM_ICDF(25472), AOM_ICDF(28949), AOM_ICDF(31052), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(18816), AOM_ICDF(22250), AOM_ICDF(28783), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(18944), AOM_ICDF(26126), AOM_ICDF(29188), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(15488), AOM_ICDF(22508), AOM_ICDF(27077), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(22272), AOM_ICDF(25265), AOM_ICDF(27815), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11776), AOM_ICDF(15138), AOM_ICDF(20854), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10496), AOM_ICDF(19109), AOM_ICDF(21777), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(10743), AOM_ICDF(14098), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(22656), AOM_ICDF(24947), AOM_ICDF(26749), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8704), AOM_ICDF(11148), AOM_ICDF(16469), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6656), AOM_ICDF(14714), AOM_ICDF(16477), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(2176), AOM_ICDF(3849), AOM_ICDF(5205), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(28416), AOM_ICDF(28994), AOM_ICDF(29436), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9216), AOM_ICDF(10688), AOM_ICDF(14483), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7424), AOM_ICDF(10592), AOM_ICDF(11632), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(2141), AOM_ICDF(2859), AOM_ICDF(32768), 0 },
+#if CONFIG_EXT_PARTITION
+ { AOM_ICDF(28416), AOM_ICDF(28994), AOM_ICDF(29436), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9216), AOM_ICDF(10688), AOM_ICDF(14483), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7424), AOM_ICDF(10592), AOM_ICDF(11632), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(2141), AOM_ICDF(2859), AOM_ICDF(32768), 0 },
+#endif
+ };
+#endif
+
+static const aom_cdf_prob
+ default_inter_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE(INTER_MODES)] = {
+ { AOM_ICDF(256), AOM_ICDF(22227), AOM_ICDF(23627), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(896), AOM_ICDF(18948), AOM_ICDF(23537), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(896), AOM_ICDF(21563), AOM_ICDF(24320), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(896), AOM_ICDF(12599), AOM_ICDF(17799), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1024), AOM_ICDF(8960), AOM_ICDF(13238), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(2176), AOM_ICDF(11856), AOM_ICDF(14388), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(3200), AOM_ICDF(6550), AOM_ICDF(9622), AOM_ICDF(32768), 0 },
+ };
+
+#if CONFIG_EXT_TX
+static const aom_cdf_prob default_intra_ext_tx_cdf
+ [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = {
+ {
+// FIXME: unused zero positions, from uncoded trivial transform set
+#if CONFIG_CB4X4
+ {
+ { 0 },
+ },
+#endif
+ { { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+#if CONFIG_ALT_INTRA
+ { 0 }
+#endif
+ },
+ { { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+#if CONFIG_ALT_INTRA
+ { 0 }
+#endif
+ },
+ { { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+#if CONFIG_ALT_INTRA
+ { 0 }
+#endif
+ },
+ { { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+#if CONFIG_ALT_INTRA
+ { 0 }
+#endif
+ },
+ },
+ {
+ { { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048), AOM_ICDF(29296),
+ AOM_ICDF(30164), AOM_ICDF(31466), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284), AOM_ICDF(26717),
+ AOM_ICDF(28230), AOM_ICDF(30499), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860),
+ AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
+ AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
+ AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+#if CONFIG_ALT_INTRA
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }
+#endif
+ },
+ { { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048), AOM_ICDF(29296),
+ AOM_ICDF(30164), AOM_ICDF(31466), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284), AOM_ICDF(26717),
+ AOM_ICDF(28230), AOM_ICDF(30499), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860),
+ AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
+ AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
+ AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+#if CONFIG_ALT_INTRA
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }
+#endif
+ },
+ { { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048), AOM_ICDF(29296),
+ AOM_ICDF(30164), AOM_ICDF(31466), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284), AOM_ICDF(26717),
+ AOM_ICDF(28230), AOM_ICDF(30499), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860),
+ AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
+ AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
+ AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+#if CONFIG_ALT_INTRA
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }
+#endif
+ },
+ { { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048), AOM_ICDF(29296),
+ AOM_ICDF(30164), AOM_ICDF(31466), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284), AOM_ICDF(26717),
+ AOM_ICDF(28230), AOM_ICDF(30499), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860),
+ AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
+ AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
+ AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
+#if CONFIG_ALT_INTRA
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
+ AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 }
+#endif
+ },
+ },
+ {
+ { { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29792), AOM_ICDF(31280),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581), AOM_ICDF(30174),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924), AOM_ICDF(30846),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), AOM_ICDF(26611),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), AOM_ICDF(26611),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+#if CONFIG_ALT_INTRA
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 }
+#endif
+ },
+ { { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29792), AOM_ICDF(31280),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581), AOM_ICDF(30174),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924), AOM_ICDF(30846),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), AOM_ICDF(26611),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), AOM_ICDF(26611),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+#if CONFIG_ALT_INTRA
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 }
+#endif
+ },
+ { { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29792), AOM_ICDF(31280),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581), AOM_ICDF(30174),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924), AOM_ICDF(30846),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), AOM_ICDF(26611),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), AOM_ICDF(26611),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+#if CONFIG_ALT_INTRA
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 }
+#endif
+ },
+ { { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29792), AOM_ICDF(31280),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581), AOM_ICDF(30174),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924), AOM_ICDF(30846),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), AOM_ICDF(26611),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065), AOM_ICDF(26611),
+ AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 },
+#if CONFIG_ALT_INTRA
+ { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
+ AOM_ICDF(32768), 0 }
+#endif
+ },
+ }
+ };
+static const aom_cdf_prob
+ default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE(
+ TX_TYPES)] = {
+ {
+#if CONFIG_CB4X4
+ { 0 },
+#endif
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 } },
+ {
+#if CONFIG_CB4X4
+ { 0 },
+#endif
+ { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277),
+ AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717),
+ AOM_ICDF(19225), AOM_ICDF(21733), AOM_ICDF(24241), AOM_ICDF(26749),
+ AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277),
+ AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717),
+ AOM_ICDF(19225), AOM_ICDF(21733), AOM_ICDF(24241), AOM_ICDF(26749),
+ AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277),
+ AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717),
+ AOM_ICDF(19225), AOM_ICDF(21733), AOM_ICDF(24241), AOM_ICDF(26749),
+ AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277),
+ AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717),
+ AOM_ICDF(19225), AOM_ICDF(21733), AOM_ICDF(24241), AOM_ICDF(26749),
+ AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768),
+ 0 } },
+ {
+#if CONFIG_CB4X4
+ { 0 },
+#endif
+ { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132),
+ AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904),
+ AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132),
+ AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904),
+ AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132),
+ AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904),
+ AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132),
+ AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904),
+ AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768),
+ 0 } },
+ {
+#if CONFIG_CB4X4
+ { 0 },
+#endif
+ { AOM_ICDF(1536), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1536), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1536), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1536), AOM_ICDF(32768), 0 } }
+ };
+#else
+static const aom_cdf_prob
+ default_intra_ext_tx_cdf[EXT_TX_SIZES][TX_TYPES][CDF_SIZE(TX_TYPES)] = {
+#if CONFIG_CB4X4
+ { { AOM_ICDF(30720), AOM_ICDF(31400), AOM_ICDF(32084), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(31764), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(1642), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(512), AOM_ICDF(31760), AOM_ICDF(32264), AOM_ICDF(32768),
+ 0 } },
+#endif
+ { { AOM_ICDF(30720), AOM_ICDF(31400), AOM_ICDF(32084), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(31764), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(1642), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(512), AOM_ICDF(31760), AOM_ICDF(32264), AOM_ICDF(32768),
+ 0 } },
+
+ { { AOM_ICDF(31232), AOM_ICDF(31742), AOM_ICDF(32255), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(1024), AOM_ICDF(1272), AOM_ICDF(31784), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1024), AOM_ICDF(1272), AOM_ICDF(2256), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(1024), AOM_ICDF(31776), AOM_ICDF(32272), AOM_ICDF(32768),
+ 0 } },
+ { { AOM_ICDF(31744), AOM_ICDF(32084), AOM_ICDF(32426), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(2048), AOM_ICDF(2528), AOM_ICDF(31823), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(2048), AOM_ICDF(2528), AOM_ICDF(3473), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(2048), AOM_ICDF(31808), AOM_ICDF(32288), AOM_ICDF(32768),
+ 0 } },
+ };
+
+static const aom_cdf_prob
+ default_inter_ext_tx_cdf[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)] = {
+#if CONFIG_CB4X4
+ { AOM_ICDF(20480), AOM_ICDF(24560), AOM_ICDF(28664), AOM_ICDF(32768), 0 },
+#endif
+ { AOM_ICDF(20480), AOM_ICDF(24560), AOM_ICDF(28664), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(22528), AOM_ICDF(25928), AOM_ICDF(29348), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(24576), AOM_ICDF(27296), AOM_ICDF(30032), AOM_ICDF(32768), 0 },
+ };
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+static const aom_cdf_prob
+ default_intra_filter_cdf[INTRA_FILTERS + 1][CDF_SIZE(INTRA_FILTERS)] = {
+ { AOM_ICDF(12544), AOM_ICDF(17521), AOM_ICDF(21095), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(12544), AOM_ICDF(19022), AOM_ICDF(23318), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(12032), AOM_ICDF(17297), AOM_ICDF(23522), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6272), AOM_ICDF(8860), AOM_ICDF(11101), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9216), AOM_ICDF(12712), AOM_ICDF(16629), AOM_ICDF(32768), 0 },
+ };
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+
+// CDF version of 'av1_kf_y_mode_prob'.
+const aom_cdf_prob
+ av1_kf_y_mode_cdf[INTRA_MODES][INTRA_MODES][CDF_SIZE(INTRA_MODES)] = {
+#if CONFIG_ALT_INTRA
+ {
+ { AOM_ICDF(15488), AOM_ICDF(17513), AOM_ICDF(20731), AOM_ICDF(24586),
+ AOM_ICDF(25921), AOM_ICDF(26749), AOM_ICDF(27807), AOM_ICDF(28602),
+ AOM_ICDF(29530), AOM_ICDF(30681), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11648), AOM_ICDF(14783), AOM_ICDF(21879), AOM_ICDF(23981),
+ AOM_ICDF(25213), AOM_ICDF(26218), AOM_ICDF(27472), AOM_ICDF(28465),
+ AOM_ICDF(29221), AOM_ICDF(30232), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8448), AOM_ICDF(11108), AOM_ICDF(13392), AOM_ICDF(25167),
+ AOM_ICDF(26295), AOM_ICDF(26789), AOM_ICDF(27536), AOM_ICDF(28088),
+ AOM_ICDF(29039), AOM_ICDF(30700), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(13568), AOM_ICDF(15293), AOM_ICDF(18706), AOM_ICDF(21610),
+ AOM_ICDF(23139), AOM_ICDF(24254), AOM_ICDF(26383), AOM_ICDF(27630),
+ AOM_ICDF(28613), AOM_ICDF(30350), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9600), AOM_ICDF(11772), AOM_ICDF(14397), AOM_ICDF(16580),
+ AOM_ICDF(20091), AOM_ICDF(22865), AOM_ICDF(24490), AOM_ICDF(25395),
+ AOM_ICDF(27037), AOM_ICDF(28694), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(12160), AOM_ICDF(14092), AOM_ICDF(17010), AOM_ICDF(18922),
+ AOM_ICDF(22683), AOM_ICDF(25751), AOM_ICDF(27725), AOM_ICDF(30109),
+ AOM_ICDF(31449), AOM_ICDF(32763), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9088), AOM_ICDF(10383), AOM_ICDF(12569), AOM_ICDF(17113),
+ AOM_ICDF(21351), AOM_ICDF(22511), AOM_ICDF(23633), AOM_ICDF(24382),
+ AOM_ICDF(28215), AOM_ICDF(29798), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10880), AOM_ICDF(12248), AOM_ICDF(15214), AOM_ICDF(20017),
+ AOM_ICDF(21922), AOM_ICDF(22757), AOM_ICDF(24360), AOM_ICDF(25280),
+ AOM_ICDF(26684), AOM_ICDF(29869), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11008), AOM_ICDF(13133), AOM_ICDF(15587), AOM_ICDF(17872),
+ AOM_ICDF(19579), AOM_ICDF(21157), AOM_ICDF(23788), AOM_ICDF(26629),
+ AOM_ICDF(27732), AOM_ICDF(29601), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10112), AOM_ICDF(12325), AOM_ICDF(15360), AOM_ICDF(18348),
+ AOM_ICDF(20452), AOM_ICDF(20460), AOM_ICDF(21902), AOM_ICDF(23982),
+ AOM_ICDF(25149), AOM_ICDF(26667), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8704), AOM_ICDF(14250), AOM_ICDF(17722), AOM_ICDF(23128),
+ AOM_ICDF(24217), AOM_ICDF(24892), AOM_ICDF(26215), AOM_ICDF(27392),
+ AOM_ICDF(28358), AOM_ICDF(30287), AOM_ICDF(32768), 0 },
+ },
+ {
+ { AOM_ICDF(8448), AOM_ICDF(10443), AOM_ICDF(20733), AOM_ICDF(23689),
+ AOM_ICDF(24634), AOM_ICDF(25951), AOM_ICDF(26670), AOM_ICDF(27861),
+ AOM_ICDF(28379), AOM_ICDF(29305), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6656), AOM_ICDF(9206), AOM_ICDF(24577), AOM_ICDF(25792),
+ AOM_ICDF(26335), AOM_ICDF(27169), AOM_ICDF(27913), AOM_ICDF(28956),
+ AOM_ICDF(29239), AOM_ICDF(29680), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7168), AOM_ICDF(8968), AOM_ICDF(15662), AOM_ICDF(22937),
+ AOM_ICDF(23849), AOM_ICDF(24616), AOM_ICDF(25603), AOM_ICDF(26555),
+ AOM_ICDF(27210), AOM_ICDF(29142), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9600), AOM_ICDF(11501), AOM_ICDF(19310), AOM_ICDF(21731),
+ AOM_ICDF(22790), AOM_ICDF(23936), AOM_ICDF(25627), AOM_ICDF(27217),
+ AOM_ICDF(27868), AOM_ICDF(29170), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6912), AOM_ICDF(8730), AOM_ICDF(17650), AOM_ICDF(19377),
+ AOM_ICDF(21025), AOM_ICDF(23319), AOM_ICDF(24537), AOM_ICDF(26112),
+ AOM_ICDF(26840), AOM_ICDF(28345), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7808), AOM_ICDF(9661), AOM_ICDF(20583), AOM_ICDF(21996),
+ AOM_ICDF(23898), AOM_ICDF(26818), AOM_ICDF(28120), AOM_ICDF(30716),
+ AOM_ICDF(31678), AOM_ICDF(32764), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(8104), AOM_ICDF(15619), AOM_ICDF(18584),
+ AOM_ICDF(20844), AOM_ICDF(22519), AOM_ICDF(23760), AOM_ICDF(25203),
+ AOM_ICDF(27094), AOM_ICDF(28801), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8832), AOM_ICDF(10141), AOM_ICDF(17035), AOM_ICDF(20764),
+ AOM_ICDF(21703), AOM_ICDF(22751), AOM_ICDF(23964), AOM_ICDF(25305),
+ AOM_ICDF(26034), AOM_ICDF(29006), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8192), AOM_ICDF(9920), AOM_ICDF(19113), AOM_ICDF(20594),
+ AOM_ICDF(21747), AOM_ICDF(23327), AOM_ICDF(24581), AOM_ICDF(26916),
+ AOM_ICDF(27533), AOM_ICDF(28944), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6656), AOM_ICDF(8696), AOM_ICDF(18381), AOM_ICDF(20537),
+ AOM_ICDF(21804), AOM_ICDF(21809), AOM_ICDF(22751), AOM_ICDF(24394),
+ AOM_ICDF(24917), AOM_ICDF(25990), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6400), AOM_ICDF(9593), AOM_ICDF(20818), AOM_ICDF(23519),
+ AOM_ICDF(24266), AOM_ICDF(25113), AOM_ICDF(26608), AOM_ICDF(27883),
+ AOM_ICDF(28322), AOM_ICDF(29364), AOM_ICDF(32768), 0 },
+ },
+ {
+ { AOM_ICDF(12032), AOM_ICDF(14381), AOM_ICDF(16608), AOM_ICDF(24946),
+ AOM_ICDF(26084), AOM_ICDF(26582), AOM_ICDF(27428), AOM_ICDF(28075),
+ AOM_ICDF(29395), AOM_ICDF(30858), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9216), AOM_ICDF(12620), AOM_ICDF(18287), AOM_ICDF(24345),
+ AOM_ICDF(25984), AOM_ICDF(26715), AOM_ICDF(27732), AOM_ICDF(28519),
+ AOM_ICDF(29399), AOM_ICDF(30781), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(8916), AOM_ICDF(10220), AOM_ICDF(26539),
+ AOM_ICDF(27310), AOM_ICDF(27483), AOM_ICDF(28082), AOM_ICDF(28430),
+ AOM_ICDF(29362), AOM_ICDF(31291), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11904), AOM_ICDF(14838), AOM_ICDF(17359), AOM_ICDF(21663),
+ AOM_ICDF(22931), AOM_ICDF(23619), AOM_ICDF(25620), AOM_ICDF(26653),
+ AOM_ICDF(27823), AOM_ICDF(30547), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10752), AOM_ICDF(13504), AOM_ICDF(15536), AOM_ICDF(19057),
+ AOM_ICDF(21753), AOM_ICDF(23883), AOM_ICDF(25202), AOM_ICDF(26266),
+ AOM_ICDF(28196), AOM_ICDF(30589), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10496), AOM_ICDF(13193), AOM_ICDF(16787), AOM_ICDF(21011),
+ AOM_ICDF(23929), AOM_ICDF(25651), AOM_ICDF(27958), AOM_ICDF(29330),
+ AOM_ICDF(31022), AOM_ICDF(32761), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8448), AOM_ICDF(9968), AOM_ICDF(11749), AOM_ICDF(18062),
+ AOM_ICDF(21841), AOM_ICDF(22669), AOM_ICDF(23852), AOM_ICDF(24444),
+ AOM_ICDF(28118), AOM_ICDF(30007), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9728), AOM_ICDF(11168), AOM_ICDF(12602), AOM_ICDF(20819),
+ AOM_ICDF(22194), AOM_ICDF(22764), AOM_ICDF(24366), AOM_ICDF(25022),
+ AOM_ICDF(26414), AOM_ICDF(30460), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9216), AOM_ICDF(12712), AOM_ICDF(14357), AOM_ICDF(18346),
+ AOM_ICDF(20486), AOM_ICDF(21549), AOM_ICDF(23170), AOM_ICDF(25794),
+ AOM_ICDF(27129), AOM_ICDF(29574), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7808), AOM_ICDF(10733), AOM_ICDF(13057), AOM_ICDF(20252),
+ AOM_ICDF(21906), AOM_ICDF(21912), AOM_ICDF(23057), AOM_ICDF(24233),
+ AOM_ICDF(25700), AOM_ICDF(27439), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(11352), AOM_ICDF(13778), AOM_ICDF(23877),
+ AOM_ICDF(24995), AOM_ICDF(25424), AOM_ICDF(26830), AOM_ICDF(27688),
+ AOM_ICDF(28779), AOM_ICDF(30368), AOM_ICDF(32768), 0 },
+ },
+ {
+ { AOM_ICDF(12288), AOM_ICDF(13728), AOM_ICDF(16480), AOM_ICDF(19841),
+ AOM_ICDF(21570), AOM_ICDF(22715), AOM_ICDF(25385), AOM_ICDF(27000),
+ AOM_ICDF(28329), AOM_ICDF(29994), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9344), AOM_ICDF(10991), AOM_ICDF(18817), AOM_ICDF(20972),
+ AOM_ICDF(22137), AOM_ICDF(23231), AOM_ICDF(26025), AOM_ICDF(27711),
+ AOM_ICDF(28244), AOM_ICDF(29428), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9344), AOM_ICDF(10900), AOM_ICDF(13206), AOM_ICDF(21344),
+ AOM_ICDF(22332), AOM_ICDF(22987), AOM_ICDF(25127), AOM_ICDF(26440),
+ AOM_ICDF(27231), AOM_ICDF(29502), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(12928), AOM_ICDF(14478), AOM_ICDF(15978), AOM_ICDF(18630),
+ AOM_ICDF(19852), AOM_ICDF(20897), AOM_ICDF(24699), AOM_ICDF(26464),
+ AOM_ICDF(27030), AOM_ICDF(30482), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9088), AOM_ICDF(10476), AOM_ICDF(13350), AOM_ICDF(15237),
+ AOM_ICDF(18175), AOM_ICDF(20252), AOM_ICDF(23283), AOM_ICDF(25321),
+ AOM_ICDF(26426), AOM_ICDF(29349), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10240), AOM_ICDF(11912), AOM_ICDF(15008), AOM_ICDF(17177),
+ AOM_ICDF(19979), AOM_ICDF(23056), AOM_ICDF(26395), AOM_ICDF(29681),
+ AOM_ICDF(30790), AOM_ICDF(32760), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8704), AOM_ICDF(9738), AOM_ICDF(11717), AOM_ICDF(15480),
+ AOM_ICDF(18656), AOM_ICDF(20022), AOM_ICDF(22611), AOM_ICDF(24357),
+ AOM_ICDF(27150), AOM_ICDF(29257), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(12928), AOM_ICDF(13548), AOM_ICDF(17978), AOM_ICDF(20602),
+ AOM_ICDF(21814), AOM_ICDF(22427), AOM_ICDF(24568), AOM_ICDF(25881),
+ AOM_ICDF(26823), AOM_ICDF(30817), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10496), AOM_ICDF(12149), AOM_ICDF(14082), AOM_ICDF(18054),
+ AOM_ICDF(19032), AOM_ICDF(19994), AOM_ICDF(24086), AOM_ICDF(28427),
+ AOM_ICDF(29156), AOM_ICDF(30680), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8448), AOM_ICDF(10158), AOM_ICDF(13867), AOM_ICDF(16506),
+ AOM_ICDF(18584), AOM_ICDF(18592), AOM_ICDF(21472), AOM_ICDF(23767),
+ AOM_ICDF(24646), AOM_ICDF(27279), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7296), AOM_ICDF(9684), AOM_ICDF(13471), AOM_ICDF(17701),
+ AOM_ICDF(18934), AOM_ICDF(19878), AOM_ICDF(25115), AOM_ICDF(27238),
+ AOM_ICDF(27972), AOM_ICDF(29583), AOM_ICDF(32768), 0 },
+ },
+ {
+ { AOM_ICDF(10880), AOM_ICDF(12163), AOM_ICDF(14497), AOM_ICDF(17112),
+ AOM_ICDF(20859), AOM_ICDF(22562), AOM_ICDF(23599), AOM_ICDF(24638),
+ AOM_ICDF(26861), AOM_ICDF(29399), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9984), AOM_ICDF(12476), AOM_ICDF(16360), AOM_ICDF(18889),
+ AOM_ICDF(21414), AOM_ICDF(23474), AOM_ICDF(24563), AOM_ICDF(25909),
+ AOM_ICDF(27195), AOM_ICDF(28828), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7168), AOM_ICDF(9268), AOM_ICDF(10737), AOM_ICDF(20063),
+ AOM_ICDF(22315), AOM_ICDF(23302), AOM_ICDF(24152), AOM_ICDF(25195),
+ AOM_ICDF(26645), AOM_ICDF(28845), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8960), AOM_ICDF(10727), AOM_ICDF(12449), AOM_ICDF(14263),
+ AOM_ICDF(16523), AOM_ICDF(17608), AOM_ICDF(23352), AOM_ICDF(24676),
+ AOM_ICDF(26478), AOM_ICDF(28886), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9856), AOM_ICDF(11109), AOM_ICDF(13309), AOM_ICDF(14975),
+ AOM_ICDF(19055), AOM_ICDF(21670), AOM_ICDF(23144), AOM_ICDF(24460),
+ AOM_ICDF(26212), AOM_ICDF(28107), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9984), AOM_ICDF(11586), AOM_ICDF(14565), AOM_ICDF(16562),
+ AOM_ICDF(21107), AOM_ICDF(25444), AOM_ICDF(27218), AOM_ICDF(29429),
+ AOM_ICDF(31451), AOM_ICDF(32763), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7168), AOM_ICDF(8268), AOM_ICDF(9704), AOM_ICDF(13144),
+ AOM_ICDF(18443), AOM_ICDF(20065), AOM_ICDF(21653), AOM_ICDF(23607),
+ AOM_ICDF(26506), AOM_ICDF(28854), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11520), AOM_ICDF(13014), AOM_ICDF(14866), AOM_ICDF(18136),
+ AOM_ICDF(20231), AOM_ICDF(21509), AOM_ICDF(23004), AOM_ICDF(24186),
+ AOM_ICDF(25728), AOM_ICDF(29468), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10240), AOM_ICDF(12264), AOM_ICDF(14507), AOM_ICDF(16388),
+ AOM_ICDF(18888), AOM_ICDF(20927), AOM_ICDF(22731), AOM_ICDF(24691),
+ AOM_ICDF(26142), AOM_ICDF(28394), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8064), AOM_ICDF(10187), AOM_ICDF(12921), AOM_ICDF(15952),
+ AOM_ICDF(19960), AOM_ICDF(19976), AOM_ICDF(21275), AOM_ICDF(23205),
+ AOM_ICDF(25110), AOM_ICDF(26636), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8448), AOM_ICDF(11488), AOM_ICDF(14065), AOM_ICDF(19113),
+ AOM_ICDF(21604), AOM_ICDF(22978), AOM_ICDF(24508), AOM_ICDF(25895),
+ AOM_ICDF(27398), AOM_ICDF(29055), AOM_ICDF(32768), 0 },
+ },
+ {
+ { AOM_ICDF(10368), AOM_ICDF(11768), AOM_ICDF(16772), AOM_ICDF(19842),
+ AOM_ICDF(22940), AOM_ICDF(27394), AOM_ICDF(28528), AOM_ICDF(30267),
+ AOM_ICDF(31371), AOM_ICDF(32763), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9472), AOM_ICDF(11292), AOM_ICDF(18507), AOM_ICDF(20777),
+ AOM_ICDF(23357), AOM_ICDF(27587), AOM_ICDF(28902), AOM_ICDF(30850),
+ AOM_ICDF(31607), AOM_ICDF(32763), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8064), AOM_ICDF(9512), AOM_ICDF(13782), AOM_ICDF(20645),
+ AOM_ICDF(24493), AOM_ICDF(26242), AOM_ICDF(28001), AOM_ICDF(29435),
+ AOM_ICDF(30438), AOM_ICDF(32759), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8960), AOM_ICDF(10541), AOM_ICDF(15664), AOM_ICDF(17639),
+ AOM_ICDF(19646), AOM_ICDF(22145), AOM_ICDF(25216), AOM_ICDF(28815),
+ AOM_ICDF(30050), AOM_ICDF(32757), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9984), AOM_ICDF(11141), AOM_ICDF(15365), AOM_ICDF(16746),
+ AOM_ICDF(21186), AOM_ICDF(25766), AOM_ICDF(27817), AOM_ICDF(30022),
+ AOM_ICDF(31309), AOM_ICDF(32762), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9216), AOM_ICDF(10688), AOM_ICDF(16639), AOM_ICDF(17735),
+ AOM_ICDF(21499), AOM_ICDF(26657), AOM_ICDF(28161), AOM_ICDF(30572),
+ AOM_ICDF(31490), AOM_ICDF(32763), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8448), AOM_ICDF(9303), AOM_ICDF(13611), AOM_ICDF(16636),
+ AOM_ICDF(20555), AOM_ICDF(23414), AOM_ICDF(24912), AOM_ICDF(27613),
+ AOM_ICDF(29727), AOM_ICDF(32756), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9984), AOM_ICDF(11052), AOM_ICDF(16142), AOM_ICDF(19312),
+ AOM_ICDF(21680), AOM_ICDF(23870), AOM_ICDF(25504), AOM_ICDF(28200),
+ AOM_ICDF(29324), AOM_ICDF(32755), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10496), AOM_ICDF(12323), AOM_ICDF(16955), AOM_ICDF(18839),
+ AOM_ICDF(21144), AOM_ICDF(24861), AOM_ICDF(26838), AOM_ICDF(29988),
+ AOM_ICDF(30976), AOM_ICDF(32761), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(2944), AOM_ICDF(5973), AOM_ICDF(8904), AOM_ICDF(11875),
+ AOM_ICDF(14864), AOM_ICDF(17853), AOM_ICDF(20824), AOM_ICDF(23810),
+ AOM_ICDF(26784), AOM_ICDF(29776), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7424), AOM_ICDF(10097), AOM_ICDF(15588), AOM_ICDF(20217),
+ AOM_ICDF(23899), AOM_ICDF(26460), AOM_ICDF(28308), AOM_ICDF(30155),
+ AOM_ICDF(30951), AOM_ICDF(32761), AOM_ICDF(32768), 0 },
+ },
+ {
+ { AOM_ICDF(11648), AOM_ICDF(13133), AOM_ICDF(15050), AOM_ICDF(20481),
+ AOM_ICDF(22470), AOM_ICDF(23425), AOM_ICDF(24337), AOM_ICDF(25160),
+ AOM_ICDF(28964), AOM_ICDF(30480), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10240), AOM_ICDF(12616), AOM_ICDF(16631), AOM_ICDF(20485),
+ AOM_ICDF(22290), AOM_ICDF(23628), AOM_ICDF(25235), AOM_ICDF(26353),
+ AOM_ICDF(28107), AOM_ICDF(29655), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(8002), AOM_ICDF(9066), AOM_ICDF(20038),
+ AOM_ICDF(22926), AOM_ICDF(23324), AOM_ICDF(23951), AOM_ICDF(24537),
+ AOM_ICDF(26916), AOM_ICDF(30231), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11904), AOM_ICDF(14105), AOM_ICDF(15782), AOM_ICDF(19896),
+ AOM_ICDF(22283), AOM_ICDF(23147), AOM_ICDF(24763), AOM_ICDF(25983),
+ AOM_ICDF(27812), AOM_ICDF(29980), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10624), AOM_ICDF(11922), AOM_ICDF(13632), AOM_ICDF(15941),
+ AOM_ICDF(20469), AOM_ICDF(22453), AOM_ICDF(24065), AOM_ICDF(25187),
+ AOM_ICDF(27349), AOM_ICDF(29296), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(12032), AOM_ICDF(13085), AOM_ICDF(15468), AOM_ICDF(17768),
+ AOM_ICDF(20613), AOM_ICDF(24388), AOM_ICDF(26385), AOM_ICDF(28430),
+ AOM_ICDF(30938), AOM_ICDF(32761), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9728), AOM_ICDF(10538), AOM_ICDF(11493), AOM_ICDF(14765),
+ AOM_ICDF(18460), AOM_ICDF(19471), AOM_ICDF(20302), AOM_ICDF(20935),
+ AOM_ICDF(28192), AOM_ICDF(29926), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8960), AOM_ICDF(9890), AOM_ICDF(10962), AOM_ICDF(16685),
+ AOM_ICDF(18880), AOM_ICDF(19480), AOM_ICDF(20674), AOM_ICDF(21477),
+ AOM_ICDF(23815), AOM_ICDF(29341), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(14592), AOM_ICDF(16367), AOM_ICDF(17712), AOM_ICDF(20293),
+ AOM_ICDF(22544), AOM_ICDF(23829), AOM_ICDF(24877), AOM_ICDF(26326),
+ AOM_ICDF(27660), AOM_ICDF(29875), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8960), AOM_ICDF(10448), AOM_ICDF(12279), AOM_ICDF(16206),
+ AOM_ICDF(18672), AOM_ICDF(18682), AOM_ICDF(20058), AOM_ICDF(21547),
+ AOM_ICDF(25097), AOM_ICDF(27165), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11136), AOM_ICDF(13840), AOM_ICDF(15762), AOM_ICDF(21710),
+ AOM_ICDF(23038), AOM_ICDF(23734), AOM_ICDF(24863), AOM_ICDF(25882),
+ AOM_ICDF(27765), AOM_ICDF(30071), AOM_ICDF(32768), 0 },
+ },
+ {
+ { AOM_ICDF(12544), AOM_ICDF(14124), AOM_ICDF(16964), AOM_ICDF(21907),
+ AOM_ICDF(23808), AOM_ICDF(24496), AOM_ICDF(25724), AOM_ICDF(26715),
+ AOM_ICDF(27992), AOM_ICDF(30455), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10368), AOM_ICDF(13606), AOM_ICDF(18247), AOM_ICDF(20869),
+ AOM_ICDF(22590), AOM_ICDF(23749), AOM_ICDF(25088), AOM_ICDF(26378),
+ AOM_ICDF(27277), AOM_ICDF(29808), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9088), AOM_ICDF(11031), AOM_ICDF(12899), AOM_ICDF(23497),
+ AOM_ICDF(24465), AOM_ICDF(24851), AOM_ICDF(25995), AOM_ICDF(26815),
+ AOM_ICDF(27885), AOM_ICDF(30555), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11520), AOM_ICDF(14342), AOM_ICDF(15710), AOM_ICDF(19196),
+ AOM_ICDF(21250), AOM_ICDF(21907), AOM_ICDF(24665), AOM_ICDF(26153),
+ AOM_ICDF(27212), AOM_ICDF(30750), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9984), AOM_ICDF(11764), AOM_ICDF(13979), AOM_ICDF(16405),
+ AOM_ICDF(19279), AOM_ICDF(20658), AOM_ICDF(23354), AOM_ICDF(25266),
+ AOM_ICDF(26702), AOM_ICDF(29380), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10112), AOM_ICDF(12325), AOM_ICDF(15918), AOM_ICDF(19060),
+ AOM_ICDF(21829), AOM_ICDF(23882), AOM_ICDF(26277), AOM_ICDF(27697),
+ AOM_ICDF(30114), AOM_ICDF(32758), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9344), AOM_ICDF(10534), AOM_ICDF(12184), AOM_ICDF(16208),
+ AOM_ICDF(19764), AOM_ICDF(20627), AOM_ICDF(22524), AOM_ICDF(23644),
+ AOM_ICDF(26887), AOM_ICDF(29782), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(12928), AOM_ICDF(14013), AOM_ICDF(15625), AOM_ICDF(19107),
+ AOM_ICDF(20654), AOM_ICDF(21451), AOM_ICDF(22910), AOM_ICDF(23873),
+ AOM_ICDF(24776), AOM_ICDF(30239), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10368), AOM_ICDF(12818), AOM_ICDF(14610), AOM_ICDF(17350),
+ AOM_ICDF(19568), AOM_ICDF(20710), AOM_ICDF(22971), AOM_ICDF(25114),
+ AOM_ICDF(26340), AOM_ICDF(29127), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8960), AOM_ICDF(11192), AOM_ICDF(13720), AOM_ICDF(18429),
+ AOM_ICDF(20409), AOM_ICDF(20417), AOM_ICDF(22250), AOM_ICDF(23318),
+ AOM_ICDF(24647), AOM_ICDF(27248), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7808), AOM_ICDF(11416), AOM_ICDF(13918), AOM_ICDF(19028),
+ AOM_ICDF(20181), AOM_ICDF(20839), AOM_ICDF(24380), AOM_ICDF(26018),
+ AOM_ICDF(26967), AOM_ICDF(29845), AOM_ICDF(32768), 0 },
+ },
+ {
+ { AOM_ICDF(9856), AOM_ICDF(11020), AOM_ICDF(14928), AOM_ICDF(18159),
+ AOM_ICDF(19421), AOM_ICDF(20921), AOM_ICDF(23466), AOM_ICDF(26664),
+ AOM_ICDF(27475), AOM_ICDF(28881), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8704), AOM_ICDF(10302), AOM_ICDF(17323), AOM_ICDF(18907),
+ AOM_ICDF(19868), AOM_ICDF(21184), AOM_ICDF(24171), AOM_ICDF(28033),
+ AOM_ICDF(28625), AOM_ICDF(29353), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7936), AOM_ICDF(9197), AOM_ICDF(12604), AOM_ICDF(20616),
+ AOM_ICDF(21514), AOM_ICDF(22371), AOM_ICDF(24239), AOM_ICDF(26138),
+ AOM_ICDF(26863), AOM_ICDF(29239), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11264), AOM_ICDF(12524), AOM_ICDF(16083), AOM_ICDF(18574),
+ AOM_ICDF(19858), AOM_ICDF(20841), AOM_ICDF(24242), AOM_ICDF(27606),
+ AOM_ICDF(28352), AOM_ICDF(29853), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8704), AOM_ICDF(10208), AOM_ICDF(13292), AOM_ICDF(15170),
+ AOM_ICDF(17277), AOM_ICDF(19226), AOM_ICDF(22083), AOM_ICDF(25046),
+ AOM_ICDF(26041), AOM_ICDF(27802), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9088), AOM_ICDF(10568), AOM_ICDF(15511), AOM_ICDF(17246),
+ AOM_ICDF(20170), AOM_ICDF(22791), AOM_ICDF(25558), AOM_ICDF(30740),
+ AOM_ICDF(31635), AOM_ICDF(32764), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7040), AOM_ICDF(8045), AOM_ICDF(10653), AOM_ICDF(13145),
+ AOM_ICDF(15286), AOM_ICDF(16614), AOM_ICDF(19075), AOM_ICDF(23140),
+ AOM_ICDF(26224), AOM_ICDF(28652), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(10240), AOM_ICDF(11032), AOM_ICDF(14258), AOM_ICDF(17629),
+ AOM_ICDF(18914), AOM_ICDF(19898), AOM_ICDF(22412), AOM_ICDF(24961),
+ AOM_ICDF(25815), AOM_ICDF(29156), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11008), AOM_ICDF(12028), AOM_ICDF(14702), AOM_ICDF(16147),
+ AOM_ICDF(17209), AOM_ICDF(18160), AOM_ICDF(21812), AOM_ICDF(27547),
+ AOM_ICDF(28709), AOM_ICDF(30120), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7168), AOM_ICDF(9068), AOM_ICDF(14160), AOM_ICDF(16937),
+ AOM_ICDF(18515), AOM_ICDF(18521), AOM_ICDF(20636), AOM_ICDF(24617),
+ AOM_ICDF(25317), AOM_ICDF(26365), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(8510), AOM_ICDF(14195), AOM_ICDF(17148),
+ AOM_ICDF(18158), AOM_ICDF(19201), AOM_ICDF(23070), AOM_ICDF(27351),
+ AOM_ICDF(27901), AOM_ICDF(29422), AOM_ICDF(32768), 0 },
+ },
+ {
+ { AOM_ICDF(10112), AOM_ICDF(11528), AOM_ICDF(15345), AOM_ICDF(19296),
+ AOM_ICDF(21394), AOM_ICDF(21402), AOM_ICDF(22379), AOM_ICDF(23840),
+ AOM_ICDF(24851), AOM_ICDF(26150), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8064), AOM_ICDF(10187), AOM_ICDF(17949), AOM_ICDF(20052),
+ AOM_ICDF(22051), AOM_ICDF(22059), AOM_ICDF(23147), AOM_ICDF(24688),
+ AOM_ICDF(25351), AOM_ICDF(26365), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6528), AOM_ICDF(8373), AOM_ICDF(11041), AOM_ICDF(21963),
+ AOM_ICDF(23089), AOM_ICDF(23093), AOM_ICDF(24076), AOM_ICDF(24925),
+ AOM_ICDF(25691), AOM_ICDF(27764), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9600), AOM_ICDF(11229), AOM_ICDF(14847), AOM_ICDF(17527),
+ AOM_ICDF(19738), AOM_ICDF(19747), AOM_ICDF(21629), AOM_ICDF(23761),
+ AOM_ICDF(24957), AOM_ICDF(27673), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8960), AOM_ICDF(10262), AOM_ICDF(13339), AOM_ICDF(15480),
+ AOM_ICDF(19925), AOM_ICDF(19942), AOM_ICDF(21445), AOM_ICDF(23037),
+ AOM_ICDF(24329), AOM_ICDF(25977), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(2944), AOM_ICDF(5973), AOM_ICDF(8904), AOM_ICDF(11875),
+ AOM_ICDF(14864), AOM_ICDF(17853), AOM_ICDF(20824), AOM_ICDF(23810),
+ AOM_ICDF(26784), AOM_ICDF(29776), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9472), AOM_ICDF(10564), AOM_ICDF(13426), AOM_ICDF(16561),
+ AOM_ICDF(19685), AOM_ICDF(19697), AOM_ICDF(21076), AOM_ICDF(22583),
+ AOM_ICDF(24891), AOM_ICDF(26983), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8448), AOM_ICDF(9493), AOM_ICDF(12221), AOM_ICDF(16542),
+ AOM_ICDF(18394), AOM_ICDF(18401), AOM_ICDF(19580), AOM_ICDF(20971),
+ AOM_ICDF(22031), AOM_ICDF(26770), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8704), AOM_ICDF(10772), AOM_ICDF(14209), AOM_ICDF(16381),
+ AOM_ICDF(18911), AOM_ICDF(18921), AOM_ICDF(20436), AOM_ICDF(23374),
+ AOM_ICDF(24475), AOM_ICDF(26095), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7680), AOM_ICDF(9444), AOM_ICDF(13453), AOM_ICDF(16320),
+ AOM_ICDF(18650), AOM_ICDF(18659), AOM_ICDF(19651), AOM_ICDF(21291),
+ AOM_ICDF(22277), AOM_ICDF(23916), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6656), AOM_ICDF(9920), AOM_ICDF(14740), AOM_ICDF(19864),
+ AOM_ICDF(21495), AOM_ICDF(21501), AOM_ICDF(22953), AOM_ICDF(24372),
+ AOM_ICDF(25192), AOM_ICDF(26760), AOM_ICDF(32768), 0 },
+ },
+ {
+ { AOM_ICDF(9728), AOM_ICDF(13958), AOM_ICDF(18881), AOM_ICDF(23624),
+ AOM_ICDF(24754), AOM_ICDF(25553), AOM_ICDF(26709), AOM_ICDF(27940),
+ AOM_ICDF(28977), AOM_ICDF(30413), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8832), AOM_ICDF(12572), AOM_ICDF(22433), AOM_ICDF(24653),
+ AOM_ICDF(25676), AOM_ICDF(26551), AOM_ICDF(27571), AOM_ICDF(28688),
+ AOM_ICDF(29198), AOM_ICDF(30174), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5888), AOM_ICDF(8828), AOM_ICDF(11353), AOM_ICDF(23482),
+ AOM_ICDF(24310), AOM_ICDF(24737), AOM_ICDF(25804), AOM_ICDF(26375),
+ AOM_ICDF(27174), AOM_ICDF(29840), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9984), AOM_ICDF(13099), AOM_ICDF(16249), AOM_ICDF(19443),
+ AOM_ICDF(20990), AOM_ICDF(22637), AOM_ICDF(24576), AOM_ICDF(25952),
+ AOM_ICDF(26884), AOM_ICDF(29435), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8448), AOM_ICDF(11108), AOM_ICDF(15085), AOM_ICDF(18134),
+ AOM_ICDF(20319), AOM_ICDF(21992), AOM_ICDF(23549), AOM_ICDF(24989),
+ AOM_ICDF(27177), AOM_ICDF(29208), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9856), AOM_ICDF(13168), AOM_ICDF(18987), AOM_ICDF(22481),
+ AOM_ICDF(24282), AOM_ICDF(26200), AOM_ICDF(27868), AOM_ICDF(30203),
+ AOM_ICDF(31085), AOM_ICDF(32761), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(9119), AOM_ICDF(12629), AOM_ICDF(16877),
+ AOM_ICDF(20262), AOM_ICDF(21125), AOM_ICDF(22307), AOM_ICDF(23615),
+ AOM_ICDF(27727), AOM_ICDF(29972), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8320), AOM_ICDF(10230), AOM_ICDF(12783), AOM_ICDF(19005),
+ AOM_ICDF(20213), AOM_ICDF(20668), AOM_ICDF(22039), AOM_ICDF(23045),
+ AOM_ICDF(24146), AOM_ICDF(30478), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9088), AOM_ICDF(11308), AOM_ICDF(15416), AOM_ICDF(18118),
+ AOM_ICDF(19762), AOM_ICDF(20906), AOM_ICDF(22574), AOM_ICDF(25162),
+ AOM_ICDF(25994), AOM_ICDF(28455), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6912), AOM_ICDF(10548), AOM_ICDF(15148), AOM_ICDF(20026),
+ AOM_ICDF(21612), AOM_ICDF(21618), AOM_ICDF(22707), AOM_ICDF(24200),
+ AOM_ICDF(24869), AOM_ICDF(26844), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6656), AOM_ICDF(12164), AOM_ICDF(16993), AOM_ICDF(21568),
+ AOM_ICDF(22933), AOM_ICDF(23648), AOM_ICDF(25322), AOM_ICDF(26602),
+ AOM_ICDF(27806), AOM_ICDF(29841), AOM_ICDF(32768), 0 },
+ },
+#else // !CONFIG_ALT_INTRA
+ { { AOM_ICDF(17536), AOM_ICDF(19321), AOM_ICDF(21527), AOM_ICDF(25360),
+ AOM_ICDF(27516), AOM_ICDF(28026), AOM_ICDF(29323), AOM_ICDF(30023),
+ AOM_ICDF(30999), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11776), AOM_ICDF(15466), AOM_ICDF(22360), AOM_ICDF(24865),
+ AOM_ICDF(26991), AOM_ICDF(27889), AOM_ICDF(29299), AOM_ICDF(30519),
+ AOM_ICDF(31398), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9344), AOM_ICDF(12272), AOM_ICDF(13793), AOM_ICDF(25813),
+ AOM_ICDF(27359), AOM_ICDF(27654), AOM_ICDF(28573), AOM_ICDF(29130),
+ AOM_ICDF(30551), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11648), AOM_ICDF(14123), AOM_ICDF(16454), AOM_ICDF(19948),
+ AOM_ICDF(22780), AOM_ICDF(23846), AOM_ICDF(27087), AOM_ICDF(28995),
+ AOM_ICDF(30380), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9216), AOM_ICDF(12436), AOM_ICDF(15295), AOM_ICDF(17996),
+ AOM_ICDF(24006), AOM_ICDF(25465), AOM_ICDF(27405), AOM_ICDF(28725),
+ AOM_ICDF(30383), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9344), AOM_ICDF(12181), AOM_ICDF(14433), AOM_ICDF(16634),
+ AOM_ICDF(20355), AOM_ICDF(24317), AOM_ICDF(26133), AOM_ICDF(29295),
+ AOM_ICDF(31344), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8576), AOM_ICDF(10750), AOM_ICDF(12556), AOM_ICDF(17996),
+ AOM_ICDF(22315), AOM_ICDF(23609), AOM_ICDF(25040), AOM_ICDF(26157),
+ AOM_ICDF(30573), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(11008), AOM_ICDF(13303), AOM_ICDF(15432), AOM_ICDF(20646),
+ AOM_ICDF(23506), AOM_ICDF(24100), AOM_ICDF(25624), AOM_ICDF(26824),
+ AOM_ICDF(28055), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9472), AOM_ICDF(12384), AOM_ICDF(14534), AOM_ICDF(17094),
+ AOM_ICDF(20257), AOM_ICDF(22155), AOM_ICDF(24767), AOM_ICDF(28955),
+ AOM_ICDF(30474), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7552), AOM_ICDF(14152), AOM_ICDF(17352), AOM_ICDF(22654),
+ AOM_ICDF(25123), AOM_ICDF(25783), AOM_ICDF(27911), AOM_ICDF(29182),
+ AOM_ICDF(30849), AOM_ICDF(32768), 0 } },
+ { { AOM_ICDF(8064), AOM_ICDF(11538), AOM_ICDF(21987), AOM_ICDF(24941),
+ AOM_ICDF(26913), AOM_ICDF(28136), AOM_ICDF(29222), AOM_ICDF(30469),
+ AOM_ICDF(31331), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5504), AOM_ICDF(10403), AOM_ICDF(25080), AOM_ICDF(26762),
+ AOM_ICDF(27933), AOM_ICDF(29104), AOM_ICDF(30092), AOM_ICDF(31576),
+ AOM_ICDF(32004), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5632), AOM_ICDF(8706), AOM_ICDF(15097), AOM_ICDF(23714),
+ AOM_ICDF(25344), AOM_ICDF(26072), AOM_ICDF(27380), AOM_ICDF(28580),
+ AOM_ICDF(29840), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7424), AOM_ICDF(11186), AOM_ICDF(17593), AOM_ICDF(20154),
+ AOM_ICDF(22974), AOM_ICDF(24351), AOM_ICDF(26916), AOM_ICDF(29956),
+ AOM_ICDF(30967), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5888), AOM_ICDF(10193), AOM_ICDF(16895), AOM_ICDF(19031),
+ AOM_ICDF(23735), AOM_ICDF(25576), AOM_ICDF(27514), AOM_ICDF(29813),
+ AOM_ICDF(30471), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4864), AOM_ICDF(8352), AOM_ICDF(16459), AOM_ICDF(18062),
+ AOM_ICDF(21263), AOM_ICDF(25378), AOM_ICDF(26937), AOM_ICDF(30376),
+ AOM_ICDF(31619), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4992), AOM_ICDF(7922), AOM_ICDF(13842), AOM_ICDF(18004),
+ AOM_ICDF(21779), AOM_ICDF(23527), AOM_ICDF(25115), AOM_ICDF(27357),
+ AOM_ICDF(30232), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6656), AOM_ICDF(9716), AOM_ICDF(16379), AOM_ICDF(20053),
+ AOM_ICDF(22487), AOM_ICDF(23613), AOM_ICDF(25437), AOM_ICDF(27270),
+ AOM_ICDF(28516), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6016), AOM_ICDF(9674), AOM_ICDF(16891), AOM_ICDF(18684),
+ AOM_ICDF(21147), AOM_ICDF(23093), AOM_ICDF(25512), AOM_ICDF(30132),
+ AOM_ICDF(30894), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4608), AOM_ICDF(11318), AOM_ICDF(21038), AOM_ICDF(23650),
+ AOM_ICDF(25303), AOM_ICDF(26262), AOM_ICDF(28295), AOM_ICDF(30479),
+ AOM_ICDF(31212), AOM_ICDF(32768), 0 } },
+ { { AOM_ICDF(10496), AOM_ICDF(12758), AOM_ICDF(14790), AOM_ICDF(24547),
+ AOM_ICDF(26342), AOM_ICDF(26799), AOM_ICDF(27825), AOM_ICDF(28443),
+ AOM_ICDF(30217), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7040), AOM_ICDF(11462), AOM_ICDF(17121), AOM_ICDF(24215),
+ AOM_ICDF(26504), AOM_ICDF(27267), AOM_ICDF(28492), AOM_ICDF(29444),
+ AOM_ICDF(30846), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5376), AOM_ICDF(8158), AOM_ICDF(9215), AOM_ICDF(26451),
+ AOM_ICDF(27407), AOM_ICDF(27524), AOM_ICDF(27995), AOM_ICDF(28275),
+ AOM_ICDF(29767), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8704), AOM_ICDF(12652), AOM_ICDF(14145), AOM_ICDF(20101),
+ AOM_ICDF(22879), AOM_ICDF(23675), AOM_ICDF(25629), AOM_ICDF(27079),
+ AOM_ICDF(28923), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7424), AOM_ICDF(12374), AOM_ICDF(14366), AOM_ICDF(18855),
+ AOM_ICDF(23842), AOM_ICDF(24358), AOM_ICDF(25639), AOM_ICDF(27087),
+ AOM_ICDF(29706), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6400), AOM_ICDF(10005), AOM_ICDF(12939), AOM_ICDF(17753),
+ AOM_ICDF(22206), AOM_ICDF(24790), AOM_ICDF(26785), AOM_ICDF(28164),
+ AOM_ICDF(30520), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5632), AOM_ICDF(8176), AOM_ICDF(9713), AOM_ICDF(19053),
+ AOM_ICDF(22343), AOM_ICDF(23222), AOM_ICDF(24453), AOM_ICDF(25070),
+ AOM_ICDF(29761), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7040), AOM_ICDF(9754), AOM_ICDF(10833), AOM_ICDF(21229),
+ AOM_ICDF(23540), AOM_ICDF(23943), AOM_ICDF(24839), AOM_ICDF(25675),
+ AOM_ICDF(27033), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(11758), AOM_ICDF(13481), AOM_ICDF(17236),
+ AOM_ICDF(20210), AOM_ICDF(21768), AOM_ICDF(24303), AOM_ICDF(26948),
+ AOM_ICDF(28676), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4864), AOM_ICDF(12712), AOM_ICDF(14201), AOM_ICDF(23863),
+ AOM_ICDF(25952), AOM_ICDF(26386), AOM_ICDF(27632), AOM_ICDF(28635),
+ AOM_ICDF(30362), AOM_ICDF(32768), 0 } },
+ { { AOM_ICDF(13184), AOM_ICDF(15173), AOM_ICDF(17647), AOM_ICDF(21576),
+ AOM_ICDF(24474), AOM_ICDF(25267), AOM_ICDF(27699), AOM_ICDF(29283),
+ AOM_ICDF(30549), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7552), AOM_ICDF(11295), AOM_ICDF(18257), AOM_ICDF(20811),
+ AOM_ICDF(23213), AOM_ICDF(24606), AOM_ICDF(27731), AOM_ICDF(30407),
+ AOM_ICDF(31237), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7936), AOM_ICDF(10846), AOM_ICDF(12816), AOM_ICDF(22436),
+ AOM_ICDF(24614), AOM_ICDF(25130), AOM_ICDF(26890), AOM_ICDF(28199),
+ AOM_ICDF(29091), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8576), AOM_ICDF(11411), AOM_ICDF(13830), AOM_ICDF(15918),
+ AOM_ICDF(18996), AOM_ICDF(20044), AOM_ICDF(25114), AOM_ICDF(27835),
+ AOM_ICDF(28972), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13646), AOM_ICDF(15966),
+ AOM_ICDF(21162), AOM_ICDF(22012), AOM_ICDF(24701), AOM_ICDF(27506),
+ AOM_ICDF(29644), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(9423), AOM_ICDF(12524), AOM_ICDF(14773),
+ AOM_ICDF(19447), AOM_ICDF(22804), AOM_ICDF(26073), AOM_ICDF(29211),
+ AOM_ICDF(30642), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(8916), AOM_ICDF(11059), AOM_ICDF(15861),
+ AOM_ICDF(21174), AOM_ICDF(22338), AOM_ICDF(24620), AOM_ICDF(27071),
+ AOM_ICDF(30899), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(9856), AOM_ICDF(11557), AOM_ICDF(13960), AOM_ICDF(18525),
+ AOM_ICDF(21788), AOM_ICDF(22189), AOM_ICDF(24462), AOM_ICDF(26603),
+ AOM_ICDF(27470), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7808), AOM_ICDF(10636), AOM_ICDF(13143), AOM_ICDF(15844),
+ AOM_ICDF(18698), AOM_ICDF(20272), AOM_ICDF(24323), AOM_ICDF(30096),
+ AOM_ICDF(31787), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6016), AOM_ICDF(10928), AOM_ICDF(14596), AOM_ICDF(18926),
+ AOM_ICDF(21586), AOM_ICDF(22688), AOM_ICDF(26626), AOM_ICDF(29001),
+ AOM_ICDF(30399), AOM_ICDF(32768), 0 } },
+ { { AOM_ICDF(8832), AOM_ICDF(10983), AOM_ICDF(13451), AOM_ICDF(16582),
+ AOM_ICDF(21656), AOM_ICDF(23109), AOM_ICDF(24845), AOM_ICDF(26207),
+ AOM_ICDF(28796), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(10844), AOM_ICDF(15554), AOM_ICDF(18073),
+ AOM_ICDF(22954), AOM_ICDF(24901), AOM_ICDF(26776), AOM_ICDF(28649),
+ AOM_ICDF(30419), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5120), AOM_ICDF(8252), AOM_ICDF(10072), AOM_ICDF(20108),
+ AOM_ICDF(23535), AOM_ICDF(24346), AOM_ICDF(25761), AOM_ICDF(26418),
+ AOM_ICDF(28675), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7680), AOM_ICDF(11012), AOM_ICDF(12627), AOM_ICDF(14595),
+ AOM_ICDF(19462), AOM_ICDF(20888), AOM_ICDF(23348), AOM_ICDF(25703),
+ AOM_ICDF(28159), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6656), AOM_ICDF(9818), AOM_ICDF(11790), AOM_ICDF(13813),
+ AOM_ICDF(22731), AOM_ICDF(24737), AOM_ICDF(26557), AOM_ICDF(28061),
+ AOM_ICDF(29697), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5632), AOM_ICDF(8918), AOM_ICDF(11620), AOM_ICDF(13802),
+ AOM_ICDF(19950), AOM_ICDF(23764), AOM_ICDF(25734), AOM_ICDF(28537),
+ AOM_ICDF(31809), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4480), AOM_ICDF(6580), AOM_ICDF(7808), AOM_ICDF(12281),
+ AOM_ICDF(19375), AOM_ICDF(20970), AOM_ICDF(22860), AOM_ICDF(24602),
+ AOM_ICDF(29929), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7040), AOM_ICDF(9553), AOM_ICDF(11457), AOM_ICDF(15102),
+ AOM_ICDF(20291), AOM_ICDF(21280), AOM_ICDF(22985), AOM_ICDF(24475),
+ AOM_ICDF(26613), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6528), AOM_ICDF(10423), AOM_ICDF(12605), AOM_ICDF(14621),
+ AOM_ICDF(19031), AOM_ICDF(21505), AOM_ICDF(24585), AOM_ICDF(27558),
+ AOM_ICDF(29532), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6016), AOM_ICDF(11659), AOM_ICDF(14463), AOM_ICDF(18867),
+ AOM_ICDF(23653), AOM_ICDF(24903), AOM_ICDF(27115), AOM_ICDF(29389),
+ AOM_ICDF(31382), AOM_ICDF(32768), 0 } },
+ { { AOM_ICDF(8192), AOM_ICDF(10016), AOM_ICDF(13304), AOM_ICDF(16362),
+ AOM_ICDF(21107), AOM_ICDF(25165), AOM_ICDF(26620), AOM_ICDF(28901),
+ AOM_ICDF(30910), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5888), AOM_ICDF(8723), AOM_ICDF(16237), AOM_ICDF(18318),
+ AOM_ICDF(22002), AOM_ICDF(25923), AOM_ICDF(27394), AOM_ICDF(29934),
+ AOM_ICDF(31428), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4608), AOM_ICDF(7138), AOM_ICDF(9841), AOM_ICDF(18442),
+ AOM_ICDF(22447), AOM_ICDF(24618), AOM_ICDF(26337), AOM_ICDF(27945),
+ AOM_ICDF(30168), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6784), AOM_ICDF(8916), AOM_ICDF(12270), AOM_ICDF(14851),
+ AOM_ICDF(19886), AOM_ICDF(22759), AOM_ICDF(25105), AOM_ICDF(28368),
+ AOM_ICDF(29760), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5120), AOM_ICDF(7928), AOM_ICDF(11324), AOM_ICDF(13340),
+ AOM_ICDF(21205), AOM_ICDF(24224), AOM_ICDF(25926), AOM_ICDF(28518),
+ AOM_ICDF(30560), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4480), AOM_ICDF(6580), AOM_ICDF(10058), AOM_ICDF(11237),
+ AOM_ICDF(16807), AOM_ICDF(25937), AOM_ICDF(27218), AOM_ICDF(30015),
+ AOM_ICDF(31348), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4608), AOM_ICDF(6808), AOM_ICDF(9445), AOM_ICDF(12446),
+ AOM_ICDF(18461), AOM_ICDF(21835), AOM_ICDF(23244), AOM_ICDF(26109),
+ AOM_ICDF(30115), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5760), AOM_ICDF(7659), AOM_ICDF(10798), AOM_ICDF(14720),
+ AOM_ICDF(19157), AOM_ICDF(21955), AOM_ICDF(23645), AOM_ICDF(26460),
+ AOM_ICDF(28702), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5760), AOM_ICDF(8503), AOM_ICDF(11157), AOM_ICDF(13071),
+ AOM_ICDF(17594), AOM_ICDF(22047), AOM_ICDF(24099), AOM_ICDF(29077),
+ AOM_ICDF(30850), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4864), AOM_ICDF(9660), AOM_ICDF(14264), AOM_ICDF(17105),
+ AOM_ICDF(21528), AOM_ICDF(24094), AOM_ICDF(26025), AOM_ICDF(28580),
+ AOM_ICDF(30559), AOM_ICDF(32768), 0 } },
+ { { AOM_ICDF(9600), AOM_ICDF(11139), AOM_ICDF(12998), AOM_ICDF(18660),
+ AOM_ICDF(22158), AOM_ICDF(23501), AOM_ICDF(24659), AOM_ICDF(25736),
+ AOM_ICDF(30296), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7168), AOM_ICDF(11068), AOM_ICDF(15984), AOM_ICDF(19969),
+ AOM_ICDF(23169), AOM_ICDF(24704), AOM_ICDF(26216), AOM_ICDF(27572),
+ AOM_ICDF(31368), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4480), AOM_ICDF(6801), AOM_ICDF(8018), AOM_ICDF(20908),
+ AOM_ICDF(23071), AOM_ICDF(23583), AOM_ICDF(24301), AOM_ICDF(25062),
+ AOM_ICDF(29427), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7168), AOM_ICDF(10068), AOM_ICDF(11753), AOM_ICDF(15843),
+ AOM_ICDF(19742), AOM_ICDF(21358), AOM_ICDF(23809), AOM_ICDF(26189),
+ AOM_ICDF(29067), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6016), AOM_ICDF(9047), AOM_ICDF(10622), AOM_ICDF(13931),
+ AOM_ICDF(22462), AOM_ICDF(23858), AOM_ICDF(25911), AOM_ICDF(27277),
+ AOM_ICDF(29722), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5888), AOM_ICDF(7568), AOM_ICDF(9931), AOM_ICDF(13533),
+ AOM_ICDF(18431), AOM_ICDF(22063), AOM_ICDF(23777), AOM_ICDF(26025),
+ AOM_ICDF(30555), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4352), AOM_ICDF(6239), AOM_ICDF(7379), AOM_ICDF(13739),
+ AOM_ICDF(16917), AOM_ICDF(18090), AOM_ICDF(18835), AOM_ICDF(19651),
+ AOM_ICDF(30360), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6528), AOM_ICDF(8988), AOM_ICDF(10288), AOM_ICDF(15534),
+ AOM_ICDF(19495), AOM_ICDF(20386), AOM_ICDF(21934), AOM_ICDF(23034),
+ AOM_ICDF(26988), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7040), AOM_ICDF(10055), AOM_ICDF(11652), AOM_ICDF(14757),
+ AOM_ICDF(19622), AOM_ICDF(21715), AOM_ICDF(23615), AOM_ICDF(26761),
+ AOM_ICDF(29483), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4736), AOM_ICDF(10102), AOM_ICDF(12315), AOM_ICDF(19078),
+ AOM_ICDF(21348), AOM_ICDF(22621), AOM_ICDF(24246), AOM_ICDF(26044),
+ AOM_ICDF(29931), AOM_ICDF(32768), 0 } },
+ { { AOM_ICDF(10496), AOM_ICDF(12410), AOM_ICDF(14955), AOM_ICDF(19891),
+ AOM_ICDF(23137), AOM_ICDF(23792), AOM_ICDF(25159), AOM_ICDF(26378),
+ AOM_ICDF(28125), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7936), AOM_ICDF(12204), AOM_ICDF(17104), AOM_ICDF(20191),
+ AOM_ICDF(23468), AOM_ICDF(24630), AOM_ICDF(26156), AOM_ICDF(27628),
+ AOM_ICDF(28913), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6016), AOM_ICDF(8629), AOM_ICDF(10232), AOM_ICDF(23591),
+ AOM_ICDF(25349), AOM_ICDF(25637), AOM_ICDF(26306), AOM_ICDF(27063),
+ AOM_ICDF(28980), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8704), AOM_ICDF(12088), AOM_ICDF(13461), AOM_ICDF(16646),
+ AOM_ICDF(20516), AOM_ICDF(21455), AOM_ICDF(24062), AOM_ICDF(26579),
+ AOM_ICDF(28368), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7296), AOM_ICDF(11177), AOM_ICDF(13117), AOM_ICDF(16196),
+ AOM_ICDF(23378), AOM_ICDF(24708), AOM_ICDF(26440), AOM_ICDF(27997),
+ AOM_ICDF(29078), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6272), AOM_ICDF(9377), AOM_ICDF(12575), AOM_ICDF(15616),
+ AOM_ICDF(20919), AOM_ICDF(23697), AOM_ICDF(26603), AOM_ICDF(27566),
+ AOM_ICDF(29903), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6528), AOM_ICDF(9091), AOM_ICDF(10478), AOM_ICDF(16445),
+ AOM_ICDF(21081), AOM_ICDF(22320), AOM_ICDF(23871), AOM_ICDF(25087),
+ AOM_ICDF(29258), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(8704), AOM_ICDF(11148), AOM_ICDF(12499), AOM_ICDF(17340),
+ AOM_ICDF(20656), AOM_ICDF(21288), AOM_ICDF(22588), AOM_ICDF(23701),
+ AOM_ICDF(24693), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7552), AOM_ICDF(11394), AOM_ICDF(12980), AOM_ICDF(15562),
+ AOM_ICDF(19942), AOM_ICDF(21792), AOM_ICDF(25093), AOM_ICDF(28211),
+ AOM_ICDF(28959), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5120), AOM_ICDF(11708), AOM_ICDF(13847), AOM_ICDF(19377),
+ AOM_ICDF(22421), AOM_ICDF(23160), AOM_ICDF(25449), AOM_ICDF(27136),
+ AOM_ICDF(29182), AOM_ICDF(32768), 0 } },
+ { { AOM_ICDF(9984), AOM_ICDF(12031), AOM_ICDF(15190), AOM_ICDF(18673),
+ AOM_ICDF(21422), AOM_ICDF(22812), AOM_ICDF(25690), AOM_ICDF(29118),
+ AOM_ICDF(30458), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6144), AOM_ICDF(9680), AOM_ICDF(17436), AOM_ICDF(19610),
+ AOM_ICDF(21820), AOM_ICDF(23485), AOM_ICDF(26313), AOM_ICDF(30826),
+ AOM_ICDF(31843), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6016), AOM_ICDF(8315), AOM_ICDF(10607), AOM_ICDF(19333),
+ AOM_ICDF(21572), AOM_ICDF(22553), AOM_ICDF(25266), AOM_ICDF(27288),
+ AOM_ICDF(28551), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7168), AOM_ICDF(9668), AOM_ICDF(12646), AOM_ICDF(16257),
+ AOM_ICDF(19648), AOM_ICDF(20899), AOM_ICDF(25304), AOM_ICDF(30465),
+ AOM_ICDF(31625), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6144), AOM_ICDF(9368), AOM_ICDF(11836), AOM_ICDF(14130),
+ AOM_ICDF(19153), AOM_ICDF(21157), AOM_ICDF(24876), AOM_ICDF(28452),
+ AOM_ICDF(29396), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5504), AOM_ICDF(8486), AOM_ICDF(11996), AOM_ICDF(14412),
+ AOM_ICDF(17968), AOM_ICDF(21814), AOM_ICDF(24424), AOM_ICDF(30682),
+ AOM_ICDF(32059), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5376), AOM_ICDF(7195), AOM_ICDF(9592), AOM_ICDF(13331),
+ AOM_ICDF(17569), AOM_ICDF(19460), AOM_ICDF(22371), AOM_ICDF(25458),
+ AOM_ICDF(28942), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7424), AOM_ICDF(9206), AOM_ICDF(11783), AOM_ICDF(16456),
+ AOM_ICDF(19253), AOM_ICDF(20390), AOM_ICDF(23775), AOM_ICDF(27007),
+ AOM_ICDF(28425), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5888), AOM_ICDF(8303), AOM_ICDF(11361), AOM_ICDF(13440),
+ AOM_ICDF(15848), AOM_ICDF(17549), AOM_ICDF(21532), AOM_ICDF(29564),
+ AOM_ICDF(30665), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4608), AOM_ICDF(8788), AOM_ICDF(13284), AOM_ICDF(16621),
+ AOM_ICDF(18983), AOM_ICDF(20286), AOM_ICDF(24577), AOM_ICDF(28960),
+ AOM_ICDF(30314), AOM_ICDF(32768), 0 } },
+ { { AOM_ICDF(8320), AOM_ICDF(15005), AOM_ICDF(19168), AOM_ICDF(24282),
+ AOM_ICDF(26707), AOM_ICDF(27402), AOM_ICDF(28681), AOM_ICDF(29639),
+ AOM_ICDF(30629), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5632), AOM_ICDF(13900), AOM_ICDF(22376), AOM_ICDF(24867),
+ AOM_ICDF(26804), AOM_ICDF(27734), AOM_ICDF(29130), AOM_ICDF(30722),
+ AOM_ICDF(31465), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(4992), AOM_ICDF(9115), AOM_ICDF(11055), AOM_ICDF(24893),
+ AOM_ICDF(26316), AOM_ICDF(26661), AOM_ICDF(27663), AOM_ICDF(28301),
+ AOM_ICDF(29418), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(7424), AOM_ICDF(12077), AOM_ICDF(14987), AOM_ICDF(19596),
+ AOM_ICDF(22615), AOM_ICDF(23600), AOM_ICDF(26465), AOM_ICDF(28484),
+ AOM_ICDF(29789), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6272), AOM_ICDF(11447), AOM_ICDF(14362), AOM_ICDF(18204),
+ AOM_ICDF(23418), AOM_ICDF(24715), AOM_ICDF(26697), AOM_ICDF(28547),
+ AOM_ICDF(29520), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5248), AOM_ICDF(10946), AOM_ICDF(15379), AOM_ICDF(18167),
+ AOM_ICDF(22197), AOM_ICDF(25432), AOM_ICDF(27295), AOM_ICDF(30031),
+ AOM_ICDF(30576), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5120), AOM_ICDF(9008), AOM_ICDF(11607), AOM_ICDF(18210),
+ AOM_ICDF(22327), AOM_ICDF(23427), AOM_ICDF(24887), AOM_ICDF(26580),
+ AOM_ICDF(29892), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(6656), AOM_ICDF(10124), AOM_ICDF(12689), AOM_ICDF(19922),
+ AOM_ICDF(22480), AOM_ICDF(22807), AOM_ICDF(24441), AOM_ICDF(25579),
+ AOM_ICDF(26787), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5376), AOM_ICDF(10084), AOM_ICDF(13983), AOM_ICDF(17113),
+ AOM_ICDF(19996), AOM_ICDF(21614), AOM_ICDF(24403), AOM_ICDF(28651),
+ AOM_ICDF(29938), AOM_ICDF(32768), 0 },
+ { AOM_ICDF(5504), AOM_ICDF(14131), AOM_ICDF(17989), AOM_ICDF(23324),
+ AOM_ICDF(25513), AOM_ICDF(26071), AOM_ICDF(27850), AOM_ICDF(29464),
+ AOM_ICDF(30393), AOM_ICDF(32768), 0 } },
+#endif // CONFIG_ALT_INTRA
+ };
+#endif // CONFIG_EC_MULTISYMBOL
+
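+// Seed a frame context with the compiled-in default probabilities and CDFs
+// defined above; called from av1_setup_past_independence() below. Each
+// CONFIG_* block copies the defaults for the corresponding coding tool.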
+static void init_mode_probs(FRAME_CONTEXT *fc) {
+ av1_copy(fc->uv_mode_prob, default_uv_probs);
+ av1_copy(fc->y_mode_prob, default_if_y_probs);
+ av1_copy(fc->switchable_interp_prob, default_switchable_interp_prob);
+ av1_copy(fc->partition_prob, default_partition_probs);
+ av1_copy(fc->intra_inter_prob, default_intra_inter_p);
+ av1_copy(fc->comp_inter_prob, default_comp_inter_p);
+ av1_copy(fc->comp_ref_prob, default_comp_ref_p);
+#if CONFIG_LV_MAP
+ av1_copy(fc->txb_skip, default_txb_skip);
+ av1_copy(fc->nz_map, default_nz_map);
+ av1_copy(fc->eob_flag, default_eob_flag);
+ av1_copy(fc->dc_sign, default_dc_sign);
+ av1_copy(fc->coeff_base, default_coeff_base);
+ av1_copy(fc->coeff_lps, default_coeff_lps);
+#endif // CONFIG_LV_MAP
+#if CONFIG_EXT_REFS
+ av1_copy(fc->comp_bwdref_prob, default_comp_bwdref_p);
+#endif // CONFIG_EXT_REFS
+ av1_copy(fc->single_ref_prob, default_single_ref_p);
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ av1_copy(fc->comp_inter_mode_prob, default_comp_inter_mode_p);
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ av1_copy(fc->tx_size_probs, default_tx_size_prob);
+#if CONFIG_VAR_TX
+ av1_copy(fc->txfm_partition_prob, default_txfm_partition_probs);
+#endif
+ av1_copy(fc->skip_probs, default_skip_probs);
+#if CONFIG_REF_MV
+ av1_copy(fc->newmv_prob, default_newmv_prob);
+ av1_copy(fc->zeromv_prob, default_zeromv_prob);
+ av1_copy(fc->refmv_prob, default_refmv_prob);
+ av1_copy(fc->drl_prob, default_drl_prob);
+#endif // CONFIG_REF_MV
+ av1_copy(fc->inter_mode_probs, default_inter_mode_probs);
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ av1_copy(fc->motion_mode_prob, default_motion_mode_prob);
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ av1_copy(fc->obmc_prob, default_obmc_prob);
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_EXT_INTER
+ av1_copy(fc->inter_compound_mode_probs, default_inter_compound_mode_probs);
+#if CONFIG_COMPOUND_SINGLEREF
+ av1_copy(fc->inter_singleref_comp_mode_probs,
+ default_inter_singleref_comp_mode_probs);
+#endif // CONFIG_COMPOUND_SINGLEREF
+ av1_copy(fc->compound_type_prob, default_compound_type_probs);
+ av1_copy(fc->interintra_prob, default_interintra_prob);
+ av1_copy(fc->interintra_mode_prob, default_interintra_mode_prob);
+ av1_copy(fc->wedge_interintra_prob, default_wedge_interintra_prob);
+#endif // CONFIG_EXT_INTER
+#if CONFIG_SUPERTX
+ av1_copy(fc->supertx_prob, default_supertx_prob);
+#endif // CONFIG_SUPERTX
+ av1_copy(fc->seg.tree_probs, default_segment_tree_probs);
+ av1_copy(fc->seg.pred_probs, default_segment_pred_probs);
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+ av1_copy(fc->intra_filter_probs, default_intra_filter_probs);
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ av1_copy(fc->filter_intra_probs, default_filter_intra_probs);
+#endif // CONFIG_FILTER_INTRA
+ av1_copy(fc->inter_ext_tx_prob, default_inter_ext_tx_prob);
+ av1_copy(fc->intra_ext_tx_prob, default_intra_ext_tx_prob);
+#if CONFIG_LOOP_RESTORATION
+ av1_copy(fc->switchable_restore_prob, default_switchable_restore_prob);
+#endif // CONFIG_LOOP_RESTORATION
+#if CONFIG_EC_MULTISYMBOL
+ av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf);
+ av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf);
+ av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf);
+ av1_copy(fc->partition_cdf, default_partition_cdf);
+ av1_copy(fc->inter_mode_cdf, default_inter_mode_cdf);
+ av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf);
+ av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf);
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ av1_copy(fc->intra_filter_cdf, default_intra_filter_cdf);
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ av1_copy(fc->seg.tree_cdf, default_seg_tree_cdf);
+ av1_copy(fc->tx_size_cdf, default_tx_size_cdf);
+#endif // CONFIG_EC_MULTISYMBOL
+#if CONFIG_DELTA_Q
+ av1_copy(fc->delta_q_prob, default_delta_q_probs);
+#if CONFIG_EC_MULTISYMBOL
+ av1_copy(fc->delta_q_cdf, default_delta_q_cdf);
+#endif // CONFIG_EC_MULTISYMBOL
+#if CONFIG_EXT_DELTA_Q
+ av1_copy(fc->delta_lf_prob, default_delta_lf_probs);
+#if CONFIG_EC_MULTISYMBOL
+ av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf);
+#endif // CONFIG_EC_MULTISYMBOL
+#endif // CONFIG_EXT_DELTA_Q
+#endif // CONFIG_DELTA_Q
+}
+
+#if CONFIG_EC_MULTISYMBOL
+int av1_switchable_interp_ind[SWITCHABLE_FILTERS];
+int av1_switchable_interp_inv[SWITCHABLE_FILTERS];
+
+#if !CONFIG_EC_ADAPT
+void av1_set_mode_cdfs(struct AV1Common *cm) {
+ FRAME_CONTEXT *fc = cm->fc;
+ int i, j;
+ if (cm->seg.enabled && cm->seg.update_map) {
+ av1_tree_to_cdf(av1_segment_tree, cm->fc->seg.tree_probs,
+ cm->fc->seg.tree_cdf);
+ }
+
+ for (i = 0; i < INTRA_MODES; ++i)
+ av1_tree_to_cdf(av1_intra_mode_tree, fc->uv_mode_prob[i],
+ fc->uv_mode_cdf[i]);
+#if CONFIG_EXT_PARTITION_TYPES
+ for (i = 0; i < PARTITION_PLOFFSET; ++i)
+ av1_tree_to_cdf(av1_partition_tree, fc->partition_prob[i],
+ fc->partition_cdf[i]);
+ // Logical index (enum value) to inorder index (tree_to_cdf order)
+ aom_cdf_prob inorder_partition_cdf[CDF_SIZE(EXT_PARTITION_TYPES)] = { 0 };
+ // TODO(aconverse): Generate this dynamically. The assumptions that
+ // av1_indices_from_tree() makes don't hold for this tree.
+ static const uint8_t av1_ext_partition_index_map[EXT_PARTITION_TYPES] = {
+ 0, 1, 4, 7, 2, 3, 5, 6,
+ };
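+ // av1_tree_to_cdf() emits the CDF in tree (inorder) symbol order, so
+ // rebuild the enum-ordered CDF by accumulating each symbol's probability,
+ // looked up at the inorder position given by av1_ext_partition_index_map.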
+ for (; i < PARTITION_CONTEXTS; ++i) {
+ av1_tree_to_cdf(av1_ext_partition_tree, fc->partition_prob[i],
+ inorder_partition_cdf);
+ aom_cdf_prob cum_prob = 0;
+ for (j = 0; j < EXT_PARTITION_TYPES; ++j) {
+ int inorder_idx = av1_ext_partition_index_map[j];
+ aom_cdf_prob prob =
+ AOM_ICDF(inorder_partition_cdf[inorder_idx]) -
+ (inorder_idx > 0 ? AOM_ICDF(inorder_partition_cdf[inorder_idx - 1])
+ : 0);
+ cum_prob += prob;
+ fc->partition_cdf[i][j] = AOM_ICDF(cum_prob);
+ }
+ assert(cum_prob == CDF_PROB_TOP);
+ }
+#else
+ for (i = 0; i < PARTITION_CONTEXTS; ++i)
+ av1_tree_to_cdf(av1_partition_tree, fc->partition_prob[i],
+ fc->partition_cdf[i]);
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+ for (i = 0; i < INTRA_MODES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_tree_to_cdf(av1_intra_mode_tree, cm->kf_y_prob[i][j],
+ cm->fc->kf_y_cdf[i][j]);
+
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ av1_tree_to_cdf(av1_switchable_interp_tree, fc->switchable_interp_prob[j],
+ fc->switchable_interp_cdf[j]);
+
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_tree_to_cdf(av1_inter_mode_tree, fc->inter_mode_probs[i],
+ fc->inter_mode_cdf[i]);
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_tree_to_cdf(av1_intra_mode_tree, fc->y_mode_prob[i], fc->y_mode_cdf[i]);
+
+#if CONFIG_EXT_TX
+ int s;
+ for (s = 0; s < EXT_TX_SETS_INTRA; ++s)
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_tree_to_cdf(av1_ext_tx_intra_tree[s],
+ fc->intra_ext_tx_prob[s][i][j],
+ fc->intra_ext_tx_cdf[s][i][j]);
+
+ for (s = 0; s < EXT_TX_SETS_INTER; ++s)
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i)
+ av1_tree_to_cdf(av1_ext_tx_inter_tree[s], fc->inter_ext_tx_prob[s][i],
+ fc->inter_ext_tx_cdf[s][i]);
+#else
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i)
+ for (j = 0; j < TX_TYPES; ++j)
+ av1_tree_to_cdf(av1_ext_tx_tree, fc->intra_ext_tx_prob[i][j],
+ fc->intra_ext_tx_cdf[i][j]);
+
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i)
+ av1_tree_to_cdf(av1_ext_tx_tree, fc->inter_ext_tx_prob[i],
+ fc->inter_ext_tx_cdf[i]);
+#endif // CONFIG_EXT_TX
+ for (i = 0; i < MAX_TX_DEPTH; i++) {
+ for (j = 0; j < TX_SIZE_CONTEXTS; j++) {
+ av1_tree_to_cdf(av1_tx_size_tree[i], fc->tx_size_probs[i][j],
+ fc->tx_size_cdf[i][j]);
+ }
+ }
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ for (i = 0; i < INTRA_FILTERS + 1; ++i) {
+ av1_tree_to_cdf(av1_intra_filter_tree, fc->intra_filter_probs[i],
+ fc->intra_filter_cdf[i]);
+ }
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+}
+#endif // !CONFIG_EC_ADAPT
+#endif // CONFIG_EC_MULTISYMBOL
+
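+// aom_tree_index encoding: tree[i] and tree[i + 1] are the two children of a
+// node; a non-negative entry is the index of the next node pair, a negative
+// entry is a leaf whose negation is the coded symbol. A decoder walks it
+// roughly as (read_bit() is a stand-in for the entropy decoder):
+//   i = 0; while ((i = tree[i + read_bit()]) > 0) {} symbol = -i;
+// E.g. in the non-dual-filter tree below, a first bit of 0 yields
+// EIGHTTAP_REGULAR, otherwise the pair at index 2 selects between
+// EIGHTTAP_SMOOTH and MULTITAP_SHARP.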
+#if CONFIG_DUAL_FILTER
+const aom_tree_index av1_switchable_interp_tree[TREE_SIZE(SWITCHABLE_FILTERS)] =
+ {
+ -EIGHTTAP_REGULAR, 2, 4, -MULTITAP_SHARP, -EIGHTTAP_SMOOTH,
+ -EIGHTTAP_SMOOTH2,
+ };
+#else
+const aom_tree_index av1_switchable_interp_tree[TREE_SIZE(SWITCHABLE_FILTERS)] =
+ { -EIGHTTAP_REGULAR, 2, -EIGHTTAP_SMOOTH, -MULTITAP_SHARP };
+#endif // CONFIG_DUAL_FILTER
+
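+// Backward adaptation for inter-frame symbols: each probability in the
+// active context cm->fc is re-derived by merging the saved frame context
+// (pre_fc) with the symbol counts gathered while coding the frame, via the
+// binary and tree merge_probs helpers.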
+void av1_adapt_inter_frame_probs(AV1_COMMON *cm) {
+ int i, j;
+ FRAME_CONTEXT *fc = cm->fc;
+ const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ const FRAME_COUNTS *counts = &cm->counts;
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+ fc->intra_inter_prob[i] = av1_mode_mv_merge_probs(
+ pre_fc->intra_inter_prob[i], counts->intra_inter[i]);
+
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+ fc->comp_inter_prob[i] = av1_mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
+ counts->comp_inter[i]);
+
+#if CONFIG_EXT_REFS
+ for (i = 0; i < REF_CONTEXTS; i++)
+ for (j = 0; j < (FWD_REFS - 1); j++)
+ fc->comp_ref_prob[i][j] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i][j],
+ counts->comp_ref[i][j]);
+ for (i = 0; i < REF_CONTEXTS; i++)
+ for (j = 0; j < (BWD_REFS - 1); j++)
+ fc->comp_bwdref_prob[i][j] = mode_mv_merge_probs(
+ pre_fc->comp_bwdref_prob[i][j], counts->comp_bwdref[i][j]);
+#else
+ for (i = 0; i < REF_CONTEXTS; i++)
+ for (j = 0; j < (COMP_REFS - 1); j++)
+ fc->comp_ref_prob[i][j] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i][j],
+ counts->comp_ref[i][j]);
+#endif // CONFIG_EXT_REFS
+
+ for (i = 0; i < REF_CONTEXTS; i++)
+ for (j = 0; j < (SINGLE_REFS - 1); j++)
+ fc->single_ref_prob[i][j] = av1_mode_mv_merge_probs(
+ pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
+
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ for (i = 0; i < COMP_INTER_MODE_CONTEXTS; i++)
+ fc->comp_inter_mode_prob[i] = av1_mode_mv_merge_probs(
+ pre_fc->comp_inter_mode_prob[i], counts->comp_inter_mode[i]);
+
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+
+#if CONFIG_REF_MV
+ for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
+ fc->newmv_prob[i] =
+ av1_mode_mv_merge_probs(pre_fc->newmv_prob[i], counts->newmv_mode[i]);
+ for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
+ fc->zeromv_prob[i] =
+ av1_mode_mv_merge_probs(pre_fc->zeromv_prob[i], counts->zeromv_mode[i]);
+ for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
+ fc->refmv_prob[i] =
+ av1_mode_mv_merge_probs(pre_fc->refmv_prob[i], counts->refmv_mode[i]);
+
+ for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
+ fc->drl_prob[i] =
+ av1_mode_mv_merge_probs(pre_fc->drl_prob[i], counts->drl_mode[i]);
+#else
+ for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+ aom_tree_merge_probs(av1_inter_mode_tree, pre_fc->inter_mode_probs[i],
+ counts->inter_mode[i], fc->inter_mode_probs[i]);
+#endif // CONFIG_REF_MV
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i)
+ aom_tree_merge_probs(av1_motion_mode_tree, pre_fc->motion_mode_prob[i],
+ counts->motion_mode[i], fc->motion_mode_prob[i]);
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i)
+ fc->obmc_prob[i] =
+ av1_mode_mv_merge_probs(pre_fc->obmc_prob[i], counts->obmc[i]);
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_SUPERTX
+ for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+ for (j = TX_8X8; j < TX_SIZES; ++j) {
+ fc->supertx_prob[i][j] = av1_mode_mv_merge_probs(
+ pre_fc->supertx_prob[i][j], counts->supertx[i][j]);
+ }
+ }
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_EXT_INTER
+ for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+ aom_tree_merge_probs(
+ av1_inter_compound_mode_tree, pre_fc->inter_compound_mode_probs[i],
+ counts->inter_compound_mode[i], fc->inter_compound_mode_probs[i]);
+#if CONFIG_COMPOUND_SINGLEREF
+ for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+ aom_tree_merge_probs(av1_inter_singleref_comp_mode_tree,
+ pre_fc->inter_singleref_comp_mode_probs[i],
+ counts->inter_singleref_comp_mode[i],
+ fc->inter_singleref_comp_mode_probs[i]);
+#endif // CONFIG_COMPOUND_SINGLEREF
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+ if (is_interintra_allowed_bsize_group(i))
+ fc->interintra_prob[i] = av1_mode_mv_merge_probs(
+ pre_fc->interintra_prob[i], counts->interintra[i]);
+ }
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ aom_tree_merge_probs(
+ av1_interintra_mode_tree, pre_fc->interintra_mode_prob[i],
+ counts->interintra_mode[i], fc->interintra_mode_prob[i]);
+ }
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
+ fc->wedge_interintra_prob[i] = av1_mode_mv_merge_probs(
+ pre_fc->wedge_interintra_prob[i], counts->wedge_interintra[i]);
+ }
+
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ aom_tree_merge_probs(av1_compound_type_tree, pre_fc->compound_type_prob[i],
+ counts->compound_interinter[i],
+ fc->compound_type_prob[i]);
+ }
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+ aom_tree_merge_probs(av1_intra_mode_tree, pre_fc->y_mode_prob[i],
+ counts->y_mode[i], fc->y_mode_prob[i]);
+
+ if (cm->interp_filter == SWITCHABLE) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ aom_tree_merge_probs(
+ av1_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
+ counts->switchable_interp[i], fc->switchable_interp_prob[i]);
+ }
+}
+
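+// Backward adaptation for the intra-side symbols (tx sizes, skip flags,
+// ext-tx types, segmentation, partitions), using the same count-merging
+// scheme as the inter adaptation above.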
+void av1_adapt_intra_frame_probs(AV1_COMMON *cm) {
+ int i, j;
+ FRAME_CONTEXT *fc = cm->fc;
+ const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ const FRAME_COUNTS *counts = &cm->counts;
+
+ if (cm->tx_mode == TX_MODE_SELECT) {
+ for (i = 0; i < MAX_TX_DEPTH; ++i) {
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ aom_tree_merge_probs(av1_tx_size_tree[i], pre_fc->tx_size_probs[i][j],
+ counts->tx_size[i][j], fc->tx_size_probs[i][j]);
+ }
+ }
+
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT) {
+ for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i)
+ fc->txfm_partition_prob[i] = av1_mode_mv_merge_probs(
+ pre_fc->txfm_partition_prob[i], counts->txfm_partition[i]);
+ }
+#endif // CONFIG_VAR_TX
+
+ for (i = 0; i < SKIP_CONTEXTS; ++i)
+ fc->skip_probs[i] =
+ av1_mode_mv_merge_probs(pre_fc->skip_probs[i], counts->skip[i]);
+
+#if CONFIG_EXT_TX
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ if (use_inter_ext_tx_for_txsize[s][i]) {
+ aom_tree_merge_probs(
+ av1_ext_tx_inter_tree[s], pre_fc->inter_ext_tx_prob[s][i],
+ counts->inter_ext_tx[s][i], fc->inter_ext_tx_prob[s][i]);
+ }
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ if (use_intra_ext_tx_for_txsize[s][i]) {
+ for (j = 0; j < INTRA_MODES; ++j)
+ aom_tree_merge_probs(
+ av1_ext_tx_intra_tree[s], pre_fc->intra_ext_tx_prob[s][i][j],
+ counts->intra_ext_tx[s][i][j], fc->intra_ext_tx_prob[s][i][j]);
+ }
+ }
+ }
+#else
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ for (j = 0; j < TX_TYPES; ++j) {
+ aom_tree_merge_probs(av1_ext_tx_tree, pre_fc->intra_ext_tx_prob[i][j],
+ counts->intra_ext_tx[i][j],
+ fc->intra_ext_tx_prob[i][j]);
+ }
+ }
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ aom_tree_merge_probs(av1_ext_tx_tree, pre_fc->inter_ext_tx_prob[i],
+ counts->inter_ext_tx[i], fc->inter_ext_tx_prob[i]);
+ }
+#endif // CONFIG_EXT_TX
+
+ if (cm->seg.temporal_update) {
+ for (i = 0; i < PREDICTION_PROBS; i++)
+ fc->seg.pred_probs[i] = av1_mode_mv_merge_probs(pre_fc->seg.pred_probs[i],
+ counts->seg.pred[i]);
+
+ aom_tree_merge_probs(av1_segment_tree, pre_fc->seg.tree_probs,
+ counts->seg.tree_mispred, fc->seg.tree_probs);
+ } else {
+ aom_tree_merge_probs(av1_segment_tree, pre_fc->seg.tree_probs,
+ counts->seg.tree_total, fc->seg.tree_probs);
+ }
+
+ for (i = 0; i < INTRA_MODES; ++i)
+ aom_tree_merge_probs(av1_intra_mode_tree, pre_fc->uv_mode_prob[i],
+ counts->uv_mode[i], fc->uv_mode_prob[i]);
+
+#if CONFIG_EXT_PARTITION_TYPES
+ for (i = 0; i < PARTITION_PLOFFSET; ++i)
+ aom_tree_merge_probs(av1_partition_tree, pre_fc->partition_prob[i],
+ counts->partition[i], fc->partition_prob[i]);
+ for (; i < PARTITION_CONTEXTS_PRIMARY; ++i)
+ aom_tree_merge_probs(av1_ext_partition_tree, pre_fc->partition_prob[i],
+ counts->partition[i], fc->partition_prob[i]);
+#else
+ for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i) {
+ aom_tree_merge_probs(av1_partition_tree, pre_fc->partition_prob[i],
+ counts->partition[i], fc->partition_prob[i]);
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_UNPOISON_PARTITION_CTX
+ for (i = PARTITION_CONTEXTS_PRIMARY;
+ i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) {
+ unsigned int ct[2] = { counts->partition[i][PARTITION_VERT],
+ counts->partition[i][PARTITION_SPLIT] };
+ assert(counts->partition[i][PARTITION_NONE] == 0);
+ assert(counts->partition[i][PARTITION_HORZ] == 0);
+ assert(fc->partition_prob[i][PARTITION_NONE] == 0);
+ assert(fc->partition_prob[i][PARTITION_HORZ] == 0);
+ fc->partition_prob[i][PARTITION_VERT] =
+ av1_mode_mv_merge_probs(pre_fc->partition_prob[i][PARTITION_VERT], ct);
+ }
+ for (i = PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES;
+ i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) {
+ unsigned int ct[2] = { counts->partition[i][PARTITION_HORZ],
+ counts->partition[i][PARTITION_SPLIT] };
+ assert(counts->partition[i][PARTITION_NONE] == 0);
+ assert(counts->partition[i][PARTITION_VERT] == 0);
+ assert(fc->partition_prob[i][PARTITION_NONE] == 0);
+ assert(fc->partition_prob[i][PARTITION_VERT] == 0);
+ fc->partition_prob[i][PARTITION_HORZ] =
+ av1_mode_mv_merge_probs(pre_fc->partition_prob[i][PARTITION_HORZ], ct);
+ }
+#endif // CONFIG_UNPOISON_PARTITION_CTX
+#if CONFIG_DELTA_Q
+ for (i = 0; i < DELTA_Q_PROBS; ++i)
+ fc->delta_q_prob[i] =
+ mode_mv_merge_probs(pre_fc->delta_q_prob[i], counts->delta_q[i]);
+#if CONFIG_EXT_DELTA_Q
+ for (i = 0; i < DELTA_LF_PROBS; ++i)
+ fc->delta_lf_prob[i] =
+ mode_mv_merge_probs(pre_fc->delta_lf_prob[i], counts->delta_lf[i]);
+#endif // CONFIG_EXT_DELTA_Q
+#endif // CONFIG_DELTA_Q
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+ for (i = 0; i < INTRA_FILTERS + 1; ++i) {
+ aom_tree_merge_probs(av1_intra_filter_tree, pre_fc->intra_filter_probs[i],
+ counts->intra_filter[i], fc->intra_filter_probs[i]);
+ }
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ fc->filter_intra_probs[i] = av1_mode_mv_merge_probs(
+ pre_fc->filter_intra_probs[i], counts->filter_intra[i]);
+ }
+#endif // CONFIG_FILTER_INTRA
+}
+
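+// Default loop-filter deltas: raise the filter level slightly for intra
+// blocks (+1) and lower it for GOLDEN/ALTREF references (-1); all other
+// deltas start at zero.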
+static void set_default_lf_deltas(struct loopfilter *lf) {
+ lf->mode_ref_delta_enabled = 1;
+ lf->mode_ref_delta_update = 1;
+
+ lf->ref_deltas[INTRA_FRAME] = 1;
+ lf->ref_deltas[LAST_FRAME] = 0;
+#if CONFIG_EXT_REFS
+ lf->ref_deltas[LAST2_FRAME] = lf->ref_deltas[LAST_FRAME];
+ lf->ref_deltas[LAST3_FRAME] = lf->ref_deltas[LAST_FRAME];
+ lf->ref_deltas[BWDREF_FRAME] = lf->ref_deltas[LAST_FRAME];
+#endif // CONFIG_EXT_REFS
+ lf->ref_deltas[GOLDEN_FRAME] = -1;
+ lf->ref_deltas[ALTREF_FRAME] = -1;
+
+ lf->mode_deltas[0] = 0;
+ lf->mode_deltas[1] = 0;
+}
+
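+// Reset all coding state that must not persist across an independence point
+// (key frame, error-resilient frame, or an explicit frame-context reset):
+// segmentation data, loop-filter deltas, and every probability/CDF table,
+// re-seeded from the compiled-in defaults above.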
+void av1_setup_past_independence(AV1_COMMON *cm) {
+ // Reset the segment feature data to its default state:
+ // all features disabled, zero-valued, with delta coding.
+ struct loopfilter *const lf = &cm->lf;
+
+ int i;
+ av1_clearall_segfeatures(&cm->seg);
+ cm->seg.abs_delta = SEGMENT_DELTADATA;
+
+ if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+ memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+ if (cm->current_frame_seg_map)
+ memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+ // Reset the mode ref deltas for loop filter
+ av1_zero(lf->last_ref_deltas);
+ av1_zero(lf->last_mode_deltas);
+ set_default_lf_deltas(lf);
+
+ // Force an update of the sharpness level.
+ lf->last_sharpness_level = -1;
+
+ av1_default_coef_probs(cm);
+ init_mode_probs(cm->fc);
+ av1_init_mv_probs(cm);
+#if CONFIG_PVQ
+ av1_default_pvq_probs(cm);
+#endif // CONFIG_PVQ
+#if CONFIG_ADAPT_SCAN
+ av1_init_scan_order(cm);
+#endif
+ av1_convolve_init(cm);
+ cm->fc->initialized = 1;
+
+ if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
+ cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
+ // Reset all frame contexts.
+ for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
+ } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
+ // Reset only the frame context specified in the frame header.
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+ }
+
+ // prev_mip is only allocated in the encoder.
+ if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode)
+ memset(cm->prev_mip, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
+
+ cm->frame_context_idx = 0;
+}
diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h
new file mode 100644
index 000000000..9c3a78d61
--- /dev/null
+++ b/third_party/aom/av1/common/entropymode.h
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_ENTROPYMODE_H_
+#define AV1_COMMON_ENTROPYMODE_H_
+
+#include "av1/common/entropy.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/filter.h"
+#include "av1/common/seg_common.h"
+#include "aom_dsp/aom_filter.h"
+
+#if CONFIG_PVQ
+#include "av1/common/pvq.h"
+#include "av1/common/pvq_state.h"
+#include "av1/common/generic_code.h"
+#endif // CONFIG_PVQ
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLOCK_SIZE_GROUPS 4
+
+#define TX_SIZE_CONTEXTS 2
+
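+// Map a mode enum to a 0-based index into the probability and count tables
+// below by subtracting the first mode of its group.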
+#define INTER_OFFSET(mode) ((mode)-NEARESTMV)
+#if CONFIG_EXT_INTER
+#if CONFIG_COMPOUND_SINGLEREF
+#define INTER_SINGLEREF_COMP_OFFSET(mode) ((mode)-SR_NEAREST_NEARMV)
+#endif // CONFIG_COMPOUND_SINGLEREF
+#define INTER_COMPOUND_OFFSET(mode) ((mode)-NEAREST_NEARESTMV)
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_PALETTE
+// Number of possible contexts for a color index.
+// As can be seen from av1_get_palette_color_index_context(), the possible
+// contexts are (2,0,0), (2,2,1), (3,2,0), (4,1,0), (5,0,0). These are mapped to
+// a value from 0 to 4 using the 'palette_color_index_context_lookup' table.
+#define PALETTE_COLOR_INDEX_CONTEXTS 5
+
+// Maximum number of colors in a palette.
+#define PALETTE_MAX_SIZE 8
+// Minimum number of colors in a palette.
+#define PALETTE_MIN_SIZE 2
+
+// Palette mode is available for block sizes >= 8x8.
+#define PALETTE_BLOCK_SIZES (BLOCK_LARGEST - BLOCK_8X8 + 1)
+
+// Palette Y mode context for a block is determined by the number of
+// neighboring blocks (top and/or left) using a palette for the Y plane. So,
+// the possible Y mode context values are:
+// 0 if neither left nor top block uses palette for Y plane,
+// 1 if exactly one of left or top block uses palette for Y plane, and
+// 2 if both left and top blocks use palette for Y plane.
+#define PALETTE_Y_MODE_CONTEXTS 3
+
+// Palette UV mode context for a block is determined by whether this block
+// uses a palette for the Y plane. So, the possible values are:
+// 0 if this block doesn't use palette for Y plane.
+// 1 if this block uses palette for Y plane (i.e. Y palette size > 0).
+#define PALETTE_UV_MODE_CONTEXTS 2
+
+#define PALETTE_MAX_BLOCK_SIZE (64 * 64)
+#endif // CONFIG_PALETTE
+
+#if CONFIG_INTRABC
+#define INTRABC_PROB 192
+#endif // CONFIG_INTRABC
+
+struct AV1Common;
+
+typedef struct {
+ const int16_t *scan;
+ const int16_t *iscan;
+ const int16_t *neighbors;
+} SCAN_ORDER;
+
+struct seg_counts {
+ unsigned int tree_total[MAX_SEGMENTS];
+ unsigned int tree_mispred[MAX_SEGMENTS];
+ unsigned int pred[PREDICTION_PROBS][2];
+};
+
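+// All per-frame symbol probabilities and, under CONFIG_EC_MULTISYMBOL, their
+// CDF counterparts. AV1_COMMON keeps FRAME_CONTEXTS saved copies of this
+// struct; the active copy (cm->fc) is merged with the frame's FRAME_COUNTS
+// by the av1_adapt_*_frame_probs() routines in entropymode.c.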
+typedef struct frame_contexts {
+ aom_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
+ aom_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+#if CONFIG_EXT_PARTITION_TYPES
+ aom_prob partition_prob[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1];
+#else
+ aom_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+#endif
+ av1_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
+#if CONFIG_NEW_TOKENSET
+ coeff_cdf_model coef_tail_cdfs[TX_SIZES][PLANE_TYPES];
+ coeff_cdf_model coef_head_cdfs[TX_SIZES][PLANE_TYPES];
+ aom_prob blockzero_probs[TX_SIZES][PLANE_TYPES][REF_TYPES][BLOCKZ_CONTEXTS];
+#elif CONFIG_EC_MULTISYMBOL
+ coeff_cdf_model coef_cdfs[TX_SIZES][PLANE_TYPES];
+#endif // CONFIG_NEW_TOKENSET
+ aom_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS - 1];
+#if CONFIG_ADAPT_SCAN
+// TODO(angiebird): try aom_prob
+#if CONFIG_CB4X4
+ uint32_t non_zero_prob_2x2[TX_TYPES][4];
+#endif
+ uint32_t non_zero_prob_4X4[TX_TYPES][16];
+ uint32_t non_zero_prob_8X8[TX_TYPES][64];
+ uint32_t non_zero_prob_16X16[TX_TYPES][256];
+ uint32_t non_zero_prob_32X32[TX_TYPES][1024];
+
+ uint32_t non_zero_prob_4X8[TX_TYPES][32];
+ uint32_t non_zero_prob_8X4[TX_TYPES][32];
+ uint32_t non_zero_prob_16X8[TX_TYPES][128];
+ uint32_t non_zero_prob_8X16[TX_TYPES][128];
+ uint32_t non_zero_prob_32X16[TX_TYPES][512];
+ uint32_t non_zero_prob_16X32[TX_TYPES][512];
+
+#if CONFIG_CB4X4
+ DECLARE_ALIGNED(16, int16_t, scan_2x2[TX_TYPES][4]);
+#endif
+ DECLARE_ALIGNED(16, int16_t, scan_4X4[TX_TYPES][16]);
+ DECLARE_ALIGNED(16, int16_t, scan_8X8[TX_TYPES][64]);
+ DECLARE_ALIGNED(16, int16_t, scan_16X16[TX_TYPES][256]);
+ DECLARE_ALIGNED(16, int16_t, scan_32X32[TX_TYPES][1024]);
+
+ DECLARE_ALIGNED(16, int16_t, scan_4X8[TX_TYPES][32]);
+ DECLARE_ALIGNED(16, int16_t, scan_8X4[TX_TYPES][32]);
+ DECLARE_ALIGNED(16, int16_t, scan_8X16[TX_TYPES][128]);
+ DECLARE_ALIGNED(16, int16_t, scan_16X8[TX_TYPES][128]);
+ DECLARE_ALIGNED(16, int16_t, scan_16X32[TX_TYPES][512]);
+ DECLARE_ALIGNED(16, int16_t, scan_32X16[TX_TYPES][512]);
+
+#if CONFIG_CB4X4
+ DECLARE_ALIGNED(16, int16_t, iscan_2x2[TX_TYPES][4]);
+#endif
+ DECLARE_ALIGNED(16, int16_t, iscan_4X4[TX_TYPES][16]);
+ DECLARE_ALIGNED(16, int16_t, iscan_8X8[TX_TYPES][64]);
+ DECLARE_ALIGNED(16, int16_t, iscan_16X16[TX_TYPES][256]);
+ DECLARE_ALIGNED(16, int16_t, iscan_32X32[TX_TYPES][1024]);
+
+ DECLARE_ALIGNED(16, int16_t, iscan_4X8[TX_TYPES][32]);
+ DECLARE_ALIGNED(16, int16_t, iscan_8X4[TX_TYPES][32]);
+ DECLARE_ALIGNED(16, int16_t, iscan_8X16[TX_TYPES][128]);
+ DECLARE_ALIGNED(16, int16_t, iscan_16X8[TX_TYPES][128]);
+ DECLARE_ALIGNED(16, int16_t, iscan_16X32[TX_TYPES][512]);
+ DECLARE_ALIGNED(16, int16_t, iscan_32X16[TX_TYPES][512]);
+
+#if CONFIG_CB4X4
+ int16_t nb_2x2[TX_TYPES][(4 + 1) * 2];
+#endif
+ int16_t nb_4X4[TX_TYPES][(16 + 1) * 2];
+ int16_t nb_8X8[TX_TYPES][(64 + 1) * 2];
+ int16_t nb_16X16[TX_TYPES][(256 + 1) * 2];
+ int16_t nb_32X32[TX_TYPES][(1024 + 1) * 2];
+
+ int16_t nb_4X8[TX_TYPES][(32 + 1) * 2];
+ int16_t nb_8X4[TX_TYPES][(32 + 1) * 2];
+ int16_t nb_8X16[TX_TYPES][(128 + 1) * 2];
+ int16_t nb_16X8[TX_TYPES][(128 + 1) * 2];
+ int16_t nb_16X32[TX_TYPES][(512 + 1) * 2];
+ int16_t nb_32X16[TX_TYPES][(512 + 1) * 2];
+
+ SCAN_ORDER sc[TX_SIZES_ALL][TX_TYPES];
+
+ int16_t eob_threshold[TX_SIZES_ALL][TX_TYPES][EOB_THRESHOLD_NUM];
+#endif // CONFIG_ADAPT_SCAN
+
+#if CONFIG_LV_MAP
+ aom_prob txb_skip[TX_SIZES][TXB_SKIP_CONTEXTS];
+ aom_prob nz_map[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS];
+ aom_prob eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS];
+ aom_prob dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS];
+ aom_prob coeff_base[TX_SIZES][PLANE_TYPES][NUM_BASE_LEVELS]
+ [COEFF_BASE_CONTEXTS];
+ aom_prob coeff_lps[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS];
+#endif // CONFIG_LV_MAP
+
+#if CONFIG_REF_MV
+ aom_prob newmv_prob[NEWMV_MODE_CONTEXTS];
+ aom_prob zeromv_prob[ZEROMV_MODE_CONTEXTS];
+ aom_prob refmv_prob[REFMV_MODE_CONTEXTS];
+ aom_prob drl_prob[DRL_MODE_CONTEXTS];
+#endif // CONFIG_REF_MV
+
+ aom_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
+#if CONFIG_EXT_INTER
+ aom_prob inter_compound_mode_probs[INTER_MODE_CONTEXTS]
+ [INTER_COMPOUND_MODES - 1];
+#if CONFIG_COMPOUND_SINGLEREF
+ aom_prob inter_singleref_comp_mode_probs[INTER_MODE_CONTEXTS]
+ [INTER_SINGLEREF_COMP_MODES - 1];
+#endif // CONFIG_COMPOUND_SINGLEREF
+ aom_prob compound_type_prob[BLOCK_SIZES][COMPOUND_TYPES - 1];
+ aom_prob interintra_prob[BLOCK_SIZE_GROUPS];
+ aom_prob interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1];
+ aom_prob wedge_interintra_prob[BLOCK_SIZES];
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ aom_prob motion_mode_prob[BLOCK_SIZES][MOTION_MODES - 1];
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ aom_prob obmc_prob[BLOCK_SIZES];
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ aom_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
+ aom_prob comp_inter_prob[COMP_INTER_CONTEXTS];
+ aom_prob single_ref_prob[REF_CONTEXTS][SINGLE_REFS - 1];
+#if CONFIG_EXT_REFS
+ aom_prob comp_ref_prob[REF_CONTEXTS][FWD_REFS - 1];
+ aom_prob comp_bwdref_prob[REF_CONTEXTS][BWD_REFS - 1];
+#else
+ aom_prob comp_ref_prob[REF_CONTEXTS][COMP_REFS - 1];
+#endif // CONFIG_EXT_REFS
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ aom_prob comp_inter_mode_prob[COMP_INTER_MODE_CONTEXTS];
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ aom_prob tx_size_probs[MAX_TX_DEPTH][TX_SIZE_CONTEXTS][MAX_TX_DEPTH];
+#if CONFIG_VAR_TX
+ aom_prob txfm_partition_prob[TXFM_PARTITION_CONTEXTS];
+#endif
+ aom_prob skip_probs[SKIP_CONTEXTS];
+#if CONFIG_REF_MV
+ nmv_context nmvc[NMV_CONTEXTS];
+#else
+ nmv_context nmvc;
+#endif
+#if CONFIG_INTRABC
+ nmv_context ndvc;
+#endif
+ int initialized;
+#if CONFIG_EXT_TX
+ aom_prob inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1];
+ aom_prob intra_ext_tx_prob[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES - 1];
+#else
+ aom_prob intra_ext_tx_prob[EXT_TX_SIZES][TX_TYPES][TX_TYPES - 1];
+ aom_prob inter_ext_tx_prob[EXT_TX_SIZES][TX_TYPES - 1];
+#endif // CONFIG_EXT_TX
+#if CONFIG_SUPERTX
+ aom_prob supertx_prob[PARTITION_SUPERTX_CONTEXTS][TX_SIZES];
+#endif // CONFIG_SUPERTX
+ struct segmentation_probs seg;
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+ aom_prob intra_filter_probs[INTRA_FILTERS + 1][INTRA_FILTERS - 1];
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ aom_prob filter_intra_probs[PLANE_TYPES];
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_GLOBAL_MOTION
+ aom_prob global_motion_types_prob[GLOBAL_TRANS_TYPES - 1];
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_LOOP_RESTORATION
+ aom_prob switchable_restore_prob[RESTORE_SWITCHABLE_TYPES - 1];
+#endif // CONFIG_LOOP_RESTORATION
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)];
+ aom_cdf_prob uv_mode_cdf[INTRA_MODES][CDF_SIZE(INTRA_MODES)];
+#if CONFIG_EXT_PARTITION_TYPES
+ aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)];
+#else
+ aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(PARTITION_TYPES)];
+#endif
+ aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS]
+ [CDF_SIZE(SWITCHABLE_FILTERS)];
+ aom_cdf_prob inter_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE(INTER_MODES)];
+ /* Keep track of kf_y_cdf here, as this makes handling
+ multiple copies for adaptation in tiles easier */
+ aom_cdf_prob kf_y_cdf[INTRA_MODES][INTRA_MODES][CDF_SIZE(INTRA_MODES)];
+ aom_cdf_prob tx_size_cdf[MAX_TX_DEPTH][TX_SIZE_CONTEXTS]
+ [CDF_SIZE(MAX_TX_DEPTH + 1)];
+#if CONFIG_DELTA_Q
+ aom_cdf_prob delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)];
+#if CONFIG_EXT_DELTA_Q
+ aom_cdf_prob delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)];
+#endif // CONFIG_EXT_DELTA_Q
+#endif // CONFIG_DELTA_Q
+#if CONFIG_EXT_TX
+ aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [CDF_SIZE(TX_TYPES)];
+ aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES]
+ [CDF_SIZE(TX_TYPES)];
+#else
+ aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SIZES][TX_TYPES][CDF_SIZE(TX_TYPES)];
+ aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)];
+#endif // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ aom_cdf_prob intra_filter_cdf[INTRA_FILTERS + 1][CDF_SIZE(INTRA_FILTERS)];
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+#endif // CONFIG_EC_MULTISYMBOL
+#if CONFIG_DELTA_Q
+ aom_prob delta_q_prob[DELTA_Q_PROBS];
+#if CONFIG_EXT_DELTA_Q
+ aom_prob delta_lf_prob[DELTA_LF_PROBS];
+#endif // CONFIG_EXT_DELTA_Q
+#endif // CONFIG_DELTA_Q
+#if CONFIG_PVQ
+ // TODO(any): If PVQ is enabled, most of the coefficient-related cdfs, such
+ // as coef_cdfs[], coef_tail_cdfs[], and coef_head_cdfs[], can be removed.
+ od_adapt_ctx pvq_context;
+#endif // CONFIG_PVQ
+} FRAME_CONTEXT;
+
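+// Raw symbol counts accumulated while coding a single frame; these feed the
+// backward-adaptation merges in entropymode.c.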
+typedef struct FRAME_COUNTS {
+ // Note: This structure should only contain 'unsigned int' fields, or
+ // aggregates built solely from 'unsigned int' fields/elements
+ unsigned int kf_y_mode[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+ unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
+ unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
+#if CONFIG_EXT_PARTITION_TYPES
+ unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+#else
+ unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
+#endif
+ av1_coeff_count_model coef[TX_SIZES][PLANE_TYPES];
+ unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS]
+ [COEFF_CONTEXTS];
+ unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];
+#if CONFIG_ADAPT_SCAN
+#if CONFIG_CB4X4
+ unsigned int non_zero_count_2x2[TX_TYPES][4];
+#endif // CONFIG_CB4X4
+ unsigned int non_zero_count_4X4[TX_TYPES][16];
+ unsigned int non_zero_count_8X8[TX_TYPES][64];
+ unsigned int non_zero_count_16X16[TX_TYPES][256];
+ unsigned int non_zero_count_32X32[TX_TYPES][1024];
+
+ unsigned int non_zero_count_4x8[TX_TYPES][32];
+ unsigned int non_zero_count_8x4[TX_TYPES][32];
+ unsigned int non_zero_count_8x16[TX_TYPES][128];
+ unsigned int non_zero_count_16x8[TX_TYPES][128];
+ unsigned int non_zero_count_16x32[TX_TYPES][512];
+ unsigned int non_zero_count_32x16[TX_TYPES][512];
+
+ unsigned int txb_count[TX_SIZES_ALL][TX_TYPES];
+#endif // CONFIG_ADAPT_SCAN
+
+#if CONFIG_LV_MAP
+ unsigned int txb_skip[TX_SIZES][TXB_SKIP_CONTEXTS][2];
+ unsigned int nz_map[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS][2];
+ unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2];
+ unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2];
+ unsigned int coeff_base[TX_SIZES][PLANE_TYPES][NUM_BASE_LEVELS]
+ [COEFF_BASE_CONTEXTS][2];
+ unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS][2];
+#endif // CONFIG_LV_MAP
+
+#if CONFIG_EC_MULTISYMBOL
+ av1_blockz_count_model blockz_count[TX_SIZES][PLANE_TYPES];
+#endif
+
+#if CONFIG_REF_MV
+ unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
+ unsigned int zeromv_mode[ZEROMV_MODE_CONTEXTS][2];
+ unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
+ unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
+#endif
+
+ unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
+#if CONFIG_EXT_INTER
+ unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+#if CONFIG_COMPOUND_SINGLEREF
+ unsigned int inter_singleref_comp_mode[INTER_MODE_CONTEXTS]
+ [INTER_SINGLEREF_COMP_MODES];
+#endif // CONFIG_COMPOUND_SINGLEREF
+ unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+ unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ unsigned int wedge_interintra[BLOCK_SIZES][2];
+ unsigned int compound_interinter[BLOCK_SIZES][COMPOUND_TYPES];
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ unsigned int motion_mode[BLOCK_SIZES][MOTION_MODES];
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ unsigned int obmc[BLOCK_SIZES][2];
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+ unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+ unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2];
+#if CONFIG_EXT_REFS
+ unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2];
+ unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2];
+#else
+ unsigned int comp_ref[REF_CONTEXTS][COMP_REFS - 1][2];
+#endif // CONFIG_EXT_REFS
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ unsigned int comp_inter_mode[COMP_INTER_MODE_CONTEXTS][2];
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ // TODO(any): tx_size_totals is only used by the encoder to decide whether
+ // to use forward updates for the coeff probs, and as such it does not really
+ // belong in this structure.
+ unsigned int tx_size_totals[TX_SIZES];
+ unsigned int tx_size[MAX_TX_DEPTH][TX_SIZE_CONTEXTS][TX_SIZES];
+#if CONFIG_VAR_TX
+ unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+#endif
+ unsigned int skip[SKIP_CONTEXTS][2];
+#if CONFIG_REF_MV
+ nmv_context_counts mv[NMV_CONTEXTS];
+#else
+ nmv_context_counts mv;
+#endif
+#if CONFIG_INTRABC
+ nmv_context_counts dv;
+#endif
+#if CONFIG_DELTA_Q
+ unsigned int delta_q[DELTA_Q_PROBS][2];
+#if CONFIG_EXT_DELTA_Q
+ unsigned int delta_lf[DELTA_LF_PROBS][2];
+#endif
+#endif
+#if CONFIG_EXT_TX
+#if CONFIG_RECT_TX
+ unsigned int tx_size_implied[TX_SIZES][TX_SIZES];
+#endif // CONFIG_RECT_TX
+ unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+#else
+ unsigned int intra_ext_tx[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
+ unsigned int inter_ext_tx[EXT_TX_SIZES][TX_TYPES];
+#endif // CONFIG_EXT_TX
+#if CONFIG_SUPERTX
+ unsigned int supertx[PARTITION_SUPERTX_CONTEXTS][TX_SIZES][2];
+ unsigned int supertx_size[TX_SIZES];
+#endif // CONFIG_SUPERTX
+ struct seg_counts seg;
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+ unsigned int intra_filter[INTRA_FILTERS + 1][INTRA_FILTERS];
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ unsigned int filter_intra[PLANE_TYPES][2];
+#endif // CONFIG_FILTER_INTRA
+} FRAME_COUNTS;
+
+// Default probabilities for signaling Intra mode for Y plane -- used only for
+// intra-only frames. ('default_if_y_probs' is used for inter frames).
+// Contexts used: Intra mode (Y plane) of 'above' and 'left' blocks.
+extern const aom_prob av1_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
+ [INTRA_MODES - 1];
+#if CONFIG_EC_MULTISYMBOL
+// CDF version of 'av1_kf_y_mode_prob'.
+extern const aom_cdf_prob av1_kf_y_mode_cdf[INTRA_MODES][INTRA_MODES]
+ [CDF_SIZE(INTRA_MODES)];
+#endif
+
+#if CONFIG_PALETTE
+extern const aom_prob av1_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES]
+ [PALETTE_Y_MODE_CONTEXTS];
+extern const aom_prob
+ av1_default_palette_uv_mode_prob[PALETTE_UV_MODE_CONTEXTS];
+extern const aom_prob av1_default_palette_y_size_prob[PALETTE_BLOCK_SIZES]
+ [PALETTE_SIZES - 1];
+extern const aom_prob av1_default_palette_uv_size_prob[PALETTE_BLOCK_SIZES]
+ [PALETTE_SIZES - 1];
+extern const aom_prob av1_default_palette_y_color_index_prob
+ [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS - 1];
+extern const aom_prob av1_default_palette_uv_color_index_prob
+ [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS - 1];
+#endif // CONFIG_PALETTE
+
+extern const aom_tree_index av1_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
+extern const aom_tree_index av1_inter_mode_tree[TREE_SIZE(INTER_MODES)];
+#if CONFIG_EC_MULTISYMBOL
+extern int av1_intra_mode_ind[INTRA_MODES];
+extern int av1_intra_mode_inv[INTRA_MODES];
+extern int av1_inter_mode_ind[INTER_MODES];
+extern int av1_inter_mode_inv[INTER_MODES];
+#if CONFIG_EXT_TX
+extern int av1_ext_tx_intra_ind[EXT_TX_SETS_INTRA][TX_TYPES];
+extern int av1_ext_tx_intra_inv[EXT_TX_SETS_INTRA][TX_TYPES];
+extern int av1_ext_tx_inter_ind[EXT_TX_SETS_INTER][TX_TYPES];
+extern int av1_ext_tx_inter_inv[EXT_TX_SETS_INTER][TX_TYPES];
+#endif
+#endif
+
+#if CONFIG_EXT_INTER
+extern const aom_tree_index
+ av1_interintra_mode_tree[TREE_SIZE(INTERINTRA_MODES)];
+extern const aom_tree_index
+ av1_inter_compound_mode_tree[TREE_SIZE(INTER_COMPOUND_MODES)];
+#if CONFIG_COMPOUND_SINGLEREF
+extern const aom_tree_index
+ av1_inter_singleref_comp_mode_tree[TREE_SIZE(INTER_SINGLEREF_COMP_MODES)];
+#endif // CONFIG_COMPOUND_SINGLEREF
+extern const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)];
+#endif // CONFIG_EXT_INTER
+extern const aom_tree_index av1_partition_tree[TREE_SIZE(PARTITION_TYPES)];
+#if CONFIG_EXT_PARTITION_TYPES
+extern const aom_tree_index
+ av1_ext_partition_tree[TREE_SIZE(EXT_PARTITION_TYPES)];
+#endif
+extern const aom_tree_index
+ av1_switchable_interp_tree[TREE_SIZE(SWITCHABLE_FILTERS)];
+#if CONFIG_PALETTE
+extern const aom_tree_index av1_palette_size_tree[TREE_SIZE(PALETTE_SIZES)];
+extern const aom_tree_index
+ av1_palette_color_index_tree[PALETTE_SIZES][TREE_SIZE(PALETTE_COLORS)];
+#endif // CONFIG_PALETTE
+extern const aom_tree_index av1_tx_size_tree[MAX_TX_DEPTH][TREE_SIZE(TX_SIZES)];
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+extern const aom_tree_index av1_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)];
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+#if CONFIG_EXT_TX
+extern const aom_tree_index av1_ext_tx_inter_tree[EXT_TX_SETS_INTER]
+ [TREE_SIZE(TX_TYPES)];
+extern const aom_tree_index av1_ext_tx_intra_tree[EXT_TX_SETS_INTRA]
+ [TREE_SIZE(TX_TYPES)];
+#else
+extern const aom_tree_index av1_ext_tx_tree[TREE_SIZE(TX_TYPES)];
+#endif // CONFIG_EXT_TX
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+extern const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)];
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_LOOP_RESTORATION
+#define RESTORE_NONE_SGRPROJ_PROB 64
+#define RESTORE_NONE_BILATERAL_PROB 16
+#define RESTORE_NONE_WIENER_PROB 64
+#define RESTORE_NONE_DOMAINTXFMRF_PROB 64
+extern const aom_tree_index
+ av1_switchable_restore_tree[TREE_SIZE(RESTORE_SWITCHABLE_TYPES)];
+#endif // CONFIG_LOOP_RESTORATION
+#if CONFIG_EC_MULTISYMBOL
+extern int av1_switchable_interp_ind[SWITCHABLE_FILTERS];
+extern int av1_switchable_interp_inv[SWITCHABLE_FILTERS];
+
+void av1_set_mode_cdfs(struct AV1Common *cm);
+#endif
+
+void av1_setup_past_independence(struct AV1Common *cm);
+
+void av1_adapt_intra_frame_probs(struct AV1Common *cm);
+void av1_adapt_inter_frame_probs(struct AV1Common *cm);
+#if CONFIG_EC_MULTISYMBOL && !CONFIG_EXT_TX
+extern int av1_ext_tx_ind[TX_TYPES];
+extern int av1_ext_tx_inv[TX_TYPES];
+#endif
+
+static INLINE int av1_ceil_log2(int n) {
+ int i = 1, p = 2;
+ while (p < n) {
+ i++;
+ p = p << 1;
+ }
+ return i;
+}
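+// E.g. av1_ceil_log2(8) == 3 and av1_ceil_log2(9) == 4; note that n <= 1
+// also returns 1, so the result only equals ceil(log2(n)) for n >= 2.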
+
+#if CONFIG_PALETTE
+// Returns the context for palette color index at row 'r' and column 'c',
+// along with the 'color_order' of neighbors and the 'color_idx'.
+// The 'color_map' is a 2D array with the given 'stride'.
+int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
+ int r, int c, int palette_size,
+ uint8_t *color_order, int *color_idx);
+#endif // CONFIG_PALETTE
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_ENTROPYMODE_H_
diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c
new file mode 100644
index 000000000..9c162d2c5
--- /dev/null
+++ b/third_party/aom/av1/common/entropymv.c
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/entropymv.h"
+
+// Integer-pel reference MV threshold for using high-precision 1/8-pel MVs
+#define COMPANDED_MVREF_THRESH 8
+
+const aom_tree_index av1_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
+ -MV_JOINT_ZERO, 2, -MV_JOINT_HNZVZ, 4, -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
+};
+
+/* clang-format off */
+const aom_tree_index av1_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
+ -MV_CLASS_0, 2,
+ -MV_CLASS_1, 4,
+ 6, 8,
+ -MV_CLASS_2, -MV_CLASS_3,
+ 10, 12,
+ -MV_CLASS_4, -MV_CLASS_5,
+ -MV_CLASS_6, 14,
+ 16, 18,
+ -MV_CLASS_7, -MV_CLASS_8,
+ -MV_CLASS_9, -MV_CLASS_10,
+};
+/* clang-format on */
+
+const aom_tree_index av1_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
+ -0, -1,
+};
+
+const aom_tree_index av1_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1,
+ 4, -2, -3 };
+
+static const nmv_context default_nmv_context = {
+ { 32, 64, 96 }, // joints
+#if CONFIG_EC_MULTISYMBOL
+ { AOM_ICDF(4096), AOM_ICDF(11264), AOM_ICDF(19328), AOM_ICDF(32768),
+ 0 }, // joint_cdf
+#endif
+ { {
+ // Vertical component
+ 128, // sign
+ { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 }, // class
+#if CONFIG_EC_MULTISYMBOL
+ { AOM_ICDF(28672), AOM_ICDF(30976), AOM_ICDF(31858), AOM_ICDF(32320),
+ AOM_ICDF(32551), AOM_ICDF(32656), AOM_ICDF(32740), AOM_ICDF(32757),
+ AOM_ICDF(32762), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, // class_cdf
+#endif
+ { 216 }, // class0
+ { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, // bits
+ { { 128, 128, 64 }, { 96, 112, 64 } }, // class0_fp
+ { 64, 96, 64 }, // fp
+#if CONFIG_EC_MULTISYMBOL
+ { { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(26624), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(12288), AOM_ICDF(21248), AOM_ICDF(24128), AOM_ICDF(32768),
+ 0 } }, // class0_fp_cdf
+ { AOM_ICDF(8192), AOM_ICDF(17408), AOM_ICDF(21248), AOM_ICDF(32768),
+ 0 }, // fp_cdf
+#endif
+ 160, // class0_hp bit
+ 128, // hp
+ },
+ {
+ // Horizontal component
+ 128, // sign
+ { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 }, // class
+#if CONFIG_EC_MULTISYMBOL
+ { AOM_ICDF(28672), AOM_ICDF(30976), AOM_ICDF(31858), AOM_ICDF(32320),
+ AOM_ICDF(32551), AOM_ICDF(32656), AOM_ICDF(32740), AOM_ICDF(32757),
+ AOM_ICDF(32762), AOM_ICDF(32767), AOM_ICDF(32768), 0 }, // class_cdf
+#endif
+ { 208 }, // class0
+ { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, // bits
+ { { 128, 128, 64 }, { 96, 112, 64 } }, // class0_fp
+ { 64, 96, 64 }, // fp
+#if CONFIG_EC_MULTISYMBOL
+ { { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(26624), AOM_ICDF(32768),
+ 0 },
+ { AOM_ICDF(12288), AOM_ICDF(21248), AOM_ICDF(24128), AOM_ICDF(32768),
+ 0 } }, // class0_fp_cdf
+ { AOM_ICDF(8192), AOM_ICDF(17408), AOM_ICDF(21248), AOM_ICDF(32768),
+ 0 }, // fp_cdf
+#endif
+ 160, // class0_hp bit
+ 128, // hp
+ } },
+};
+
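+// Lookup table giving floor(log2(n)) for n >= 1 (entry 0 is defined as 0).
+// av1_get_mv_class() below indexes it with z >> 3, where z is the MV
+// magnitude minus one in 1/8-pel units.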
+static const uint8_t log_in_base_2[] = {
+ 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
+};
+
+#if CONFIG_GLOBAL_MOTION
+#if GLOBAL_TRANS_TYPES == 7 // All models
+const aom_tree_index av1_global_motion_types_tree[TREE_SIZE(
+ GLOBAL_TRANS_TYPES)] = { -IDENTITY, 2, -TRANSLATION, 4,
+ -ROTZOOM, 6, -AFFINE, 8,
+ -HOMOGRAPHY, 10, -HORTRAPEZOID, -VERTRAPEZOID };
+
+static const aom_prob default_global_motion_types_prob[GLOBAL_TRANS_TYPES - 1] =
+ { 224, 128, 192, 192, 32, 128 };
+
+#elif GLOBAL_TRANS_TYPES == 6 // Do not allow full homography
+const aom_tree_index
+ av1_global_motion_types_tree[TREE_SIZE(GLOBAL_TRANS_TYPES)] = {
+ -IDENTITY, 2, -TRANSLATION, 4, -ROTZOOM, 6, -AFFINE, 8, -HORTRAPEZOID,
+ -VERTRAPEZOID
+ };
+
+static const aom_prob default_global_motion_types_prob[GLOBAL_TRANS_TYPES - 1] =
+ { 224, 128, 192, 192, 128 };
+
+#elif GLOBAL_TRANS_TYPES == 4 // Up to affine
+const aom_tree_index av1_global_motion_types_tree[TREE_SIZE(
+ GLOBAL_TRANS_TYPES)] = { -IDENTITY, 2, -TRANSLATION, 4, -ROTZOOM, -AFFINE };
+
+static const aom_prob default_global_motion_types_prob[GLOBAL_TRANS_TYPES - 1] =
+ { 224, 128, 240 };
+
+#elif GLOBAL_TRANS_TYPES == 3 // Up to rotation-zoom
+
+const aom_tree_index av1_global_motion_types_tree[TREE_SIZE(
+ GLOBAL_TRANS_TYPES)] = { -IDENTITY, 2, -TRANSLATION, -ROTZOOM };
+
+static const aom_prob default_global_motion_types_prob[GLOBAL_TRANS_TYPES - 1] =
+ { 224, 128 };
+#endif // GLOBAL_TRANS_TYPES
+#endif // CONFIG_GLOBAL_MOTION
+
+static INLINE int mv_class_base(MV_CLASS_TYPE c) {
+ return c ? CLASS0_SIZE << (c + 2) : 0;
+}
+
+MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) {
+ const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
+ ? MV_CLASS_10
+ : (MV_CLASS_TYPE)log_in_base_2[z >> 3];
+ if (offset) *offset = z - mv_class_base(c);
+ return c;
+}
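+// Example: z == 20 (a 1/8-pel magnitude of 21) gives log_in_base_2[20 >> 3]
+// == 1, i.e. MV_CLASS_1 with base mv_class_base(1) == 16, so *offset == 4.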
+
+static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr,
+ int usehp) {
+ int s, z, c, o, d, e, f;
+ assert(v != 0); /* should not be zero */
+ s = v < 0;
+ comp_counts->sign[s] += incr;
+ z = (s ? -v : v) - 1; /* magnitude - 1 */
+
+ c = av1_get_mv_class(z, &o);
+ comp_counts->classes[c] += incr;
+
+ d = (o >> 3); /* int mv data */
+ f = (o >> 1) & 3; /* fractional pel mv data */
+ e = (o & 1); /* high precision mv data */
+
+ if (c == MV_CLASS_0) {
+ comp_counts->class0[d] += incr;
+ comp_counts->class0_fp[d][f] += incr;
+ if (usehp) comp_counts->class0_hp[e] += incr;
+ } else {
+ int i;
+ int b = c + CLASS0_BITS - 1; // number of bits
+ for (i = 0; i < b; ++i) comp_counts->bits[i][((d >> i) & 1)] += incr;
+ comp_counts->fp[f] += incr;
+ if (usehp) comp_counts->hp[e] += incr;
+ }
+}
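+// For example, an offset o == 4 splits into d == 0 (integer-pel part),
+// f == 2 (1/4-pel fraction) and e == 0 (1/8-pel bit), i.e. a half-pel
+// position.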
+
+void av1_inc_mv(const MV *mv, nmv_context_counts *counts, const int usehp) {
+ if (counts != NULL) {
+ const MV_JOINT_TYPE j = av1_get_mv_joint(mv);
+ ++counts->joints[j];
+
+ if (mv_joint_vertical(j))
+ inc_mv_component(mv->row, &counts->comps[0], 1, usehp);
+
+ if (mv_joint_horizontal(j))
+ inc_mv_component(mv->col, &counts->comps[1], 1, usehp);
+ }
+}
+
+void av1_adapt_mv_probs(AV1_COMMON *cm, int allow_hp) {
+ int i, j;
+#if CONFIG_REF_MV
+ int idx;
+ for (idx = 0; idx < NMV_CONTEXTS; ++idx) {
+ nmv_context *fc = &cm->fc->nmvc[idx];
+ const nmv_context *pre_fc =
+ &cm->frame_contexts[cm->frame_context_idx].nmvc[idx];
+ const nmv_context_counts *counts = &cm->counts.mv[idx];
+#else
+ nmv_context *fc = &cm->fc->nmvc;
+ const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
+ const nmv_context_counts *counts = &cm->counts.mv;
+#endif // CONFIG_REF_MV
+ aom_tree_merge_probs(av1_mv_joint_tree, pre_fc->joints, counts->joints,
+ fc->joints);
+ for (i = 0; i < 2; ++i) {
+ nmv_component *comp = &fc->comps[i];
+ const nmv_component *pre_comp = &pre_fc->comps[i];
+ const nmv_component_counts *c = &counts->comps[i];
+
+ comp->sign = av1_mode_mv_merge_probs(pre_comp->sign, c->sign);
+ aom_tree_merge_probs(av1_mv_class_tree, pre_comp->classes, c->classes,
+ comp->classes);
+ aom_tree_merge_probs(av1_mv_class0_tree, pre_comp->class0, c->class0,
+ comp->class0);
+
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ comp->bits[j] = av1_mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
+
+ for (j = 0; j < CLASS0_SIZE; ++j)
+ aom_tree_merge_probs(av1_mv_fp_tree, pre_comp->class0_fp[j],
+ c->class0_fp[j], comp->class0_fp[j]);
+
+ aom_tree_merge_probs(av1_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
+
+ if (allow_hp) {
+ comp->class0_hp =
+ av1_mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
+ comp->hp = av1_mode_mv_merge_probs(pre_comp->hp, c->hp);
+ }
+ }
+#if CONFIG_REF_MV
+ }
+#endif // CONFIG_REF_MV
+}
+
+#if CONFIG_EC_MULTISYMBOL && !CONFIG_EC_ADAPT
+void av1_set_mv_cdfs(nmv_context *ctx) {
+ int i;
+ int j;
+ av1_tree_to_cdf(av1_mv_joint_tree, ctx->joints, ctx->joint_cdf);
+
+ for (i = 0; i < 2; ++i) {
+ nmv_component *const comp_ctx = &ctx->comps[i];
+ av1_tree_to_cdf(av1_mv_class_tree, comp_ctx->classes, comp_ctx->class_cdf);
+
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->class0_fp[j],
+ comp_ctx->class0_fp_cdf[j]);
+ }
+ av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->fp, comp_ctx->fp_cdf);
+ }
+}
+#endif
+
+void av1_init_mv_probs(AV1_COMMON *cm) {
+#if CONFIG_REF_MV
+ int i;
+ for (i = 0; i < NMV_CONTEXTS; ++i) {
+ // NB: this sets CDFs too
+ cm->fc->nmvc[i] = default_nmv_context;
+ }
+#else
+ cm->fc->nmvc = default_nmv_context;
+#endif // CONFIG_REF_MV
+#if CONFIG_INTRABC
+ cm->fc->ndvc = default_nmv_context;
+#endif // CONFIG_INTRABC
+#if CONFIG_GLOBAL_MOTION
+ av1_copy(cm->fc->global_motion_types_prob, default_global_motion_types_prob);
+#endif // CONFIG_GLOBAL_MOTION
+}
diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h
new file mode 100644
index 000000000..2c79d447a
--- /dev/null
+++ b/third_party/aom/av1/common/entropymv.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_ENTROPYMV_H_
+#define AV1_COMMON_ENTROPYMV_H_
+
+#include "./aom_config.h"
+
+#include "aom_dsp/prob.h"
+
+#include "av1/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+
+void av1_init_mv_probs(struct AV1Common *cm);
+
+void av1_adapt_mv_probs(struct AV1Common *cm, int usehp);
+
+#define MV_UPDATE_PROB 252
+
+/* Symbols for coding which components are zero jointly */
+#define MV_JOINTS 4
+typedef enum {
+ MV_JOINT_ZERO = 0, /* Zero vector */
+ MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */
+ MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */
+ MV_JOINT_HNZVNZ = 3, /* Both components nonzero */
+} MV_JOINT_TYPE;
+
+static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) {
+ return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ;
+}
+
+static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
+ return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
+}
+
+/* Symbols for coding magnitude class of nonzero components */
+#define MV_CLASSES 11
+typedef enum {
+ MV_CLASS_0 = 0, /* (0, 2] integer pel */
+ MV_CLASS_1 = 1, /* (2, 4] integer pel */
+ MV_CLASS_2 = 2, /* (4, 8] integer pel */
+ MV_CLASS_3 = 3, /* (8, 16] integer pel */
+ MV_CLASS_4 = 4, /* (16, 32] integer pel */
+ MV_CLASS_5 = 5, /* (32, 64] integer pel */
+ MV_CLASS_6 = 6, /* (64, 128] integer pel */
+ MV_CLASS_7 = 7, /* (128, 256] integer pel */
+ MV_CLASS_8 = 8, /* (256, 512] integer pel */
+ MV_CLASS_9 = 9, /* (512, 1024] integer pel */
+ MV_CLASS_10 = 10, /* (1024, 2048] integer pel */
+} MV_CLASS_TYPE;
+
+#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
+#define CLASS0_SIZE (1 << CLASS0_BITS)
+#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
+#define MV_FP_SIZE 4
+
+#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2)
+#define MV_MAX ((1 << MV_MAX_BITS) - 1)
+#define MV_VALS ((MV_MAX << 1) + 1)
+
+#define MV_IN_USE_BITS 14
+#define MV_UPP ((1 << MV_IN_USE_BITS) - 1)
+#define MV_LOW (-(1 << MV_IN_USE_BITS))
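+
+// With MV_CLASSES == 11 and CLASS0_BITS == 1 this gives MV_OFFSET_BITS == 10,
+// MV_MAX_BITS == 14, MV_MAX == 16383 (in 1/8-pel units) and MV_VALS == 32767.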
+
+extern const aom_tree_index av1_mv_joint_tree[];
+extern const aom_tree_index av1_mv_class_tree[];
+extern const aom_tree_index av1_mv_class0_tree[];
+extern const aom_tree_index av1_mv_fp_tree[];
+
+typedef struct {
+ aom_prob sign;
+ aom_prob classes[MV_CLASSES - 1];
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob class_cdf[CDF_SIZE(MV_CLASSES)];
+#endif
+ aom_prob class0[CLASS0_SIZE - 1];
+ aom_prob bits[MV_OFFSET_BITS];
+ aom_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
+ aom_prob fp[MV_FP_SIZE - 1];
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][CDF_SIZE(MV_FP_SIZE)];
+ aom_cdf_prob fp_cdf[CDF_SIZE(MV_FP_SIZE)];
+#endif
+ aom_prob class0_hp;
+ aom_prob hp;
+} nmv_component;
+
+typedef struct {
+ aom_prob joints[MV_JOINTS - 1];
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob joint_cdf[CDF_SIZE(MV_JOINTS)];
+#endif
+ nmv_component comps[2];
+} nmv_context;
+
+static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
+ if (mv->row == 0) {
+ return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
+ } else {
+ return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
+ }
+}
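+// E.g. mv == { 0, -3 } (zero row, nonzero col) yields MV_JOINT_HNZVZ, so
+// only the horizontal component is coded.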
+
+MV_CLASS_TYPE av1_get_mv_class(int z, int *offset);
+
+typedef struct {
+ unsigned int sign[2];
+ unsigned int classes[MV_CLASSES];
+ unsigned int class0[CLASS0_SIZE];
+ unsigned int bits[MV_OFFSET_BITS][2];
+ unsigned int class0_fp[CLASS0_SIZE][MV_FP_SIZE];
+ unsigned int fp[MV_FP_SIZE];
+ unsigned int class0_hp[2];
+ unsigned int hp[2];
+} nmv_component_counts;
+
+typedef struct {
+ unsigned int joints[MV_JOINTS];
+ nmv_component_counts comps[2];
+} nmv_context_counts;
+
+void av1_inc_mv(const MV *mv, nmv_context_counts *mvctx, const int usehp);
+#if CONFIG_GLOBAL_MOTION
+extern const aom_tree_index
+ av1_global_motion_types_tree[TREE_SIZE(GLOBAL_TRANS_TYPES)];
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_EC_MULTISYMBOL
+void av1_set_mv_cdfs(nmv_context *ctx);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_ENTROPYMV_H_
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
new file mode 100644
index 000000000..054bd40be
--- /dev/null
+++ b/third_party/aom/av1/common/enums.h
@@ -0,0 +1,543 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_ENUMS_H_
+#define AV1_COMMON_ENUMS_H_
+
+#include "./aom_config.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#undef MAX_SB_SIZE
+
+// Max superblock size
+#if CONFIG_EXT_PARTITION
+#define MAX_SB_SIZE_LOG2 7
+#else
+#define MAX_SB_SIZE_LOG2 6
+#endif // CONFIG_EXT_PARTITION
+#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+// Min superblock size
+#define MIN_SB_SIZE_LOG2 6
+
+// Pixels per Mode Info (MI) unit
+#if CONFIG_CB4X4
+#define MI_SIZE_LOG2 2
+#else
+#define MI_SIZE_LOG2 3
+#endif
+#define MI_SIZE (1 << MI_SIZE_LOG2)
+
+// MI-units per max superblock (MI Block - MIB)
+#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
+
+// MI-units per min superblock
+#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+
+// Mask to extract MI offset within max MIB
+#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
+#define MAX_MIB_MASK_2 (MAX_MIB_SIZE * 2 - 1)
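+
+// E.g. without CONFIG_EXT_PARTITION and CONFIG_CB4X4: MAX_SB_SIZE == 64 and
+// MI_SIZE == 8, so MAX_MIB_SIZE == 8 and MAX_MIB_MASK == 7.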
+
+// Maximum number of tile rows and tile columns
+#if CONFIG_EXT_TILE
+#define TILE_NORMAL 0
+#define TILE_VR 1
+
+#define MAX_TILE_ROWS 1024
+#define MAX_TILE_COLS 1024
+#else
+#define MAX_TILE_ROWS 4
+#define MAX_TILE_COLS 64
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_VAR_TX
+#define MAX_VARTX_DEPTH 2
+#endif
+
+// Bitstream profiles indicated by 2-3 bits in the uncompressed header.
+// 00: Profile 0. 8-bit 4:2:0 only.
+// 10: Profile 1. 8-bit 4:4:4, 4:2:2, and 4:4:0.
+// 01: Profile 2. 10-bit and 12-bit color only, with 4:2:0 sampling.
+// 110: Profile 3. 10-bit and 12-bit color only, with 4:2:2/4:4:4/4:4:0
+// sampling.
+// 111: Undefined profile.
+typedef enum BITSTREAM_PROFILE {
+ PROFILE_0,
+ PROFILE_1,
+ PROFILE_2,
+ PROFILE_3,
+ MAX_PROFILES
+} BITSTREAM_PROFILE;
+
+// Note: Some enums use the attribute 'packed' to use the smallest possible
+// integer type, so that memory is saved when they are used in structs/arrays.
+
+typedef enum ATTRIBUTE_PACKED {
+#if CONFIG_CB4X4
+ BLOCK_2X2,
+ BLOCK_2X4,
+ BLOCK_4X2,
+#endif
+ BLOCK_4X4,
+ BLOCK_4X8,
+ BLOCK_8X4,
+ BLOCK_8X8,
+ BLOCK_8X16,
+ BLOCK_16X8,
+ BLOCK_16X16,
+ BLOCK_16X32,
+ BLOCK_32X16,
+ BLOCK_32X32,
+ BLOCK_32X64,
+ BLOCK_64X32,
+ BLOCK_64X64,
+#if CONFIG_EXT_PARTITION
+ BLOCK_64X128,
+ BLOCK_128X64,
+ BLOCK_128X128,
+#endif // CONFIG_EXT_PARTITION
+ BLOCK_SIZES,
+ BLOCK_INVALID = BLOCK_SIZES,
+ BLOCK_LARGEST = (BLOCK_SIZES - 1)
+} BLOCK_SIZE;
+
+typedef enum {
+ PARTITION_NONE,
+ PARTITION_HORZ,
+ PARTITION_VERT,
+ PARTITION_SPLIT,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ_A, // HORZ split and the top partition is split again
+ PARTITION_HORZ_B, // HORZ split and the bottom partition is split again
+ PARTITION_VERT_A, // VERT split and the left partition is split again
+ PARTITION_VERT_B, // VERT split and the right partition is split again
+ EXT_PARTITION_TYPES,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPES = PARTITION_SPLIT + 1,
+ PARTITION_INVALID = 255
+} PARTITION_TYPE;
+
+typedef char PARTITION_CONTEXT;
+#define PARTITION_PLOFFSET 4 // number of probability models per block size
+#define PARTITION_BLOCK_SIZES (4 + CONFIG_EXT_PARTITION)
+#define PARTITION_CONTEXTS_PRIMARY (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
+#if CONFIG_UNPOISON_PARTITION_CTX
+#define PARTITION_CONTEXTS \
+ (PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES)
+#else
+#define PARTITION_CONTEXTS PARTITION_CONTEXTS_PRIMARY
+#endif
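+
+// E.g. without CONFIG_EXT_PARTITION and CONFIG_UNPOISON_PARTITION_CTX:
+// PARTITION_BLOCK_SIZES == 4, so PARTITION_CONTEXTS == 4 * 4 == 16.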
+
+// block transform size
+typedef enum ATTRIBUTE_PACKED {
+#if CONFIG_CB4X4
+ TX_2X2, // 2x2 transform
+#endif
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+#if CONFIG_TX64X64
+ TX_64X64, // 64x64 transform
+#endif // CONFIG_TX64X64
+ TX_4X8, // 4x8 transform
+ TX_8X4, // 8x4 transform
+ TX_8X16, // 8x16 transform
+ TX_16X8, // 16x8 transform
+ TX_16X32, // 16x32 transform
+ TX_32X16, // 32x16 transform
+ TX_4X16, // 4x16 transform
+ TX_16X4, // 16x4 transform
+ TX_8X32, // 8x32 transform
+ TX_32X8, // 32x8 transform
+ TX_SIZES_ALL, // Includes rectangular transforms
+ TX_SIZES = TX_4X8, // Does NOT include rectangular transforms
+ TX_INVALID = 255 // Invalid transform size
+} TX_SIZE;
+
+#define MAX_TX_DEPTH (TX_SIZES - 1 - TX_4X4)
+
+#define MAX_TX_SIZE_LOG2 (5 + CONFIG_TX64X64)
+#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
+#define MIN_TX_SIZE_LOG2 2
+#define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2)
+#define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE)
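+
+// E.g. without CONFIG_CB4X4 and CONFIG_TX64X64: TX_SIZES == 4 (TX_4X4 through
+// TX_32X32), so MAX_TX_DEPTH == 3 and MAX_TX_SIZE == 32.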
+
+// Number of maximum-size transform blocks in the maximum-size superblock
+#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
+#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
+
+#define MAX_NUM_TXB (1 << (MAX_SB_SIZE_LOG2 - MIN_TX_SIZE_LOG2))
+
+// frame transform mode
+typedef enum {
+ ONLY_4X4 = 0, // only 4x4 transform used
+ ALLOW_8X8 = 1, // allow block transform size up to 8x8
+ ALLOW_16X16 = 2, // allow block transform size up to 16x16
+ ALLOW_32X32 = 3, // allow block transform size up to 32x32
+#if CONFIG_TX64X64
+ ALLOW_64X64 = 4, // allow block transform size up to 64x64
+#endif
+ TX_MODE_SELECT, // transform specified for each block
+ TX_MODES,
+} TX_MODE;
+
+// 1D tx types
+typedef enum {
+ DCT_1D = 0,
+ ADST_1D = 1,
+ FLIPADST_1D = 2,
+ IDTX_1D = 3,
+ TX_TYPES_1D = 4,
+} TX_TYPE_1D;
+
+typedef enum {
+ DCT_DCT = 0, // DCT in both horizontal and vertical
+ ADST_DCT = 1, // ADST in vertical, DCT in horizontal
+ DCT_ADST = 2, // DCT in vertical, ADST in horizontal
+ ADST_ADST = 3, // ADST in both directions
+#if CONFIG_EXT_TX
+ FLIPADST_DCT = 4,
+ DCT_FLIPADST = 5,
+ FLIPADST_FLIPADST = 6,
+ ADST_FLIPADST = 7,
+ FLIPADST_ADST = 8,
+ IDTX = 9,
+ V_DCT = 10,
+ H_DCT = 11,
+ V_ADST = 12,
+ H_ADST = 13,
+ V_FLIPADST = 14,
+ H_FLIPADST = 15,
+#endif // CONFIG_EXT_TX
+ TX_TYPES,
+} TX_TYPE;
+
+typedef enum {
+ TILE_LEFT_BOUNDARY = 1,
+ TILE_RIGHT_BOUNDARY = 2,
+ TILE_ABOVE_BOUNDARY = 4,
+ TILE_BOTTOM_BOUNDARY = 8,
+ FRAME_LEFT_BOUNDARY = 16,
+ FRAME_RIGHT_BOUNDARY = 32,
+ FRAME_ABOVE_BOUNDARY = 64,
+ FRAME_BOTTOM_BOUNDARY = 128,
+} BOUNDARY_TYPE;
+
+#if CONFIG_EXT_TX
+#if CONFIG_CB4X4
+#define EXT_TX_SIZES 5 // number of sizes that use extended transforms
+#else
+#define EXT_TX_SIZES 4 // number of sizes that use extended transforms
+#endif // CONFIG_CB4X4
+#define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER
+#define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA
+#else
+#if CONFIG_CB4X4
+#define EXT_TX_SIZES 4 // number of sizes that use extended transforms
+#else
+#define EXT_TX_SIZES 3 // number of sizes that use extended transforms
+#endif
+#endif // CONFIG_EXT_TX
+
+typedef enum {
+ AOM_LAST_FLAG = 1 << 0,
+#if CONFIG_EXT_REFS
+ AOM_LAST2_FLAG = 1 << 1,
+ AOM_LAST3_FLAG = 1 << 2,
+ AOM_GOLD_FLAG = 1 << 3,
+ AOM_BWD_FLAG = 1 << 4,
+ AOM_ALT_FLAG = 1 << 5,
+ AOM_REFFRAME_ALL = (1 << 6) - 1
+#else
+ AOM_GOLD_FLAG = 1 << 1,
+ AOM_ALT_FLAG = 1 << 2,
+ AOM_REFFRAME_ALL = (1 << 3) - 1
+#endif // CONFIG_EXT_REFS
+} AOM_REFFRAME;
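+
+// These are bit flags, so reference sets combine with '|', e.g.
+// (AOM_LAST_FLAG | AOM_GOLD_FLAG) selects LAST and GOLDEN only.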
+
+typedef enum { PLANE_TYPE_Y = 0, PLANE_TYPE_UV = 1, PLANE_TYPES } PLANE_TYPE;
+
+#if CONFIG_CFL
+typedef enum { CFL_PRED_U = 0, CFL_PRED_V = 1, CFL_PRED_PLANES } CFL_PRED_TYPE;
+#endif
+
+#if CONFIG_PALETTE
+typedef enum {
+ TWO_COLORS,
+ THREE_COLORS,
+ FOUR_COLORS,
+ FIVE_COLORS,
+ SIX_COLORS,
+ SEVEN_COLORS,
+ EIGHT_COLORS,
+ PALETTE_SIZES
+} PALETTE_SIZE;
+
+typedef enum {
+ PALETTE_COLOR_ONE,
+ PALETTE_COLOR_TWO,
+ PALETTE_COLOR_THREE,
+ PALETTE_COLOR_FOUR,
+ PALETTE_COLOR_FIVE,
+ PALETTE_COLOR_SIX,
+ PALETTE_COLOR_SEVEN,
+ PALETTE_COLOR_EIGHT,
+ PALETTE_COLORS
+} PALETTE_COLOR;
+#endif // CONFIG_PALETTE
+
+typedef enum ATTRIBUTE_PACKED {
+ DC_PRED, // Average of above and left pixels
+ V_PRED, // Vertical
+ H_PRED, // Horizontal
+ D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi)
+ D135_PRED, // Directional 135 deg = 180 - 45
+ D117_PRED, // Directional 117 deg = 180 - 63
+ D153_PRED, // Directional 153 deg = 180 - 27
+ D207_PRED, // Directional 207 deg = 180 + 27
+ D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi)
+#if CONFIG_ALT_INTRA
+ SMOOTH_PRED, // Combination of horizontal and vertical interpolation
+#endif // CONFIG_ALT_INTRA
+ TM_PRED, // True-motion
+ NEARESTMV,
+ NEARMV,
+ ZEROMV,
+ NEWMV,
+#if CONFIG_EXT_INTER
+#if CONFIG_COMPOUND_SINGLEREF
+ // Single ref compound modes
+ SR_NEAREST_NEARMV,
+ SR_NEAREST_NEWMV,
+ SR_NEAR_NEWMV,
+ SR_ZERO_NEWMV,
+ SR_NEW_NEWMV,
+#endif // CONFIG_COMPOUND_SINGLEREF
+ // Compound ref compound modes
+ NEAREST_NEARESTMV,
+ NEAREST_NEARMV,
+ NEAR_NEARESTMV,
+ NEAR_NEARMV,
+ NEAREST_NEWMV,
+ NEW_NEARESTMV,
+ NEAR_NEWMV,
+ NEW_NEARMV,
+ ZERO_ZEROMV,
+ NEW_NEWMV,
+#endif // CONFIG_EXT_INTER
+ MB_MODE_COUNT,
+ INTRA_MODES = TM_PRED + 1,
+ INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks
+} PREDICTION_MODE;
+
+typedef enum {
+ SIMPLE_TRANSLATION = 0,
+#if CONFIG_MOTION_VAR
+ OBMC_CAUSAL, // 2-sided OBMC
+#endif // CONFIG_MOTION_VAR
+#if CONFIG_WARPED_MOTION
+ WARPED_CAUSAL, // 2-sided WARPED
+#endif // CONFIG_WARPED_MOTION
+ MOTION_MODES
+} MOTION_MODE;
+
+// TODO(urvang): Consider adding II_SMOOTH_PRED if it's helpful.
+
+#if CONFIG_EXT_INTER
+typedef enum {
+ II_DC_PRED = 0,
+ II_V_PRED,
+ II_H_PRED,
+ II_D45_PRED,
+ II_D135_PRED,
+ II_D117_PRED,
+ II_D153_PRED,
+ II_D207_PRED,
+ II_D63_PRED,
+ II_TM_PRED,
+ INTERINTRA_MODES
+} INTERINTRA_MODE;
+
+typedef enum {
+ COMPOUND_AVERAGE = 0,
+#if CONFIG_WEDGE
+ COMPOUND_WEDGE,
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ COMPOUND_SEG,
+#endif // CONFIG_COMPOUND_SEGMENT
+ COMPOUND_TYPES,
+} COMPOUND_TYPE;
+#endif // CONFIG_EXT_INTER
+
+// TODO(huisu): Consider adding FILTER_SMOOTH_PRED to "FILTER_INTRA_MODE".
+#if CONFIG_FILTER_INTRA
+typedef enum {
+ FILTER_DC_PRED,
+ FILTER_V_PRED,
+ FILTER_H_PRED,
+ FILTER_D45_PRED,
+ FILTER_D135_PRED,
+ FILTER_D117_PRED,
+ FILTER_D153_PRED,
+ FILTER_D207_PRED,
+ FILTER_D63_PRED,
+ FILTER_TM_PRED,
+ FILTER_INTRA_MODES,
+} FILTER_INTRA_MODE;
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+#define DIRECTIONAL_MODES (INTRA_MODES - 2)
+#endif // CONFIG_EXT_INTRA
+
+#define INTER_MODES (1 + NEWMV - NEARESTMV)
+
+#if CONFIG_EXT_INTER
+#if CONFIG_COMPOUND_SINGLEREF
+#define INTER_SINGLEREF_COMP_MODES (1 + SR_NEW_NEWMV - SR_NEAREST_NEARMV)
+#endif // CONFIG_COMPOUND_SINGLEREF
+
+#define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV)
+#endif // CONFIG_EXT_INTER
+
+#define SKIP_CONTEXTS 3
+
+#if CONFIG_REF_MV
+#define NMV_CONTEXTS 3
+
+#define NEWMV_MODE_CONTEXTS 7
+#define ZEROMV_MODE_CONTEXTS 2
+#define REFMV_MODE_CONTEXTS 9
+#define DRL_MODE_CONTEXTS 5
+
+#define ZEROMV_OFFSET 3
+#define REFMV_OFFSET 4
+
+#define NEWMV_CTX_MASK ((1 << ZEROMV_OFFSET) - 1)
+#define ZEROMV_CTX_MASK ((1 << (REFMV_OFFSET - ZEROMV_OFFSET)) - 1)
+#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
+
+#define ALL_ZERO_FLAG_OFFSET 8
+#define SKIP_NEARESTMV_OFFSET 9
+#define SKIP_NEARMV_OFFSET 10
+#define SKIP_NEARESTMV_SUB8X8_OFFSET 11
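+
+// The mode context thus packs several fields into one integer: bits 0-2 hold
+// the NEWMV context, bit 3 the ZEROMV context, bits 4-7 the REFMV context,
+// and bits 8-11 carry the ALL_ZERO/SKIP_* flags defined above.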
+#endif
+
+#define INTER_MODE_CONTEXTS 7
+#if CONFIG_DELTA_Q
+#define DELTA_Q_SMALL 3
+#define DELTA_Q_PROBS (DELTA_Q_SMALL)
+#define DEFAULT_DELTA_Q_RES 4
+#if CONFIG_EXT_DELTA_Q
+#define DELTA_LF_SMALL 3
+#define DELTA_LF_PROBS (DELTA_LF_SMALL)
+#define DEFAULT_DELTA_LF_RES 2
+#endif
+#endif
+
+/* Segment Feature Masks */
+#define MAX_MV_REF_CANDIDATES 2
+
+#if CONFIG_REF_MV
+#define MAX_REF_MV_STACK_SIZE 16
+#if CONFIG_EXT_PARTITION
+#define REF_CAT_LEVEL 640
+#else
+#define REF_CAT_LEVEL 255
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_REF_MV
+
+#define INTRA_INTER_CONTEXTS 4
+#define COMP_INTER_CONTEXTS 5
+#define REF_CONTEXTS 5
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#define COMP_INTER_MODE_CONTEXTS 4
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+
+#if CONFIG_VAR_TX
+#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 2)
+typedef uint8_t TXFM_CONTEXT;
+#endif
+
+#define NONE_FRAME -1
+#define INTRA_FRAME 0
+#define LAST_FRAME 1
+
+#if CONFIG_EXT_REFS
+#define LAST2_FRAME 2
+#define LAST3_FRAME 3
+#define GOLDEN_FRAME 4
+#define BWDREF_FRAME 5
+#define ALTREF_FRAME 6
+#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
+#else
+#define GOLDEN_FRAME 2
+#define ALTREF_FRAME 3
+#endif // CONFIG_EXT_REFS
+
+#define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
+#define TOTAL_REFS_PER_FRAME (ALTREF_FRAME - INTRA_FRAME + 1)
+
+#define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1)
+#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
+#if CONFIG_EXT_REFS
+#define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1)
+#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
+#else
+#define BWD_REFS 1
+#define BWD_RF_OFFSET(ref) (ref - ALTREF_FRAME)
+#endif // CONFIG_EXT_REFS
+
+#define SINGLE_REFS (FWD_REFS + BWD_REFS)
+#define COMP_REFS (FWD_REFS * BWD_REFS)
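+
+// E.g. with CONFIG_EXT_REFS: FWD_REFS == 4, BWD_REFS == 2, SINGLE_REFS == 6
+// and COMP_REFS == 8; without it: FWD_REFS == 2, BWD_REFS == 1 and
+// COMP_REFS == 2.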
+
+#if CONFIG_REF_MV
+#define MODE_CTX_REF_FRAMES (TOTAL_REFS_PER_FRAME + COMP_REFS)
+#else
+#define MODE_CTX_REF_FRAMES TOTAL_REFS_PER_FRAME
+#endif
+
+#if CONFIG_SUPERTX
+#define PARTITION_SUPERTX_CONTEXTS 2
+#define MAX_SUPERTX_BLOCK_SIZE BLOCK_32X32
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_LOOP_RESTORATION
+typedef enum {
+ RESTORE_NONE = 0,
+ RESTORE_WIENER = 1,
+ RESTORE_SGRPROJ = 2,
+ RESTORE_SWITCHABLE,
+ RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
+ RESTORE_TYPES,
+} RestorationType;
+#endif // CONFIG_LOOP_RESTORATION
+
+#if CONFIG_FRAME_SUPERRES
+#define SUPERRES_SCALE_DENOMINATOR 16
+#define SUPERRES_SCALE_BITS 3
+#define SUPERRES_SCALE_NUMERATOR_MIN 8
+#endif // CONFIG_FRAME_SUPERRES
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_ENUMS_H_
diff --git a/third_party/aom/av1/common/filter.c b/third_party/aom/av1/common/filter.c
new file mode 100644
index 000000000..9f0c58866
--- /dev/null
+++ b/third_party/aom/av1/common/filter.c
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/filter.h"
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ bilinear_filters[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 },
+ { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
+ { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 },
+ { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 },
+ { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 },
+ { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 },
+ { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 },
+ { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
+};
+
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static const int16_t,
+ sub_pel_filters_temporalfilter_12[SUBPEL_SHIFTS][12]) = {
+ // intfilt 0.8
+ { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 },
+ { 0, 1, -1, 3, -7, 127, 8, -4, 2, -1, 0, 0 },
+ { 0, 1, -3, 5, -12, 124, 18, -8, 4, -2, 1, 0 },
+ { -1, 2, -4, 8, -17, 120, 28, -11, 6, -3, 1, -1 },
+ { -1, 2, -4, 10, -21, 114, 38, -15, 8, -4, 2, -1 },
+ { -1, 3, -5, 11, -23, 107, 49, -18, 9, -5, 2, -1 },
+ { -1, 3, -6, 12, -25, 99, 60, -21, 11, -6, 3, -1 },
+ { -1, 3, -6, 12, -25, 90, 70, -23, 12, -6, 3, -1 },
+ { -1, 3, -6, 12, -24, 80, 80, -24, 12, -6, 3, -1 },
+ { -1, 3, -6, 12, -23, 70, 90, -25, 12, -6, 3, -1 },
+ { -1, 3, -6, 11, -21, 60, 99, -25, 12, -6, 3, -1 },
+ { -1, 2, -5, 9, -18, 49, 107, -23, 11, -5, 3, -1 },
+ { -1, 2, -4, 8, -15, 38, 114, -21, 10, -4, 2, -1 },
+ { -1, 1, -3, 6, -11, 28, 120, -17, 8, -4, 2, -1 },
+ { 0, 1, -2, 4, -8, 18, 124, -12, 5, -3, 1, 0 },
+ { 0, 0, -1, 2, -4, 8, 127, -7, 3, -1, 1, 0 },
+};
+#endif // USE_TEMPORALFILTER_12TAP
+
+#if CONFIG_DUAL_FILTER
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+ // intfilt 0.575
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 1, -5, 126, 8, -3, 1, 0 },
+ { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 4, -14, 118, 27, -9, 3, 0 },
+ { -1, 5, -16, 112, 37, -12, 4, -1 }, { -1, 5, -18, 105, 48, -14, 4, -1 },
+ { -1, 6, -19, 97, 58, -17, 5, -1 }, { -1, 6, -20, 88, 68, -18, 6, -1 },
+ { -1, 6, -19, 78, 78, -19, 6, -1 }, { -1, 6, -18, 68, 88, -20, 6, -1 },
+ { -1, 5, -17, 58, 97, -19, 6, -1 }, { -1, 4, -14, 48, 105, -18, 5, -1 },
+ { -1, 4, -12, 37, 112, -16, 5, -1 }, { 0, 3, -9, 27, 118, -14, 4, -1 },
+ { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 1, -3, 8, 126, -5, 1, 0 },
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_regular_uv[SUBPEL_SHIFTS]) = {
+ // intfilt 0.575
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 1, -5, 126, 8, -3, 1, 0 },
+ { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 4, -14, 118, 27, -9, 3, 0 },
+ { -1, 5, -16, 112, 37, -12, 4, -1 }, { -1, 5, -18, 105, 48, -14, 4, -1 },
+ { -1, 6, -19, 97, 58, -17, 5, -1 }, { -1, 6, -20, 88, 68, -18, 6, -1 },
+ { -1, 6, -19, 78, 78, -19, 6, -1 }, { -1, 6, -18, 68, 88, -20, 6, -1 },
+ { -1, 5, -17, 58, 97, -19, 6, -1 }, { -1, 4, -14, 48, 105, -18, 5, -1 },
+ { -1, 4, -12, 37, 112, -16, 5, -1 }, { 0, 3, -9, 27, 118, -14, 4, -1 },
+ { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 1, -3, 8, 126, -5, 1, 0 },
+};
+
+#if USE_12TAP_FILTER
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+ // intfilt 0.8
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -1, 2, -6, 127, 9, -4, 2, -1 },
+ { -2, 5, -12, 124, 18, -7, 4, -2 }, { -2, 7, -16, 119, 28, -11, 5, -2 },
+ { -3, 8, -19, 114, 38, -14, 7, -3 }, { -3, 9, -22, 107, 49, -17, 8, -3 },
+ { -4, 10, -23, 99, 60, -20, 10, -4 }, { -4, 11, -23, 90, 70, -22, 10, -4 },
+ { -4, 11, -23, 80, 80, -23, 11, -4 }, { -4, 10, -22, 70, 90, -23, 11, -4 },
+ { -4, 10, -20, 60, 99, -23, 10, -4 }, { -3, 8, -17, 49, 107, -22, 9, -3 },
+ { -3, 7, -14, 38, 114, -19, 8, -3 }, { -2, 5, -11, 28, 119, -16, 7, -2 },
+ { -2, 4, -7, 18, 124, -12, 5, -2 }, { -1, 2, -4, 9, 127, -6, 2, -1 },
+};
+
+DECLARE_ALIGNED(256, static const int16_t,
+ sub_pel_filters_10sharp[SUBPEL_SHIFTS][12]) = {
+ // intfilt 0.85
+ { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 },
+ { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 },
+ { 0, 1, -3, 6, -13, 124, 18, -8, 4, -2, 1, 0 },
+ { 0, 2, -4, 8, -18, 120, 28, -12, 6, -4, 2, 0 },
+ { 0, 2, -5, 10, -21, 114, 38, -15, 8, -5, 2, 0 },
+ { 0, 3, -6, 11, -24, 107, 49, -19, 10, -6, 3, 0 },
+ { 0, 3, -7, 12, -25, 99, 59, -21, 11, -6, 3, 0 },
+ { 0, 3, -7, 12, -25, 90, 70, -23, 12, -7, 3, 0 },
+ { 0, 3, -7, 12, -25, 81, 81, -25, 12, -7, 3, 0 },
+ { 0, 3, -7, 12, -23, 70, 90, -25, 12, -7, 3, 0 },
+ { 0, 3, -6, 11, -21, 59, 99, -25, 12, -7, 3, 0 },
+ { 0, 3, -6, 10, -19, 49, 107, -24, 11, -6, 3, 0 },
+ { 0, 2, -5, 8, -15, 38, 114, -21, 10, -5, 2, 0 },
+ { 0, 2, -4, 6, -12, 28, 120, -18, 8, -4, 2, 0 },
+ { 0, 1, -2, 4, -8, 18, 124, -13, 6, -3, 1, 0 },
+ { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 },
+};
+#else
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+#if CONFIG_FILTER_7BIT
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
+ { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
+ { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
+ { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
+ { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
+ { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
+ { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
+ { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
+#else
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -1, 3, -7, 127, 8, -3, 1, 0 },
+ { -2, 5, -13, 125, 17, -6, 3, -1 }, { -3, 7, -17, 121, 27, -10, 5, -2 },
+ { -4, 9, -20, 115, 37, -13, 6, -2 }, { -4, 10, -23, 108, 48, -16, 8, -3 },
+ { -4, 10, -24, 100, 59, -19, 9, -3 }, { -4, 11, -24, 90, 70, -21, 10, -4 },
+ { -4, 11, -23, 80, 80, -23, 11, -4 }, { -4, 10, -21, 70, 90, -24, 11, -4 },
+ { -3, 9, -19, 59, 100, -24, 10, -4 }, { -3, 8, -16, 48, 108, -23, 10, -4 },
+ { -2, 6, -13, 37, 115, -20, 9, -4 }, { -2, 5, -10, 27, 121, -17, 7, -3 },
+ { -1, 3, -6, 17, 125, -13, 5, -2 }, { 0, 1, -3, 8, 127, -7, 3, -1 }
+#endif
+};
+#endif
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8smooth2[SUBPEL_SHIFTS]) = {
+ // freqmultiplier = 0.2
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 9, 30, 44, 32, 11, 2, 0 },
+ { 0, 8, 28, 44, 34, 12, 2, 0 }, { 0, 7, 27, 44, 35, 13, 2, 0 },
+ { 0, 6, 26, 43, 37, 14, 2, 0 }, { 0, 5, 24, 43, 38, 16, 2, 0 },
+ { 0, 5, 23, 42, 38, 17, 3, 0 }, { 0, 4, 21, 41, 40, 19, 3, 0 },
+ { 0, 4, 20, 40, 40, 20, 4, 0 }, { 0, 3, 19, 40, 41, 21, 4, 0 },
+ { 0, 3, 17, 38, 42, 23, 5, 0 }, { 0, 2, 16, 38, 43, 24, 5, 0 },
+ { 0, 2, 14, 37, 43, 26, 6, 0 }, { 0, 2, 13, 35, 44, 27, 7, 0 },
+ { 0, 2, 12, 34, 44, 28, 8, 0 }, { 0, 2, 11, 32, 44, 30, 9, 0 },
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_smooth2_uv[SUBPEL_SHIFTS]) = {
+ // freqmultiplier = 0.2
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 9, 30, 44, 32, 11, 2, 0 },
+ { 0, 8, 28, 44, 34, 12, 2, 0 }, { 0, 7, 27, 44, 35, 13, 2, 0 },
+ { 0, 6, 26, 43, 37, 14, 2, 0 }, { 0, 5, 24, 43, 38, 16, 2, 0 },
+ { 0, 5, 23, 42, 38, 17, 3, 0 }, { 0, 4, 21, 41, 40, 19, 3, 0 },
+ { 0, 4, 20, 40, 40, 20, 4, 0 }, { 0, 3, 19, 40, 41, 21, 4, 0 },
+ { 0, 3, 17, 38, 42, 23, 5, 0 }, { 0, 2, 16, 38, 43, 24, 5, 0 },
+ { 0, 2, 14, 37, 43, 26, 6, 0 }, { 0, 2, 13, 35, 44, 27, 7, 0 },
+ { 0, 2, 12, 34, 44, 28, 8, 0 }, { 0, 2, 11, 32, 44, 30, 9, 0 },
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+ // freqmultiplier = 0.8
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, -5, 13, 102, 24, -7, 1, 0 },
+ { 0, -4, 8, 100, 31, -8, 1, 0 }, { 0, -3, 4, 97, 37, -8, 1, 0 },
+ { 0, -2, 0, 94, 44, -9, 1, 0 }, { 0, -2, -3, 90, 51, -9, 1, 0 },
+ { 0, -1, -5, 84, 59, -9, 0, 0 }, { 0, 0, -7, 79, 65, -9, 0, 0 },
+ { 0, 0, -8, 72, 72, -8, 0, 0 }, { 0, 0, -9, 65, 79, -7, 0, 0 },
+ { 0, 0, -9, 59, 84, -5, -1, 0 }, { 0, 1, -9, 51, 90, -3, -2, 0 },
+ { 0, 1, -9, 44, 94, 0, -2, 0 }, { 0, 1, -8, 37, 97, 4, -3, 0 },
+ { 0, 1, -8, 31, 100, 8, -4, 0 }, { 0, 1, -7, 24, 102, 13, -5, 0 },
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_smooth_uv[SUBPEL_SHIFTS]) = {
+ // freqmultiplier = 0.8
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, -5, 13, 102, 24, -7, 1, 0 },
+ { 0, -4, 8, 100, 31, -8, 1, 0 }, { 0, -3, 4, 97, 37, -8, 1, 0 },
+ { 0, -2, 0, 94, 44, -9, 1, 0 }, { 0, -2, -3, 90, 51, -9, 1, 0 },
+ { 0, -1, -5, 84, 59, -9, 0, 0 }, { 0, 0, -7, 79, 65, -9, 0, 0 },
+ { 0, 0, -8, 72, 72, -8, 0, 0 }, { 0, 0, -9, 65, 79, -7, 0, 0 },
+ { 0, 0, -9, 59, 84, -5, -1, 0 }, { 0, 1, -9, 51, 90, -3, -2, 0 },
+ { 0, 1, -9, 44, 94, 0, -2, 0 }, { 0, 1, -8, 37, 97, 4, -3, 0 },
+ { 0, 1, -8, 31, 100, 8, -4, 0 }, { 0, 1, -7, 24, 102, 13, -5, 0 },
+};
+#else // CONFIG_DUAL_FILTER
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+#if CONFIG_FILTER_7BIT
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
+ { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
+ { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
+ { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
+ { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
+ { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
+ { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
+ { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 }
+#else
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 1, -5, 126, 8, -3, 1, 0 },
+ { -1, 3, -10, 122, 18, -6, 2, 0 }, { -1, 4, -13, 118, 27, -9, 3, -1 },
+ { -1, 4, -16, 112, 37, -11, 4, -1 }, { -1, 5, -18, 105, 48, -14, 4, -1 },
+ { -1, 5, -19, 97, 58, -16, 5, -1 }, { -1, 6, -19, 88, 68, -18, 5, -1 },
+ { -1, 6, -19, 78, 78, -19, 6, -1 }, { -1, 5, -18, 68, 88, -19, 6, -1 },
+ { -1, 5, -16, 58, 97, -19, 5, -1 }, { -1, 4, -14, 48, 105, -18, 5, -1 },
+ { -1, 4, -11, 37, 112, -16, 4, -1 }, { -1, 3, -9, 27, 118, -13, 4, -1 },
+ { 0, 2, -6, 18, 122, -10, 3, -1 }, { 0, 1, -3, 8, 126, -5, 1, 0 }
+#endif
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+#if CONFIG_FILTER_7BIT
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
+ { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
+ { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
+ { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
+ { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
+ { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
+ { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
+ { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
+#else
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -1, 3, -7, 127, 8, -3, 1, 0 },
+ { -2, 5, -13, 125, 17, -6, 3, -1 }, { -3, 7, -17, 121, 27, -10, 5, -2 },
+ { -4, 9, -20, 115, 37, -13, 6, -2 }, { -4, 10, -23, 108, 48, -16, 8, -3 },
+ { -4, 10, -24, 100, 59, -19, 9, -3 }, { -4, 11, -24, 90, 70, -21, 10, -4 },
+ { -4, 11, -23, 80, 80, -23, 11, -4 }, { -4, 10, -21, 70, 90, -24, 11, -4 },
+ { -3, 9, -19, 59, 100, -24, 10, -4 }, { -3, 8, -16, 48, 108, -23, 10, -4 },
+ { -2, 6, -13, 37, 115, -20, 9, -4 }, { -2, 5, -10, 27, 121, -17, 7, -3 },
+ { -1, 3, -6, 17, 125, -13, 5, -2 }, { 0, 1, -3, 8, 127, -7, 3, -1 }
+#endif
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+#if CONFIG_FILTER_7BIT
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 },
+ { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
+ { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
+ { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 },
+ { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
+ { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+ { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
+ { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 }
+#else
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -3, -1, 32, 64, 38, 1, -3, 0 },
+ { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 26, 63, 43, 4, -4, 0 },
+ { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 21, 60, 49, 7, -4, 0 },
+ { -1, -4, 18, 59, 51, 9, -4, 0 }, { -1, -4, 16, 57, 53, 12, -4, -1 },
+ { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 12, 53, 57, 16, -4, -1 },
+ { 0, -4, 9, 51, 59, 18, -4, -1 }, { 0, -4, 7, 49, 60, 21, -3, -2 },
+ { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 4, 43, 63, 26, -2, -2 },
+ { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 1, 38, 64, 32, -1, -3 }
+#endif
+};
+#endif // CONFIG_DUAL_FILTER
+
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+const InterpKernel *av1_intra_filter_kernels[INTRA_FILTERS] = {
+ bilinear_filters, // INTRA_FILTER_LINEAR
+ sub_pel_filters_8, // INTRA_FILTER_8TAP
+ sub_pel_filters_8sharp, // INTRA_FILTER_8TAP_SHARP
+ sub_pel_filters_8smooth, // INTRA_FILTER_8TAP_SMOOTH
+};
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+
+#if CONFIG_DUAL_FILTER
+static const InterpFilterParams
+ av1_interp_filter_params_list[SWITCHABLE_FILTERS + EXTRA_FILTERS] = {
+ { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SMOOTH },
+#if USE_12TAP_FILTER
+ { (const int16_t *)sub_pel_filters_10sharp, 12, SUBPEL_SHIFTS,
+ MULTITAP_SHARP },
+#else
+ { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SHARP },
+#endif
+ { (const int16_t *)sub_pel_filters_8smooth2, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SMOOTH2 },
+ { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR },
+ { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SHARP },
+ { (const int16_t *)sub_pel_filters_regular_uv, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ FILTER_REGULAR_UV },
+ { (const int16_t *)sub_pel_filters_smooth_uv, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ FILTER_SMOOTH_UV },
+ { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ FILTER_SHARP_UV },
+ { (const int16_t *)sub_pel_filters_smooth2_uv, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ FILTER_SMOOTH2_UV },
+ };
+#else
+static const InterpFilterParams
+ av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SMOOTH },
+ { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ MULTITAP_SHARP },
+ { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR }
+ };
+#endif // CONFIG_DUAL_FILTER
+
+#if USE_TEMPORALFILTER_12TAP
+static const InterpFilterParams av1_interp_temporalfilter_12tap = {
+ (const int16_t *)sub_pel_filters_temporalfilter_12, 12, SUBPEL_SHIFTS,
+ TEMPORALFILTER_12TAP
+};
+#endif // USE_TEMPORALFILTER_12TAP
+
+InterpFilterParams av1_get_interp_filter_params(
+ const InterpFilter interp_filter) {
+#if USE_TEMPORALFILTER_12TAP
+ if (interp_filter == TEMPORALFILTER_12TAP)
+ return av1_interp_temporalfilter_12tap;
+#endif // USE_TEMPORALFILTER_12TAP
+ return av1_interp_filter_params_list[interp_filter];
+}
+
+const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter) {
+#if USE_TEMPORALFILTER_12TAP
+ if (interp_filter == TEMPORALFILTER_12TAP)
+ return av1_interp_temporalfilter_12tap.filter_ptr;
+#endif // USE_TEMPORALFILTER_12TAP
+ return (const int16_t *)av1_interp_filter_params_list[interp_filter]
+ .filter_ptr;
+}
+
+#if CONFIG_DUAL_FILTER
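+// Maps a luma (plane 0) filter to its chroma counterpart for planes 1 and 2,
+// e.g. EIGHTTAP_SMOOTH -> FILTER_SMOOTH_UV; plane 0 returns the filter
+// unchanged.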
+InterpFilter av1_get_plane_interp_filter(InterpFilter interp_filter,
+ int plane) {
+#if USE_TEMPORALFILTER_12TAP
+ assert(interp_filter <= EIGHTTAP_SHARP ||
+ interp_filter == TEMPORALFILTER_12TAP);
+#else
+ assert(interp_filter <= EIGHTTAP_SHARP);
+#endif
+ if (plane == 0) {
+ return interp_filter;
+ } else {
+ switch (interp_filter) {
+ case EIGHTTAP_REGULAR: return FILTER_REGULAR_UV;
+ case EIGHTTAP_SMOOTH: return FILTER_SMOOTH_UV;
+ case MULTITAP_SHARP: return FILTER_SHARP_UV;
+ case EIGHTTAP_SMOOTH2: return FILTER_SMOOTH2_UV;
+ default: return interp_filter;
+ }
+ }
+}
+#endif
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
new file mode 100644
index 000000000..693a46902
--- /dev/null
+++ b/third_party/aom/av1/common/filter.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_FILTER_H_
+#define AV1_COMMON_FILTER_H_
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define USE_TEMPORALFILTER_12TAP 1
+#define MAX_FILTER_TAP 12
+
+#define USE_12TAP_FILTER 0
+
+typedef enum {
+ EIGHTTAP_REGULAR,
+ EIGHTTAP_SMOOTH,
+ MULTITAP_SHARP,
+#if CONFIG_DUAL_FILTER
+ EIGHTTAP_SMOOTH2,
+#endif // CONFIG_DUAL_FILTER
+ BILINEAR,
+#if CONFIG_DUAL_FILTER
+ EIGHTTAP_SHARP,
+ FILTER_REGULAR_UV,
+ FILTER_SMOOTH_UV,
+ FILTER_SHARP_UV,
+ FILTER_SMOOTH2_UV,
+#endif // CONFIG_DUAL_FILTER
+ INTERP_FILTERS_ALL,
+ SWITCHABLE_FILTERS = BILINEAR,
+ SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
+ EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
+#if USE_TEMPORALFILTER_12TAP
+ TEMPORALFILTER_12TAP = SWITCHABLE_FILTERS + EXTRA_FILTERS,
+#endif
+} InterpFilter;
+
+#if CONFIG_DUAL_FILTER
+#define MAX_SUBPEL_TAPS 12
+#define LOG_SWITCHABLE_FILTERS \
+ 3 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
+#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
+#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
+#else // CONFIG_DUAL_FILTER
+#define LOG_SWITCHABLE_FILTERS \
+ 2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
+#endif // CONFIG_DUAL_FILTER
+
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+typedef enum {
+ INTRA_FILTER_LINEAR,
+ INTRA_FILTER_8TAP,
+ INTRA_FILTER_8TAP_SHARP,
+ INTRA_FILTER_8TAP_SMOOTH,
+ INTRA_FILTERS,
+} INTRA_FILTER;
+
+extern const InterpKernel *av1_intra_filter_kernels[INTRA_FILTERS];
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+
+typedef struct InterpFilterParams {
+ const int16_t *filter_ptr;
+ uint16_t taps;
+ uint16_t subpel_shifts;
+ InterpFilter interp_filter;
+} InterpFilterParams;
+
+InterpFilterParams av1_get_interp_filter_params(
+ const InterpFilter interp_filter);
+
+const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter);
+
+static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
+ const InterpFilterParams filter_params, const int subpel) {
+ return filter_params.filter_ptr + filter_params.taps * subpel;
+}
+
+static INLINE int av1_is_interpolating_filter(
+ const InterpFilter interp_filter) {
+ const InterpFilterParams ip = av1_get_interp_filter_params(interp_filter);
+ return (ip.filter_ptr[ip.taps / 2 - 1] == 128);
+}
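+
+/* Illustrative sketch: the kernels are stored as [SUBPEL_SHIFTS][taps] rows
+   of Q7 coefficients, so filter_ptr + taps * subpel selects one subpel
+   phase, and a filter is "interpolating" when the center tap of its phase-0
+   row carries the full Q7 weight of 128 (a unit impulse). */
+#if 0
+static INLINE const int16_t *example_quarter_pel_kernel(void) {
+  const InterpFilterParams p = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
+  return av1_get_interp_filter_subpel_kernel(p, SUBPEL_SHIFTS / 4);
+}
+#endif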
+
+#if CONFIG_DUAL_FILTER
+InterpFilter av1_get_plane_interp_filter(InterpFilter interp_filter, int plane);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_FILTER_H_
diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c
new file mode 100644
index 000000000..0b6b78e3d
--- /dev/null
+++ b/third_party/aom/av1/common/frame_buffers.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/frame_buffers.h"
+#include "aom_mem/aom_mem.h"
+
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+ assert(list != NULL);
+ av1_free_internal_frame_buffers(list);
+
+ list->num_internal_frame_buffers =
+ AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ list->int_fb = (InternalFrameBuffer *)aom_calloc(
+ list->num_internal_frame_buffers, sizeof(*list->int_fb));
+ return (list->int_fb == NULL);
+}
+
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
+ int i;
+
+ assert(list != NULL);
+
+ for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+ aom_free(list->int_fb[i].data);
+ list->int_fb[i].data = NULL;
+ }
+ aom_free(list->int_fb);
+ list->int_fb = NULL;
+}
+
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ int i;
+ InternalFrameBufferList *const int_fb_list =
+ (InternalFrameBufferList *)cb_priv;
+ if (int_fb_list == NULL) return -1;
+
+ // Find a free frame buffer.
+ for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
+ if (!int_fb_list->int_fb[i].in_use) break;
+ }
+
+ if (i == int_fb_list->num_internal_frame_buffers) return -1;
+
+ if (int_fb_list->int_fb[i].size < min_size) {
+ aom_free(int_fb_list->int_fb[i].data);
+    // The data must be zeroed to fix a valgrind error from the C loop filter
+    // due to accessing uninitialized memory in the frame border. This could
+    // be skipped if the border were removed entirely.
+ int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size);
+ if (!int_fb_list->int_fb[i].data) return -1;
+ int_fb_list->int_fb[i].size = min_size;
+ }
+
+ fb->data = int_fb_list->int_fb[i].data;
+ fb->size = int_fb_list->int_fb[i].size;
+ int_fb_list->int_fb[i].in_use = 1;
+
+ // Set the frame buffer's private data to point at the internal frame buffer.
+ fb->priv = &int_fb_list->int_fb[i];
+ return 0;
+}
+
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) {
+ InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
+ (void)cb_priv;
+ if (int_fb) int_fb->in_use = 0;
+ return 0;
+}
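+
+/* Illustrative usage (assuming aom_codec_set_frame_buffer_functions() from
+   aom/aom_decoder.h as the registration entry point): wire the callbacks
+   into a decoder so libaom draws its reference and work buffers from the
+   internal list. */
+#if 0
+static aom_codec_err_t example_register(aom_codec_ctx_t *ctx,
+                                        InternalFrameBufferList *list) {
+  if (av1_alloc_internal_frame_buffers(list)) return AOM_CODEC_MEM_ERROR;
+  return aom_codec_set_frame_buffer_functions(ctx, av1_get_frame_buffer,
+                                              av1_release_frame_buffer, list);
+}
+#endif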
diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h
new file mode 100644
index 000000000..e7341cfdd
--- /dev/null
+++ b/third_party/aom/av1/common/frame_buffers.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_FRAME_BUFFERS_H_
+#define AV1_COMMON_FRAME_BUFFERS_H_
+
+#include "aom/aom_frame_buffer.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct InternalFrameBuffer {
+ uint8_t *data;
+ size_t size;
+ int in_use;
+} InternalFrameBuffer;
+
+typedef struct InternalFrameBufferList {
+ int num_internal_frame_buffers;
+ InternalFrameBuffer *int_fb;
+} InternalFrameBufferList;
+
+// Initializes |list|. Returns 0 on success.
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Frees any data allocated to the frame buffers.
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Callback used by libaom to request an external frame buffer. |cb_priv| is
+// the callback private data, which points to an InternalFrameBufferList.
+// |min_size| is the minimum size in bytes needed to decode the next frame.
+// |fb| is a pointer to the frame buffer.
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb);
+
+// Callback used by libaom when there are no references to the frame buffer.
+// |cb_priv| is not used. |fb| is a pointer to the frame buffer.
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_FRAME_BUFFERS_H_
diff --git a/third_party/aom/av1/common/generic_code.c b/third_party/aom/av1/common/generic_code.c
new file mode 100644
index 000000000..2955a695f
--- /dev/null
+++ b/third_party/aom/av1/common/generic_code.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "generic_code.h"
+
+void aom_cdf_init_q15_1D(uint16_t *cdf, int nsyms, int cdf_size) {
+ int i;
+ for (i = 0; i < nsyms; i++)
+ cdf[i] = AOM_ICDF((i + 1)*CDF_PROB_TOP/nsyms);
+
+#if CONFIG_EC_ADAPT
+ cdf[cdf_size - 1] = 0;
+#endif
+}
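+
+/* For example, nsyms == 4 yields the uniform CDF
+   {CDF_PROB_TOP/4, CDF_PROB_TOP/2, 3*CDF_PROB_TOP/4, CDF_PROB_TOP}, i.e.
+   {8192, 16384, 24576, 32768}, each entry mapped through AOM_ICDF(). */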
+
+/** Adapts a Q15 cdf after encoding/decoding a symbol. */
+void aom_cdf_adapt_q15(int val, uint16_t *cdf, int n, int *count, int rate) {
+ int i;
+ *count = OD_MINI(*count + 1, 1 << rate);
+ OD_ASSERT(AOM_ICDF(cdf[n - 1]) == 32768);
+ if (*count >= 1 << rate) {
+ /* Steady-state adaptation based on a simple IIR with dyadic rate. */
+ for (i = 0; i < n; i++) {
+ int tmp;
+ /* When (i < val), we want the adjustment ((cdf[i] - tmp) >> rate) to be
+ positive so long as (cdf[i] > i + 1), and 0 when (cdf[i] == i + 1),
+ to ensure we don't drive any probabilities to 0. Replacing cdf[i] with
+ (i + 2) and solving ((i + 2 - tmp) >> rate == 1) for tmp produces
+ tmp == i + 2 - (1 << rate). Using this value of tmp with
+ cdf[i] == i + 1 instead gives an adjustment of 0 as desired.
+
+ When (i >= val), we want ((cdf[i] - tmp) >> rate) to be negative so
+ long as cdf[i] < 32768 - (n - 1 - i), and 0 when
+ cdf[i] == 32768 - (n - 1 - i), again to ensure we don't drive any
+ probabilities to 0. Since right-shifting any negative value is still
+ negative, we can solve (32768 - (n - 1 - i) - tmp == 0) for tmp,
+ producing tmp = 32769 - n + i. Using this value of tmp with smaller
+ values of cdf[i] instead gives negative adjustments, as desired.
+
+ Combining the two cases gives the expression below. These could be
+ stored in a lookup table indexed by n and rate to avoid the
+ arithmetic. */
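+      /* Worked instance: with rate == 5 and i < val, tmp == i - 30; at the
+         floor cdf[i] == i + 1 the adjustment is (i + 1 - (i - 30)) >> 5 == 0,
+         and one step above the floor it is (i + 2 - (i - 30)) >> 5 == 1. */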
+ tmp = 2 - (1<<rate) + i + (32767 + (1<<rate) - n)*(i >= val);
+ cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i]) - ((AOM_ICDF(cdf[i]) - tmp) >> rate));
+ }
+ }
+ else {
+ int alpha;
+ /* Initial adaptation for the first symbols. The adaptation rate is
+ computed to be equivalent to what od_{en,de}code_cdf_adapt() does
+ when the initial cdf is set to increment/4. */
+    alpha = 4*32768/(n + 4*(*count));
+ for (i = 0; i < n; i++) {
+ int tmp;
+ tmp = (32768 - n)*(i >= val) + i + 1;
+ cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i])
+ - (((AOM_ICDF(cdf[i]) - tmp)*alpha) >> 15));
+ }
+ }
+ OD_ASSERT(AOM_ICDF(cdf[n - 1]) == 32768);
+}
+
+/** Takes the base-2 log of E(x) in Q1.
+ *
+ * @param [in] ex_q16 expectation of x in Q16
+ *
+ * @retval 2*log2(ex_q16/2^16)
+ */
+int log_ex(int ex_q16) {
+ int lg;
+ int lg_q1;
+ int odd;
+ lg = OD_ILOG(ex_q16);
+ if (lg < 15) {
+ odd = ex_q16*ex_q16 > 2 << 2*lg;
+ }
+ else {
+ int tmp;
+ tmp = ex_q16 >> (lg - 8);
+ odd = tmp*tmp > (1 << 15);
+ }
+ lg_q1 = OD_MAXI(0, 2*lg - 33 + odd);
+ return lg_q1;
+}
+
+/** Updates the probability model based on the encoded/decoded value.
+ *
+ * @param [in,out] ex_q16      expectation of x in Q16
+ * @param [in]     x           variable encoded/decoded (used to update
+ *                             ex_q16)
+ * @param [in]     integration integration period of ex_q16 (leaky average
+ *                             over 1<<integration samples)
+ */
+void generic_model_update(int *ex_q16, int x, int integration) {
+  /* We could have saturated ex_q16 directly, but this is safe and simpler. */
+ x = OD_MINI(x, 32767);
+ OD_IIR_DIADIC(*ex_q16, x << 16, integration);
+}
diff --git a/third_party/aom/av1/common/generic_code.h b/third_party/aom/av1/common/generic_code.h
new file mode 100644
index 000000000..c9d87799d
--- /dev/null
+++ b/third_party/aom/av1/common/generic_code.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#if !defined(_generic_code_H)
+# define _generic_code_H
+
+# include "aom_dsp/bitreader.h"
+# include "aom_dsp/bitwriter.h"
+
+# define GENERIC_TABLES 12
+
+#define generic_decode(r, model, ex_q16, integration, ACCT_STR_NAME) \
+ generic_decode_(r, model, ex_q16, integration ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_decode_cdf_adapt_q15(r, cdf, n, count, rate, ACCT_STR_NAME) \
+ aom_decode_cdf_adapt_q15_(r, cdf, n, count, rate ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_decode_cdf_adapt(r, cdf, n, increment, ACCT_STR_NAME) \
+ aom_decode_cdf_adapt_(r, cdf, n, increment ACCT_STR_ARG(ACCT_STR_NAME))
+
+typedef struct {
+ /** cdf for multiple expectations of x */
+ uint16_t cdf[GENERIC_TABLES][CDF_SIZE(16)];
+} generic_encoder;
+
+#define OD_IIR_DIADIC(y, x, shift) ((y) += ((x) - (y)) >> (shift))
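+/* OD_IIR_DIADIC is a first-order leaky average with a dyadic coefficient:
+   e.g. with shift == 4, y moves 1/16 of the way toward x on each update. */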
+
+void generic_model_init(generic_encoder *model);
+
+/* Initialize a CDF for use by aom_write_symbol_pvq()/aom_read_symbol_pvq().
+ This is used for CDFs whose size might not match the declared array size.
+ The only real requirement is that the first value of every CDF be zero.
+ Then aom_cdf_init_q15_1D() will be called with the real size the first time
+ the CDF is used. */
+#define OD_CDFS_INIT_DYNAMIC(cdf) (memset(cdf, 0, sizeof(cdf)))
+
+// WARNING: Do not use this init function if the size of the cdf differs
+// from the size declared in the code.
+#define OD_CDFS_INIT_Q15(cdfs) \
+ { int n_cdfs = sizeof(cdfs)/sizeof(cdfs[0]); \
+ int cdf_size = sizeof(cdfs[0])/sizeof(cdfs[0][0]); \
+ int nsyms = cdf_size - CONFIG_EC_ADAPT; \
+ int i_; \
+ for (i_ = 0; i_ < n_cdfs; i_++) \
+ aom_cdf_init_q15_1D(cdfs[i_], nsyms, cdf_size); \
+ }
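+
+/* Example (assuming CDF_SIZE(n) reserves n + CONFIG_EC_ADAPT entries): for
+   uint16_t cdfs[8][CDF_SIZE(4)], OD_CDFS_INIT_Q15(cdfs) initializes all
+   eight CDFs with nsyms == 4. */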
+
+void aom_cdf_init(uint16_t *cdf, int ncdfs, int nsyms, int val, int first);
+
+void aom_cdf_init_q15_1D(uint16_t *cdf, int nsyms, int cdf_size);
+
+void aom_cdf_adapt_q15(int val, uint16_t *cdf, int n, int *count, int rate);
+
+void aom_encode_cdf_adapt_q15(aom_writer *w, int val, uint16_t *cdf, int n,
+ int *count, int rate);
+
+void generic_encode(aom_writer *w, generic_encoder *model, int x,
+ int *ex_q16, int integration);
+double generic_encode_cost(generic_encoder *model, int x, int *ex_q16);
+
+double od_encode_cdf_cost(int val, uint16_t *cdf, int n);
+
+int aom_decode_cdf_adapt_q15_(aom_reader *r, uint16_t *cdf, int n,
+ int *count, int rate ACCT_STR_PARAM);
+
+int generic_decode_(aom_reader *r, generic_encoder *model,
+ int *ex_q16, int integration ACCT_STR_PARAM);
+
+int log_ex(int ex_q16);
+
+void generic_model_update(int *ex_q16, int x, int integration);
+
+#endif
diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c
new file mode 100644
index 000000000..0ea58bfe6
--- /dev/null
+++ b/third_party/aom/av1/common/idct.c
@@ -0,0 +1,3067 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+
+int av1_get_tx_scale(const TX_SIZE tx_size) {
+ if (txsize_sqr_up_map[tx_size] == TX_32X32) return 1;
+#if CONFIG_TX64X64
+ else if (txsize_sqr_up_map[tx_size] == TX_64X64)
+ return 2;
+#endif // CONFIG_TX64X64
+ else
+ return 0;
+}
+
+// NOTE: The implementations of all inverses need to be aware of the fact
+// that input and output could be the same buffer.
+
+#if CONFIG_EXT_TX
+static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 4; ++i)
+ output[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+}
+
+static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 8; ++i) output[i] = input[i] * 2;
+}
+
+static void iidtx16_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 16; ++i)
+ output[i] = (tran_low_t)dct_const_round_shift(input[i] * 2 * Sqrt2);
+}
+
+static void iidtx32_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
+}
+
+#if CONFIG_TX64X64
+static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 64; ++i)
+ output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_EXT_TX
+
+// For use in lieu of ADST
+static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[16];
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+ }
+ for (i = 0; i < 16; ++i) {
+ output[i] = input[16 + i] * 4;
+ }
+ aom_idct16_c(inputhalf, output + 16);
+ // Note overall scaling factor is 4 times orthogonal
+}
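+// In other words, the low-frequency half of the input is scaled by sqrt(2)
+// and run through a 16-point IDCT into the second half of the output, while
+// the high-frequency half passes straight through with a gain of 4 into the
+// first half.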
+
+#if CONFIG_TX64X64
+static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_col_dct_dct_64,
+ inv_stage_range_col_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void idct64_row_c(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_row_dct_dct_64,
+ inv_stage_range_row_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+// For use in lieu of ADST
+static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[32];
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 32; ++i) {
+ inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+ }
+ for (i = 0; i < 32; ++i) {
+ output[i] = (tran_low_t)dct_const_round_shift(input[32 + i] * 4 * Sqrt2);
+ }
+ aom_idct32_c(inputhalf, output + 32);
+ // Note overall scaling factor is 4 * sqrt(2) times orthogonal
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_HIGHBITDEPTH
+static void highbd_idct4(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ (void)bd;
+ av1_idct4_new(input, output, cos_bit, stage_range);
+}
+
+static void highbd_idct8(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ (void)bd;
+ av1_idct8_new(input, output, cos_bit, stage_range);
+}
+
+static void highbd_idct16(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ (void)bd;
+ av1_idct16_new(input, output, cos_bit, stage_range);
+}
+
+static void highbd_idct32(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ (void)bd;
+ av1_idct32_new(input, output, cos_bit, stage_range);
+}
+
+static void highbd_iadst4(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ (void)bd;
+ av1_iadst4_new(input, output, cos_bit, stage_range);
+}
+
+static void highbd_iadst8(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ (void)bd;
+ av1_iadst8_new(input, output, cos_bit, stage_range);
+}
+
+static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ (void)bd;
+ av1_iadst16_new(input, output, cos_bit, stage_range);
+}
+
+#if CONFIG_EXT_TX
+static void highbd_iidtx4_c(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ int i;
+ (void)cos_bit;
+ (void)stage_range;
+ for (i = 0; i < 4; ++i)
+ output[i] = HIGHBD_WRAPLOW(dct_const_round_shift(input[i] * Sqrt2), bd);
+}
+
+static void highbd_iidtx8_c(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ int i;
+ (void)bd;
+ (void)cos_bit;
+ (void)stage_range;
+ for (i = 0; i < 8; ++i) output[i] = input[i] * 2;
+}
+
+static void highbd_iidtx16_c(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ int i;
+ (void)cos_bit;
+ (void)stage_range;
+ for (i = 0; i < 16; ++i)
+ output[i] = HIGHBD_WRAPLOW(dct_const_round_shift(input[i] * 2 * Sqrt2), bd);
+}
+
+static void highbd_iidtx32_c(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ int i;
+ (void)bd;
+ (void)cos_bit;
+ (void)stage_range;
+ for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
+}
+#endif // CONFIG_EXT_TX
+
+static void highbd_ihalfright32_c(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit,
+ const int8_t *stage_range, int bd) {
+ int i;
+ tran_low_t inputhalf[16];
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] = HIGHBD_WRAPLOW(dct_const_round_shift(input[i] * Sqrt2), bd);
+ }
+ for (i = 0; i < 16; ++i) {
+ output[i] = input[16 + i] * 4;
+ }
+ highbd_idct16(inputhalf, output + 16, cos_bit, stage_range, bd);
+ // Note overall scaling factor is 4 times orthogonal
+}
+
+#if CONFIG_EXT_TX
+#if CONFIG_TX64X64
+static void highbd_iidtx64_c(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range,
+ int bd) {
+ (void)cos_bit;
+ (void)stage_range;
+ int i;
+ for (i = 0; i < 64; ++i)
+ output[i] = HIGHBD_WRAPLOW(dct_const_round_shift(input[i] * 4 * Sqrt2), bd);
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_TX64X64
+// For use in lieu of ADST
+static void highbd_ihalfright64_c(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit,
+ const int8_t *stage_range, int bd) {
+ int i;
+ tran_low_t inputhalf[32];
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 32; ++i) {
+ inputhalf[i] = HIGHBD_WRAPLOW(dct_const_round_shift(input[i] * Sqrt2), bd);
+ }
+ for (i = 0; i < 32; ++i) {
+ output[i] =
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[32 + i] * 4 * Sqrt2), bd);
+ }
+ highbd_idct32(inputhalf, output + 32, cos_bit, stage_range, bd);
+ // Note overall scaling factor is 4 * sqrt(2) times orthogonal
+}
+
+static void highbd_idct64_col_c(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit,
+ const int8_t *stage_range, int bd) {
+ int32_t in[64], out[64];
+ int i;
+ (void)cos_bit;
+ (void)stage_range;
+ (void)bd;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_col_dct_dct_64,
+ inv_stage_range_col_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void highbd_idct64_row_c(const tran_low_t *input, tran_low_t *output,
+ const int8_t *cos_bit,
+ const int8_t *stage_range, int bd) {
+ int32_t in[64], out[64];
+ int i;
+ (void)cos_bit;
+ (void)stage_range;
+ (void)bd;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_row_dct_dct_64,
+ inv_stage_range_row_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_HIGHBITDEPTH
+
+// Inverse identity transform and add.
+#if CONFIG_EXT_TX
+static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int bs, int tx_type) {
+ int r, c;
+ const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
+ if (tx_type == IDTX) {
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c)
+ dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
+ dest += stride;
+ input += bs;
+ }
+ }
+}
+#endif // CONFIG_EXT_TX
+
+#define FLIPUD_PTR(dest, stride, size) \
+ do { \
+ (dest) = (dest) + ((size)-1) * (stride); \
+ (stride) = -(stride); \
+ } while (0)
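+
+// After FLIPUD_PTR(dest, stride, size), dest points at the last row and the
+// stride is negated, so row i of subsequent writes lands on row
+// size - 1 - i of the original buffer, i.e. an up/down flip.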
+
+#if CONFIG_EXT_TX
+static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src,
+ int *sstride, int tx_type, int sizey,
+ int sizex) {
+ // Note that the transpose of src will be added to dst. In order to LR
+ // flip the addends (in dst coordinates), we UD flip the src. To UD flip
+ // the addends, we UD flip the dst.
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case IDTX:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST: break;
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST:
+ // flip UD
+ FLIPUD_PTR(*dst, *dstride, sizey);
+ break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ // flip LR
+ FLIPUD_PTR(*src, *sstride, sizex);
+ break;
+ case FLIPADST_FLIPADST:
+ // flip UD
+ FLIPUD_PTR(*dst, *dstride, sizey);
+ // flip LR
+ FLIPUD_PTR(*src, *sstride, sizex);
+ break;
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_HIGHBITDEPTH
+#if CONFIG_EXT_TX
+static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bs, int tx_type, int bd) {
+ int r, c;
+ const int shift = bs < 32 ? 3 : 2;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ if (tx_type == IDTX) {
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c)
+ dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
+ dest += stride;
+ input += bs;
+ }
+ }
+}
+
+static void maybe_flip_strides16(uint16_t **dst, int *dstride, tran_low_t **src,
+ int *sstride, int tx_type, int sizey,
+ int sizex) {
+ // Note that the transpose of src will be added to dst. In order to LR
+ // flip the addends (in dst coordinates), we UD flip the src. To UD flip
+ // the addends, we UD flip the dst.
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case IDTX:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST: break;
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST:
+ // flip UD
+ FLIPUD_PTR(*dst, *dstride, sizey);
+ break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ // flip LR
+ FLIPUD_PTR(*src, *sstride, sizex);
+ break;
+ case FLIPADST_FLIPADST:
+ // flip UD
+ FLIPUD_PTR(*dst, *dstride, sizey);
+ // flip LR
+ FLIPUD_PTR(*src, *sstride, sizex);
+ break;
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_EXT_TX
+#endif // CONFIG_HIGHBITDEPTH
+
+void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_4[] = {
+ { aom_idct4_c, aom_idct4_c }, // DCT_DCT = 0
+ { aom_iadst4_c, aom_idct4_c }, // ADST_DCT = 1
+ { aom_idct4_c, aom_iadst4_c }, // DCT_ADST = 2
+ { aom_iadst4_c, aom_iadst4_c }, // ADST_ADST = 3
+#if CONFIG_EXT_TX
+ { aom_iadst4_c, aom_idct4_c }, // FLIPADST_DCT
+ { aom_idct4_c, aom_iadst4_c }, // DCT_FLIPADST
+ { aom_iadst4_c, aom_iadst4_c }, // FLIPADST_FLIPADST
+ { aom_iadst4_c, aom_iadst4_c }, // ADST_FLIPADST
+ { aom_iadst4_c, aom_iadst4_c }, // FLIPADST_ADST
+ { iidtx4_c, iidtx4_c }, // IDTX
+ { aom_idct4_c, iidtx4_c }, // V_DCT
+ { iidtx4_c, aom_idct4_c }, // H_DCT
+ { aom_iadst4_c, iidtx4_c }, // V_ADST
+ { iidtx4_c, aom_iadst4_c }, // H_ADST
+ { aom_iadst4_c, iidtx4_c }, // V_FLIPADST
+ { iidtx4_c, aom_iadst4_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[4][4];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 4;
+
+ // inverse transform row vectors
+ for (i = 0; i < 4; ++i) {
+ IHT_4[tx_type].rows(input, out[i]);
+ input += 4;
+ }
+
+ // transpose
+ for (i = 1; i < 4; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 4; ++i) {
+ IHT_4[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 4);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
+ }
+ }
+}
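+
+// Every hybrid transform in this file follows the same pipeline: inverse
+// transform the rows, transpose, inverse transform the columns, flip the
+// strides for the FLIPADST variants where applicable, and finally add to
+// the destination with a rounding shift that undoes the transform gain.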
+
+void av1_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_4x8[] = {
+ { aom_idct8_c, aom_idct4_c }, // DCT_DCT
+ { aom_iadst8_c, aom_idct4_c }, // ADST_DCT
+ { aom_idct8_c, aom_iadst4_c }, // DCT_ADST
+ { aom_iadst8_c, aom_iadst4_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst8_c, aom_idct4_c }, // FLIPADST_DCT
+ { aom_idct8_c, aom_iadst4_c }, // DCT_FLIPADST
+ { aom_iadst8_c, aom_iadst4_c }, // FLIPADST_FLIPADST
+ { aom_iadst8_c, aom_iadst4_c }, // ADST_FLIPADST
+ { aom_iadst8_c, aom_iadst4_c }, // FLIPADST_ADST
+ { iidtx8_c, iidtx4_c }, // IDTX
+ { aom_idct8_c, iidtx4_c }, // V_DCT
+ { iidtx8_c, aom_idct4_c }, // H_DCT
+ { aom_iadst8_c, iidtx4_c }, // V_ADST
+ { iidtx8_c, aom_iadst4_c }, // H_ADST
+ { aom_iadst8_c, iidtx4_c }, // V_FLIPADST
+ { iidtx8_c, aom_iadst4_c }, // H_FLIPADST
+#endif
+ };
+
+ const int n = 4;
+ const int n2 = 8;
+ int i, j;
+ tran_low_t out[4][8], outtmp[4];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n2; ++i) {
+ IHT_4x8[tx_type].rows(input, outtmp);
+ for (j = 0; j < n; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ IHT_4x8[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ }
+ }
+}
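+
+// For rectangular transforms whose dimensions differ by a factor of two,
+// the row outputs are rescaled by sqrt(2) so that the combined 2-D gain
+// remains a power of two, which the final rounding shift then undoes; the
+// 1:4 variants (e.g. 4x16) need no such correction.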
+
+void av1_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_8x4[] = {
+ { aom_idct4_c, aom_idct8_c }, // DCT_DCT
+ { aom_iadst4_c, aom_idct8_c }, // ADST_DCT
+ { aom_idct4_c, aom_iadst8_c }, // DCT_ADST
+ { aom_iadst4_c, aom_iadst8_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst4_c, aom_idct8_c }, // FLIPADST_DCT
+ { aom_idct4_c, aom_iadst8_c }, // DCT_FLIPADST
+ { aom_iadst4_c, aom_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_iadst4_c, aom_iadst8_c }, // ADST_FLIPADST
+ { aom_iadst4_c, aom_iadst8_c }, // FLIPADST_ADST
+ { iidtx4_c, iidtx8_c }, // IDTX
+ { aom_idct4_c, iidtx8_c }, // V_DCT
+ { iidtx4_c, aom_idct8_c }, // H_DCT
+ { aom_iadst4_c, iidtx8_c }, // V_ADST
+ { iidtx4_c, aom_iadst8_c }, // H_ADST
+ { aom_iadst4_c, iidtx8_c }, // V_FLIPADST
+ { iidtx4_c, aom_iadst8_c }, // H_FLIPADST
+#endif
+ };
+ const int n = 4;
+ const int n2 = 8;
+
+ int i, j;
+ tran_low_t out[8][4], outtmp[8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n; ++i) {
+ IHT_8x4[tx_type].rows(input, outtmp);
+ for (j = 0; j < n2; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ IHT_8x4[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ }
+ }
+}
+
+void av1_iht4x16_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_4x16[] = {
+ { aom_idct16_c, aom_idct4_c }, // DCT_DCT
+ { aom_iadst16_c, aom_idct4_c }, // ADST_DCT
+ { aom_idct16_c, aom_iadst4_c }, // DCT_ADST
+ { aom_iadst16_c, aom_iadst4_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst16_c, aom_idct4_c }, // FLIPADST_DCT
+ { aom_idct16_c, aom_iadst4_c }, // DCT_FLIPADST
+ { aom_iadst16_c, aom_iadst4_c }, // FLIPADST_FLIPADST
+ { aom_iadst16_c, aom_iadst4_c }, // ADST_FLIPADST
+ { aom_iadst16_c, aom_iadst4_c }, // FLIPADST_ADST
+ { iidtx16_c, iidtx4_c }, // IDTX
+ { aom_idct16_c, iidtx4_c }, // V_DCT
+ { iidtx16_c, aom_idct4_c }, // H_DCT
+ { aom_iadst16_c, iidtx4_c }, // V_ADST
+ { iidtx16_c, aom_iadst4_c }, // H_ADST
+ { aom_iadst16_c, iidtx4_c }, // V_FLIPADST
+ { iidtx16_c, aom_iadst4_c }, // H_FLIPADST
+#endif
+ };
+
+ const int n = 4;
+ const int n4 = 16;
+ int i, j;
+ tran_low_t out[4][16], outtmp[4];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n4;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n4; ++i) {
+ IHT_4x16[tx_type].rows(input, outtmp);
+ for (j = 0; j < n; ++j) out[j][i] = outtmp[j];
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) IHT_4x16[tx_type].cols(out[i], out[i]);
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n4, n);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ }
+ }
+}
+
+void av1_iht16x4_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_16x4[] = {
+ { aom_idct4_c, aom_idct16_c }, // DCT_DCT
+ { aom_iadst4_c, aom_idct16_c }, // ADST_DCT
+ { aom_idct4_c, aom_iadst16_c }, // DCT_ADST
+ { aom_iadst4_c, aom_iadst16_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst4_c, aom_idct16_c }, // FLIPADST_DCT
+ { aom_idct4_c, aom_iadst16_c }, // DCT_FLIPADST
+ { aom_iadst4_c, aom_iadst16_c }, // FLIPADST_FLIPADST
+ { aom_iadst4_c, aom_iadst16_c }, // ADST_FLIPADST
+ { aom_iadst4_c, aom_iadst16_c }, // FLIPADST_ADST
+ { iidtx4_c, iidtx16_c }, // IDTX
+ { aom_idct4_c, iidtx16_c }, // V_DCT
+ { iidtx4_c, aom_idct16_c }, // H_DCT
+ { aom_iadst4_c, iidtx16_c }, // V_ADST
+ { iidtx4_c, aom_iadst16_c }, // H_ADST
+ { aom_iadst4_c, iidtx16_c }, // V_FLIPADST
+ { iidtx4_c, aom_iadst16_c }, // H_FLIPADST
+#endif
+ };
+ const int n = 4;
+ const int n4 = 16;
+
+ int i, j;
+ tran_low_t out[16][4], outtmp[16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n; ++i) {
+ IHT_16x4[tx_type].rows(input, outtmp);
+ for (j = 0; j < n4; ++j) out[j][i] = outtmp[j];
+ input += n4;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n4; ++i) IHT_16x4[tx_type].cols(out[i], out[i]);
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n4);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ }
+ }
+}
+
+void av1_iht8x16_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_8x16[] = {
+ { aom_idct16_c, aom_idct8_c }, // DCT_DCT
+ { aom_iadst16_c, aom_idct8_c }, // ADST_DCT
+ { aom_idct16_c, aom_iadst8_c }, // DCT_ADST
+ { aom_iadst16_c, aom_iadst8_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst16_c, aom_idct8_c }, // FLIPADST_DCT
+ { aom_idct16_c, aom_iadst8_c }, // DCT_FLIPADST
+ { aom_iadst16_c, aom_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_iadst16_c, aom_iadst8_c }, // ADST_FLIPADST
+ { aom_iadst16_c, aom_iadst8_c }, // FLIPADST_ADST
+ { iidtx16_c, iidtx8_c }, // IDTX
+ { aom_idct16_c, iidtx8_c }, // V_DCT
+ { iidtx16_c, aom_idct8_c }, // H_DCT
+ { aom_iadst16_c, iidtx8_c }, // V_ADST
+ { iidtx16_c, aom_iadst8_c }, // H_ADST
+ { aom_iadst16_c, iidtx8_c }, // V_FLIPADST
+ { iidtx16_c, aom_iadst8_c }, // H_FLIPADST
+#endif
+ };
+
+ const int n = 8;
+ const int n2 = 16;
+ int i, j;
+ tran_low_t out[8][16], outtmp[8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n2; ++i) {
+ IHT_8x16[tx_type].rows(input, outtmp);
+ for (j = 0; j < n; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ IHT_8x16[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+
+void av1_iht16x8_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_16x8[] = {
+ { aom_idct8_c, aom_idct16_c }, // DCT_DCT
+ { aom_iadst8_c, aom_idct16_c }, // ADST_DCT
+ { aom_idct8_c, aom_iadst16_c }, // DCT_ADST
+ { aom_iadst8_c, aom_iadst16_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst8_c, aom_idct16_c }, // FLIPADST_DCT
+ { aom_idct8_c, aom_iadst16_c }, // DCT_FLIPADST
+ { aom_iadst8_c, aom_iadst16_c }, // FLIPADST_FLIPADST
+ { aom_iadst8_c, aom_iadst16_c }, // ADST_FLIPADST
+ { aom_iadst8_c, aom_iadst16_c }, // FLIPADST_ADST
+ { iidtx8_c, iidtx16_c }, // IDTX
+ { aom_idct8_c, iidtx16_c }, // V_DCT
+ { iidtx8_c, aom_idct16_c }, // H_DCT
+ { aom_iadst8_c, iidtx16_c }, // V_ADST
+ { iidtx8_c, aom_iadst16_c }, // H_ADST
+ { aom_iadst8_c, iidtx16_c }, // V_FLIPADST
+ { iidtx8_c, aom_iadst16_c }, // H_FLIPADST
+#endif
+ };
+ const int n = 8;
+ const int n2 = 16;
+
+ int i, j;
+ tran_low_t out[16][8], outtmp[16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n; ++i) {
+ IHT_16x8[tx_type].rows(input, outtmp);
+ for (j = 0; j < n2; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ IHT_16x8[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+
+void av1_iht8x32_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_8x32[] = {
+ { aom_idct32_c, aom_idct8_c }, // DCT_DCT
+ { ihalfright32_c, aom_idct8_c }, // ADST_DCT
+ { aom_idct32_c, aom_iadst8_c }, // DCT_ADST
+ { ihalfright32_c, aom_iadst8_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { ihalfright32_c, aom_idct8_c }, // FLIPADST_DCT
+ { aom_idct32_c, aom_iadst8_c }, // DCT_FLIPADST
+ { ihalfright32_c, aom_iadst8_c }, // FLIPADST_FLIPADST
+ { ihalfright32_c, aom_iadst8_c }, // ADST_FLIPADST
+ { ihalfright32_c, aom_iadst8_c }, // FLIPADST_ADST
+ { iidtx32_c, iidtx8_c }, // IDTX
+ { aom_idct32_c, iidtx8_c }, // V_DCT
+ { iidtx32_c, aom_idct8_c }, // H_DCT
+ { ihalfright32_c, iidtx8_c }, // V_ADST
+ { iidtx32_c, aom_iadst8_c }, // H_ADST
+ { ihalfright32_c, iidtx8_c }, // V_FLIPADST
+ { iidtx32_c, aom_iadst8_c }, // H_FLIPADST
+#endif
+ };
+
+ const int n = 8;
+ const int n4 = 32;
+ int i, j;
+ tran_low_t out[8][32], outtmp[8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n4;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n4; ++i) {
+ IHT_8x32[tx_type].rows(input, outtmp);
+ for (j = 0; j < n; ++j) out[j][i] = outtmp[j];
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) IHT_8x32[tx_type].cols(out[i], out[i]);
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n4, n);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+
+void av1_iht32x8_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_32x8[] = {
+ { aom_idct8_c, aom_idct32_c }, // DCT_DCT
+ { aom_iadst8_c, aom_idct32_c }, // ADST_DCT
+ { aom_idct8_c, ihalfright32_c }, // DCT_ADST
+ { aom_iadst8_c, ihalfright32_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst8_c, aom_idct32_c }, // FLIPADST_DCT
+ { aom_idct8_c, ihalfright32_c }, // DCT_FLIPADST
+ { aom_iadst8_c, ihalfright32_c }, // FLIPADST_FLIPADST
+ { aom_iadst8_c, ihalfright32_c }, // ADST_FLIPADST
+ { aom_iadst8_c, ihalfright32_c }, // FLIPADST_ADST
+ { iidtx8_c, iidtx32_c }, // IDTX
+ { aom_idct8_c, iidtx32_c }, // V_DCT
+ { iidtx8_c, aom_idct32_c }, // H_DCT
+ { aom_iadst8_c, iidtx32_c }, // V_ADST
+ { iidtx8_c, ihalfright32_c }, // H_ADST
+ { aom_iadst8_c, iidtx32_c }, // V_FLIPADST
+ { iidtx8_c, ihalfright32_c }, // H_FLIPADST
+#endif
+ };
+ const int n = 8;
+ const int n4 = 32;
+
+ int i, j;
+ tran_low_t out[32][8], outtmp[32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n; ++i) {
+ IHT_32x8[tx_type].rows(input, outtmp);
+ for (j = 0; j < n4; ++j) out[j][i] = outtmp[j];
+ input += n4;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n4; ++i) IHT_32x8[tx_type].cols(out[i], out[i]);
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n4);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+
+void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_16x32[] = {
+ { aom_idct32_c, aom_idct16_c }, // DCT_DCT
+ { ihalfright32_c, aom_idct16_c }, // ADST_DCT
+ { aom_idct32_c, aom_iadst16_c }, // DCT_ADST
+ { ihalfright32_c, aom_iadst16_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { ihalfright32_c, aom_idct16_c }, // FLIPADST_DCT
+ { aom_idct32_c, aom_iadst16_c }, // DCT_FLIPADST
+ { ihalfright32_c, aom_iadst16_c }, // FLIPADST_FLIPADST
+ { ihalfright32_c, aom_iadst16_c }, // ADST_FLIPADST
+ { ihalfright32_c, aom_iadst16_c }, // FLIPADST_ADST
+ { iidtx32_c, iidtx16_c }, // IDTX
+ { aom_idct32_c, iidtx16_c }, // V_DCT
+ { iidtx32_c, aom_idct16_c }, // H_DCT
+ { ihalfright32_c, iidtx16_c }, // V_ADST
+ { iidtx32_c, aom_iadst16_c }, // H_ADST
+ { ihalfright32_c, iidtx16_c }, // V_FLIPADST
+ { iidtx32_c, aom_iadst16_c }, // H_FLIPADST
+#endif
+ };
+
+ const int n = 16;
+ const int n2 = 32;
+ int i, j;
+ tran_low_t out[16][32], outtmp[16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n2; ++i) {
+ IHT_16x32[tx_type].rows(input, outtmp);
+ for (j = 0; j < n; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ IHT_16x32[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+
+void av1_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_32x16[] = {
+ { aom_idct16_c, aom_idct32_c }, // DCT_DCT
+ { aom_iadst16_c, aom_idct32_c }, // ADST_DCT
+ { aom_idct16_c, ihalfright32_c }, // DCT_ADST
+ { aom_iadst16_c, ihalfright32_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst16_c, aom_idct32_c }, // FLIPADST_DCT
+ { aom_idct16_c, ihalfright32_c }, // DCT_FLIPADST
+ { aom_iadst16_c, ihalfright32_c }, // FLIPADST_FLIPADST
+ { aom_iadst16_c, ihalfright32_c }, // ADST_FLIPADST
+ { aom_iadst16_c, ihalfright32_c }, // FLIPADST_ADST
+ { iidtx16_c, iidtx32_c }, // IDTX
+ { aom_idct16_c, iidtx32_c }, // V_DCT
+ { iidtx16_c, aom_idct32_c }, // H_DCT
+ { aom_iadst16_c, iidtx32_c }, // V_ADST
+ { iidtx16_c, ihalfright32_c }, // H_ADST
+ { aom_iadst16_c, iidtx32_c }, // V_FLIPADST
+ { iidtx16_c, ihalfright32_c }, // H_FLIPADST
+#endif
+ };
+ const int n = 16;
+ const int n2 = 32;
+
+ int i, j;
+ tran_low_t out[32][16], outtmp[32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n; ++i) {
+ IHT_32x16[tx_type].rows(input, outtmp);
+ for (j = 0; j < n2; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ IHT_32x16[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+
+void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_8[] = {
+ { aom_idct8_c, aom_idct8_c }, // DCT_DCT = 0
+ { aom_iadst8_c, aom_idct8_c }, // ADST_DCT = 1
+ { aom_idct8_c, aom_iadst8_c }, // DCT_ADST = 2
+ { aom_iadst8_c, aom_iadst8_c }, // ADST_ADST = 3
+#if CONFIG_EXT_TX
+ { aom_iadst8_c, aom_idct8_c }, // FLIPADST_DCT
+ { aom_idct8_c, aom_iadst8_c }, // DCT_FLIPADST
+ { aom_iadst8_c, aom_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_iadst8_c, aom_iadst8_c }, // ADST_FLIPADST
+ { aom_iadst8_c, aom_iadst8_c }, // FLIPADST_ADST
+ { iidtx8_c, iidtx8_c }, // IDTX
+ { aom_idct8_c, iidtx8_c }, // V_DCT
+ { iidtx8_c, aom_idct8_c }, // H_DCT
+ { aom_iadst8_c, iidtx8_c }, // V_ADST
+ { iidtx8_c, aom_iadst8_c }, // H_ADST
+ { aom_iadst8_c, iidtx8_c }, // V_FLIPADST
+ { iidtx8_c, aom_iadst8_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[8][8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 8;
+
+ // inverse transform row vectors
+ for (i = 0; i < 8; ++i) {
+ IHT_8[tx_type].rows(input, out[i]);
+ input += 8;
+ }
+
+ // transpose
+ for (i = 1; i < 8; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 8; ++i) {
+ IHT_8[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8, 8);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ }
+ }
+}
+
+void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_16[] = {
+ { aom_idct16_c, aom_idct16_c }, // DCT_DCT = 0
+ { aom_iadst16_c, aom_idct16_c }, // ADST_DCT = 1
+ { aom_idct16_c, aom_iadst16_c }, // DCT_ADST = 2
+ { aom_iadst16_c, aom_iadst16_c }, // ADST_ADST = 3
+#if CONFIG_EXT_TX
+ { aom_iadst16_c, aom_idct16_c }, // FLIPADST_DCT
+ { aom_idct16_c, aom_iadst16_c }, // DCT_FLIPADST
+ { aom_iadst16_c, aom_iadst16_c }, // FLIPADST_FLIPADST
+ { aom_iadst16_c, aom_iadst16_c }, // ADST_FLIPADST
+ { aom_iadst16_c, aom_iadst16_c }, // FLIPADST_ADST
+ { iidtx16_c, iidtx16_c }, // IDTX
+ { aom_idct16_c, iidtx16_c }, // V_DCT
+ { iidtx16_c, aom_idct16_c }, // H_DCT
+ { aom_iadst16_c, iidtx16_c }, // V_ADST
+ { iidtx16_c, aom_iadst16_c }, // H_ADST
+ { aom_iadst16_c, iidtx16_c }, // V_FLIPADST
+ { iidtx16_c, aom_iadst16_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[16][16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 16;
+
+ // inverse transform row vectors
+ for (i = 0; i < 16; ++i) {
+ IHT_16[tx_type].rows(input, out[i]);
+ input += 16;
+ }
+
+ // transpose
+ for (i = 1; i < 16; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 16; ++i) {
+ IHT_16[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16, 16);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+
+#if CONFIG_EXT_TX
+void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_32[] = {
+ { aom_idct32_c, aom_idct32_c }, // DCT_DCT
+ { ihalfright32_c, aom_idct32_c }, // ADST_DCT
+ { aom_idct32_c, ihalfright32_c }, // DCT_ADST
+ { ihalfright32_c, ihalfright32_c }, // ADST_ADST
+ { ihalfright32_c, aom_idct32_c }, // FLIPADST_DCT
+ { aom_idct32_c, ihalfright32_c }, // DCT_FLIPADST
+ { ihalfright32_c, ihalfright32_c }, // FLIPADST_FLIPADST
+ { ihalfright32_c, ihalfright32_c }, // ADST_FLIPADST
+ { ihalfright32_c, ihalfright32_c }, // FLIPADST_ADST
+ { iidtx32_c, iidtx32_c }, // IDTX
+ { aom_idct32_c, iidtx32_c }, // V_DCT
+ { iidtx32_c, aom_idct32_c }, // H_DCT
+ { ihalfright32_c, iidtx32_c }, // V_ADST
+ { iidtx32_c, ihalfright32_c }, // H_ADST
+ { ihalfright32_c, iidtx32_c }, // V_FLIPADST
+ { iidtx32_c, ihalfright32_c }, // H_FLIPADST
+ };
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[32][32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 32;
+
+ // inverse transform row vectors
+ for (i = 0; i < 32; ++i) {
+ IHT_32[tx_type].rows(input, out[i]);
+ input += 32;
+ }
+
+ // transpose
+ for (i = 1; i < 32; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 32; ++i) {
+ IHT_32[tx_type].cols(out[i], out[i]);
+ }
+
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32, 32);
+
+ // Sum with the destination
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_TX64X64
+void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_64[] = {
+ { idct64_col_c, idct64_row_c }, // DCT_DCT
+ { ihalfright64_c, idct64_row_c }, // ADST_DCT
+ { idct64_col_c, ihalfright64_c }, // DCT_ADST
+ { ihalfright64_c, ihalfright64_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { ihalfright64_c, idct64_row_c }, // FLIPADST_DCT
+ { idct64_col_c, ihalfright64_c }, // DCT_FLIPADST
+ { ihalfright64_c, ihalfright64_c }, // FLIPADST_FLIPADST
+ { ihalfright64_c, ihalfright64_c }, // ADST_FLIPADST
+ { ihalfright64_c, ihalfright64_c }, // FLIPADST_ADST
+ { iidtx64_c, iidtx64_c }, // IDTX
+ { idct64_col_c, iidtx64_c }, // V_DCT
+ { iidtx64_c, idct64_row_c }, // H_DCT
+ { ihalfright64_c, iidtx64_c }, // V_ADST
+ { iidtx64_c, ihalfright64_c }, // H_ADST
+ { ihalfright64_c, iidtx64_c }, // V_FLIPADST
+ { iidtx64_c, ihalfright64_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[64][64];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 64;
+
+ // inverse transform row vectors
+ for (i = 0; i < 64; ++i) {
+ IHT_64[tx_type].rows(input, out[i]);
+ for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
+ input += 64;
+ }
+
+ // transpose
+ for (i = 1; i < 64; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 64; ++i) {
+ IHT_64[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ }
+ }
+}
+#endif // CONFIG_TX64X64
+
+// idct
+void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
+ if (eob > 1)
+ aom_idct4x4_16_add(input, dest, stride);
+ else
+ aom_idct4x4_1_add(input, dest, stride);
+}
+
+void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
+ if (eob > 1)
+ aom_iwht4x4_16_add(input, dest, stride);
+ else
+ aom_iwht4x4_1_add(input, dest, stride);
+}
+
+static void idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
+  // If eob is 1, then input[0] is the reconstructed value and no
+  // dequantization is needed. Also, when only the DC coefficient is present,
+  // it is counted in eobs, namely eobs >= 1.
+
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): The "eobs == 1" case is also handled in
+  // av1_short_idct8x8_c. Combine it with the code here.
+ if (eob == 1)
+ // DC only DCT coefficient
+ aom_idct8x8_1_add(input, dest, stride);
+#if !CONFIG_ADAPT_SCAN
+ else if (eob <= 12)
+ aom_idct8x8_12_add(input, dest, stride);
+#endif
+ else
+ aom_idct8x8_64_add(input, dest, stride);
+}
+
+static void idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
+ /* The calculation can be simplified if there are not many non-zero dct
+ * coefficients. Use eobs to separate different cases. */
+ if (eob == 1) /* DC only DCT coefficient. */
+ aom_idct16x16_1_add(input, dest, stride);
+#if !CONFIG_ADAPT_SCAN
+ else if (eob <= 10)
+ aom_idct16x16_10_add(input, dest, stride);
+#endif
+ else
+ aom_idct16x16_256_add(input, dest, stride);
+}
+
+static void idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
+ if (eob == 1) aom_idct32x32_1_add(input, dest, stride);
+#if !CONFIG_ADAPT_SCAN
+ else if (eob <= 34)
+ // non-zero coeff only in upper-left 8x8
+ aom_idct32x32_34_add(input, dest, stride);
+#endif
+ else
+ aom_idct32x32_1024_add(input, dest, stride);
+}
+
+#if CONFIG_TX64X64
+static void idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
+ (void)eob;
+ av1_iht64x64_4096_add(input, dest, stride, DCT_DCT);
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_CB4X4
+static void inv_txfm_add_2x2(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type, int lossless) {
+ tran_high_t a1 = input[0] >> UNIT_QUANT_SHIFT;
+ tran_high_t b1 = input[1] >> UNIT_QUANT_SHIFT;
+ tran_high_t c1 = input[2] >> UNIT_QUANT_SHIFT;
+ tran_high_t d1 = input[3] >> UNIT_QUANT_SHIFT;
+
+ tran_high_t a2 = a1 + c1;
+ tran_high_t b2 = b1 + d1;
+ tran_high_t c2 = a1 - c1;
+ tran_high_t d2 = b1 - d1;
+
+ (void)tx_type;
+ (void)lossless;
+ (void)eob;
+
+ a1 = (a2 + b2) >> 2;
+ b1 = (a2 - b2) >> 2;
+ c1 = (c2 + d2) >> 2;
+ d1 = (c2 - d2) >> 2;
+
+ dest[0] = clip_pixel_add(dest[0], WRAPLOW(a1));
+ dest[1] = clip_pixel_add(dest[1], WRAPLOW(b1));
+ dest[stride] = clip_pixel_add(dest[stride], WRAPLOW(c1));
+ dest[stride + 1] = clip_pixel_add(dest[stride + 1], WRAPLOW(d1));
+}
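+
+// Regardless of tx_type, this applies a 2x2 Hadamard inverse: one vertical
+// and one horizontal butterfly stage, followed by a >> 2 normalization
+// before adding to the destination.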
+#endif
+
+void av1_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type, int lossless) {
+ if (lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_iwht4x4_add(input, dest, stride, eob);
+ return;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT: av1_idct4x4_add(input, dest, stride, eob); break;
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST: av1_iht4x4_16_add(input, dest, stride, tx_type); break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST: av1_iht4x4_16_add(input, dest, stride, tx_type); break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST only exists in C code
+ av1_iht4x4_16_add_c(input, dest, stride, tx_type);
+ break;
+ case IDTX: inv_idtx_add_c(input, dest, stride, 4, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+void av1_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht4x8_32_add(input, dest, stride, tx_type);
+}
+
+void av1_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht8x4_32_add(input, dest, stride, tx_type);
+}
+
+// These will be used by the masked-tx experiment in the future.
+#if CONFIG_MASKED_TX && 0
+static void inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht4x16_64_add(input, dest, stride, tx_type);
+}
+
+static void inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht16x4_64_add(input, dest, stride, tx_type);
+}
+
+static void inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht8x32_256_add(input, dest, stride, tx_type);
+}
+
+static void inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht32x8_256_add(input, dest, stride, tx_type);
+}
+#endif // CONFIG_MASKED_TX
+
+static void inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht8x16_128_add(input, dest, stride, tx_type);
+}
+
+static void inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht16x8_128_add(input, dest, stride, tx_type);
+}
+
+static void inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht16x32_512_add(input, dest, stride, tx_type);
+}
+
+static void inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht32x16_512_add(input, dest, stride, tx_type);
+}
+
+static void inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type) {
+ switch (tx_type) {
+ case DCT_DCT: idct8x8_add(input, dest, stride, eob); break;
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST: av1_iht8x8_64_add(input, dest, stride, tx_type); break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST: av1_iht8x8_64_add(input, dest, stride, tx_type); break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST only exists in C code
+ av1_iht8x8_64_add_c(input, dest, stride, tx_type);
+ break;
+ case IDTX: inv_idtx_add_c(input, dest, stride, 8, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+static void inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, TX_TYPE tx_type) {
+ switch (tx_type) {
+ case DCT_DCT: idct16x16_add(input, dest, stride, eob); break;
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST: av1_iht16x16_256_add(input, dest, stride, tx_type); break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST: av1_iht16x16_256_add(input, dest, stride, tx_type); break;
+ case IDTX: inv_idtx_add_c(input, dest, stride, 16, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+static void inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, TX_TYPE tx_type) {
+ switch (tx_type) {
+ case DCT_DCT: idct32x32_add(input, dest, stride, eob); break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ av1_iht32x32_1024_add_c(input, dest, stride, tx_type);
+ break;
+ case IDTX: inv_idtx_add_c(input, dest, stride, 32, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+#if CONFIG_TX64X64
+static void inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, TX_TYPE tx_type) {
+ switch (tx_type) {
+ case DCT_DCT: idct64x64_add(input, dest, stride, eob); break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ av1_iht64x64_4096_add_c(input, dest, stride, tx_type);
+ break;
+ case IDTX: inv_idtx_add_c(input, dest, stride, 64, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_HIGHBITDEPTH
+
+const TXFM_2D_CFG *inv_txfm_cfg_ls[TX_TYPES][TX_SIZES];
+
+typedef struct {
+ const int8_t *cos_bit;
+ const int8_t *stage_range;
+} tx_1d_cfg;
+
+typedef struct {
+ tx_1d_cfg row;
+ tx_1d_cfg col;
+} tx_2d_cfg;
+
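+// Gather the 1D cosine-constant bit widths and per-stage dynamic ranges for
+// the row and column passes of a (possibly rectangular) 2D inverse transform.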
+tx_2d_cfg inv_tx_cfg(int tx_type, int tx_size_row, int tx_size_col) {
+ const TXFM_2D_CFG *cfg_row = inv_txfm_cfg_ls[tx_type][tx_size_row];
+ const int8_t *stage_range_row = cfg_row->stage_range_row;
+ const int8_t *cos_bit_row = cfg_row->cos_bit_row;
+
+ const TXFM_2D_CFG *cfg_col = inv_txfm_cfg_ls[tx_type][tx_size_col];
+ const int8_t *stage_range_col = cfg_col->stage_range_col;
+ const int8_t *cos_bit_col = cfg_col->cos_bit_col;
+
+ tx_2d_cfg cfg = {
+ { cos_bit_row, stage_range_row }, { cos_bit_col, stage_range_col },
+ };
+ return cfg;
+}
+
+void av1_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_4[] = {
+ { highbd_idct4, highbd_idct4 }, // DCT_DCT
+ { highbd_iadst4, highbd_idct4 }, // ADST_DCT
+ { highbd_idct4, highbd_iadst4 }, // DCT_ADST
+ { highbd_iadst4, highbd_iadst4 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_iadst4, highbd_idct4 }, // FLIPADST_DCT
+ { highbd_idct4, highbd_iadst4 }, // DCT_FLIPADST
+ { highbd_iadst4, highbd_iadst4 }, // FLIPADST_FLIPADST
+ { highbd_iadst4, highbd_iadst4 }, // ADST_FLIPADST
+ { highbd_iadst4, highbd_iadst4 }, // FLIPADST_ADST
+ { highbd_iidtx4_c, highbd_iidtx4_c }, // IDTX
+ { highbd_idct4, highbd_iidtx4_c }, // V_DCT
+ { highbd_iidtx4_c, highbd_idct4 }, // H_DCT
+ { highbd_iadst4, highbd_iidtx4_c }, // V_ADST
+ { highbd_iidtx4_c, highbd_iadst4 }, // H_ADST
+ { highbd_iadst4, highbd_iidtx4_c }, // V_FLIPADST
+ { highbd_iidtx4_c, highbd_iadst4 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[4][4];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 4;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_4X4, TX_4X4);
+
+ // inverse transform row vectors
+ for (i = 0; i < 4; ++i) {
+ HIGH_IHT_4[tx_type].rows(input, out[i], cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ input += 4;
+ }
+
+ // transpose
+ for (i = 1; i < 4; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 4; ++i) {
+ HIGH_IHT_4[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 4, 4);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4), bd);
+ }
+ }
+}
+
+void av1_highbd_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_4x8[] = {
+ { highbd_idct8, highbd_idct4 }, // DCT_DCT
+ { highbd_iadst8, highbd_idct4 }, // ADST_DCT
+ { highbd_idct8, highbd_iadst4 }, // DCT_ADST
+ { highbd_iadst8, highbd_iadst4 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_iadst8, highbd_idct4 }, // FLIPADST_DCT
+ { highbd_idct8, highbd_iadst4 }, // DCT_FLIPADST
+ { highbd_iadst8, highbd_iadst4 }, // FLIPADST_FLIPADST
+ { highbd_iadst8, highbd_iadst4 }, // ADST_FLIPADST
+ { highbd_iadst8, highbd_iadst4 }, // FLIPADST_ADST
+ { highbd_iidtx8_c, highbd_iidtx4_c }, // IDTX
+ { highbd_idct8, highbd_iidtx4_c }, // V_DCT
+ { highbd_iidtx8_c, highbd_idct4 }, // H_DCT
+ { highbd_iadst8, highbd_iidtx4_c }, // V_ADST
+ { highbd_iidtx8_c, highbd_iadst4 }, // H_ADST
+ { highbd_iadst8, highbd_iidtx4_c }, // V_FLIPADST
+ { highbd_iidtx8_c, highbd_iadst4 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 4;
+ const int n2 = 8;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[4][8], outtmp[4];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_4X4, TX_8X8);
+
+ // inverse transform row vectors, and transpose
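+  // (The Sqrt2 factor below rescales the intermediate values so that the
+  // rectangular transform keeps the same overall gain as the square ones.)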
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_4x8[tx_type].rows(input, outtmp, cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ for (j = 0; j < n; ++j) {
+ out[j][i] = HIGHBD_WRAPLOW(dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ }
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_4x8[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+ }
+ }
+}
+
+void av1_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_8x4[] = {
+ { highbd_idct4, highbd_idct8 }, // DCT_DCT
+ { highbd_iadst4, highbd_idct8 }, // ADST_DCT
+ { highbd_idct4, highbd_iadst8 }, // DCT_ADST
+ { highbd_iadst4, highbd_iadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_iadst4, highbd_idct8 }, // FLIPADST_DCT
+ { highbd_idct4, highbd_iadst8 }, // DCT_FLIPADST
+ { highbd_iadst4, highbd_iadst8 }, // FLIPADST_FLIPADST
+ { highbd_iadst4, highbd_iadst8 }, // ADST_FLIPADST
+ { highbd_iadst4, highbd_iadst8 }, // FLIPADST_ADST
+ { highbd_iidtx4_c, highbd_iidtx8_c }, // IDTX
+ { highbd_idct4, highbd_iidtx8_c }, // V_DCT
+ { highbd_iidtx4_c, highbd_idct8 }, // H_DCT
+ { highbd_iadst4, highbd_iidtx8_c }, // V_ADST
+ { highbd_iidtx4_c, highbd_iadst8 }, // H_ADST
+ { highbd_iadst4, highbd_iidtx8_c }, // V_FLIPADST
+ { highbd_iidtx4_c, highbd_iadst8 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 4;
+ const int n2 = 8;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[8][4], outtmp[8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_8X8, TX_4X4);
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_8x4[tx_type].rows(input, outtmp, cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ for (j = 0; j < n2; ++j) {
+ out[j][i] = HIGHBD_WRAPLOW(dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ }
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_8x4[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+ }
+ }
+}
+
+void av1_highbd_iht4x16_64_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_4x16[] = {
+ { highbd_idct16, highbd_idct4 }, // DCT_DCT
+ { highbd_iadst16, highbd_idct4 }, // ADST_DCT
+ { highbd_idct16, highbd_iadst4 }, // DCT_ADST
+ { highbd_iadst16, highbd_iadst4 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_iadst16, highbd_idct4 }, // FLIPADST_DCT
+ { highbd_idct16, highbd_iadst4 }, // DCT_FLIPADST
+ { highbd_iadst16, highbd_iadst4 }, // FLIPADST_FLIPADST
+ { highbd_iadst16, highbd_iadst4 }, // ADST_FLIPADST
+ { highbd_iadst16, highbd_iadst4 }, // FLIPADST_ADST
+ { highbd_iidtx16_c, highbd_iidtx4_c }, // IDTX
+ { highbd_idct16, highbd_iidtx4_c }, // V_DCT
+ { highbd_iidtx16_c, highbd_idct4 }, // H_DCT
+ { highbd_iadst16, highbd_iidtx4_c }, // V_ADST
+ { highbd_iidtx16_c, highbd_iadst4 }, // H_ADST
+ { highbd_iadst16, highbd_iidtx4_c }, // V_FLIPADST
+ { highbd_iidtx16_c, highbd_iadst4 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 4;
+ const int n4 = 16;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[4][16], outtmp[4];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n4;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_4X4, TX_16X16);
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n4; ++i) {
+ HIGH_IHT_4x16[tx_type].rows(input, outtmp, cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ for (j = 0; j < n; ++j) out[j][i] = outtmp[j];
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i)
+ HIGH_IHT_4x16[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n4, n);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+ }
+ }
+}
+
+void av1_highbd_iht16x4_64_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_16x4[] = {
+ { highbd_idct4, highbd_idct16 }, // DCT_DCT
+ { highbd_iadst4, highbd_idct16 }, // ADST_DCT
+ { highbd_idct4, highbd_iadst16 }, // DCT_ADST
+ { highbd_iadst4, highbd_iadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_iadst4, highbd_idct16 }, // FLIPADST_DCT
+ { highbd_idct4, highbd_iadst16 }, // DCT_FLIPADST
+ { highbd_iadst4, highbd_iadst16 }, // FLIPADST_FLIPADST
+ { highbd_iadst4, highbd_iadst16 }, // ADST_FLIPADST
+ { highbd_iadst4, highbd_iadst16 }, // FLIPADST_ADST
+ { highbd_iidtx4_c, highbd_iidtx16_c }, // IDTX
+ { highbd_idct4, highbd_iidtx16_c }, // V_DCT
+ { highbd_iidtx4_c, highbd_idct16 }, // H_DCT
+ { highbd_iadst4, highbd_iidtx16_c }, // V_ADST
+ { highbd_iidtx4_c, highbd_iadst16 }, // H_ADST
+ { highbd_iadst4, highbd_iidtx16_c }, // V_FLIPADST
+ { highbd_iidtx4_c, highbd_iadst16 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 4;
+ const int n4 = 16;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[16][4], outtmp[16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_16X16, TX_4X4);
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_16x4[tx_type].rows(input, outtmp, cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ for (j = 0; j < n4; ++j) out[j][i] = outtmp[j];
+ input += n4;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n4; ++i) {
+ HIGH_IHT_16x4[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n, n4);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+ }
+ }
+}
+
+void av1_highbd_iht8x16_128_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_8x16[] = {
+ { highbd_idct16, highbd_idct8 }, // DCT_DCT
+ { highbd_iadst16, highbd_idct8 }, // ADST_DCT
+ { highbd_idct16, highbd_iadst8 }, // DCT_ADST
+ { highbd_iadst16, highbd_iadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_iadst16, highbd_idct8 }, // FLIPADST_DCT
+ { highbd_idct16, highbd_iadst8 }, // DCT_FLIPADST
+ { highbd_iadst16, highbd_iadst8 }, // FLIPADST_FLIPADST
+ { highbd_iadst16, highbd_iadst8 }, // ADST_FLIPADST
+ { highbd_iadst16, highbd_iadst8 }, // FLIPADST_ADST
+ { highbd_iidtx16_c, highbd_iidtx8_c }, // IDTX
+ { highbd_idct16, highbd_iidtx8_c }, // V_DCT
+ { highbd_iidtx16_c, highbd_idct8 }, // H_DCT
+ { highbd_iadst16, highbd_iidtx8_c }, // V_ADST
+ { highbd_iidtx16_c, highbd_iadst8 }, // H_ADST
+ { highbd_iadst16, highbd_iidtx8_c }, // V_FLIPADST
+ { highbd_iidtx16_c, highbd_iadst8 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 8;
+ const int n2 = 16;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[8][16], outtmp[8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_8X8, TX_16X16);
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_8x16[tx_type].rows(input, outtmp, cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ for (j = 0; j < n; ++j)
+ out[j][i] = HIGHBD_WRAPLOW(dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_8x16[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+
+void av1_highbd_iht16x8_128_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_16x8[] = {
+ { highbd_idct8, highbd_idct16 }, // DCT_DCT
+ { highbd_iadst8, highbd_idct16 }, // ADST_DCT
+ { highbd_idct8, highbd_iadst16 }, // DCT_ADST
+ { highbd_iadst8, highbd_iadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_iadst8, highbd_idct16 }, // FLIPADST_DCT
+ { highbd_idct8, highbd_iadst16 }, // DCT_FLIPADST
+ { highbd_iadst8, highbd_iadst16 }, // FLIPADST_FLIPADST
+ { highbd_iadst8, highbd_iadst16 }, // ADST_FLIPADST
+ { highbd_iadst8, highbd_iadst16 }, // FLIPADST_ADST
+ { highbd_iidtx8_c, highbd_iidtx16_c }, // IDTX
+ { highbd_idct8, highbd_iidtx16_c }, // V_DCT
+ { highbd_iidtx8_c, highbd_idct16 }, // H_DCT
+ { highbd_iadst8, highbd_iidtx16_c }, // V_ADST
+ { highbd_iidtx8_c, highbd_iadst16 }, // H_ADST
+ { highbd_iadst8, highbd_iidtx16_c }, // V_FLIPADST
+ { highbd_iidtx8_c, highbd_iadst16 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 8;
+ const int n2 = 16;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[16][8], outtmp[16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_16X16, TX_8X8);
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_16x8[tx_type].rows(input, outtmp, cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ for (j = 0; j < n2; ++j)
+ out[j][i] = HIGHBD_WRAPLOW(dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_16x8[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+
+void av1_highbd_iht8x32_256_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_8x32[] = {
+ { highbd_idct32, highbd_idct8 }, // DCT_DCT
+ { highbd_ihalfright32_c, highbd_idct8 }, // ADST_DCT
+ { highbd_idct32, highbd_iadst8 }, // DCT_ADST
+ { highbd_ihalfright32_c, highbd_iadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_ihalfright32_c, highbd_idct8 }, // FLIPADST_DCT
+ { highbd_idct32, highbd_iadst8 }, // DCT_FLIPADST
+ { highbd_ihalfright32_c, highbd_iadst8 }, // FLIPADST_FLIPADST
+ { highbd_ihalfright32_c, highbd_iadst8 }, // ADST_FLIPADST
+ { highbd_ihalfright32_c, highbd_iadst8 }, // FLIPADST_ADST
+ { highbd_iidtx32_c, highbd_iidtx8_c }, // IDTX
+ { highbd_idct32, highbd_iidtx8_c }, // V_DCT
+ { highbd_iidtx32_c, highbd_idct8 }, // H_DCT
+ { highbd_ihalfright32_c, highbd_iidtx8_c }, // V_ADST
+ { highbd_iidtx32_c, highbd_iadst8 }, // H_ADST
+ { highbd_ihalfright32_c, highbd_iidtx8_c }, // V_FLIPADST
+ { highbd_iidtx32_c, highbd_iadst8 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 8;
+ const int n4 = 32;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[8][32], outtmp[8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n4;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_8X8, TX_32X32);
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n4; ++i) {
+ HIGH_IHT_8x32[tx_type].rows(input, outtmp, cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ for (j = 0; j < n; ++j) out[j][i] = outtmp[j];
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i)
+ HIGH_IHT_8x32[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n4, n);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+
+void av1_highbd_iht32x8_256_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_32x8[] = {
+ { highbd_idct8, highbd_idct32 }, // DCT_DCT
+ { highbd_iadst8, highbd_idct32 }, // ADST_DCT
+ { highbd_idct8, highbd_ihalfright32_c }, // DCT_ADST
+ { highbd_iadst8, highbd_ihalfright32_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_iadst8, highbd_idct32 }, // FLIPADST_DCT
+ { highbd_idct8, highbd_ihalfright32_c }, // DCT_FLIPADST
+ { highbd_iadst8, highbd_ihalfright32_c }, // FLIPADST_FLIPADST
+ { highbd_iadst8, highbd_ihalfright32_c }, // ADST_FLIPADST
+ { highbd_iadst8, highbd_ihalfright32_c }, // FLIPADST_ADST
+ { highbd_iidtx8_c, highbd_iidtx32_c }, // IDTX
+ { highbd_idct8, highbd_iidtx32_c }, // V_DCT
+ { highbd_iidtx8_c, highbd_idct32 }, // H_DCT
+ { highbd_iadst8, highbd_iidtx32_c }, // V_ADST
+ { highbd_iidtx8_c, highbd_ihalfright32_c }, // H_ADST
+ { highbd_iadst8, highbd_iidtx32_c }, // V_FLIPADST
+ { highbd_iidtx8_c, highbd_ihalfright32_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 8;
+ const int n4 = 32;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[32][8], outtmp[32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_32X32, TX_8X8);
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_32x8[tx_type].rows(input, outtmp, cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ for (j = 0; j < n4; ++j) out[j][i] = outtmp[j];
+ input += n4;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n4; ++i)
+ HIGH_IHT_32x8[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n, n4);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+
+void av1_highbd_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_16x32[] = {
+ { highbd_idct32, highbd_idct16 }, // DCT_DCT
+ { highbd_ihalfright32_c, highbd_idct16 }, // ADST_DCT
+ { highbd_idct32, highbd_iadst16 }, // DCT_ADST
+ { highbd_ihalfright32_c, highbd_iadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_ihalfright32_c, highbd_idct16 }, // FLIPADST_DCT
+ { highbd_idct32, highbd_iadst16 }, // DCT_FLIPADST
+ { highbd_ihalfright32_c, highbd_iadst16 }, // FLIPADST_FLIPADST
+ { highbd_ihalfright32_c, highbd_iadst16 }, // ADST_FLIPADST
+ { highbd_ihalfright32_c, highbd_iadst16 }, // FLIPADST_ADST
+ { highbd_iidtx32_c, highbd_iidtx16_c }, // IDTX
+ { highbd_idct32, highbd_iidtx16_c }, // V_DCT
+ { highbd_iidtx32_c, highbd_idct16 }, // H_DCT
+ { highbd_ihalfright32_c, highbd_iidtx16_c }, // V_ADST
+ { highbd_iidtx32_c, highbd_iadst16 }, // H_ADST
+ { highbd_ihalfright32_c, highbd_iidtx16_c }, // V_FLIPADST
+ { highbd_iidtx32_c, highbd_iadst16 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 16;
+ const int n2 = 32;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[16][32], outtmp[16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_16X16, TX_32X32);
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_16x32[tx_type].rows(input, outtmp, cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ for (j = 0; j < n; ++j)
+ out[j][i] = HIGHBD_WRAPLOW(dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_16x32[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+
+void av1_highbd_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_32x16[] = {
+ { highbd_idct16, highbd_idct32 }, // DCT_DCT
+ { highbd_iadst16, highbd_idct32 }, // ADST_DCT
+ { highbd_idct16, highbd_ihalfright32_c }, // DCT_ADST
+ { highbd_iadst16, highbd_ihalfright32_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_iadst16, highbd_idct32 }, // FLIPADST_DCT
+ { highbd_idct16, highbd_ihalfright32_c }, // DCT_FLIPADST
+ { highbd_iadst16, highbd_ihalfright32_c }, // FLIPADST_FLIPADST
+ { highbd_iadst16, highbd_ihalfright32_c }, // ADST_FLIPADST
+ { highbd_iadst16, highbd_ihalfright32_c }, // FLIPADST_ADST
+ { highbd_iidtx16_c, highbd_iidtx32_c }, // IDTX
+ { highbd_idct16, highbd_iidtx32_c }, // V_DCT
+ { highbd_iidtx16_c, highbd_idct32 }, // H_DCT
+ { highbd_iadst16, highbd_iidtx32_c }, // V_ADST
+ { highbd_iidtx16_c, highbd_ihalfright32_c }, // H_ADST
+ { highbd_iadst16, highbd_iidtx32_c }, // V_FLIPADST
+ { highbd_iidtx16_c, highbd_ihalfright32_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 16;
+ const int n2 = 32;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[32][16], outtmp[32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_32X32, TX_16X16);
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_32x16[tx_type].rows(input, outtmp, cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ for (j = 0; j < n2; ++j)
+ out[j][i] = HIGHBD_WRAPLOW(dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_32x16[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+
+void av1_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_8[] = {
+ { highbd_idct8, highbd_idct8 }, // DCT_DCT
+ { highbd_iadst8, highbd_idct8 }, // ADST_DCT
+ { highbd_idct8, highbd_iadst8 }, // DCT_ADST
+ { highbd_iadst8, highbd_iadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_iadst8, highbd_idct8 }, // FLIPADST_DCT
+ { highbd_idct8, highbd_iadst8 }, // DCT_FLIPADST
+ { highbd_iadst8, highbd_iadst8 }, // FLIPADST_FLIPADST
+ { highbd_iadst8, highbd_iadst8 }, // ADST_FLIPADST
+ { highbd_iadst8, highbd_iadst8 }, // FLIPADST_ADST
+ { highbd_iidtx8_c, highbd_iidtx8_c }, // IDTX
+ { highbd_idct8, highbd_iidtx8_c }, // V_DCT
+ { highbd_iidtx8_c, highbd_idct8 }, // H_DCT
+ { highbd_iadst8, highbd_iidtx8_c }, // V_ADST
+ { highbd_iidtx8_c, highbd_iadst8 }, // H_ADST
+ { highbd_iadst8, highbd_iidtx8_c }, // V_FLIPADST
+ { highbd_iidtx8_c, highbd_iadst8 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[8][8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 8;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_8X8, TX_8X8);
+
+ // inverse transform row vectors
+ for (i = 0; i < 8; ++i) {
+ HIGH_IHT_8[tx_type].rows(input, out[i], cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ input += 8;
+ }
+
+ // transpose
+ for (i = 1; i < 8; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 8; ++i) {
+ HIGH_IHT_8[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 8, 8);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+ }
+ }
+}
+
+void av1_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_16[] = {
+ { highbd_idct16, highbd_idct16 }, // DCT_DCT
+ { highbd_iadst16, highbd_idct16 }, // ADST_DCT
+ { highbd_idct16, highbd_iadst16 }, // DCT_ADST
+ { highbd_iadst16, highbd_iadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_iadst16, highbd_idct16 }, // FLIPADST_DCT
+ { highbd_idct16, highbd_iadst16 }, // DCT_FLIPADST
+ { highbd_iadst16, highbd_iadst16 }, // FLIPADST_FLIPADST
+ { highbd_iadst16, highbd_iadst16 }, // ADST_FLIPADST
+ { highbd_iadst16, highbd_iadst16 }, // FLIPADST_ADST
+ { highbd_iidtx16_c, highbd_iidtx16_c }, // IDTX
+ { highbd_idct16, highbd_iidtx16_c }, // V_DCT
+ { highbd_iidtx16_c, highbd_idct16 }, // H_DCT
+ { highbd_iadst16, highbd_iidtx16_c }, // V_ADST
+ { highbd_iidtx16_c, highbd_iadst16 }, // H_ADST
+ { highbd_iadst16, highbd_iidtx16_c }, // V_FLIPADST
+ { highbd_iidtx16_c, highbd_iadst16 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[16][16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 16;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_16X16, TX_16X16);
+
+ // inverse transform row vectors
+ for (i = 0; i < 16; ++i) {
+ HIGH_IHT_16[tx_type].rows(input, out[i], cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ input += 16;
+ }
+
+ // transpose
+ for (i = 1; i < 16; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 16; ++i) {
+ HIGH_IHT_16[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 16, 16);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+
+#if CONFIG_EXT_TX
+static void highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_32[] = {
+ { highbd_idct32, highbd_idct32 }, // DCT_DCT
+ { highbd_ihalfright32_c, highbd_idct32 }, // ADST_DCT
+ { highbd_idct32, highbd_ihalfright32_c }, // DCT_ADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // ADST_ADST
+ { highbd_ihalfright32_c, highbd_idct32 }, // FLIPADST_DCT
+ { highbd_idct32, highbd_ihalfright32_c }, // DCT_FLIPADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // FLIPADST_FLIPADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // ADST_FLIPADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // FLIPADST_ADST
+ { highbd_iidtx32_c, highbd_iidtx32_c }, // IDTX
+ { highbd_idct32, highbd_iidtx32_c }, // V_DCT
+ { highbd_iidtx32_c, highbd_idct32 }, // H_DCT
+ { highbd_ihalfright32_c, highbd_iidtx32_c }, // V_ADST
+ { highbd_iidtx32_c, highbd_ihalfright32_c }, // H_ADST
+ { highbd_ihalfright32_c, highbd_iidtx32_c }, // V_FLIPADST
+ { highbd_iidtx32_c, highbd_ihalfright32_c }, // H_FLIPADST
+ };
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[32][32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 32;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_32X32, TX_32X32);
+
+ // inverse transform row vectors
+ for (i = 0; i < 32; ++i) {
+ HIGH_IHT_32[tx_type].rows(input, out[i], cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ input += 32;
+ }
+
+ // transpose
+ for (i = 1; i < 32; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 32; ++i) {
+ HIGH_IHT_32[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32, 32);
+
+ // Sum with the destination
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_TX64X64
+static void highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_64[] = {
+ { highbd_idct64_col_c, highbd_idct64_row_c }, // DCT_DCT
+ { highbd_ihalfright64_c, highbd_idct64_row_c }, // ADST_DCT
+ { highbd_idct64_col_c, highbd_ihalfright64_c }, // DCT_ADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_ihalfright64_c, highbd_idct64_row_c }, // FLIPADST_DCT
+ { highbd_idct64_col_c, highbd_ihalfright64_c }, // DCT_FLIPADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // FLIPADST_FLIPADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // ADST_FLIPADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // FLIPADST_ADST
+ { highbd_iidtx64_c, highbd_iidtx64_c }, // IDTX
+ { highbd_idct64_col_c, highbd_iidtx64_c }, // V_DCT
+ { highbd_iidtx64_c, highbd_idct64_row_c }, // H_DCT
+ { highbd_ihalfright64_c, highbd_iidtx64_c }, // V_ADST
+ { highbd_iidtx64_c, highbd_ihalfright64_c }, // H_ADST
+ { highbd_ihalfright64_c, highbd_iidtx64_c }, // V_FLIPADST
+ { highbd_iidtx64_c, highbd_ihalfright64_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[64][64];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 64;
+
+ tx_2d_cfg cfg = inv_tx_cfg(tx_type, TX_64X64, TX_64X64);
+
+ // inverse transform row vectors
+ for (i = 0; i < 64; ++i) {
+ HIGH_IHT_64[tx_type].rows(input, out[i], cfg.row.cos_bit,
+ cfg.row.stage_range, bd);
+ for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
+ input += 64;
+ }
+
+ // transpose
+ for (i = 1; i < 64; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 64; ++i) {
+    HIGH_IHT_64[tx_type].cols(out[i], out[i], cfg.col.cos_bit,
+ cfg.col.stage_range, bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+ }
+ }
+}
+#endif // CONFIG_TX64X64
+
+// idct
+void av1_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+ if (eob > 1)
+ aom_highbd_idct4x4_16_add(input, dest, stride, bd);
+ else
+ aom_highbd_idct4x4_1_add(input, dest, stride, bd);
+}
+
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+ if (eob > 1)
+ aom_highbd_iwht4x4_16_add(input, dest, stride, bd);
+ else
+ aom_highbd_iwht4x4_1_add(input, dest, stride, bd);
+}
+
+#if CONFIG_CB4X4
+static void highbd_inv_txfm_add_2x2(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type, int lossless) {
+ tran_high_t a1 = input[0] >> UNIT_QUANT_SHIFT;
+ tran_high_t b1 = input[1] >> UNIT_QUANT_SHIFT;
+ tran_high_t c1 = input[2] >> UNIT_QUANT_SHIFT;
+ tran_high_t d1 = input[3] >> UNIT_QUANT_SHIFT;
+
+ tran_high_t a2 = a1 + c1;
+ tran_high_t b2 = b1 + d1;
+ tran_high_t c2 = a1 - c1;
+ tran_high_t d2 = b1 - d1;
+
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
+
+ (void)tx_type;
+ (void)lossless;
+ (void)eob;
+
+ a1 = (a2 + b2) >> 2;
+ b1 = (a2 - b2) >> 2;
+ c1 = (c2 + d2) >> 2;
+ d1 = (c2 - d2) >> 2;
+
+ dst[0] = highbd_clip_pixel_add(dst[0], a1, bd);
+ dst[1] = highbd_clip_pixel_add(dst[1], b1, bd);
+ dst[stride] = highbd_clip_pixel_add(dst[stride], c1, bd);
+ dst[stride + 1] = highbd_clip_pixel_add(dst[stride + 1], d1, bd);
+}
+#endif
+
+void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd, TX_TYPE tx_type,
+ int lossless) {
+ if (lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+ return;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+#endif // CONFIG_EXT_TX
+ av1_inv_txfm2d_add_4x4(input, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
+#if CONFIG_EXT_TX
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST only exists in C code
+ av1_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
+ break;
+ case IDTX:
+ highbd_inv_idtx_add_c(input, dest, stride, 4, tx_type, bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd, TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht4x8_32_add_c(input, dest, stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd, TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht8x4_32_add_c(input, dest, stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht4x16_64_add_c(input, dest, stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht16x4_64_add_c(input, dest, stride, tx_type, bd);
+}
+
+static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht8x16_128_add_c(input, dest, stride, tx_type, bd);
+}
+
+static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht16x8_128_add_c(input, dest, stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht8x32_256_add_c(input, dest, stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht32x8_256_add_c(input, dest, stride, tx_type, bd);
+}
+
+static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht16x32_512_add_c(input, dest, stride, tx_type, bd);
+}
+
+static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht32x16_512_add_c(input, dest, stride, tx_type, bd);
+}
+
+static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+#endif // CONFIG_EXT_TX
+ av1_inv_txfm2d_add_8x8(input, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
+#if CONFIG_EXT_TX
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST only exists in C code
+ av1_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
+ break;
+ case IDTX:
+ highbd_inv_idtx_add_c(input, dest, stride, 8, tx_type, bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+#endif // CONFIG_EXT_TX
+ av1_inv_txfm2d_add_16x16(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+#if CONFIG_EXT_TX
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST only exists in C code
+ av1_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
+ break;
+ case IDTX:
+ highbd_inv_idtx_add_c(input, dest, stride, 16, tx_type, bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_inv_txfm2d_add_32x32(input, CONVERT_TO_SHORTPTR(dest), stride,
+ DCT_DCT, bd);
+ break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
+ break;
+ case IDTX:
+ highbd_inv_idtx_add_c(input, dest, stride, 32, tx_type, bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+#if CONFIG_TX64X64
+static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_inv_txfm2d_add_64x64(input, CONVERT_TO_SHORTPTR(dest), stride,
+ DCT_DCT, bd);
+ break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ highbd_iht64x64_4096_add_c(input, dest, stride, tx_type, bd);
+ break;
+ case IDTX:
+ highbd_inv_idtx_add_c(input, dest, stride, 64, tx_type, bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_HIGHBITDEPTH
+
+void av1_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+ INV_TXFM_PARAM *inv_txfm_param) {
+ const TX_TYPE tx_type = inv_txfm_param->tx_type;
+ const TX_SIZE tx_size = inv_txfm_param->tx_size;
+ const int eob = inv_txfm_param->eob;
+ const int lossless = inv_txfm_param->lossless;
+
+ switch (tx_size) {
+#if CONFIG_TX64X64
+ case TX_64X64: inv_txfm_add_64x64(input, dest, stride, eob, tx_type); break;
+#endif // CONFIG_TX64X64
+ case TX_32X32: inv_txfm_add_32x32(input, dest, stride, eob, tx_type); break;
+ case TX_16X16: inv_txfm_add_16x16(input, dest, stride, eob, tx_type); break;
+ case TX_8X8: inv_txfm_add_8x8(input, dest, stride, eob, tx_type); break;
+ case TX_4X8: av1_inv_txfm_add_4x8(input, dest, stride, eob, tx_type); break;
+ case TX_8X4: av1_inv_txfm_add_8x4(input, dest, stride, eob, tx_type); break;
+ case TX_8X16: inv_txfm_add_8x16(input, dest, stride, eob, tx_type); break;
+ case TX_16X8: inv_txfm_add_16x8(input, dest, stride, eob, tx_type); break;
+ case TX_16X32: inv_txfm_add_16x32(input, dest, stride, eob, tx_type); break;
+ case TX_32X16: inv_txfm_add_32x16(input, dest, stride, eob, tx_type); break;
+ case TX_4X4:
+ // this is like av1_short_idct4x4 but has a special case around eob<=1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ av1_inv_txfm_add_4x4(input, dest, stride, eob, tx_type, lossless);
+ break;
+#if CONFIG_CB4X4
+ case TX_2X2:
+ inv_txfm_add_2x2(input, dest, stride, eob, tx_type, lossless);
+ break;
+#endif
+ default: assert(0 && "Invalid transform size"); break;
+ }
+}
+
+static void init_inv_txfm_param(const MACROBLOCKD *xd, TX_SIZE tx_size,
+ TX_TYPE tx_type, int eob, INV_TXFM_PARAM *inv) {
+ inv->tx_type = tx_type;
+ inv->tx_size = tx_size;
+ inv->eob = eob;
+ inv->lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+#if CONFIG_HIGHBITDEPTH
+ inv->bd = xd->bd;
+#endif
+#if CONFIG_ADAPT_SCAN
+ inv->eob_threshold = &xd->eob_threshold_md[tx_size][tx_type][0];
+#endif
+}
+
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+ const tran_low_t *dqcoeff, TX_TYPE tx_type,
+ TX_SIZE tx_size, uint8_t *dst, int stride,
+ int eob) {
+ if (!eob) return;
+#if CONFIG_PVQ
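+  // Zero the destination block first so that the inverse-transform "add"
+  // below effectively stores the reconstruction directly.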
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const int txb_width = block_size_wide[tx_bsize];
+ const int txb_height = block_size_high[tx_bsize];
+ int r, c;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+    for (r = 0; r < txb_height; r++)
+      for (c = 0; c < txb_width; c++) dst16[r * stride + c] = 0;
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ for (r = 0; r < txb_height; r++)
+ for (c = 0; c < txb_width; c++) dst[r * stride + c] = 0;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_PVQ
+ INV_TXFM_PARAM inv_txfm_param;
+ init_inv_txfm_param(xd, tx_size, tx_type, eob, &inv_txfm_param);
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &inv_txfm_param);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ av1_inv_txfm_add(dqcoeff, dst, stride, &inv_txfm_param);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+}
+
+void av1_inverse_transform_block_facade(MACROBLOCKD *xd, int plane, int block,
+ int blk_row, int blk_col, int eob) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, dst, dst_stride,
+ eob);
+}
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+ INV_TXFM_PARAM *inv_txfm_param) {
+ const TX_TYPE tx_type = inv_txfm_param->tx_type;
+ const TX_SIZE tx_size = inv_txfm_param->tx_size;
+ const int eob = inv_txfm_param->eob;
+ const int bd = inv_txfm_param->bd;
+ const int lossless = inv_txfm_param->lossless;
+
+ switch (tx_size) {
+#if CONFIG_TX64X64
+ case TX_64X64:
+ highbd_inv_txfm_add_64x64(input, dest, stride, eob, bd, tx_type);
+ break;
+#endif // CONFIG_TX64X64
+ case TX_32X32:
+ highbd_inv_txfm_add_32x32(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_16X16:
+ highbd_inv_txfm_add_16x16(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_8X8:
+ highbd_inv_txfm_add_8x8(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_8X16:
+ highbd_inv_txfm_add_8x16(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_16X8:
+ highbd_inv_txfm_add_16x8(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_16X32:
+ highbd_inv_txfm_add_16x32(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_32X16:
+ highbd_inv_txfm_add_32x16(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_4X4:
+ // this is like av1_short_idct4x4 but has a special case around eob<=1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ av1_highbd_inv_txfm_add_4x4(input, dest, stride, eob, bd, tx_type,
+ lossless);
+ break;
+#if CONFIG_CB4X4
+ case TX_2X2:
+ highbd_inv_txfm_add_2x2(input, dest, stride, eob, bd, tx_type, lossless);
+ break;
+#endif
+ default: assert(0 && "Invalid transform size"); break;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h
new file mode 100644
index 000000000..e3a192187
--- /dev/null
+++ b/third_party/aom/av1/common/idct.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_IDCT_H_
+#define AV1_COMMON_IDCT_H_
+
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "aom_dsp/inv_txfm.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct INV_TXFM_PARAM {
+#if CONFIG_ADAPT_SCAN
+ const int16_t *eob_threshold;
+#endif
+ TX_TYPE tx_type;
+ TX_SIZE tx_size;
+ int eob;
+ int lossless;
+#if CONFIG_HIGHBITDEPTH
+ int bd;
+#endif
+} INV_TXFM_PARAM;
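+
+/* Illustrative usage sketch (not part of the library API): a caller fills
+ * the parameter struct and passes it to av1_inv_txfm_add(); the field
+ * values below are hypothetical.
+ *
+ *   INV_TXFM_PARAM p;
+ *   p.tx_type = DCT_DCT;
+ *   p.tx_size = TX_8X8;
+ *   p.eob = eob;       // number of non-zero coefficients, from the decoder
+ *   p.lossless = 0;
+ *   av1_inv_txfm_add(dqcoeff, dst, dst_stride, &p);
+ */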
+
+typedef void (*transform_1d)(const tran_low_t *, tran_low_t *);
+
+typedef struct {
+ transform_1d cols, rows; // vertical and horizontal
+} transform_2d;
+
+#if CONFIG_HIGHBITDEPTH
+typedef void (*highbd_transform_1d)(const tran_low_t *, tran_low_t *,
+ const int8_t *cos_bit,
+ const int8_t *stage_range, int bd);
+
+typedef struct {
+ highbd_transform_1d cols, rows; // vertical and horizontal
+} highbd_transform_2d;
+#endif // CONFIG_HIGHBITDEPTH
+
+#define MAX_TX_SCALE 1
+int av1_get_tx_scale(const TX_SIZE tx_size);
+
+void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
+void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
+
+void av1_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type, int lossless);
+void av1_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type);
+void av1_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type);
+void av1_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+ INV_TXFM_PARAM *inv_txfm_param);
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+ const tran_low_t *dqcoeff, TX_TYPE tx_type,
+ TX_SIZE tx_size, uint8_t *dst, int stride,
+ int eob);
+void av1_inverse_transform_block_facade(MACROBLOCKD *xd, int plane, int block,
+ int blk_row, int blk_col, int eob);
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void av1_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd, TX_TYPE tx_type,
+ int lossless);
+void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd, TX_TYPE tx_type);
+void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd, TX_TYPE tx_type);
+void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest,
+                             int stride, INV_TXFM_PARAM *inv_txfm_param);
+#endif // CONFIG_HIGHBITDEPTH
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_IDCT_H_
diff --git a/third_party/aom/av1/common/laplace_tables.c b/third_party/aom/av1/common/laplace_tables.c
new file mode 100644
index 000000000..ab8784895
--- /dev/null
+++ b/third_party/aom/av1/common/laplace_tables.c
@@ -0,0 +1,657 @@
+/* This file is auto-generated using "gen_laplace_tables 128 7" */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "aom_dsp/prob.h"
+#include "pvq.h"
+
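+/* 16-symbol inverse CDFs (AOM_ICDF format) for exponentially decaying
+   (Laplace) coefficient distributions, one row per quantized decay rate. */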
+const uint16_t EXP_CDF_TABLE[128][16] = {
+ {AOM_ICDF(32753), AOM_ICDF(32754), AOM_ICDF(32755), AOM_ICDF(32756),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(32499), AOM_ICDF(32753), AOM_ICDF(32755), AOM_ICDF(32756),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(32243), AOM_ICDF(32747), AOM_ICDF(32755), AOM_ICDF(32756),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(31987), AOM_ICDF(32737), AOM_ICDF(32755), AOM_ICDF(32756),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(31732), AOM_ICDF(32724), AOM_ICDF(32755), AOM_ICDF(32756),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(31476), AOM_ICDF(32706), AOM_ICDF(32754), AOM_ICDF(32756),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(31220), AOM_ICDF(32684), AOM_ICDF(32753), AOM_ICDF(32756),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(30964), AOM_ICDF(32658), AOM_ICDF(32751), AOM_ICDF(32756),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(30708), AOM_ICDF(32628), AOM_ICDF(32748), AOM_ICDF(32756),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(30452), AOM_ICDF(32594), AOM_ICDF(32745), AOM_ICDF(32756),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(30198), AOM_ICDF(32558), AOM_ICDF(32742), AOM_ICDF(32756),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(29941), AOM_ICDF(32515), AOM_ICDF(32736), AOM_ICDF(32755),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(29686), AOM_ICDF(32470), AOM_ICDF(32731), AOM_ICDF(32755),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(29429), AOM_ICDF(32419), AOM_ICDF(32723), AOM_ICDF(32754),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(29174), AOM_ICDF(32366), AOM_ICDF(32715), AOM_ICDF(32753),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(28918), AOM_ICDF(32308), AOM_ICDF(32705), AOM_ICDF(32752),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(28662), AOM_ICDF(32246), AOM_ICDF(32694), AOM_ICDF(32750),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(28406), AOM_ICDF(32180), AOM_ICDF(32681), AOM_ICDF(32748),
+ AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(28150), AOM_ICDF(32110), AOM_ICDF(32667), AOM_ICDF(32745),
+ AOM_ICDF(32756), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(27894), AOM_ICDF(32036), AOM_ICDF(32651), AOM_ICDF(32742),
+ AOM_ICDF(32756), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(27639), AOM_ICDF(31959), AOM_ICDF(32634), AOM_ICDF(32739),
+ AOM_ICDF(32755), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(27383), AOM_ICDF(31877), AOM_ICDF(32614), AOM_ICDF(32735),
+ AOM_ICDF(32755), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(27126), AOM_ICDF(31790), AOM_ICDF(32592), AOM_ICDF(32730),
+ AOM_ICDF(32754), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(26871), AOM_ICDF(31701), AOM_ICDF(32569), AOM_ICDF(32725),
+ AOM_ICDF(32753), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(26615), AOM_ICDF(31607), AOM_ICDF(32543), AOM_ICDF(32719),
+ AOM_ICDF(32752), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(26361), AOM_ICDF(31511), AOM_ICDF(32517), AOM_ICDF(32713),
+ AOM_ICDF(32751), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(26104), AOM_ICDF(31408), AOM_ICDF(32485), AOM_ICDF(32704),
+ AOM_ICDF(32748), AOM_ICDF(32757), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(25848), AOM_ICDF(31302), AOM_ICDF(32452), AOM_ICDF(32695),
+ AOM_ICDF(32746), AOM_ICDF(32757), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(25591), AOM_ICDF(31191), AOM_ICDF(32416), AOM_ICDF(32684),
+ AOM_ICDF(32743), AOM_ICDF(32756), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(25336), AOM_ICDF(31078), AOM_ICDF(32379), AOM_ICDF(32674),
+ AOM_ICDF(32741), AOM_ICDF(32756), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(25080), AOM_ICDF(30960), AOM_ICDF(32338), AOM_ICDF(32661),
+ AOM_ICDF(32737), AOM_ICDF(32755), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(24824), AOM_ICDF(30838), AOM_ICDF(32295), AOM_ICDF(32648),
+ AOM_ICDF(32733), AOM_ICDF(32754), AOM_ICDF(32759), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(24568), AOM_ICDF(30712), AOM_ICDF(32248), AOM_ICDF(32632),
+ AOM_ICDF(32728), AOM_ICDF(32752), AOM_ICDF(32758), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(24313), AOM_ICDF(30583), AOM_ICDF(32199), AOM_ICDF(32616),
+ AOM_ICDF(32723), AOM_ICDF(32751), AOM_ICDF(32758), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(24057), AOM_ICDF(30449), AOM_ICDF(32147), AOM_ICDF(32598),
+ AOM_ICDF(32718), AOM_ICDF(32750), AOM_ICDF(32758), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(23801), AOM_ICDF(30311), AOM_ICDF(32091), AOM_ICDF(32578),
+ AOM_ICDF(32711), AOM_ICDF(32747), AOM_ICDF(32757), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(23546), AOM_ICDF(30170), AOM_ICDF(32033), AOM_ICDF(32557),
+ AOM_ICDF(32704), AOM_ICDF(32745), AOM_ICDF(32757), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(23288), AOM_ICDF(30022), AOM_ICDF(31969), AOM_ICDF(32532),
+ AOM_ICDF(32695), AOM_ICDF(32742), AOM_ICDF(32756), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(23033), AOM_ICDF(29873), AOM_ICDF(31904), AOM_ICDF(32507),
+ AOM_ICDF(32686), AOM_ICDF(32739), AOM_ICDF(32755), AOM_ICDF(32760),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(22778), AOM_ICDF(29720), AOM_ICDF(31835), AOM_ICDF(32479),
+ AOM_ICDF(32675), AOM_ICDF(32735), AOM_ICDF(32753), AOM_ICDF(32759),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(22521), AOM_ICDF(29561), AOM_ICDF(31761), AOM_ICDF(32449),
+ AOM_ICDF(32664), AOM_ICDF(32731), AOM_ICDF(32752), AOM_ICDF(32759),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(22267), AOM_ICDF(29401), AOM_ICDF(31686), AOM_ICDF(32418),
+ AOM_ICDF(32652), AOM_ICDF(32727), AOM_ICDF(32751), AOM_ICDF(32759),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(22011), AOM_ICDF(29235), AOM_ICDF(31605), AOM_ICDF(32383),
+ AOM_ICDF(32638), AOM_ICDF(32722), AOM_ICDF(32749), AOM_ICDF(32758),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(21754), AOM_ICDF(29064), AOM_ICDF(31520), AOM_ICDF(32345),
+ AOM_ICDF(32622), AOM_ICDF(32715), AOM_ICDF(32746), AOM_ICDF(32757),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(21501), AOM_ICDF(28893), AOM_ICDF(31434), AOM_ICDF(32307),
+ AOM_ICDF(32607), AOM_ICDF(32710), AOM_ICDF(32745), AOM_ICDF(32757),
+ AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(21243), AOM_ICDF(28713), AOM_ICDF(31339), AOM_ICDF(32262),
+ AOM_ICDF(32587), AOM_ICDF(32701), AOM_ICDF(32741), AOM_ICDF(32755),
+ AOM_ICDF(32760), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(20988), AOM_ICDF(28532), AOM_ICDF(31243), AOM_ICDF(32217),
+ AOM_ICDF(32567), AOM_ICDF(32693), AOM_ICDF(32738), AOM_ICDF(32754),
+ AOM_ICDF(32760), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(20730), AOM_ICDF(28344), AOM_ICDF(31140), AOM_ICDF(32167),
+ AOM_ICDF(32544), AOM_ICDF(32682), AOM_ICDF(32733), AOM_ICDF(32752),
+ AOM_ICDF(32759), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(20476), AOM_ICDF(28156), AOM_ICDF(31036), AOM_ICDF(32116),
+ AOM_ICDF(32521), AOM_ICDF(32673), AOM_ICDF(32730), AOM_ICDF(32751),
+ AOM_ICDF(32759), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(20220), AOM_ICDF(27962), AOM_ICDF(30926), AOM_ICDF(32061),
+ AOM_ICDF(32495), AOM_ICDF(32661), AOM_ICDF(32725), AOM_ICDF(32749),
+ AOM_ICDF(32758), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(19963), AOM_ICDF(27763), AOM_ICDF(30810), AOM_ICDF(32000),
+ AOM_ICDF(32465), AOM_ICDF(32647), AOM_ICDF(32718), AOM_ICDF(32746),
+ AOM_ICDF(32757), AOM_ICDF(32761), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(19708), AOM_ICDF(27562), AOM_ICDF(30691), AOM_ICDF(31938),
+ AOM_ICDF(32435), AOM_ICDF(32633), AOM_ICDF(32712), AOM_ICDF(32743),
+ AOM_ICDF(32756), AOM_ICDF(32761), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(19454), AOM_ICDF(27358), AOM_ICDF(30569), AOM_ICDF(31873),
+ AOM_ICDF(32403), AOM_ICDF(32618), AOM_ICDF(32705), AOM_ICDF(32741),
+ AOM_ICDF(32755), AOM_ICDF(32761), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(19196), AOM_ICDF(27146), AOM_ICDF(30438), AOM_ICDF(31801),
+ AOM_ICDF(32365), AOM_ICDF(32599), AOM_ICDF(32696), AOM_ICDF(32736),
+ AOM_ICDF(32753), AOM_ICDF(32760), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(18942), AOM_ICDF(26934), AOM_ICDF(30306), AOM_ICDF(31728),
+ AOM_ICDF(32328), AOM_ICDF(32581), AOM_ICDF(32688), AOM_ICDF(32733),
+ AOM_ICDF(32752), AOM_ICDF(32760), AOM_ICDF(32763), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(18684), AOM_ICDF(26714), AOM_ICDF(30164), AOM_ICDF(31647),
+ AOM_ICDF(32284), AOM_ICDF(32558), AOM_ICDF(32676), AOM_ICDF(32727),
+ AOM_ICDF(32749), AOM_ICDF(32758), AOM_ICDF(32762), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(18429), AOM_ICDF(26493), AOM_ICDF(30021), AOM_ICDF(31565),
+ AOM_ICDF(32240), AOM_ICDF(32535), AOM_ICDF(32664), AOM_ICDF(32721),
+ AOM_ICDF(32746), AOM_ICDF(32757), AOM_ICDF(32762), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(18174), AOM_ICDF(26268), AOM_ICDF(29872), AOM_ICDF(31477),
+ AOM_ICDF(32192), AOM_ICDF(32510), AOM_ICDF(32652), AOM_ICDF(32715),
+ AOM_ICDF(32743), AOM_ICDF(32756), AOM_ICDF(32762), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(17920), AOM_ICDF(26040), AOM_ICDF(29719), AOM_ICDF(31386),
+ AOM_ICDF(32141), AOM_ICDF(32483), AOM_ICDF(32638), AOM_ICDF(32708),
+ AOM_ICDF(32740), AOM_ICDF(32754), AOM_ICDF(32761), AOM_ICDF(32764),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(17661), AOM_ICDF(25803), AOM_ICDF(29556), AOM_ICDF(31286),
+ AOM_ICDF(32083), AOM_ICDF(32451), AOM_ICDF(32620), AOM_ICDF(32698),
+ AOM_ICDF(32734), AOM_ICDF(32751), AOM_ICDF(32759), AOM_ICDF(32763),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(17406), AOM_ICDF(25566), AOM_ICDF(29391), AOM_ICDF(31184),
+ AOM_ICDF(32024), AOM_ICDF(32418), AOM_ICDF(32603), AOM_ICDF(32690),
+ AOM_ICDF(32731), AOM_ICDF(32750), AOM_ICDF(32759), AOM_ICDF(32763),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(17151), AOM_ICDF(25325), AOM_ICDF(29220), AOM_ICDF(31076),
+ AOM_ICDF(31961), AOM_ICDF(32383), AOM_ICDF(32584), AOM_ICDF(32680),
+ AOM_ICDF(32726), AOM_ICDF(32748), AOM_ICDF(32758), AOM_ICDF(32763),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(16896), AOM_ICDF(25080), AOM_ICDF(29044), AOM_ICDF(30964),
+ AOM_ICDF(31894), AOM_ICDF(32344), AOM_ICDF(32562), AOM_ICDF(32668),
+ AOM_ICDF(32719), AOM_ICDF(32744), AOM_ICDF(32756), AOM_ICDF(32762),
+ AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(16639), AOM_ICDF(24829), AOM_ICDF(28860), AOM_ICDF(30844),
+ AOM_ICDF(31821), AOM_ICDF(32302), AOM_ICDF(32539), AOM_ICDF(32655),
+ AOM_ICDF(32712), AOM_ICDF(32740), AOM_ICDF(32754), AOM_ICDF(32761),
+ AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(30720),
+ AOM_ICDF(31744), AOM_ICDF(32256), AOM_ICDF(32512), AOM_ICDF(32640),
+ AOM_ICDF(32704), AOM_ICDF(32736), AOM_ICDF(32752), AOM_ICDF(32760),
+ AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(16130), AOM_ICDF(24320), AOM_ICDF(28479), AOM_ICDF(30591),
+ AOM_ICDF(31663), AOM_ICDF(32208), AOM_ICDF(32485), AOM_ICDF(32625),
+ AOM_ICDF(32696), AOM_ICDF(32732), AOM_ICDF(32750), AOM_ICDF(32759),
+ AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
+ {AOM_ICDF(15872), AOM_ICDF(24056), AOM_ICDF(28276), AOM_ICDF(30452),
+ AOM_ICDF(31574), AOM_ICDF(32152), AOM_ICDF(32450), AOM_ICDF(32604),
+ AOM_ICDF(32683), AOM_ICDF(32724), AOM_ICDF(32745), AOM_ICDF(32756),
+ AOM_ICDF(32762), AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32768)},
+ {AOM_ICDF(15615), AOM_ICDF(23789), AOM_ICDF(28068), AOM_ICDF(30308),
+ AOM_ICDF(31480), AOM_ICDF(32094), AOM_ICDF(32415), AOM_ICDF(32583),
+ AOM_ICDF(32671), AOM_ICDF(32717), AOM_ICDF(32741), AOM_ICDF(32754),
+ AOM_ICDF(32761), AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32768)},
+ {AOM_ICDF(15361), AOM_ICDF(23521), AOM_ICDF(27856), AOM_ICDF(30159),
+ AOM_ICDF(31382), AOM_ICDF(32032), AOM_ICDF(32377), AOM_ICDF(32560),
+ AOM_ICDF(32657), AOM_ICDF(32709), AOM_ICDF(32737), AOM_ICDF(32752),
+ AOM_ICDF(32760), AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32768)},
+ {AOM_ICDF(15103), AOM_ICDF(23245), AOM_ICDF(27634), AOM_ICDF(30000),
+ AOM_ICDF(31275), AOM_ICDF(31963), AOM_ICDF(32334), AOM_ICDF(32534),
+ AOM_ICDF(32642), AOM_ICDF(32700), AOM_ICDF(32731), AOM_ICDF(32748),
+ AOM_ICDF(32757), AOM_ICDF(32762), AOM_ICDF(32765), AOM_ICDF(32768)},
+ {AOM_ICDF(14848), AOM_ICDF(22968), AOM_ICDF(27409), AOM_ICDF(29837),
+ AOM_ICDF(31165), AOM_ICDF(31891), AOM_ICDF(32288), AOM_ICDF(32505),
+ AOM_ICDF(32624), AOM_ICDF(32689), AOM_ICDF(32725), AOM_ICDF(32744),
+ AOM_ICDF(32755), AOM_ICDF(32761), AOM_ICDF(32764), AOM_ICDF(32768)},
+ {AOM_ICDF(14592), AOM_ICDF(22686), AOM_ICDF(27176), AOM_ICDF(29666),
+ AOM_ICDF(31047), AOM_ICDF(31813), AOM_ICDF(32238), AOM_ICDF(32474),
+ AOM_ICDF(32605), AOM_ICDF(32678), AOM_ICDF(32718), AOM_ICDF(32740),
+ AOM_ICDF(32752), AOM_ICDF(32759), AOM_ICDF(32763), AOM_ICDF(32768)},
+ {AOM_ICDF(14336), AOM_ICDF(22400), AOM_ICDF(26936), AOM_ICDF(29488),
+ AOM_ICDF(30923), AOM_ICDF(31730), AOM_ICDF(32184), AOM_ICDF(32439),
+ AOM_ICDF(32583), AOM_ICDF(32664), AOM_ICDF(32709), AOM_ICDF(32735),
+ AOM_ICDF(32749), AOM_ICDF(32757), AOM_ICDF(32762), AOM_ICDF(32768)},
+ {AOM_ICDF(14079), AOM_ICDF(22109), AOM_ICDF(26689), AOM_ICDF(29301),
+ AOM_ICDF(30791), AOM_ICDF(31641), AOM_ICDF(32125), AOM_ICDF(32401),
+ AOM_ICDF(32559), AOM_ICDF(32649), AOM_ICDF(32700), AOM_ICDF(32729),
+ AOM_ICDF(32746), AOM_ICDF(32756), AOM_ICDF(32761), AOM_ICDF(32768)},
+ {AOM_ICDF(13825), AOM_ICDF(21817), AOM_ICDF(26437), AOM_ICDF(29108),
+ AOM_ICDF(30652), AOM_ICDF(31545), AOM_ICDF(32061), AOM_ICDF(32359),
+ AOM_ICDF(32532), AOM_ICDF(32632), AOM_ICDF(32690), AOM_ICDF(32723),
+ AOM_ICDF(32742), AOM_ICDF(32753), AOM_ICDF(32759), AOM_ICDF(32768)},
+ {AOM_ICDF(13568), AOM_ICDF(21518), AOM_ICDF(26176), AOM_ICDF(28905),
+ AOM_ICDF(30504), AOM_ICDF(31441), AOM_ICDF(31990), AOM_ICDF(32312),
+ AOM_ICDF(32501), AOM_ICDF(32611), AOM_ICDF(32676), AOM_ICDF(32714),
+ AOM_ICDF(32736), AOM_ICDF(32749), AOM_ICDF(32757), AOM_ICDF(32768)},
+ {AOM_ICDF(13314), AOM_ICDF(21218), AOM_ICDF(25911), AOM_ICDF(28697),
+ AOM_ICDF(30351), AOM_ICDF(31333), AOM_ICDF(31916), AOM_ICDF(32262),
+ AOM_ICDF(32468), AOM_ICDF(32590), AOM_ICDF(32662), AOM_ICDF(32705),
+ AOM_ICDF(32731), AOM_ICDF(32746), AOM_ICDF(32755), AOM_ICDF(32768)},
+ {AOM_ICDF(13054), AOM_ICDF(20908), AOM_ICDF(25633), AOM_ICDF(28475),
+ AOM_ICDF(30185), AOM_ICDF(31214), AOM_ICDF(31833), AOM_ICDF(32205),
+ AOM_ICDF(32429), AOM_ICDF(32564), AOM_ICDF(32645), AOM_ICDF(32694),
+ AOM_ICDF(32723), AOM_ICDF(32741), AOM_ICDF(32752), AOM_ICDF(32768)},
+ {AOM_ICDF(12803), AOM_ICDF(20603), AOM_ICDF(25356), AOM_ICDF(28252),
+ AOM_ICDF(30017), AOM_ICDF(31093), AOM_ICDF(31748), AOM_ICDF(32147),
+ AOM_ICDF(32390), AOM_ICDF(32538), AOM_ICDF(32628), AOM_ICDF(32683),
+ AOM_ICDF(32717), AOM_ICDF(32737), AOM_ICDF(32749), AOM_ICDF(32768)},
+ {AOM_ICDF(12544), AOM_ICDF(20286), AOM_ICDF(25064), AOM_ICDF(28013),
+ AOM_ICDF(29833), AOM_ICDF(30956), AOM_ICDF(31649), AOM_ICDF(32077),
+ AOM_ICDF(32341), AOM_ICDF(32504), AOM_ICDF(32605), AOM_ICDF(32667),
+ AOM_ICDF(32705), AOM_ICDF(32729), AOM_ICDF(32744), AOM_ICDF(32768)},
+ {AOM_ICDF(12288), AOM_ICDF(19968), AOM_ICDF(24768), AOM_ICDF(27768),
+ AOM_ICDF(29643), AOM_ICDF(30815), AOM_ICDF(31547), AOM_ICDF(32005),
+ AOM_ICDF(32291), AOM_ICDF(32470), AOM_ICDF(32582), AOM_ICDF(32652),
+ AOM_ICDF(32696), AOM_ICDF(32723), AOM_ICDF(32740), AOM_ICDF(32768)},
+ {AOM_ICDF(12033), AOM_ICDF(19647), AOM_ICDF(24465), AOM_ICDF(27514),
+ AOM_ICDF(29443), AOM_ICDF(30664), AOM_ICDF(31437), AOM_ICDF(31926),
+ AOM_ICDF(32235), AOM_ICDF(32431), AOM_ICDF(32555), AOM_ICDF(32633),
+ AOM_ICDF(32683), AOM_ICDF(32714), AOM_ICDF(32734), AOM_ICDF(32768)},
+ {AOM_ICDF(11777), AOM_ICDF(19321), AOM_ICDF(24154), AOM_ICDF(27250),
+ AOM_ICDF(29233), AOM_ICDF(30504), AOM_ICDF(31318), AOM_ICDF(31839),
+ AOM_ICDF(32173), AOM_ICDF(32387), AOM_ICDF(32524), AOM_ICDF(32612),
+ AOM_ICDF(32668), AOM_ICDF(32704), AOM_ICDF(32727), AOM_ICDF(32768)},
+ {AOM_ICDF(11521), AOM_ICDF(18991), AOM_ICDF(23835), AOM_ICDF(26976),
+ AOM_ICDF(29013), AOM_ICDF(30334), AOM_ICDF(31190), AOM_ICDF(31745),
+ AOM_ICDF(32105), AOM_ICDF(32338), AOM_ICDF(32489), AOM_ICDF(32587),
+ AOM_ICDF(32651), AOM_ICDF(32692), AOM_ICDF(32719), AOM_ICDF(32768)},
+ {AOM_ICDF(11265), AOM_ICDF(18657), AOM_ICDF(23508), AOM_ICDF(26691),
+ AOM_ICDF(28780), AOM_ICDF(30151), AOM_ICDF(31051), AOM_ICDF(31641),
+ AOM_ICDF(32028), AOM_ICDF(32282), AOM_ICDF(32449), AOM_ICDF(32559),
+ AOM_ICDF(32631), AOM_ICDF(32678), AOM_ICDF(32709), AOM_ICDF(32768)},
+ {AOM_ICDF(11006), AOM_ICDF(18316), AOM_ICDF(23170), AOM_ICDF(26394),
+ AOM_ICDF(28535), AOM_ICDF(29957), AOM_ICDF(30901), AOM_ICDF(31528),
+ AOM_ICDF(31944), AOM_ICDF(32220), AOM_ICDF(32404), AOM_ICDF(32526),
+ AOM_ICDF(32607), AOM_ICDF(32661), AOM_ICDF(32697), AOM_ICDF(32768)},
+ {AOM_ICDF(10752), AOM_ICDF(17976), AOM_ICDF(22830), AOM_ICDF(26091),
+ AOM_ICDF(28282), AOM_ICDF(29754), AOM_ICDF(30743), AOM_ICDF(31408),
+ AOM_ICDF(31854), AOM_ICDF(32154), AOM_ICDF(32356), AOM_ICDF(32491),
+ AOM_ICDF(32582), AOM_ICDF(32643), AOM_ICDF(32684), AOM_ICDF(32768)},
+ {AOM_ICDF(10496), AOM_ICDF(17630), AOM_ICDF(22479), AOM_ICDF(25775),
+ AOM_ICDF(28015), AOM_ICDF(29538), AOM_ICDF(30573), AOM_ICDF(31276),
+ AOM_ICDF(31754), AOM_ICDF(32079), AOM_ICDF(32300), AOM_ICDF(32450),
+ AOM_ICDF(32552), AOM_ICDF(32621), AOM_ICDF(32668), AOM_ICDF(32768)},
+ {AOM_ICDF(10240), AOM_ICDF(17280), AOM_ICDF(22120), AOM_ICDF(25448),
+ AOM_ICDF(27736), AOM_ICDF(29309), AOM_ICDF(30390), AOM_ICDF(31133),
+ AOM_ICDF(31644), AOM_ICDF(31995), AOM_ICDF(32237), AOM_ICDF(32403),
+ AOM_ICDF(32517), AOM_ICDF(32595), AOM_ICDF(32649), AOM_ICDF(32768)},
+ { AOM_ICDF(9984), AOM_ICDF(16926), AOM_ICDF(21753), AOM_ICDF(25109),
+ AOM_ICDF(27443), AOM_ICDF(29066), AOM_ICDF(30194), AOM_ICDF(30978),
+ AOM_ICDF(31523), AOM_ICDF(31902), AOM_ICDF(32166), AOM_ICDF(32349),
+ AOM_ICDF(32476), AOM_ICDF(32565), AOM_ICDF(32627), AOM_ICDF(32768)},
+ { AOM_ICDF(9728), AOM_ICDF(16568), AOM_ICDF(21377), AOM_ICDF(24759),
+ AOM_ICDF(27137), AOM_ICDF(28809), AOM_ICDF(29984), AOM_ICDF(30811),
+ AOM_ICDF(31392), AOM_ICDF(31801), AOM_ICDF(32088), AOM_ICDF(32290),
+ AOM_ICDF(32432), AOM_ICDF(32532), AOM_ICDF(32602), AOM_ICDF(32768)},
+ { AOM_ICDF(9474), AOM_ICDF(16208), AOM_ICDF(20995), AOM_ICDF(24399),
+ AOM_ICDF(26819), AOM_ICDF(28539), AOM_ICDF(29762), AOM_ICDF(30631),
+ AOM_ICDF(31249), AOM_ICDF(31688), AOM_ICDF(32000), AOM_ICDF(32222),
+ AOM_ICDF(32380), AOM_ICDF(32492), AOM_ICDF(32572), AOM_ICDF(32768)},
+ { AOM_ICDF(9216), AOM_ICDF(15840), AOM_ICDF(20601), AOM_ICDF(24023),
+ AOM_ICDF(26483), AOM_ICDF(28251), AOM_ICDF(29522), AOM_ICDF(30435),
+ AOM_ICDF(31091), AOM_ICDF(31563), AOM_ICDF(31902), AOM_ICDF(32146),
+ AOM_ICDF(32321), AOM_ICDF(32447), AOM_ICDF(32537), AOM_ICDF(32768)},
+ { AOM_ICDF(8959), AOM_ICDF(15469), AOM_ICDF(20199), AOM_ICDF(23636),
+ AOM_ICDF(26133), AOM_ICDF(27947), AOM_ICDF(29265), AOM_ICDF(30223),
+ AOM_ICDF(30919), AOM_ICDF(31425), AOM_ICDF(31792), AOM_ICDF(32059),
+ AOM_ICDF(32253), AOM_ICDF(32394), AOM_ICDF(32496), AOM_ICDF(32768)},
+ { AOM_ICDF(8705), AOM_ICDF(15097), AOM_ICDF(19791), AOM_ICDF(23238),
+ AOM_ICDF(25770), AOM_ICDF(27629), AOM_ICDF(28994), AOM_ICDF(29997),
+ AOM_ICDF(30733), AOM_ICDF(31274), AOM_ICDF(31671), AOM_ICDF(31963),
+ AOM_ICDF(32177), AOM_ICDF(32334), AOM_ICDF(32449), AOM_ICDF(32768)},
+ { AOM_ICDF(8449), AOM_ICDF(14719), AOM_ICDF(19373), AOM_ICDF(22827),
+ AOM_ICDF(25390), AOM_ICDF(27292), AOM_ICDF(28704), AOM_ICDF(29752),
+ AOM_ICDF(30530), AOM_ICDF(31107), AOM_ICDF(31535), AOM_ICDF(31853),
+ AOM_ICDF(32089), AOM_ICDF(32264), AOM_ICDF(32394), AOM_ICDF(32768)},
+ { AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(18944), AOM_ICDF(22400),
+ AOM_ICDF(24992), AOM_ICDF(26936), AOM_ICDF(28394), AOM_ICDF(29488),
+ AOM_ICDF(30308), AOM_ICDF(30923), AOM_ICDF(31384), AOM_ICDF(31730),
+ AOM_ICDF(31989), AOM_ICDF(32184), AOM_ICDF(32330), AOM_ICDF(32768)},
+ { AOM_ICDF(7936), AOM_ICDF(13950), AOM_ICDF(18507), AOM_ICDF(21961),
+ AOM_ICDF(24578), AOM_ICDF(26561), AOM_ICDF(28064), AOM_ICDF(29203),
+ AOM_ICDF(30066), AOM_ICDF(30720), AOM_ICDF(31216), AOM_ICDF(31592),
+ AOM_ICDF(31877), AOM_ICDF(32093), AOM_ICDF(32256), AOM_ICDF(32768)},
+ { AOM_ICDF(7678), AOM_ICDF(13558), AOM_ICDF(18060), AOM_ICDF(21507),
+ AOM_ICDF(24146), AOM_ICDF(26166), AOM_ICDF(27713), AOM_ICDF(28897),
+ AOM_ICDF(29804), AOM_ICDF(30498), AOM_ICDF(31030), AOM_ICDF(31437),
+ AOM_ICDF(31749), AOM_ICDF(31988), AOM_ICDF(32171), AOM_ICDF(32768)},
+ { AOM_ICDF(7423), AOM_ICDF(13165), AOM_ICDF(17606), AOM_ICDF(21041),
+ AOM_ICDF(23698), AOM_ICDF(25753), AOM_ICDF(27342), AOM_ICDF(28571),
+ AOM_ICDF(29522), AOM_ICDF(30257), AOM_ICDF(30826), AOM_ICDF(31266),
+ AOM_ICDF(31606), AOM_ICDF(31869), AOM_ICDF(32073), AOM_ICDF(32768)},
+ { AOM_ICDF(7168), AOM_ICDF(12768), AOM_ICDF(17143), AOM_ICDF(20561),
+ AOM_ICDF(23231), AOM_ICDF(25317), AOM_ICDF(26947), AOM_ICDF(28220),
+ AOM_ICDF(29215), AOM_ICDF(29992), AOM_ICDF(30599), AOM_ICDF(31073),
+ AOM_ICDF(31444), AOM_ICDF(31734), AOM_ICDF(31960), AOM_ICDF(32768)},
+ { AOM_ICDF(6911), AOM_ICDF(12365), AOM_ICDF(16669), AOM_ICDF(20065),
+ AOM_ICDF(22744), AOM_ICDF(24858), AOM_ICDF(26526), AOM_ICDF(27842),
+ AOM_ICDF(28881), AOM_ICDF(29701), AOM_ICDF(30348), AOM_ICDF(30858),
+ AOM_ICDF(31261), AOM_ICDF(31579), AOM_ICDF(31830), AOM_ICDF(32768)},
+ { AOM_ICDF(6657), AOM_ICDF(11961), AOM_ICDF(16188), AOM_ICDF(19556),
+ AOM_ICDF(22240), AOM_ICDF(24379), AOM_ICDF(26083), AOM_ICDF(27441),
+ AOM_ICDF(28523), AOM_ICDF(29385), AOM_ICDF(30072), AOM_ICDF(30620),
+ AOM_ICDF(31056), AOM_ICDF(31404), AOM_ICDF(31681), AOM_ICDF(32768)},
+ { AOM_ICDF(6400), AOM_ICDF(11550), AOM_ICDF(15694), AOM_ICDF(19029),
+ AOM_ICDF(21712), AOM_ICDF(23871), AOM_ICDF(25609), AOM_ICDF(27007),
+ AOM_ICDF(28132), AOM_ICDF(29037), AOM_ICDF(29766), AOM_ICDF(30352),
+ AOM_ICDF(30824), AOM_ICDF(31204), AOM_ICDF(31509), AOM_ICDF(32768)},
+ { AOM_ICDF(6142), AOM_ICDF(11134), AOM_ICDF(15190), AOM_ICDF(18486),
+ AOM_ICDF(21164), AOM_ICDF(23340), AOM_ICDF(25108), AOM_ICDF(26544),
+ AOM_ICDF(27711), AOM_ICDF(28659), AOM_ICDF(29429), AOM_ICDF(30055),
+ AOM_ICDF(30564), AOM_ICDF(30977), AOM_ICDF(31313), AOM_ICDF(32768)},
+ { AOM_ICDF(5890), AOM_ICDF(10720), AOM_ICDF(14682), AOM_ICDF(17932),
+ AOM_ICDF(20598), AOM_ICDF(22785), AOM_ICDF(24579), AOM_ICDF(26051),
+ AOM_ICDF(27258), AOM_ICDF(28248), AOM_ICDF(29060), AOM_ICDF(29726),
+ AOM_ICDF(30273), AOM_ICDF(30721), AOM_ICDF(31089), AOM_ICDF(32768)},
+ { AOM_ICDF(5631), AOM_ICDF(10295), AOM_ICDF(14157), AOM_ICDF(17356),
+ AOM_ICDF(20005), AOM_ICDF(22199), AOM_ICDF(24016), AOM_ICDF(25520),
+ AOM_ICDF(26766), AOM_ICDF(27798), AOM_ICDF(28652), AOM_ICDF(29359),
+ AOM_ICDF(29945), AOM_ICDF(30430), AOM_ICDF(30832), AOM_ICDF(32768)},
+ { AOM_ICDF(5377), AOM_ICDF(9871), AOM_ICDF(13628), AOM_ICDF(16768),
+ AOM_ICDF(19393), AOM_ICDF(21587), AOM_ICDF(23421), AOM_ICDF(24954),
+ AOM_ICDF(26236), AOM_ICDF(27308), AOM_ICDF(28204), AOM_ICDF(28953),
+ AOM_ICDF(29579), AOM_ICDF(30102), AOM_ICDF(30539), AOM_ICDF(32768)},
+ { AOM_ICDF(5121), AOM_ICDF(9441), AOM_ICDF(13086), AOM_ICDF(16161),
+ AOM_ICDF(18756), AOM_ICDF(20945), AOM_ICDF(22792), AOM_ICDF(24351),
+ AOM_ICDF(25666), AOM_ICDF(26776), AOM_ICDF(27712), AOM_ICDF(28502),
+ AOM_ICDF(29169), AOM_ICDF(29731), AOM_ICDF(30206), AOM_ICDF(32768)},
+ { AOM_ICDF(4865), AOM_ICDF(9007), AOM_ICDF(12534), AOM_ICDF(15538),
+ AOM_ICDF(18096), AOM_ICDF(20274), AOM_ICDF(22129), AOM_ICDF(23708),
+ AOM_ICDF(25053), AOM_ICDF(26198), AOM_ICDF(27173), AOM_ICDF(28004),
+ AOM_ICDF(28711), AOM_ICDF(29313), AOM_ICDF(29826), AOM_ICDF(32768)},
+ { AOM_ICDF(4608), AOM_ICDF(8568), AOM_ICDF(11971), AOM_ICDF(14896),
+ AOM_ICDF(17409), AOM_ICDF(19569), AOM_ICDF(21425), AOM_ICDF(23020),
+ AOM_ICDF(24391), AOM_ICDF(25569), AOM_ICDF(26581), AOM_ICDF(27451),
+ AOM_ICDF(28199), AOM_ICDF(28842), AOM_ICDF(29394), AOM_ICDF(32768)},
+ { AOM_ICDF(4351), AOM_ICDF(8125), AOM_ICDF(11398), AOM_ICDF(14236),
+ AOM_ICDF(16697), AOM_ICDF(18831), AOM_ICDF(20682), AOM_ICDF(22287),
+ AOM_ICDF(23679), AOM_ICDF(24886), AOM_ICDF(25933), AOM_ICDF(26841),
+ AOM_ICDF(27628), AOM_ICDF(28311), AOM_ICDF(28903), AOM_ICDF(32768)},
+ { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560),
+ AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(21508),
+ AOM_ICDF(22915), AOM_ICDF(24146), AOM_ICDF(25224), AOM_ICDF(26167),
+ AOM_ICDF(26992), AOM_ICDF(27714), AOM_ICDF(28346), AOM_ICDF(32768)},
+ { AOM_ICDF(3840), AOM_ICDF(7230), AOM_ICDF(10223), AOM_ICDF(12865),
+ AOM_ICDF(15197), AOM_ICDF(17256), AOM_ICDF(19074), AOM_ICDF(20679),
+ AOM_ICDF(22096), AOM_ICDF(23347), AOM_ICDF(24451), AOM_ICDF(25426),
+ AOM_ICDF(26287), AOM_ICDF(27047), AOM_ICDF(27718), AOM_ICDF(32768)},
+ { AOM_ICDF(3584), AOM_ICDF(6776), AOM_ICDF(9619), AOM_ICDF(12151),
+ AOM_ICDF(14406), AOM_ICDF(16414), AOM_ICDF(18203), AOM_ICDF(19796),
+ AOM_ICDF(21215), AOM_ICDF(22479), AOM_ICDF(23604), AOM_ICDF(24606),
+ AOM_ICDF(25499), AOM_ICDF(26294), AOM_ICDF(27002), AOM_ICDF(32768)},
+ { AOM_ICDF(3328), AOM_ICDF(6318), AOM_ICDF(9004), AOM_ICDF(11417),
+ AOM_ICDF(13585), AOM_ICDF(15533), AOM_ICDF(17283), AOM_ICDF(18856),
+ AOM_ICDF(20269), AOM_ICDF(21538), AOM_ICDF(22678), AOM_ICDF(23703),
+ AOM_ICDF(24624), AOM_ICDF(25451), AOM_ICDF(26194), AOM_ICDF(32768)},
+ { AOM_ICDF(3072), AOM_ICDF(5856), AOM_ICDF(8379), AOM_ICDF(10665),
+ AOM_ICDF(12737), AOM_ICDF(14615), AOM_ICDF(16317), AOM_ICDF(17859),
+ AOM_ICDF(19257), AOM_ICDF(20524), AOM_ICDF(21672), AOM_ICDF(22712),
+ AOM_ICDF(23655), AOM_ICDF(24509), AOM_ICDF(25283), AOM_ICDF(32768)},
+ { AOM_ICDF(2816), AOM_ICDF(5390), AOM_ICDF(7743), AOM_ICDF(9894),
+ AOM_ICDF(11860), AOM_ICDF(13657), AOM_ICDF(15299), AOM_ICDF(16800),
+ AOM_ICDF(18172), AOM_ICDF(19426), AOM_ICDF(20573), AOM_ICDF(21621),
+ AOM_ICDF(22579), AOM_ICDF(23455), AOM_ICDF(24255), AOM_ICDF(32768)},
+ { AOM_ICDF(2560), AOM_ICDF(4920), AOM_ICDF(7096), AOM_ICDF(9102),
+ AOM_ICDF(10951), AOM_ICDF(12656), AOM_ICDF(14227), AOM_ICDF(15676),
+ AOM_ICDF(17011), AOM_ICDF(18242), AOM_ICDF(19377), AOM_ICDF(20423),
+ AOM_ICDF(21388), AOM_ICDF(22277), AOM_ICDF(23097), AOM_ICDF(32768)},
+ { AOM_ICDF(2304), AOM_ICDF(4446), AOM_ICDF(6437), AOM_ICDF(8288),
+ AOM_ICDF(10009), AOM_ICDF(11609), AOM_ICDF(13097), AOM_ICDF(14480),
+ AOM_ICDF(15766), AOM_ICDF(16961), AOM_ICDF(18072), AOM_ICDF(19105),
+ AOM_ICDF(20066), AOM_ICDF(20959), AOM_ICDF(21789), AOM_ICDF(32768)},
+ { AOM_ICDF(2048), AOM_ICDF(3968), AOM_ICDF(5768), AOM_ICDF(7456),
+ AOM_ICDF(9038), AOM_ICDF(10521), AOM_ICDF(11911), AOM_ICDF(13215),
+ AOM_ICDF(14437), AOM_ICDF(15583), AOM_ICDF(16657), AOM_ICDF(17664),
+ AOM_ICDF(18608), AOM_ICDF(19493), AOM_ICDF(20323), AOM_ICDF(32768)},
+ { AOM_ICDF(1792), AOM_ICDF(3486), AOM_ICDF(5087), AOM_ICDF(6601),
+ AOM_ICDF(8032), AOM_ICDF(9385), AOM_ICDF(10664), AOM_ICDF(11873),
+ AOM_ICDF(13016), AOM_ICDF(14096), AOM_ICDF(15117), AOM_ICDF(16082),
+ AOM_ICDF(16995), AOM_ICDF(17858), AOM_ICDF(18673), AOM_ICDF(32768)},
+ { AOM_ICDF(1536), AOM_ICDF(3000), AOM_ICDF(4395), AOM_ICDF(5725),
+ AOM_ICDF(6993), AOM_ICDF(8201), AOM_ICDF(9353), AOM_ICDF(10451),
+ AOM_ICDF(11497), AOM_ICDF(12494), AOM_ICDF(13444), AOM_ICDF(14350),
+ AOM_ICDF(15213), AOM_ICDF(16036), AOM_ICDF(16820), AOM_ICDF(32768)},
+ { AOM_ICDF(1280), AOM_ICDF(2510), AOM_ICDF(3692), AOM_ICDF(4828),
+ AOM_ICDF(5919), AOM_ICDF(6968), AOM_ICDF(7976), AOM_ICDF(8944),
+ AOM_ICDF(9875), AOM_ICDF(10769), AOM_ICDF(11628), AOM_ICDF(12454),
+ AOM_ICDF(13248), AOM_ICDF(14011), AOM_ICDF(14744), AOM_ICDF(32768)},
+ { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(2977), AOM_ICDF(3908),
+ AOM_ICDF(4810), AOM_ICDF(5684), AOM_ICDF(6530), AOM_ICDF(7350),
+ AOM_ICDF(8144), AOM_ICDF(8913), AOM_ICDF(9658), AOM_ICDF(10380),
+ AOM_ICDF(11080), AOM_ICDF(11758), AOM_ICDF(12415), AOM_ICDF(32768)},
+ { AOM_ICDF(768), AOM_ICDF(1518), AOM_ICDF(2250), AOM_ICDF(2965),
+ AOM_ICDF(3663), AOM_ICDF(4345), AOM_ICDF(5011), AOM_ICDF(5662),
+ AOM_ICDF(6297), AOM_ICDF(6917), AOM_ICDF(7523), AOM_ICDF(8115),
+ AOM_ICDF(8693), AOM_ICDF(9257), AOM_ICDF(9808), AOM_ICDF(32768)},
+ { AOM_ICDF(512), AOM_ICDF(1016), AOM_ICDF(1512), AOM_ICDF(2000),
+ AOM_ICDF(2481), AOM_ICDF(2954), AOM_ICDF(3420), AOM_ICDF(3879),
+ AOM_ICDF(4330), AOM_ICDF(4774), AOM_ICDF(5211), AOM_ICDF(5642),
+ AOM_ICDF(6066), AOM_ICDF(6483), AOM_ICDF(6894), AOM_ICDF(32768)},
+ { AOM_ICDF(256), AOM_ICDF(510), AOM_ICDF(762), AOM_ICDF(1012),
+ AOM_ICDF(1260), AOM_ICDF(1506), AOM_ICDF(1750), AOM_ICDF(1992),
+ AOM_ICDF(2232), AOM_ICDF(2471), AOM_ICDF(2708), AOM_ICDF(2943),
+ AOM_ICDF(3176), AOM_ICDF(3407), AOM_ICDF(3636), AOM_ICDF(32768)},
+};
+
+
+const uint16_t LAPLACE_OFFSET[128] = {
+ 0,
+ 29871,
+ 28672,
+ 27751,
+ 26975,
+ 26291,
+ 25673,
+ 25105,
+ 24576,
+ 24079,
+ 23609,
+ 23162,
+ 22734,
+ 22325,
+ 21931,
+ 21550,
+ 21182,
+ 20826,
+ 20480,
+ 20143,
+ 19815,
+ 19495,
+ 19183,
+ 18877,
+ 18579,
+ 18286,
+ 17999,
+ 17718,
+ 17442,
+ 17170,
+ 16904,
+ 16642,
+ 16384,
+ 16129,
+ 15879,
+ 15633,
+ 15390,
+ 15150,
+ 14913,
+ 14680,
+ 14450,
+ 14222,
+ 13997,
+ 13775,
+ 13556,
+ 13338,
+ 13124,
+ 12911,
+ 12701,
+ 12493,
+ 12288,
+ 12084,
+ 11882,
+ 11682,
+ 11484,
+ 11288,
+ 11094,
+ 10901,
+ 10710,
+ 10521,
+ 10333,
+ 10147,
+ 9962,
+ 9779,
+ 9597,
+ 9417,
+ 9238,
+ 9060,
+ 8884,
+ 8709,
+ 8535,
+ 8363,
+ 8192,
+ 8021,
+ 7853,
+ 7685,
+ 7518,
+ 7352,
+ 7188,
+ 7025,
+ 6862,
+ 6701,
+ 6540,
+ 6381,
+ 6222,
+ 6065,
+ 5908,
+ 5753,
+ 5598,
+ 5444,
+ 5291,
+ 5138,
+ 4987,
+ 4837,
+ 4687,
+ 4538,
+ 4390,
+ 4242,
+ 4096,
+ 3950,
+ 3804,
+ 3660,
+ 3516,
+ 3373,
+ 3231,
+ 3089,
+ 2948,
+ 2808,
+ 2668,
+ 2529,
+ 2391,
+ 2253,
+ 2116,
+ 1979,
+ 1843,
+ 1708,
+ 1573,
+ 1439,
+ 1306,
+ 1172,
+ 1040,
+ 908,
+ 777,
+ 646,
+ 516,
+ 386,
+ 257,
+ 128,
+};
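+
+/* Reader's note (inferred from the data, not from the generator source): each
+ * row of EXP_CDF_TABLE appears to be the 16-symbol CDF of a truncated
+ * geometric distribution scaled to a 32768 total; the per-symbol ratio grows
+ * with the row index (row 0 puts nearly all mass on symbol 0, row 127 decays
+ * slowest). Row 64 matches ratio 1/2 exactly:
+ *   cdf[n] = 32768 * (1 - 2^-(n+1))  ->  16384, 24576, 28672, 30720, ...
+ * with the final entry of every row pinned to AOM_ICDF(32768). */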
diff --git a/third_party/aom/av1/common/mips/dspr2/av1_itrans16_dspr2.c b/third_party/aom/av1/common/mips/dspr2/av1_itrans16_dspr2.c
new file mode 100644
index 000000000..79f9338bd
--- /dev/null
+++ b/third_party/aom/av1/common/mips/dspr2/av1_itrans16_dspr2.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "av1/common/common.h"
+#include "av1/common/blockd.h"
+#include "av1/common/idct.h"
+#include "aom_dsp/mips/inv_txfm_dspr2.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+void av1_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, int pitch,
+ int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *outptr = out;
+ int16_t temp_out[16];
+ uint32_t pos = 45;
+
+ /* bit position for extracting from the accumulator */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ idct16_rows_dspr2(input, outptr, 16);
+ idct16_cols_add_blk_dspr2(out, dest, pitch);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ idct16_rows_dspr2(input, outptr, 16);
+
+ outptr = out;
+
+ for (i = 0; i < 16; ++i) {
+ iadst16_dspr2(outptr, temp_out);
+
+ for (j = 0; j < 16; ++j)
+ dest[j * pitch + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) +
+ dest[j * pitch + i]);
+ outptr += 16;
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ {
+ int16_t temp_in[16 * 16];
+
+ for (i = 0; i < 16; ++i) {
+ /* prefetch row */
+ prefetch_load((const uint8_t *)(input + 16));
+
+ iadst16_dspr2(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ for (i = 0; i < 16; ++i)
+ for (j = 0; j < 16; ++j) temp_in[j * 16 + i] = out[i * 16 + j];
+
+ idct16_cols_add_blk_dspr2(temp_in, dest, pitch);
+ } break;
+ case ADST_ADST: // ADST in both directions
+ {
+ int16_t temp_in[16];
+
+ for (i = 0; i < 16; ++i) {
+ /* prefetch row */
+ prefetch_load((const uint8_t *)(input + 16));
+
+ iadst16_dspr2(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ iadst16_dspr2(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ dest[j * pitch + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) +
+ dest[j * pitch + i]);
+ }
+ } break;
+ default: printf("av1_short_iht16x16_add_dspr2 : Invalid tx_type\n"); break;
+ }
+}
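+
+/* Note on the DCT_ADST case above: iadst16_dspr2 consumes and produces rows,
+ * so after the horizontal ADST pass the temp_in[j * 16 + i] = out[i * 16 + j]
+ * copy transposes the intermediate block, letting the column-oriented
+ * idct16_cols_add_blk_dspr2 routine add the final result into dest. */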
+#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/av1/common/mips/dspr2/av1_itrans4_dspr2.c b/third_party/aom/av1/common/mips/dspr2/av1_itrans4_dspr2.c
new file mode 100644
index 000000000..0a9552376
--- /dev/null
+++ b/third_party/aom/av1/common/mips/dspr2/av1_itrans4_dspr2.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "av1/common/common.h"
+#include "av1/common/blockd.h"
+#include "av1/common/idct.h"
+#include "aom_dsp/mips/inv_txfm_dspr2.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+void av1_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+ int16_t *outptr = out;
+ int16_t temp_in[4 * 4], temp_out[4];
+ uint32_t pos = 45;
+
+ /* bit position for extracting from the accumulator */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ aom_idct4_rows_dspr2(input, outptr);
+ aom_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ aom_idct4_rows_dspr2(input, outptr);
+
+ outptr = out;
+
+ for (i = 0; i < 4; ++i) {
+ iadst4_dspr2(outptr, temp_out);
+
+ for (j = 0; j < 4; ++j)
+ dest[j * dest_stride + i] = clip_pixel(
+ ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]);
+
+ outptr += 4;
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ for (i = 0; i < 4; ++i) {
+ iadst4_dspr2(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) {
+ temp_in[i * 4 + j] = out[j * 4 + i];
+ }
+ }
+ aom_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ break;
+ case ADST_ADST: // ADST in both directions
+ for (i = 0; i < 4; ++i) {
+ iadst4_dspr2(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
+ iadst4_dspr2(temp_in, temp_out);
+
+ for (j = 0; j < 4; ++j)
+ dest[j * dest_stride + i] = clip_pixel(
+ ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]);
+ }
+ break;
+ default: printf("av1_short_iht4x4_add_dspr2 : Invalid tx_type\n"); break;
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/av1/common/mips/dspr2/av1_itrans8_dspr2.c b/third_party/aom/av1/common/mips/dspr2/av1_itrans8_dspr2.c
new file mode 100644
index 000000000..8bf5b4f0e
--- /dev/null
+++ b/third_party/aom/av1/common/mips/dspr2/av1_itrans8_dspr2.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "av1/common/common.h"
+#include "av1/common/blockd.h"
+#include "aom_dsp/mips/inv_txfm_dspr2.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+void av1_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ int16_t temp_in[8 * 8], temp_out[8];
+ uint32_t pos = 45;
+
+ /* bit position for extracting from the accumulator */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ idct8_rows_dspr2(input, outptr, 8);
+ idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ idct8_rows_dspr2(input, outptr, 8);
+
+ for (i = 0; i < 8; ++i) {
+ iadst8_dspr2(&out[i * 8], temp_out);
+
+ for (j = 0; j < 8; ++j)
+ dest[j * dest_stride + i] = clip_pixel(
+ ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]);
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ for (i = 0; i < 8; ++i) {
+ iadst8_dspr2(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) {
+ temp_in[i * 8 + j] = out[j * 8 + i];
+ }
+ }
+ idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ break;
+ case ADST_ADST: // ADST in both directions
+ for (i = 0; i < 8; ++i) {
+ iadst8_dspr2(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+
+ iadst8_dspr2(temp_in, temp_out);
+
+ for (j = 0; j < 8; ++j)
+ dest[j * dest_stride + i] = clip_pixel(
+ ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]);
+ }
+ break;
+ default: printf("av1_short_iht8x8_add_dspr2 : Invalid tx_type\n"); break;
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/av1/common/mips/msa/av1_idct16x16_msa.c b/third_party/aom/av1/common/mips/msa/av1_idct16x16_msa.c
new file mode 100644
index 000000000..4bd0a1635
--- /dev/null
+++ b/third_party/aom/av1/common/mips/msa/av1_idct16x16_msa.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+#include "aom_dsp/mips/inv_txfm_msa.h"
+
+void av1_iht16x16_256_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride, int32_t tx_type) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *out_ptr = &out[0];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ aom_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ aom_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+ break;
+ case ADST_DCT:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ aom_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ aom_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+ (dst + (i << 3)), dst_stride);
+ }
+ break;
+ case DCT_ADST:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ aom_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ aom_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+ break;
+ case ADST_ADST:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ aom_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ aom_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+ (dst + (i << 3)), dst_stride);
+ }
+ break;
+ default: assert(0); break;
+ }
+}
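+
+/* Offset arithmetic in the passes above: the 16x16 block is handled as two
+ * 16x8 halves, so (i << 7) advances i * 128 coefficients (8 rows of 16),
+ * while each column pass covers 8 of the 16 columns, so (i << 3) advances
+ * i * 8 entries into the row-major intermediate and i * 8 pixels into dst. */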
diff --git a/third_party/aom/av1/common/mips/msa/av1_idct4x4_msa.c b/third_party/aom/av1/common/mips/msa/av1_idct4x4_msa.c
new file mode 100644
index 000000000..8364f8dc4
--- /dev/null
+++ b/third_party/aom/av1/common/mips/msa/av1_idct4x4_msa.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+#include "aom_dsp/mips/inv_txfm_msa.h"
+
+void av1_iht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride, int32_t tx_type) {
+ v8i16 in0, in1, in2, in3;
+
+ /* load vector elements of 4x4 block */
+ LD4x4_SH(input, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ /* DCT in horizontal */
+ AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* DCT in vertical */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_DCT:
+ /* DCT in horizontal */
+ AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* ADST in vertical */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ /* ADST in horizontal */
+ AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* DCT in vertical */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_ADST:
+ /* ADST in horizontal */
+ AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* ADST in vertical */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ default: assert(0); break;
+ }
+
+ /* final rounding (add 2^3, divide by 2^4) and shift */
+ SRARI_H4_SH(in0, in1, in2, in3, 4);
+ /* add block and store 4x4 */
+ ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
diff --git a/third_party/aom/av1/common/mips/msa/av1_idct8x8_msa.c b/third_party/aom/av1/common/mips/msa/av1_idct8x8_msa.c
new file mode 100644
index 000000000..71117051b
--- /dev/null
+++ b/third_party/aom/av1/common/mips/msa/av1_idct8x8_msa.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+#include "aom_dsp/mips/inv_txfm_msa.h"
+
+void av1_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride, int32_t tx_type) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+ /* load vector elements of 8x8 block */
+ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ /* DCT in horizontal */
+ AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ /* DCT in vertical */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ break;
+ case ADST_DCT:
+ /* DCT in horizontal */
+ AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ /* ADST in vertical */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ case DCT_ADST:
+ /* ADST in horizontal */
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ /* DCT in vertical */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ break;
+ case ADST_ADST:
+ /* ADST in horizontal */
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ /* ADST in vertical */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ default: assert(0); break;
+ }
+
+ /* final rounding (add 2^4, divide by 2^5) and shift */
+ SRARI_H4_SH(in0, in1, in2, in3, 5);
+ SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+ /* add block and store 8x8 */
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+ dst += (4 * dst_stride);
+ AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
new file mode 100644
index 000000000..d4df3790f
--- /dev/null
+++ b/third_party/aom/av1/common/mv.h
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_MV_H_
+#define AV1_COMMON_MV_H_
+
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "aom_dsp/aom_filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct mv {
+ int16_t row;
+ int16_t col;
+} MV;
+
+typedef union int_mv {
+ uint32_t as_int;
+ MV as_mv;
+} int_mv; /* facilitates faster equality tests and copies */
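+// e.g. two motion vectors can be compared (or copied) as a single 32-bit
+// value: if (a.as_int == b.as_int), the row/col pairs are identical.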
+
+typedef struct mv32 {
+ int32_t row;
+ int32_t col;
+} MV32;
+
+#if (CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR) && CONFIG_GLOBAL_MOTION
+#define SEPARATE_GLOBAL_MOTION 1
+#endif // (CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR) && CONFIG_GLOBAL_MOTION
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+// Bits of precision used for the model
+#define WARPEDMODEL_PREC_BITS 16
+#define WARPEDMODEL_ROW3HOMO_PREC_BITS 16
+
+#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
+#define WARPEDMODEL_DIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS + 1))
+#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 1))
+#define WARPEDMODEL_ROW3HOMO_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 1))
+
+// Bits of subpel precision for warped interpolation
+#define WARPEDPIXEL_PREC_BITS 6
+#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
+
+// Taps for the n-tap warp filter
+#define WARPEDPIXEL_FILTER_TAPS 6
+
+// Precision of filter taps
+#define WARPEDPIXEL_FILTER_BITS 7
+
+// Precision bits reduction after horizontal shear
+#define HORSHEAR_REDUCE_PREC_BITS 5
+#define VERSHEAR_REDUCE_PREC_BITS \
+ (2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)
+
+#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
+
+/* clang-format off */
+typedef enum {
+ IDENTITY = 0, // identity transformation, 0-parameter
+ TRANSLATION = 1, // translational motion, 2-parameter
+ ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter
+ AFFINE = 3, // affine, 6-parameter
+ HORTRAPEZOID = 4, // constrained homography, hor trapezoid, 6-parameter
+ VERTRAPEZOID = 5, // constrained homography, ver trapezoid, 6-parameter
+ HOMOGRAPHY = 6, // homography, 8-parameter
+ TRANS_TYPES = 7,
+} TransformationType;
+/* clang-format on */
+
+// Number of types used for global motion (must be >= 3 and <= TRANS_TYPES)
+// The following can be useful:
+// GLOBAL_TRANS_TYPES 3 - up to rotation-zoom
+// GLOBAL_TRANS_TYPES 4 - up to affine
+// GLOBAL_TRANS_TYPES 6 - up to hor/ver trapezoids
+// GLOBAL_TRANS_TYPES 7 - up to full homography
+#define GLOBAL_TRANS_TYPES 4
+
+typedef struct {
+#if CONFIG_GLOBAL_MOTION
+ int global_warp_allowed;
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_WARPED_MOTION
+ int local_warp_allowed;
+#endif // CONFIG_WARPED_MOTION
+} WarpTypesAllowed;
+
+// Number of parameters used by each transformation in TransformationType
+static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6, 6, 6, 8 };
+
+// The order of values in the wmmat matrix below is best described
+// by the homography:
+//     [x'    (m2 m3 m0   [x
+// z .  y'  =  m4 m5 m1 *  y
+//     1]      m6 m7 1)    1]
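+//
+// Written out (with m6 = m7 = 0 for the purely affine types, so the divisor
+// reduces to 1):
+//   x' = (m2 * x + m3 * y + m0) / (m6 * x + m7 * y + 1)
+//   y' = (m4 * x + m3 * y + m1) / (m6 * x + m7 * y + 1)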
+typedef struct {
+ TransformationType wmtype;
+ int32_t wmmat[8];
+ int16_t alpha, beta, gamma, delta;
+} WarpedMotionParams;
+
+static INLINE void set_default_warp_params(WarpedMotionParams *wm) {
+ static const int32_t default_wm_mat[8] = {
+ 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0
+ };
+ memset(wm, 0, sizeof(*wm));
+ memcpy(wm->wmmat, default_wm_mat, sizeof(wm->wmmat));
+ wm->wmtype = IDENTITY;
+}
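+/* Illustrative usage (not from the aom sources): reset a model before
+ * reading new parameters, e.g.
+ *
+ *   WarpedMotionParams wm;
+ *   set_default_warp_params(&wm);
+ *   assert(wm.wmtype == IDENTITY);
+ *   assert(wm.wmmat[2] == (1 << WARPEDMODEL_PREC_BITS));  // unit diagonal
+ */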
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+#if CONFIG_GLOBAL_MOTION
+// The following constants describe the various precisions
+// of different parameters in the global motion experiment.
+//
+// Given the general homography:
+// [x' (a b c [x
+// z . y' = d e f * y
+// 1] g h i) 1]
+//
+// Constants using the name ALPHA here are related to parameters
+// a, b, d, e. Constants using the name TRANS are related
+// to parameters c and f.
+//
+// Anything ending in PREC_BITS is the number of bits of precision
+// to maintain when converting from double to integer.
+//
+// The ABS parameters are used to create an upper and lower bound
+// for each parameter. In other words, after a parameter is integerized
+// it is clamped between -(1 << ABS_XXX_BITS) and (1 << ABS_XXX_BITS).
+//
+// XXX_PREC_DIFF and XXX_DECODE_FACTOR are computed once here to prevent
+// repetitive computation on the decoder side. They allow the global motion
+// parameters to be encoded at a lower precision than the warped model
+// precision, which means they need to be converted to warped precision when
+// they are decoded.
+//
+// XX_MIN, XX_MAX are also computed to avoid repeated computation
+
+#define SUBEXPFIN_K 3
+#define GM_TRANS_PREC_BITS 6
+#define GM_ABS_TRANS_BITS 12
+#define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3)
+#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
+#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
+#define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF)
+#define GM_TRANS_ONLY_DECODE_FACTOR (1 << GM_TRANS_ONLY_PREC_DIFF)
+
+#define GM_ALPHA_PREC_BITS 15
+#define GM_ABS_ALPHA_BITS 12
+#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
+#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
+
+#define GM_ROW3HOMO_PREC_BITS 16
+#define GM_ABS_ROW3HOMO_BITS 11
+#define GM_ROW3HOMO_PREC_DIFF \
+ (WARPEDMODEL_ROW3HOMO_PREC_BITS - GM_ROW3HOMO_PREC_BITS)
+#define GM_ROW3HOMO_DECODE_FACTOR (1 << GM_ROW3HOMO_PREC_DIFF)
+
+#define GM_TRANS_MAX (1 << GM_ABS_TRANS_BITS)
+#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
+#define GM_ROW3HOMO_MAX (1 << GM_ABS_ROW3HOMO_BITS)
+
+#define GM_TRANS_MIN -GM_TRANS_MAX
+#define GM_ALPHA_MIN -GM_ALPHA_MAX
+#define GM_ROW3HOMO_MIN -GM_ROW3HOMO_MAX
+
+// Use global motion parameters for sub8x8 blocks
+#define GLOBAL_SUB8X8_USED 0
+
+static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) {
+ const int bw = block_size_wide[bs];
+ return mi_col * MI_SIZE + bw / 2 - 1;
+}
+
+static INLINE int block_center_y(int mi_row, BLOCK_SIZE bs) {
+ const int bh = block_size_high[bs];
+ return mi_row * MI_SIZE + bh / 2 - 1;
+}
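+// For example, assuming MI_SIZE == 8 (no CONFIG_CB4X4): a BLOCK_16X16 at
+// mi_col == 4 has block_center_x == 4 * 8 + 16 / 2 - 1 == 39, i.e. the
+// center is expressed in full pixels relative to the frame origin.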
+
+static INLINE int convert_to_trans_prec(int allow_hp, int coor) {
+ if (allow_hp)
+ return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3);
+ else
+ return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2;
+}
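+// For example, a full-pel offset of one pixel is (1 << WARPEDMODEL_PREC_BITS)
+// in model precision. With allow_hp this becomes
+// ROUND_POWER_OF_TWO_SIGNED(1 << 16, 13) == 8, i.e. 8 units of 1/8 pel.
+// Without allow_hp it is first rounded to quarter-pel,
+// ROUND_POWER_OF_TWO_SIGNED(1 << 16, 14) * 2 == 8, so the low (eighth-pel)
+// bit of the result is always zero.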
+
+// Convert a global motion translation vector (which may have more bits than a
+// regular motion vector) into a motion vector
+static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
+ int allow_hp, BLOCK_SIZE bsize,
+ int mi_col, int mi_row,
+ int block_idx) {
+ const int unify_bsize = CONFIG_CB4X4;
+ int_mv res;
+ const int32_t *mat = gm->wmmat;
+ int x, y, tx, ty;
+
+ if (gm->wmtype == TRANSLATION) {
+ res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF;
+ res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF;
+ return res;
+ }
+
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+ x = block_center_x(mi_col, bsize);
+ y = block_center_y(mi_row, bsize);
+ } else {
+ x = block_center_x(mi_col, bsize);
+ y = block_center_y(mi_row, bsize);
+ x += (block_idx & 1) * MI_SIZE / 2;
+ y += (block_idx & 2) * MI_SIZE / 4;
+ }
+
+ if (gm->wmtype == ROTZOOM) {
+ assert(gm->wmmat[5] == gm->wmmat[2]);
+ assert(gm->wmmat[4] == -gm->wmmat[3]);
+ }
+ if (gm->wmtype > AFFINE) {
+ int xc = (int)((int64_t)mat[2] * x + (int64_t)mat[3] * y + mat[0]);
+ int yc = (int)((int64_t)mat[4] * x + (int64_t)mat[5] * y + mat[1]);
+ const int Z = (int)((int64_t)mat[6] * x + (int64_t)mat[7] * y +
+ (1 << WARPEDMODEL_ROW3HOMO_PREC_BITS));
+ xc *= 1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS - WARPEDMODEL_PREC_BITS);
+ yc *= 1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS - WARPEDMODEL_PREC_BITS);
+ xc = (int)(xc > 0 ? ((int64_t)xc + Z / 2) / Z : ((int64_t)xc - Z / 2) / Z);
+ yc = (int)(yc > 0 ? ((int64_t)yc + Z / 2) / Z : ((int64_t)yc - Z / 2) / Z);
+ tx = convert_to_trans_prec(allow_hp, xc) - (x << 3);
+ ty = convert_to_trans_prec(allow_hp, yc) - (y << 3);
+ } else {
+ const int xc =
+ (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0];
+ const int yc =
+ mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1];
+ tx = convert_to_trans_prec(allow_hp, xc);
+ ty = convert_to_trans_prec(allow_hp, yc);
+ }
+
+ res.as_mv.row = ty;
+ res.as_mv.col = tx;
+ return res;
+}
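+// For example, a pure TRANSLATION model with wmmat[0] == (1 << 16) (one pixel
+// vertically at WARPEDMODEL_PREC_BITS precision) yields
+// res.as_mv.row == (1 << 16) >> GM_TRANS_ONLY_PREC_DIFF == 8, i.e. one pixel
+// in 1/8-pel motion vector units, independent of the block position.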
+
+static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) {
+ if (gm->wmmat[6] != 0 || gm->wmmat[7] != 0) {
+ if (!gm->wmmat[6] && !gm->wmmat[4]) return HORTRAPEZOID;
+ if (!gm->wmmat[7] && !gm->wmmat[3]) return VERTRAPEZOID;
+ return HOMOGRAPHY;
+ }
+ if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] &&
+ gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) {
+ return ((!gm->wmmat[1] && !gm->wmmat[0]) ? IDENTITY : TRANSLATION);
+ }
+ if (gm->wmmat[2] == gm->wmmat[5] && gm->wmmat[3] == -gm->wmmat[4])
+ return ROTZOOM;
+ else
+ return AFFINE;
+}
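+// For example, a default model (diagonal == 1 << WARPEDMODEL_PREC_BITS, all
+// other entries zero) classifies as IDENTITY; adding a nonzero wmmat[0] or
+// wmmat[1] makes it TRANSLATION, and a nonzero wmmat[6] or wmmat[7] makes it
+// one of the homography types.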
+#endif // CONFIG_GLOBAL_MOTION
+
+#if CONFIG_REF_MV
+typedef struct candidate_mv {
+ int_mv this_mv;
+ int_mv comp_mv;
+ uint8_t pred_diff[2];
+ int weight;
+} CANDIDATE_MV;
+#endif
+
+static INLINE int is_zero_mv(const MV *mv) {
+ return *((const uint32_t *)mv) == 0;
+}
+
+static INLINE int is_equal_mv(const MV *a, const MV *b) {
+ return *((const uint32_t *)a) == *((const uint32_t *)b);
+}
+
+static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
+ int max_row) {
+ mv->col = clamp(mv->col, min_col, max_col);
+ mv->row = clamp(mv->row, min_row, max_row);
+}
+
+static INLINE int mv_has_subpel(const MV *mv) {
+ return (mv->row & SUBPEL_MASK) || (mv->col & SUBPEL_MASK);
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_MV_H_
diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c
new file mode 100644
index 000000000..5222948c8
--- /dev/null
+++ b/third_party/aom/av1/common/mvref_common.c
@@ -0,0 +1,1164 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/mvref_common.h"
+#if CONFIG_WARPED_MOTION
+#include "av1/common/warped_motion.h"
+#endif // CONFIG_WARPED_MOTION
+
+#if CONFIG_REF_MV
+
+static uint8_t add_ref_mv_candidate(
+ const MODE_INFO *const candidate_mi, const MB_MODE_INFO *const candidate,
+ const MV_REFERENCE_FRAME rf[2], uint8_t *refmv_count,
+ CANDIDATE_MV *ref_mv_stack, const int use_hp, int len, int block, int col) {
+ int index = 0, ref;
+ int newmv_count = 0;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+
+ if (rf[1] == NONE_FRAME) {
+ // single reference frame
+ for (ref = 0; ref < 2; ++ref) {
+ if (candidate->ref_frame[ref] == rf[0]) {
+ int_mv this_refmv = get_sub_block_mv(candidate_mi, ref, col, block);
+ lower_mv_precision(&this_refmv.as_mv, use_hp);
+
+ for (index = 0; index < *refmv_count; ++index)
+ if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break;
+
+ if (index < *refmv_count) ref_mv_stack[index].weight += 2 * len;
+
+ // Add a new item to the list.
+ if (index == *refmv_count) {
+ ref_mv_stack[index].this_mv = this_refmv;
+ ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx(
+ get_sub_block_pred_mv(candidate_mi, ref, col, block), this_refmv);
+ ref_mv_stack[index].weight = 2 * len;
+ ++(*refmv_count);
+
+ if (candidate->mode == NEWMV) ++newmv_count;
+ }
+
+ if (candidate_mi->mbmi.sb_type < BLOCK_8X8 && block >= 0 &&
+ !unify_bsize) {
+ int alt_block = 3 - block;
+ this_refmv = get_sub_block_mv(candidate_mi, ref, col, alt_block);
+ lower_mv_precision(&this_refmv.as_mv, use_hp);
+
+ for (index = 0; index < *refmv_count; ++index)
+ if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break;
+
+ if (index < *refmv_count) ref_mv_stack[index].weight += len;
+
+ // Add a new item to the list.
+ if (index == *refmv_count) {
+ ref_mv_stack[index].this_mv = this_refmv;
+ ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx(
+ get_sub_block_pred_mv(candidate_mi, ref, col, alt_block),
+ this_refmv);
+ ref_mv_stack[index].weight = len;
+ ++(*refmv_count);
+
+ if (candidate->mode == NEWMV) ++newmv_count;
+ }
+ }
+ }
+ }
+ } else {
+ // compound reference frame
+ if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) {
+ int_mv this_refmv[2];
+
+ for (ref = 0; ref < 2; ++ref) {
+ this_refmv[ref] = get_sub_block_mv(candidate_mi, ref, col, block);
+ lower_mv_precision(&this_refmv[ref].as_mv, use_hp);
+ }
+
+ for (index = 0; index < *refmv_count; ++index)
+ if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) &&
+ (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int))
+ break;
+
+ if (index < *refmv_count) ref_mv_stack[index].weight += 2 * len;
+
+ // Add a new item to the list.
+ if (index == *refmv_count) {
+ ref_mv_stack[index].this_mv = this_refmv[0];
+ ref_mv_stack[index].comp_mv = this_refmv[1];
+ ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx(
+ get_sub_block_pred_mv(candidate_mi, 0, col, block), this_refmv[0]);
+ ref_mv_stack[index].pred_diff[1] = av1_get_pred_diff_ctx(
+ get_sub_block_pred_mv(candidate_mi, 1, col, block), this_refmv[1]);
+ ref_mv_stack[index].weight = 2 * len;
+ ++(*refmv_count);
+
+#if CONFIG_EXT_INTER
+ if (candidate->mode == NEW_NEWMV)
+#else
+ if (candidate->mode == NEWMV)
+#endif // CONFIG_EXT_INTER
+ ++newmv_count;
+ }
+
+ if (candidate_mi->mbmi.sb_type < BLOCK_8X8 && block >= 0 &&
+ !unify_bsize) {
+ int alt_block = 3 - block;
+ this_refmv[0] = get_sub_block_mv(candidate_mi, 0, col, alt_block);
+ this_refmv[1] = get_sub_block_mv(candidate_mi, 1, col, alt_block);
+
+ for (ref = 0; ref < 2; ++ref)
+ lower_mv_precision(&this_refmv[ref].as_mv, use_hp);
+
+ for (index = 0; index < *refmv_count; ++index)
+ if (ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int &&
+ ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)
+ break;
+
+ if (index < *refmv_count) ref_mv_stack[index].weight += len;
+
+ // Add a new item to the list.
+ if (index == *refmv_count) {
+ ref_mv_stack[index].this_mv = this_refmv[0];
+ ref_mv_stack[index].comp_mv = this_refmv[1];
+ ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx(
+ get_sub_block_pred_mv(candidate_mi, 0, col, block),
+ this_refmv[0]);
+          ref_mv_stack[index].pred_diff[1] = av1_get_pred_diff_ctx(
+ get_sub_block_pred_mv(candidate_mi, 1, col, block),
+ this_refmv[1]);
+ ref_mv_stack[index].weight = len;
+ ++(*refmv_count);
+
+#if CONFIG_EXT_INTER
+ if (candidate->mode == NEW_NEWMV)
+#else
+ if (candidate->mode == NEWMV)
+#endif // CONFIG_EXT_INTER
+ ++newmv_count;
+ }
+ }
+ }
+ }
+ return newmv_count;
+}
+
+static uint8_t scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ const int mi_row, const int mi_col, int block,
+ const MV_REFERENCE_FRAME rf[2], int row_offset,
+ CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count) {
+ const TileInfo *const tile = &xd->tile;
+ int i;
+ uint8_t newmv_count = 0;
+#if CONFIG_CB4X4
+ const int bsize = xd->mi[0]->mbmi.sb_type;
+ const int mi_offset =
+ bsize < BLOCK_8X8 ? mi_size_wide[BLOCK_4X4] : mi_size_wide[BLOCK_8X8];
+ // TODO(jingning): Revisit this part after cb4x4 is stable.
+ if (bsize >= BLOCK_8X8) row_offset *= 2;
+#else
+ const int mi_offset = mi_size_wide[BLOCK_8X8];
+#endif
+
+ for (i = 0; i < xd->n8_w && *refmv_count < MAX_REF_MV_STACK_SIZE;) {
+ POSITION mi_pos;
+#if CONFIG_CB4X4
+ const int use_step_16 = (xd->n8_w >= 16);
+#else
+ const int use_step_16 = (xd->n8_w >= 8);
+#endif
+
+ mi_pos.row = row_offset;
+ mi_pos.col = i;
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, &mi_pos)) {
+ const MODE_INFO *const candidate_mi =
+ xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+ const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+ int len = AOMMIN(xd->n8_w, mi_size_wide[candidate->sb_type]);
+ if (use_step_16) len = AOMMAX(mi_size_wide[BLOCK_16X16], len);
+ newmv_count += add_ref_mv_candidate(
+ candidate_mi, candidate, rf, refmv_count, ref_mv_stack,
+ cm->allow_high_precision_mv, len, block, mi_pos.col);
+ i += len;
+ } else {
+ if (use_step_16)
+ i += (mi_offset << 1);
+ else
+ i += mi_offset;
+ }
+ }
+
+ return newmv_count;
+}
+
+static uint8_t scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ const int mi_row, const int mi_col, int block,
+ const MV_REFERENCE_FRAME rf[2], int col_offset,
+ CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count) {
+ const TileInfo *const tile = &xd->tile;
+ int i;
+ uint8_t newmv_count = 0;
+#if CONFIG_CB4X4
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ const int mi_offset =
+ (bsize < BLOCK_8X8) ? mi_size_high[BLOCK_4X4] : mi_size_high[BLOCK_8X8];
+ if (bsize >= BLOCK_8X8) col_offset *= 2;
+#else
+ const int mi_offset = mi_size_wide[BLOCK_8X8];
+#endif
+
+ for (i = 0; i < xd->n8_h && *refmv_count < MAX_REF_MV_STACK_SIZE;) {
+ POSITION mi_pos;
+#if CONFIG_CB4X4
+ const int use_step_16 = (xd->n8_h >= 16);
+#else
+ const int use_step_16 = (xd->n8_h >= 8);
+#endif
+
+ mi_pos.row = i;
+ mi_pos.col = col_offset;
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, &mi_pos)) {
+ const MODE_INFO *const candidate_mi =
+ xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+ const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+ int len = AOMMIN(xd->n8_h, mi_size_high[candidate->sb_type]);
+ if (use_step_16) len = AOMMAX(mi_size_high[BLOCK_16X16], len);
+ newmv_count += add_ref_mv_candidate(
+ candidate_mi, candidate, rf, refmv_count, ref_mv_stack,
+ cm->allow_high_precision_mv, len, block, mi_pos.col);
+ i += len;
+ } else {
+ if (use_step_16)
+ i += (mi_offset << 1);
+ else
+ i += mi_offset;
+ }
+ }
+
+ return newmv_count;
+}
+
+static uint8_t scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ const int mi_row, const int mi_col, int block,
+ const MV_REFERENCE_FRAME rf[2], int row_offset,
+ int col_offset, CANDIDATE_MV *ref_mv_stack,
+ uint8_t *refmv_count) {
+ const TileInfo *const tile = &xd->tile;
+ POSITION mi_pos;
+ uint8_t newmv_count = 0;
+
+ mi_pos.row = row_offset;
+ mi_pos.col = col_offset;
+
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, &mi_pos) &&
+ *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ const MODE_INFO *const candidate_mi =
+ xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+ const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+ const int len = mi_size_wide[BLOCK_8X8];
+
+ newmv_count += add_ref_mv_candidate(
+ candidate_mi, candidate, rf, refmv_count, ref_mv_stack,
+ cm->allow_high_precision_mv, len, block, mi_pos.col);
+  } // Analyze the motion information of a single 8x8 block.
+
+ return newmv_count;
+}
+
+static int has_top_right(const MACROBLOCKD *xd, int mi_row, int mi_col,
+ int bs) {
+ const int mask_row = mi_row & MAX_MIB_MASK;
+ const int mask_col = mi_col & MAX_MIB_MASK;
+
+  // In a split partition, all blocks apart from the bottom-right one have a
+  // top right
+ int has_tr = !((mask_row & bs) && (mask_col & bs));
+
+ // bs > 0 and bs is a power of 2
+ assert(bs > 0 && !(bs & (bs - 1)));
+
+  // For each 4x4 group of blocks, the blocks to the right of the bottom-right
+  // block have not yet been decoded when it is decoded, so the bottom-right
+  // block does not have a top right
+ while (bs < MAX_MIB_SIZE) {
+ if (mask_col & bs) {
+ if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) {
+ has_tr = 0;
+ break;
+ }
+ } else {
+ break;
+ }
+ bs <<= 1;
+ }
+
+  // The left-hand one of two vertical rectangles always has a top right (as
+  // the block above will have been decoded)
+ if (xd->n8_w < xd->n8_h)
+ if (!xd->is_sec_rect) has_tr = 1;
+
+  // The bottom one of two horizontal rectangles never has a top right (as the
+  // block to the right won't have been decoded)
+ if (xd->n8_w > xd->n8_h)
+ if (xd->is_sec_rect) has_tr = 0;
+
+#if CONFIG_EXT_PARTITION_TYPES
+ // The bottom left square of a Vertical A does not have a top right as it is
+ // decoded before the right hand rectangle of the partition
+ if (xd->mi[0]->mbmi.partition == PARTITION_VERT_A)
+ if ((mask_row & bs) && !(mask_col & bs)) has_tr = 0;
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+ return has_tr;
+}
+
+static int add_col_ref_mv(const AV1_COMMON *cm,
+ const MV_REF *prev_frame_mvs_base,
+ const MACROBLOCKD *xd, int mi_row, int mi_col,
+ MV_REFERENCE_FRAME ref_frame, int blk_row,
+ int blk_col, uint8_t *refmv_count,
+ CANDIDATE_MV *ref_mv_stack, int16_t *mode_context) {
+ const MV_REF *prev_frame_mvs =
+ prev_frame_mvs_base + blk_row * cm->mi_cols + blk_col;
+ POSITION mi_pos;
+ int ref, idx;
+ int coll_blk_count = 0;
+ const int weight_unit = mi_size_wide[BLOCK_8X8];
+
+#if CONFIG_MV_COMPRESS
+ mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
+ mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
+#else
+ mi_pos.row = blk_row;
+ mi_pos.col = blk_col;
+#endif
+
+ if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, cm, &mi_pos))
+ return coll_blk_count;
+ for (ref = 0; ref < 2; ++ref) {
+ if (prev_frame_mvs->ref_frame[ref] == ref_frame) {
+ int_mv this_refmv = prev_frame_mvs->mv[ref];
+ lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv);
+
+ if (abs(this_refmv.as_mv.row) >= 16 || abs(this_refmv.as_mv.col) >= 16)
+ mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
+
+ for (idx = 0; idx < *refmv_count; ++idx)
+ if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
+
+ if (idx < *refmv_count) ref_mv_stack[idx].weight += 2 * weight_unit;
+
+ if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+ ref_mv_stack[idx].pred_diff[0] =
+ av1_get_pred_diff_ctx(prev_frame_mvs->pred_mv[ref], this_refmv);
+ ref_mv_stack[idx].weight = 2 * weight_unit;
+ ++(*refmv_count);
+ }
+
+ ++coll_blk_count;
+ }
+ }
+
+ return coll_blk_count;
+}
+
+static void setup_ref_mv_list(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MV_REFERENCE_FRAME ref_frame,
+ uint8_t *refmv_count, CANDIDATE_MV *ref_mv_stack,
+ int_mv *mv_ref_list, int block, int mi_row,
+ int mi_col, int16_t *mode_context) {
+ int idx, nearest_refmv_count = 0;
+ uint8_t newmv_count = 0;
+ CANDIDATE_MV tmp_mv;
+ int len, nr_len;
+
+#if CONFIG_MV_COMPRESS
+ const MV_REF *const prev_frame_mvs_base =
+ cm->use_prev_frame_mvs
+ ? cm->prev_frame->mvs + (((mi_row >> 1) << 1) + 1) * cm->mi_cols +
+ ((mi_col >> 1) << 1) + 1
+ : NULL;
+#else
+ const MV_REF *const prev_frame_mvs_base =
+ cm->use_prev_frame_mvs
+ ? cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col
+ : NULL;
+#endif
+
+ const int bs = AOMMAX(xd->n8_w, xd->n8_h);
+ const int has_tr = has_top_right(xd, mi_row, mi_col, bs);
+ MV_REFERENCE_FRAME rf[2];
+
+ av1_set_ref_frame(rf, ref_frame);
+ mode_context[ref_frame] = 0;
+ *refmv_count = 0;
+
+ // Scan the first above row mode info.
+ newmv_count += scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf, -1,
+ ref_mv_stack, refmv_count);
+ // Scan the first left column mode info.
+ newmv_count += scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf, -1,
+ ref_mv_stack, refmv_count);
+
+ // Check top-right boundary
+ if (has_tr)
+ newmv_count += scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1,
+ xd->n8_w, ref_mv_stack, refmv_count);
+
+ nearest_refmv_count = *refmv_count;
+
+ for (idx = 0; idx < nearest_refmv_count; ++idx)
+ ref_mv_stack[idx].weight += REF_CAT_LEVEL;
+#if CONFIG_TEMPMV_SIGNALING
+ if (cm->use_prev_frame_mvs && rf[1] == NONE_FRAME) {
+#else
+ if (prev_frame_mvs_base && cm->show_frame && cm->last_show_frame &&
+ rf[1] == NONE_FRAME) {
+#endif
+ int blk_row, blk_col;
+ int coll_blk_count = 0;
+#if CONFIG_CB4X4
+ const int mi_step = (xd->n8_w == 1 || xd->n8_h == 1)
+ ? mi_size_wide[BLOCK_8X8]
+ : mi_size_wide[BLOCK_16X16];
+#else
+ const int mi_step = mi_size_wide[BLOCK_16X16];
+#endif
+
+#if CONFIG_TPL_MV
+ int tpl_sample_pos[5][2] = { { -1, xd->n8_w },
+ { 0, xd->n8_w },
+ { xd->n8_h, xd->n8_w },
+ { xd->n8_h, 0 },
+ { xd->n8_h, -1 } };
+ int i;
+#endif
+
+ for (blk_row = 0; blk_row < xd->n8_h; blk_row += mi_step) {
+ for (blk_col = 0; blk_col < xd->n8_w; blk_col += mi_step) {
+ coll_blk_count += add_col_ref_mv(
+ cm, prev_frame_mvs_base, xd, mi_row, mi_col, ref_frame, blk_row,
+ blk_col, refmv_count, ref_mv_stack, mode_context);
+ }
+ }
+
+#if CONFIG_TPL_MV
+ for (i = 0; i < 5; ++i) {
+ blk_row = tpl_sample_pos[i][0];
+ blk_col = tpl_sample_pos[i][1];
+ coll_blk_count += add_col_ref_mv(cm, prev_frame_mvs_base, xd, mi_row,
+ mi_col, ref_frame, blk_row, blk_col,
+ refmv_count, ref_mv_stack, mode_context);
+ }
+#endif
+
+ if (coll_blk_count == 0) mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
+ } else {
+ mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
+ }
+
+ // Scan the second outer area.
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, -1, ref_mv_stack,
+ refmv_count);
+ for (idx = 2; idx <= 3; ++idx) {
+ scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf, -idx, ref_mv_stack,
+ refmv_count);
+ scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf, -idx, ref_mv_stack,
+ refmv_count);
+ }
+ scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf, -4, ref_mv_stack,
+ refmv_count);
+
+ switch (nearest_refmv_count) {
+ case 0:
+ mode_context[ref_frame] |= 0;
+ if (*refmv_count >= 1) mode_context[ref_frame] |= 1;
+
+ if (*refmv_count == 1)
+ mode_context[ref_frame] |= (1 << REFMV_OFFSET);
+ else if (*refmv_count >= 2)
+ mode_context[ref_frame] |= (2 << REFMV_OFFSET);
+ break;
+ case 1:
+ mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;
+
+ if (*refmv_count == 1)
+ mode_context[ref_frame] |= (3 << REFMV_OFFSET);
+ else if (*refmv_count >= 2)
+ mode_context[ref_frame] |= (4 << REFMV_OFFSET);
+ break;
+
+ case 2:
+ default:
+ if (newmv_count >= 2)
+ mode_context[ref_frame] |= 4;
+ else if (newmv_count == 1)
+ mode_context[ref_frame] |= 5;
+ else
+ mode_context[ref_frame] |= 6;
+
+ mode_context[ref_frame] |= (5 << REFMV_OFFSET);
+ break;
+ }
+
+ // Rank the likelihood and assign nearest and near mvs.
+ len = nearest_refmv_count;
+ while (len > 0) {
+ nr_len = 0;
+ for (idx = 1; idx < len; ++idx) {
+ if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) {
+ tmp_mv = ref_mv_stack[idx - 1];
+ ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+ ref_mv_stack[idx] = tmp_mv;
+ nr_len = idx;
+ }
+ }
+ len = nr_len;
+ }
+
+ len = *refmv_count;
+ while (len > nearest_refmv_count) {
+ nr_len = nearest_refmv_count;
+ for (idx = nearest_refmv_count + 1; idx < len; ++idx) {
+ if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) {
+ tmp_mv = ref_mv_stack[idx - 1];
+ ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+ ref_mv_stack[idx] = tmp_mv;
+ nr_len = idx;
+ }
+ }
+ len = nr_len;
+ }
+
+ if (rf[1] > NONE_FRAME) {
+ for (idx = 0; idx < *refmv_count; ++idx) {
+ clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ }
+ } else {
+ for (idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count); ++idx) {
+ mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
+ clamp_mv_ref(&mv_ref_list[idx].as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ }
+ }
+}
+#endif
+
+// This function searches the neighbourhood of a given MB/SB
+// to try and find candidate reference vectors.
+static void find_mv_refs_idx(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+ int_mv *mv_ref_list, int block, int mi_row,
+ int mi_col, find_mv_refs_sync sync,
+ void *const data, int16_t *mode_context,
+ int_mv zeromv) {
+ const int *ref_sign_bias = cm->ref_frame_sign_bias;
+ int i, refmv_count = 0;
+#if !CONFIG_REF_MV
+ const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+#endif
+ int different_ref_found = 0;
+ int context_counter = 0;
+#if CONFIG_MV_COMPRESS
+ const TileInfo *const tile_ = &xd->tile;
+ int mi_row_end = tile_->mi_row_end;
+ int mi_col_end = tile_->mi_col_end;
+ const MV_REF *const prev_frame_mvs =
+ cm->use_prev_frame_mvs
+ ? cm->prev_frame->mvs +
+ AOMMIN(((mi_row >> 1) << 1) + 1 + (((xd->n8_h - 1) >> 1) << 1),
+ mi_row_end - 1) *
+ cm->mi_cols +
+ AOMMIN(((mi_col >> 1) << 1) + 1 + (((xd->n8_w - 1) >> 1) << 1),
+ mi_col_end - 1)
+ : NULL;
+#else
+ const MV_REF *const prev_frame_mvs =
+ cm->use_prev_frame_mvs
+ ? cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col
+ : NULL;
+#endif
+ const TileInfo *const tile = &xd->tile;
+ const BLOCK_SIZE bsize = mi->mbmi.sb_type;
+ const int bw = block_size_wide[AOMMAX(bsize, BLOCK_8X8)];
+ const int bh = block_size_high[AOMMAX(bsize, BLOCK_8X8)];
+#if CONFIG_REF_MV
+ POSITION mv_ref_search[MVREF_NEIGHBOURS];
+ const int num_8x8_blocks_wide = num_8x8_blocks_wide_lookup[bsize];
+ const int num_8x8_blocks_high = num_8x8_blocks_high_lookup[bsize];
+ mv_ref_search[0].row = num_8x8_blocks_high - 1;
+ mv_ref_search[0].col = -1;
+ mv_ref_search[1].row = -1;
+ mv_ref_search[1].col = num_8x8_blocks_wide - 1;
+ mv_ref_search[2].row = -1;
+ mv_ref_search[2].col = (num_8x8_blocks_wide - 1) >> 1;
+ mv_ref_search[3].row = (num_8x8_blocks_high - 1) >> 1;
+ mv_ref_search[3].col = -1;
+ mv_ref_search[4].row = -1;
+ mv_ref_search[4].col = -1;
+#if CONFIG_EXT_PARTITION_TYPES
+ if (num_8x8_blocks_wide == num_8x8_blocks_high) {
+ mv_ref_search[5].row = -1;
+ mv_ref_search[5].col = 0;
+ mv_ref_search[6].row = 0;
+ mv_ref_search[6].col = -1;
+ } else {
+ mv_ref_search[5].row = -1;
+ mv_ref_search[5].col = num_8x8_blocks_wide;
+ mv_ref_search[6].row = num_8x8_blocks_high;
+ mv_ref_search[6].col = -1;
+ }
+#else
+ mv_ref_search[5].row = -1;
+ mv_ref_search[5].col = num_8x8_blocks_wide;
+ mv_ref_search[6].row = num_8x8_blocks_high;
+ mv_ref_search[6].col = -1;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ mv_ref_search[7].row = -1;
+ mv_ref_search[7].col = -3;
+ mv_ref_search[8].row = num_8x8_blocks_high - 1;
+ mv_ref_search[8].col = -3;
+
+#if CONFIG_CB4X4
+ for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+ mv_ref_search[i].row *= 2;
+ mv_ref_search[i].col *= 2;
+ }
+#endif // CONFIG_CB4X4
+#endif // CONFIG_REF_MV
+
+  // The nearest 2 blocks are treated differently: if the size < 8x8 we get
+  // the mv from the bmi substructure, and we also need to keep a mode count.
+ for (i = 0; i < 2; ++i) {
+ const POSITION *const mv_ref = &mv_ref_search[i];
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) {
+ const MODE_INFO *const candidate_mi =
+ xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+ const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+ // Keep counts for entropy encoding.
+ context_counter += mode_2_counter[candidate->mode];
+ different_ref_found = 1;
+
+ if (candidate->ref_frame[0] == ref_frame)
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
+ refmv_count, mv_ref_list, bw, bh, xd, Done);
+ else if (candidate->ref_frame[1] == ref_frame)
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
+ refmv_count, mv_ref_list, bw, bh, xd, Done);
+ }
+ }
+
+  // Check the rest of the neighbors in much the same way as before, except
+  // that we don't need to keep track of sub-blocks or mode counts.
+ for (; i < MVREF_NEIGHBOURS; ++i) {
+ const POSITION *const mv_ref = &mv_ref_search[i];
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) {
+ const MB_MODE_INFO *const candidate =
+ !xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]
+ ? NULL
+ : &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
+#if CONFIG_REF_MV
+ if (candidate == NULL) continue;
+ if ((mi_row % MAX_MIB_SIZE) + mv_ref->row >= MAX_MIB_SIZE ||
+ (mi_col % MAX_MIB_SIZE) + mv_ref->col >= MAX_MIB_SIZE)
+ continue;
+#endif
+ different_ref_found = 1;
+
+ if (candidate->ref_frame[0] == ref_frame)
+ ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, bw, bh, xd,
+ Done);
+ else if (candidate->ref_frame[1] == ref_frame)
+ ADD_MV_REF_LIST(candidate->mv[1], refmv_count, mv_ref_list, bw, bh, xd,
+ Done);
+ }
+ }
+
+// TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
+// on the Windows platform. The sync here is unnecessary if use_prev_frame_mvs
+// is 0. But after removing it, the unit test hangs on Windows because several
+// threads end up waiting for one thread's signal.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+ if (cm->frame_parallel_decode && sync != NULL) {
+ sync(data, mi_row);
+ }
+#endif
+
+ // Check the last frame's mode and mv info.
+ if (cm->use_prev_frame_mvs) {
+ // Synchronize here for frame parallel decode if sync function is provided.
+ if (cm->frame_parallel_decode && sync != NULL) {
+ sync(data, mi_row);
+ }
+
+ if (prev_frame_mvs->ref_frame[0] == ref_frame) {
+ ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, bw, bh,
+ xd, Done);
+ } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
+ ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, bw, bh,
+ xd, Done);
+ }
+ }
+
+  // Since we couldn't find 2 mvs from the same reference frame, go back
+  // through the neighbors and find motion vectors from different reference
+  // frames.
+ if (different_ref_found) {
+ for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+ const POSITION *mv_ref = &mv_ref_search[i];
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) {
+ const MB_MODE_INFO *const candidate =
+ !xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]
+ ? NULL
+ : &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
+#if CONFIG_REF_MV
+ if (candidate == NULL) continue;
+ if ((mi_row % MAX_MIB_SIZE) + mv_ref->row >= MAX_MIB_SIZE ||
+ (mi_col % MAX_MIB_SIZE) + mv_ref->col >= MAX_MIB_SIZE)
+ continue;
+#endif
+
+ // If the candidate is INTRA we don't want to consider its mv.
+ IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+ refmv_count, mv_ref_list, bw, bh, xd, Done);
+ }
+ }
+ }
+
+  // Since we still don't have a candidate, we'll try the last frame.
+ if (cm->use_prev_frame_mvs) {
+ if (prev_frame_mvs->ref_frame[0] != ref_frame &&
+ prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
+ int_mv mv = prev_frame_mvs->mv[0];
+ if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
+ ref_sign_bias[ref_frame]) {
+ mv.as_mv.row *= -1;
+ mv.as_mv.col *= -1;
+ }
+ ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done);
+ }
+
+ if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
+ prev_frame_mvs->ref_frame[1] != ref_frame) {
+ int_mv mv = prev_frame_mvs->mv[1];
+ if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
+ ref_sign_bias[ref_frame]) {
+ mv.as_mv.row *= -1;
+ mv.as_mv.col *= -1;
+ }
+ ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done);
+ }
+ }
+
+Done:
+ if (mode_context)
+ mode_context[ref_frame] = counter_to_context[context_counter];
+ for (i = refmv_count; i < MAX_MV_REF_CANDIDATES; ++i)
+ mv_ref_list[i].as_int = zeromv.as_int;
+}
+
+#if CONFIG_EXT_INTER
+// This function keeps a mode count for a given MB/SB
+void av1_update_mv_context(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+ int_mv *mv_ref_list, int block, int mi_row,
+ int mi_col, int16_t *mode_context) {
+ int i, refmv_count = 0;
+#if !CONFIG_REF_MV
+ const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+#endif
+ int context_counter = 0;
+ const int bw = block_size_wide[mi->mbmi.sb_type];
+ const int bh = block_size_high[mi->mbmi.sb_type];
+ const TileInfo *const tile = &xd->tile;
+#if CONFIG_REF_MV
+ POSITION mv_ref_search[MVREF_NEIGHBOURS];
+ const int num_8x8_blocks_wide = mi_size_wide[mi->mbmi.sb_type];
+ const int num_8x8_blocks_high = mi_size_high[mi->mbmi.sb_type];
+ mv_ref_search[0].row = num_8x8_blocks_high - 1;
+ mv_ref_search[0].col = -1;
+ mv_ref_search[1].row = -1;
+ mv_ref_search[1].col = num_8x8_blocks_wide - 1;
+ mv_ref_search[2].row = -1;
+ mv_ref_search[2].col = (num_8x8_blocks_wide - 1) >> 1;
+ mv_ref_search[3].row = (num_8x8_blocks_high - 1) >> 1;
+ mv_ref_search[3].col = -1;
+ mv_ref_search[4].row = -1;
+ mv_ref_search[4].col = -1;
+#if CONFIG_EXT_PARTITION_TYPES
+ if (num_8x8_blocks_wide == num_8x8_blocks_high) {
+ mv_ref_search[5].row = -1;
+ mv_ref_search[5].col = 0;
+ mv_ref_search[6].row = 0;
+ mv_ref_search[6].col = -1;
+ } else {
+ mv_ref_search[5].row = -1;
+ mv_ref_search[5].col = num_8x8_blocks_wide;
+ mv_ref_search[6].row = num_8x8_blocks_high;
+ mv_ref_search[6].col = -1;
+ }
+#else
+ mv_ref_search[5].row = -1;
+ mv_ref_search[5].col = num_8x8_blocks_wide;
+ mv_ref_search[6].row = num_8x8_blocks_high;
+ mv_ref_search[6].col = -1;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ mv_ref_search[7].row = -1;
+ mv_ref_search[7].col = -3;
+ mv_ref_search[8].row = num_8x8_blocks_high - 1;
+ mv_ref_search[8].col = -3;
+#endif
+
+ // Blank the reference vector list
+ memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  // Only the nearest 2 blocks are examined.
+  // If the size < 8x8, we get the mv from the bmi substructure.
+ for (i = 0; i < 2; ++i) {
+ const POSITION *const mv_ref = &mv_ref_search[i];
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) {
+ const MODE_INFO *const candidate_mi =
+ xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+ const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+
+ // Keep counts for entropy encoding.
+ context_counter += mode_2_counter[candidate->mode];
+
+ if (candidate->ref_frame[0] == ref_frame) {
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
+ refmv_count, mv_ref_list, bw, bh, xd, Done);
+ } else if (candidate->ref_frame[1] == ref_frame) {
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
+ refmv_count, mv_ref_list, bw, bh, xd, Done);
+ }
+ }
+ }
+
+Done:
+
+ if (mode_context)
+ mode_context[ref_frame] = counter_to_context[context_counter];
+}
+#endif // CONFIG_EXT_INTER
+
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+#if CONFIG_REF_MV
+ uint8_t *ref_mv_count, CANDIDATE_MV *ref_mv_stack,
+#if CONFIG_EXT_INTER
+ int16_t *compound_mode_context,
+#endif // CONFIG_EXT_INTER
+#endif
+ int_mv *mv_ref_list, int mi_row, int mi_col,
+ find_mv_refs_sync sync, void *const data,
+ int16_t *mode_context) {
+ int_mv zeromv[2];
+#if CONFIG_GLOBAL_MOTION
+ BLOCK_SIZE bsize = mi->mbmi.sb_type;
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_REF_MV
+ int idx, all_zero = 1;
+#if CONFIG_GLOBAL_MOTION
+ MV_REFERENCE_FRAME rf[2];
+#endif // CONFIG_GLOBAL_MOTION
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ av1_update_mv_context(cm, xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col,
+#if CONFIG_REF_MV
+ compound_mode_context);
+#else
+ mode_context);
+#endif // CONFIG_REF_MV
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_GLOBAL_MOTION
+#if CONFIG_REF_MV
+ av1_set_ref_frame(rf, ref_frame);
+ zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[rf[0]],
+ cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, 0)
+ .as_int;
+ zeromv[1].as_int = (rf[1] != NONE_FRAME)
+ ? gm_get_motion_vector(&cm->global_motion[rf[1]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int
+ : 0;
+#else
+ zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame],
+ cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, 0)
+ .as_int;
+ zeromv[1].as_int = 0;
+#endif // CONFIG_REF_MV
+#else
+ zeromv[0].as_int = zeromv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+
+#if CONFIG_REF_MV
+ if (ref_frame <= ALTREF_FRAME)
+#endif // CONFIG_REF_MV
+ find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col,
+ sync, data, mode_context, zeromv[0]);
+
+#if CONFIG_REF_MV
+ setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack, mv_ref_list,
+ -1, mi_row, mi_col, mode_context);
+ /* Note: If global motion is enabled, then we want to set the ALL_ZERO flag
+ iff all of the MVs we could generate with NEARMV/NEARESTMV are equivalent
+ to the global motion vector.
+ Note: For the following to work properly, the encoder can't throw away
+ any global motion models after calling this function, even if they are
+ unused. Instead we rely on the recode loop: If any non-IDENTITY model
+ is unused, the whole frame will be re-encoded without it.
+ The problem is that, otherwise, we can end up in the following situation:
+ * Encoder has a global motion model with nonzero translational part,
+ and all candidate MVs are zero. So the ALL_ZERO flag is unset.
+ * Encoder throws away global motion because it is never used.
+ * Decoder sees that there is no global motion and all candidate MVs are
+ zero, so sets the ALL_ZERO flag.
+ * This leads to an encode/decode mismatch.
+ */
+ if (*ref_mv_count >= 2) {
+ for (idx = 0; idx < AOMMIN(3, *ref_mv_count); ++idx) {
+ if (ref_mv_stack[idx].this_mv.as_int != zeromv[0].as_int) all_zero = 0;
+ if (ref_frame > ALTREF_FRAME)
+ if (ref_mv_stack[idx].comp_mv.as_int != zeromv[1].as_int) all_zero = 0;
+ }
+ } else if (ref_frame <= ALTREF_FRAME) {
+ for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx)
+ if (mv_ref_list[idx].as_int != zeromv[0].as_int) all_zero = 0;
+ }
+
+ if (all_zero) mode_context[ref_frame] |= (1 << ALL_ZERO_FLAG_OFFSET);
+#endif
+}
+
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+ int_mv *near_mv) {
+ int i;
+  // Make sure all the candidates are properly clamped, etc.
+ for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
+ lower_mv_precision(&mvlist[i].as_mv, allow_hp);
+ }
+ *nearest_mv = mvlist[0];
+ *near_mv = mvlist[1];
+}
+
+void av1_append_sub8x8_mvs_for_idx(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int block, int ref, int mi_row, int mi_col,
+#if CONFIG_REF_MV
+ CANDIDATE_MV *ref_mv_stack,
+ uint8_t *ref_mv_count,
+#endif
+#if CONFIG_EXT_INTER
+ int_mv *mv_list,
+#endif // CONFIG_EXT_INTER
+ int_mv *nearest_mv, int_mv *near_mv) {
+#if !CONFIG_EXT_INTER
+ int_mv mv_list[MAX_MV_REF_CANDIDATES];
+#endif // !CONFIG_EXT_INTER
+ MODE_INFO *const mi = xd->mi[0];
+ b_mode_info *bmi = mi->bmi;
+ int n;
+ int_mv zeromv;
+#if CONFIG_REF_MV
+ CANDIDATE_MV tmp_mv;
+ uint8_t idx;
+ uint8_t above_count = 0, left_count = 0;
+ MV_REFERENCE_FRAME rf[2] = { mi->mbmi.ref_frame[ref], NONE_FRAME };
+ *ref_mv_count = 0;
+#endif
+
+ assert(MAX_MV_REF_CANDIDATES == 2);
+
+#if CONFIG_GLOBAL_MOTION
+ zeromv.as_int =
+ gm_get_motion_vector(&cm->global_motion[ref], cm->allow_high_precision_mv,
+ mi->mbmi.sb_type, mi_col, mi_row, block)
+ .as_int;
+#else
+ zeromv.as_int = 0;
+#endif
+ find_mv_refs_idx(cm, xd, mi, mi->mbmi.ref_frame[ref], mv_list, block, mi_row,
+ mi_col, NULL, NULL, NULL, zeromv);
+
+#if CONFIG_REF_MV
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, 0, ref_mv_stack,
+ ref_mv_count);
+ above_count = *ref_mv_count;
+
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, 0, -1, ref_mv_stack,
+ ref_mv_count);
+ left_count = *ref_mv_count - above_count;
+
+ if (above_count > 1 && left_count > 0) {
+ tmp_mv = ref_mv_stack[1];
+ ref_mv_stack[1] = ref_mv_stack[above_count];
+ ref_mv_stack[above_count] = tmp_mv;
+ }
+
+ for (idx = 0; idx < *ref_mv_count; ++idx)
+ clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+
+ for (idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *ref_mv_count); ++idx)
+ mv_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
+#endif
+
+ near_mv->as_int = 0;
+ switch (block) {
+ case 0:
+ nearest_mv->as_int = mv_list[0].as_int;
+ near_mv->as_int = mv_list[1].as_int;
+ break;
+ case 1:
+ case 2:
+ nearest_mv->as_int = bmi[0].as_mv[ref].as_int;
+ for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n)
+ if (nearest_mv->as_int != mv_list[n].as_int) {
+ near_mv->as_int = mv_list[n].as_int;
+ break;
+ }
+ break;
+ case 3: {
+ int_mv candidates[2 + MAX_MV_REF_CANDIDATES];
+ candidates[0] = bmi[1].as_mv[ref];
+ candidates[1] = bmi[0].as_mv[ref];
+ candidates[2] = mv_list[0];
+ candidates[3] = mv_list[1];
+
+ nearest_mv->as_int = bmi[2].as_mv[ref].as_int;
+ for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n)
+ if (nearest_mv->as_int != candidates[n].as_int) {
+ near_mv->as_int = candidates[n].as_int;
+ break;
+ }
+ break;
+ }
+ default: assert(0 && "Invalid block index.");
+ }
+}
+
+#if CONFIG_WARPED_MOTION
+void calc_projection_samples(MB_MODE_INFO *const mbmi, int x, int y,
+ int *pts_inref) {
+ pts_inref[0] = (x * 8) + mbmi->mv[0].as_mv.col;
+ pts_inref[1] = (y * 8) + mbmi->mv[0].as_mv.row;
+}
+
+// Note: Samples returned are at 1/8-pel precision
+int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+ int *pts, int *pts_inref) {
+ MB_MODE_INFO *const mbmi0 = &(xd->mi[0]->mbmi);
+ int ref_frame = mbmi0->ref_frame[0];
+ int up_available = xd->up_available;
+ int left_available = xd->left_available;
+ int i, mi_step, np = 0;
+ int global_offset_c = mi_col * MI_SIZE;
+ int global_offset_r = mi_row * MI_SIZE;
+
+ // scan the above row
+ if (up_available) {
+ for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
+ int mi_row_offset = -1;
+ int mi_col_offset = i;
+
+ MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+
+ mi_step = AOMMIN(xd->n8_w, mi_size_wide[mbmi->sb_type]);
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ int bw = block_size_wide[mbmi->sb_type];
+ int bh = block_size_high[mbmi->sb_type];
+ int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
+ int cc_offset = i * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1;
+ int x = cc_offset + global_offset_c;
+ int y = cr_offset + global_offset_r;
+
+ pts[0] = (x * 8);
+ pts[1] = (y * 8);
+ calc_projection_samples(mbmi, x, y, pts_inref);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ }
+ assert(2 * np <= SAMPLES_ARRAY_SIZE);
+
+ // scan the left column
+ if (left_available) {
+ for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
+ int mi_row_offset = i;
+ int mi_col_offset = -1;
+
+ MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+
+ mi_step = AOMMIN(xd->n8_h, mi_size_high[mbmi->sb_type]);
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ int bw = block_size_wide[mbmi->sb_type];
+ int bh = block_size_high[mbmi->sb_type];
+ int cr_offset = i * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1;
+ int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
+ int x = cc_offset + global_offset_c;
+ int y = cr_offset + global_offset_r;
+
+ pts[0] = (x * 8);
+ pts[1] = (y * 8);
+ calc_projection_samples(mbmi, x, y, pts_inref);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ }
+ assert(2 * np <= SAMPLES_ARRAY_SIZE);
+
+ if (left_available && up_available) {
+ int mi_row_offset = -1;
+ int mi_col_offset = -1;
+
+ MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ int bw = block_size_wide[mbmi->sb_type];
+ int bh = block_size_high[mbmi->sb_type];
+ int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
+ int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
+ int x = cc_offset + global_offset_c;
+ int y = cr_offset + global_offset_r;
+
+ pts[0] = (x * 8);
+ pts[1] = (y * 8);
+ calc_projection_samples(mbmi, x, y, pts_inref);
+ np++;
+ }
+ }
+ assert(2 * np <= SAMPLES_ARRAY_SIZE);
+
+ return np;
+}
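+/* Illustrative usage (hypothetical caller; the real call sites live in the
+ * encoder and decoder):
+ *
+ *   int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ *   const int np = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+ *   // np <= LEAST_SQUARES_SAMPLES_MAX pairs of (x, y) coordinates, at
+ *   // 1/8-pel precision, are now in pts (current frame) and pts_inref
+ *   // (reference frame).
+ */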
+#endif // CONFIG_WARPED_MOTION
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
new file mode 100644
index 000000000..01f74b77a
--- /dev/null
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_MVREF_COMMON_H_
+#define AV1_COMMON_MVREF_COMMON_H_
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_REF_MV
+#define MVREF_NEIGHBOURS 9
+#else
+#define MVREF_NEIGHBOURS 8
+#endif
+
+typedef struct position {
+ int row;
+ int col;
+} POSITION;
+
+typedef enum {
+ BOTH_ZERO = 0,
+ ZERO_PLUS_PREDICTED = 1,
+ BOTH_PREDICTED = 2,
+ NEW_PLUS_NON_INTRA = 3,
+ BOTH_NEW = 4,
+ INTRA_PLUS_NON_INTRA = 5,
+ BOTH_INTRA = 6,
+ INVALID_CASE = 9
+} motion_vector_context;
+
+// This is used to figure out a context for the ref blocks. The code flattens
+// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
+// adding 9 for each intra block, 3 for each zero mv and 1 for each new
+// motion vector. This single number is then converted into a context
+// with a single lookup (counter_to_context).
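+// For example, two NEWMV neighbours give a counter of 1 + 1 = 2, which
+// counter_to_context maps to BOTH_NEW; an intra neighbour plus a ZEROMV
+// neighbour give 9 + 3 = 12, which maps to INTRA_PLUS_NON_INTRA.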
+static const int mode_2_counter[MB_MODE_COUNT] = {
+ 9, // DC_PRED
+ 9, // V_PRED
+ 9, // H_PRED
+ 9, // D45_PRED
+ 9, // D135_PRED
+ 9, // D117_PRED
+ 9, // D153_PRED
+ 9, // D207_PRED
+ 9, // D63_PRED
+#if CONFIG_ALT_INTRA
+ 9, // SMOOTH_PRED
+#endif // CONFIG_ALT_INTRA
+ 9, // TM_PRED
+ 0, // NEARESTMV
+ 0, // NEARMV
+ 3, // ZEROMV
+ 1, // NEWMV
+#if CONFIG_EXT_INTER
+#if CONFIG_COMPOUND_SINGLEREF
+ 0, // SR_NEAREST_NEARMV
+ 1, // SR_NEAREST_NEWMV
+ 1, // SR_NEAR_NEWMV
+ 3, // SR_ZERO_NEWMV
+ 1, // SR_NEW_NEWMV
+#endif // CONFIG_COMPOUND_SINGLEREF
+ 0, // NEAREST_NEARESTMV
+ 0, // NEAREST_NEARMV
+ 0, // NEAR_NEARESTMV
+ 0, // NEAR_NEARMV
+ 1, // NEAREST_NEWMV
+ 1, // NEW_NEARESTMV
+ 1, // NEAR_NEWMV
+ 1, // NEW_NEARMV
+ 3, // ZERO_ZEROMV
+ 1, // NEW_NEWMV
+#endif // CONFIG_EXT_INTER
+};
+
+// There are 3^3 different combinations of 3 counts that can each be 0, 1 or
+// 2. However, the actual count can never be greater than 2, so the highest
+// counter we need is 18. 9 is an invalid counter that's never used.
+static const int counter_to_context[19] = {
+ BOTH_PREDICTED, // 0
+ NEW_PLUS_NON_INTRA, // 1
+ BOTH_NEW, // 2
+ ZERO_PLUS_PREDICTED, // 3
+ NEW_PLUS_NON_INTRA, // 4
+ INVALID_CASE, // 5
+ BOTH_ZERO, // 6
+ INVALID_CASE, // 7
+ INVALID_CASE, // 8
+ INTRA_PLUS_NON_INTRA, // 9
+ INTRA_PLUS_NON_INTRA, // 10
+ INVALID_CASE, // 11
+ INTRA_PLUS_NON_INTRA, // 12
+ INVALID_CASE, // 13
+ INVALID_CASE, // 14
+ INVALID_CASE, // 15
+ INVALID_CASE, // 16
+ INVALID_CASE, // 17
+ BOTH_INTRA // 18
+};
+
+#if !CONFIG_REF_MV
+static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
+ // 4X4
+ { { -1, 0 },
+ { 0, -1 },
+ { -1, -1 },
+ { -2, 0 },
+ { 0, -2 },
+ { -2, -1 },
+ { -1, -2 },
+ { -2, -2 } },
+ // 4X8
+ { { -1, 0 },
+ { 0, -1 },
+ { -1, -1 },
+ { -2, 0 },
+ { 0, -2 },
+ { -2, -1 },
+ { -1, -2 },
+ { -2, -2 } },
+ // 8X4
+ { { -1, 0 },
+ { 0, -1 },
+ { -1, -1 },
+ { -2, 0 },
+ { 0, -2 },
+ { -2, -1 },
+ { -1, -2 },
+ { -2, -2 } },
+ // 8X8
+ { { -1, 0 },
+ { 0, -1 },
+ { -1, -1 },
+ { -2, 0 },
+ { 0, -2 },
+ { -2, -1 },
+ { -1, -2 },
+ { -2, -2 } },
+ // 8X16
+ { { 0, -1 },
+ { -1, 0 },
+ { 1, -1 },
+ { -1, -1 },
+ { 0, -2 },
+ { -2, 0 },
+ { -2, -1 },
+ { -1, -2 } },
+ // 16X8
+ { { -1, 0 },
+ { 0, -1 },
+ { -1, 1 },
+ { -1, -1 },
+ { -2, 0 },
+ { 0, -2 },
+ { -1, -2 },
+ { -2, -1 } },
+ // 16X16
+ { { -1, 0 },
+ { 0, -1 },
+ { -1, 1 },
+ { 1, -1 },
+ { -1, -1 },
+ { -3, 0 },
+ { 0, -3 },
+ { -3, -3 } },
+ // 16X32
+ { { 0, -1 },
+ { -1, 0 },
+ { 2, -1 },
+ { -1, -1 },
+ { -1, 1 },
+ { 0, -3 },
+ { -3, 0 },
+ { -3, -3 } },
+ // 32X16
+ { { -1, 0 },
+ { 0, -1 },
+ { -1, 2 },
+ { -1, -1 },
+ { 1, -1 },
+ { -3, 0 },
+ { 0, -3 },
+ { -3, -3 } },
+ // 32X32
+ { { -1, 1 },
+ { 1, -1 },
+ { -1, 2 },
+ { 2, -1 },
+ { -1, -1 },
+ { -3, 0 },
+ { 0, -3 },
+ { -3, -3 } },
+ // 32X64
+ { { 0, -1 },
+ { -1, 0 },
+ { 4, -1 },
+ { -1, 2 },
+ { -1, -1 },
+ { 0, -3 },
+ { -3, 0 },
+ { 2, -1 } },
+ // 64X32
+ { { -1, 0 },
+ { 0, -1 },
+ { -1, 4 },
+ { 2, -1 },
+ { -1, -1 },
+ { -3, 0 },
+ { 0, -3 },
+ { -1, 2 } },
+ // 64X64
+ { { -1, 3 },
+ { 3, -1 },
+ { -1, 4 },
+ { 4, -1 },
+ { -1, -1 },
+ { -1, 0 },
+ { 0, -1 },
+ { -1, 6 } },
+#if CONFIG_EXT_PARTITION
+  // TODO(debargha/jingning): Make these twice the 32x64, .. ones above.
+ // 64x128
+ { { 0, -2 },
+ { -2, 0 },
+ { 8, -2 },
+ { -2, 4 },
+ { -2, -2 },
+ { 0, -6 },
+ { -6, 0 },
+ { 4, -2 } },
+ // 128x64
+ { { -2, 0 },
+ { 0, -2 },
+ { -2, 8 },
+ { 4, -2 },
+ { -2, -2 },
+ { -6, 0 },
+ { 0, -6 },
+ { -2, 4 } },
+ // 128x128
+ { { -2, 6 },
+ { 6, -2 },
+ { -2, 8 },
+ { 8, -2 },
+ { -2, -2 },
+ { -2, 0 },
+ { 0, -2 },
+ { -2, 12 } },
+#endif // CONFIG_EXT_PARTITION
+};
+#endif
+
+static const int idx_n_column_to_subblock[4][2] = {
+ { 1, 2 }, { 1, 3 }, { 3, 2 }, { 3, 3 }
+};
+
+// clamp_mv_ref
+#if CONFIG_EXT_PARTITION
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+#else
+#define MV_BORDER (8 << 3) // Allow 8 pels in 1/8th pel units
+#endif // CONFIG_EXT_PARTITION
+
+static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
+ clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
+ xd->mb_to_right_edge + bw * 8 + MV_BORDER,
+ xd->mb_to_top_edge - bh * 8 - MV_BORDER,
+ xd->mb_to_bottom_edge + bh * 8 + MV_BORDER);
+}
+
+// This function returns either the appropriate sub-block mv or the block's
+// mv, depending on whether block_size < 8x8 and check_sub_blocks is set.
+static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
+ int search_col, int block_idx) {
+#if CONFIG_REF_MV
+ (void)search_col;
+ (void)block_idx;
+ return candidate->mbmi.mv[which_mv];
+#else
+ return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
+ ? candidate
+ ->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
+ .as_mv[which_mv]
+ : candidate->mbmi.mv[which_mv];
+#endif
+}
+
+#if CONFIG_REF_MV
+static INLINE int_mv get_sub_block_pred_mv(const MODE_INFO *candidate,
+ int which_mv, int search_col,
+ int block_idx) {
+ (void)search_col;
+ (void)block_idx;
+ return candidate->mbmi.mv[which_mv];
+}
+#endif
+
+// Performs mv sign inversion if indicated by the reference frame combination.
+static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
+ const MV_REFERENCE_FRAME this_ref_frame,
+ const int *ref_sign_bias) {
+ int_mv mv = mbmi->mv[ref];
+ if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
+ mv.as_mv.row *= -1;
+ mv.as_mv.col *= -1;
+ }
+ return mv;
+}
+
+#define CLIP_IN_ADD(mv, bw, bh, xd) clamp_mv_ref(mv, bw, bh, xd)
+
+// This macro is used to add a motion vector mv_ref list if it isn't
+// already in the list. If it's the second motion vector it will also
+// skip all additional processing and jump to done!
+#define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done) \
+ do { \
+ (mv_ref_list)[(refmv_count)] = (mv); \
+ CLIP_IN_ADD(&(mv_ref_list)[(refmv_count)].as_mv, (bw), (bh), (xd)); \
+ if (refmv_count && (mv_ref_list)[1].as_int != (mv_ref_list)[0].as_int) { \
+ (refmv_count) = 2; \
+ goto Done; \
+ } \
+ (refmv_count) = 1; \
+ } while (0)
+
+// If either reference frame is different, not INTRA, and they
+// are different from each other scale and add the mv to our list.
+#define IF_DIFF_REF_FRAME_ADD_MV(mbmi, ref_frame, ref_sign_bias, refmv_count, \
+ mv_ref_list, bw, bh, xd, Done) \
+ do { \
+ if (is_inter_block(mbmi)) { \
+ if ((mbmi)->ref_frame[0] != ref_frame) \
+ ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
+ refmv_count, mv_ref_list, bw, bh, xd, Done); \
+ if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != ref_frame) \
+ ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
+ refmv_count, mv_ref_list, bw, bh, xd, Done); \
+ } \
+ } while (0)
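+// Both macros jump to the caller-supplied 'Done' label as soon as two
+// distinct candidates have been collected, which is why callers such as
+// find_mv_refs_idx() declare a Done: label after their scan loops.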
+
+// Checks that the given mi_row, mi_col and search point
+// are inside the borders of the tile.
+static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
+ int mi_rows, const AV1_COMMON *cm,
+ const POSITION *mi_pos) {
+#if CONFIG_DEPENDENT_HORZTILES
+ const int dependent_horz_tile_flag = cm->dependent_horz_tiles;
+#else
+ const int dependent_horz_tile_flag = 0;
+ (void)cm;
+#endif
+#if CONFIG_TILE_GROUPS
+ if (dependent_horz_tile_flag && !tile->tg_horz_boundary) {
+#else
+ if (dependent_horz_tile_flag) {
+#endif
+ return !(mi_row + mi_pos->row < 0 ||
+ mi_col + mi_pos->col < tile->mi_col_start ||
+ mi_row + mi_pos->row >= mi_rows ||
+ mi_col + mi_pos->col >= tile->mi_col_end);
+ } else {
+ return !(mi_row + mi_pos->row < tile->mi_row_start ||
+ mi_col + mi_pos->col < tile->mi_col_start ||
+ mi_row + mi_pos->row >= tile->mi_row_end ||
+ mi_col + mi_pos->col >= tile->mi_col_end);
+ }
+}
+
+static INLINE void lower_mv_precision(MV *mv, int allow_hp) {
+ if (!allow_hp) {
+ if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
+ if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
+ }
+}
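+// For example, with allow_hp == 0 a row of 5 (i.e. 5/8 pel) becomes 4 and a
+// row of -5 becomes -4: odd (eighth-pel) components are rounded toward zero.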
+
+#if CONFIG_REF_MV
+static INLINE uint8_t av1_get_pred_diff_ctx(const int_mv pred_mv,
+ const int_mv this_mv) {
+ if (abs(this_mv.as_mv.row - pred_mv.as_mv.row) <= 4 &&
+ abs(this_mv.as_mv.col - pred_mv.as_mv.col) <= 4)
+ return 2;
+ else
+ return 1;
+}
+
+static INLINE int av1_nmv_ctx(const uint8_t ref_mv_count,
+ const CANDIDATE_MV *ref_mv_stack, int ref,
+ int ref_mv_idx) {
+ if (ref_mv_stack[ref_mv_idx].weight >= REF_CAT_LEVEL && ref_mv_count > 0)
+ return ref_mv_stack[ref_mv_idx].pred_diff[ref];
+
+ return 0;
+}
+
+static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
+ if (rf[1] > INTRA_FRAME) {
+ return TOTAL_REFS_PER_FRAME + FWD_RF_OFFSET(rf[0]) +
+ BWD_RF_OFFSET(rf[1]) * FWD_REFS;
+ }
+
+ return rf[0];
+}
+
+// clang-format off
+static MV_REFERENCE_FRAME ref_frame_map[COMP_REFS][2] = {
+#if CONFIG_EXT_REFS
+ { LAST_FRAME, BWDREF_FRAME }, { LAST2_FRAME, BWDREF_FRAME },
+ { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME },
+
+ { LAST_FRAME, ALTREF_FRAME }, { LAST2_FRAME, ALTREF_FRAME },
+ { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }
+#else
+ { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }
+#endif
+};
+// clang-format on
+
+static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf,
+ int8_t ref_frame_type) {
+ if (ref_frame_type >= TOTAL_REFS_PER_FRAME) {
+ rf[0] = ref_frame_map[ref_frame_type - TOTAL_REFS_PER_FRAME][0];
+ rf[1] = ref_frame_map[ref_frame_type - TOTAL_REFS_PER_FRAME][1];
+ } else {
+ rf[0] = ref_frame_type;
+ rf[1] = NONE_FRAME;
+ assert(ref_frame_type > INTRA_FRAME &&
+ ref_frame_type < TOTAL_REFS_PER_FRAME);
+ }
+}
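+// Note that av1_set_ref_frame() inverts av1_ref_frame_type(): packing a
+// single or compound inter reference pair into an int8_t and unpacking it
+// again recovers the original rf[] contents.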
+
+static INLINE int16_t av1_mode_context_analyzer(
+ const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf,
+ BLOCK_SIZE bsize, int block) {
+ int16_t mode_ctx = 0;
+ int8_t ref_frame_type = av1_ref_frame_type(rf);
+
+ if (block >= 0) {
+ mode_ctx = mode_context[rf[0]] & 0x00ff;
+#if !CONFIG_CB4X4
+ if (block > 0 && bsize < BLOCK_8X8 && bsize > BLOCK_4X4)
+ mode_ctx |= (1 << SKIP_NEARESTMV_SUB8X8_OFFSET);
+#else
+ (void)block;
+ (void)bsize;
+#endif
+
+ return mode_ctx;
+ }
+
+ return mode_context[ref_frame_type];
+}
+
+static INLINE uint8_t av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
+ int ref_idx) {
+ if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
+ ref_mv_stack[ref_idx + 1].weight >= REF_CAT_LEVEL)
+ return 0;
+
+ if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
+ ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
+ return 2;
+
+ if (ref_mv_stack[ref_idx].weight < REF_CAT_LEVEL &&
+ ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
+ return 3;
+
+ return 0;
+}
+#endif
+
+typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+#if CONFIG_REF_MV
+ uint8_t *ref_mv_count, CANDIDATE_MV *ref_mv_stack,
+#if CONFIG_EXT_INTER
+ int16_t *compound_mode_context,
+#endif // CONFIG_EXT_INTER
+#endif
+ int_mv *mv_ref_list, int mi_row, int mi_col,
+ find_mv_refs_sync sync, void *const data,
+ int16_t *mode_context);
+
+// Check a list of motion vectors by SAD score, using a number of rows of
+// pixels above and a number of cols of pixels to the left, to select the one
+// with the best score to use as the ref motion vector.
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+ int_mv *near_mv);
+
+void av1_append_sub8x8_mvs_for_idx(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int block, int ref, int mi_row, int mi_col,
+#if CONFIG_REF_MV
+ CANDIDATE_MV *ref_mv_stack,
+ uint8_t *ref_mv_count,
+#endif
+#if CONFIG_EXT_INTER
+ int_mv *mv_list,
+#endif // CONFIG_EXT_INTER
+ int_mv *nearest_mv, int_mv *near_mv);
+
+#if CONFIG_EXT_INTER
+// This function keeps a mode count for a given MB/SB
+void av1_update_mv_context(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+ int_mv *mv_ref_list, int block, int mi_row,
+ int mi_col, int16_t *mode_context);
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_WARPED_MOTION
+int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+ int *pts, int *pts_inref);
+#endif // CONFIG_WARPED_MOTION
+
+#if CONFIG_INTRABC
+static INLINE void av1_find_ref_dv(int_mv *ref_dv, int mi_row, int mi_col) {
+ // TODO(aconverse@google.com): Handle tiles and such
+ (void)mi_col;
+ if (mi_row < MAX_MIB_SIZE) {
+ ref_dv->as_mv.row = 0;
+ ref_dv->as_mv.col = -MI_SIZE * MAX_MIB_SIZE;
+ } else {
+ ref_dv->as_mv.row = -MI_SIZE * MAX_MIB_SIZE;
+ ref_dv->as_mv.col = 0;
+ }
+}
+
+static INLINE int is_dv_valid(const MV dv, const TileInfo *const tile,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int SCALE_PX_TO_MV = 8;
+ // Disallow subpixel for now
+ // SUBPEL_MASK is not the correct scale
+ if ((dv.row & (SCALE_PX_TO_MV - 1) || dv.col & (SCALE_PX_TO_MV - 1)))
+ return 0;
+ // Is the source top-left inside the current tile?
+ const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row;
+ const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_top_edge < tile_top_edge) return 0;
+ const int src_left_edge = mi_col * MI_SIZE * SCALE_PX_TO_MV + dv.col;
+ const int tile_left_edge = tile->mi_col_start * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_left_edge < tile_left_edge) return 0;
+ // Is the bottom right inside the current tile?
+ const int src_bottom_edge = (mi_row * MI_SIZE + bh) * SCALE_PX_TO_MV + dv.row;
+ const int tile_bottom_edge = tile->mi_row_end * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_bottom_edge > tile_bottom_edge) return 0;
+ const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col;
+ const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_right_edge > tile_right_edge) return 0;
+ // Is the bottom right within an already coded SB?
+ const int active_sb_top_edge =
+ (mi_row & ~MAX_MIB_MASK) * MI_SIZE * SCALE_PX_TO_MV;
+ const int active_sb_bottom_edge =
+ ((mi_row & ~MAX_MIB_MASK) + MAX_MIB_SIZE) * MI_SIZE * SCALE_PX_TO_MV;
+ const int active_sb_left_edge =
+ (mi_col & ~MAX_MIB_MASK) * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_bottom_edge > active_sb_bottom_edge) return 0;
+ if (src_bottom_edge > active_sb_top_edge &&
+ src_right_edge > active_sb_left_edge)
+ return 0;
+ return 1;
+}
+#endif // CONFIG_INTRABC
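
The first rejection in is_dv_valid() is the full-pel test: DV components are in 1/8-pel units, so both must be multiples of SCALE_PX_TO_MV. A standalone sketch of just that check:

    #include <assert.h>

    #define SCALE_PX_TO_MV 8 /* 1/8-pel MV units per pixel, as above */

    static int dv_is_fullpel(int row, int col) {
      return !((row & (SCALE_PX_TO_MV - 1)) || (col & (SCALE_PX_TO_MV - 1)));
    }

    int main(void) {
      assert(dv_is_fullpel(-16, 24));  /* -2 px, +3 px: allowed */
      assert(!dv_is_fullpel(-16, 25)); /* 25/8 px is sub-pel: rejected */
      return 0;
    }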
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_MVREF_COMMON_H_
diff --git a/third_party/aom/av1/common/od_dering.c b/third_party/aom/av1/common/od_dering.c
new file mode 100644
index 000000000..f54f337ef
--- /dev/null
+++ b/third_party/aom/av1/common/od_dering.c
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#ifdef HAVE_CONFIG_H
+#include "./config.h"
+#endif
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+#include "./cdef.h"
+
+/* Generated from gen_filter_tables.c. */
+const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {
+ { -1 * OD_FILT_BSTRIDE + 1, -2 * OD_FILT_BSTRIDE + 2,
+ -3 * OD_FILT_BSTRIDE + 3 },
+ { 0 * OD_FILT_BSTRIDE + 1, -1 * OD_FILT_BSTRIDE + 2,
+ -1 * OD_FILT_BSTRIDE + 3 },
+ { 0 * OD_FILT_BSTRIDE + 1, 0 * OD_FILT_BSTRIDE + 2, 0 * OD_FILT_BSTRIDE + 3 },
+ { 0 * OD_FILT_BSTRIDE + 1, 1 * OD_FILT_BSTRIDE + 2, 1 * OD_FILT_BSTRIDE + 3 },
+ { 1 * OD_FILT_BSTRIDE + 1, 2 * OD_FILT_BSTRIDE + 2, 3 * OD_FILT_BSTRIDE + 3 },
+ { 1 * OD_FILT_BSTRIDE + 0, 2 * OD_FILT_BSTRIDE + 1, 3 * OD_FILT_BSTRIDE + 1 },
+ { 1 * OD_FILT_BSTRIDE + 0, 2 * OD_FILT_BSTRIDE + 0, 3 * OD_FILT_BSTRIDE + 0 },
+ { 1 * OD_FILT_BSTRIDE + 0, 2 * OD_FILT_BSTRIDE - 1, 3 * OD_FILT_BSTRIDE - 1 },
+};
+
+/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
+ The search minimizes the weighted variance along all the lines in a
+ particular direction, i.e. the squared error between the input and a
+ "predicted" block where each pixel is replaced by the average along a line
+ in a particular direction. Since each direction has the same sum(x^2) term,
+ that term is never computed. See Section 2, step 2, of:
+ http://jmvalin.ca/notes/intra_paint.pdf */
+int od_dir_find8_c(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift) {
+ int i;
+ int32_t cost[8] = { 0 };
+ int partial[8][15] = { { 0 } };
+ int32_t best_cost = 0;
+ int best_dir = 0;
+ /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
+ The output is then 840 times larger, but that doesn't matter since we
+ only compare the values to find the max. */
+ static const int div_table[] = { 0, 840, 420, 280, 210, 168, 140, 120, 105 };
+ for (i = 0; i < 8; i++) {
+ int j;
+ for (j = 0; j < 8; j++) {
+ int x;
+ /* We subtract 128 here to reduce the maximum range of the squared
+ partial sums. */
+ x = (img[i * stride + j] >> coeff_shift) - 128;
+ partial[0][i + j] += x;
+ partial[1][i + j / 2] += x;
+ partial[2][i] += x;
+ partial[3][3 + i - j / 2] += x;
+ partial[4][7 + i - j] += x;
+ partial[5][3 - i / 2 + j] += x;
+ partial[6][j] += x;
+ partial[7][i / 2 + j] += x;
+ }
+ }
+ for (i = 0; i < 8; i++) {
+ cost[2] += partial[2][i] * partial[2][i];
+ cost[6] += partial[6][i] * partial[6][i];
+ }
+ cost[2] *= div_table[8];
+ cost[6] *= div_table[8];
+ for (i = 0; i < 7; i++) {
+ cost[0] += (partial[0][i] * partial[0][i] +
+ partial[0][14 - i] * partial[0][14 - i]) *
+ div_table[i + 1];
+ cost[4] += (partial[4][i] * partial[4][i] +
+ partial[4][14 - i] * partial[4][14 - i]) *
+ div_table[i + 1];
+ }
+ cost[0] += partial[0][7] * partial[0][7] * div_table[8];
+ cost[4] += partial[4][7] * partial[4][7] * div_table[8];
+ for (i = 1; i < 8; i += 2) {
+ int j;
+ for (j = 0; j < 4 + 1; j++) {
+ cost[i] += partial[i][3 + j] * partial[i][3 + j];
+ }
+ cost[i] *= div_table[8];
+ for (j = 0; j < 4 - 1; j++) {
+ cost[i] += (partial[i][j] * partial[i][j] +
+ partial[i][10 - j] * partial[i][10 - j]) *
+ div_table[2 * j + 2];
+ }
+ }
+ for (i = 0; i < 8; i++) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ best_dir = i;
+ }
+ }
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var >>= 10;
+ return best_dir;
+}
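
The div_table entries are exactly 840/n; since 840 is the least common multiple of 1..8, every division is exact and the scaled costs stay directly comparable across directions. A one-loop sanity check:

    #include <assert.h>

    int main(void) {
      static const int div_table[] = { 0, 840, 420, 280, 210,
                                       168, 140, 120, 105 };
      for (int n = 1; n <= 8; n++) assert(div_table[n] == 840 / n);
      return 0;
    }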
+
+/* Smooth in the direction detected. */
+void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride,
+ const uint16_t *in, int threshold,
+ int dir, int damping) {
+ int i;
+ int j;
+ int k;
+ static const int taps[3] = { 3, 2, 1 };
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int16_t sum;
+ int16_t xx;
+ int16_t yy;
+ xx = in[i * OD_FILT_BSTRIDE + j];
+ sum = 0;
+ for (k = 0; k < 3; k++) {
+ int16_t p0;
+ int16_t p1;
+ p0 = in[i * OD_FILT_BSTRIDE + j + OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
+ xx;
+ p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
+ xx;
+ sum += taps[k] * constrain(p0, threshold, damping);
+ sum += taps[k] * constrain(p1, threshold, damping);
+ }
+ sum = (sum + 8) >> 4;
+ yy = xx + sum;
+ y[i * ystride + j] = yy;
+ }
+ }
+}
+
+/* Smooth in the direction detected. */
+void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
+ const uint16_t *in, int threshold,
+ int dir, int damping) {
+ int i;
+ int j;
+ int k;
+ static const int taps[2] = { 4, 1 };
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ int16_t sum;
+ int16_t xx;
+ int16_t yy;
+ xx = in[i * OD_FILT_BSTRIDE + j];
+ sum = 0;
+ for (k = 0; k < 2; k++) {
+ int16_t p0;
+ int16_t p1;
+ p0 = in[i * OD_FILT_BSTRIDE + j + OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
+ xx;
+ p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
+ xx;
+ sum += taps[k] * constrain(p0, threshold, damping);
+ sum += taps[k] * constrain(p1, threshold, damping);
+ }
+ sum = (sum + 8) >> 4;
+ yy = xx + sum;
+ y[i * ystride + j] = yy;
+ }
+ }
+}
+
+/* Compute deringing filter threshold for an 8x8 block based on the
+ directional variance difference. A high variance difference means that we
+ have a highly directional pattern (e.g. a high contrast edge), so we can
+ apply more deringing. A low variance means that we either have a low
+ contrast edge, or a non-directional texture, so we want to be careful not
+ to blur. */
+static INLINE int od_adjust_thresh(int threshold, int32_t var) {
+ const int i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0;
+ /* We use the variance of 8x8 blocks to adjust the threshold. */
+ return var ? (threshold * (4 + i) + 8) >> 4 : 0;
+}
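
Worked numbers for the adjustment, modeling get_msb() as floor(log2): with threshold 20 and var 4096, var >> 6 is 64, so i = 6 and the result is (20 * 10 + 8) >> 4 = 13. The boost thus scales the threshold from roughly t/4 (i = 0) up to the full t (i = 12), and var == 0 disables filtering entirely:

    #include <assert.h>

    static int msb(unsigned v) { int n = 0; while (v >>= 1) n++; return n; }

    static int adjust_model(int threshold, int var) { /* mirrors od_adjust_thresh */
      const int m = (var >> 6) ? msb(var >> 6) : 0;
      const int i = m < 12 ? m : 12;
      return var ? (threshold * (4 + i) + 8) >> 4 : 0;
    }

    int main(void) {
      assert(adjust_model(20, 4096) == 13); /* strong direction: larger threshold */
      assert(adjust_model(20, 64) == 5);    /* weak direction: ~threshold / 4 */
      assert(adjust_model(20, 0) == 0);     /* no variance: no deringing */
      return 0;
    }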
+
+void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
+ int sstride) {
+ int i, j;
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++) dst[i * dstride + j] = src[i * sstride + j];
+}
+
+void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
+ int sstride) {
+ int i, j;
+ for (i = 0; i < 4; i++)
+ for (j = 0; j < 4; j++) dst[i * dstride + j] = src[i * sstride + j];
+}
+
+void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ dering_list *dlist, int dering_count,
+ int bsize) {
+ int bi, bx, by;
+
+ if (bsize == BLOCK_8X8) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ copy_8x8_16bit_to_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
+ &src[bi << (3 + 3)], 8);
+ }
+ } else if (bsize == BLOCK_4X8) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ copy_4x4_16bit_to_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
+ &src[bi << (3 + 2)], 4);
+ copy_4x4_16bit_to_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)],
+ dstride, &src[(bi << (3 + 2)) + 4 * 4], 4);
+ }
+ } else if (bsize == BLOCK_8X4) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
+ &src[bi << (2 + 3)], 8);
+ copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4],
+ dstride, &src[(bi << (2 + 3)) + 4], 8);
+ }
+ } else {
+ assert(bsize == BLOCK_4X4);
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
+ &src[bi << (2 + 2)], 4);
+ }
+ }
+}
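
In every branch above, src is packed: block bi occupies 1 << (log2w + log2h) consecutive elements with a row pitch of 1 << log2w, while dst is a planar surface addressed by (by, bx) in block units. A hedged index model (names hypothetical):

    /* Element (x, y) of packed block bi, for a block of width 1 << log2w
       and height 1 << log2h. */
    static int packed_src_index(int bi, int log2w, int log2h, int y, int x) {
      return (bi << (log2w + log2h)) + (y << log2w) + x;
    }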
+
+void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src,
+ int sstride) {
+ int i, j;
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++)
+ dst[i * dstride + j] = (uint8_t)src[i * sstride + j];
+}
+
+void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src,
+ int sstride) {
+ int i, j;
+ for (i = 0; i < 4; i++)
+ for (j = 0; j < 4; j++)
+ dst[i * dstride + j] = (uint8_t)src[i * sstride + j];
+}
+
+static void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride,
+ const uint16_t *src, dering_list *dlist,
+ int dering_count, int bsize) {
+ int bi, bx, by;
+ if (bsize == BLOCK_8X8) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ copy_8x8_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
+ &src[bi << (3 + 3)], 8);
+ }
+ } else if (bsize == BLOCK_4X8) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ copy_4x4_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
+ &src[bi << (3 + 2)], 4);
+ copy_4x4_16bit_to_8bit(&dst[((by << 3) + 4) * dstride + (bx << 2)],
+ dstride, &src[(bi << (3 + 2)) + 4 * 4], 4);
+ }
+ } else if (bsize == BLOCK_8X4) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
+ &src[bi << (2 + 3)], 8);
+ copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 3) + 4], dstride,
+ &src[(bi << (2 + 3)) + 4], 8);
+ }
+ } else {
+ assert(bsize == BLOCK_4X4);
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
+ &src[bi << (2 + 2)], 4);
+ }
+ }
+}
+
+int get_filter_skip(int level) {
+ int filter_skip = level & 1;
+ if (level == 1) filter_skip = 0;
+ return filter_skip;
+}
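
level packs the base threshold and the skip override together: threshold = level >> 1, and bit 0 asks od_dering() below to filter even blocks whose skip flag is set, while level == 1 is special-cased to mean "maximum threshold 31, no override". A small decode check:

    #include <assert.h>

    static int get_filter_skip_model(int level) { /* mirrors get_filter_skip */
      int filter_skip = level & 1;
      if (level == 1) filter_skip = 0;
      return filter_skip;
    }

    int main(void) {
      assert((4 >> 1) == 2 && get_filter_skip_model(4) == 0);
      assert((5 >> 1) == 2 && get_filter_skip_model(5) == 1);
      assert(get_filter_skip_model(1) == 0); /* od_dering() uses threshold 31 here */
      return 0;
    }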
+
+void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
+ int ydec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
+ int *dirinit, int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
+ int pli, dering_list *dlist, int dering_count, int level,
+ int clpf_strength, int clpf_damping, int dering_damping,
+ int coeff_shift, int skip_dering, int hbd) {
+ int bi;
+ int bx;
+ int by;
+ int bsize, bsizex, bsizey;
+
+ int threshold = (level >> 1) << coeff_shift;
+ int filter_skip = get_filter_skip(level);
+ if (level == 1) threshold = 31 << coeff_shift;
+
+ od_filter_dering_direction_func filter_dering_direction[] = {
+ od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
+ };
+ clpf_damping += coeff_shift - (pli != AOM_PLANE_Y);
+ dering_damping += coeff_shift - (pli != AOM_PLANE_Y);
+ bsize =
+ ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
+ bsizex = 3 - xdec;
+ bsizey = 3 - ydec;
+
+ if (!skip_dering) {
+ if (pli == 0) {
+ if (!dirinit || !*dirinit) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ dir[by][bx] =
+ od_dir_find8(&in[8 * by * OD_FILT_BSTRIDE + 8 * bx],
+ OD_FILT_BSTRIDE, &var[by][bx], coeff_shift);
+ }
+ if (dirinit) *dirinit = 1;
+ }
+ }
+ // Only run dering for non-zero threshold (which is always the case for
+ // 4:2:2 or 4:4:0). If we don't dering, we still need to eventually write
+ // something out in y[] later.
+ if (threshold != 0) {
+ assert(bsize == BLOCK_8X8 || bsize == BLOCK_4X4);
+ for (bi = 0; bi < dering_count; bi++) {
+ int t = !filter_skip && dlist[bi].skip ? 0 : threshold;
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ (filter_dering_direction[bsize == BLOCK_8X8])(
+ &y[bi << (bsizex + bsizey)], 1 << bsizex,
+ &in[(by * OD_FILT_BSTRIDE << bsizey) + (bx << bsizex)],
+ pli ? t : od_adjust_thresh(t, var[by][bx]), dir[by][bx],
+ dering_damping);
+ }
+ }
+ }
+
+ if (clpf_strength) {
+ if (threshold && !skip_dering)
+ copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, y, dlist, dering_count,
+ bsize);
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ int py = by << bsizey;
+ int px = bx << bsizex;
+
+ if (!filter_skip && dlist[bi].skip) continue;
+ if (!dst || hbd) {
+ // 16 bit destination if high bitdepth or 8 bit destination not given
+ (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd
+ : aom_clpf_hblock_hbd)(
+ dst ? (uint16_t *)dst + py * dstride + px
+ : &y[bi << (bsizex + bsizey)],
+ in + py * OD_FILT_BSTRIDE + px, dst && hbd ? dstride : 1 << bsizex,
+ OD_FILT_BSTRIDE, 1 << bsizex, 1 << bsizey,
+ clpf_strength << coeff_shift, clpf_damping);
+ } else {
+ // Do clpf and write the result to an 8 bit destination
+ (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block
+ : aom_clpf_hblock)(
+ dst + py * dstride + px, in + py * OD_FILT_BSTRIDE + px, dstride,
+ OD_FILT_BSTRIDE, 1 << bsizex, 1 << bsizey,
+ clpf_strength << coeff_shift, clpf_damping);
+ }
+ }
+ } else if (threshold != 0) {
+ // No clpf, so copy instead
+ if (hbd) {
+ copy_dering_16bit_to_16bit((uint16_t *)dst, dstride, y, dlist,
+ dering_count, bsize);
+ } else {
+ copy_dering_16bit_to_8bit(dst, dstride, y, dlist, dering_count, bsize);
+ }
+ } else if (dirinit) {
+ // If we're here, both dering and clpf are off, and we still haven't written
+ // anything to y[] yet, so we just copy the input to y[]. This is necessary
+ // only for av1_cdef_search() and only av1_cdef_search() sets dirinit.
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ int iy, ix;
+ // TODO(stemidts/jmvalin): SIMD optimisations
+ for (iy = 0; iy < 1 << bsizey; iy++)
+ for (ix = 0; ix < 1 << bsizex; ix++)
+ y[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
+ in[((by << bsizey) + iy) * OD_FILT_BSTRIDE + (bx << bsizex) + ix];
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/od_dering.h b/third_party/aom/av1/common/od_dering.h
new file mode 100644
index 000000000..4362001b4
--- /dev/null
+++ b/third_party/aom/av1/common/od_dering.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if !defined(_dering_H)
+#define _dering_H (1)
+
+#include "odintrin.h"
+
+#define OD_DERING_NBLOCKS (MAX_SB_SIZE / 8)
+
+/* We need to buffer three vertical lines. */
+#define OD_FILT_VBORDER (3)
+/* We only need to buffer three horizontal pixels too, but let's align to
+ 16 bytes (8 x 16 bits) to make vectorization easier. */
+#define OD_FILT_HBORDER (8)
+#define OD_FILT_BSTRIDE ALIGN_POWER_OF_TWO(MAX_SB_SIZE + 2 * OD_FILT_HBORDER, 3)
+
+#define OD_DERING_VERY_LARGE (30000)
+#define OD_DERING_INBUF_SIZE \
+ (OD_FILT_BSTRIDE * (MAX_SB_SIZE + 2 * OD_FILT_VBORDER))
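
For concreteness, assuming MAX_SB_SIZE is 64 and ALIGN_POWER_OF_TWO(v, 3) rounds v up to a multiple of 8: OD_FILT_BSTRIDE = ALIGN_POWER_OF_TWO(64 + 16, 3) = 80, and OD_DERING_INBUF_SIZE = 80 * (64 + 6) = 5600 16-bit entries, i.e. one superblock plus a 3-row border above and below and an 8-column border left and right.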
+
+extern const int OD_DIRECTION_OFFSETS_TABLE[8][3];
+
+typedef struct {
+ uint8_t by;
+ uint8_t bx;
+ uint8_t skip;
+} dering_list;
+
+typedef void (*od_filter_dering_direction_func)(uint16_t *y, int ystride,
+ const uint16_t *in,
+ int threshold, int dir,
+ int damping);
+void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ dering_list *dlist, int dering_count,
+ int bsize);
+
+int get_filter_skip(int level);
+
+void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
+ int ydec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
+ int *dirinit, int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
+ int pli, dering_list *dlist, int dering_count, int level,
+ int clpf_strength, int clpf_damping, int dering_damping,
+ int coeff_shift, int skip_dering, int hbd);
+#endif
diff --git a/third_party/aom/av1/common/od_dering_neon.c b/third_party/aom/av1/common/od_dering_neon.c
new file mode 100644
index 000000000..99441050a
--- /dev/null
+++ b/third_party/aom/av1/common/od_dering_neon.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_neon
+#include "./od_dering_simd.h"
diff --git a/third_party/aom/av1/common/od_dering_simd.h b/third_party/aom/av1/common/od_dering_simd.h
new file mode 100644
index 000000000..4074e7e50
--- /dev/null
+++ b/third_party/aom/av1/common/od_dering_simd.h
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "./cdef_simd.h"
+#include "./od_dering.h"
+
+/* partial A is a 16-bit vector of the form:
+ [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
+ [0 y1 y2 y3 y4 y5 y6 y7].
+ This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+ (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
+ and const2. */
+static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
+ v128 const2) {
+ v128 tmp;
+ /* Reverse partial B. */
+ partialb = v128_shuffle_8(
+ partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
+ /* Interleave the x and y values of identical indices and pair x8 with 0. */
+ tmp = partiala;
+ partiala = v128_ziplo_16(partialb, partiala);
+ partialb = v128_ziphi_16(partialb, tmp);
+ /* Square and add the corresponding x and y values. */
+ partiala = v128_madd_s16(partiala, partiala);
+ partialb = v128_madd_s16(partialb, partialb);
+ /* Multiply by constant. */
+ partiala = v128_mullo_s32(partiala, const1);
+ partialb = v128_mullo_s32(partialb, const2);
+ /* Sum all results. */
+ partiala = v128_add_32(partiala, partialb);
+ return partiala;
+}
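
After the hsum4() reduction below, the four 32-bit lanes returned here collapse into the scalar sum the comment describes. A hedged scalar model, with lane order as in the comment:

    #include <stdint.h>

    /* x[k] = x(k+1), y[k] = y(k+1) with y[7] = 0, c[k] = C(k+1). */
    static int64_t fold_mul_and_sum_model(const int16_t *x, const int16_t *y,
                                          const int32_t *c) {
      int64_t sum = 0;
      for (int k = 0; k < 8; k++)
        sum += (int64_t)((int32_t)x[k] * x[k] + (int32_t)y[k] * y[k]) * c[k];
      return sum;
    }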
+
+static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
+ v128 t0, t1, t2, t3;
+ t0 = v128_ziplo_32(x1, x0);
+ t1 = v128_ziplo_32(x3, x2);
+ t2 = v128_ziphi_32(x1, x0);
+ t3 = v128_ziphi_32(x3, x2);
+ x0 = v128_ziplo_64(t1, t0);
+ x1 = v128_ziphi_64(t1, t0);
+ x2 = v128_ziplo_64(t3, t2);
+ x3 = v128_ziphi_64(t3, t2);
+ return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
+}
+
+/* Computes cost for directions 4, 5, 6 and 7. We can call this function again
+ to compute the remaining directions. */
+static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
+ v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
+ v128 partial6;
+ v128 tmp;
+ /* Partial sums for lines 0 and 1. */
+ partial4a = v128_shl_n_byte(lines[0], 14);
+ partial4b = v128_shr_n_byte(lines[0], 2);
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
+ tmp = v128_add_16(lines[0], lines[1]);
+ partial5a = v128_shl_n_byte(tmp, 10);
+ partial5b = v128_shr_n_byte(tmp, 6);
+ partial7a = v128_shl_n_byte(tmp, 4);
+ partial7b = v128_shr_n_byte(tmp, 12);
+ partial6 = tmp;
+
+ /* Partial sums for lines 2 and 3. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
+ tmp = v128_add_16(lines[2], lines[3]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Partial sums for lines 4 and 5. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
+ tmp = v128_add_16(lines[4], lines[5]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Partial sums for lines 6 and 7. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
+ partial4a = v128_add_16(partial4a, lines[7]);
+ tmp = v128_add_16(lines[6], lines[7]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Compute costs in terms of partial sums. */
+ partial4a =
+ fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
+ v128_from_32(105, 120, 140, 168));
+ partial7a =
+ fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
+ v128_from_32(105, 105, 105, 140));
+ partial5a =
+ fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
+ v128_from_32(105, 105, 105, 140));
+ partial6 = v128_madd_s16(partial6, partial6);
+ partial6 = v128_mullo_s32(partial6, v128_dup_32(105));
+
+ partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
+ v128_store_unaligned(tmp_cost1, partial4a);
+ return partial4a;
+}
+
+/* Transpose and reverse the order of the lines -- equivalent to a 90-degree
+ counter-clockwise rotation of the pixels. */
+static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
+ const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
+ const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
+ const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
+ const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
+ const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
+ const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
+ const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
+ const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);
+
+ const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
+ const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
+ const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
+ const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
+ const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
+ const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
+ const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
+ const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);
+
+ res[7] = v128_ziplo_64(tr1_1, tr1_0);
+ res[6] = v128_ziphi_64(tr1_1, tr1_0);
+ res[5] = v128_ziplo_64(tr1_3, tr1_2);
+ res[4] = v128_ziphi_64(tr1_3, tr1_2);
+ res[3] = v128_ziplo_64(tr1_5, tr1_4);
+ res[2] = v128_ziphi_64(tr1_5, tr1_4);
+ res[1] = v128_ziplo_64(tr1_7, tr1_6);
+ res[0] = v128_ziphi_64(tr1_7, tr1_6);
+}
+
+int SIMD_FUNC(od_dir_find8)(const od_dering_in *img, int stride, int32_t *var,
+ int coeff_shift) {
+ int i;
+ int32_t cost[8];
+ int32_t best_cost = 0;
+ int best_dir = 0;
+ v128 lines[8];
+ for (i = 0; i < 8; i++) {
+ lines[i] = v128_load_unaligned(&img[i * stride]);
+ lines[i] =
+ v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
+ }
+
+#if defined(__SSE4_1__)
+ /* Compute "mostly vertical" directions. */
+ __m128i dir47 = compute_directions(lines, cost + 4);
+
+ array_reverse_transpose_8x8(lines, lines);
+
+ /* Compute "mostly horizontal" directions. */
+ __m128i dir03 = compute_directions(lines, cost);
+
+ __m128i max = _mm_max_epi32(dir03, dir47);
+ max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
+ max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
+ best_cost = _mm_cvtsi128_si32(max);
+ __m128i t =
+ _mm_packs_epi32(_mm_cmpeq_epi32(max, dir03), _mm_cmpeq_epi32(max, dir47));
+ best_dir = _mm_movemask_epi8(_mm_packs_epi16(t, t));
+ best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros
+#else
+ /* Compute "mostly vertical" directions. */
+ compute_directions(lines, cost + 4);
+
+ array_reverse_transpose_8x8(lines, lines);
+
+ /* Compute "mostly horizontal" directions. */
+ compute_directions(lines, cost);
+
+ for (i = 0; i < 8; i++) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ best_dir = i;
+ }
+ }
+#endif
+
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var >>= 10;
+ return best_dir;
+}
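
The SSE4.1 reduction at the end leans on a classic bit trick: best_dir ^ (best_dir - 1) keeps the lowest set bit of the movemask plus the zeros below it, so get_msb() of that value is the trailing-zero count, i.e. the index of the first matching direction. A worked check:

    #include <assert.h>

    static int msb_pos(unsigned v) { int n = 0; while (v >>= 1) n++; return n; }

    int main(void) {
      unsigned mask = 0x28;                    /* bits 3 and 5 set */
      assert((mask ^ (mask - 1)) == 0x0F);     /* lowest bit + zeros below it */
      assert(msb_pos(mask ^ (mask - 1)) == 3); /* index of lowest set bit */
      return 0;
    }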
+
+void SIMD_FUNC(od_filter_dering_direction_4x4)(uint16_t *y, int ystride,
+ const uint16_t *in,
+ int threshold, int dir,
+ int damping) {
+ int i;
+ v128 p0, p1, sum, row, res;
+ int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
+ int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
+
+ if (threshold) damping -= get_msb(threshold);
+ for (i = 0; i < 4; i += 2) {
+ sum = v128_zero();
+ row = v128_from_v64(v64_load_aligned(&in[i * OD_FILT_BSTRIDE]),
+ v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE]));
+
+ // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
+ p0 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]),
+ v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o1]));
+ p0 = constrain16(p0, row, threshold, damping);
+
+ // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
+ p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]),
+ v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o1]));
+ p1 = constrain16(p1, row, threshold, damping);
+
+ // sum += 4 * (p0 + p1)
+ sum = v128_add_16(sum, v128_shl_n_16(v128_add_16(p0, p1), 2));
+
+ // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
+ p0 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]),
+ v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o2]));
+ p0 = constrain16(p0, row, threshold, damping);
+
+ // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
+ p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]),
+ v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o2]));
+ p1 = constrain16(p1, row, threshold, damping);
+
+ // sum += 1 * (p0 + p1)
+ sum = v128_add_16(sum, v128_add_16(p0, p1));
+
+ // res = row + ((sum + 8) >> 4)
+ res = v128_add_16(sum, v128_dup_16(8));
+ res = v128_shr_n_s16(res, 4);
+ res = v128_add_16(row, res);
+ v64_store_aligned(&y[i * ystride], v128_high_v64(res));
+ v64_store_aligned(&y[(i + 1) * ystride], v128_low_v64(res));
+ }
+}
+
+void SIMD_FUNC(od_filter_dering_direction_8x8)(uint16_t *y, int ystride,
+ const uint16_t *in,
+ int threshold, int dir,
+ int damping) {
+ int i;
+ v128 sum, p0, p1, row, res;
+ int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
+ int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
+ int o3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
+
+ if (threshold) damping -= get_msb(threshold);
+ for (i = 0; i < 8; i++) {
+ sum = v128_zero();
+ row = v128_load_aligned(&in[i * OD_FILT_BSTRIDE]);
+
+ // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
+ p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]);
+ p0 = constrain16(p0, row, threshold, damping);
+
+ // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
+ p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]);
+ p1 = constrain16(p1, row, threshold, damping);
+
+ // sum += 3 * (p0 + p1)
+ p0 = v128_add_16(p0, p1);
+ p0 = v128_add_16(p0, v128_shl_n_16(p0, 1));
+ sum = v128_add_16(sum, p0);
+
+ // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
+ p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]);
+ p0 = constrain16(p0, row, threshold, damping);
+
+ // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
+ p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]);
+ p1 = constrain16(p1, row, threshold, damping);
+
+ // sum += 2 * (p0 + p1)
+ p0 = v128_shl_n_16(v128_add_16(p0, p1), 1);
+ sum = v128_add_16(sum, p0);
+
+ // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
+ p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o3]);
+ p0 = constrain16(p0, row, threshold, damping);
+
+ // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
+ p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o3]);
+ p1 = constrain16(p1, row, threshold, damping);
+
+ // sum += (p0 + p1)
+ p0 = v128_add_16(p0, p1);
+ sum = v128_add_16(sum, p0);
+
+ // res = row + ((sum + 8) >> 4)
+ res = v128_add_16(sum, v128_dup_16(8));
+ res = v128_shr_n_s16(res, 4);
+ res = v128_add_16(row, res);
+ v128_store_unaligned(&y[i * ystride], res);
+ }
+}
+
+void SIMD_FUNC(copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride,
+ const uint16_t *src, int sstride) {
+ int i;
+ for (i = 0; i < 8; i++) {
+ v128 row = v128_load_unaligned(&src[i * sstride]);
+ row = v128_pack_s16_u8(row, row);
+ v64_store_unaligned(&dst[i * dstride], v128_low_v64(row));
+ }
+}
+
+void SIMD_FUNC(copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride,
+ const uint16_t *src, int sstride) {
+ int i;
+ for (i = 0; i < 4; i++) {
+ v128 row = v128_load_unaligned(&src[i * sstride]);
+ row = v128_pack_s16_u8(row, row);
+ u32_store_unaligned(&dst[i * dstride], v128_low_u32(row));
+ }
+}
+
+void SIMD_FUNC(copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride,
+ const uint16_t *src, int sstride) {
+ int i;
+ for (i = 0; i < 8; i++) {
+ v128 row = v128_load_unaligned(&src[i * sstride]);
+ v128_store_unaligned(&dst[i * dstride], row);
+ }
+}
+
+void SIMD_FUNC(copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride,
+ const uint16_t *src, int sstride) {
+ int i;
+ for (i = 0; i < 4; i++) {
+ v64 row = v64_load_unaligned(&src[i * sstride]);
+ v64_store_unaligned(&dst[i * dstride], row);
+ }
+}
+
+void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride, int v,
+ int h) {
+ int i, j;
+ for (i = 0; i < v; i++) {
+ for (j = 0; j < (h & ~0x7); j += 8) {
+ v64 row = v64_load_unaligned(&src[i * sstride + j]);
+ v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+ }
+ for (; j < h; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
+
+void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
+ const uint16_t *src, int sstride,
+ int v, int h) {
+ int i, j;
+ for (i = 0; i < v; i++) {
+ for (j = 0; j < (h & ~0x7); j += 8) {
+ v128 row = v128_load_unaligned(&src[i * sstride + j]);
+ v128_store_unaligned(&dst[i * dstride + j], row);
+ }
+ for (; j < h; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/od_dering_sse2.c b/third_party/aom/av1/common/od_dering_sse2.c
new file mode 100644
index 000000000..8a2a62f6c
--- /dev/null
+++ b/third_party/aom/av1/common/od_dering_sse2.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse2
+#include "./od_dering_simd.h"
diff --git a/third_party/aom/av1/common/od_dering_sse4.c b/third_party/aom/av1/common/od_dering_sse4.c
new file mode 100644
index 000000000..0769db9fd
--- /dev/null
+++ b/third_party/aom/av1/common/od_dering_sse4.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse4_1
+#include "./od_dering_simd.h"
diff --git a/third_party/aom/av1/common/od_dering_ssse3.c b/third_party/aom/av1/common/od_dering_ssse3.c
new file mode 100644
index 000000000..99df62b6b
--- /dev/null
+++ b/third_party/aom/av1/common/od_dering_ssse3.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_ssse3
+#include "./od_dering_simd.h"
diff --git a/third_party/aom/av1/common/odintrin.c b/third_party/aom/av1/common/odintrin.c
new file mode 100644
index 000000000..868efacc9
--- /dev/null
+++ b/third_party/aom/av1/common/odintrin.c
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#include "av1/common/odintrin.h"
+
+#if defined(OD_ENABLE_ASSERTIONS)
+# include <stdio.h>
+
+void od_fatal_impl(const char *_str, const char *_file, int _line) {
+ fprintf(stderr, "Fatal (internal) error in %s, line %d: %s\n",
+ _file, _line, _str);
+ abort();
+}
+#endif
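
The table that follows holds multiply-add reciprocals in the style of the Robison citation below: dividing by a small constant becomes a widening multiply plus a shift, with the per-divisor shift encoded by the OD_DIVU_SMALL() machinery. A standalone sketch for d = 3, whose magic value 0xAAAAAAAB is (2^33 + 1)/3 (this shows the d = 3 case only):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const uint32_t m = 0xAAAAAAABu; /* (2^33 + 1) / 3 */
      for (uint32_t x = 0; x < 100000u; x++)
        assert((uint32_t)(((uint64_t)x * m) >> 33) == x / 3);
      return 0;
    }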
+
+/*Constants for use with OD_DIVU_SMALL().
+ See \cite{Rob05} for details on computing these constants.
+ @INPROCEEDINGS{Rob05,
+ author="Arch D. Robison",
+ title="{N}-bit Unsigned Division via {N}-bit Multiply-Add",
+ booktitle="Proc. of the 17th IEEE Symposium on Computer Arithmetic
+ (ARITH'05)",
+ pages="131--139",
+ address="Cape Cod, MA",
+ month=Jun,
+ year=2005
+ }*/
+uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2] = {
+ { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xAAAAAAAB, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xCCCCCCCD, 0 }, { 0xAAAAAAAB, 0 },
+ { 0x92492492, 0x92492492 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xE38E38E4, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xBA2E8BA3, 0 }, { 0xAAAAAAAB, 0 },
+ { 0x9D89D89E, 0 }, { 0x92492492, 0x92492492 },
+ { 0x88888889, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xF0F0F0F1, 0 }, { 0xE38E38E4, 0 },
+ { 0xD79435E5, 0xD79435E5 }, { 0xCCCCCCCD, 0 },
+ { 0xC30C30C3, 0xC30C30C3 }, { 0xBA2E8BA3, 0 },
+ { 0xB21642C9, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA3D70A3E, 0 }, { 0x9D89D89E, 0 },
+ { 0x97B425ED, 0x97B425ED }, { 0x92492492, 0x92492492 },
+ { 0x8D3DCB09, 0 }, { 0x88888889, 0 },
+ { 0x84210842, 0x84210842 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xF83E0F84, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE38E38E4, 0 },
+ { 0xDD67C8A6, 0xDD67C8A6 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD20D20D2, 0xD20D20D2 }, { 0xCCCCCCCD, 0 },
+ { 0xC7CE0C7D, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xBE82FA0C, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB60B60B6, 0xB60B60B6 }, { 0xB21642C9, 0 },
+ { 0xAE4C415D, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA72F053A, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA0A0A0A1, 0 }, { 0x9D89D89E, 0 },
+ { 0x9A90E7D9, 0x9A90E7D9 }, { 0x97B425ED, 0x97B425ED },
+ { 0x94F2094F, 0x94F2094F }, { 0x92492492, 0x92492492 },
+ { 0x8FB823EE, 0x8FB823EE }, { 0x8D3DCB09, 0 },
+ { 0x8AD8F2FC, 0 }, { 0x88888889, 0 },
+ { 0x864B8A7E, 0 }, { 0x84210842, 0x84210842 },
+ { 0x82082082, 0x82082082 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFC0FC0FD, 0 }, { 0xF83E0F84, 0 },
+ { 0xF4898D60, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xED7303B6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE6C2B449, 0 }, { 0xE38E38E4, 0 },
+ { 0xE070381C, 0xE070381C }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDA740DA8, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD4C77B04, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xCF6474A9, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCA4587E7, 0 }, { 0xC7CE0C7D, 0 },
+ { 0xC565C87C, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC0C0C0C1, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBC52640C, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB81702E1, 0 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB40B40B4, 0xB40B40B4 }, { 0xB21642C9, 0 },
+ { 0xB02C0B03, 0 }, { 0xAE4C415D, 0 },
+ { 0xAC769184, 0xAC769184 }, { 0xAAAAAAAB, 0 },
+ { 0xA8E83F57, 0xA8E83F57 }, { 0xA72F053A, 0 },
+ { 0xA57EB503, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA237C32B, 0xA237C32B }, { 0xA0A0A0A1, 0 },
+ { 0x9F1165E7, 0x9F1165E7 }, { 0x9D89D89E, 0 },
+ { 0x9C09C09C, 0x9C09C09C }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x991F1A51, 0x991F1A51 }, { 0x97B425ED, 0x97B425ED },
+ { 0x964FDA6C, 0x964FDA6C }, { 0x94F2094F, 0x94F2094F },
+ { 0x939A85C4, 0x939A85C4 }, { 0x92492492, 0x92492492 },
+ { 0x90FDBC09, 0x90FDBC09 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8E78356D, 0x8E78356D }, { 0x8D3DCB09, 0 },
+ { 0x8C08C08C, 0x8C08C08C }, { 0x8AD8F2FC, 0 },
+ { 0x89AE408A, 0 }, { 0x88888889, 0 },
+ { 0x8767AB5F, 0x8767AB5F }, { 0x864B8A7E, 0 },
+ { 0x85340853, 0x85340853 }, { 0x84210842, 0x84210842 },
+ { 0x83126E98, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81020408, 0x81020408 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFE03F810, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFA232CF3, 0 }, { 0xF83E0F84, 0 },
+ { 0xF6603D99, 0 }, { 0xF4898D60, 0 },
+ { 0xF2B9D649, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xEF2EB720, 0 }, { 0xED7303B6, 0 },
+ { 0xEBBDB2A6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE865AC7C, 0 }, { 0xE6C2B449, 0 },
+ { 0xE525982B, 0 }, { 0xE38E38E4, 0 },
+ { 0xE1FC780F, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xDEE95C4D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDBEB61EF, 0 }, { 0xDA740DA8, 0 },
+ { 0xD901B204, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD62B80D7, 0 }, { 0xD4C77B04, 0 },
+ { 0xD3680D37, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD0B69FCC, 0 }, { 0xCF6474A9, 0 },
+ { 0xCE168A77, 0xCE168A77 }, { 0xCCCCCCCD, 0 },
+ { 0xCB8727C1, 0 }, { 0xCA4587E7, 0 },
+ { 0xC907DA4F, 0 }, { 0xC7CE0C7D, 0 },
+ { 0xC6980C6A, 0 }, { 0xC565C87C, 0 },
+ { 0xC4372F86, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC1E4BBD6, 0 }, { 0xC0C0C0C1, 0 },
+ { 0xBFA02FE8, 0xBFA02FE8 }, { 0xBE82FA0C, 0 },
+ { 0xBD691047, 0xBD691047 }, { 0xBC52640C, 0 },
+ { 0xBB3EE722, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB92143FA, 0xB92143FA }, { 0xB81702E1, 0 },
+ { 0xB70FBB5A, 0xB70FBB5A }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB509E68B, 0 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB30F6353, 0 }, { 0xB21642C9, 0 },
+ { 0xB11FD3B8, 0xB11FD3B8 }, { 0xB02C0B03, 0 },
+ { 0xAF3ADDC7, 0 }, { 0xAE4C415D, 0 },
+ { 0xAD602B58, 0xAD602B58 }, { 0xAC769184, 0xAC769184 },
+ { 0xAB8F69E3, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA9C84A48, 0 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA80A80A8, 0xA80A80A8 }, { 0xA72F053A, 0 },
+ { 0xA655C439, 0xA655C439 }, { 0xA57EB503, 0 },
+ { 0xA4A9CF1E, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA3065E40, 0 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA16B312F, 0 }, { 0xA0A0A0A1, 0 },
+ { 0x9FD809FE, 0 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9E4CAD24, 0 }, { 0x9D89D89E, 0 },
+ { 0x9CC8E161, 0 }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9B4C6F9F, 0 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x99D722DB, 0 }, { 0x991F1A51, 0x991F1A51 },
+ { 0x9868C80A, 0 }, { 0x97B425ED, 0x97B425ED },
+ { 0x97012E02, 0x97012E02 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x95A02568, 0x95A02568 }, { 0x94F2094F, 0x94F2094F },
+ { 0x94458094, 0x94458094 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x92F11384, 0x92F11384 }, { 0x92492492, 0x92492492 },
+ { 0x91A2B3C5, 0 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x905A3863, 0x905A3863 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F1779DA, 0 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8DDA5202, 0x8DDA5202 }, { 0x8D3DCB09, 0 },
+ { 0x8CA29C04, 0x8CA29C04 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8B70344A, 0x8B70344A }, { 0x8AD8F2FC, 0 },
+ { 0x8A42F870, 0x8A42F870 }, { 0x89AE408A, 0 },
+ { 0x891AC73B, 0 }, { 0x88888889, 0 },
+ { 0x87F78088, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x86D90545, 0 }, { 0x864B8A7E, 0 },
+ { 0x85BF3761, 0x85BF3761 }, { 0x85340853, 0x85340853 },
+ { 0x84A9F9C8, 0x84A9F9C8 }, { 0x84210842, 0x84210842 },
+ { 0x83993052, 0x83993052 }, { 0x83126E98, 0 },
+ { 0x828CBFBF, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81848DA9, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80808081, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFF00FF01, 0 }, { 0xFE03F810, 0 },
+ { 0xFD08E551, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFB188566, 0 }, { 0xFA232CF3, 0 },
+ { 0xF92FB222, 0 }, { 0xF83E0F84, 0 },
+ { 0xF74E3FC3, 0 }, { 0xF6603D99, 0 },
+ { 0xF57403D6, 0 }, { 0xF4898D60, 0 },
+ { 0xF3A0D52D, 0 }, { 0xF2B9D649, 0 },
+ { 0xF1D48BCF, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xF00F00F0, 0xF00F00F0 }, { 0xEF2EB720, 0 },
+ { 0xEE500EE5, 0xEE500EE5 }, { 0xED7303B6, 0 },
+ { 0xEC979119, 0 }, { 0xEBBDB2A6, 0 },
+ { 0xEAE56404, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE9396520, 0 }, { 0xE865AC7C, 0 },
+ { 0xE79372E3, 0 }, { 0xE6C2B449, 0 },
+ { 0xE5F36CB0, 0xE5F36CB0 }, { 0xE525982B, 0 },
+ { 0xE45932D8, 0 }, { 0xE38E38E4, 0 },
+ { 0xE2C4A689, 0 }, { 0xE1FC780F, 0 },
+ { 0xE135A9CA, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xDFAC1F75, 0 }, { 0xDEE95C4D, 0 },
+ { 0xDE27EB2D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDCA8F159, 0 }, { 0xDBEB61EF, 0 },
+ { 0xDB2F171E, 0 }, { 0xDA740DA8, 0 },
+ { 0xD9BA4257, 0 }, { 0xD901B204, 0 },
+ { 0xD84A598F, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD6DF43FD, 0 }, { 0xD62B80D7, 0 },
+ { 0xD578E97D, 0 }, { 0xD4C77B04, 0 },
+ { 0xD417328A, 0 }, { 0xD3680D37, 0 },
+ { 0xD2BA083C, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD161543E, 0xD161543E }, { 0xD0B69FCC, 0 },
+ { 0xD00D00D0, 0xD00D00D0 }, { 0xCF6474A9, 0 },
+ { 0xCEBCF8BC, 0 }, { 0xCE168A77, 0xCE168A77 },
+ { 0xCD712753, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCC29786D, 0 }, { 0xCB8727C1, 0 },
+ { 0xCAE5D85F, 0xCAE5D85F }, { 0xCA4587E7, 0 },
+ { 0xC9A633FD, 0 }, { 0xC907DA4F, 0 },
+ { 0xC86A7890, 0xC86A7890 }, { 0xC7CE0C7D, 0 },
+ { 0xC73293D8, 0 }, { 0xC6980C6A, 0 },
+ { 0xC5FE7403, 0xC5FE7403 }, { 0xC565C87C, 0 },
+ { 0xC4CE07B0, 0xC4CE07B0 }, { 0xC4372F86, 0 },
+ { 0xC3A13DE6, 0xC3A13DE6 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC2780614, 0 }, { 0xC1E4BBD6, 0 },
+ { 0xC152500C, 0xC152500C }, { 0xC0C0C0C1, 0 },
+ { 0xC0300C03, 0xC0300C03 }, { 0xBFA02FE8, 0xBFA02FE8 },
+ { 0xBF112A8B, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBDF59C92, 0 }, { 0xBD691047, 0xBD691047 },
+ { 0xBCDD535E, 0 }, { 0xBC52640C, 0 },
+ { 0xBBC8408D, 0 }, { 0xBB3EE722, 0 },
+ { 0xBAB65610, 0xBAB65610 }, { 0xBA2E8BA3, 0 },
+ { 0xB9A7862A, 0xB9A7862A }, { 0xB92143FA, 0xB92143FA },
+ { 0xB89BC36D, 0 }, { 0xB81702E1, 0 },
+ { 0xB79300B8, 0 }, { 0xB70FBB5A, 0xB70FBB5A },
+ { 0xB68D3134, 0xB68D3134 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB58A4855, 0xB58A4855 }, { 0xB509E68B, 0 },
+ { 0xB48A39D4, 0xB48A39D4 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB38CF9B0, 0xB38CF9B0 }, { 0xB30F6353, 0 },
+ { 0xB2927C2A, 0 }, { 0xB21642C9, 0 },
+ { 0xB19AB5C5, 0 }, { 0xB11FD3B8, 0xB11FD3B8 },
+ { 0xB0A59B42, 0 }, { 0xB02C0B03, 0 },
+ { 0xAFB321A1, 0xAFB321A1 }, { 0xAF3ADDC7, 0 },
+ { 0xAEC33E20, 0 }, { 0xAE4C415D, 0 },
+ { 0xADD5E632, 0xADD5E632 }, { 0xAD602B58, 0xAD602B58 },
+ { 0xACEB0F89, 0xACEB0F89 }, { 0xAC769184, 0xAC769184 },
+ { 0xAC02B00B, 0 }, { 0xAB8F69E3, 0 },
+ { 0xAB1CBDD4, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xAA392F36, 0 }, { 0xA9C84A48, 0 },
+ { 0xA957FAB5, 0xA957FAB5 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA8791709, 0 }, { 0xA80A80A8, 0xA80A80A8 },
+ { 0xA79C7B17, 0 }, { 0xA72F053A, 0 },
+ { 0xA6C21DF7, 0 }, { 0xA655C439, 0xA655C439 },
+ { 0xA5E9F6ED, 0xA5E9F6ED }, { 0xA57EB503, 0 },
+ { 0xA513FD6C, 0 }, { 0xA4A9CF1E, 0 },
+ { 0xA4402910, 0xA4402910 }, { 0xA3D70A3E, 0 },
+ { 0xA36E71A3, 0 }, { 0xA3065E40, 0 },
+ { 0xA29ECF16, 0xA29ECF16 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA1D13986, 0 }, { 0xA16B312F, 0 },
+ { 0xA105A933, 0 }, { 0xA0A0A0A1, 0 },
+ { 0xA03C1689, 0 }, { 0x9FD809FE, 0 },
+ { 0x9F747A15, 0x9F747A15 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9EAECC8D, 0x9EAECC8D }, { 0x9E4CAD24, 0 },
+ { 0x9DEB06C9, 0x9DEB06C9 }, { 0x9D89D89E, 0 },
+ { 0x9D2921C4, 0 }, { 0x9CC8E161, 0 },
+ { 0x9C69169B, 0x9C69169B }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9BAADE8E, 0x9BAADE8E }, { 0x9B4C6F9F, 0 },
+ { 0x9AEE72FD, 0 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x9A33CD67, 0x9A33CD67 }, { 0x99D722DB, 0 },
+ { 0x997AE76B, 0x997AE76B }, { 0x991F1A51, 0x991F1A51 },
+ { 0x98C3BAC7, 0x98C3BAC7 }, { 0x9868C80A, 0 },
+ { 0x980E4156, 0x980E4156 }, { 0x97B425ED, 0x97B425ED },
+ { 0x975A7510, 0 }, { 0x97012E02, 0x97012E02 },
+ { 0x96A8500A, 0 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x95F7CC73, 0 }, { 0x95A02568, 0x95A02568 },
+ { 0x9548E498, 0 }, { 0x94F2094F, 0x94F2094F },
+ { 0x949B92DE, 0 }, { 0x94458094, 0x94458094 },
+ { 0x93EFD1C5, 0x93EFD1C5 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x93459BE7, 0 }, { 0x92F11384, 0x92F11384 },
+ { 0x929CEBF5, 0 }, { 0x92492492, 0x92492492 },
+ { 0x91F5BCB9, 0 }, { 0x91A2B3C5, 0 },
+ { 0x91500915, 0x91500915 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x90ABCC02, 0x90ABCC02 }, { 0x905A3863, 0x905A3863 },
+ { 0x90090090, 0x90090090 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F67A1E4, 0 }, { 0x8F1779DA, 0 },
+ { 0x8EC7AB3A, 0 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8E2917E1, 0 }, { 0x8DDA5202, 0x8DDA5202 },
+ { 0x8D8BE340, 0 }, { 0x8D3DCB09, 0 },
+ { 0x8CF008CF, 0x8CF008CF }, { 0x8CA29C04, 0x8CA29C04 },
+ { 0x8C55841D, 0 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8BBC50C9, 0 }, { 0x8B70344A, 0x8B70344A },
+ { 0x8B246A88, 0 }, { 0x8AD8F2FC, 0 },
+ { 0x8A8DCD20, 0 }, { 0x8A42F870, 0x8A42F870 },
+ { 0x89F8746A, 0 }, { 0x89AE408A, 0 },
+ { 0x89645C4F, 0x89645C4F }, { 0x891AC73B, 0 },
+ { 0x88D180CD, 0x88D180CD }, { 0x88888889, 0 },
+ { 0x883FDDF0, 0x883FDDF0 }, { 0x87F78088, 0 },
+ { 0x87AF6FD6, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x872032AC, 0x872032AC }, { 0x86D90545, 0 },
+ { 0x869222B2, 0 }, { 0x864B8A7E, 0 },
+ { 0x86053C34, 0x86053C34 }, { 0x85BF3761, 0x85BF3761 },
+ { 0x85797B91, 0x85797B91 }, { 0x85340853, 0x85340853 },
+ { 0x84EEDD36, 0 }, { 0x84A9F9C8, 0x84A9F9C8 },
+ { 0x84655D9C, 0 }, { 0x84210842, 0x84210842 },
+ { 0x83DCF94E, 0 }, { 0x83993052, 0x83993052 },
+ { 0x8355ACE4, 0 }, { 0x83126E98, 0 },
+ { 0x82CF7504, 0 }, { 0x828CBFBF, 0 },
+ { 0x824A4E61, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81C635BC, 0x81C635BC }, { 0x81848DA9, 0 },
+ { 0x814327E4, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80C121B3, 0 }, { 0x80808081, 0 },
+ { 0x80402010, 0x80402010 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFF803FE1, 0 }, { 0xFF00FF01, 0 },
+ { 0xFE823CA6, 0 }, { 0xFE03F810, 0 },
+ { 0xFD863087, 0 }, { 0xFD08E551, 0 },
+ { 0xFC8C15B5, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFB93E673, 0 }, { 0xFB188566, 0 },
+ { 0xFA9D9D20, 0 }, { 0xFA232CF3, 0 },
+ { 0xF9A9342D, 0 }, { 0xF92FB222, 0 },
+ { 0xF8B6A622, 0xF8B6A622 }, { 0xF83E0F84, 0 },
+ { 0xF7C5ED9D, 0 }, { 0xF74E3FC3, 0 },
+ { 0xF6D7054E, 0 }, { 0xF6603D99, 0 },
+ { 0xF5E9E7FD, 0 }, { 0xF57403D6, 0 },
+ { 0xF4FE9083, 0 }, { 0xF4898D60, 0 },
+ { 0xF414F9CE, 0 }, { 0xF3A0D52D, 0 },
+ { 0xF32D1EE0, 0 }, { 0xF2B9D649, 0 },
+ { 0xF246FACC, 0 }, { 0xF1D48BCF, 0 },
+ { 0xF16288B9, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xF07FC3E0, 0xF07FC3E0 }, { 0xF00F00F0, 0xF00F00F0 },
+ { 0xEF9EA78C, 0 }, { 0xEF2EB720, 0 },
+ { 0xEEBF2F19, 0 }, { 0xEE500EE5, 0xEE500EE5 },
+ { 0xEDE155F4, 0 }, { 0xED7303B6, 0 },
+ { 0xED05179C, 0xED05179C }, { 0xEC979119, 0 },
+ { 0xEC2A6FA0, 0xEC2A6FA0 }, { 0xEBBDB2A6, 0 },
+ { 0xEB5159A0, 0 }, { 0xEAE56404, 0 },
+ { 0xEA79D14A, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE9A3D25E, 0xE9A3D25E }, { 0xE9396520, 0 },
+ { 0xE8CF58AB, 0 }, { 0xE865AC7C, 0 },
+ { 0xE7FC600F, 0 }, { 0xE79372E3, 0 },
+ { 0xE72AE476, 0 }, { 0xE6C2B449, 0 },
+ { 0xE65AE1DC, 0 }, { 0xE5F36CB0, 0xE5F36CB0 },
+ { 0xE58C544A, 0 }, { 0xE525982B, 0 },
+ { 0xE4BF37D9, 0 }, { 0xE45932D8, 0 },
+ { 0xE3F388AF, 0 }, { 0xE38E38E4, 0 },
+ { 0xE32942FF, 0 }, { 0xE2C4A689, 0 },
+ { 0xE260630B, 0 }, { 0xE1FC780F, 0 },
+ { 0xE198E520, 0 }, { 0xE135A9CA, 0 },
+ { 0xE0D2C59A, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xE00E00E0, 0xE00E00E0 }, { 0xDFAC1F75, 0 },
+ { 0xDF4A9369, 0 }, { 0xDEE95C4D, 0 },
+ { 0xDE8879B3, 0 }, { 0xDE27EB2D, 0 },
+ { 0xDDC7B04D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDD0833CE, 0 }, { 0xDCA8F159, 0 },
+ { 0xDC4A00DD, 0 }, { 0xDBEB61EF, 0 },
+ { 0xDB8D1428, 0 }, { 0xDB2F171E, 0 },
+ { 0xDAD16A6B, 0 }, { 0xDA740DA8, 0 },
+ { 0xDA17006D, 0xDA17006D }, { 0xD9BA4257, 0 },
+ { 0xD95DD300, 0 }, { 0xD901B204, 0 },
+ { 0xD8A5DEFF, 0 }, { 0xD84A598F, 0 },
+ { 0xD7EF2152, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD73996E9, 0 }, { 0xD6DF43FD, 0 },
+ { 0xD6853CC1, 0 }, { 0xD62B80D7, 0 },
+ { 0xD5D20FDF, 0 }, { 0xD578E97D, 0 },
+ { 0xD5200D52, 0xD5200D52 }, { 0xD4C77B04, 0 },
+ { 0xD46F3235, 0 }, { 0xD417328A, 0 },
+ { 0xD3BF7BA9, 0 }, { 0xD3680D37, 0 },
+ { 0xD310E6DB, 0 }, { 0xD2BA083C, 0 },
+ { 0xD2637101, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD1B71759, 0 }, { 0xD161543E, 0xD161543E },
+ { 0xD10BD72C, 0 }, { 0xD0B69FCC, 0 },
+ { 0xD061ADCA, 0 }, { 0xD00D00D0, 0xD00D00D0 },
+ { 0xCFB8988C, 0 }, { 0xCF6474A9, 0 },
+ { 0xCF1094D4, 0 }, { 0xCEBCF8BC, 0 },
+ { 0xCE69A00D, 0 }, { 0xCE168A77, 0xCE168A77 },
+ { 0xCDC3B7A9, 0xCDC3B7A9 }, { 0xCD712753, 0 },
+ { 0xCD1ED924, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCC7B0200, 0 }, { 0xCC29786D, 0 },
+ { 0xCBD82FC7, 0 }, { 0xCB8727C1, 0 },
+ { 0xCB36600D, 0 }, { 0xCAE5D85F, 0xCAE5D85F },
+ { 0xCA95906C, 0 }, { 0xCA4587E7, 0 },
+ { 0xC9F5BE86, 0 }, { 0xC9A633FD, 0 },
+ { 0xC956E803, 0xC956E803 }, { 0xC907DA4F, 0 },
+ { 0xC8B90A96, 0 }, { 0xC86A7890, 0xC86A7890 },
+ { 0xC81C23F5, 0xC81C23F5 }, { 0xC7CE0C7D, 0 },
+ { 0xC78031E0, 0xC78031E0 }, { 0xC73293D8, 0 },
+ { 0xC6E5321D, 0 }, { 0xC6980C6A, 0 },
+ { 0xC64B2278, 0xC64B2278 }, { 0xC5FE7403, 0xC5FE7403 },
+ { 0xC5B200C6, 0 }, { 0xC565C87C, 0 },
+ { 0xC519CAE0, 0xC519CAE0 }, { 0xC4CE07B0, 0xC4CE07B0 },
+ { 0xC4827EA8, 0xC4827EA8 }, { 0xC4372F86, 0 },
+ { 0xC3EC1A06, 0 }, { 0xC3A13DE6, 0xC3A13DE6 },
+ { 0xC3569AE6, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC2C1FF3E, 0 }, { 0xC2780614, 0 },
+ { 0xC22E4507, 0 }, { 0xC1E4BBD6, 0 },
+ { 0xC19B6A42, 0 }, { 0xC152500C, 0xC152500C },
+ { 0xC1096CF6, 0 }, { 0xC0C0C0C1, 0 },
+ { 0xC0784B2F, 0 }, { 0xC0300C03, 0xC0300C03 },
+ { 0xBFE80300, 0 }, { 0xBFA02FE8, 0xBFA02FE8 },
+ { 0xBF589280, 0 }, { 0xBF112A8B, 0 },
+ { 0xBEC9F7CE, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBE3C310C, 0 }, { 0xBDF59C92, 0 },
+ { 0xBDAF3C64, 0 }, { 0xBD691047, 0xBD691047 },
+ { 0xBD231803, 0 }, { 0xBCDD535E, 0 },
+ { 0xBC97C21E, 0xBC97C21E }, { 0xBC52640C, 0 },
+ { 0xBC0D38EE, 0xBC0D38EE }, { 0xBBC8408D, 0 },
+ { 0xBB837AB1, 0 }, { 0xBB3EE722, 0 },
+ { 0xBAFA85A9, 0xBAFA85A9 }, { 0xBAB65610, 0xBAB65610 },
+ { 0xBA725820, 0xBA725820 }, { 0xBA2E8BA3, 0 },
+ { 0xB9EAF063, 0 }, { 0xB9A7862A, 0xB9A7862A },
+ { 0xB9644CC4, 0 }, { 0xB92143FA, 0xB92143FA },
+ { 0xB8DE6B9A, 0 }, { 0xB89BC36D, 0 },
+ { 0xB8594B41, 0 }, { 0xB81702E1, 0 },
+ { 0xB7D4EA19, 0xB7D4EA19 }, { 0xB79300B8, 0 },
+ { 0xB7514689, 0 }, { 0xB70FBB5A, 0xB70FBB5A },
+ { 0xB6CE5EF9, 0xB6CE5EF9 }, { 0xB68D3134, 0xB68D3134 },
+ { 0xB64C31D9, 0 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB5CABD9B, 0 }, { 0xB58A4855, 0xB58A4855 },
+ { 0xB54A00B5, 0xB54A00B5 }, { 0xB509E68B, 0 },
+ { 0xB4C9F9A5, 0 }, { 0xB48A39D4, 0xB48A39D4 },
+ { 0xB44AA6E9, 0xB44AA6E9 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB3CC0706, 0 }, { 0xB38CF9B0, 0xB38CF9B0 },
+ { 0xB34E1884, 0 }, { 0xB30F6353, 0 },
+ { 0xB2D0D9EF, 0 }, { 0xB2927C2A, 0 },
+ { 0xB25449D7, 0 }, { 0xB21642C9, 0 },
+ { 0xB1D866D1, 0xB1D866D1 }, { 0xB19AB5C5, 0 },
+ { 0xB15D2F76, 0 }, { 0xB11FD3B8, 0xB11FD3B8 },
+ { 0xB0E2A260, 0xB0E2A260 }, { 0xB0A59B42, 0 },
+ { 0xB068BE31, 0 }, { 0xB02C0B03, 0 },
+ { 0xAFEF818C, 0 }, { 0xAFB321A1, 0xAFB321A1 },
+ { 0xAF76EB19, 0 }, { 0xAF3ADDC7, 0 },
+ { 0xAEFEF982, 0 }, { 0xAEC33E20, 0 },
+ { 0xAE87AB76, 0xAE87AB76 }, { 0xAE4C415D, 0 },
+ { 0xAE10FFA9, 0 }, { 0xADD5E632, 0xADD5E632 },
+ { 0xAD9AF4D0, 0 }, { 0xAD602B58, 0xAD602B58 },
+ { 0xAD2589A4, 0 }, { 0xACEB0F89, 0xACEB0F89 },
+ { 0xACB0BCE1, 0xACB0BCE1 }, { 0xAC769184, 0xAC769184 },
+ { 0xAC3C8D4A, 0 }, { 0xAC02B00B, 0 },
+ { 0xABC8F9A0, 0xABC8F9A0 }, { 0xAB8F69E3, 0 },
+ { 0xAB5600AC, 0 }, { 0xAB1CBDD4, 0 },
+ { 0xAAE3A136, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xAA71DA0D, 0 }, { 0xAA392F36, 0 },
+ { 0xAA00AA01, 0 }, { 0xA9C84A48, 0 },
+ { 0xA9900FE6, 0 }, { 0xA957FAB5, 0xA957FAB5 },
+ { 0xA9200A92, 0xA9200A92 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA8B098E0, 0xA8B098E0 }, { 0xA8791709, 0 },
+ { 0xA841B9AD, 0 }, { 0xA80A80A8, 0xA80A80A8 },
+ { 0xA7D36BD8, 0 }, { 0xA79C7B17, 0 },
+ { 0xA765AE44, 0 }, { 0xA72F053A, 0 },
+ { 0xA6F87FD6, 0xA6F87FD6 }, { 0xA6C21DF7, 0 },
+ { 0xA68BDF79, 0 }, { 0xA655C439, 0xA655C439 },
+ { 0xA61FCC16, 0xA61FCC16 }, { 0xA5E9F6ED, 0xA5E9F6ED },
+ { 0xA5B4449D, 0 }, { 0xA57EB503, 0 },
+ { 0xA54947FE, 0 }, { 0xA513FD6C, 0 },
+ { 0xA4DED52C, 0xA4DED52C }, { 0xA4A9CF1E, 0 },
+ { 0xA474EB1F, 0xA474EB1F }, { 0xA4402910, 0xA4402910 },
+ { 0xA40B88D0, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA3A2AD39, 0xA3A2AD39 }, { 0xA36E71A3, 0 },
+ { 0xA33A575A, 0xA33A575A }, { 0xA3065E40, 0 },
+ { 0xA2D28634, 0 }, { 0xA29ECF16, 0xA29ECF16 },
+ { 0xA26B38C9, 0 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA2046E1F, 0xA2046E1F }, { 0xA1D13986, 0 },
+ { 0xA19E2540, 0 }, { 0xA16B312F, 0 },
+ { 0xA1385D35, 0 }, { 0xA105A933, 0 },
+ { 0xA0D3150C, 0 }, { 0xA0A0A0A1, 0 },
+ { 0xA06E4BD4, 0xA06E4BD4 }, { 0xA03C1689, 0 },
+ { 0xA00A00A0, 0xA00A00A0 }, { 0x9FD809FE, 0 },
+ { 0x9FA63284, 0 }, { 0x9F747A15, 0x9F747A15 },
+ { 0x9F42E095, 0x9F42E095 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9EE009EE, 0x9EE009EE }, { 0x9EAECC8D, 0x9EAECC8D },
+ { 0x9E7DADA9, 0 }, { 0x9E4CAD24, 0 },
+ { 0x9E1BCAE3, 0 }, { 0x9DEB06C9, 0x9DEB06C9 },
+ { 0x9DBA60BB, 0x9DBA60BB }, { 0x9D89D89E, 0 },
+ { 0x9D596E54, 0x9D596E54 }, { 0x9D2921C4, 0 },
+ { 0x9CF8F2D1, 0x9CF8F2D1 }, { 0x9CC8E161, 0 },
+ { 0x9C98ED58, 0 }, { 0x9C69169B, 0x9C69169B },
+ { 0x9C395D10, 0x9C395D10 }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9BDA4124, 0x9BDA4124 }, { 0x9BAADE8E, 0x9BAADE8E },
+ { 0x9B7B98C0, 0 }, { 0x9B4C6F9F, 0 },
+ { 0x9B1D6311, 0x9B1D6311 }, { 0x9AEE72FD, 0 },
+ { 0x9ABF9F48, 0x9ABF9F48 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x9A624C97, 0 }, { 0x9A33CD67, 0x9A33CD67 },
+ { 0x9A056A31, 0 }, { 0x99D722DB, 0 },
+ { 0x99A8F74C, 0 }, { 0x997AE76B, 0x997AE76B },
+ { 0x994CF320, 0x994CF320 }, { 0x991F1A51, 0x991F1A51 },
+ { 0x98F15CE7, 0 }, { 0x98C3BAC7, 0x98C3BAC7 },
+ { 0x989633DB, 0x989633DB }, { 0x9868C80A, 0 },
+ { 0x983B773B, 0 }, { 0x980E4156, 0x980E4156 },
+ { 0x97E12644, 0x97E12644 }, { 0x97B425ED, 0x97B425ED },
+ { 0x97874039, 0 }, { 0x975A7510, 0 },
+ { 0x972DC45B, 0 }, { 0x97012E02, 0x97012E02 },
+ { 0x96D4B1EF, 0 }, { 0x96A8500A, 0 },
+ { 0x967C083B, 0 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x9623C686, 0x9623C686 }, { 0x95F7CC73, 0 },
+ { 0x95CBEC1B, 0 }, { 0x95A02568, 0x95A02568 },
+ { 0x95747844, 0 }, { 0x9548E498, 0 },
+ { 0x951D6A4E, 0 }, { 0x94F2094F, 0x94F2094F },
+ { 0x94C6C187, 0 }, { 0x949B92DE, 0 },
+ { 0x94707D3F, 0 }, { 0x94458094, 0x94458094 },
+ { 0x941A9CC8, 0x941A9CC8 }, { 0x93EFD1C5, 0x93EFD1C5 },
+ { 0x93C51F76, 0 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x9370049C, 0 }, { 0x93459BE7, 0 },
+ { 0x931B4B91, 0 }, { 0x92F11384, 0x92F11384 },
+ { 0x92C6F3AC, 0x92C6F3AC }, { 0x929CEBF5, 0 },
+ { 0x9272FC48, 0x9272FC48 }, { 0x92492492, 0x92492492 },
+ { 0x921F64BF, 0 }, { 0x91F5BCB9, 0 },
+ { 0x91CC2C6C, 0x91CC2C6C }, { 0x91A2B3C5, 0 },
+ { 0x917952AF, 0 }, { 0x91500915, 0x91500915 },
+ { 0x9126D6E5, 0 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x90D4B86F, 0 }, { 0x90ABCC02, 0x90ABCC02 },
+ { 0x9082F6B0, 0 }, { 0x905A3863, 0x905A3863 },
+ { 0x9031910A, 0 }, { 0x90090090, 0x90090090 },
+ { 0x8FE086E3, 0 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F8FD7A0, 0 }, { 0x8F67A1E4, 0 },
+ { 0x8F3F82A8, 0x8F3F82A8 }, { 0x8F1779DA, 0 },
+ { 0x8EEF8766, 0 }, { 0x8EC7AB3A, 0 },
+ { 0x8E9FE542, 0x8E9FE542 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8E509BA8, 0x8E509BA8 }, { 0x8E2917E1, 0 },
+ { 0x8E01AA05, 0 }, { 0x8DDA5202, 0x8DDA5202 },
+ { 0x8DB30FC6, 0x8DB30FC6 }, { 0x8D8BE340, 0 },
+ { 0x8D64CC5C, 0 }, { 0x8D3DCB09, 0 },
+ { 0x8D16DF35, 0x8D16DF35 }, { 0x8CF008CF, 0x8CF008CF },
+ { 0x8CC947C5, 0 }, { 0x8CA29C04, 0x8CA29C04 },
+ { 0x8C7C057D, 0 }, { 0x8C55841D, 0 },
+ { 0x8C2F17D2, 0x8C2F17D2 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8BE27E39, 0x8BE27E39 }, { 0x8BBC50C9, 0 },
+ { 0x8B963829, 0x8B963829 }, { 0x8B70344A, 0x8B70344A },
+ { 0x8B4A451A, 0 }, { 0x8B246A88, 0 },
+ { 0x8AFEA483, 0x8AFEA483 }, { 0x8AD8F2FC, 0 },
+ { 0x8AB355E0, 0x8AB355E0 }, { 0x8A8DCD20, 0 },
+ { 0x8A6858AB, 0 }, { 0x8A42F870, 0x8A42F870 },
+ { 0x8A1DAC60, 0x8A1DAC60 }, { 0x89F8746A, 0 },
+ { 0x89D3507D, 0 }, { 0x89AE408A, 0 },
+ { 0x89894480, 0 }, { 0x89645C4F, 0x89645C4F },
+ { 0x893F87E8, 0x893F87E8 }, { 0x891AC73B, 0 },
+ { 0x88F61A37, 0x88F61A37 }, { 0x88D180CD, 0x88D180CD },
+ { 0x88ACFAEE, 0 }, { 0x88888889, 0 },
+ { 0x8864298F, 0 }, { 0x883FDDF0, 0x883FDDF0 },
+ { 0x881BA59E, 0 }, { 0x87F78088, 0 },
+ { 0x87D36EA0, 0 }, { 0x87AF6FD6, 0 },
+ { 0x878B841B, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x8743E595, 0 }, { 0x872032AC, 0x872032AC },
+ { 0x86FC9296, 0x86FC9296 }, { 0x86D90545, 0 },
+ { 0x86B58AA8, 0 }, { 0x869222B2, 0 },
+ { 0x866ECD53, 0x866ECD53 }, { 0x864B8A7E, 0 },
+ { 0x86285A23, 0x86285A23 }, { 0x86053C34, 0x86053C34 },
+ { 0x85E230A3, 0x85E230A3 }, { 0x85BF3761, 0x85BF3761 },
+ { 0x859C5060, 0x859C5060 }, { 0x85797B91, 0x85797B91 },
+ { 0x8556B8E7, 0x8556B8E7 }, { 0x85340853, 0x85340853 },
+ { 0x851169C7, 0x851169C7 }, { 0x84EEDD36, 0 },
+ { 0x84CC6290, 0 }, { 0x84A9F9C8, 0x84A9F9C8 },
+ { 0x8487A2D1, 0 }, { 0x84655D9C, 0 },
+ { 0x84432A1B, 0x84432A1B }, { 0x84210842, 0x84210842 },
+ { 0x83FEF802, 0x83FEF802 }, { 0x83DCF94E, 0 },
+ { 0x83BB0C18, 0 }, { 0x83993052, 0x83993052 },
+ { 0x837765F0, 0x837765F0 }, { 0x8355ACE4, 0 },
+ { 0x83340520, 0x83340520 }, { 0x83126E98, 0 },
+ { 0x82F0E93D, 0x82F0E93D }, { 0x82CF7504, 0 },
+ { 0x82AE11DE, 0 }, { 0x828CBFBF, 0 },
+ { 0x826B7E99, 0x826B7E99 }, { 0x824A4E61, 0 },
+ { 0x82292F08, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81E722C2, 0x81E722C2 }, { 0x81C635BC, 0x81C635BC },
+ { 0x81A55963, 0 }, { 0x81848DA9, 0 },
+ { 0x8163D283, 0 }, { 0x814327E4, 0 },
+ { 0x81228DBF, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80E18AB3, 0 }, { 0x80C121B3, 0 },
+ { 0x80A0C8FB, 0x80A0C8FB }, { 0x80808081, 0 },
+ { 0x80604836, 0x80604836 }, { 0x80402010, 0x80402010 },
+ { 0x80200802, 0x80200802 }, { 0xFFFFFFFF, 0xFFFFFFFF }
+};
diff --git a/third_party/aom/av1/common/odintrin.h b/third_party/aom/av1/common/odintrin.h
new file mode 100644
index 000000000..fe99d8003
--- /dev/null
+++ b/third_party/aom/av1/common/odintrin.h
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifndef AV1_COMMON_ODINTRIN_H_
+#define AV1_COMMON_ODINTRIN_H_
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+# if !defined(M_PI)
+# define M_PI (3.1415926535897932384626433832795)
+# endif
+
+# if !defined(M_SQRT2)
+# define M_SQRT2 (1.41421356237309504880168872420970)
+# endif
+
+# if !defined(M_SQRT1_2)
+# define M_SQRT1_2 (0.70710678118654752440084436210485)
+# endif
+
+# if !defined(M_LOG2E)
+# define M_LOG2E (1.4426950408889634073599246810019)
+# endif
+
+# if !defined(M_LN2)
+# define M_LN2 (0.69314718055994530941723212145818)
+# endif
+
+/*Smallest blocks are 4x4*/
+#define OD_LOG_BSIZE0 (2)
+/*There are 5 block sizes total (4x4, 8x8, 16x16, 32x32 and 64x64).*/
+#define OD_NBSIZES (5)
+
+/*There are 4 transform sizes total in AV1 (4x4, 8x8, 16x16 and 32x32).*/
+#define OD_TXSIZES TX_SIZES
+/*The log of the maximum length of the side of a transform.*/
+#define OD_LOG_TXSIZE_MAX (OD_LOG_BSIZE0 + OD_TXSIZES - 1)
+/*The maximum length of the side of a transform.*/
+#define OD_TXSIZE_MAX (1 << OD_LOG_TXSIZE_MAX)
+
+/**The maximum number of color planes allowed in a single frame.*/
+# define OD_NPLANES_MAX (3)
+
+# define OD_COEFF_SHIFT (4)
+
+# define OD_DISABLE_CFL (1)
+# define OD_DISABLE_FILTER (1)
+
+#if !defined(NDEBUG)
+# define OD_ENABLE_ASSERTIONS (1)
+#endif
+
+# define OD_LOG(a)
+# define OD_LOG_PARTIAL(a)
+
+/*Possible block sizes; note that OD_BLOCK_NXN = log2(N) - 2.*/
+#define OD_BLOCK_4X4 (0)
+#define OD_BLOCK_8X8 (1)
+#define OD_BLOCK_16X16 (2)
+#define OD_BLOCK_32X32 (3)
+#define OD_BLOCK_SIZES (OD_BLOCK_32X32 + 1)
+
+# define OD_LIMIT_BSIZE_MIN (OD_BLOCK_4X4)
+# define OD_LIMIT_BSIZE_MAX (OD_BLOCK_32X32)
+
+typedef int od_coeff;
+
+#define OD_DIVU_DMAX (1024)
+
+extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
+
+/*Translate unsigned division by small divisors into multiplications.*/
+#define OD_DIVU_SMALL(_x, _d) \
+ ((uint32_t)((OD_DIVU_SMALL_CONSTS[(_d)-1][0] * (uint64_t)(_x) + \
+ OD_DIVU_SMALL_CONSTS[(_d)-1][1]) >> \
+ 32) >> \
+ (OD_ILOG_NZ(_d) - 1))
+
+#define OD_DIVU(_x, _d) \
+ (((_d) < OD_DIVU_DMAX) ? (OD_DIVU_SMALL((_x), (_d))) : ((_x) / (_d)))
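+
+/*Example (a sketch, not part of the original source): for d == 3 the table
+  entry is { 0xAAAAAAAB, 0 }, i.e. ceil(2^33/3), and OD_ILOG_NZ(3) - 1 == 1,
+  so OD_DIVU_SMALL(100, 3) == ((0xAAAAAAABULL*100 + 0) >> 32) >> 1
+  == 66 >> 1 == 33, which matches 100/3.*/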
+
+#define OD_MINI AOMMIN
+#define OD_MAXI AOMMAX
+#define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max)))
+
+#define OD_CLZ0 (1)
+#define OD_CLZ(x) (-get_msb(x))
+#define OD_ILOG_NZ(x) (OD_CLZ0 - OD_CLZ(x))
+/*Note that __builtin_clz is not defined when x == 0, according to the gcc
+ documentation (and that of the x86 BSR instruction that implements it), so
+ we have to special-case it.
+ We define a special version of the macro to use when x can be zero.*/
+#define OD_ILOG(x) ((x) ? OD_ILOG_NZ(x) : 0)
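+
+/*For illustration (not in the original header): OD_ILOG_NZ(x) expands to
+  1 + get_msb(x), so OD_ILOG_NZ(1) == 1 and OD_ILOG_NZ(2) == OD_ILOG_NZ(3)
+  == 2, while OD_ILOG(0) == 0 thanks to the special case above.*/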
+
+#define OD_LOG2(x) (M_LOG2E*log(x))
+#define OD_EXP2(x) (exp(M_LN2*(x)))
+
+/*Enable special features for gcc and compatible compilers.*/
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define OD_GNUC_PREREQ(maj, min, pat) \
+ ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ >= \
+ ((maj) << 16) + ((min) << 8) + pat) // NOLINT
+#else
+#define OD_GNUC_PREREQ(maj, min, pat) (0)
+#endif
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+#else
+#define OD_WARN_UNUSED_RESULT
+#endif
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_ARG_NONNULL(x) __attribute__((__nonnull__(x)))
+#else
+#define OD_ARG_NONNULL(x)
+#endif
+
+#if defined(OD_ENABLE_ASSERTIONS)
+#if OD_GNUC_PREREQ(2, 5, 0)
+__attribute__((noreturn))
+#endif
+void od_fatal_impl(const char *_str, const char *_file, int _line);
+
+#define OD_FATAL(_str) (od_fatal_impl(_str, __FILE__, __LINE__))
+
+#define OD_ASSERT(_cond) \
+ do { \
+ if (!(_cond)) { \
+ OD_FATAL("assertion failed: " #_cond); \
+ } \
+ } while (0)
+
+#define OD_ASSERT2(_cond, _message) \
+ do { \
+ if (!(_cond)) { \
+ OD_FATAL("assertion failed: " #_cond "\n" _message); \
+ } \
+ } while (0)
+
+#define OD_ALWAYS_TRUE(_cond) OD_ASSERT(_cond)
+
+#else
+#define OD_ASSERT(_cond)
+#define OD_ASSERT2(_cond, _message)
+#define OD_ALWAYS_TRUE(_cond) ((void)(_cond))
+#endif
+
+/** Copy n elements of memory from src to dst. The 0* term provides
+ compile-time type checking */
+#if !defined(OVERRIDE_OD_COPY)
+#define OD_COPY(dst, src, n) \
+ (memcpy((dst), (src), sizeof(*(dst)) * (n) + 0 * ((dst) - (src))))
+#endif
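+
+/*How the check works (an explanatory note, not from upstream): when dst and
+  src point to different types, (dst) - (src) is an invalid pointer
+  subtraction and compilation fails, instead of memcpy silently using the
+  wrong element size.*/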
+
+/** Copy n elements of memory from src to dst, allowing overlapping regions.
+ The 0* term provides compile-time type checking */
+#if !defined(OVERRIDE_OD_MOVE)
+# define OD_MOVE(dst, src, n) \
+ (memmove((dst), (src), sizeof(*(dst))*(n) + 0*((dst) - (src)) ))
+#endif
+
+/** Without this, linkage will break when using a C++ compiler, and a C
+ * compiler will issue warnings. */
+#if defined(__cplusplus)
+# define OD_EXTERN extern
+#else
+# define OD_EXTERN
+#endif
+
+/** Set n elements of dst to zero */
+#if !defined(OVERRIDE_OD_CLEAR)
+# define OD_CLEAR(dst, n) (memset((dst), 0, sizeof(*(dst))*(n)))
+#endif
+
+/** Silence unused parameter/variable warnings */
+# define OD_UNUSED(expr) (void)(expr)
+
+#if defined(OD_FLOAT_PVQ)
+typedef double od_val16;
+typedef double od_val32;
+# define OD_QCONST32(x, bits) (x)
+# define OD_ROUND16(x) (x)
+# define OD_ROUND32(x) (x)
+# define OD_SHL(x, shift) (x)
+# define OD_SHR(x, shift) (x)
+# define OD_SHR_ROUND(x, shift) (x)
+# define OD_ABS(x) (fabs(x))
+# define OD_MULT16_16(a, b) ((a)*(b))
+# define OD_MULT16_32_Q16(a, b) ((a)*(b))
+#else
+typedef int16_t od_val16;
+typedef int32_t od_val32;
+/** Compile-time conversion of float constant to 32-bit value */
+# define OD_QCONST32(x, bits) ((od_val32)(.5 + (x)*(((od_val32)1) << (bits))))
+# define OD_ROUND16(x) (int16_t)(floor(.5 + (x)))
+# define OD_ROUND32(x) (int32_t)(floor(.5 + (x)))
+/*Shift x left by shift*/
+# define OD_SHL(a, shift) ((int32_t)((uint32_t)(a) << (shift)))
+/*Shift x right by shift (without rounding)*/
+# define OD_SHR(x, shift) \
+ ((int32_t)((x) >> (shift)))
+/*Shift x right by shift (with rounding)*/
+# define OD_SHR_ROUND(x, shift) \
+ ((int32_t)(((x) + (1 << (shift) >> 1)) >> (shift)))
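+
+/*E.g. (illustrative): OD_SHR_ROUND(5, 1) == (5 + 1) >> 1 == 3 while
+  OD_SHR(5, 1) == 2; the (1 << shift >> 1) term rounds to nearest,
+  with ties rounding up for non-negative x.*/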
+/*Shift x right by shift (without rounding) or left by -shift if shift
+ is negative.*/
+# define OD_VSHR(x, shift) \
+ (((shift) > 0) ? OD_SHR(x, shift) : OD_SHL(x, -(shift)))
+/*Shift x right by shift (with rounding) or left by -shift if shift
+ is negative.*/
+# define OD_VSHR_ROUND(x, shift) \
+ (((shift) > 0) ? OD_SHR_ROUND(x, shift) : OD_SHL(x, -(shift)))
+# define OD_ABS(x) (abs(x))
+/* (od_val32)(od_val16) gives the TI compiler a hint that it's a 16x16->32 multiply */
+/** 16x16 multiplication where the result fits in 32 bits */
+# define OD_MULT16_16(a, b) \
+ (((od_val32)(od_val16)(a))*((od_val32)(od_val16)(b)))
+/* Multiplies 16-bit a by 32-bit b and keeps bits [16:47]. */
+# define OD_MULT16_32_Q16(a, b) ((int16_t)(a)*(int64_t)(int32_t)(b) >> 16)
+/*16x16 multiplication where the result fits in 16 bits, without rounding.*/
+# define OD_MULT16_16_Q15(a, b) \
+ (((int16_t)(a)*((int32_t)(int16_t)(b))) >> 15)
+/*16x16 multiplication where the result fits in 16 bits, without rounding.*/
+# define OD_MULT16_16_Q16(a, b) \
+ ((((int16_t)(a))*((int32_t)(int16_t)(b))) >> 16)
+#endif
+
+/*All of these macros should expect floats as arguments.*/
+/*These two should compile as a single SSE instruction.*/
+# define OD_MINF(a, b) ((a) < (b) ? (a) : (b))
+# define OD_MAXF(a, b) ((a) > (b) ? (a) : (b))
+
+# define OD_DIV_R0(x, y) (((x) + OD_FLIPSIGNI((((y) + 1) >> 1) - 1, (x)))/(y))
+
+# define OD_SIGNMASK(a) (-((a) < 0))
+# define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+
+# define OD_MULT16_16_Q15(a, b) \
+ (((int16_t)(a)*((int32_t)(int16_t)(b))) >> 15)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_ODINTRIN_H_
diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h
new file mode 100644
index 000000000..7980bde39
--- /dev/null
+++ b/third_party/aom/av1/common/onyxc_int.h
@@ -0,0 +1,1027 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_ONYXC_INT_H_
+#define AV1_COMMON_ONYXC_INT_H_
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+#if CONFIG_ANS
+#include "aom_dsp/ans.h"
+#endif
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/mv.h"
+#include "av1/common/quant_common.h"
+#if CONFIG_LOOP_RESTORATION
+#include "av1/common/restoration.h"
+#endif // CONFIG_LOOP_RESTORATION
+#include "av1/common/tile_common.h"
+#include "av1/common/odintrin.h"
+#if CONFIG_PVQ
+#include "av1/common/pvq.h"
+#endif
+#if CONFIG_CFL
+#include "av1/common/cfl.h"
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CDEF_MAX_STRENGTHS 16
+
+#define REF_FRAMES_LOG2 3
+#define REF_FRAMES (1 << REF_FRAMES_LOG2)
+
+// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
+// in parallel, 3 for scaled references on the encoder.
+// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
+// of framebuffers.
+// TODO(jkoleszar): These 3 extra references could probably come from the
+// normal reference pool.
+#define FRAME_BUFFERS (REF_FRAMES + 7)
+
+#if CONFIG_REFERENCE_BUFFER
+/* Constant values while waiting for the sequence header */
+#define FRAME_ID_NUMBERS_PRESENT_FLAG 1
+#define FRAME_ID_LENGTH_MINUS7 8 // Allows frame id up to 2^15-1
+#define DELTA_FRAME_ID_LENGTH_MINUS2 12 // Allows frame id deltas up to 2^14-1
+#endif
+
+#if CONFIG_EXT_REFS
+#define FRAME_CONTEXTS_LOG2 3
+#else
+#define FRAME_CONTEXTS_LOG2 2
+#endif
+
+#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
+
+#define NUM_PING_PONG_BUFFERS 2
+
+typedef enum {
+ SINGLE_REFERENCE = 0,
+ COMPOUND_REFERENCE = 1,
+ REFERENCE_MODE_SELECT = 2,
+ REFERENCE_MODES = 3,
+} REFERENCE_MODE;
+
+typedef enum {
+ RESET_FRAME_CONTEXT_NONE = 0,
+ RESET_FRAME_CONTEXT_CURRENT = 1,
+ RESET_FRAME_CONTEXT_ALL = 2,
+} RESET_FRAME_CONTEXT_MODE;
+
+typedef enum {
+ /**
+ * Update frame context to values resulting from forward probability
+ * updates signaled in the frame header
+ */
+ REFRESH_FRAME_CONTEXT_FORWARD,
+ /**
+ * Update frame context to values resulting from backward probability
+ * updates based on entropy/counts in the decoded frame
+ */
+ REFRESH_FRAME_CONTEXT_BACKWARD,
+} REFRESH_FRAME_CONTEXT_MODE;
+
+typedef struct {
+ int_mv mv[2];
+#if CONFIG_REF_MV
+ int_mv pred_mv[2];
+#endif
+ MV_REFERENCE_FRAME ref_frame[2];
+} MV_REF;
+
+typedef struct {
+ int ref_count;
+ MV_REF *mvs;
+ int mi_rows;
+ int mi_cols;
+#if CONFIG_GLOBAL_MOTION
+ WarpedMotionParams global_motion[TOTAL_REFS_PER_FRAME];
+#endif // CONFIG_GLOBAL_MOTION
+ aom_codec_frame_buffer_t raw_frame_buffer;
+ YV12_BUFFER_CONFIG buf;
+#if CONFIG_TEMPMV_SIGNALING
+ uint8_t intra_only;
+#endif
+  // The following variables are only used in frame parallel decode.
+
+ // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
+ // that no FrameWorker owns, or is decoding, this buffer.
+ AVxWorker *frame_worker_owner;
+
+  // row and col indicate the position to which the frame has been decoded,
+  // in real pixel units. They are reset to -1 when decoding begins and set
+  // to INT_MAX when the frame is fully decoded.
+ int row;
+ int col;
+} RefCntBuffer;
+
+typedef struct BufferPool {
+// Protect BufferPool from being accessed by several FrameWorkers at
+// the same time during frame parallel decode.
+// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t pool_mutex;
+#endif
+
+ // Private data associated with the frame buffer callbacks.
+ void *cb_priv;
+
+ aom_get_frame_buffer_cb_fn_t get_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_fb_cb;
+
+ RefCntBuffer frame_bufs[FRAME_BUFFERS];
+
+ // Frame buffers allocated internally by the codec.
+ InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
+typedef struct AV1Common {
+ struct aom_internal_error_info error;
+ aom_color_space_t color_space;
+ int color_range;
+ int width;
+ int height;
+ int render_width;
+ int render_height;
+ int last_width;
+ int last_height;
+
+#if CONFIG_FRAME_SUPERRES
+ // The numerator of the superres scale, the denominator is fixed
+ uint8_t superres_scale_numerator;
+ int superres_width, superres_height;
+#endif // CONFIG_FRAME_SUPERRES
+
+ // TODO(jkoleszar): this implies chroma ss right now, but could vary per
+ // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to
+ // support additional planes.
+ int subsampling_x;
+ int subsampling_y;
+
+#if CONFIG_HIGHBITDEPTH
+ // Marks if we need to use 16bit frame buffers (1: yes, 0: no).
+ int use_highbitdepth;
+#endif
+ YV12_BUFFER_CONFIG *frame_to_show;
+ RefCntBuffer *prev_frame;
+
+ // TODO(hkuang): Combine this with cur_buf in macroblockd.
+ RefCntBuffer *cur_frame;
+
+ int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
+
+ // Prepare ref_frame_map for the next frame.
+ // Only used in frame parallel decode.
+ int next_ref_frame_map[REF_FRAMES];
+
+ // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
+ // roll new_fb_idx into it.
+
+ // Each Inter frame can reference INTER_REFS_PER_FRAME buffers
+ RefBuffer frame_refs[INTER_REFS_PER_FRAME];
+
+ int new_fb_idx;
+
+ FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
+ FRAME_TYPE frame_type;
+
+ int show_frame;
+ int last_show_frame;
+ int show_existing_frame;
+#if CONFIG_EXT_REFS
+ // Flag for a frame used as a reference - not written to the bitstream
+ int is_reference_frame;
+#endif // CONFIG_EXT_REFS
+
+ // Flag signaling that the frame is encoded using only INTRA modes.
+ uint8_t intra_only;
+ uint8_t last_intra_only;
+
+ int allow_high_precision_mv;
+
+#if CONFIG_PALETTE
+ int allow_screen_content_tools;
+#endif // CONFIG_PALETTE
+
+ // Flag signaling which frame contexts should be reset to default values.
+ RESET_FRAME_CONTEXT_MODE reset_frame_context;
+
+ // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
+ // MODE_INFO (8-pixel) units.
+ int MBs;
+ int mb_rows, mi_rows;
+ int mb_cols, mi_cols;
+ int mi_stride;
+
+ /* profile settings */
+ TX_MODE tx_mode;
+
+ int base_qindex;
+ int y_dc_delta_q;
+ int uv_dc_delta_q;
+ int uv_ac_delta_q;
+ int16_t y_dequant[MAX_SEGMENTS][2];
+ int16_t uv_dequant[MAX_SEGMENTS][2];
+
+#if CONFIG_AOM_QM
+ // Global quant matrix tables
+ qm_val_t *giqmatrix[NUM_QM_LEVELS][2][2][TX_SIZES];
+ qm_val_t *gqmatrix[NUM_QM_LEVELS][2][2][TX_SIZES];
+
+ // Local quant matrix tables for each frame
+ qm_val_t *y_iqmatrix[MAX_SEGMENTS][2][TX_SIZES];
+ qm_val_t *uv_iqmatrix[MAX_SEGMENTS][2][TX_SIZES];
+ // Encoder
+ qm_val_t *y_qmatrix[MAX_SEGMENTS][2][TX_SIZES];
+ qm_val_t *uv_qmatrix[MAX_SEGMENTS][2][TX_SIZES];
+
+ int using_qmatrix;
+ int min_qmlevel;
+ int max_qmlevel;
+#endif
+#if CONFIG_NEW_QUANT
+ dequant_val_type_nuq y_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS];
+ dequant_val_type_nuq uv_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS];
+#endif
+
+ /* We allocate a MODE_INFO struct for each macroblock, together with
+ an extra row on top and column on the left to simplify prediction. */
+ int mi_alloc_size;
+ MODE_INFO *mip; /* Base of allocated array */
+ MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
+
+ // TODO(agrange): Move prev_mi into encoder structure.
+ // prev_mip and prev_mi will only be allocated in encoder.
+ MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+ MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
+
+ // Separate mi functions between encoder and decoder.
+ int (*alloc_mi)(struct AV1Common *cm, int mi_size);
+ void (*free_mi)(struct AV1Common *cm);
+ void (*setup_mi)(struct AV1Common *cm);
+
+ // Grid of pointers to 8x8 MODE_INFO structs. Any 8x8 not in the visible
+ // area will be NULL.
+ MODE_INFO **mi_grid_base;
+ MODE_INFO **mi_grid_visible;
+ MODE_INFO **prev_mi_grid_base;
+ MODE_INFO **prev_mi_grid_visible;
+
+ // Whether to use previous frame's motion vectors for prediction.
+ int use_prev_frame_mvs;
+
+ // Persistent mb segment id map used in prediction.
+ int seg_map_idx;
+ int prev_seg_map_idx;
+
+ uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS];
+ uint8_t *last_frame_seg_map;
+ uint8_t *current_frame_seg_map;
+ int seg_map_alloc_size;
+
+ InterpFilter interp_filter;
+
+ loop_filter_info_n lf_info;
+#if CONFIG_LOOP_RESTORATION
+ RestorationInfo rst_info[MAX_MB_PLANE];
+ RestorationInternal rst_internal;
+#endif // CONFIG_LOOP_RESTORATION
+
+ // Flag signaling how frame contexts should be updated at the end of
+ // a frame decode
+ REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
+
+ int ref_frame_sign_bias[TOTAL_REFS_PER_FRAME]; /* Two state 0, 1 */
+
+ struct loopfilter lf;
+ struct segmentation seg;
+
+ int frame_parallel_decode; // frame-based threading.
+
+#if CONFIG_EXT_TX
+ int reduced_tx_set_used;
+#endif // CONFIG_EXT_TX
+
+// Context probabilities for reference frame prediction
+#if CONFIG_EXT_REFS
+ MV_REFERENCE_FRAME comp_fwd_ref[FWD_REFS];
+ MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS];
+#else
+ MV_REFERENCE_FRAME comp_fixed_ref;
+ MV_REFERENCE_FRAME comp_var_ref[COMP_REFS];
+#endif // CONFIG_EXT_REFS
+ REFERENCE_MODE reference_mode;
+
+ FRAME_CONTEXT *fc; /* this frame entropy */
+ FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS
+ unsigned int frame_context_idx; /* Context to use/update */
+ FRAME_COUNTS counts;
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ // The initial probabilities for a frame, before any subframe backward update,
+ // and after forward update.
+ av1_coeff_probs_model starting_coef_probs[TX_SIZES][PLANE_TYPES];
+ // Number of subframe backward updates already done
+ uint8_t coef_probs_update_idx;
+ // Signal if the backward update is subframe or end-of-frame
+ uint8_t partial_prob_update;
+ // Frame level flag to turn on/off subframe backward update
+ uint8_t do_subframe_update;
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+ unsigned int current_video_frame;
+ BITSTREAM_PROFILE profile;
+
+ // AOM_BITS_8 in profile 0 or 1, AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
+ aom_bit_depth_t bit_depth;
+ aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer
+
+ int error_resilient_mode;
+
+#if !CONFIG_EXT_TILE
+ int log2_tile_cols, log2_tile_rows;
+#endif // !CONFIG_EXT_TILE
+ int tile_cols, tile_rows;
+ int tile_width, tile_height; // In MI units
+#if CONFIG_EXT_TILE
+ unsigned int tile_encoding_mode;
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_DEPENDENT_HORZTILES
+ int dependent_horz_tiles;
+#if CONFIG_TILE_GROUPS
+ int tile_group_start_row[MAX_TILE_ROWS][MAX_TILE_COLS];
+ int tile_group_start_col[MAX_TILE_ROWS][MAX_TILE_COLS];
+#endif
+#endif
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ int loop_filter_across_tiles_enabled;
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+
+ int byte_alignment;
+ int skip_loop_filter;
+
+ // Private data associated with the frame buffer callbacks.
+ void *cb_priv;
+ aom_get_frame_buffer_cb_fn_t get_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_fb_cb;
+
+ // Handles memory for the codec.
+ InternalFrameBufferList int_frame_buffers;
+
+ // External BufferPool passed from outside.
+ BufferPool *buffer_pool;
+
+ PARTITION_CONTEXT *above_seg_context;
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+#if CONFIG_VAR_TX
+ TXFM_CONTEXT *above_txfm_context;
+ TXFM_CONTEXT left_txfm_context[MAX_MIB_SIZE];
+#endif
+ int above_context_alloc_cols;
+
+ // scratch memory for intraonly/keyframe forward updates from default tables
+ // - this is intentionally not placed in FRAME_CONTEXT since it's reset upon
+ // each keyframe and not used afterwards
+ aom_prob kf_y_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1];
+#if CONFIG_GLOBAL_MOTION
+ WarpedMotionParams global_motion[TOTAL_REFS_PER_FRAME];
+#endif
+
+ BLOCK_SIZE sb_size; // Size of the superblock used for this frame
+ int mib_size; // Size of the superblock in units of MI blocks
+ int mib_size_log2; // Log 2 of above.
+#if CONFIG_CDEF
+ int cdef_dering_damping;
+ int cdef_clpf_damping;
+ int nb_cdef_strengths;
+ int cdef_strengths[CDEF_MAX_STRENGTHS];
+ int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
+ int cdef_bits;
+#endif
+
+#if CONFIG_DELTA_Q
+ int delta_q_present_flag;
+ // Resolution of delta quant
+ int delta_q_res;
+#if CONFIG_EXT_DELTA_Q
+ int delta_lf_present_flag;
+ // Resolution of delta lf level
+ int delta_lf_res;
+#endif
+#endif
+#if CONFIG_TILE_GROUPS
+ int num_tg;
+#endif
+#if CONFIG_REFERENCE_BUFFER
+ int current_frame_id;
+ int ref_frame_id[REF_FRAMES];
+ int valid_for_referencing[REF_FRAMES];
+ int refresh_mask;
+ int invalid_delta_frame_id_minus1;
+#endif
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ int ans_window_size_log2;
+#endif
+} AV1_COMMON;
+
+#if CONFIG_REFERENCE_BUFFER
+/* Initial version of sequence header structure */
+typedef struct SequenceHeader {
+ int frame_id_numbers_present_flag;
+ int frame_id_length_minus7;
+ int delta_frame_id_length_minus2;
+} SequenceHeader;
+#endif
+
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+static void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
+}
+
+static void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) {
+ if (index < 0 || index >= REF_FRAMES) return NULL;
+ if (cm->ref_frame_map[index] < 0) return NULL;
+ assert(cm->ref_frame_map[index] < FRAME_BUFFERS);
+ return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(
+ const AV1_COMMON *const cm) {
+ return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
+}
+
+static INLINE int get_free_fb(AV1_COMMON *cm) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ int i;
+
+ lock_buffer_pool(cm->buffer_pool);
+ for (i = 0; i < FRAME_BUFFERS; ++i)
+ if (frame_bufs[i].ref_count == 0) break;
+
+ if (i != FRAME_BUFFERS) {
+ frame_bufs[i].ref_count = 1;
+ } else {
+ // Reset i to be INVALID_IDX to indicate no free buffer found.
+ i = INVALID_IDX;
+ }
+
+ unlock_buffer_pool(cm->buffer_pool);
+ return i;
+}
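+
+// Usage sketch (illustrative, not part of the upstream header): callers must
+// check the result against INVALID_IDX before using it, e.g.
+//   cm->new_fb_idx = get_free_fb(cm);
+//   if (cm->new_fb_idx == INVALID_IDX) return /* no free buffer */;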
+
+static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
+ const int ref_index = *idx;
+
+ if (ref_index >= 0 && bufs[ref_index].ref_count > 0)
+ bufs[ref_index].ref_count--;
+
+ *idx = new_idx;
+
+ bufs[new_idx].ref_count++;
+}
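+
+// Typical use (a sketch, not from the upstream header): retargeting a
+// reference slot releases one count on the old buffer and takes one on the
+// new, e.g. ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[ref_index],
+// cm->new_fb_idx).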
+
+static INLINE int mi_cols_aligned_to_sb(const AV1_COMMON *cm) {
+ return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->mib_size_log2);
+}
+
+static INLINE int mi_rows_aligned_to_sb(const AV1_COMMON *cm) {
+ return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->mib_size_log2);
+}
+
+static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
+ return cm->frame_type == KEY_FRAME || cm->intra_only;
+}
+
+static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
+#if CONFIG_PVQ
+ tran_low_t *pvq_ref_coeff,
+#endif
+#if CONFIG_CFL
+ CFL_CTX *cfl,
+#endif
+ tran_low_t *dqcoeff) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ xd->plane[i].dqcoeff = dqcoeff;
+#if CONFIG_PVQ
+ xd->plane[i].pvq_ref_coeff = pvq_ref_coeff;
+#endif
+#if CONFIG_CFL
+ xd->cfl = cfl;
+ cfl_init(cfl, cm, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+#endif
+ xd->above_context[i] = cm->above_context[i];
+ if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
+ memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant));
+#if CONFIG_AOM_QM
+ memcpy(xd->plane[i].seg_iqmatrix, cm->y_iqmatrix, sizeof(cm->y_iqmatrix));
+#endif
+
+#if CONFIG_NEW_QUANT
+ memcpy(xd->plane[i].seg_dequant_nuq, cm->y_dequant_nuq,
+ sizeof(cm->y_dequant_nuq));
+#endif
+ } else {
+ memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant));
+#if CONFIG_AOM_QM
+ memcpy(xd->plane[i].seg_iqmatrix, cm->uv_iqmatrix,
+ sizeof(cm->uv_iqmatrix));
+#endif
+#if CONFIG_NEW_QUANT
+ memcpy(xd->plane[i].seg_dequant_nuq, cm->uv_dequant_nuq,
+ sizeof(cm->uv_dequant_nuq));
+#endif
+ }
+ xd->fc = cm->fc;
+ }
+ xd->above_seg_context = cm->above_seg_context;
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context;
+#endif
+ xd->mi_stride = cm->mi_stride;
+ xd->error_info = &cm->error;
+}
+
+static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ struct macroblockd_plane *const pd = &xd->plane[i];
+#if CONFIG_CHROMA_SUB8X8
+ if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
+ // Offset the buffer pointer
+ if (pd->subsampling_y && (mi_row & 0x01)) mi_row -= 1;
+ if (pd->subsampling_x && (mi_col & 0x01)) mi_col -= 1;
+ }
+#endif
+ int above_idx = mi_col * 2;
+ int left_idx = (mi_row * 2) & MAX_MIB_MASK_2;
+ pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x];
+ pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y];
+ }
+}
+
+static INLINE int calc_mi_size(int len) {
+ // len is in mi units.
+ return len + MAX_MIB_SIZE;
+}
+
+static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x;
+ xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y;
+
+ xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x;
+ xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y;
+
+#if !CONFIG_CHROMA_2X2
+ xd->plane[i].width = AOMMAX(xd->plane[i].width, 4);
+ xd->plane[i].height = AOMMAX(xd->plane[i].height, 4);
+#endif
+ }
+}
+
+static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
+ int mi_row, int bh, int mi_col, int bw,
+#if CONFIG_DEPENDENT_HORZTILES
+ int dependent_horz_tile_flag,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ int mi_rows, int mi_cols) {
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
+
+#if CONFIG_DEPENDENT_HORZTILES
+ if (dependent_horz_tile_flag) {
+#if CONFIG_TILE_GROUPS
+ xd->up_available = (mi_row > tile->mi_row_start) || !tile->tg_horz_boundary;
+#else
+ xd->up_available = (mi_row > 0);
+#endif // CONFIG_TILE_GROUPS
+ } else {
+#endif // CONFIG_DEPENDENT_HORZTILES
+ // Are edges available for intra prediction?
+ xd->up_available = (mi_row > tile->mi_row_start);
+#if CONFIG_DEPENDENT_HORZTILES
+ }
+#endif // CONFIG_DEPENDENT_HORZTILES
+
+ xd->left_available = (mi_col > tile->mi_col_start);
+#if CONFIG_CHROMA_SUB8X8
+ xd->chroma_up_available = xd->up_available;
+ xd->chroma_left_available = xd->left_available;
+ if (xd->plane[1].subsampling_x && bw < mi_size_wide[BLOCK_8X8])
+ xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
+ if (xd->plane[1].subsampling_y && bh < mi_size_high[BLOCK_8X8])
+ xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
+#endif
+ if (xd->up_available) {
+ xd->above_mi = xd->mi[-xd->mi_stride];
+ // above_mi may be NULL in encoder's first pass.
+ xd->above_mbmi = xd->above_mi ? &xd->above_mi->mbmi : NULL;
+ } else {
+ xd->above_mi = NULL;
+ xd->above_mbmi = NULL;
+ }
+
+ if (xd->left_available) {
+ xd->left_mi = xd->mi[-1];
+ // left_mi may be NULL in encoder's first pass.
+ xd->left_mbmi = xd->left_mi ? &xd->left_mi->mbmi : NULL;
+ } else {
+ xd->left_mi = NULL;
+ xd->left_mbmi = NULL;
+ }
+
+ xd->n8_h = bh;
+ xd->n8_w = bw;
+#if CONFIG_REF_MV
+ xd->is_sec_rect = 0;
+ if (xd->n8_w < xd->n8_h)
+ if (mi_col & (xd->n8_h - 1)) xd->is_sec_rect = 1;
+
+ if (xd->n8_w > xd->n8_h)
+ if (mi_row & (xd->n8_w - 1)) xd->is_sec_rect = 1;
+#endif // CONFIG_REF_MV
+}
+
+static INLINE const aom_prob *get_y_mode_probs(const AV1_COMMON *cm,
+ const MODE_INFO *mi,
+ const MODE_INFO *above_mi,
+ const MODE_INFO *left_mi,
+ int block) {
+ const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, block);
+ const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, block);
+ return cm->kf_y_prob[above][left];
+}
+
+#if CONFIG_EC_MULTISYMBOL
+static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
+ const MODE_INFO *mi,
+ const MODE_INFO *above_mi,
+ const MODE_INFO *left_mi,
+ int block) {
+ const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, block);
+ const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, block);
+ return tile_ctx->kf_y_cdf[above][left];
+}
+#endif
+
+static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
+ int mi_col, BLOCK_SIZE subsize,
+ BLOCK_SIZE bsize) {
+ PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
+ PARTITION_CONTEXT *const left_ctx =
+ xd->left_seg_context + (mi_row & MAX_MIB_MASK);
+
+#if CONFIG_EXT_PARTITION_TYPES
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ memset(above_ctx, partition_context_lookup[subsize].above, bw);
+ memset(left_ctx, partition_context_lookup[subsize].left, bh);
+#else
+ // num_4x4_blocks_wide_lookup[bsize] / 2
+ const int bs = mi_size_wide[bsize];
+
+  // Update the partition context at the end nodes. Set the partition bits
+  // of block sizes larger than the current one to one, and the partition
+  // bits of smaller block sizes to zero.
+ memset(above_ctx, partition_context_lookup[subsize].above, bs);
+ memset(left_ctx, partition_context_lookup[subsize].left, bs);
+#endif // CONFIG_EXT_PARTITION_TYPES
+}
+
+#if CONFIG_CB4X4
+static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int subsampling_x, int subsampling_y) {
+#if CONFIG_CHROMA_2X2
+ return 1;
+#endif
+
+#if CONFIG_CHROMA_SUB8X8
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+
+ int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) &&
+ ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x);
+
+ return ref_pos;
+#else
+ int ref_pos = !(((mi_row & 0x01) && subsampling_y) ||
+ ((mi_col & 0x01) && subsampling_x));
+
+ if (bsize >= BLOCK_8X8) ref_pos = 1;
+
+ return ref_pos;
+#endif
+}
+
+static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
+ int subsampling_y) {
+ BLOCK_SIZE bs = bsize;
+
+ if (bs < BLOCK_8X8) {
+ if (subsampling_x == 1 && subsampling_y == 1)
+ bs = BLOCK_8X8;
+ else if (subsampling_x == 1)
+ bs = BLOCK_8X4;
+ else if (subsampling_y == 1)
+ bs = BLOCK_4X8;
+ }
+
+ return bs;
+}
+#endif
+
+#if CONFIG_EXT_PARTITION_TYPES
+static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
+ int mi_col, BLOCK_SIZE subsize,
+ BLOCK_SIZE bsize,
+ PARTITION_TYPE partition) {
+ if (bsize >= BLOCK_8X8) {
+ const int hbs = mi_size_wide[bsize] / 2;
+ BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+ switch (partition) {
+ case PARTITION_SPLIT:
+ if (bsize != BLOCK_8X8) break;
+ case PARTITION_NONE:
+ case PARTITION_HORZ:
+ case PARTITION_VERT:
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+ break;
+ case PARTITION_HORZ_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
+ break;
+ case PARTITION_HORZ_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
+ break;
+ case PARTITION_VERT_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
+ break;
+ case PARTITION_VERT_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
+ break;
+ default: assert(0 && "Invalid partition type");
+ }
+ }
+}
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row,
+ int mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ int has_rows, int has_cols,
+#endif
+ BLOCK_SIZE bsize) {
+#if CONFIG_UNPOISON_PARTITION_CTX
+ const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
+ const PARTITION_CONTEXT *left_ctx =
+ xd->left_seg_context + (mi_row & MAX_MIB_MASK);
+ // Minimum partition point is 8x8. Offset the bsl accordingly.
+ const int bsl = mi_width_log2_lookup[bsize] - mi_width_log2_lookup[BLOCK_8X8];
+ int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
+
+ assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
+ assert(bsl >= 0);
+
+ if (has_rows && has_cols)
+ return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+ else if (has_rows && !has_cols)
+ return PARTITION_CONTEXTS_PRIMARY + bsl;
+ else if (!has_rows && has_cols)
+ return PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES + bsl;
+ else
+ return PARTITION_CONTEXTS; // Bogus context, forced SPLIT
+#else
+ const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
+ const PARTITION_CONTEXT *left_ctx =
+ xd->left_seg_context + (mi_row & MAX_MIB_MASK);
+ // Minimum partition point is 8x8. Offset the bsl accordingly.
+ const int bsl = mi_width_log2_lookup[bsize] - mi_width_log2_lookup[BLOCK_8X8];
+ int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
+
+ assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
+ assert(bsl >= 0);
+
+ return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+#endif
+}
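+
+// Worked example (illustrative, assuming PARTITION_PLOFFSET == 4): for
+// BLOCK_16X16, bsl == 1; with the above partition bit set and the left bit
+// clear, the context is (0 * 2 + 1) + 1 * PARTITION_PLOFFSET == 5.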
+
+static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane) {
+ int max_blocks_wide = block_size_wide[bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ if (xd->mb_to_right_edge < 0)
+ max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+ // Scale the width in the transform block unit.
+ return max_blocks_wide >> tx_size_wide_log2[0];
+}
+
+static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane) {
+ int max_blocks_high = block_size_high[bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ if (xd->mb_to_bottom_edge < 0)
+ max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+
+  // Scale the height to transform block units.
+ return max_blocks_high >> tx_size_wide_log2[0];
+}
+
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
+ int mi_col_start, int mi_col_end) {
+ const int width = mi_col_end - mi_col_start;
+ const int aligned_width = ALIGN_POWER_OF_TWO(width, cm->mib_size_log2);
+
+ const int offset_y = 2 * mi_col_start;
+ const int width_y = 2 * aligned_width;
+ const int offset_uv = offset_y >> cm->subsampling_x;
+ const int width_uv = width_y >> cm->subsampling_x;
+
+ av1_zero_array(cm->above_context[0] + offset_y, width_y);
+ av1_zero_array(cm->above_context[1] + offset_uv, width_uv);
+ av1_zero_array(cm->above_context[2] + offset_uv, width_uv);
+
+ av1_zero_array(cm->above_seg_context + mi_col_start, aligned_width);
+
+#if CONFIG_VAR_TX
+ av1_zero_array(cm->above_txfm_context + mi_col_start, aligned_width);
+#endif // CONFIG_VAR_TX
+}
+
+static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
+ av1_zero(xd->left_context);
+ av1_zero(xd->left_seg_context);
+#if CONFIG_VAR_TX
+ av1_zero(xd->left_txfm_context_buffer);
+#endif
+}
+
+#if CONFIG_VAR_TX
+static INLINE TX_SIZE get_min_tx_size(TX_SIZE tx_size) {
+ if (tx_size >= TX_SIZES_ALL) assert(0);
+ return txsize_sqr_map[tx_size];
+}
+
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
+ int i;
+ for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
+}
+
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h, int skip,
+ const MACROBLOCKD *xd) {
+ uint8_t bw = tx_size_wide[tx_size];
+ uint8_t bh = tx_size_high[tx_size];
+
+ if (skip) {
+ bw = n8_w * MI_SIZE;
+ bh = n8_h * MI_SIZE;
+ }
+
+ set_txfm_ctx(xd->above_txfm_context, bw, n8_w);
+ set_txfm_ctx(xd->left_txfm_context, bh, n8_h);
+}
+
+static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
+ TXFM_CONTEXT *left_ctx,
+ TX_SIZE tx_size, TX_SIZE txb_size) {
+ BLOCK_SIZE bsize = txsize_to_bsize[txb_size];
+ int bh = mi_size_high[bsize];
+ int bw = mi_size_wide[bsize];
+ uint8_t txw = tx_size_wide[tx_size];
+ uint8_t txh = tx_size_high[tx_size];
+ int i;
+ for (i = 0; i < bh; ++i) left_ctx[i] = txh;
+ for (i = 0; i < bw; ++i) above_ctx[i] = txw;
+}
+
+static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
+ TXFM_CONTEXT *left_ctx,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ const uint8_t txw = tx_size_wide[tx_size];
+ const uint8_t txh = tx_size_high[tx_size];
+ const int above = *above_ctx < txw;
+ const int left = *left_ctx < txh;
+ TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+ int category = TXFM_PARTITION_CONTEXTS - 1;
+
+  // Dummy return; the value is not used by callers.
+ if (tx_size <= TX_4X4) return 0;
+
+ switch (AOMMAX(block_size_wide[bsize], block_size_high[bsize])) {
+#if CONFIG_EXT_PARTITION
+ case 128:
+#endif
+ case 64:
+ case 32: max_tx_size = TX_32X32; break;
+ case 16: max_tx_size = TX_16X16; break;
+ case 8: max_tx_size = TX_8X8; break;
+ default: assert(0);
+ }
+
+ if (max_tx_size >= TX_8X8) {
+ category = (tx_size != max_tx_size && max_tx_size > TX_8X8) +
+ (TX_SIZES - 1 - max_tx_size) * 2;
+ }
+ if (category == TXFM_PARTITION_CONTEXTS - 1) return category;
+ return category * 3 + above + left;
+}
+#endif
+
+static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
+ return PARTITION_INVALID;
+ } else {
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ MODE_INFO **mi = cm->mi_grid_visible + offset;
+ const MB_MODE_INFO *const mbmi = &mi[0]->mbmi;
+ const int bsl = b_width_log2_lookup[bsize];
+ const PARTITION_TYPE partition = partition_lookup[bsl][mbmi->sb_type];
+#if !CONFIG_EXT_PARTITION_TYPES
+ return partition;
+#else
+ const int hbs = mi_size_wide[bsize] / 2;
+
+ assert(cm->mi_grid_visible[offset] == &cm->mi[offset]);
+
+ if (partition != PARTITION_NONE && bsize > BLOCK_8X8 &&
+ mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+ const BLOCK_SIZE h = get_subsize(bsize, PARTITION_HORZ_A);
+ const BLOCK_SIZE v = get_subsize(bsize, PARTITION_VERT_A);
+ const MB_MODE_INFO *const mbmi_right = &mi[hbs]->mbmi;
+ const MB_MODE_INFO *const mbmi_below = &mi[hbs * cm->mi_stride]->mbmi;
+ if (mbmi->sb_type == h) {
+ return mbmi_below->sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_B;
+ } else if (mbmi->sb_type == v) {
+ return mbmi_right->sb_type == v ? PARTITION_VERT : PARTITION_VERT_B;
+ } else if (mbmi_below->sb_type == h) {
+ return PARTITION_HORZ_A;
+ } else if (mbmi_right->sb_type == v) {
+ return PARTITION_VERT_A;
+ } else {
+ return PARTITION_SPLIT;
+ }
+ }
+
+ return partition;
+#endif // !CONFIG_EXT_PARTITION_TYPES
+ }
+}
+
+static INLINE void set_sb_size(AV1_COMMON *const cm, BLOCK_SIZE sb_size) {
+ cm->sb_size = sb_size;
+ cm->mib_size = mi_size_wide[cm->sb_size];
+#if CONFIG_CB4X4
+ cm->mib_size_log2 = b_width_log2_lookup[cm->sb_size];
+#else
+ cm->mib_size_log2 = mi_width_log2_lookup[cm->sb_size];
+#endif
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_ONYXC_INT_H_
diff --git a/third_party/aom/av1/common/partition.c b/third_party/aom/av1/common/partition.c
new file mode 100644
index 000000000..634a9edd5
--- /dev/null
+++ b/third_party/aom/av1/common/partition.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "enums.h"
+#include "odintrin.h"
+#include "partition.h"
+#include "zigzag.h"
+
+OD_EXTERN const index_pair *OD_ZIGZAG4[4] = {
+ OD_ZIGZAG4_DCT_DCT,
+ OD_ZIGZAG4_ADST_DCT,
+ OD_ZIGZAG4_DCT_ADST,
+ OD_ZIGZAG4_ADST_ADST
+};
+
+OD_EXTERN const index_pair *OD_ZIGZAG8[4] = {
+ OD_ZIGZAG8_DCT_DCT,
+ OD_ZIGZAG8_ADST_DCT,
+ OD_ZIGZAG8_DCT_ADST,
+ OD_ZIGZAG8_ADST_ADST
+};
+
+OD_EXTERN const index_pair *OD_ZIGZAG16[4] = {
+ OD_ZIGZAG16_DCT_DCT,
+ OD_ZIGZAG16_ADST_DCT,
+ OD_ZIGZAG16_DCT_ADST,
+ OD_ZIGZAG16_ADST_ADST
+};
+
+OD_EXTERN const index_pair *OD_ZIGZAG32[4] = {
+ OD_ZIGZAG32_DCT_DCT,
+ OD_ZIGZAG32_DCT_DCT,
+ OD_ZIGZAG32_DCT_DCT,
+ OD_ZIGZAG32_DCT_DCT
+};
+
+/* The tables below specify how coefficient blocks are translated to
+   and from PVQ partition coding scan order for 4x4, 8x8, 16x16 and 32x32 */
+
+static const int OD_LAYOUT32_OFFSETS[4] = { 0, 128, 256, 768 };
+const band_layout OD_LAYOUT32 = {
+ OD_ZIGZAG32,
+ 32,
+ 3,
+ OD_LAYOUT32_OFFSETS
+};
+
+static const int OD_LAYOUT16_OFFSETS[4] = { 0, 32, 64, 192 };
+const band_layout OD_LAYOUT16 = {
+ OD_ZIGZAG16,
+ 16,
+ 3,
+ OD_LAYOUT16_OFFSETS
+};
+
+const int OD_LAYOUT8_OFFSETS[4] = { 0, 8, 16, 48 };
+const band_layout OD_LAYOUT8 = {
+ OD_ZIGZAG8,
+ 8,
+ 3,
+ OD_LAYOUT8_OFFSETS
+};
+
+static const int OD_LAYOUT4_OFFSETS[2] = { 0, 15 };
+const band_layout OD_LAYOUT4 = {
+ OD_ZIGZAG4,
+ 4,
+ 1,
+ OD_LAYOUT4_OFFSETS
+};
+
+/* The first element is the number of bands, followed by the list of all the
+   band boundaries. */
+static const int OD_BAND_OFFSETS4[] = {1, 1, 16};
+static const int OD_BAND_OFFSETS8[] = {4, 1, 16, 24, 32, 64};
+static const int OD_BAND_OFFSETS16[] = {7, 1, 16, 24, 32, 64, 96, 128, 256};
+static const int OD_BAND_OFFSETS32[] = {10, 1, 16, 24, 32, 64, 96, 128, 256,
+ 384, 512, 1024};
+static const int OD_BAND_OFFSETS64[] = {13, 1, 16, 24, 32, 64, 96, 128, 256,
+ 384, 512, 1024, 1536, 2048, 4096};
+
+const int *const OD_BAND_OFFSETS[OD_TXSIZES + 1] = {
+ OD_BAND_OFFSETS4,
+ OD_BAND_OFFSETS8,
+ OD_BAND_OFFSETS16,
+ OD_BAND_OFFSETS32,
+ OD_BAND_OFFSETS64
+};
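+
+/* Reading the tables above (an explanatory note, not from upstream):
+   OD_BAND_OFFSETS8 == {4, 1, 16, 24, 32, 64} declares 4 bands covering
+   coefficients [1, 16), [16, 24), [24, 32) and [32, 64); coefficient 0 (DC)
+   is always handled separately. */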
+
+/** Perform a single stage of conversion from a coefficient block in
+ * raster order into coding scan order
+ *
+ * @param [in] layout scan order specification
+ * @param [out] dst destination vector
+ * @param [in] src source coefficient block
+ * @param [in] stride source coefficient block row stride
+ */
+static void od_band_from_raster(const band_layout *layout, tran_low_t *dst,
+ const tran_low_t *src, int stride, TX_TYPE tx_type) {
+ int i;
+ int len;
+ len = layout->band_offsets[layout->nb_bands];
+ for (i = 0; i < len; i++) {
+ dst[i] = src[layout->dst_table[tx_type][i][1]*stride + layout->dst_table[tx_type][i][0]];
+ }
+}
+
+/** Perform a single stage of conversion from a vector in coding scan
+ order back into a coefficient block in raster order
+ *
+ * @param [in] layout scan order specification
+ * @param [out] dst destination coefficient block
+ * @param [in] src source vector
+ * @param [in] stride destination coefficient block row stride
+ */
+static void od_raster_from_band(const band_layout *layout, tran_low_t *dst,
+ int stride, TX_TYPE tx_type, const tran_low_t *src) {
+ int i;
+ int len;
+ len = layout->band_offsets[layout->nb_bands];
+ for (i = 0; i < len; i++) {
+ dst[layout->dst_table[tx_type][i][1]*stride + layout->dst_table[tx_type][i][0]] = src[i];
+ }
+}
+
+static const band_layout *const OD_LAYOUTS[] = {&OD_LAYOUT4, &OD_LAYOUT8,
+ &OD_LAYOUT16, &OD_LAYOUT32};
+
+/** Converts a coefficient block in raster order into a vector in
+ * coding scan order with the PVQ partitions laid out one after
+ * another. This works in stages; the 4x4 conversion is applied to
+ * the coefficients nearest DC, then the 8x8 applied to the 8x8 block
+ * nearest DC that was not already coded by 4x4, then 16x16 following
+ * the same pattern.
+ *
+ * @param [out] dst destination vector
+ * @param [in] n block size (along one side)
+ * @param [in] ty_type transform type
+ * @param [in] src source coefficient block
+ * @param [in] stride source coefficient block row stride
+ */
+void od_raster_to_coding_order(tran_low_t *dst, int n, TX_TYPE ty_type,
+ const tran_low_t *src, int stride) {
+ int bs;
+ /* dst + 1 because DC is not included for 4x4 blocks. */
+ od_band_from_raster(OD_LAYOUTS[0], dst + 1, src, stride, ty_type);
+ for (bs = 1; bs < OD_TXSIZES; bs++) {
+ int size;
+ int offset;
+ /* Length of block size > 4. */
+ size = 1 << (OD_LOG_BSIZE0 + bs);
+ /* Offset is the size of the previous block squared. */
+ offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs);
+ if (n >= size) {
+ /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */
+ od_band_from_raster(OD_LAYOUTS[bs], dst + offset, src, stride, ty_type);
+ }
+ }
+ dst[0] = src[0];
+}
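+
+/* Round-trip sketch (illustrative only, not part of the original file):
+     tran_low_t block[8*8], vec[8*8];
+     od_raster_to_coding_order(vec, 8, DCT_DCT, block, 8);
+     od_coding_order_to_raster(block, 8, DCT_DCT, vec, 8);
+   leaves block unchanged, since the two functions apply inverse mappings. */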
+
+/** Converts a vector in coding scan order with the PVQ partitions
+ * laid out one after another into a coefficient block in raster
+ * order. This works in stages in the reverse order of raster->scan
+ * order; the 16x16 conversion is applied to the coefficients that
+ * don't appear in an 8x8 block, then the 8x8 applied to the 8x8 block
+ * sans the 4x4 block it contains, then 4x4 is converted sans DC.
+ *
+ * @param [out] dst destination coefficient block
+ * @param [in] stride destination coefficient block row stride
+ * @param [in] src source vector
+ * @param [in] n block size (along one side)
+ */
+void od_coding_order_to_raster(tran_low_t *dst, int stride, TX_TYPE ty_type,
+ const tran_low_t *src, int n) {
+ int bs;
+ /* src + 1 because DC is not included for 4x4 blocks. */
+ od_raster_from_band(OD_LAYOUTS[0], dst, stride, ty_type, src + 1);
+ for (bs = 1; bs < OD_TXSIZES; bs++) {
+ int size;
+ int offset;
+ /* Length of block size > 4 */
+ size = 1 << (OD_LOG_BSIZE0 + bs);
+ /* Offset is the size of the previous block squared. */
+ offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs);
+ if (n >= size) {
+ /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */
+ od_raster_from_band(OD_LAYOUTS[bs], dst, stride, ty_type, src + offset);
+ }
+ }
+ dst[0] = src[0];
+}
+
+/** Perform a single stage of conversion from a coefficient block in
+ * raster order into coding scan order
+ *
+ * @param [in] layout scan order specification
+ * @param [out] dst destination vector
+ * @param [in] src source coefficient block
+ * @param [in] stride source coefficient block row stride
+ */
+static void od_band_from_raster_16(const band_layout *layout, int16_t *dst,
+ const int16_t *src, int stride) {
+ int i;
+ int len;
+ len = layout->band_offsets[layout->nb_bands];
+ for (i = 0; i < len; i++) {
+ dst[i] = src[layout->dst_table[DCT_DCT][i][1]*stride + layout->dst_table[DCT_DCT][i][0]];
+ }
+}
+
+/** Converts a coefficient block in raster order into a vector in
+ * coding scan order with the PVQ partitions laid out one after
+ * another. This works in stages; the 4x4 conversion is applied to
+ * the coefficients nearest DC, then the 8x8 applied to the 8x8 block
+ * nearest DC that was not already coded by 4x4, then 16x16 following
+ * the same pattern.
+ *
+ * @param [out] dst destination vector
+ * @param [in] n block size (along one side)
+ * @param [in] src source coefficient block
+ * @param [in] stride source coefficient block row stride
+ */
+void od_raster_to_coding_order_16(int16_t *dst, int n, const int16_t *src,
+ int stride) {
+ int bs;
+ /* dst + 1 because DC is not included for 4x4 blocks. */
+ od_band_from_raster_16(OD_LAYOUTS[0], dst + 1, src, stride);
+ for (bs = 1; bs < OD_TXSIZES; bs++) {
+ int size;
+ int offset;
+ /* Length of block size > 4. */
+ size = 1 << (OD_LOG_BSIZE0 + bs);
+ /* Offset is the size of the previous block squared. */
+ offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs);
+ if (n >= size) {
+ /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */
+ od_band_from_raster_16(OD_LAYOUTS[bs], dst + offset, src, stride);
+ }
+ }
+ dst[0] = src[0];
+}
diff --git a/third_party/aom/av1/common/partition.h b/third_party/aom/av1/common/partition.h
new file mode 100644
index 000000000..bd308f94f
--- /dev/null
+++ b/third_party/aom/av1/common/partition.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#if !defined(_partition_H)
+# define _partition_H
+
+#include "av1/common/enums.h"
+#include "odintrin.h"
+
+typedef unsigned char index_pair[2];
+
+typedef struct {
+ const index_pair **const dst_table;
+ int size;
+ int nb_bands;
+ const int *const band_offsets;
+} band_layout;
+
+extern const int *const OD_BAND_OFFSETS[OD_TXSIZES + 1];
+
+void od_raster_to_coding_order(tran_low_t *dst, int n, TX_TYPE ty_type,
+ const tran_low_t *src, int stride);
+
+void od_coding_order_to_raster(tran_low_t *dst, int stride, TX_TYPE ty_type,
+ const tran_low_t *src, int n);
+
+void od_raster_to_coding_order_16(int16_t *dst, int n, const int16_t *src,
+ int stride);
+
+#endif
diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c
new file mode 100644
index 000000000..905dd3afe
--- /dev/null
+++ b/third_party/aom/av1/common/pred_common.c
@@ -0,0 +1,1408 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#if CONFIG_EXT_INTRA
+#include "av1/common/reconintra.h"
+#endif // CONFIG_EXT_INTRA
+#include "av1/common/seg_common.h"
+
+// Returns a context number for the given MB prediction signal
+#if CONFIG_DUAL_FILTER
+static InterpFilter get_ref_filter_type(const MODE_INFO *mi,
+ const MACROBLOCKD *xd, int dir,
+ MV_REFERENCE_FRAME ref_frame) {
+ InterpFilter ref_type = SWITCHABLE_FILTERS;
+ const MB_MODE_INFO *ref_mbmi = &mi->mbmi;
+ int use_subpel[2] = {
+ has_subpel_mv_component(mi, xd, dir),
+ has_subpel_mv_component(mi, xd, dir + 2),
+ };
+
+ if (ref_mbmi->ref_frame[0] == ref_frame && use_subpel[0])
+ ref_type = ref_mbmi->interp_filter[(dir & 0x01)];
+ else if (ref_mbmi->ref_frame[1] == ref_frame && use_subpel[1])
+ ref_type = ref_mbmi->interp_filter[(dir & 0x01) + 2];
+
+ return ref_type;
+}
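+
+/* Sketch: with CONFIG_DUAL_FILTER, interp_filter[] holds four entries (two
+ * filter directions for ref_frame[0] followed by two for ref_frame[1]), so
+ * (dir & 0x01) picks the direction and the +2 offset picks the second
+ * reference, matching the indexing above. */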
+
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int ctx_offset =
+ (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
+ MV_REFERENCE_FRAME ref_frame =
+ (dir < 2) ? mbmi->ref_frame[0] : mbmi->ref_frame[1];
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
+ int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET;
+ int left_type = SWITCHABLE_FILTERS;
+ int above_type = SWITCHABLE_FILTERS;
+
+ if (xd->left_available)
+ left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame);
+
+ if (xd->up_available)
+ above_type =
+ get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame);
+
+ if (left_type == above_type) {
+ filter_type_ctx += left_type;
+ } else if (left_type == SWITCHABLE_FILTERS) {
+ assert(above_type != SWITCHABLE_FILTERS);
+ filter_type_ctx += above_type;
+ } else if (above_type == SWITCHABLE_FILTERS) {
+ assert(left_type != SWITCHABLE_FILTERS);
+ filter_type_ctx += left_type;
+ } else {
+ filter_type_ctx += SWITCHABLE_FILTERS;
+ }
+
+ return filter_type_ctx;
+}
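+
+/* Worked example (sketch): for a compound block, ctx_offset ==
+ * INTER_FILTER_COMP_OFFSET, and querying dir == 1 adds
+ * INTER_FILTER_DIR_OFFSET. If exactly one neighbor has a known filter type,
+ * that type is added; if both are known but disagree, SWITCHABLE_FILTERS
+ * marks the no-consensus bucket. */
+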
+#else
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int left_type = xd->left_available && is_inter_block(left_mbmi)
+ ? left_mbmi->interp_filter
+ : SWITCHABLE_FILTERS;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const int above_type = xd->up_available && is_inter_block(above_mbmi)
+ ? above_mbmi->interp_filter
+ : SWITCHABLE_FILTERS;
+
+ if (left_type == above_type) {
+ return left_type;
+ } else if (left_type == SWITCHABLE_FILTERS) {
+ assert(above_type != SWITCHABLE_FILTERS);
+ return above_type;
+ } else if (above_type == SWITCHABLE_FILTERS) {
+ assert(left_type != SWITCHABLE_FILTERS);
+ return left_type;
+ } else {
+ return SWITCHABLE_FILTERS;
+ }
+}
+#endif
+
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+// Obtain the reference filter type from the above/left neighbor blocks.
+static INTRA_FILTER get_ref_intra_filter(const MB_MODE_INFO *ref_mbmi) {
+ INTRA_FILTER ref_type = INTRA_FILTERS;
+
+ if (ref_mbmi->sb_type >= BLOCK_8X8) {
+ const PREDICTION_MODE mode = ref_mbmi->mode;
+ if (is_inter_block(ref_mbmi)) {
+#if CONFIG_DUAL_FILTER
+ switch (ref_mbmi->interp_filter[0]) {
+#else
+ switch (ref_mbmi->interp_filter) {
+#endif
+ case EIGHTTAP_REGULAR: ref_type = INTRA_FILTER_8TAP; break;
+ case EIGHTTAP_SMOOTH: ref_type = INTRA_FILTER_8TAP_SMOOTH; break;
+ case MULTITAP_SHARP: ref_type = INTRA_FILTER_8TAP_SHARP; break;
+ case BILINEAR: ref_type = INTRA_FILTERS; break;
+ default: break;
+ }
+ } else {
+ if (av1_is_directional_mode(mode, ref_mbmi->sb_type)) {
+ const int p_angle =
+ mode_to_angle_map[mode] + ref_mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle)) {
+ ref_type = ref_mbmi->intra_filter;
+ }
+ }
+ }
+ }
+ return ref_type;
+}
+
+int av1_get_pred_context_intra_interp(const MACROBLOCKD *xd) {
+ int left_type = INTRA_FILTERS, above_type = INTRA_FILTERS;
+
+ if (xd->left_available) left_type = get_ref_intra_filter(xd->left_mbmi);
+
+ if (xd->up_available) above_type = get_ref_intra_filter(xd->above_mbmi);
+
+ if (left_type == above_type)
+ return left_type;
+ else if (left_type == INTRA_FILTERS && above_type != INTRA_FILTERS)
+ return above_type;
+ else if (left_type != INTRA_FILTERS && above_type == INTRA_FILTERS)
+ return left_type;
+ else
+ return INTRA_FILTERS;
+}
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real macroblocks.
+// The prediction flags in these dummy entries are initialized to 0.
+// 0 - inter/inter, inter/--, --/inter, --/--
+// 1 - intra/inter, inter/intra
+// 2 - intra/--, --/intra
+// 3 - intra/intra
+int av1_get_intra_inter_context(const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ if (has_above && has_left) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+ return left_intra && above_intra ? 3 : left_intra || above_intra;
+ } else if (has_above || has_left) { // one edge available
+ return 2 * !is_inter_block(has_above ? above_mbmi : left_mbmi);
+ } else {
+ return 0;
+ }
+}
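+
+// Worked example (sketch): with an intra block to the left and an inter block
+// above, left_intra == 1 and above_intra == 0, so the function returns
+// (left_intra || above_intra) == 1, the intra/inter row of the table above.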
+
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+// The compound/single mode info data structure has a one element border above
+// and to the left of the entries corresponding to real macroblocks.
+// The prediction flags in these dummy entries are initialized to 0.
+// 0 - single/single
+// 1 - single/--, --/single, --/--
+// 2 - single/comp, comp/single
+// 3 - comp/comp, comp/--, --/comp
+int av1_get_inter_mode_context(const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ if (has_above && has_left) { // both edges available (0/2/3)
+ const int above_inter_comp_mode = is_inter_compound_mode(above_mbmi->mode);
+ const int left_inter_comp_mode = is_inter_compound_mode(left_mbmi->mode);
+ return (above_inter_comp_mode && left_inter_comp_mode)
+ ? 3
+ : (above_inter_comp_mode || left_inter_comp_mode) * 2;
+ } else if (has_above || has_left) { // one edge available (1/3)
+ const MB_MODE_INFO *const edge_mbmi = has_above ? above_mbmi : left_mbmi;
+ return is_inter_compound_mode(edge_mbmi->mode) ? 3 : 1;
+ } else { // no edge available (1)
+ return 1;
+ }
+}
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+
+#if CONFIG_EXT_REFS
+#define CHECK_BACKWARD_REFS(ref_frame) \
+ (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
+#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
+#else
+#define IS_BACKWARD_REF_FRAME(ref_frame) ((ref_frame) == cm->comp_fixed_ref)
+#endif // CONFIG_EXT_REFS
+
+int av1_get_reference_mode_context(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ int ctx;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+#if CONFIG_EXT_REFS
+ (void)cm;
+#endif // CONFIG_EXT_REFS
+
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
+ if (has_above && has_left) { // both edges available
+ if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
+ // neither edge uses comp pred (0/1)
+ ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^
+ IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]);
+ else if (!has_second_ref(above_mbmi))
+ // one of two edges uses comp pred (2/3)
+ ctx = 2 + (IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ||
+ !is_inter_block(above_mbmi));
+ else if (!has_second_ref(left_mbmi))
+ // one of two edges uses comp pred (2/3)
+ ctx = 2 + (IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]) ||
+ !is_inter_block(left_mbmi));
+ else // both edges use comp pred (4)
+ ctx = 4;
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+ if (!has_second_ref(edge_mbmi))
+ // edge does not use comp pred (0/1)
+ ctx = IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]);
+ else
+ // edge uses comp pred (3)
+ ctx = 3;
+ } else { // no edges available (1)
+ ctx = 1;
+ }
+ assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS);
+ return ctx;
+}
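+
+// Worked example (sketch): when both neighbors use single prediction, the
+// context is the XOR of their IS_BACKWARD_REF_FRAME() results: 0 when both
+// ref_frame[0] values fall on the same side, 1 when they differ; comp-pred
+// neighbors move the context into the 2..4 range as annotated above.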
+
+#if CONFIG_EXT_REFS
+
+// TODO(zoeliu): Future work will be conducted to optimize the context design
+// for the coding of the reference frames.
+
+#define CHECK_LAST_OR_LAST2(ref_frame) \
+ ((ref_frame == LAST_FRAME) || (ref_frame == LAST2_FRAME))
+
+#define CHECK_GOLDEN_OR_LAST3(ref_frame) \
+ ((ref_frame == GOLDEN_FRAME) || (ref_frame == LAST3_FRAME))
+
+// Returns a context number for the given MB prediction signal
+// Signals whether the first reference frame for a compound mode is
+// GOLDEN/LAST3 or LAST/LAST2.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is either
+// GOLDEN_FRAME or LAST3_FRAME.
+#if CONFIG_LOWDELAY_COMPOUND
+int av1_get_pred_context_comp_ref_p(UNUSED const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+#else
+int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+#endif
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
+
+// Note:
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real macroblocks.
+// The prediction flags in these dummy entries are initialized to 0.
+#if CONFIG_LOWDELAY_COMPOUND // No change to bitstream
+ // Code seems to assume that signbias of cm->comp_bwd_ref[0] is always 1
+ const int bwd_ref_sign_idx = 1;
+#else
+ const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+#endif
+ const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
+
+ if (above_in_image && left_in_image) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra (2)
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+ if (!has_second_ref(edge_mbmi)) // single pred (1/3)
+ pred_context =
+ 1 + 2 * (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]));
+ else // comp pred (1/3)
+ pred_context = 1 +
+ 2 * (!CHECK_GOLDEN_OR_LAST3(
+ edge_mbmi->ref_frame[fwd_ref_sign_idx]));
+ } else { // inter/inter
+ const int l_sg = !has_second_ref(left_mbmi);
+ const int a_sg = !has_second_ref(above_mbmi);
+ const MV_REFERENCE_FRAME frfa =
+ a_sg ? above_mbmi->ref_frame[0]
+ : above_mbmi->ref_frame[fwd_ref_sign_idx];
+ const MV_REFERENCE_FRAME frfl =
+ l_sg ? left_mbmi->ref_frame[0]
+ : left_mbmi->ref_frame[fwd_ref_sign_idx];
+
+ if (frfa == frfl && CHECK_GOLDEN_OR_LAST3(frfa)) {
+ pred_context = 0;
+ } else if (l_sg && a_sg) { // single/single
+ if ((CHECK_BACKWARD_REFS(frfa) && CHECK_LAST_OR_LAST2(frfl)) ||
+ (CHECK_BACKWARD_REFS(frfl) && CHECK_LAST_OR_LAST2(frfa))) {
+ pred_context = 4;
+ } else if (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl)) {
+ pred_context = 1;
+ } else {
+ pred_context = 3;
+ }
+ } else if (l_sg || a_sg) { // single/comp
+ const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl;
+ const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl;
+
+ if (CHECK_GOLDEN_OR_LAST3(frfc) && !CHECK_GOLDEN_OR_LAST3(rfs))
+ pred_context = 1;
+ else if (CHECK_GOLDEN_OR_LAST3(rfs) && !CHECK_GOLDEN_OR_LAST3(frfc))
+ pred_context = 2;
+ else
+ pred_context = 4;
+ } else { // comp/comp
+ if ((CHECK_LAST_OR_LAST2(frfa) && CHECK_LAST_OR_LAST2(frfl))) {
+ pred_context = 4;
+ } else {
+ // NOTE(zoeliu): Following assert may be removed once confirmed.
+ assert(CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl));
+ pred_context = 2;
+ }
+ }
+ }
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi)) {
+ pred_context = 2;
+ } else {
+ if (has_second_ref(edge_mbmi))
+ pred_context =
+ 4 *
+ (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[fwd_ref_sign_idx]));
+ else
+ pred_context = 3 * (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]));
+ }
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+ return pred_context;
+}
+
+// Returns a context number for the given MB prediction signal
+// Signals whether the first reference frame for a compound mode is LAST,
+// given that it is known to be either LAST or LAST2.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is LAST_FRAME,
+// conditioned on it being either LAST_FRAME or LAST2_FRAME.
+#if CONFIG_LOWDELAY_COMPOUND
+int av1_get_pred_context_comp_ref_p1(UNUSED const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+#else
+int av1_get_pred_context_comp_ref_p1(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+#endif
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
+
+// Note:
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real macroblocks.
+// The prediction flags in these dummy entries are initialized to 0.
+#if CONFIG_LOWDELAY_COMPOUND // No change to bitstream
+ // Code seems to assume that signbias of cm->comp_bwd_ref[0] is always 1
+ const int bwd_ref_sign_idx = 1;
+#else
+ const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+#endif
+ const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
+
+ if (above_in_image && left_in_image) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra (2)
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+ if (!has_second_ref(edge_mbmi)) // single pred (1/3)
+ pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != LAST_FRAME);
+ else // comp pred (1/3)
+ pred_context =
+ 1 + 2 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != LAST_FRAME);
+ } else { // inter/inter
+ const int l_sg = !has_second_ref(left_mbmi);
+ const int a_sg = !has_second_ref(above_mbmi);
+ const MV_REFERENCE_FRAME frfa =
+ a_sg ? above_mbmi->ref_frame[0]
+ : above_mbmi->ref_frame[fwd_ref_sign_idx];
+ const MV_REFERENCE_FRAME frfl =
+ l_sg ? left_mbmi->ref_frame[0]
+ : left_mbmi->ref_frame[fwd_ref_sign_idx];
+
+ if (frfa == frfl && frfa == LAST_FRAME)
+ pred_context = 0;
+ else if (l_sg && a_sg) { // single/single
+ if (frfa == LAST_FRAME || frfl == LAST_FRAME)
+ pred_context = 1;
+ else if (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl))
+ pred_context = 2 + (frfa != frfl);
+ else if (frfa == frfl ||
+ (CHECK_BACKWARD_REFS(frfa) && CHECK_BACKWARD_REFS(frfl)))
+ pred_context = 3;
+ else
+ pred_context = 4;
+ } else if (l_sg || a_sg) { // single/comp
+ const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl;
+ const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl;
+
+ if (frfc == LAST_FRAME && rfs != LAST_FRAME)
+ pred_context = 1;
+ else if (rfs == LAST_FRAME && frfc != LAST_FRAME)
+ pred_context = 2;
+ else
+ pred_context =
+ 3 + (frfc == LAST2_FRAME || CHECK_GOLDEN_OR_LAST3(rfs));
+ } else { // comp/comp
+ if (frfa == LAST_FRAME || frfl == LAST_FRAME)
+ pred_context = 2;
+ else
+ pred_context =
+ 3 + (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl));
+ }
+ }
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi)) {
+ pred_context = 2;
+ } else {
+ if (has_second_ref(edge_mbmi)) {
+ pred_context =
+ 4 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != LAST_FRAME);
+ } else {
+ if (edge_mbmi->ref_frame[0] == LAST_FRAME)
+ pred_context = 0;
+ else
+ pred_context = 2 + CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]);
+ }
+ }
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+ return pred_context;
+}
+
+// Returns a context number for the given MB prediction signal
+// Signals whether the first reference frame for a compound mode is GOLDEN,
+// given that it is known to be either GOLDEN or LAST3.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is GOLDEN_FRAME,
+// conditioned on it being either GOLDEN or LAST3.
+#if CONFIG_LOWDELAY_COMPOUND
+int av1_get_pred_context_comp_ref_p2(UNUSED const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+#else
+int av1_get_pred_context_comp_ref_p2(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+#endif
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
+
+// Note:
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real macroblocks.
+// The prediction flags in these dummy entries are initialized to 0.
+#if CONFIG_LOWDELAY_COMPOUND // No change to bitstream
+ // Code seems to assume that signbias of cm->comp_bwd_ref[0] is always 1
+ const int bwd_ref_sign_idx = 1;
+#else
+ const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+#endif
+ const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
+
+ if (above_in_image && left_in_image) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra (2)
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+ if (!has_second_ref(edge_mbmi)) // single pred (1/3)
+ pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != GOLDEN_FRAME);
+ else // comp pred (1/3)
+ pred_context =
+ 1 + 2 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != GOLDEN_FRAME);
+ } else { // inter/inter
+ const int l_sg = !has_second_ref(left_mbmi);
+ const int a_sg = !has_second_ref(above_mbmi);
+ const MV_REFERENCE_FRAME frfa =
+ a_sg ? above_mbmi->ref_frame[0]
+ : above_mbmi->ref_frame[fwd_ref_sign_idx];
+ const MV_REFERENCE_FRAME frfl =
+ l_sg ? left_mbmi->ref_frame[0]
+ : left_mbmi->ref_frame[fwd_ref_sign_idx];
+
+ if (frfa == frfl && frfa == GOLDEN_FRAME)
+ pred_context = 0;
+ else if (l_sg && a_sg) { // single/single
+ if (frfa == GOLDEN_FRAME || frfl == GOLDEN_FRAME)
+ pred_context = 1;
+ else if (CHECK_LAST_OR_LAST2(frfa) || CHECK_LAST_OR_LAST2(frfl))
+ pred_context = 2 + (frfa != frfl);
+ else if (frfa == frfl ||
+ (CHECK_BACKWARD_REFS(frfa) && CHECK_BACKWARD_REFS(frfl)))
+ pred_context = 3;
+ else
+ pred_context = 4;
+ } else if (l_sg || a_sg) { // single/comp
+ const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl;
+ const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl;
+
+ if (frfc == GOLDEN_FRAME && rfs != GOLDEN_FRAME)
+ pred_context = 1;
+ else if (rfs == GOLDEN_FRAME && frfc != GOLDEN_FRAME)
+ pred_context = 2;
+ else
+ pred_context = 3 + (frfc == LAST3_FRAME || CHECK_LAST_OR_LAST2(rfs));
+ } else { // comp/comp
+ if (frfa == GOLDEN_FRAME || frfl == GOLDEN_FRAME)
+ pred_context = 2;
+ else
+ pred_context =
+ 3 + (CHECK_LAST_OR_LAST2(frfa) || CHECK_LAST_OR_LAST2(frfl));
+ }
+ }
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi)) {
+ pred_context = 2;
+ } else {
+ if (has_second_ref(edge_mbmi)) {
+ pred_context =
+ 4 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != GOLDEN_FRAME);
+ } else {
+ if (edge_mbmi->ref_frame[0] == GOLDEN_FRAME)
+ pred_context = 0;
+ else
+ pred_context = 2 + CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
+ }
+ }
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+ return pred_context;
+}
+
+// Returns a context number for the given MB prediction signal
+#if CONFIG_LOWDELAY_COMPOUND
+int av1_get_pred_context_comp_bwdref_p(UNUSED const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+#else
+int av1_get_pred_context_comp_bwdref_p(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+#endif
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
+
+// Note:
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real macroblocks.
+// The prediction flags in these dummy entries are initialized to 0.
+#if CONFIG_LOWDELAY_COMPOUND // No change to bitstream
+ // Code seems to assume that signbias of cm->comp_bwd_ref[0] is always 1
+ const int bwd_ref_sign_idx = 1;
+#else
+ const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+#endif
+ const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
+
+ if (above_in_image && left_in_image) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra (2)
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+ if (!has_second_ref(edge_mbmi)) // single pred (1/3)
+ pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_bwd_ref[1]);
+ else // comp pred (1/3)
+ pred_context =
+ 1 +
+ 2 * (edge_mbmi->ref_frame[bwd_ref_sign_idx] != cm->comp_bwd_ref[1]);
+ } else { // inter/inter
+ const int l_comp = has_second_ref(left_mbmi);
+ const int a_comp = has_second_ref(above_mbmi);
+
+ const MV_REFERENCE_FRAME l_brf =
+ l_comp ? left_mbmi->ref_frame[bwd_ref_sign_idx] : NONE_FRAME;
+ const MV_REFERENCE_FRAME a_brf =
+ a_comp ? above_mbmi->ref_frame[bwd_ref_sign_idx] : NONE_FRAME;
+
+ const MV_REFERENCE_FRAME l_frf =
+ !l_comp ? left_mbmi->ref_frame[0]
+ : left_mbmi->ref_frame[fwd_ref_sign_idx];
+ const MV_REFERENCE_FRAME a_frf =
+ !a_comp ? above_mbmi->ref_frame[0]
+ : above_mbmi->ref_frame[fwd_ref_sign_idx];
+
+ if (l_comp && a_comp) { // comp/comp
+ if (l_brf == a_brf && l_brf == cm->comp_bwd_ref[1]) {
+ pred_context = 0;
+ } else if (l_brf == cm->comp_bwd_ref[1] ||
+ a_brf == cm->comp_bwd_ref[1]) {
+ pred_context = 1;
+ } else {
+ // NOTE: Backward ref should be either BWDREF or ALTREF.
+ assert(l_brf == a_brf && l_brf != cm->comp_bwd_ref[1]);
+ pred_context = 3;
+ }
+ } else if (!l_comp && !a_comp) { // single/single
+ if (l_frf == a_frf && l_frf == cm->comp_bwd_ref[1]) {
+ pred_context = 0;
+ } else if (l_frf == cm->comp_bwd_ref[1] ||
+ a_frf == cm->comp_bwd_ref[1]) {
+ pred_context = 1;
+ } else if (l_frf == a_frf) {
+ pred_context = 3;
+ } else {
+ assert(l_frf != a_frf && l_frf != cm->comp_bwd_ref[1] &&
+ a_frf != cm->comp_bwd_ref[1]);
+ pred_context = 4;
+ }
+ } else { // comp/single
+ assert((l_comp && !a_comp) || (!l_comp && a_comp));
+
+ if ((l_comp && l_brf == cm->comp_bwd_ref[1] &&
+ a_frf == cm->comp_bwd_ref[1]) ||
+ (a_comp && a_brf == cm->comp_bwd_ref[1] &&
+ l_frf == cm->comp_bwd_ref[1])) {
+ pred_context = 1;
+ } else if ((l_comp && l_brf == cm->comp_bwd_ref[1]) ||
+ (a_comp && a_brf == cm->comp_bwd_ref[1]) ||
+ (!l_comp && l_frf == cm->comp_bwd_ref[1]) ||
+ (!a_comp && a_frf == cm->comp_bwd_ref[1])) {
+ pred_context = 2;
+ } else {
+ pred_context = 4;
+ }
+ }
+ }
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi)) {
+ pred_context = 2;
+ } else {
+ if (has_second_ref(edge_mbmi)) {
+ pred_context =
+ 4 * (edge_mbmi->ref_frame[bwd_ref_sign_idx] != cm->comp_bwd_ref[1]);
+ } else {
+ pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_bwd_ref[1]);
+ }
+ }
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+ return pred_context;
+}
+
+#else // CONFIG_EXT_REFS
+
+// Returns a context number for the given MB prediction signal
+int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
+
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
+ const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+ const int var_ref_idx = !fix_ref_idx;
+
+ if (above_in_image && left_in_image) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra (2)
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+ if (!has_second_ref(edge_mbmi)) // single pred (1/3)
+ pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+ else // comp pred (1/3)
+ pred_context =
+ 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx] != cm->comp_var_ref[1]);
+ } else { // inter/inter
+ const int l_sg = !has_second_ref(left_mbmi);
+ const int a_sg = !has_second_ref(above_mbmi);
+ const MV_REFERENCE_FRAME vrfa =
+ a_sg ? above_mbmi->ref_frame[0] : above_mbmi->ref_frame[var_ref_idx];
+ const MV_REFERENCE_FRAME vrfl =
+ l_sg ? left_mbmi->ref_frame[0] : left_mbmi->ref_frame[var_ref_idx];
+
+ if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) {
+ pred_context = 0;
+ } else if (l_sg && a_sg) { // single/single
+ if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) ||
+ (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0]))
+ pred_context = 4;
+ else if (vrfa == vrfl)
+ pred_context = 3;
+ else
+ pred_context = 1;
+ } else if (l_sg || a_sg) { // single/comp
+ const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
+ const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
+ if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1])
+ pred_context = 1;
+ else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1])
+ pred_context = 2;
+ else
+ pred_context = 4;
+ } else if (vrfa == vrfl) { // comp/comp
+ pred_context = 4;
+ } else {
+ pred_context = 2;
+ }
+ }
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi)) {
+ pred_context = 2;
+ } else {
+ if (has_second_ref(edge_mbmi))
+ pred_context =
+ 4 * (edge_mbmi->ref_frame[var_ref_idx] != cm->comp_var_ref[1]);
+ else
+ pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+ }
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+ return pred_context;
+}
+
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+
+// For the bit to signal whether the single reference is a backward reference
+// frame, i.e. either BWDREF_FRAME or ALTREF_FRAME, rather than a forward one.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is ALTREF/BWDREF.
+int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ // Note:
+ // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+ if (has_above && has_left) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter or inter/intra
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+ if (!has_second_ref(edge_mbmi)) // single
+ pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]));
+ else // comp
+ pred_context = 2;
+ } else { // inter/inter
+ const int above_has_second = has_second_ref(above_mbmi);
+ const int left_has_second = has_second_ref(left_mbmi);
+
+ const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+
+ if (above_has_second && left_has_second) { // comp/comp
+ pred_context = 2;
+ } else if (above_has_second || left_has_second) { // single/comp
+ const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+
+ pred_context = (!CHECK_BACKWARD_REFS(rfs)) ? 4 : 1;
+ } else { // single/single
+ pred_context = 2 * (!CHECK_BACKWARD_REFS(above0)) +
+ 2 * (!CHECK_BACKWARD_REFS(left0));
+ }
+ }
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+ if (!is_inter_block(edge_mbmi)) { // intra
+ pred_context = 2;
+ } else { // inter
+ if (!has_second_ref(edge_mbmi)) // single
+ pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]));
+ else // comp
+ pred_context = 2;
+ }
+ } else { // no edges available
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+// For the bit to signal whether the single reference is ALTREF_FRAME or
+// BWDREF_FRAME, knowing that it is one of these two choices.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is ALTREF_FRAME, conditioned
+// on it being either ALTREF_FRAME or BWDREF_FRAME.
+int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ // Note:
+ // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+ if (has_above && has_left) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter or inter/intra
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+ if (!has_second_ref(edge_mbmi)) { // single
+ if (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]))
+ pred_context = 3;
+ else
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
+ } else { // comp
+ pred_context = 1 +
+ 2 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
+ edge_mbmi->ref_frame[1] == BWDREF_FRAME);
+ }
+ } else { // inter/inter
+ const int above_has_second = has_second_ref(above_mbmi);
+ const int left_has_second = has_second_ref(left_mbmi);
+ const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+ const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+ if (above_has_second && left_has_second) { // comp/comp
+ if (above0 == left0 && above1 == left1)
+ pred_context =
+ 3 * (above0 == BWDREF_FRAME || above1 == BWDREF_FRAME ||
+ left0 == BWDREF_FRAME || left1 == BWDREF_FRAME);
+ else
+ pred_context = 2;
+ } else if (above_has_second || left_has_second) { // single/comp
+ const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+ if (rfs == BWDREF_FRAME)
+ pred_context = 3 + (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
+ else if (rfs == ALTREF_FRAME)
+ pred_context = (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
+ else
+ pred_context = 1 + 2 * (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
+ } else { // single/single
+ if (!CHECK_BACKWARD_REFS(above0) && !CHECK_BACKWARD_REFS(left0)) {
+ pred_context = 2 + (above0 == left0);
+ } else if (!CHECK_BACKWARD_REFS(above0) ||
+ !CHECK_BACKWARD_REFS(left0)) {
+ const MV_REFERENCE_FRAME edge0 =
+ !CHECK_BACKWARD_REFS(above0) ? left0 : above0;
+ pred_context = 4 * (edge0 == BWDREF_FRAME);
+ } else {
+ pred_context =
+ 2 * (above0 == BWDREF_FRAME) + 2 * (left0 == BWDREF_FRAME);
+ }
+ }
+ }
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi) ||
+ (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) &&
+ !has_second_ref(edge_mbmi)))
+ pred_context = 2;
+ else if (!has_second_ref(edge_mbmi)) // single
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
+ else // comp
+ pred_context = 3 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
+ edge_mbmi->ref_frame[1] == BWDREF_FRAME);
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+// For the bit to signal whether the single reference is LAST3/GOLDEN or
+// LAST2/LAST, knowing that it is one of these two choices.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is LAST3/GOLDEN, conditioned
+// on it being one of LAST3/GOLDEN/LAST2/LAST.
+int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ // Note:
+ // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+ if (has_above && has_left) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter or inter/intra
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+ if (!has_second_ref(edge_mbmi)) { // single
+ if (CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]))
+ pred_context = 3;
+ else
+ pred_context = 4 * CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
+ } else { // comp
+ pred_context = 1 +
+ 2 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
+ CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
+ }
+ } else { // inter/inter
+ const int above_has_second = has_second_ref(above_mbmi);
+ const int left_has_second = has_second_ref(left_mbmi);
+ const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+ const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+ if (above_has_second && left_has_second) { // comp/comp
+ if (above0 == left0 && above1 == left1)
+ pred_context =
+ 3 * (CHECK_LAST_OR_LAST2(above0) || CHECK_LAST_OR_LAST2(above1) ||
+ CHECK_LAST_OR_LAST2(left0) || CHECK_LAST_OR_LAST2(left1));
+ else
+ pred_context = 2;
+ } else if (above_has_second || left_has_second) { // single/comp
+ const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+ if (CHECK_LAST_OR_LAST2(rfs))
+ pred_context =
+ 3 + (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2));
+ else if (CHECK_GOLDEN_OR_LAST3(rfs))
+ pred_context =
+ (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2));
+ else
+ pred_context =
+ 1 + 2 * (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2));
+ } else { // single/single
+ if (CHECK_BACKWARD_REFS(above0) && CHECK_BACKWARD_REFS(left0)) {
+ pred_context = 2 + (above0 == left0);
+ } else if (CHECK_BACKWARD_REFS(above0) || CHECK_BACKWARD_REFS(left0)) {
+ const MV_REFERENCE_FRAME edge0 =
+ CHECK_BACKWARD_REFS(above0) ? left0 : above0;
+ pred_context = 4 * CHECK_LAST_OR_LAST2(edge0);
+ } else {
+ pred_context =
+ 2 * CHECK_LAST_OR_LAST2(above0) + 2 * CHECK_LAST_OR_LAST2(left0);
+ }
+ }
+ }
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi) ||
+ (CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) &&
+ !has_second_ref(edge_mbmi)))
+ pred_context = 2;
+ else if (!has_second_ref(edge_mbmi)) // single
+ pred_context = 4 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]));
+ else // comp
+ pred_context = 3 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
+ CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+// For the bit to signal whether the single reference is LAST2_FRAME or
+// LAST_FRAME, knowing that it is one of these two choices.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is LAST2_FRAME, conditioned
+// on it being either LAST2_FRAME or LAST_FRAME.
+int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) {
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ // Note:
+ // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+ if (has_above && has_left) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter or inter/intra
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+ if (!has_second_ref(edge_mbmi)) { // single
+ if (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]))
+ pred_context = 3;
+ else
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+ } else { // comp
+ pred_context = 1 +
+ 2 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+ edge_mbmi->ref_frame[1] == LAST_FRAME);
+ }
+ } else { // inter/inter
+ const int above_has_second = has_second_ref(above_mbmi);
+ const int left_has_second = has_second_ref(left_mbmi);
+ const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+ const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+ if (above_has_second && left_has_second) { // comp/comp
+ if (above0 == left0 && above1 == left1)
+ pred_context = 3 * (above0 == LAST_FRAME || above1 == LAST_FRAME ||
+ left0 == LAST_FRAME || left1 == LAST_FRAME);
+ else
+ pred_context = 2;
+ } else if (above_has_second || left_has_second) { // single/comp
+ const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+ if (rfs == LAST_FRAME)
+ pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+ else if (rfs == LAST2_FRAME)
+ pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+ else
+ pred_context = 1 + 2 * (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+ } else { // single/single
+ if (!CHECK_LAST_OR_LAST2(above0) && !CHECK_LAST_OR_LAST2(left0)) {
+ pred_context = 2 + (above0 == left0);
+ } else if (!CHECK_LAST_OR_LAST2(above0) ||
+ !CHECK_LAST_OR_LAST2(left0)) {
+ const MV_REFERENCE_FRAME edge0 =
+ !CHECK_LAST_OR_LAST2(above0) ? left0 : above0;
+ pred_context = 4 * (edge0 == LAST_FRAME);
+ } else {
+ pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME);
+ }
+ }
+ }
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi) ||
+ (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) &&
+ !has_second_ref(edge_mbmi)))
+ pred_context = 2;
+ else if (!has_second_ref(edge_mbmi)) // single
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+ else // comp
+ pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+ edge_mbmi->ref_frame[1] == LAST_FRAME);
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+// For the bit to signal whether the single reference is GOLDEN_FRAME or
+// LAST3_FRAME, knowing that it is one of these two choices.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is GOLDEN_FRAME, conditioned
+// on it being either GOLDEN_FRAME or LAST3_FRAME.
+int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) {
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ // Note:
+ // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+ if (has_above && has_left) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter or inter/intra
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+ if (!has_second_ref(edge_mbmi)) { // single
+ if (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]))
+ pred_context = 3;
+ else
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
+ } else { // comp
+ pred_context = 1 +
+ 2 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
+ edge_mbmi->ref_frame[1] == LAST3_FRAME);
+ }
+ } else { // inter/inter
+ const int above_has_second = has_second_ref(above_mbmi);
+ const int left_has_second = has_second_ref(left_mbmi);
+ const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+ const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+ if (above_has_second && left_has_second) { // comp/comp
+ if (above0 == left0 && above1 == left1)
+ pred_context = 3 * (above0 == LAST3_FRAME || above1 == LAST3_FRAME ||
+ left0 == LAST3_FRAME || left1 == LAST3_FRAME);
+ else
+ pred_context = 2;
+ } else if (above_has_second || left_has_second) { // single/comp
+ const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+ if (rfs == LAST3_FRAME)
+ pred_context = 3 + (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
+ else if (rfs == GOLDEN_FRAME)
+ pred_context = (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
+ else
+ pred_context = 1 + 2 * (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
+ } else { // single/single
+ if (!CHECK_GOLDEN_OR_LAST3(above0) && !CHECK_GOLDEN_OR_LAST3(left0)) {
+ pred_context = 2 + (above0 == left0);
+ } else if (!CHECK_GOLDEN_OR_LAST3(above0) ||
+ !CHECK_GOLDEN_OR_LAST3(left0)) {
+ const MV_REFERENCE_FRAME edge0 =
+ !CHECK_GOLDEN_OR_LAST3(above0) ? left0 : above0;
+ pred_context = 4 * (edge0 == LAST3_FRAME);
+ } else {
+ pred_context =
+ 2 * (above0 == LAST3_FRAME) + 2 * (left0 == LAST3_FRAME);
+ }
+ }
+ }
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi) ||
+ (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]) &&
+ !has_second_ref(edge_mbmi)))
+ pred_context = 2;
+ else if (!has_second_ref(edge_mbmi)) // single
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
+ else // comp
+ pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
+ edge_mbmi->ref_frame[1] == LAST3_FRAME);
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
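+
+// Summary sketch of the single-reference signaling tree implied by the
+// comments above (CONFIG_EXT_REFS):
+//   p1: backward {BWDREF, ALTREF} vs forward {LAST, LAST2, LAST3, GOLDEN}
+//   p2: ALTREF vs BWDREF                 (given a backward reference)
+//   p3: {LAST3, GOLDEN} vs {LAST, LAST2} (given a forward reference)
+//   p4: LAST2 vs LAST                    (given LAST or LAST2)
+//   p5: GOLDEN vs LAST3                  (given LAST3 or GOLDEN)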
+
+#else // CONFIG_EXT_REFS
+
+int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
+ if (has_above && has_left) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter or inter/intra
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+ if (!has_second_ref(edge_mbmi))
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+ else
+ pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+ edge_mbmi->ref_frame[1] == LAST_FRAME);
+ } else { // inter/inter
+ const int above_has_second = has_second_ref(above_mbmi);
+ const int left_has_second = has_second_ref(left_mbmi);
+ const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+ const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+ if (above_has_second && left_has_second) {
+ pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME ||
+ left0 == LAST_FRAME || left1 == LAST_FRAME);
+ } else if (above_has_second || left_has_second) {
+ const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+ if (rfs == LAST_FRAME)
+ pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+ else
+ pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+ } else {
+ pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME);
+ }
+ }
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+ if (!is_inter_block(edge_mbmi)) { // intra
+ pred_context = 2;
+ } else { // inter
+ if (!has_second_ref(edge_mbmi))
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+ else
+ pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+ edge_mbmi->ref_frame[1] == LAST_FRAME);
+ }
+ } else { // no edges available
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
+ if (has_above && has_left) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter or inter/intra
+ const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+ if (!has_second_ref(edge_mbmi)) {
+ if (edge_mbmi->ref_frame[0] == LAST_FRAME)
+ pred_context = 3;
+ else
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+ } else {
+ pred_context = 1 +
+ 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
+ }
+ } else { // inter/inter
+ const int above_has_second = has_second_ref(above_mbmi);
+ const int left_has_second = has_second_ref(left_mbmi);
+ const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+ const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+ if (above_has_second && left_has_second) {
+ if (above0 == left0 && above1 == left1)
+ pred_context =
+ 3 * (above0 == GOLDEN_FRAME || above1 == GOLDEN_FRAME ||
+ left0 == GOLDEN_FRAME || left1 == GOLDEN_FRAME);
+ else
+ pred_context = 2;
+ } else if (above_has_second || left_has_second) {
+ const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+ const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+ if (rfs == GOLDEN_FRAME)
+ pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+ else if (rfs != GOLDEN_FRAME && rfs != LAST_FRAME)
+ pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
+ else
+ pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+ } else {
+ if (above0 == LAST_FRAME && left0 == LAST_FRAME) {
+ pred_context = 3;
+ } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) {
+ const MV_REFERENCE_FRAME edge0 =
+ (above0 == LAST_FRAME) ? left0 : above0;
+ pred_context = 4 * (edge0 == GOLDEN_FRAME);
+ } else {
+ pred_context =
+ 2 * (above0 == GOLDEN_FRAME) + 2 * (left0 == GOLDEN_FRAME);
+ }
+ }
+ }
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi) ||
+ (edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi)))
+ pred_context = 2;
+ else if (!has_second_ref(edge_mbmi))
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+ else
+ pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+#endif // CONFIG_EXT_REFS
diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h
new file mode 100644
index 000000000..e16ad70f6
--- /dev/null
+++ b/third_party/aom/av1/common/pred_common.h
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_PRED_COMMON_H_
+#define AV1_COMMON_PRED_COMMON_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE int get_segment_id(const AV1_COMMON *const cm,
+ const uint8_t *segment_ids, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
+ int x, y, segment_id = MAX_SEGMENTS;
+
+ for (y = 0; y < ymis; ++y)
+ for (x = 0; x < xmis; ++x)
+ segment_id =
+ AOMMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
+
+ assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+ return segment_id;
+}
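+
+// Illustrative sketch (hypothetical values): for a block whose mi units carry
+// segment ids { 3, 1, 2, 1 }, the double loop above returns the minimum id
+// present in the region, here 1.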
+
+static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
+ const int above_sip =
+ (above_mi != NULL) ? above_mi->mbmi.seg_id_predicted : 0;
+ const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0;
+
+ return above_sip + left_sip;
+}
+
+static INLINE aom_prob av1_get_pred_prob_seg_id(
+ const struct segmentation_probs *segp, const MACROBLOCKD *xd) {
+ return segp->pred_probs[av1_get_pred_context_seg_id(xd)];
+}
+
+static INLINE int av1_get_skip_context(const MACROBLOCKD *xd) {
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
+ const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0;
+ const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip : 0;
+ return above_skip + left_skip;
+}
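+
+// Sketch: the skip context is the count of available neighbors whose skip
+// flag is set, so it is 0, 1, or 2; e.g. above skipped and left not skipped
+// gives context 1.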
+
+static INLINE aom_prob av1_get_skip_prob(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc->skip_probs[av1_get_skip_context(xd)];
+}
+
+#if CONFIG_DUAL_FILTER
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir);
+#else
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
+#endif
+
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+int av1_get_pred_context_intra_interp(const MACROBLOCKD *xd);
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+
+int av1_get_intra_inter_context(const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_intra_inter_prob(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc->intra_inter_prob[av1_get_intra_inter_context(xd)];
+}
+
+int av1_get_reference_mode_context(const AV1_COMMON *cm, const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_reference_mode_prob(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc->comp_inter_prob[av1_get_reference_mode_context(cm, xd)];
+}
+
+int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_pred_prob_comp_ref_p(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p(cm, xd);
+ return cm->fc->comp_ref_prob[pred_context][0];
+}
+
+#if CONFIG_EXT_REFS
+int av1_get_pred_context_comp_ref_p1(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_pred_prob_comp_ref_p1(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p1(cm, xd);
+ return cm->fc->comp_ref_prob[pred_context][1];
+}
+
+int av1_get_pred_context_comp_ref_p2(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_pred_prob_comp_ref_p2(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p2(cm, xd);
+ return cm->fc->comp_ref_prob[pred_context][2];
+}
+
+int av1_get_pred_context_comp_bwdref_p(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_pred_prob_comp_bwdref_p(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_bwdref_p(cm, xd);
+ return cm->fc->comp_bwdref_prob[pred_context][0];
+}
+#endif // CONFIG_EXT_REFS
+
+int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_pred_prob_single_ref_p1(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p1(xd)][0];
+}
+
+int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_pred_prob_single_ref_p2(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p2(xd)][1];
+}
+
+#if CONFIG_EXT_REFS
+int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_pred_prob_single_ref_p3(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p3(xd)][2];
+}
+
+int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_pred_prob_single_ref_p4(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p4(xd)][3];
+}
+
+int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_pred_prob_single_ref_p5(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p5(xd)][4];
+}
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+int av1_get_inter_mode_context(const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_inter_mode_prob(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc->comp_inter_mode_prob[av1_get_inter_mode_context(xd)];
+}
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+
+// Returns a context number for the given MB prediction signal.
+// The mode info data structure has a one-element border above and to the
+// left of the entries corresponding to real blocks.
+// The prediction flags in these dummy entries are initialized to 0.
+static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
+ const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type];
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+ int above_ctx = (has_above && !above_mbmi->skip)
+ ? (int)txsize_sqr_map[above_mbmi->tx_size]
+ : max_tx_size;
+ int left_ctx = (has_left && !left_mbmi->skip)
+ ? (int)txsize_sqr_map[left_mbmi->tx_size]
+ : max_tx_size;
+
+ if (!has_left) left_ctx = above_ctx;
+
+ if (!has_above) above_ctx = left_ctx;
+#if CONFIG_CB4X4
+ // TODO(jingning): Temporary setup. Will rework this after the cb4x4
+ // framework is up and running.
+ return (above_ctx + left_ctx) > max_tx_size + 1;
+#else
+ return (above_ctx + left_ctx) > max_tx_size;
+#endif
+}
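+// In other words, the tx size context is a single bit: whether the sum of
+// the neighboring (square-mapped) tx size contexts exceeds the maximum tx
+// size allowed for this block (offset by one under CB4X4).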
+
+#if CONFIG_VAR_TX
+static void update_tx_counts(AV1_COMMON *cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int blk_row, int blk_col,
+ TX_SIZE max_tx_size, int ctx) {
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size) {
+ ++xd->counts->tx_size[max_tx_size - TX_8X8][ctx][tx_size];
+ mbmi->tx_size = tx_size;
+ } else {
+ int bsl = b_width_log2_lookup[bsize];
+ int i;
+
+ assert(bsl > 0);
+ --bsl;
+
+ for (i = 0; i < 4; ++i) {
+ const int offsetr = blk_row + ((i >> 1) << bsl);
+ const int offsetc = blk_col + ((i & 0x01) << bsl);
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+ update_tx_counts(cm, xd, mbmi, plane_bsize, (TX_SIZE)(tx_size - 1),
+ offsetr, offsetc, max_tx_size, ctx);
+ }
+ }
+}
+
+static INLINE void inter_block_tx_count_update(AV1_COMMON *cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi,
+ BLOCK_SIZE plane_bsize,
+ int ctx) {
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+ TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+ int bh = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+
+ for (idy = 0; idy < mi_height; idy += bh)
+ for (idx = 0; idx < mi_width; idx += bh)
+ update_tx_counts(cm, xd, mbmi, plane_bsize, max_tx_size, idy, idx,
+ max_tx_size, ctx);
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_PRED_COMMON_H_
diff --git a/third_party/aom/av1/common/pvq.c b/third_party/aom/av1/common/pvq.c
new file mode 100644
index 000000000..75fe761d7
--- /dev/null
+++ b/third_party/aom/av1/common/pvq.c
@@ -0,0 +1,1007 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "odintrin.h"
+#include "partition.h"
+#include "pvq.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Imported from encode.c in daala */
+/* These are the PVQ equivalent of quantization matrices, except that
+ the values are per-band. */
+#define OD_MASKING_DISABLED 0
+#define OD_MASKING_ENABLED 1
+
+const unsigned char OD_LUMA_QM_Q4[2][OD_QM_SIZE] = {
+/* Flat quantization for PSNR. The DC component isn't 16 because the DC
+ magnitude compensation is done here for inter (Haar DC doesn't need it).
+ Masking disabled: */
+ {
+ 16, 16,
+ 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16
+ },
+/* The non-flat AC coefficients compensate for the non-linear scaling caused
+ by activity masking. The values are currently hand-tuned so that the rate
+ of each band remains roughly constant when enabling activity masking
+ on intra.
+ Masking enabled: */
+ {
+ 16, 16,
+ 16, 18, 28, 32,
+ 16, 14, 20, 20, 28, 32,
+ 16, 11, 14, 14, 17, 17, 22, 28
+ }
+};
+
+const unsigned char OD_CHROMA_QM_Q4[2][OD_QM_SIZE] = {
+/* Chroma quantization is different because of the reduced lapping.
+ FIXME: Use the same matrix as luma for 4:4:4.
+ Masking disabled: */
+ {
+ 16, 16,
+ 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16
+ },
+/* The AC part is flat for chroma because it has no activity masking.
+ Masking enabled: */
+ {
+ 16, 16,
+ 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16
+ }
+};
+
+/* No interpolation, always use od_flat_qm_q4, but use a different scale for
+ each plane.
+ FIXME: Add interpolation and properly tune chroma. */
+const od_qm_entry OD_DEFAULT_QMS[2][2][OD_NPLANES_MAX] = {
+ /* Masking disabled */
+ { { { 4, 256, OD_LUMA_QM_Q4[OD_MASKING_DISABLED] },
+ { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_DISABLED] },
+ { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_DISABLED] } },
+ { { 0, 0, NULL},
+ { 0, 0, NULL},
+ { 0, 0, NULL} } },
+ /* Masking enabled */
+ { { { 4, 256, OD_LUMA_QM_Q4[OD_MASKING_ENABLED] },
+ { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_ENABLED] },
+ { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_ENABLED] } },
+ { { 0, 0, NULL},
+ { 0, 0, NULL},
+ { 0, 0, NULL} } }
+};
+
+/* Constants for the beta parameter, which controls how activity masking is
+ used.
+ beta = 1 / (1 - alpha), so when beta is 1, alpha is 0 and activity
+ masking is disabled. When beta is 1.5, activity masking is used. Note that
+ activity masking is neither used for 4x4 blocks nor for chroma. */
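+/* For example, beta = 1.5 corresponds to alpha = 1 - 1/beta = 1/3, while
+   beta = 1 gives alpha = 0, i.e. no masking. */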
+#define OD_BETA(b) OD_QCONST32(b, OD_BETA_SHIFT)
+static const od_val16 OD_PVQ_BETA4_LUMA[1] = {OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA8_LUMA[4] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA16_LUMA[7] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA32_LUMA[10] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.)};
+
+static const od_val16 OD_PVQ_BETA4_LUMA_MASKING[1] = {OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA8_LUMA_MASKING[4] = {OD_BETA(1.5),
+ OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5)};
+static const od_val16 OD_PVQ_BETA16_LUMA_MASKING[7] = {OD_BETA(1.5),
+ OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5),
+ OD_BETA(1.5)};
+static const od_val16 OD_PVQ_BETA32_LUMA_MASKING[10] = {OD_BETA(1.5),
+ OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5),
+ OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5)};
+
+static const od_val16 OD_PVQ_BETA4_CHROMA[1] = {OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA8_CHROMA[4] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA16_CHROMA[7] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA32_CHROMA[10] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.)};
+
+const od_val16 *const OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_TXSIZES + 1] = {
+ {{OD_PVQ_BETA4_LUMA, OD_PVQ_BETA8_LUMA,
+ OD_PVQ_BETA16_LUMA, OD_PVQ_BETA32_LUMA},
+ {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA,
+ OD_PVQ_BETA16_CHROMA, OD_PVQ_BETA32_CHROMA},
+ {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA,
+ OD_PVQ_BETA16_CHROMA, OD_PVQ_BETA32_CHROMA}},
+ {{OD_PVQ_BETA4_LUMA_MASKING, OD_PVQ_BETA8_LUMA_MASKING,
+ OD_PVQ_BETA16_LUMA_MASKING, OD_PVQ_BETA32_LUMA_MASKING},
+ {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA,
+ OD_PVQ_BETA16_CHROMA, OD_PVQ_BETA32_CHROMA},
+ {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA,
+ OD_PVQ_BETA16_CHROMA, OD_PVQ_BETA32_CHROMA}}
+};
+
+
+void od_interp_qm(unsigned char *out, int q, const od_qm_entry *entry1,
+ const od_qm_entry *entry2) {
+ int i;
+ if (entry2 == NULL || entry2->qm_q4 == NULL
+ || q < entry1->interp_q << OD_COEFF_SHIFT) {
+ /* Use entry1. */
+ for (i = 0; i < OD_QM_SIZE; i++) {
+ out[i] = OD_MINI(255, entry1->qm_q4[i]*entry1->scale_q8 >> 8);
+ }
+ }
+ else if (entry1 == NULL || entry1->qm_q4 == NULL
+ || q > entry2->interp_q << OD_COEFF_SHIFT) {
+ /* Use entry2. */
+ for (i = 0; i < OD_QM_SIZE; i++) {
+ out[i] = OD_MINI(255, entry2->qm_q4[i]*entry2->scale_q8 >> 8);
+ }
+ }
+ else {
+ /* Interpolate between entry1 and entry2. The interpolation is linear
+ in terms of log(q) vs log(m*scale). Since the result is ultimately
+ used as a multiplier, this choice makes sense, but we haven't tried
+ other interpolation methods. */
+ double x;
+ const unsigned char *m1;
+ const unsigned char *m2;
+ int q1;
+ int q2;
+ m1 = entry1->qm_q4;
+ m2 = entry2->qm_q4;
+ q1 = entry1->interp_q << OD_COEFF_SHIFT;
+ q2 = entry2->interp_q << OD_COEFF_SHIFT;
+ x = (log(q)-log(q1))/(log(q2)-log(q1));
+ for (i = 0; i < OD_QM_SIZE; i++) {
+ out[i] = OD_MINI(255, (int)floor(.5 + (1./256)*exp(
+ x*log(m2[i]*entry2->scale_q8) + (1 - x)*log(m1[i]*entry1->scale_q8))));
+ }
+ }
+}
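+/* Equivalently, with s1 = entry1->scale_q8 and s2 = entry2->scale_q8, the
+   interpolated entry is
+     out[i] = exp(x*log(m2[i]*s2) + (1 - x)*log(m1[i]*s1))/256
+   (rounded and clamped to 255), i.e. a geometric interpolation between the
+   two scaled matrices weighted by the position of q between q1 and q2. */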
+
+void od_adapt_pvq_ctx_reset(od_pvq_adapt_ctx *state, int is_keyframe) {
+ od_pvq_codeword_ctx *ctx;
+ int i;
+ int pli;
+ int bs;
+ ctx = &state->pvq_codeword_ctx;
+ OD_CDFS_INIT_DYNAMIC(state->pvq_param_model[0].cdf);
+ OD_CDFS_INIT_DYNAMIC(state->pvq_param_model[1].cdf);
+ OD_CDFS_INIT_DYNAMIC(state->pvq_param_model[2].cdf);
+ for (i = 0; i < 2*OD_TXSIZES; i++) {
+ ctx->pvq_adapt[4*i + OD_ADAPT_K_Q8] = 384;
+ ctx->pvq_adapt[4*i + OD_ADAPT_SUM_EX_Q8] = 256;
+ ctx->pvq_adapt[4*i + OD_ADAPT_COUNT_Q8] = 104;
+ ctx->pvq_adapt[4*i + OD_ADAPT_COUNT_EX_Q8] = 128;
+ }
+ OD_CDFS_INIT_DYNAMIC(ctx->pvq_k1_cdf);
+ for (pli = 0; pli < OD_NPLANES_MAX; pli++) {
+ for (bs = 0; bs < OD_TXSIZES; bs++)
+ for (i = 0; i < PVQ_MAX_PARTITIONS; i++) {
+ state->pvq_exg[pli][bs][i] = 2 << 16;
+ }
+ }
+ for (i = 0; i < OD_TXSIZES*PVQ_MAX_PARTITIONS; i++) {
+ state->pvq_ext[i] = is_keyframe ? 24576 : 2 << 16;
+ }
+ OD_CDFS_INIT_DYNAMIC(state->pvq_gaintheta_cdf);
+ OD_CDFS_INIT_Q15(state->pvq_skip_dir_cdf);
+ OD_CDFS_INIT_DYNAMIC(ctx->pvq_split_cdf);
+}
+
+/* QMs are arranged from smallest to largest block sizes, first for
+   blocks with decimation=0, followed by blocks with decimation=1.*/
+int od_qm_offset(int bs, int xydec)
+{
+ return xydec*OD_QM_STRIDE + OD_QM_OFFSET(bs);
+}
+
+#if defined(OD_FLOAT_PVQ)
+#define OD_DEFAULT_MAG 1.0
+#else
+#define OD_DEFAULT_MAG OD_QM_SCALE
+#endif
+
+/* Initialize the quantization matrix. */
+// Note: When a hybrid transform and its corresponding scan order are used by
+// PVQ, we don't need separate qm and qm_inv for each transform type, because
+// AOM does not do magnitude compensation (i.e. simply x16 for all coeffs).
+void od_init_qm(int16_t *x, int16_t *x_inv, const int *qm) {
+ int i;
+ int j;
+ int16_t y[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
+ int16_t y_inv[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
+ int16_t *x1;
+ int16_t *x1_inv;
+ int off;
+ int bs;
+ int xydec;
+ for (bs = 0; bs < OD_TXSIZES; bs++) {
+ for (xydec = 0; xydec < 2; xydec++) {
+ off = od_qm_offset(bs, xydec);
+ x1 = x + off;
+ x1_inv = x_inv + off;
+ for (i = 0; i < 4 << bs; i++) {
+ for (j = 0; j < 4 << bs; j++) {
+ /*This will ultimately be clamped to fit in 16 bits.*/
+ od_val32 mag;
+ int16_t ytmp;
+ mag = OD_DEFAULT_MAG;
+ if (i != 0 || j != 0) {
+#if defined(OD_FLOAT_PVQ)
+ mag /= 0.0625*qm[(i << 1 >> bs)*8 + (j << 1 >> bs)];
+#else
+ int qmv;
+ qmv = qm[(i << 1 >> bs)*8 + (j << 1 >> bs)];
+ mag *= 16;
+ mag = (mag + (qmv >> 1))/qmv;
+#endif
+ OD_ASSERT(mag > 0.0);
+ }
+ /*Convert to fit in 16 bits.*/
+#if defined(OD_FLOAT_PVQ)
+ y[i*(4 << bs) + j] = (int16_t)OD_MINI(OD_QM_SCALE_MAX,
+ (int32_t)floor(.5 + mag*OD_QM_SCALE));
+ y_inv[i*(4 << bs) + j] = (int16_t)floor(.5
+ + OD_QM_SCALE*OD_QM_INV_SCALE/(double)y[i*(4 << bs) + j]);
+#else
+ y[i*(4 << bs) + j] = (int16_t)OD_MINI(OD_QM_SCALE_MAX, mag);
+ ytmp = y[i*(4 << bs) + j];
+ y_inv[i*(4 << bs) + j] = (int16_t)((OD_QM_SCALE*OD_QM_INV_SCALE
+ + (ytmp >> 1))/ytmp);
+#endif
+ }
+ }
+ od_raster_to_coding_order_16(x1, 4 << bs, y, 4 << bs);
+ od_raster_to_coding_order_16(x1_inv, 4 << bs, y_inv, 4 << bs);
+ }
+ }
+}
+
+/* Maps each possible size (n) in the split k-tokenizer to a different value.
+ Possible values of n are:
+ 2, 3, 4, 7, 8, 14, 15, 16, 31, 32, 63, 64, 127, 128
+ Since we don't care about the order (even in the bit-stream) the simplest
+ ordering (implemented here) is:
+ 14, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 */
+int od_pvq_size_ctx(int n) {
+ int logn;
+ int odd;
+ logn = OD_ILOG(n - 1);
+ odd = n & 1;
+ return 2*logn - 1 - odd - 7*(n == 14);
+}
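+/* Worked examples: n == 14 gives OD_ILOG(13) == 4 and odd == 0, so the
+   context is 2*4 - 1 - 0 - 7 == 0; n == 2 gives OD_ILOG(1) == 1, so
+   2*1 - 1 - 0 == 1; n == 15 gives OD_ILOG(14) == 4 and odd == 1, so
+   2*4 - 1 - 1 == 6, matching the ordering listed above. */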
+
+/* Maps a length n to a context for the (k=1, n<=16) coder, with a special
+ case when n is the original length (orig_length=1) of the vector (i.e. we
+ haven't split it yet). For orig_length=0, we use the same mapping as
+ od_pvq_size_ctx() up to n=16. When orig_length=1, we map lengths
+ 7, 8, 14, 15 to contexts 8 to 11. */
+int od_pvq_k1_ctx(int n, int orig_length) {
+ if (orig_length) return 8 + 2*(n > 8) + (n & 1);
+ else return od_pvq_size_ctx(n);
+}
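+/* In the orig_length case the mapping is 8 -> 8, 7 -> 9, 14 -> 10 and
+   15 -> 11, from 8 + 2*(n > 8) + (n & 1). */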
+
+/* Indexing for the packed quantization matrices. */
+int od_qm_get_index(int bs, int band) {
+ /* The -band/3 term is due to the fact that we force corresponding horizontal
+ and vertical bands to have the same quantization. */
+ OD_ASSERT(bs >= 0 && bs < OD_TXSIZES);
+ return bs*(bs + 1) + band - band/3;
+}
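+/* E.g. for bs == 1, bands 2 and 3 (a corresponding horizontal/vertical
+   pair) both map to index 2 + 2 - 0 == 2 + 3 - 1 == 4 and thus share a
+   single entry. */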
+
+#if !defined(OD_FLOAT_PVQ)
+/*See celt/mathops.c in Opus and tools/cos_search.c.*/
+static int16_t od_pvq_cos_pi_2(int16_t x)
+{
+ int16_t x2;
+ x2 = OD_MULT16_16_Q15(x, x);
+ return OD_MINI(32767, (1073758164 - x*x + x2*(-7654 + OD_MULT16_16_Q16(x2,
+ 16573 + OD_MULT16_16_Q16(-2529, x2)))) >> 15);
+}
+#endif
+
+/*Approximates cos(x) for -pi < x < pi.
+ Input is in OD_THETA_SCALE.*/
+od_val16 od_pvq_cos(od_val32 x) {
+#if defined(OD_FLOAT_PVQ)
+ return cos(x);
+#else
+ /*Wrap x around by masking, since cos is periodic.*/
+ x = x & 0x0001ffff;
+ if (x > (1 << 16)) {
+ x = (1 << 17) - x;
+ }
+ if (x & 0x00007fff) {
+ if (x < (1 << 15)) {
+ return od_pvq_cos_pi_2((int16_t)x);
+ }
+ else {
+ return -od_pvq_cos_pi_2((int16_t)(65536 - x));
+ }
+ }
+ else {
+ if (x & 0x0000ffff) {
+ return 0;
+ }
+ else if (x & 0x0001ffff) {
+ return -32767;
+ }
+ else {
+ return 32767;
+ }
+ }
+#endif
+}
+
+/*Approximates sin(x) for 0 <= x < pi.
+ Input is in OD_THETA_SCALE.*/
+od_val16 od_pvq_sin(od_val32 x) {
+#if defined(OD_FLOAT_PVQ)
+ return sin(x);
+#else
+ return od_pvq_cos(32768 - x);
+#endif
+}
+
+#if !defined(OD_FLOAT_PVQ)
+/* Computes an upper-bound on the number of bits required to store the L2 norm
+ of a vector (excluding sign). */
+int od_vector_log_mag(const od_coeff *x, int n) {
+ int i;
+ int32_t sum;
+ sum = 0;
+ for (i = 0; i < n; i++) {
+ int16_t tmp;
+ tmp = x[i] >> 8;
+ sum += tmp*(int32_t)tmp;
+ }
+ /* We add one full bit (instead of rounding OD_ILOG() up) for safety because
+ the >> 8 above causes the sum to be slightly underestimated. */
+ return 8 + 1 + OD_ILOG(n + sum)/2;
+}
+#endif
+
+/** Computes Householder reflection that aligns the reference r to the
+ * dimension in r with the greatest absolute value. The reflection
+ * vector is returned in r.
+ *
+ * @param [in,out] r reference vector to be reflected, reflection
+ * also returned in r
+ * @param [in] n number of dimensions in r
+ * @param [in] gr gain of reference vector
+ * @param [out] sign sign of reflection
+ * @param [in] shift right shift applied when adding gr to r
+ * (unused in the float build)
+ * @return dimension number to which reflection aligns
+ **/
+int od_compute_householder(od_val16 *r, int n, od_val32 gr, int *sign,
+ int shift) {
+ int m;
+ int i;
+ int s;
+ od_val16 maxr;
+ OD_UNUSED(shift);
+ /* Pick component with largest magnitude. Not strictly
+ * necessary, but it helps numerical stability */
+ m = 0;
+ maxr = 0;
+ for (i = 0; i < n; i++) {
+ if (OD_ABS(r[i]) > maxr) {
+ maxr = OD_ABS(r[i]);
+ m = i;
+ }
+ }
+ s = r[m] > 0 ? 1 : -1;
+ /* This turns r into a Householder reflection vector that would reflect
+ * the original r[] to e_m */
+ r[m] += OD_SHR_ROUND(gr*s, shift);
+ *sign = s;
+ return m;
+}
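+/* With this choice of reflection vector, applying the reflection (see
+   od_apply_householder()) to the original r maps it onto -s*gr*e_m: all of
+   the reference energy lands on dimension m with sign -s. */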
+
+#if !defined(OD_FLOAT_PVQ)
+#define OD_RCP_INSHIFT 15
+#define OD_RCP_OUTSHIFT 14
+static od_val16 od_rcp(od_val16 x)
+{
+ int i;
+ od_val16 n;
+ od_val16 r;
+ i = OD_ILOG(x) - 1;
+ /*n is Q15 with range [0,1).*/
+ n = OD_VSHR_ROUND(x, i - OD_RCP_INSHIFT) - (1 << OD_RCP_INSHIFT);
+ /*Start with a linear approximation:
+ r = 1.8823529411764706-0.9411764705882353*n.
+ The coefficients and the result are Q14 in the range [15420,30840].*/
+ r = 30840 + OD_MULT16_16_Q15(-15420, n);
+ /*Perform two Newton iterations:
+ r -= r*((r*n)-1.Q15)
+ = r*((r*n)+(r-1.Q15)).*/
+ r = r - OD_MULT16_16_Q15(r, (OD_MULT16_16_Q15(r, n) + r - 32768));
+ /*We subtract an extra 1 in the second iteration to avoid overflow; it also
+ neatly compensates for truncation error in the rest of the process.*/
+ r = r - (1 + OD_MULT16_16_Q15(r, OD_MULT16_16_Q15(r, n) + r - 32768));
+ /*r is now the Q15 solution to 2/(n+1), with a maximum relative error
+ of 7.05346E-5, a (relative) RMSE of 2.14418E-5, and a peak absolute
+ error of 1.24665/32768.*/
+ return OD_VSHR_ROUND(r, i - OD_RCP_OUTSHIFT);
+}
+#endif
+
+/** Applies Householder reflection from compute_householder(). The
+ * reflection is its own inverse.
+ *
+ * @param [out] out reflected vector
+ * @param [in] x vector to be reflected
+ * @param [in] r reflection
+ * @param [in] n number of dimensions in x,r
+ */
+void od_apply_householder(od_val16 *out, const od_val16 *x, const od_val16 *r,
+ int n) {
+ int i;
+ od_val32 proj;
+ od_val16 proj_1;
+ od_val32 l2r;
+#if !defined(OD_FLOAT_PVQ)
+ od_val16 proj_norm;
+ od_val16 l2r_norm;
+ od_val16 rcp;
+ int proj_shift;
+ int l2r_shift;
+ int outshift;
+#endif
+ /*FIXME: Can we get l2r and/or l2r_shift from an earlier computation?*/
+ l2r = 0;
+ for (i = 0; i < n; i++) {
+ l2r += OD_MULT16_16(r[i], r[i]);
+ }
+ /* Apply Householder reflection */
+ proj = 0;
+ for (i = 0; i < n; i++) {
+ proj += OD_MULT16_16(r[i], x[i]);
+ }
+#if defined(OD_FLOAT_PVQ)
+ proj_1 = proj*2./(1e-100 + l2r);
+ for (i = 0; i < n; i++) {
+ out[i] = x[i] - r[i]*proj_1;
+ }
+#else
+ /*l2r_norm is [0.5, 1.0[ in Q15.*/
+ l2r_shift = (OD_ILOG(l2r) - 1) - 14;
+ l2r_norm = OD_VSHR_ROUND(l2r, l2r_shift);
+ rcp = od_rcp(l2r_norm);
+ proj_shift = (OD_ILOG(abs(proj)) - 1) - 14;
+ /*proj_norm is [0.5, 1.0[ in Q15.*/
+ proj_norm = OD_VSHR_ROUND(proj, proj_shift);
+ proj_1 = OD_MULT16_16_Q15(proj_norm, rcp);
+ /*The proj*2. in the float code becomes -1 in the final outshift.
+ The sign of l2r_shift is positive since we're taking the reciprocal of
+ l2r_norm and this is a right shift.*/
+ outshift = OD_MINI(30, OD_RCP_OUTSHIFT - proj_shift - 1 + l2r_shift);
+ if (outshift >= 0) {
+ for (i = 0; i < n; i++) {
+ int32_t tmp;
+ tmp = OD_MULT16_16(r[i], proj_1);
+ tmp = OD_SHR_ROUND(tmp, outshift);
+ out[i] = x[i] - tmp;
+ }
+ }
+ else {
+ /*FIXME: Can we make this case impossible?
+ Right now, if r[] is all zeros except for 1, 2, or 3 ones, and
+ if x[] is all zeros except for large values at the same position as the
+ ones in r[], then we can end up with a shift of -1.*/
+ for (i = 0; i < n; i++) {
+ int32_t tmp;
+ tmp = OD_MULT16_16(r[i], proj_1);
+ tmp = OD_SHL(tmp, -outshift);
+ out[i] = x[i] - tmp;
+ }
+ }
+#endif
+}
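+/* Typical usage (a sketch; see the PVQ encoder/decoder for the actual call
+   sites): od_compute_householder() first turns a copy of the reference into
+   the reflection vector and returns m and s, then od_apply_householder()
+   applies that same reflection:
+
+     m = od_compute_householder(r, n, gr, &s, shift);
+     od_apply_householder(out, x, r, n);
+
+   Since a Householder reflection is its own inverse, the decoder undoes it
+   by applying it again. */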
+
+#if !defined(OD_FLOAT_PVQ)
+static od_val16 od_beta_rcp(od_val16 beta){
+ if (beta == OD_BETA(1.))
+ return OD_BETA(1.);
+ else if (beta == OD_BETA(1.5))
+ return OD_BETA(1./1.5);
+ else {
+ od_val16 rcp_beta;
+ /*Shift by 1 less, transposing beta to range [.5, .75] and thus < 32768.*/
+ rcp_beta = od_rcp(beta << (OD_RCP_INSHIFT - 1 - OD_BETA_SHIFT));
+ return OD_SHR_ROUND(rcp_beta, OD_RCP_OUTSHIFT + 1 - OD_BETA_SHIFT);
+ }
+}
+
+#define OD_EXP2_INSHIFT 15
+#define OD_EXP2_FRACSHIFT 15
+#define OD_EXP2_OUTSHIFT 15
+static const int32_t OD_EXP2_C[5] = {32768, 22709, 7913, 1704, 443};
+/*Output is [1.0, 2.0) in Q(OD_EXP2_FRACSHIFT).
+ It does not include the integer offset, which is added in od_exp2 after the
+ final shift.*/
+static int32_t od_exp2_frac(int32_t x)
+{
+ return OD_MULT16_16_Q15(x, (OD_EXP2_C[1] + OD_MULT16_16_Q15(x,
+ (OD_EXP2_C[2] + OD_MULT16_16_Q15(x, (OD_EXP2_C[3]
+ + OD_MULT16_16_Q15(x, OD_EXP2_C[4])))))));
+}
+
+/** Base-2 exponential approximation (2^x) with Q15 input and output.*/
+static int32_t od_exp2(int32_t x)
+{
+ int integer;
+ int32_t frac;
+ integer = x >> OD_EXP2_INSHIFT;
+ if (integer > 14)
+ return 0x7f000000;
+ else if (integer < -15)
+ return 0;
+ frac = od_exp2_frac(x - OD_SHL(integer, OD_EXP2_INSHIFT));
+ return OD_VSHR_ROUND(OD_EXP2_C[0] + frac, -integer) + 1;
+}
+
+#define OD_LOG2_INSHIFT 15
+#define OD_LOG2_OUTSHIFT 15
+#define OD_LOG2_INSCALE_1 (1./(1 << OD_LOG2_INSHIFT))
+#define OD_LOG2_OUTSCALE (1 << OD_LOG2_OUTSHIFT)
+static int16_t od_log2(int16_t x)
+{
+ return x + OD_MULT16_16_Q15(x, (14482 + OD_MULT16_16_Q15(x, (-23234
+ + OD_MULT16_16_Q15(x, (13643 + OD_MULT16_16_Q15(x, (-6403
+ + OD_MULT16_16_Q15(x, 1515)))))))));
+}
+
+static int32_t od_pow(int32_t x, od_val16 beta)
+{
+ int16_t t;
+ int xshift;
+ int log2_x;
+ od_val32 logr;
+ /*FIXME: this conditional is to avoid doing log2(0).*/
+ if (x == 0)
+ return 0;
+ log2_x = (OD_ILOG(x) - 1);
+ xshift = log2_x - OD_LOG2_INSHIFT;
+ /*t should be in range [0.0, 1.0[ in Q(OD_LOG2_INSHIFT).*/
+ t = OD_VSHR(x, xshift) - (1 << OD_LOG2_INSHIFT);
+ /*log2(g/OD_COMPAND_SCALE) = log2(x) - OD_COMPAND_SHIFT in
+ Q(OD_LOG2_OUTSHIFT).*/
+ logr = od_log2(t) + (log2_x - OD_COMPAND_SHIFT)*OD_LOG2_OUTSCALE;
+ logr = OD_MULT16_32_QBETA(beta, logr);
+ return od_exp2(logr);
+}
+#endif
+
+/** Gain companding: raises gain to the power 1/beta for activity masking.
+ *
+ * @param [in] g real (uncompanded) gain
+ * @param [in] q0 uncompanded quality parameter
+ * @param [in] beta activity masking beta param (exponent)
+ * @return g^(1/beta)
+ */
+static od_val32 od_gain_compand(od_val32 g, int q0, od_val16 beta) {
+#if defined(OD_FLOAT_PVQ)
+ if (beta == 1) return OD_CGAIN_SCALE*g/(double)q0;
+ else {
+ return OD_CGAIN_SCALE*OD_COMPAND_SCALE*pow(g*OD_COMPAND_SCALE_1,
+ 1./beta)/(double)q0;
+ }
+#else
+ if (beta == OD_BETA(1)) return (OD_CGAIN_SCALE*g + (q0 >> 1))/q0;
+ else {
+ int32_t expr;
+ expr = od_pow(g, od_beta_rcp(beta));
+ expr <<= OD_CGAIN_SHIFT + OD_COMPAND_SHIFT - OD_EXP2_OUTSHIFT;
+ return (expr + (q0 >> 1))/q0;
+ }
+#endif
+}
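+/* E.g. with beta = 1.5 a raw gain g is companded to roughly g^(2/3) (up to
+   the fixed-point scale factors and the division by q0), so larger gains
+   are represented with coarser resolution. */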
+
+#if !defined(OD_FLOAT_PVQ)
+#define OD_SQRT_INSHIFT 16
+#define OD_SQRT_OUTSHIFT 15
+static int16_t od_rsqrt_norm(int16_t x);
+
+static int16_t od_sqrt_norm(int32_t x)
+{
+ OD_ASSERT(x < 65536);
+ return OD_MINI(OD_SHR_ROUND(x*od_rsqrt_norm(x), OD_SQRT_OUTSHIFT), 32767);
+}
+
+static int16_t od_sqrt(int32_t x, int *sqrt_shift)
+{
+ int k;
+ int s;
+ int32_t t;
+ if (x == 0) {
+ *sqrt_shift = 0;
+ return 0;
+ }
+ OD_ASSERT(x < (1 << 30));
+ k = ((OD_ILOG(x) - 1) >> 1);
+ /*t is x in the range [0.25, 1) in QINSHIFT, or x*2^(-s).
+ Shift by log2(x) - log2(0.25*(1 << INSHIFT)) to ensure 0.25 lower bound.*/
+ s = 2*k - (OD_SQRT_INSHIFT - 2);
+ t = OD_VSHR(x, s);
+ /*We want to express od_sqrt() in terms of od_sqrt_norm(), which is
+ defined as (2^OUTSHIFT)*sqrt(t*(2^-INSHIFT)) with t=x*(2^-s).
+ This simplifies to 2^(OUTSHIFT-(INSHIFT/2)-(s/2))*sqrt(x), so the caller
+ needs to shift right by OUTSHIFT - INSHIFT/2 - s/2.*/
+ *sqrt_shift = OD_SQRT_OUTSHIFT - ((s + OD_SQRT_INSHIFT) >> 1);
+ return od_sqrt_norm(t);
+}
+#endif
+
+/** Gain expanding: raises gain to the power beta for activity masking.
+ *
+ * @param [in] cg companded gain
+ * @param [in] q0 uncompanded quality parameter
+ * @param [in] beta activity masking beta param (exponent)
+ * @return g^beta
+ */
+od_val32 od_gain_expand(od_val32 cg0, int q0, od_val16 beta) {
+ if (beta == OD_BETA(1)) {
+ /*The multiply fits into 28 bits because the expanded gain has a range from
+ 0 to 2^20.*/
+ return OD_SHR_ROUND(cg0*q0, OD_CGAIN_SHIFT);
+ }
+ else if (beta == OD_BETA(1.5)) {
+#if defined(OD_FLOAT_PVQ)
+ double cg;
+ cg = cg0*OD_CGAIN_SCALE_1;
+ cg *= q0*OD_COMPAND_SCALE_1;
+ return OD_COMPAND_SCALE*cg*sqrt(cg);
+#else
+ int32_t irt;
+ int64_t tmp;
+ int sqrt_inshift;
+ int sqrt_outshift;
+ /*cg0 is in Q(OD_CGAIN_SHIFT) and we need to divide it by
+ 2^OD_COMPAND_SHIFT.*/
+ irt = od_sqrt(cg0*q0, &sqrt_outshift);
+ sqrt_inshift = (OD_CGAIN_SHIFT + OD_COMPAND_SHIFT) >> 1;
+ /*tmp is in Q(OD_CGAIN_SHIFT + OD_COMPAND_SHIFT).*/
+ tmp = cg0*q0*(int64_t)irt;
+ /*Expanded gain must be in Q(OD_COMPAND_SHIFT), thus OD_COMPAND_SHIFT is
+ not included here.*/
+ return OD_MAXI(1,
+ OD_VSHR_ROUND(tmp, OD_CGAIN_SHIFT + sqrt_outshift + sqrt_inshift));
+#endif
+ }
+ else {
+#if defined(OD_FLOAT_PVQ)
+ /*Expanded gain must be in Q(OD_COMPAND_SHIFT), hence the multiply by
+ OD_COMPAND_SCALE.*/
+ double cg;
+ cg = cg0*OD_CGAIN_SCALE_1;
+ return OD_COMPAND_SCALE*pow(cg*q0*OD_COMPAND_SCALE_1, beta);
+#else
+ int32_t expr;
+ int32_t cg;
+ cg = OD_SHR_ROUND(cg0*q0, OD_CGAIN_SHIFT);
+ expr = od_pow(cg, beta);
+ /*Expanded gain must be in Q(OD_COMPAND_SHIFT), hence the subtraction by
+ OD_COMPAND_SHIFT.*/
+ return OD_MAXI(1, OD_SHR_ROUND(expr, OD_EXP2_OUTSHIFT - OD_COMPAND_SHIFT));
+#endif
+ }
+}
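+/* od_gain_expand() inverts od_gain_compand(): expanding a companded gain
+   recovers the raw gain, so od_gain_expand(od_gain_compand(g, q0, beta),
+   q0, beta) ~= g up to rounding of the companded value. */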
+
+/** Computes the raw and quantized/companded gain of a given input
+ * vector
+ *
+ * @param [in] x vector of input data
+ * @param [in] n number of elements in vector x
+ * @param [in] q0 quantizer
+ * @param [out] g raw gain
+ * @param [in] beta activity masking beta param
+ * @param [in] bshift shift to be applied to raw gain
+ * @return quantized/companded gain
+ */
+od_val32 od_pvq_compute_gain(const od_val16 *x, int n, int q0, od_val32 *g,
+ od_val16 beta, int bshift) {
+ int i;
+ od_val32 acc;
+#if !defined(OD_FLOAT_PVQ)
+ od_val32 irt;
+ int sqrt_shift;
+#else
+ OD_UNUSED(bshift);
+#endif
+ acc = 0;
+ for (i = 0; i < n; i++) {
+ acc += x[i]*(od_val32)x[i];
+ }
+#if defined(OD_FLOAT_PVQ)
+ *g = sqrt(acc);
+#else
+ irt = od_sqrt(acc, &sqrt_shift);
+ *g = OD_VSHR_ROUND(irt, sqrt_shift - bshift);
+#endif
+ /* Normalize gain by quantization step size and apply companding
+ (if ACTIVITY != 1). */
+ return od_gain_compand(*g, q0, beta);
+}
+
+/** Compute theta quantization range from quantized/companded gain
+ *
+ * @param [in] qcg quantized companded gain value
+ * @param [in] beta activity masking beta param
+ * @return max theta value
+ */
+int od_pvq_compute_max_theta(od_val32 qcg, od_val16 beta){
+ /* Set angular resolution (in radians) to match the encoded gain */
+#if defined(OD_FLOAT_PVQ)
+ int ts = (int)floor(.5 + qcg*OD_CGAIN_SCALE_1*M_PI/(2*beta));
+#else
+ int ts = OD_SHR_ROUND(qcg*OD_MULT16_16_QBETA(OD_QCONST32(M_PI/2,
+ OD_CGAIN_SHIFT), od_beta_rcp(beta)), OD_CGAIN_SHIFT*2);
+#endif
+ /* Special case for low gains -- will need to be tuned anyway */
+ if (qcg < OD_QCONST32(1.4, OD_CGAIN_SHIFT)) ts = 1;
+ return ts;
+}
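+/* Worked example (float build): a companded gain of 4.0 with beta = 1 gives
+   ts = floor(.5 + 4*M_PI/2) = 6 possible theta values, while any qcg below
+   1.4 is forced down to a single value. */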
+
+/** Decode quantized theta value from coded value
+ *
+ * @param [in] t quantized companded gain value
+ * @param [in] max_theta maximum theta value
+ * @return decoded theta value
+ */
+od_val32 od_pvq_compute_theta(int t, int max_theta) {
+ if (max_theta != 0) {
+#if defined(OD_FLOAT_PVQ)
+ return OD_MINI(t, max_theta - 1)*.5*M_PI/max_theta;
+#else
+ return (OD_MAX_THETA_SCALE*OD_MINI(t, max_theta - 1)
+ + (max_theta >> 1))/max_theta;
+#endif
+ }
+ else return 0;
+}
+
+#define OD_SQRT_TBL_SHIFT (10)
+
+#define OD_ITHETA_SHIFT 15
+/** Compute the number of pulses used for PVQ encoding a vector from
+ * available metrics (encode and decode side)
+ *
+ * @param [in] qcg quantized companded gain value
+ * @param [in] itheta quantized PVQ error angle theta
+ * @param [in] noref indicates present or lack of reference
+ * (prediction)
+ * @param [in] n number of elements to be coded
+ * @param [in] beta activity masking beta param
+ * @return number of pulses to use for coding
+ */
+int od_pvq_compute_k(od_val32 qcg, int itheta, int noref, int n,
+ od_val16 beta) {
+#if !defined(OD_FLOAT_PVQ)
+ /*Lookup table for sqrt((n + 3)/2) and sqrt((n + 2)/2) in Q10.
+ Real max values are 32792 and 32784, but clamped to stay within 16 bits.
+ Update with tools/gen_sqrt_tbl if needed.*/
+ static const od_val16 od_sqrt_table[2][13] = {
+ {0, 0, 0, 0, 2290, 2985, 4222, 0, 8256, 0, 16416, 0, 32767},
+ {0, 0, 0, 0, 2401, 3072, 4284, 0, 8287, 0, 16432, 0, 32767}};
+#endif
+ if (noref) {
+ if (qcg == 0) return 0;
+ if (n == 15 && qcg == OD_CGAIN_SCALE && beta > OD_BETA(1.25)) {
+ return 1;
+ }
+ else {
+#if defined(OD_FLOAT_PVQ)
+ return OD_MAXI(1, (int)floor(.5 + (qcg*OD_CGAIN_SCALE_1 - .2)*
+ sqrt((n + 3)/2)/beta));
+#else
+ od_val16 rt;
+ OD_ASSERT(OD_ILOG(n + 1) < 13);
+ rt = od_sqrt_table[1][OD_ILOG(n + 1)];
+ /*FIXME: get rid of 64-bit mul.*/
+ return OD_MAXI(1, OD_SHR_ROUND((int64_t)((qcg
+ - (int64_t)OD_QCONST32(.2, OD_CGAIN_SHIFT))*
+ OD_MULT16_16_QBETA(od_beta_rcp(beta), rt)), OD_CGAIN_SHIFT
+ + OD_SQRT_TBL_SHIFT));
+#endif
+ }
+ }
+ else {
+ if (itheta == 0) return 0;
+ /* Sets K according to gain and theta, based on the high-rate
+ PVQ distortion curves (see PVQ document). Low-rate will have to be
+ perceptually tuned anyway. We subtract 0.2 from the radius as an
+ approximation for the fact that the coefficients aren't identically
+ distributed within a band so at low gain the number of dimensions that
+ are likely to have a pulse is less than n. */
+#if defined(OD_FLOAT_PVQ)
+ return OD_MAXI(1, (int)floor(.5 + (itheta - .2)*sqrt((n + 2)/2)));
+#else
+ od_val16 rt;
+ OD_ASSERT(OD_ILOG(n + 1) < 13);
+ rt = od_sqrt_table[0][OD_ILOG(n + 1)];
+ /*FIXME: get rid of 64-bit mul.*/
+ return OD_MAXI(1, OD_VSHR_ROUND(((OD_SHL(itheta, OD_ITHETA_SHIFT)
+ - OD_QCONST32(.2, OD_ITHETA_SHIFT)))*(int64_t)rt,
+ OD_SQRT_TBL_SHIFT + OD_ITHETA_SHIFT));
+#endif
+ }
+}
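+/* For a fixed gain or theta the pulse count thus grows roughly like
+   sqrt(n/2); in the noref case the division by beta means activity masking
+   (beta > 1) spends fewer pulses at the same companded gain. */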
+
+#if !defined(OD_FLOAT_PVQ)
+#define OD_RSQRT_INSHIFT 16
+#define OD_RSQRT_OUTSHIFT 14
+/** Reciprocal sqrt approximation where the input is in the range [0.25, 1)
+ in Q16 and the output is in the range (1.0, 2.0] in Q14.
+ Error is always within +/-1 of round(1/sqrt(t)).*/
+static int16_t od_rsqrt_norm(int16_t t)
+{
+ int16_t n;
+ int32_t r;
+ int32_t r2;
+ int32_t ry;
+ int32_t y;
+ int32_t ret;
+ /* Range of n is [-16384,32767] ([-0.5,1) in Q15).*/
+ n = t - 32768;
+ OD_ASSERT(n >= -16384);
+ /*Get a rough initial guess for the root.
+ The optimal minimax quadratic approximation (using relative error) is
+ r = 1.437799046117536+n*(-0.823394375837328+n*0.4096419668459485).
+ Coefficients here, and the final result r, are Q14.*/
+ r = (23565 + OD_MULT16_16_Q15(n, (-13481 + OD_MULT16_16_Q15(n, 6711))));
+ /*We want y = t*r*r-1 in Q15, but t is 32-bit Q16 and r is Q14.
+ We can compute the result from n and r using Q15 multiplies with some
+ adjustment, carefully done to avoid overflow.*/
+ r2 = r*r;
+ y = (((r2 >> 15)*n + r2) >> 12) - 131077;
+ ry = r*y;
+ /*Apply a 2nd-order Householder iteration: r += r*y*(y*0.375-0.5).
+ This yields the Q14 reciprocal square root of the Q16 t, with a maximum
+ relative error of 1.04956E-4, a (relative) RMSE of 2.80979E-5, and a peak
+ absolute error of 2.26591/16384.*/
+ ret = r + ((((ry >> 16)*(3*y) >> 3) - ry) >> 18);
+ OD_ASSERT(ret >= 16384 && ret < 32768);
+ return (int16_t)ret;
+}
+
+static int16_t od_rsqrt(int32_t x, int *rsqrt_shift)
+{
+ int k;
+ int s;
+ int16_t t;
+ k = (OD_ILOG(x) - 1) >> 1;
+ /*t is x in the range [0.25, 1) in QINSHIFT, or x*2^(-s).
+ Shift by log2(x) - log2(0.25*(1 << INSHIFT)) to ensure 0.25 lower bound.*/
+ s = 2*k - (OD_RSQRT_INSHIFT - 2);
+ t = OD_VSHR(x, s);
+ /*We want to express od_rsqrt() in terms of od_rsqrt_norm(), which is
+ defined as (2^OUTSHIFT)/sqrt(t*(2^-INSHIFT)) with t=x*(2^-s).
+ This simplifies to 2^(OUTSHIFT+(INSHIFT/2)+(s/2))/sqrt(x), so the caller
+ needs to shift right by OUTSHIFT + INSHIFT/2 + s/2.*/
+ *rsqrt_shift = OD_RSQRT_OUTSHIFT + ((s + OD_RSQRT_INSHIFT) >> 1);
+ return od_rsqrt_norm(t);
+}
+#endif
+
+/** Synthesizes one partition of coefficient values from a PVQ-encoded
+ * vector. This 'partial' version is called by the encode loop where
+ * the Householder reflection has already been computed and there's no
+ * need to recompute it.
+ *
+ * @param [out] xcoeff output coefficient partition (x in math doc)
+ * @param [in] ypulse PVQ-encoded values (y in the math doc); in
+ * the noref case, this vector has n entries,
+ * in the reference case it contains n-1 entries
+ * (the m-th entry is not included)
+ * @param [in] r16 reference vector (prediction)
+ * @param [in] n number of elements in this partition
+ * @param [in] noref indicates presence or lack of prediction
+ * @param [in] g decoded quantized vector gain
+ * @param [in] theta decoded theta (prediction error)
+ * @param [in] m alignment dimension of Householder reflection
+ * @param [in] s sign of Householder reflection
+ * @param [in] qm_inv inverse of the QM with magnitude compensation
+ */
+void od_pvq_synthesis_partial(od_coeff *xcoeff, const od_coeff *ypulse,
+ const od_val16 *r16, int n, int noref, od_val32 g, od_val32 theta, int m, int s,
+ const int16_t *qm_inv) {
+ int i;
+ int yy;
+ od_val32 scale;
+ int nn;
+#if !defined(OD_FLOAT_PVQ)
+ int gshift;
+ int qshift;
+#endif
+ OD_ASSERT(g != 0);
+ nn = n-(!noref); /* when noref == 0, the input vector is sized n-1 */
+ yy = 0;
+ for (i = 0; i < nn; i++)
+ yy += ypulse[i]*(int32_t)ypulse[i];
+#if !defined(OD_FLOAT_PVQ)
+ /* Shift required for the magnitude of the pre-qm synthesis to be guaranteed
+ to fit in 16 bits. In practice, the range will be 8192-16384 after scaling
+ most of the time. */
+ gshift = OD_MAXI(0, OD_ILOG(g) - 14);
+#endif
+ /*scale is g/sqrt(yy) in Q(16-gshift) so that x[]*scale has a norm that fits
+ in 16 bits.*/
+ if (yy == 0) scale = 0;
+#if defined(OD_FLOAT_PVQ)
+ else {
+ scale = g/sqrt(yy);
+ }
+#else
+ else {
+ int rsqrt_shift;
+ int16_t rsqrt;
+ /*FIXME: should be < int64_t*/
+ int64_t tmp;
+ rsqrt = od_rsqrt(yy, &rsqrt_shift);
+ tmp = rsqrt*(int64_t)g;
+ scale = OD_VSHR_ROUND(tmp, rsqrt_shift + gshift - 16);
+ }
+ /* Shift to apply after multiplying by the inverse QM, taking into account
+ gshift. */
+ qshift = OD_QM_INV_SHIFT - gshift;
+#endif
+ if (noref) {
+ for (i = 0; i < n; i++) {
+ od_val32 x;
+ /* This multiply doesn't round, so it introduces some bias.
+ It would be nice (but not critical) to fix this. */
+ x = OD_MULT16_32_Q16(ypulse[i], scale);
+#if defined(OD_FLOAT_PVQ)
+ xcoeff[i] = (od_coeff)floor(.5
+ + x*(qm_inv[i]*OD_QM_INV_SCALE_1));
+#else
+ xcoeff[i] = OD_SHR_ROUND(x*qm_inv[i], qshift);
+#endif
+ }
+ }
+ else{
+ od_val16 x[MAXN];
+ scale = OD_ROUND32(scale*OD_TRIG_SCALE_1*od_pvq_sin(theta));
+ /* The following multiply doesn't round, but it's probably OK since
+ the Householder reflection is likely to undo most of the resulting
+ bias. */
+ for (i = 0; i < m; i++)
+ x[i] = OD_MULT16_32_Q16(ypulse[i], scale);
+ x[m] = OD_ROUND16(-s*(OD_SHR_ROUND(g, gshift))*OD_TRIG_SCALE_1*
+ od_pvq_cos(theta));
+ for (i = m; i < nn; i++)
+ x[i+1] = OD_MULT16_32_Q16(ypulse[i], scale);
+ od_apply_householder(x, x, r16, n);
+ for (i = 0; i < n; i++) {
+#if defined(OD_FLOAT_PVQ)
+ xcoeff[i] = (od_coeff)floor(.5 + (x[i]*(qm_inv[i]*OD_QM_INV_SCALE_1)));
+#else
+ xcoeff[i] = OD_SHR_ROUND(x[i]*qm_inv[i], qshift);
+#endif
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/pvq.h b/third_party/aom/av1/common/pvq.h
new file mode 100644
index 000000000..17e54d4c5
--- /dev/null
+++ b/third_party/aom/av1/common/pvq.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#if !defined(_pvq_H)
+# define _pvq_H (1)
+# include "generic_code.h"
+# include "odintrin.h"
+
+extern const uint16_t EXP_CDF_TABLE[][16];
+extern const uint16_t LAPLACE_OFFSET[];
+
+#if CONFIG_DAALA_DIST
+#define AV1_PVQ_ENABLE_ACTIVITY_MASKING (1)
+#else
+#define AV1_PVQ_ENABLE_ACTIVITY_MASKING (0)
+#endif
+
+# define PVQ_MAX_PARTITIONS (1 + 3*(OD_TXSIZES-1))
+
+# define OD_NOREF_ADAPT_SPEED (4)
+/* Normalized lambda for PVQ quantizer. Since we normalize the gain by q, the
+ distortion is normalized by q^2 and lambda does not need the q^2 factor.
+ At high rate, this would be log(2)/6, but we're using a slightly more
+ aggressive value, closer to:
+ Li, Xiang, et al. "Laplace distribution based Lagrangian rate distortion
+ optimization for hybrid video coding." Circuits and Systems for Video
+ Technology, IEEE Transactions on 19.2 (2009): 193-205.
+ */
+# define OD_PVQ_LAMBDA (.1146)
+
+#define OD_PVQ_SKIP_ZERO 1
+#define OD_PVQ_SKIP_COPY 2
+
+/* Maximum size for coding a PVQ band. */
+#define OD_MAX_PVQ_SIZE (1024)
+
+#if defined(OD_FLOAT_PVQ)
+#define OD_QM_SHIFT (15)
+#else
+#define OD_QM_SHIFT (11)
+#endif
+#define OD_QM_SCALE (1 << OD_QM_SHIFT)
+#if defined(OD_FLOAT_PVQ)
+#define OD_QM_SCALE_1 (1./OD_QM_SCALE)
+#endif
+#define OD_QM_SCALE_MAX 32767
+#define OD_QM_INV_SHIFT (12)
+#define OD_QM_INV_SCALE (1 << OD_QM_INV_SHIFT)
+#if defined(OD_FLOAT_PVQ)
+#define OD_QM_INV_SCALE_1 (1./OD_QM_INV_SCALE)
+#endif
+#define OD_QM_OFFSET(bs) ((((1 << 2*bs) - 1) << 2*OD_LOG_BSIZE0)/3)
+#define OD_QM_STRIDE (OD_QM_OFFSET(OD_TXSIZES))
+#define OD_QM_BUFFER_SIZE (2*OD_QM_STRIDE)
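+/* Assuming the usual OD_LOG_BSIZE0 == 2 (4x4 smallest transform),
+   OD_QM_OFFSET(bs) evaluates to 0, 16, 80 and 336 for bs = 0..3: the
+   cumulative areas of the 4x4, 8x8 and 16x16 matrices preceding each
+   block size. */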
+
+#if !defined(OD_FLOAT_PVQ)
+#define OD_THETA_SHIFT (15)
+#define OD_THETA_SCALE ((1 << OD_THETA_SHIFT)*2./M_PI)
+#define OD_MAX_THETA_SCALE (1 << OD_THETA_SHIFT)
+#define OD_TRIG_SCALE (32768)
+#define OD_BETA_SHIFT (12)
+#define OD_BETA_SCALE_1 (1./(1 << OD_BETA_SHIFT))
+/*Multiplies 16-bit a by 32-bit b and keeps bits [16:64-OD_BETA_SHIFT-1].*/
+#define OD_MULT16_32_QBETA(a, b) \
+ ((int16_t)(a)*(int64_t)(int32_t)(b) >> OD_BETA_SHIFT)
+# define OD_MULT16_16_QBETA(a, b) \
+ ((((int16_t)(a))*((int32_t)(int16_t)(b))) >> OD_BETA_SHIFT)
+#define OD_CGAIN_SHIFT (8)
+#define OD_CGAIN_SCALE (1 << OD_CGAIN_SHIFT)
+#else
+#define OD_BETA_SCALE_1 (1.)
+#define OD_THETA_SCALE (1)
+#define OD_TRIG_SCALE (1)
+#define OD_CGAIN_SCALE (1)
+#endif
+#define OD_THETA_SCALE_1 (1./OD_THETA_SCALE)
+#define OD_TRIG_SCALE_1 (1./OD_TRIG_SCALE)
+#define OD_CGAIN_SCALE_1 (1./OD_CGAIN_SCALE)
+#define OD_CGAIN_SCALE_2 (OD_CGAIN_SCALE_1*OD_CGAIN_SCALE_1)
+
+/* Largest PVQ partition is half the coefficients of largest block size. */
+#define MAXN (OD_TXSIZE_MAX*OD_TXSIZE_MAX/2)
+
+#define OD_COMPAND_SHIFT (8 + OD_COEFF_SHIFT)
+#define OD_COMPAND_SCALE (1 << OD_COMPAND_SHIFT)
+#define OD_COMPAND_SCALE_1 (1./OD_COMPAND_SCALE)
+
+#define OD_QM_SIZE (OD_TXSIZES*(OD_TXSIZES + 1))
+
+#define OD_FLAT_QM 0
+#define OD_HVS_QM 1
+
+# define OD_NSB_ADAPT_CTXS (4)
+
+# define OD_ADAPT_K_Q8 0
+# define OD_ADAPT_SUM_EX_Q8 1
+# define OD_ADAPT_COUNT_Q8 2
+# define OD_ADAPT_COUNT_EX_Q8 3
+
+# define OD_ADAPT_NO_VALUE (-2147483647-1)
+
+typedef enum {
+ PVQ_SKIP = 0x0,
+ DC_CODED = 0x1,
+ AC_CODED = 0x2,
+ AC_DC_CODED = 0x3,
+} PVQ_SKIP_TYPE;
+
+typedef struct od_pvq_adapt_ctx od_pvq_adapt_ctx;
+typedef struct od_pvq_codeword_ctx od_pvq_codeword_ctx;
+
+struct od_pvq_codeword_ctx {
+ int pvq_adapt[2*OD_TXSIZES*OD_NSB_ADAPT_CTXS];
+ /* CDFs are size 16 even though we use fewer entries than that. */
+ uint16_t pvq_k1_cdf[12][CDF_SIZE(16)];
+ uint16_t pvq_split_cdf[22*7][CDF_SIZE(8)];
+};
+
+struct od_pvq_adapt_ctx {
+ od_pvq_codeword_ctx pvq_codeword_ctx;
+ generic_encoder pvq_param_model[3];
+ int pvq_ext[OD_TXSIZES*PVQ_MAX_PARTITIONS];
+ int pvq_exg[OD_NPLANES_MAX][OD_TXSIZES][PVQ_MAX_PARTITIONS];
+ uint16_t pvq_gaintheta_cdf[2*OD_TXSIZES*PVQ_MAX_PARTITIONS][CDF_SIZE(16)];
+ uint16_t pvq_skip_dir_cdf[2*(OD_TXSIZES-1)][CDF_SIZE(7)];
+};
+
+typedef struct od_qm_entry {
+ int interp_q;
+ int scale_q8;
+ const unsigned char *qm_q4;
+} od_qm_entry;
+
+extern const od_qm_entry OD_DEFAULT_QMS[2][2][OD_NPLANES_MAX];
+
+void od_adapt_pvq_ctx_reset(od_pvq_adapt_ctx *state, int is_keyframe);
+int od_pvq_size_ctx(int n);
+int od_pvq_k1_ctx(int n, int orig_size);
+
+od_val16 od_pvq_sin(od_val32 x);
+od_val16 od_pvq_cos(od_val32 x);
+#if !defined(OD_FLOAT_PVQ)
+int od_vector_log_mag(const od_coeff *x, int n);
+#endif
+
+void od_interp_qm(unsigned char *out, int q, const od_qm_entry *entry1,
+ const od_qm_entry *entry2);
+
+int od_qm_get_index(int bs, int band);
+
+extern const od_val16 *const OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_TXSIZES + 1];
+
+void od_init_qm(int16_t *x, int16_t *x_inv, const int *qm);
+int od_compute_householder(od_val16 *r, int n, od_val32 gr, int *sign,
+ int shift);
+void od_apply_householder(od_val16 *out, const od_val16 *x, const od_val16 *r,
+ int n);
+void od_pvq_synthesis_partial(od_coeff *xcoeff, const od_coeff *ypulse,
+ const od_val16 *r, int n,
+ int noref, od_val32 g,
+ od_val32 theta, int m, int s,
+ const int16_t *qm_inv);
+od_val32 od_gain_expand(od_val32 cg, int q0, od_val16 beta);
+od_val32 od_pvq_compute_gain(const od_val16 *x, int n, int q0, od_val32 *g,
+ od_val16 beta, int bshift);
+int od_pvq_compute_max_theta(od_val32 qcg, od_val16 beta);
+od_val32 od_pvq_compute_theta(int t, int max_theta);
+int od_pvq_compute_k(od_val32 qcg, int itheta, int noref, int n, od_val16 beta);
+
+int od_vector_is_null(const od_coeff *x, int len);
+int od_qm_offset(int bs, int xydec);
+
+#endif
diff --git a/third_party/aom/av1/common/pvq_state.c b/third_party/aom/av1/common/pvq_state.c
new file mode 100644
index 000000000..197b9b3a8
--- /dev/null
+++ b/third_party/aom/av1/common/pvq_state.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/pvq_state.h"
+#include "av1/common/odintrin.h"
+
+void od_adapt_ctx_reset(od_adapt_ctx *adapt, int is_keyframe) {
+ int pli;
+ od_adapt_pvq_ctx_reset(&adapt->pvq, is_keyframe);
+ OD_CDFS_INIT_Q15(adapt->skip_cdf);
+ for (pli = 0; pli < OD_NPLANES_MAX; pli++) {
+ int i;
+ OD_CDFS_INIT_DYNAMIC(adapt->model_dc[pli].cdf);
+ for (i = 0; i < OD_TXSIZES; i++) {
+ int j;
+ adapt->ex_g[pli][i] = 8;
+ for (j = 0; j < 3; j++) {
+ adapt->ex_dc[pli][i][j] = pli > 0 ? 8 : 32768;
+ }
+ }
+ }
+}
+
+void od_init_skipped_coeffs(int16_t *d, int16_t *pred, int is_keyframe, int bo,
+ int n, int w) {
+ int i;
+ int j;
+ if (is_keyframe) {
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < n; j++) {
+ /* skip DC */
+ if (i || j) d[bo + i * w + j] = 0;
+ }
+ }
+ } else {
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < n; j++) {
+ d[bo + i * w + j] = pred[i * n + j];
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/pvq_state.h b/third_party/aom/av1/common/pvq_state.h
new file mode 100644
index 000000000..84d454e70
--- /dev/null
+++ b/third_party/aom/av1/common/pvq_state.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#if !defined(_state_H)
+# define _state_H (1)
+
+typedef struct od_state od_state;
+typedef struct od_adapt_ctx od_adapt_ctx;
+
+# include "generic_code.h"
+# include "odintrin.h"
+# include "pvq.h"
+
+/*Adaptation speed of scalar Laplace encoding.*/
+# define OD_SCALAR_ADAPT_SPEED (4)
+
+struct od_adapt_ctx {
+ /* Support for PVQ encode/decode */
+ od_pvq_adapt_ctx pvq;
+
+ generic_encoder model_dc[OD_NPLANES_MAX];
+
+ int ex_dc[OD_NPLANES_MAX][OD_TXSIZES][3];
+ int ex_g[OD_NPLANES_MAX][OD_TXSIZES];
+
+ /* Joint skip flag for DC and AC */
+ uint16_t skip_cdf[OD_TXSIZES*2][CDF_SIZE(4)];
+};
+
+struct od_state {
+ od_adapt_ctx *adapt;
+ unsigned char pvq_qm_q4[OD_NPLANES_MAX][OD_QM_SIZE];
+ /* Quantization matrices and their inverses. */
+ int16_t qm[OD_QM_BUFFER_SIZE];
+ int16_t qm_inv[OD_QM_BUFFER_SIZE];
+};
+
+void od_adapt_ctx_reset(od_adapt_ctx *state, int is_keyframe);
+void od_init_skipped_coeffs(int16_t *d, int16_t *pred, int is_keyframe,
+ int bo, int n, int w);
+
+#endif
diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c
new file mode 100644
index 000000000..763465e48
--- /dev/null
+++ b/third_party/aom/av1/common/quant_common.c
@@ -0,0 +1,11369 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/entropy.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/blockd.h"
+
+#if CONFIG_NEW_QUANT
+// Bin widths expressed as a fraction over 128 of the quant stepsize,
+// for the quantization bins 0-4.
+// So a value x indicates the bin is actually factor x/128 of the
+// nominal quantization step. For the zero bin, the width is only
+// for one side of zero, so the actual width is twice that.
+//
+// Functions with nuq correspond to "non-uniform quantization"
+// TODO(sarahparker, debargha): Optimize these tables
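+// Example: a knot value of 64 denotes a bin 64/128 = 0.5x the nominal step
+// q; since the zero bin width is specified for one side of zero only, a
+// first knot of 64 yields a total zero-bin width of one full step q.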
+
+typedef struct {
+ uint8_t knots[NUQ_KNOTS]; // offsets
+ uint8_t doff; // dequantization
+} qprofile_type;
+
+static const qprofile_type nuq[QUANT_PROFILES][COEF_BANDS] = {
+ {
+ // lossless
+ { { 64, 128, 128 }, 0 }, // dc, band 0
+ { { 64, 128, 128 }, 0 }, // band 1
+ { { 64, 128, 128 }, 0 }, // band 2
+ { { 64, 128, 128 }, 0 }, // band 3
+ { { 64, 128, 128 }, 0 }, // band 4
+ { { 64, 128, 128 }, 0 }, // band 5
+ },
+ {
+ { { 64, 128, 128 }, 4 }, // dc, band 0
+ { { 64, 128, 128 }, 6 }, // band 1
+ { { 64, 128, 128 }, 8 }, // band 2
+ { { 64, 128, 128 }, 10 }, // band 3
+ { { 72, 128, 128 }, 12 }, // band 4
+ { { 80, 128, 128 }, 14 } // band 5
+ },
+ {
+ { { 64, 128, 128 }, 6 }, // dc, band 0
+ { { 64, 128, 128 }, 8 }, // band 1
+ { { 64, 128, 128 }, 10 }, // band 2
+ { { 64, 128, 128 }, 12 }, // band 3
+ { { 72, 128, 128 }, 14 }, // band 4
+ { { 80, 128, 128 }, 16 } // band 5
+ },
+ {
+ { { 64, 128, 128 }, 8 }, // dc, band 0
+ { { 64, 128, 128 }, 10 }, // band 1
+ { { 64, 128, 128 }, 12 }, // band 2
+ { { 72, 128, 128 }, 14 }, // band 3
+ { { 76, 128, 128 }, 16 }, // band 4
+ { { 80, 128, 128 }, 18 } // band 5
+ }
+};
+
+static const uint8_t *get_nuq_knots(int band, int q_profile) {
+ return nuq[q_profile][band].knots;
+}
+
+static INLINE int16_t quant_to_doff_fixed(int band, int q_profile) {
+ return nuq[q_profile][band].doff;
+}
+
+// get cumulative bins
+static INLINE void get_cuml_bins_nuq(int q, int band, tran_low_t *cuml_bins,
+ int q_profile) {
+ const uint8_t *knots = get_nuq_knots(band, q_profile);
+ int16_t cuml_knots[NUQ_KNOTS];
+ int i;
+ cuml_knots[0] = knots[0];
+ for (i = 1; i < NUQ_KNOTS; ++i) cuml_knots[i] = cuml_knots[i - 1] + knots[i];
+ for (i = 0; i < NUQ_KNOTS; ++i)
+ cuml_bins[i] = ROUND_POWER_OF_TWO(cuml_knots[i] * q, 7);
+}
+
+void av1_get_dequant_val_nuq(int q, int band, tran_low_t *dq,
+ tran_low_t *cuml_bins, int q_profile) {
+ const uint8_t *knots = get_nuq_knots(band, q_profile);
+ tran_low_t cuml_bins_[NUQ_KNOTS], *cuml_bins_ptr;
+ tran_low_t doff;
+ int i;
+ cuml_bins_ptr = (cuml_bins ? cuml_bins : cuml_bins_);
+ get_cuml_bins_nuq(q, band, cuml_bins_ptr, q_profile);
+ dq[0] = 0;
+ for (i = 1; i < NUQ_KNOTS; ++i) {
+ doff = quant_to_doff_fixed(band, q_profile);
+ doff = ROUND_POWER_OF_TWO(doff * knots[i], 7);
+ dq[i] =
+ cuml_bins_ptr[i - 1] + ROUND_POWER_OF_TWO((knots[i] - doff * 2) * q, 8);
+ }
+ doff = quant_to_doff_fixed(band, q_profile);
+ dq[NUQ_KNOTS] =
+ cuml_bins_ptr[NUQ_KNOTS - 1] + ROUND_POWER_OF_TWO((64 - doff) * q, 7);
+}
+
+tran_low_t av1_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq) {
+ if (v <= NUQ_KNOTS)
+ return dq[v];
+ else
+ return dq[NUQ_KNOTS] + (v - NUQ_KNOTS) * q;
+}
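+// Example: assuming NUQ_KNOTS == 3 (as the three-entry knot rows above
+// imply), a magnitude v == 5 dequantizes to dq[3] + (5 - 3) * q, i.e.
+// reconstruction is linear with slope q beyond the last knot.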
+
+tran_low_t av1_dequant_coeff_nuq(int v, int q, const tran_low_t *dq) {
+ tran_low_t dqmag = av1_dequant_abscoeff_nuq(abs(v), q, dq);
+ return (v < 0 ? -dqmag : dqmag);
+}
+#endif // CONFIG_NEW_QUANT
+
+static const int16_t dc_qlookup[QINDEX_RANGE] = {
+ 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18,
+ 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30,
+ 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42,
+ 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53,
+ 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65,
+ 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76,
+ 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88,
+ 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110,
+ 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134,
+ 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164,
+ 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202,
+ 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247,
+ 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300,
+ 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364,
+ 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441,
+ 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549,
+ 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736,
+ 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139,
+ 1184, 1232, 1282, 1336,
+};
+
+#if CONFIG_HIGHBITDEPTH
+static const int16_t dc_qlookup_10[QINDEX_RANGE] = {
+ 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37,
+ 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82,
+ 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132,
+ 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182,
+ 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230,
+ 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276,
+ 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321,
+ 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387,
+ 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466,
+ 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567,
+ 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687,
+ 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831,
+ 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001,
+ 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202,
+ 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436,
+ 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717,
+ 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088,
+ 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675,
+ 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823,
+ 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347,
+};
+
+static const int16_t dc_qlookup_12[QINDEX_RANGE] = {
+ 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91,
+ 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237,
+ 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405,
+ 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580,
+ 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752,
+ 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919,
+ 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080,
+ 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234,
+ 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419,
+ 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692,
+ 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957,
+ 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334,
+ 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746,
+ 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226,
+ 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788,
+ 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420,
+ 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153,
+ 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984,
+ 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966,
+ 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214,
+ 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031,
+ 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118,
+ 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949,
+ 19718, 20521, 21387,
+};
+#endif
+
+static const int16_t ac_qlookup[QINDEX_RANGE] = {
+ 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+ 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
+ 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118,
+ 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144,
+ 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179,
+ 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223,
+ 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280,
+ 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353,
+ 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448,
+ 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571,
+ 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729,
+ 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933,
+ 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196,
+ 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537,
+ 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+};
+
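+/* As with the DC tables, the high-bitdepth AC tables are roughly 4x (10-bit)
+ * and 16x (12-bit) scaled versions of the 8-bit table. */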
+#if CONFIG_HIGHBITDEPTH
+static const int16_t ac_qlookup_10[QINDEX_RANGE] = {
+ 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40,
+ 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92,
+ 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149,
+ 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208,
+ 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267,
+ 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324,
+ 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379,
+ 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466,
+ 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571,
+ 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713,
+ 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889,
+ 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118,
+ 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411,
+ 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791,
+ 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283,
+ 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915,
+ 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731,
+ 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784,
+ 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148,
+ 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312,
+};
+
+static const int16_t ac_qlookup_12[QINDEX_RANGE] = {
+ 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99,
+ 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263,
+ 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456,
+ 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660,
+ 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865,
+ 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067,
+ 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264,
+ 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457,
+ 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693,
+ 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052,
+ 2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411,
+ 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943,
+ 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555,
+ 3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310,
+ 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256,
+ 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410,
+ 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867,
+ 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660,
+ 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885,
+ 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637,
+ 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062,
+ 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334,
+ 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599,
+ 28143, 28687, 29247,
+};
+#endif
+
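+/* Map a quantizer index plus a per-plane delta to the DC quantization step
+ * size for the given bit depth; the sum is clamped to [0, MAXQ] before the
+ * table lookup. */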
+int16_t av1_dc_quant(int qindex, int delta, aom_bit_depth_t bit_depth) {
+#if CONFIG_HIGHBITDEPTH
+ switch (bit_depth) {
+ case AOM_BITS_8: return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
+ case AOM_BITS_10: return dc_qlookup_10[clamp(qindex + delta, 0, MAXQ)];
+ case AOM_BITS_12: return dc_qlookup_12[clamp(qindex + delta, 0, MAXQ)];
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+#else
+ (void)bit_depth;
+ return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
+#endif
+}
+
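+/* Same mapping as av1_dc_quant(), but for AC coefficients. */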
+int16_t av1_ac_quant(int qindex, int delta, aom_bit_depth_t bit_depth) {
+#if CONFIG_HIGHBITDEPTH
+ switch (bit_depth) {
+ case AOM_BITS_8: return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
+ case AOM_BITS_10: return ac_qlookup_10[clamp(qindex + delta, 0, MAXQ)];
+ case AOM_BITS_12: return ac_qlookup_12[clamp(qindex + delta, 0, MAXQ)];
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+#else
+ (void)bit_depth;
+ return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
+#endif
+}
+
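+/* Inverse mapping: return the smallest qindex whose AC step size is at least
+ * the requested value. The input is first rescaled into each table's units
+ * (4x for 8-bit, a further 4x at 10-bit and 16x at 12-bit). */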
+int16_t av1_qindex_from_ac(int ac, aom_bit_depth_t bit_depth) {
+ int i;
+ const int16_t *tab = ac_qlookup;
+ ac *= 4;
+#if CONFIG_HIGHBITDEPTH
+  switch (bit_depth) {
+    case AOM_BITS_8: break;
+    case AOM_BITS_10: {
+      tab = ac_qlookup_10;
+      ac *= 4;
+      break;
+    }
+    case AOM_BITS_12: {
+      tab = ac_qlookup_12;
+      ac *= 16;
+      break;
+    }
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
+#else
+  (void)bit_depth;
+#endif
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ if (ac <= tab[i]) return i;
+ }
+ return QINDEX_RANGE - 1;
+}
+
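+/* Return the effective qindex for a segment: an absolute or delta-coded
+ * per-segment value when SEG_LVL_ALT_Q is active, otherwise the base
+ * qindex. */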
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+ int base_qindex) {
+ if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
+ const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+ const int seg_qindex =
+ seg->abs_delta == SEGMENT_ABSDATA ? data : base_qindex + data;
+ return clamp(seg_qindex, 0, MAXQ);
+ } else {
+ return base_qindex;
+ }
+}
+
+#if CONFIG_AOM_QM
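+/* Accessors for the global (inverse) quantization weight matrices, indexed
+ * by QM level, plane type (luma/chroma), intra/inter, and transform size. */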
+qm_val_t *aom_iqmatrix(AV1_COMMON *cm, int qmlevel, int is_chroma,
+ int log2sizem2, int is_intra) {
+ return &cm->giqmatrix[qmlevel][!!is_chroma][!!is_intra][log2sizem2][0];
+}
+qm_val_t *aom_qmatrix(AV1_COMMON *cm, int qmlevel, int is_chroma,
+ int log2sizem2, int is_intra) {
+ return &cm->gqmatrix[qmlevel][!!is_chroma][!!is_intra][log2sizem2][0];
+}
+
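+/* Backing storage for the (inverse) weight matrices. These are tentative
+ * definitions; the initialized data is provided further below. */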
+static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][2]
+ [4 * 4 + 8 * 8 + 16 * 16 + 32 * 32];
+static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][2]
+ [4 * 4 + 8 * 8 + 16 * 16 + 32 * 32];
+
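+/* Point the per-transform-size matrix pointers into the flat storage above:
+ * for each (level, plane, intra/inter) bucket the 4x4, 8x8, 16x16 and 32x32
+ * matrices are packed back to back. */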
+void aom_qm_init(AV1_COMMON *cm) {
+ int q, c, f, t, size;
+ int current;
+ for (q = 0; q < NUM_QM_LEVELS; ++q) {
+ for (c = 0; c < 2; ++c) {
+ for (f = 0; f < 2; ++f) {
+ current = 0;
+ for (t = 0; t < TX_SIZES; ++t) {
+ size = 1 << (t + 2);
+ cm->gqmatrix[q][c][f][t] = &wt_matrix_ref[q][c][f][current];
+ cm->giqmatrix[q][c][f][t] = &iwt_matrix_ref[q][c][f][current];
+ current += size * size;
+ }
+ }
+ }
+ }
+}
+
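+/* Initialized inverse weight matrices, one set per QM level. Within each
+ * (plane, intra/inter) bucket the 4x4, 8x8, 16x16 and 32x32 matrices are
+ * stored back to back, matching the offsets computed in aom_qm_init(). */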
+static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][2][4 * 4 + 8 * 8 + 16 * 16 +
+ 32 * 32] = {
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 71, 124, 214, 71, 112, 165, 241, 124, 165, 254, 331, 214, 241, 331,
+ 414,
+ /* Size 8 */
+ 64, 47, 51, 69, 97, 132, 173, 218, 47, 54, 52, 62, 81, 109, 142, 181,
+ 51, 52, 75, 90, 108, 133, 165, 201, 69, 62, 90, 119, 144, 169, 198, 232,
+ 97, 81, 108, 144, 178, 208, 238, 268, 132, 109, 133, 169, 208, 244, 276,
+ 305, 173, 142, 165, 198, 238, 276, 309, 338, 218, 181, 201, 232, 268,
+ 305, 338, 367,
+ /* Size 16 */
+ 64, 54, 47, 49, 51, 59, 69, 81, 97, 111, 132, 150, 173, 193, 218, 218,
+ 54, 52, 50, 51, 51, 58, 65, 75, 88, 101, 119, 135, 156, 175, 198, 198,
+ 47, 50, 54, 53, 52, 56, 62, 70, 81, 93, 109, 123, 142, 159, 181, 181,
+ 49, 51, 53, 57, 61, 67, 73, 82, 93, 104, 120, 134, 153, 170, 191, 191,
+ 51, 51, 52, 61, 75, 82, 90, 98, 108, 119, 133, 147, 165, 181, 201, 201,
+ 59, 58, 56, 67, 82, 91, 102, 112, 123, 135, 149, 163, 180, 196, 215,
+ 215, 69, 65, 62, 73, 90, 102, 119, 130, 144, 155, 169, 182, 198, 214,
+ 232, 232, 81, 75, 70, 82, 98, 112, 130, 143, 159, 172, 186, 200, 216,
+ 231, 249, 249, 97, 88, 81, 93, 108, 123, 144, 159, 178, 192, 208, 222,
+ 238, 252, 268, 268, 111, 101, 93, 104, 119, 135, 155, 172, 192, 207,
+ 225, 239, 255, 269, 285, 285, 132, 119, 109, 120, 133, 149, 169, 186,
+ 208, 225, 244, 259, 276, 290, 305, 305, 150, 135, 123, 134, 147, 163,
+ 182, 200, 222, 239, 259, 274, 291, 305, 321, 321, 173, 156, 142, 153,
+ 165, 180, 198, 216, 238, 255, 276, 291, 309, 323, 338, 338, 193, 175,
+ 159, 170, 181, 196, 214, 231, 252, 269, 290, 305, 323, 337, 352, 352,
+ 218, 198, 181, 191, 201, 215, 232, 249, 268, 285, 305, 321, 338, 352,
+ 367, 367, 218, 198, 181, 191, 201, 215, 232, 249, 268, 285, 305, 321,
+ 338, 352, 367, 367,
+ /* Size 32 */
+ 64, 59, 54, 50, 47, 48, 49, 50, 51, 55, 59, 63, 69, 74, 81, 88, 97, 104,
+ 111, 121, 132, 140, 150, 161, 173, 183, 193, 205, 218, 218, 218, 218,
+ 59, 56, 53, 51, 49, 49, 50, 51, 51, 54, 58, 62, 67, 72, 78, 84, 92, 99,
+ 106, 115, 125, 133, 142, 152, 164, 173, 183, 195, 208, 208, 208, 208,
+ 54, 53, 52, 51, 50, 51, 51, 51, 51, 54, 58, 61, 65, 70, 75, 81, 88, 94,
+ 101, 110, 119, 127, 135, 145, 156, 165, 175, 186, 198, 198, 198, 198,
+ 50, 51, 51, 52, 52, 52, 52, 52, 52, 54, 57, 60, 63, 68, 72, 78, 85, 90,
+ 97, 105, 114, 121, 129, 138, 149, 157, 167, 177, 189, 189, 189, 189, 47,
+ 49, 50, 52, 54, 54, 53, 52, 52, 54, 56, 59, 62, 66, 70, 75, 81, 87, 93,
+ 100, 109, 115, 123, 132, 142, 150, 159, 170, 181, 181, 181, 181, 48, 49,
+ 51, 52, 54, 54, 55, 56, 56, 59, 61, 64, 67, 71, 76, 81, 87, 92, 98, 105,
+ 114, 121, 128, 137, 147, 155, 164, 174, 186, 186, 186, 186, 49, 50, 51,
+ 52, 53, 55, 57, 59, 61, 64, 67, 70, 73, 77, 82, 87, 93, 98, 104, 111,
+ 120, 126, 134, 143, 153, 161, 170, 179, 191, 191, 191, 191, 50, 51, 51,
+ 52, 52, 56, 59, 63, 68, 71, 74, 77, 81, 85, 89, 94, 100, 105, 111, 118,
+ 126, 133, 140, 149, 158, 166, 175, 185, 196, 196, 196, 196, 51, 51, 51,
+ 52, 52, 56, 61, 68, 75, 79, 82, 86, 90, 94, 98, 103, 108, 113, 119, 126,
+ 133, 140, 147, 155, 165, 172, 181, 191, 201, 201, 201, 201, 55, 54, 54,
+ 54, 54, 59, 64, 71, 79, 82, 86, 91, 96, 100, 105, 110, 115, 120, 126,
+ 133, 140, 147, 155, 163, 172, 180, 188, 198, 208, 208, 208, 208, 59, 58,
+ 58, 57, 56, 61, 67, 74, 82, 86, 91, 96, 102, 107, 112, 117, 123, 129,
+ 135, 141, 149, 156, 163, 171, 180, 188, 196, 205, 215, 215, 215, 215,
+ 63, 62, 61, 60, 59, 64, 70, 77, 86, 91, 96, 103, 110, 115, 120, 126,
+ 133, 138, 144, 151, 158, 165, 172, 180, 189, 196, 204, 213, 223, 223,
+ 223, 223, 69, 67, 65, 63, 62, 67, 73, 81, 90, 96, 102, 110, 119, 124,
+ 130, 137, 144, 149, 155, 162, 169, 175, 182, 190, 198, 206, 214, 222,
+ 232, 232, 232, 232, 74, 72, 70, 68, 66, 71, 77, 85, 94, 100, 107, 115,
+ 124, 130, 136, 143, 151, 157, 163, 170, 177, 184, 191, 199, 207, 214,
+ 222, 231, 240, 240, 240, 240, 81, 78, 75, 72, 70, 76, 82, 89, 98, 105,
+ 112, 120, 130, 136, 143, 151, 159, 165, 172, 179, 186, 193, 200, 208,
+ 216, 223, 231, 240, 249, 249, 249, 249, 88, 84, 81, 78, 75, 81, 87, 94,
+ 103, 110, 117, 126, 137, 143, 151, 159, 168, 174, 181, 189, 197, 203,
+ 211, 218, 226, 234, 241, 249, 258, 258, 258, 258, 97, 92, 88, 85, 81,
+ 87, 93, 100, 108, 115, 123, 133, 144, 151, 159, 168, 178, 184, 192, 200,
+ 208, 215, 222, 229, 238, 245, 252, 260, 268, 268, 268, 268, 104, 99, 94,
+ 90, 87, 92, 98, 105, 113, 120, 129, 138, 149, 157, 165, 174, 184, 191,
+ 199, 207, 216, 223, 230, 238, 246, 253, 260, 268, 276, 276, 276, 276,
+ 111, 106, 101, 97, 93, 98, 104, 111, 119, 126, 135, 144, 155, 163, 172,
+ 181, 192, 199, 207, 215, 225, 232, 239, 247, 255, 262, 269, 277, 285,
+ 285, 285, 285, 121, 115, 110, 105, 100, 105, 111, 118, 126, 133, 141,
+ 151, 162, 170, 179, 189, 200, 207, 215, 224, 234, 241, 248, 256, 265,
+ 272, 279, 287, 295, 295, 295, 295, 132, 125, 119, 114, 109, 114, 120,
+ 126, 133, 140, 149, 158, 169, 177, 186, 197, 208, 216, 225, 234, 244,
+ 251, 259, 267, 276, 282, 290, 297, 305, 305, 305, 305, 140, 133, 127,
+ 121, 115, 121, 126, 133, 140, 147, 156, 165, 175, 184, 193, 203, 215,
+ 223, 232, 241, 251, 258, 266, 275, 283, 290, 297, 305, 313, 313, 313,
+ 313, 150, 142, 135, 129, 123, 128, 134, 140, 147, 155, 163, 172, 182,
+ 191, 200, 211, 222, 230, 239, 248, 259, 266, 274, 283, 291, 298, 305,
+ 313, 321, 321, 321, 321, 161, 152, 145, 138, 132, 137, 143, 149, 155,
+ 163, 171, 180, 190, 199, 208, 218, 229, 238, 247, 256, 267, 275, 283,
+ 291, 300, 307, 314, 322, 329, 329, 329, 329, 173, 164, 156, 149, 142,
+ 147, 153, 158, 165, 172, 180, 189, 198, 207, 216, 226, 238, 246, 255,
+ 265, 276, 283, 291, 300, 309, 316, 323, 331, 338, 338, 338, 338, 183,
+ 173, 165, 157, 150, 155, 161, 166, 172, 180, 188, 196, 206, 214, 223,
+ 234, 245, 253, 262, 272, 282, 290, 298, 307, 316, 323, 330, 337, 345,
+ 345, 345, 345, 193, 183, 175, 167, 159, 164, 170, 175, 181, 188, 196,
+ 204, 214, 222, 231, 241, 252, 260, 269, 279, 290, 297, 305, 314, 323,
+ 330, 337, 345, 352, 352, 352, 352, 205, 195, 186, 177, 170, 174, 179,
+ 185, 191, 198, 205, 213, 222, 231, 240, 249, 260, 268, 277, 287, 297,
+ 305, 313, 322, 331, 337, 345, 352, 360, 360, 360, 360, 218, 208, 198,
+ 189, 181, 186, 191, 196, 201, 208, 215, 223, 232, 240, 249, 258, 268,
+ 276, 285, 295, 305, 313, 321, 329, 338, 345, 352, 360, 367, 367, 367,
+ 367, 218, 208, 198, 189, 181, 186, 191, 196, 201, 208, 215, 223, 232,
+ 240, 249, 258, 268, 276, 285, 295, 305, 313, 321, 329, 338, 345, 352,
+ 360, 367, 367, 367, 367, 218, 208, 198, 189, 181, 186, 191, 196, 201,
+ 208, 215, 223, 232, 240, 249, 258, 268, 276, 285, 295, 305, 313, 321,
+ 329, 338, 345, 352, 360, 367, 367, 367, 367, 218, 208, 198, 189, 181,
+ 186, 191, 196, 201, 208, 215, 223, 232, 240, 249, 258, 268, 276, 285,
+ 295, 305, 313, 321, 329, 338, 345, 352, 360, 367, 367, 367, 367 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 16, 18, 33, 60, 18, 29, 45, 68, 33, 45, 72, 98, 60, 68, 98, 129,
+ /* Size 8 */
+ 20, 14, 16, 21, 31, 43, 58, 75, 14, 17, 16, 19, 25, 35, 46, 61, 16, 16,
+ 24, 28, 34, 43, 54, 68, 21, 19, 28, 38, 47, 56, 67, 80, 31, 25, 34, 47,
+ 59, 71, 83, 95, 43, 35, 43, 56, 71, 85, 99, 112, 58, 46, 54, 67, 83, 99,
+ 113, 127, 75, 61, 68, 80, 95, 112, 127, 141,
+ /* Size 16 */
+ 19, 16, 14, 14, 15, 17, 20, 24, 29, 34, 41, 47, 55, 62, 71, 71, 16, 15,
+ 15, 15, 15, 17, 19, 22, 26, 31, 36, 42, 49, 55, 64, 64, 14, 15, 16, 16,
+ 15, 17, 18, 21, 24, 28, 33, 38, 44, 50, 58, 58, 14, 15, 16, 17, 18, 20,
+ 22, 24, 28, 32, 37, 41, 48, 54, 61, 61, 15, 15, 15, 18, 22, 24, 27, 30,
+ 33, 36, 41, 46, 52, 58, 65, 65, 17, 17, 17, 20, 24, 27, 31, 34, 38, 42,
+ 46, 51, 57, 63, 70, 70, 20, 19, 18, 22, 27, 31, 36, 40, 45, 49, 53, 58,
+ 64, 70, 76, 76, 24, 22, 21, 24, 30, 34, 40, 44, 50, 54, 60, 65, 71, 76,
+ 83, 83, 29, 26, 24, 28, 33, 38, 45, 50, 56, 61, 67, 73, 79, 84, 91, 91,
+ 34, 31, 28, 32, 36, 42, 49, 54, 61, 67, 74, 79, 86, 91, 98, 98, 41, 36,
+ 33, 37, 41, 46, 53, 60, 67, 74, 81, 87, 94, 100, 106, 106, 47, 42, 38,
+ 41, 46, 51, 58, 65, 73, 79, 87, 93, 100, 106, 113, 113, 55, 49, 44, 48,
+ 52, 57, 64, 71, 79, 86, 94, 100, 108, 114, 121, 121, 62, 55, 50, 54, 58,
+ 63, 70, 76, 84, 91, 100, 106, 114, 120, 127, 127, 71, 64, 58, 61, 65,
+ 70, 76, 83, 91, 98, 106, 113, 121, 127, 134, 134, 71, 64, 58, 61, 65,
+ 70, 76, 83, 91, 98, 106, 113, 121, 127, 134, 134,
+ /* Size 32 */
+ 18, 17, 15, 14, 13, 14, 14, 14, 15, 16, 17, 18, 20, 22, 23, 26, 28, 30,
+ 33, 36, 40, 42, 45, 49, 53, 57, 60, 65, 69, 69, 69, 69, 17, 16, 15, 14,
+ 14, 14, 14, 14, 15, 16, 17, 18, 19, 21, 23, 25, 27, 29, 31, 34, 37, 40,
+ 43, 46, 50, 53, 57, 61, 66, 66, 66, 66, 15, 15, 15, 15, 14, 14, 14, 15,
+ 15, 15, 16, 18, 19, 20, 22, 24, 26, 28, 30, 32, 35, 38, 41, 44, 48, 51,
+ 54, 58, 62, 62, 62, 62, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 17,
+ 18, 19, 21, 23, 25, 26, 28, 31, 34, 36, 39, 42, 45, 48, 51, 55, 59, 59,
+ 59, 59, 13, 14, 14, 15, 16, 15, 15, 15, 15, 15, 16, 17, 18, 19, 20, 22,
+ 24, 25, 27, 29, 32, 34, 37, 40, 43, 46, 49, 52, 56, 56, 56, 56, 14, 14,
+ 14, 15, 15, 16, 16, 16, 16, 17, 18, 18, 19, 20, 22, 23, 25, 27, 29, 31,
+ 34, 36, 38, 41, 45, 47, 50, 54, 58, 58, 58, 58, 14, 14, 14, 15, 15, 16,
+ 16, 17, 18, 18, 19, 20, 21, 22, 24, 25, 27, 29, 31, 33, 36, 38, 40, 43,
+ 46, 49, 52, 56, 60, 60, 60, 60, 14, 14, 15, 15, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 25, 26, 28, 29, 31, 33, 35, 38, 40, 42, 45, 48, 51, 54, 57,
+ 61, 61, 61, 61, 15, 15, 15, 15, 15, 16, 18, 19, 22, 23, 24, 25, 26, 27,
+ 29, 30, 32, 34, 35, 38, 40, 42, 45, 47, 50, 53, 56, 59, 63, 63, 63, 63,
+ 16, 16, 15, 15, 15, 17, 18, 20, 23, 24, 25, 27, 28, 29, 31, 32, 34, 36,
+ 38, 40, 42, 45, 47, 50, 53, 56, 59, 62, 66, 66, 66, 66, 17, 17, 16, 16,
+ 16, 18, 19, 21, 24, 25, 27, 28, 30, 32, 33, 35, 37, 39, 41, 43, 45, 47,
+ 50, 53, 56, 58, 61, 65, 68, 68, 68, 68, 18, 18, 18, 17, 17, 18, 20, 22,
+ 25, 27, 28, 30, 33, 34, 36, 38, 40, 42, 44, 46, 48, 51, 53, 56, 59, 62,
+ 64, 68, 71, 71, 71, 71, 20, 19, 19, 18, 18, 19, 21, 23, 26, 28, 30, 33,
+ 35, 37, 39, 41, 43, 45, 47, 50, 52, 54, 57, 59, 62, 65, 68, 71, 74, 74,
+ 74, 74, 22, 21, 20, 19, 19, 20, 22, 25, 27, 29, 32, 34, 37, 39, 41, 43,
+ 46, 48, 50, 52, 55, 57, 60, 62, 65, 68, 71, 74, 77, 77, 77, 77, 23, 23,
+ 22, 21, 20, 22, 24, 26, 29, 31, 33, 36, 39, 41, 43, 46, 49, 51, 53, 55,
+ 58, 60, 63, 66, 69, 71, 74, 77, 81, 81, 81, 81, 26, 25, 24, 23, 22, 23,
+ 25, 28, 30, 32, 35, 38, 41, 43, 46, 48, 52, 54, 56, 59, 62, 64, 67, 69,
+ 72, 75, 78, 81, 84, 84, 84, 84, 28, 27, 26, 25, 24, 25, 27, 29, 32, 34,
+ 37, 40, 43, 46, 49, 52, 55, 57, 60, 63, 66, 68, 71, 74, 77, 79, 82, 85,
+ 88, 88, 88, 88, 30, 29, 28, 26, 25, 27, 29, 31, 34, 36, 39, 42, 45, 48,
+ 51, 54, 57, 60, 62, 65, 69, 71, 74, 77, 80, 83, 85, 88, 92, 92, 92, 92,
+ 33, 31, 30, 28, 27, 29, 31, 33, 35, 38, 41, 44, 47, 50, 53, 56, 60, 62,
+ 65, 68, 72, 74, 77, 80, 83, 86, 89, 92, 95, 95, 95, 95, 36, 34, 32, 31,
+ 29, 31, 33, 35, 38, 40, 43, 46, 50, 52, 55, 59, 63, 65, 68, 72, 75, 78,
+ 81, 84, 87, 90, 93, 96, 99, 99, 99, 99, 40, 37, 35, 34, 32, 34, 36, 38,
+ 40, 42, 45, 48, 52, 55, 58, 62, 66, 69, 72, 75, 79, 82, 85, 88, 91, 94,
+ 97, 100, 103, 103, 103, 103, 42, 40, 38, 36, 34, 36, 38, 40, 42, 45, 47,
+ 51, 54, 57, 60, 64, 68, 71, 74, 78, 82, 85, 88, 91, 94, 97, 100, 103,
+ 107, 107, 107, 107, 45, 43, 41, 39, 37, 38, 40, 42, 45, 47, 50, 53, 57,
+ 60, 63, 67, 71, 74, 77, 81, 85, 88, 91, 94, 98, 101, 104, 107, 110, 110,
+ 110, 110, 49, 46, 44, 42, 40, 41, 43, 45, 47, 50, 53, 56, 59, 62, 66,
+ 69, 74, 77, 80, 84, 88, 91, 94, 98, 101, 104, 107, 110, 114, 114, 114,
+ 114, 53, 50, 48, 45, 43, 45, 46, 48, 50, 53, 56, 59, 62, 65, 69, 72, 77,
+ 80, 83, 87, 91, 94, 98, 101, 105, 108, 111, 114, 118, 118, 118, 118, 57,
+ 53, 51, 48, 46, 47, 49, 51, 53, 56, 58, 62, 65, 68, 71, 75, 79, 83, 86,
+ 90, 94, 97, 101, 104, 108, 111, 114, 117, 121, 121, 121, 121, 60, 57,
+ 54, 51, 49, 50, 52, 54, 56, 59, 61, 64, 68, 71, 74, 78, 82, 85, 89, 93,
+ 97, 100, 104, 107, 111, 114, 117, 120, 124, 124, 124, 124, 65, 61, 58,
+ 55, 52, 54, 56, 57, 59, 62, 65, 68, 71, 74, 77, 81, 85, 88, 92, 96, 100,
+ 103, 107, 110, 114, 117, 120, 124, 127, 127, 127, 127, 69, 66, 62, 59,
+ 56, 58, 60, 61, 63, 66, 68, 71, 74, 77, 81, 84, 88, 92, 95, 99, 103,
+ 107, 110, 114, 118, 121, 124, 127, 130, 130, 130, 130, 69, 66, 62, 59,
+ 56, 58, 60, 61, 63, 66, 68, 71, 74, 77, 81, 84, 88, 92, 95, 99, 103,
+ 107, 110, 114, 118, 121, 124, 127, 130, 130, 130, 130, 69, 66, 62, 59,
+ 56, 58, 60, 61, 63, 66, 68, 71, 74, 77, 81, 84, 88, 92, 95, 99, 103,
+ 107, 110, 114, 118, 121, 124, 127, 130, 130, 130, 130, 69, 66, 62, 59,
+ 56, 58, 60, 61, 63, 66, 68, 71, 74, 77, 81, 84, 88, 92, 95, 99, 103,
+ 107, 110, 114, 118, 121, 124, 127, 130, 130, 130, 130 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 106, 117, 154, 106, 131, 141, 167, 117, 141, 191, 225, 154, 167,
+ 225, 279,
+ /* Size 8 */
+ 64, 51, 98, 104, 113, 128, 148, 172, 51, 76, 100, 89, 92, 103, 118, 136,
+ 98, 100, 119, 115, 114, 121, 134, 151, 104, 89, 115, 132, 140, 147, 158,
+ 173, 113, 92, 114, 140, 160, 174, 186, 201, 128, 103, 121, 147, 174,
+ 195, 213, 229, 148, 118, 134, 158, 186, 213, 236, 256, 172, 136, 151,
+ 173, 201, 229, 256, 280,
+ /* Size 16 */
+ 64, 57, 51, 67, 98, 101, 104, 108, 113, 120, 128, 137, 148, 159, 172,
+ 172, 57, 59, 61, 75, 99, 97, 96, 99, 101, 107, 114, 122, 131, 141, 152,
+ 152, 51, 61, 76, 86, 100, 94, 89, 91, 92, 97, 103, 110, 118, 126, 136,
+ 136, 67, 75, 86, 96, 109, 104, 100, 101, 102, 106, 111, 118, 125, 134,
+ 143, 143, 98, 99, 100, 109, 119, 117, 115, 115, 114, 118, 121, 127, 134,
+ 142, 151, 151, 101, 97, 94, 104, 117, 120, 123, 124, 126, 129, 133, 139,
+ 145, 153, 161, 161, 104, 96, 89, 100, 115, 123, 132, 136, 140, 144, 147,
+ 153, 158, 165, 173, 173, 108, 99, 91, 101, 115, 124, 136, 142, 149, 154,
+ 160, 165, 171, 178, 186, 186, 113, 101, 92, 102, 114, 126, 140, 149,
+ 160, 167, 174, 180, 186, 193, 201, 201, 120, 107, 97, 106, 118, 129,
+ 144, 154, 167, 175, 184, 191, 199, 206, 214, 214, 128, 114, 103, 111,
+ 121, 133, 147, 160, 174, 184, 195, 204, 213, 221, 229, 229, 137, 122,
+ 110, 118, 127, 139, 153, 165, 180, 191, 204, 213, 224, 233, 242, 242,
+ 148, 131, 118, 125, 134, 145, 158, 171, 186, 199, 213, 224, 236, 246,
+ 256, 256, 159, 141, 126, 134, 142, 153, 165, 178, 193, 206, 221, 233,
+ 246, 256, 267, 267, 172, 152, 136, 143, 151, 161, 173, 186, 201, 214,
+ 229, 242, 256, 267, 280, 280, 172, 152, 136, 143, 151, 161, 173, 186,
+ 201, 214, 229, 242, 256, 267, 280, 280,
+ /* Size 32 */
+ 64, 60, 57, 54, 51, 58, 67, 79, 98, 99, 101, 103, 104, 106, 108, 110,
+ 113, 116, 120, 124, 128, 133, 137, 143, 148, 153, 159, 165, 172, 172,
+ 172, 172, 60, 59, 58, 56, 55, 62, 71, 82, 98, 99, 99, 100, 100, 102,
+ 103, 105, 107, 110, 113, 117, 121, 125, 129, 134, 139, 144, 149, 155,
+ 161, 161, 161, 161, 57, 58, 59, 60, 61, 67, 75, 85, 99, 98, 97, 97, 96,
+ 97, 99, 100, 101, 104, 107, 110, 114, 118, 122, 126, 131, 136, 141, 146,
+ 152, 152, 152, 152, 54, 56, 60, 63, 67, 73, 80, 89, 99, 97, 96, 94, 93,
+ 94, 94, 95, 97, 99, 102, 105, 108, 112, 115, 120, 124, 128, 133, 138,
+ 144, 144, 144, 144, 51, 55, 61, 67, 76, 81, 86, 92, 100, 97, 94, 92, 89,
+ 90, 91, 91, 92, 95, 97, 100, 103, 106, 110, 113, 118, 122, 126, 131,
+ 136, 136, 136, 136, 58, 62, 67, 73, 81, 85, 91, 97, 104, 101, 99, 97,
+ 94, 95, 96, 96, 97, 99, 101, 104, 107, 110, 113, 117, 121, 125, 130,
+ 134, 140, 140, 140, 140, 67, 71, 75, 80, 86, 91, 96, 102, 109, 106, 104,
+ 102, 100, 101, 101, 102, 102, 104, 106, 109, 111, 114, 118, 121, 125,
+ 129, 134, 138, 143, 143, 143, 143, 79, 82, 85, 89, 92, 97, 102, 108,
+ 114, 112, 110, 109, 107, 107, 107, 108, 108, 110, 112, 114, 116, 119,
+ 122, 126, 129, 133, 138, 142, 147, 147, 147, 147, 98, 98, 99, 99, 100,
+ 104, 109, 114, 119, 118, 117, 116, 115, 115, 115, 114, 114, 116, 118,
+ 119, 121, 124, 127, 130, 134, 138, 142, 146, 151, 151, 151, 151, 99, 99,
+ 98, 97, 97, 101, 106, 112, 118, 118, 118, 119, 119, 119, 119, 120, 120,
+ 122, 123, 125, 127, 130, 133, 136, 139, 143, 147, 151, 156, 156, 156,
+ 156, 101, 99, 97, 96, 94, 99, 104, 110, 117, 118, 120, 121, 123, 124,
+ 124, 125, 126, 128, 129, 131, 133, 136, 139, 142, 145, 149, 153, 157,
+ 161, 161, 161, 161, 103, 100, 97, 94, 92, 97, 102, 109, 116, 119, 121,
+ 124, 127, 129, 130, 131, 133, 134, 136, 138, 140, 143, 145, 148, 151,
+ 155, 159, 163, 167, 167, 167, 167, 104, 100, 96, 93, 89, 94, 100, 107,
+ 115, 119, 123, 127, 132, 134, 136, 138, 140, 142, 144, 146, 147, 150,
+ 153, 155, 158, 162, 165, 169, 173, 173, 173, 173, 106, 102, 97, 94, 90,
+ 95, 101, 107, 115, 119, 124, 129, 134, 137, 139, 142, 145, 147, 149,
+ 151, 153, 156, 159, 162, 164, 168, 172, 175, 179, 179, 179, 179, 108,
+ 103, 99, 94, 91, 96, 101, 107, 115, 119, 124, 130, 136, 139, 142, 146,
+ 149, 152, 154, 157, 160, 162, 165, 168, 171, 175, 178, 182, 186, 186,
+ 186, 186, 110, 105, 100, 95, 91, 96, 102, 108, 114, 120, 125, 131, 138,
+ 142, 146, 150, 154, 157, 160, 163, 166, 169, 172, 175, 178, 182, 185,
+ 189, 193, 193, 193, 193, 113, 107, 101, 97, 92, 97, 102, 108, 114, 120,
+ 126, 133, 140, 145, 149, 154, 160, 163, 167, 170, 174, 177, 180, 183,
+ 186, 190, 193, 197, 201, 201, 201, 201, 116, 110, 104, 99, 95, 99, 104,
+ 110, 116, 122, 128, 134, 142, 147, 152, 157, 163, 167, 171, 175, 179,
+ 182, 185, 189, 192, 196, 199, 203, 207, 207, 207, 207, 120, 113, 107,
+ 102, 97, 101, 106, 112, 118, 123, 129, 136, 144, 149, 154, 160, 167,
+ 171, 175, 179, 184, 187, 191, 195, 199, 202, 206, 210, 214, 214, 214,
+ 214, 124, 117, 110, 105, 100, 104, 109, 114, 119, 125, 131, 138, 146,
+ 151, 157, 163, 170, 175, 179, 184, 190, 193, 197, 201, 206, 209, 213,
+ 217, 221, 221, 221, 221, 128, 121, 114, 108, 103, 107, 111, 116, 121,
+ 127, 133, 140, 147, 153, 160, 166, 174, 179, 184, 190, 195, 200, 204,
+ 208, 213, 217, 221, 225, 229, 229, 229, 229, 133, 125, 118, 112, 106,
+ 110, 114, 119, 124, 130, 136, 143, 150, 156, 162, 169, 177, 182, 187,
+ 193, 200, 204, 209, 213, 218, 222, 227, 231, 235, 235, 235, 235, 137,
+ 129, 122, 115, 110, 113, 118, 122, 127, 133, 139, 145, 153, 159, 165,
+ 172, 180, 185, 191, 197, 204, 209, 213, 219, 224, 228, 233, 237, 242,
+ 242, 242, 242, 143, 134, 126, 120, 113, 117, 121, 126, 130, 136, 142,
+ 148, 155, 162, 168, 175, 183, 189, 195, 201, 208, 213, 219, 224, 230,
+ 234, 239, 244, 249, 249, 249, 249, 148, 139, 131, 124, 118, 121, 125,
+ 129, 134, 139, 145, 151, 158, 164, 171, 178, 186, 192, 199, 206, 213,
+ 218, 224, 230, 236, 241, 246, 251, 256, 256, 256, 256, 153, 144, 136,
+ 128, 122, 125, 129, 133, 138, 143, 149, 155, 162, 168, 175, 182, 190,
+ 196, 202, 209, 217, 222, 228, 234, 241, 246, 251, 256, 262, 262, 262,
+ 262, 159, 149, 141, 133, 126, 130, 134, 138, 142, 147, 153, 159, 165,
+ 172, 178, 185, 193, 199, 206, 213, 221, 227, 233, 239, 246, 251, 256,
+ 262, 267, 267, 267, 267, 165, 155, 146, 138, 131, 134, 138, 142, 146,
+ 151, 157, 163, 169, 175, 182, 189, 197, 203, 210, 217, 225, 231, 237,
+ 244, 251, 256, 262, 267, 273, 273, 273, 273, 172, 161, 152, 144, 136,
+ 140, 143, 147, 151, 156, 161, 167, 173, 179, 186, 193, 201, 207, 214,
+ 221, 229, 235, 242, 249, 256, 262, 267, 273, 280, 280, 280, 280, 172,
+ 161, 152, 144, 136, 140, 143, 147, 151, 156, 161, 167, 173, 179, 186,
+ 193, 201, 207, 214, 221, 229, 235, 242, 249, 256, 262, 267, 273, 280,
+ 280, 280, 280, 172, 161, 152, 144, 136, 140, 143, 147, 151, 156, 161,
+ 167, 173, 179, 186, 193, 201, 207, 214, 221, 229, 235, 242, 249, 256,
+ 262, 267, 273, 280, 280, 280, 280, 172, 161, 152, 144, 136, 140, 143,
+ 147, 151, 156, 161, 167, 173, 179, 186, 193, 201, 207, 214, 221, 229,
+ 235, 242, 249, 256, 262, 267, 273, 280, 280, 280, 280 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 23, 40, 44, 59, 40, 50, 54, 64, 44, 54, 74, 89, 59, 64, 89, 114,
+ /* Size 8 */
+ 25, 20, 39, 42, 46, 52, 61, 72, 20, 30, 40, 36, 37, 41, 48, 56, 39, 40,
+ 49, 47, 46, 49, 55, 62, 42, 36, 47, 54, 58, 61, 66, 73, 46, 37, 46, 58,
+ 67, 73, 79, 85, 52, 41, 49, 61, 73, 83, 91, 99, 61, 48, 55, 66, 79, 91,
+ 103, 113, 72, 56, 62, 73, 85, 99, 113, 125,
+ /* Size 16 */
+ 24, 22, 19, 26, 38, 39, 41, 42, 44, 47, 51, 55, 59, 64, 69, 69, 22, 22,
+ 23, 29, 38, 38, 37, 38, 39, 42, 45, 48, 52, 56, 61, 61, 19, 23, 29, 33,
+ 39, 37, 34, 35, 36, 38, 40, 43, 46, 50, 54, 54, 26, 29, 33, 37, 42, 41,
+ 39, 39, 40, 42, 43, 46, 49, 53, 57, 57, 38, 38, 39, 42, 47, 46, 45, 45,
+ 45, 46, 48, 50, 53, 56, 60, 60, 39, 38, 37, 41, 46, 47, 48, 49, 50, 51,
+ 53, 55, 58, 61, 65, 65, 41, 37, 34, 39, 45, 48, 52, 54, 56, 57, 59, 61,
+ 64, 67, 70, 70, 42, 38, 35, 39, 45, 49, 54, 57, 60, 62, 64, 67, 69, 72,
+ 76, 76, 44, 39, 36, 40, 45, 50, 56, 60, 64, 67, 70, 73, 76, 79, 82, 82,
+ 47, 42, 38, 42, 46, 51, 57, 62, 67, 71, 75, 78, 82, 85, 89, 89, 51, 45,
+ 40, 43, 48, 53, 59, 64, 70, 75, 80, 84, 88, 92, 96, 96, 55, 48, 43, 46,
+ 50, 55, 61, 67, 73, 78, 84, 88, 93, 97, 102, 102, 59, 52, 46, 49, 53,
+ 58, 64, 69, 76, 82, 88, 93, 99, 104, 109, 109, 64, 56, 50, 53, 56, 61,
+ 67, 72, 79, 85, 92, 97, 104, 109, 114, 114, 69, 61, 54, 57, 60, 65, 70,
+ 76, 82, 89, 96, 102, 109, 114, 120, 120, 69, 61, 54, 57, 60, 65, 70, 76,
+ 82, 89, 96, 102, 109, 114, 120, 120,
+ /* Size 32 */
+ 24, 22, 21, 20, 19, 22, 25, 30, 37, 38, 39, 39, 40, 41, 42, 42, 43, 45,
+ 46, 48, 50, 51, 54, 56, 58, 60, 63, 65, 68, 68, 68, 68, 22, 22, 22, 21,
+ 21, 23, 27, 31, 37, 38, 38, 38, 38, 39, 40, 40, 41, 42, 44, 45, 47, 48,
+ 50, 52, 54, 56, 59, 61, 64, 64, 64, 64, 21, 22, 22, 22, 23, 25, 28, 32,
+ 38, 37, 37, 37, 37, 37, 38, 38, 39, 40, 41, 42, 44, 45, 47, 49, 51, 53,
+ 55, 57, 60, 60, 60, 60, 20, 21, 22, 24, 25, 28, 30, 34, 38, 37, 36, 36,
+ 35, 36, 36, 36, 37, 38, 39, 40, 41, 43, 44, 46, 48, 50, 52, 54, 56, 56,
+ 56, 56, 19, 21, 23, 25, 29, 30, 33, 35, 38, 37, 36, 35, 34, 34, 34, 35,
+ 35, 36, 37, 38, 39, 41, 42, 44, 45, 47, 49, 51, 53, 53, 53, 53, 22, 23,
+ 25, 28, 30, 32, 34, 37, 40, 39, 38, 37, 36, 36, 36, 37, 37, 38, 39, 40,
+ 41, 42, 44, 45, 47, 49, 50, 52, 54, 54, 54, 54, 25, 27, 28, 30, 33, 34,
+ 37, 39, 42, 41, 40, 39, 38, 38, 39, 39, 39, 40, 41, 42, 43, 44, 45, 47,
+ 48, 50, 52, 54, 56, 56, 56, 56, 30, 31, 32, 34, 35, 37, 39, 41, 44, 43,
+ 42, 42, 41, 41, 41, 41, 41, 42, 43, 44, 45, 46, 47, 49, 50, 52, 54, 55,
+ 57, 57, 57, 57, 37, 37, 38, 38, 38, 40, 42, 44, 46, 46, 45, 45, 44, 44,
+ 44, 44, 44, 45, 45, 46, 47, 48, 49, 51, 52, 54, 55, 57, 59, 59, 59, 59,
+ 38, 38, 37, 37, 37, 39, 41, 43, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47,
+ 48, 48, 49, 50, 52, 53, 54, 56, 58, 59, 61, 61, 61, 61, 39, 38, 37, 36,
+ 36, 38, 40, 42, 45, 46, 46, 47, 48, 48, 48, 48, 49, 49, 50, 51, 52, 53,
+ 54, 55, 57, 58, 60, 62, 64, 64, 64, 64, 39, 38, 37, 36, 35, 37, 39, 42,
+ 45, 46, 47, 48, 49, 50, 50, 51, 52, 52, 53, 54, 55, 56, 57, 58, 59, 61,
+ 63, 64, 66, 66, 66, 66, 40, 38, 37, 35, 34, 36, 38, 41, 44, 46, 48, 49,
+ 51, 52, 53, 54, 55, 55, 56, 57, 58, 59, 60, 61, 62, 64, 65, 67, 69, 69,
+ 69, 69, 41, 39, 37, 36, 34, 36, 38, 41, 44, 46, 48, 50, 52, 53, 54, 55,
+ 57, 57, 58, 59, 60, 61, 63, 64, 65, 67, 68, 70, 71, 71, 71, 71, 42, 40,
+ 38, 36, 34, 36, 39, 41, 44, 46, 48, 50, 53, 54, 56, 57, 59, 60, 61, 62,
+ 63, 64, 65, 67, 68, 69, 71, 73, 74, 74, 74, 74, 42, 40, 38, 36, 35, 37,
+ 39, 41, 44, 46, 48, 51, 54, 55, 57, 59, 61, 62, 63, 64, 66, 67, 68, 70,
+ 71, 73, 74, 76, 78, 78, 78, 78, 43, 41, 39, 37, 35, 37, 39, 41, 44, 46,
+ 49, 52, 55, 57, 59, 61, 63, 64, 66, 67, 69, 70, 72, 73, 75, 76, 78, 79,
+ 81, 81, 81, 81, 45, 42, 40, 38, 36, 38, 40, 42, 45, 47, 49, 52, 55, 57,
+ 60, 62, 64, 66, 68, 69, 71, 73, 74, 76, 77, 79, 80, 82, 84, 84, 84, 84,
+ 46, 44, 41, 39, 37, 39, 41, 43, 45, 48, 50, 53, 56, 58, 61, 63, 66, 68,
+ 70, 71, 74, 75, 77, 78, 80, 82, 83, 85, 87, 87, 87, 87, 48, 45, 42, 40,
+ 38, 40, 42, 44, 46, 48, 51, 54, 57, 59, 62, 64, 67, 69, 71, 74, 76, 78,
+ 79, 81, 83, 85, 87, 89, 90, 90, 90, 90, 50, 47, 44, 41, 39, 41, 43, 45,
+ 47, 49, 52, 55, 58, 60, 63, 66, 69, 71, 74, 76, 79, 80, 82, 84, 87, 88,
+ 90, 92, 94, 94, 94, 94, 51, 48, 45, 43, 41, 42, 44, 46, 48, 50, 53, 56,
+ 59, 61, 64, 67, 70, 73, 75, 78, 80, 82, 85, 87, 89, 91, 93, 95, 97, 97,
+ 97, 97, 54, 50, 47, 44, 42, 44, 45, 47, 49, 52, 54, 57, 60, 63, 65, 68,
+ 72, 74, 77, 79, 82, 85, 87, 89, 92, 94, 96, 98, 100, 100, 100, 100, 56,
+ 52, 49, 46, 44, 45, 47, 49, 51, 53, 55, 58, 61, 64, 67, 70, 73, 76, 78,
+ 81, 84, 87, 89, 92, 94, 96, 99, 101, 103, 103, 103, 103, 58, 54, 51, 48,
+ 45, 47, 48, 50, 52, 54, 57, 59, 62, 65, 68, 71, 75, 77, 80, 83, 87, 89,
+ 92, 94, 97, 99, 102, 104, 107, 107, 107, 107, 60, 56, 53, 50, 47, 49,
+ 50, 52, 54, 56, 58, 61, 64, 67, 69, 73, 76, 79, 82, 85, 88, 91, 94, 96,
+ 99, 102, 104, 107, 109, 109, 109, 109, 63, 59, 55, 52, 49, 50, 52, 54,
+ 55, 58, 60, 63, 65, 68, 71, 74, 78, 80, 83, 87, 90, 93, 96, 99, 102,
+ 104, 107, 109, 112, 112, 112, 112, 65, 61, 57, 54, 51, 52, 54, 55, 57,
+ 59, 62, 64, 67, 70, 73, 76, 79, 82, 85, 89, 92, 95, 98, 101, 104, 107,
+ 109, 112, 115, 115, 115, 115, 68, 64, 60, 56, 53, 54, 56, 57, 59, 61,
+ 64, 66, 69, 71, 74, 78, 81, 84, 87, 90, 94, 97, 100, 103, 107, 109, 112,
+ 115, 118, 118, 118, 118, 68, 64, 60, 56, 53, 54, 56, 57, 59, 61, 64, 66,
+ 69, 71, 74, 78, 81, 84, 87, 90, 94, 97, 100, 103, 107, 109, 112, 115,
+ 118, 118, 118, 118, 68, 64, 60, 56, 53, 54, 56, 57, 59, 61, 64, 66, 69,
+ 71, 74, 78, 81, 84, 87, 90, 94, 97, 100, 103, 107, 109, 112, 115, 118,
+ 118, 118, 118, 68, 64, 60, 56, 53, 54, 56, 57, 59, 61, 64, 66, 69, 71,
+ 74, 78, 81, 84, 87, 90, 94, 97, 100, 103, 107, 109, 112, 115, 118, 118,
+ 118, 118 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 70, 120, 197, 70, 109, 156, 218, 120, 156, 229, 287, 197, 218, 287,
+ 344,
+ /* Size 8 */
+ 64, 47, 51, 69, 94, 126, 161, 197, 47, 55, 52, 62, 80, 105, 135, 167,
+ 51, 52, 75, 88, 105, 127, 154, 183, 69, 62, 88, 115, 136, 157, 181, 207,
+ 94, 80, 105, 136, 165, 189, 212, 234, 126, 105, 127, 157, 189, 216, 240,
+ 261, 161, 135, 154, 181, 212, 240, 264, 284, 197, 167, 183, 207, 234,
+ 261, 284, 303,
+ /* Size 16 */
+ 64, 54, 47, 49, 51, 59, 69, 80, 94, 108, 126, 141, 161, 177, 197, 197,
+ 54, 53, 51, 51, 52, 58, 65, 74, 87, 99, 115, 129, 147, 162, 181, 181,
+ 47, 51, 55, 53, 52, 57, 62, 70, 80, 91, 105, 118, 135, 149, 167, 167,
+ 49, 51, 53, 57, 62, 67, 73, 81, 91, 102, 115, 128, 144, 158, 175, 175,
+ 51, 52, 52, 62, 75, 81, 88, 96, 105, 115, 127, 139, 154, 167, 183, 183,
+ 59, 58, 57, 67, 81, 89, 100, 108, 118, 128, 140, 152, 166, 179, 195,
+ 195, 69, 65, 62, 73, 88, 100, 115, 124, 136, 146, 157, 168, 181, 193,
+ 207, 207, 80, 74, 70, 81, 96, 108, 124, 136, 149, 159, 172, 183, 195,
+ 207, 220, 220, 94, 87, 80, 91, 105, 118, 136, 149, 165, 176, 189, 200,
+ 212, 222, 234, 234, 108, 99, 91, 102, 115, 128, 146, 159, 176, 188, 202,
+ 213, 225, 235, 247, 247, 126, 115, 105, 115, 127, 140, 157, 172, 189,
+ 202, 216, 228, 240, 250, 261, 261, 141, 129, 118, 128, 139, 152, 168,
+ 183, 200, 213, 228, 239, 251, 261, 272, 272, 161, 147, 135, 144, 154,
+ 166, 181, 195, 212, 225, 240, 251, 264, 273, 284, 284, 177, 162, 149,
+ 158, 167, 179, 193, 207, 222, 235, 250, 261, 273, 283, 293, 293, 197,
+ 181, 167, 175, 183, 195, 207, 220, 234, 247, 261, 272, 284, 293, 303,
+ 303, 197, 181, 167, 175, 183, 195, 207, 220, 234, 247, 261, 272, 284,
+ 293, 303, 303,
+ /* Size 32 */
+ 64, 59, 54, 51, 47, 48, 49, 50, 51, 55, 59, 64, 69, 74, 80, 86, 94, 101,
+ 108, 116, 126, 133, 141, 150, 161, 169, 177, 186, 197, 197, 197, 197,
+ 59, 56, 54, 51, 49, 50, 50, 51, 52, 55, 58, 62, 67, 72, 77, 83, 90, 96,
+ 103, 111, 120, 127, 135, 143, 153, 161, 169, 178, 189, 189, 189, 189,
+ 54, 54, 53, 52, 51, 51, 51, 52, 52, 55, 58, 61, 65, 69, 74, 80, 87, 92,
+ 99, 106, 115, 121, 129, 137, 147, 154, 162, 171, 181, 181, 181, 181, 51,
+ 51, 52, 52, 53, 53, 52, 52, 52, 55, 57, 60, 63, 67, 72, 77, 83, 89, 95,
+ 102, 110, 116, 123, 131, 141, 148, 155, 164, 174, 174, 174, 174, 47, 49,
+ 51, 53, 55, 54, 53, 53, 52, 54, 57, 59, 62, 66, 70, 75, 80, 85, 91, 98,
+ 105, 111, 118, 126, 135, 142, 149, 158, 167, 167, 167, 167, 48, 50, 51,
+ 53, 54, 55, 55, 56, 57, 59, 61, 64, 67, 71, 75, 80, 85, 90, 96, 102,
+ 110, 116, 123, 130, 139, 146, 153, 162, 171, 171, 171, 171, 49, 50, 51,
+ 52, 53, 55, 57, 59, 62, 64, 67, 70, 73, 77, 81, 86, 91, 96, 102, 108,
+ 115, 121, 128, 135, 144, 150, 158, 166, 175, 175, 175, 175, 50, 51, 52,
+ 52, 53, 56, 59, 63, 68, 70, 73, 76, 80, 84, 88, 92, 97, 102, 108, 114,
+ 121, 127, 133, 140, 148, 155, 162, 170, 179, 179, 179, 179, 51, 52, 52,
+ 52, 52, 57, 62, 68, 75, 78, 81, 84, 88, 92, 96, 100, 105, 109, 115, 120,
+ 127, 133, 139, 146, 154, 160, 167, 175, 183, 183, 183, 183, 55, 55, 55,
+ 55, 54, 59, 64, 70, 78, 81, 85, 89, 94, 98, 102, 106, 111, 116, 121,
+ 127, 133, 139, 145, 152, 160, 166, 173, 181, 189, 189, 189, 189, 59, 58,
+ 58, 57, 57, 61, 67, 73, 81, 85, 89, 94, 100, 104, 108, 113, 118, 123,
+ 128, 134, 140, 146, 152, 159, 166, 173, 179, 187, 195, 195, 195, 195,
+ 64, 62, 61, 60, 59, 64, 70, 76, 84, 89, 94, 100, 107, 111, 116, 121,
+ 126, 131, 137, 142, 148, 154, 160, 166, 173, 180, 186, 193, 201, 201,
+ 201, 201, 69, 67, 65, 63, 62, 67, 73, 80, 88, 94, 100, 107, 115, 119,
+ 124, 130, 136, 141, 146, 151, 157, 163, 168, 175, 181, 187, 193, 200,
+ 207, 207, 207, 207, 74, 72, 69, 67, 66, 71, 77, 84, 92, 98, 104, 111,
+ 119, 124, 130, 136, 142, 147, 152, 158, 164, 170, 175, 181, 188, 194,
+ 200, 206, 213, 213, 213, 213, 80, 77, 74, 72, 70, 75, 81, 88, 96, 102,
+ 108, 116, 124, 130, 136, 142, 149, 154, 159, 165, 172, 177, 183, 189,
+ 195, 201, 207, 213, 220, 220, 220, 220, 86, 83, 80, 77, 75, 80, 86, 92,
+ 100, 106, 113, 121, 130, 136, 142, 149, 156, 162, 167, 173, 180, 185,
+ 191, 197, 203, 209, 214, 220, 227, 227, 227, 227, 94, 90, 87, 83, 80,
+ 85, 91, 97, 105, 111, 118, 126, 136, 142, 149, 156, 165, 170, 176, 182,
+ 189, 194, 200, 205, 212, 217, 222, 228, 234, 234, 234, 234, 101, 96, 92,
+ 89, 85, 90, 96, 102, 109, 116, 123, 131, 141, 147, 154, 162, 170, 176,
+ 182, 188, 195, 200, 206, 212, 218, 223, 229, 234, 240, 240, 240, 240,
+ 108, 103, 99, 95, 91, 96, 102, 108, 115, 121, 128, 137, 146, 152, 159,
+ 167, 176, 182, 188, 195, 202, 207, 213, 219, 225, 230, 235, 241, 247,
+ 247, 247, 247, 116, 111, 106, 102, 98, 102, 108, 114, 120, 127, 134,
+ 142, 151, 158, 165, 173, 182, 188, 195, 201, 209, 214, 220, 226, 232,
+ 237, 242, 248, 254, 254, 254, 254, 126, 120, 115, 110, 105, 110, 115,
+ 121, 127, 133, 140, 148, 157, 164, 172, 180, 189, 195, 202, 209, 216,
+ 222, 228, 234, 240, 245, 250, 255, 261, 261, 261, 261, 133, 127, 121,
+ 116, 111, 116, 121, 127, 133, 139, 146, 154, 163, 170, 177, 185, 194,
+ 200, 207, 214, 222, 227, 233, 239, 245, 250, 255, 261, 266, 266, 266,
+ 266, 141, 135, 129, 123, 118, 123, 128, 133, 139, 145, 152, 160, 168,
+ 175, 183, 191, 200, 206, 213, 220, 228, 233, 239, 245, 251, 256, 261,
+ 266, 272, 272, 272, 272, 150, 143, 137, 131, 126, 130, 135, 140, 146,
+ 152, 159, 166, 175, 181, 189, 197, 205, 212, 219, 226, 234, 239, 245,
+ 251, 257, 262, 267, 272, 278, 278, 278, 278, 161, 153, 147, 141, 135,
+ 139, 144, 148, 154, 160, 166, 173, 181, 188, 195, 203, 212, 218, 225,
+ 232, 240, 245, 251, 257, 264, 268, 273, 278, 284, 284, 284, 284, 169,
+ 161, 154, 148, 142, 146, 150, 155, 160, 166, 173, 180, 187, 194, 201,
+ 209, 217, 223, 230, 237, 245, 250, 256, 262, 268, 273, 278, 283, 288,
+ 288, 288, 288, 177, 169, 162, 155, 149, 153, 158, 162, 167, 173, 179,
+ 186, 193, 200, 207, 214, 222, 229, 235, 242, 250, 255, 261, 267, 273,
+ 278, 283, 288, 293, 293, 293, 293, 186, 178, 171, 164, 158, 162, 166,
+ 170, 175, 181, 187, 193, 200, 206, 213, 220, 228, 234, 241, 248, 255,
+ 261, 266, 272, 278, 283, 288, 293, 298, 298, 298, 298, 197, 189, 181,
+ 174, 167, 171, 175, 179, 183, 189, 195, 201, 207, 213, 220, 227, 234,
+ 240, 247, 254, 261, 266, 272, 278, 284, 288, 293, 298, 303, 303, 303,
+ 303, 197, 189, 181, 174, 167, 171, 175, 179, 183, 189, 195, 201, 207,
+ 213, 220, 227, 234, 240, 247, 254, 261, 266, 272, 278, 284, 288, 293,
+ 298, 303, 303, 303, 303, 197, 189, 181, 174, 167, 171, 175, 179, 183,
+ 189, 195, 201, 207, 213, 220, 227, 234, 240, 247, 254, 261, 266, 272,
+ 278, 284, 288, 293, 298, 303, 303, 303, 303, 197, 189, 181, 174, 167,
+ 171, 175, 179, 183, 189, 195, 201, 207, 213, 220, 227, 234, 240, 247,
+ 254, 261, 266, 272, 278, 284, 288, 293, 298, 303, 303, 303, 303 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 19, 21, 37, 63, 21, 33, 49, 70, 37, 49, 74, 96, 63, 70, 96, 119,
+ /* Size 8 */
+ 23, 17, 18, 25, 34, 47, 61, 77, 17, 19, 18, 22, 29, 38, 50, 64, 18, 18,
+ 27, 32, 38, 47, 58, 71, 25, 22, 32, 42, 51, 60, 70, 81, 34, 29, 38, 51,
+ 63, 73, 83, 94, 47, 38, 47, 60, 73, 85, 96, 106, 61, 50, 58, 70, 83, 96,
+ 108, 118, 77, 64, 71, 81, 94, 106, 118, 127,
+ /* Size 16 */
+ 22, 18, 16, 17, 17, 20, 23, 27, 33, 38, 45, 51, 58, 65, 73, 73, 18, 18,
+ 17, 17, 18, 20, 22, 25, 30, 34, 40, 46, 53, 59, 67, 67, 16, 17, 19, 18,
+ 18, 19, 21, 24, 28, 32, 37, 42, 48, 54, 61, 61, 17, 17, 18, 19, 21, 23,
+ 25, 28, 31, 35, 40, 45, 52, 57, 64, 64, 17, 18, 18, 21, 26, 28, 31, 33,
+ 37, 40, 45, 50, 55, 61, 68, 68, 20, 20, 19, 23, 28, 31, 35, 38, 42, 46,
+ 50, 55, 61, 66, 72, 72, 23, 22, 21, 25, 31, 35, 40, 44, 48, 52, 57, 61,
+ 67, 72, 78, 78, 27, 25, 24, 28, 33, 38, 44, 48, 54, 58, 63, 67, 73, 78,
+ 83, 83, 33, 30, 28, 31, 37, 42, 48, 54, 60, 64, 70, 74, 80, 84, 90, 90,
+ 38, 34, 32, 35, 40, 46, 52, 58, 64, 69, 75, 80, 85, 90, 95, 95, 45, 40,
+ 37, 40, 45, 50, 57, 63, 70, 75, 82, 87, 92, 97, 102, 102, 51, 46, 42,
+ 45, 50, 55, 61, 67, 74, 80, 87, 92, 97, 102, 107, 107, 58, 53, 48, 52,
+ 55, 61, 67, 73, 80, 85, 92, 97, 103, 107, 112, 112, 65, 59, 54, 57, 61,
+ 66, 72, 78, 84, 90, 97, 102, 107, 112, 117, 117, 73, 67, 61, 64, 68, 72,
+ 78, 83, 90, 95, 102, 107, 112, 117, 122, 122, 73, 67, 61, 64, 68, 72,
+ 78, 83, 90, 95, 102, 107, 112, 117, 122, 122,
+ /* Size 32 */
+ 21, 19, 18, 17, 16, 16, 16, 17, 17, 18, 20, 21, 23, 25, 27, 29, 32, 34,
+ 37, 40, 44, 46, 49, 53, 57, 60, 63, 67, 72, 72, 72, 72, 19, 19, 18, 17,
+ 16, 16, 17, 17, 17, 18, 19, 21, 22, 24, 26, 28, 31, 33, 35, 38, 41, 44,
+ 47, 50, 54, 57, 60, 64, 68, 68, 68, 68, 18, 18, 17, 17, 17, 17, 17, 17,
+ 17, 18, 19, 20, 22, 23, 25, 27, 29, 31, 34, 36, 39, 42, 45, 48, 52, 54,
+ 57, 61, 65, 65, 65, 65, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 19, 20,
+ 21, 22, 24, 26, 28, 30, 32, 35, 38, 40, 43, 46, 49, 52, 55, 58, 62, 62,
+ 62, 62, 16, 16, 17, 17, 18, 18, 18, 17, 17, 18, 19, 20, 21, 22, 23, 25,
+ 27, 29, 31, 33, 36, 38, 41, 44, 47, 50, 53, 56, 60, 60, 60, 60, 16, 16,
+ 17, 17, 18, 18, 18, 18, 19, 19, 20, 21, 22, 24, 25, 27, 29, 31, 33, 35,
+ 38, 40, 42, 45, 49, 51, 54, 57, 61, 61, 61, 61, 16, 17, 17, 17, 18, 18,
+ 19, 20, 20, 21, 22, 23, 24, 26, 27, 29, 31, 33, 35, 37, 40, 42, 44, 47,
+ 50, 53, 56, 59, 63, 63, 63, 63, 17, 17, 17, 17, 17, 18, 20, 21, 22, 23,
+ 24, 26, 27, 28, 30, 31, 33, 35, 37, 39, 42, 44, 46, 49, 52, 55, 58, 61,
+ 64, 64, 64, 64, 17, 17, 17, 17, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31,
+ 32, 34, 36, 37, 39, 42, 44, 46, 49, 51, 54, 57, 60, 63, 66, 66, 66, 66,
+ 18, 18, 18, 18, 18, 19, 21, 23, 26, 27, 29, 30, 32, 33, 35, 36, 38, 40,
+ 42, 44, 46, 49, 51, 54, 57, 59, 62, 65, 68, 68, 68, 68, 20, 19, 19, 19,
+ 19, 20, 22, 24, 27, 29, 30, 32, 34, 35, 37, 39, 41, 43, 45, 47, 49, 51,
+ 54, 56, 59, 62, 64, 67, 71, 71, 71, 71, 21, 21, 20, 20, 20, 21, 23, 26,
+ 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 57, 59, 62, 64,
+ 67, 70, 73, 73, 73, 73, 23, 22, 22, 21, 21, 22, 24, 27, 30, 32, 34, 36,
+ 39, 41, 43, 45, 47, 49, 51, 53, 56, 58, 60, 62, 65, 68, 70, 73, 76, 76,
+ 76, 76, 25, 24, 23, 22, 22, 24, 26, 28, 31, 33, 35, 38, 41, 43, 45, 47,
+ 50, 52, 54, 56, 58, 60, 63, 65, 68, 70, 73, 76, 78, 78, 78, 78, 27, 26,
+ 25, 24, 23, 25, 27, 30, 32, 35, 37, 40, 43, 45, 47, 50, 52, 54, 56, 59,
+ 61, 63, 66, 68, 71, 73, 76, 78, 81, 81, 81, 81, 29, 28, 27, 26, 25, 27,
+ 29, 31, 34, 36, 39, 42, 45, 47, 50, 52, 55, 57, 60, 62, 65, 67, 69, 72,
+ 74, 76, 79, 82, 84, 84, 84, 84, 32, 31, 29, 28, 27, 29, 31, 33, 36, 38,
+ 41, 44, 47, 50, 52, 55, 58, 61, 63, 66, 68, 70, 73, 75, 78, 80, 82, 85,
+ 88, 88, 88, 88, 34, 33, 31, 30, 29, 31, 33, 35, 37, 40, 43, 46, 49, 52,
+ 54, 57, 61, 63, 65, 68, 71, 73, 75, 78, 80, 83, 85, 88, 90, 90, 90, 90,
+ 37, 35, 34, 32, 31, 33, 35, 37, 39, 42, 45, 48, 51, 54, 56, 60, 63, 65,
+ 68, 71, 74, 76, 78, 81, 83, 86, 88, 90, 93, 93, 93, 93, 40, 38, 36, 35,
+ 33, 35, 37, 39, 42, 44, 47, 50, 53, 56, 59, 62, 66, 68, 71, 73, 77, 79,
+ 81, 84, 87, 89, 91, 93, 96, 96, 96, 96, 44, 41, 39, 38, 36, 38, 40, 42,
+ 44, 46, 49, 52, 56, 58, 61, 65, 68, 71, 74, 77, 80, 82, 85, 87, 90, 92,
+ 94, 97, 99, 99, 99, 99, 46, 44, 42, 40, 38, 40, 42, 44, 46, 49, 51, 54,
+ 58, 60, 63, 67, 70, 73, 76, 79, 82, 84, 87, 90, 92, 95, 97, 99, 102,
+ 102, 102, 102, 49, 47, 45, 43, 41, 42, 44, 46, 49, 51, 54, 57, 60, 63,
+ 66, 69, 73, 75, 78, 81, 85, 87, 89, 92, 95, 97, 99, 102, 104, 104, 104,
+ 104, 53, 50, 48, 46, 44, 45, 47, 49, 51, 54, 56, 59, 62, 65, 68, 72, 75,
+ 78, 81, 84, 87, 90, 92, 95, 98, 100, 102, 105, 107, 107, 107, 107, 57,
+ 54, 52, 49, 47, 49, 50, 52, 54, 57, 59, 62, 65, 68, 71, 74, 78, 80, 83,
+ 87, 90, 92, 95, 98, 101, 103, 105, 107, 110, 110, 110, 110, 60, 57, 54,
+ 52, 50, 51, 53, 55, 57, 59, 62, 64, 68, 70, 73, 76, 80, 83, 86, 89, 92,
+ 95, 97, 100, 103, 105, 107, 110, 112, 112, 112, 112, 63, 60, 57, 55, 53,
+ 54, 56, 58, 60, 62, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 99,
+ 102, 105, 107, 109, 112, 114, 114, 114, 114, 67, 64, 61, 58, 56, 57, 59,
+ 61, 63, 65, 67, 70, 73, 76, 78, 82, 85, 88, 90, 93, 97, 99, 102, 105,
+ 107, 110, 112, 114, 117, 117, 117, 117, 72, 68, 65, 62, 60, 61, 63, 64,
+ 66, 68, 71, 73, 76, 78, 81, 84, 88, 90, 93, 96, 99, 102, 104, 107, 110,
+ 112, 114, 117, 119, 119, 119, 119, 72, 68, 65, 62, 60, 61, 63, 64, 66,
+ 68, 71, 73, 76, 78, 81, 84, 88, 90, 93, 96, 99, 102, 104, 107, 110, 112,
+ 114, 117, 119, 119, 119, 119, 72, 68, 65, 62, 60, 61, 63, 64, 66, 68,
+ 71, 73, 76, 78, 81, 84, 88, 90, 93, 96, 99, 102, 104, 107, 110, 112,
+ 114, 117, 119, 119, 119, 119, 72, 68, 65, 62, 60, 61, 63, 64, 66, 68,
+ 71, 73, 76, 78, 81, 84, 88, 90, 93, 96, 99, 102, 104, 107, 110, 112,
+ 114, 117, 119, 119, 119, 119 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 104, 114, 146, 104, 126, 136, 158, 114, 136, 178, 207, 146, 158,
+ 207, 250,
+ /* Size 8 */
+ 64, 51, 96, 102, 109, 123, 141, 161, 51, 75, 98, 88, 91, 100, 114, 131,
+ 96, 98, 116, 111, 111, 117, 128, 143, 102, 88, 111, 127, 134, 140, 150,
+ 163, 109, 91, 111, 134, 151, 163, 174, 186, 123, 100, 117, 140, 163,
+ 181, 196, 209, 141, 114, 128, 150, 174, 196, 214, 230, 161, 131, 143,
+ 163, 186, 209, 230, 248,
+ /* Size 16 */
+ 64, 57, 51, 67, 96, 99, 102, 106, 109, 116, 123, 132, 141, 151, 161,
+ 161, 57, 59, 61, 75, 97, 96, 94, 97, 99, 105, 111, 118, 126, 135, 144,
+ 144, 51, 61, 75, 85, 98, 92, 88, 89, 91, 95, 100, 107, 114, 122, 131,
+ 131, 67, 75, 85, 94, 106, 102, 98, 99, 100, 104, 108, 114, 121, 128,
+ 137, 137, 96, 97, 98, 106, 116, 113, 111, 111, 111, 114, 117, 123, 128,
+ 135, 143, 143, 99, 96, 92, 102, 113, 116, 119, 120, 121, 125, 128, 133,
+ 138, 145, 152, 152, 102, 94, 88, 98, 111, 119, 127, 131, 134, 137, 140,
+ 145, 150, 156, 163, 163, 106, 97, 89, 99, 111, 120, 131, 136, 142, 146,
+ 151, 156, 161, 167, 173, 173, 109, 99, 91, 100, 111, 121, 134, 142, 151,
+ 157, 163, 168, 174, 179, 186, 186, 116, 105, 95, 104, 114, 125, 137,
+ 146, 157, 164, 172, 178, 184, 190, 196, 196, 123, 111, 100, 108, 117,
+ 128, 140, 151, 163, 172, 181, 188, 196, 202, 209, 209, 132, 118, 107,
+ 114, 123, 133, 145, 156, 168, 178, 188, 196, 205, 211, 219, 219, 141,
+ 126, 114, 121, 128, 138, 150, 161, 174, 184, 196, 205, 214, 222, 230,
+ 230, 151, 135, 122, 128, 135, 145, 156, 167, 179, 190, 202, 211, 222,
+ 230, 238, 238, 161, 144, 131, 137, 143, 152, 163, 173, 186, 196, 209,
+ 219, 230, 238, 248, 248, 161, 144, 131, 137, 143, 152, 163, 173, 186,
+ 196, 209, 219, 230, 238, 248, 248,
+ /* Size 32 */
+ 64, 60, 57, 54, 51, 58, 67, 79, 96, 97, 99, 100, 102, 104, 106, 107,
+ 109, 113, 116, 120, 123, 127, 132, 136, 141, 146, 151, 156, 161, 161,
+ 161, 161, 60, 59, 58, 57, 56, 62, 70, 81, 96, 97, 97, 98, 98, 99, 101,
+ 102, 104, 107, 110, 113, 117, 120, 124, 129, 133, 137, 142, 147, 152,
+ 152, 152, 152, 57, 58, 59, 60, 61, 67, 75, 84, 97, 96, 96, 95, 94, 96,
+ 97, 98, 99, 102, 105, 107, 111, 114, 118, 122, 126, 130, 135, 139, 144,
+ 144, 144, 144, 54, 57, 60, 63, 67, 73, 79, 87, 97, 96, 94, 92, 91, 92,
+ 93, 94, 95, 97, 100, 102, 105, 108, 112, 116, 120, 124, 128, 132, 137,
+ 137, 137, 137, 51, 56, 61, 67, 75, 80, 85, 91, 98, 95, 92, 90, 88, 89,
+ 89, 90, 91, 93, 95, 98, 100, 103, 107, 110, 114, 118, 122, 126, 131,
+ 131, 131, 131, 58, 62, 67, 73, 80, 84, 89, 95, 102, 99, 97, 95, 93, 93,
+ 94, 94, 95, 97, 99, 102, 104, 107, 110, 114, 117, 121, 125, 129, 134,
+ 134, 134, 134, 67, 70, 75, 79, 85, 89, 94, 100, 106, 104, 102, 100, 98,
+ 99, 99, 99, 100, 102, 104, 106, 108, 111, 114, 117, 121, 124, 128, 132,
+ 137, 137, 137, 137, 79, 81, 84, 87, 91, 95, 100, 105, 110, 109, 107,
+ 106, 104, 105, 105, 105, 105, 107, 109, 111, 112, 115, 118, 121, 125,
+ 128, 132, 136, 140, 140, 140, 140, 96, 96, 97, 97, 98, 102, 106, 110,
+ 116, 114, 113, 112, 111, 111, 111, 111, 111, 112, 114, 116, 117, 120,
+ 123, 125, 128, 132, 135, 139, 143, 143, 143, 143, 97, 97, 96, 96, 95,
+ 99, 104, 109, 114, 115, 115, 115, 115, 115, 115, 116, 116, 117, 119,
+ 121, 122, 125, 128, 130, 133, 137, 140, 144, 148, 148, 148, 148, 99, 97,
+ 96, 94, 92, 97, 102, 107, 113, 115, 116, 117, 119, 119, 120, 121, 121,
+ 123, 125, 126, 128, 130, 133, 136, 138, 142, 145, 149, 152, 152, 152,
+ 152, 100, 98, 95, 92, 90, 95, 100, 106, 112, 115, 117, 120, 123, 124,
+ 125, 126, 127, 129, 131, 132, 134, 136, 139, 141, 144, 147, 150, 154,
+ 157, 157, 157, 157, 102, 98, 94, 91, 88, 93, 98, 104, 111, 115, 119,
+ 123, 127, 129, 131, 132, 134, 136, 137, 139, 140, 143, 145, 147, 150,
+ 153, 156, 159, 163, 163, 163, 163, 104, 99, 96, 92, 89, 93, 99, 105,
+ 111, 115, 119, 124, 129, 131, 133, 136, 138, 140, 142, 144, 146, 148,
+ 150, 153, 155, 158, 161, 164, 168, 168, 168, 168, 106, 101, 97, 93, 89,
+ 94, 99, 105, 111, 115, 120, 125, 131, 133, 136, 139, 142, 144, 146, 149,
+ 151, 153, 156, 158, 161, 164, 167, 170, 173, 173, 173, 173, 107, 102,
+ 98, 94, 90, 94, 99, 105, 111, 116, 121, 126, 132, 136, 139, 143, 147,
+ 149, 151, 154, 157, 159, 162, 164, 167, 170, 173, 176, 179, 179, 179,
+ 179, 109, 104, 99, 95, 91, 95, 100, 105, 111, 116, 121, 127, 134, 138,
+ 142, 147, 151, 154, 157, 160, 163, 166, 168, 171, 174, 177, 179, 182,
+ 186, 186, 186, 186, 113, 107, 102, 97, 93, 97, 102, 107, 112, 117, 123,
+ 129, 136, 140, 144, 149, 154, 157, 160, 164, 167, 170, 173, 176, 179,
+ 182, 185, 188, 191, 191, 191, 191, 116, 110, 105, 100, 95, 99, 104, 109,
+ 114, 119, 125, 131, 137, 142, 146, 151, 157, 160, 164, 168, 172, 175,
+ 178, 181, 184, 187, 190, 193, 196, 196, 196, 196, 120, 113, 107, 102,
+ 98, 102, 106, 111, 116, 121, 126, 132, 139, 144, 149, 154, 160, 164,
+ 168, 172, 176, 180, 183, 186, 190, 193, 196, 199, 202, 202, 202, 202,
+ 123, 117, 111, 105, 100, 104, 108, 112, 117, 122, 128, 134, 140, 146,
+ 151, 157, 163, 167, 172, 176, 181, 185, 188, 192, 196, 199, 202, 205,
+ 209, 209, 209, 209, 127, 120, 114, 108, 103, 107, 111, 115, 120, 125,
+ 130, 136, 143, 148, 153, 159, 166, 170, 175, 180, 185, 188, 192, 196,
+ 200, 203, 207, 210, 214, 214, 214, 214, 132, 124, 118, 112, 107, 110,
+ 114, 118, 123, 128, 133, 139, 145, 150, 156, 162, 168, 173, 178, 183,
+ 188, 192, 196, 200, 205, 208, 211, 215, 219, 219, 219, 219, 136, 129,
+ 122, 116, 110, 114, 117, 121, 125, 130, 136, 141, 147, 153, 158, 164,
+ 171, 176, 181, 186, 192, 196, 200, 205, 209, 213, 216, 220, 224, 224,
+ 224, 224, 141, 133, 126, 120, 114, 117, 121, 125, 128, 133, 138, 144,
+ 150, 155, 161, 167, 174, 179, 184, 190, 196, 200, 205, 209, 214, 218,
+ 222, 226, 230, 230, 230, 230, 146, 137, 130, 124, 118, 121, 124, 128,
+ 132, 137, 142, 147, 153, 158, 164, 170, 177, 182, 187, 193, 199, 203,
+ 208, 213, 218, 222, 226, 230, 234, 234, 234, 234, 151, 142, 135, 128,
+ 122, 125, 128, 132, 135, 140, 145, 150, 156, 161, 167, 173, 179, 185,
+ 190, 196, 202, 207, 211, 216, 222, 226, 230, 234, 238, 238, 238, 238,
+ 156, 147, 139, 132, 126, 129, 132, 136, 139, 144, 149, 154, 159, 164,
+ 170, 176, 182, 188, 193, 199, 205, 210, 215, 220, 226, 230, 234, 238,
+ 243, 243, 243, 243, 161, 152, 144, 137, 131, 134, 137, 140, 143, 148,
+ 152, 157, 163, 168, 173, 179, 186, 191, 196, 202, 209, 214, 219, 224,
+ 230, 234, 238, 243, 248, 248, 248, 248, 161, 152, 144, 137, 131, 134,
+ 137, 140, 143, 148, 152, 157, 163, 168, 173, 179, 186, 191, 196, 202,
+ 209, 214, 219, 224, 230, 234, 238, 243, 248, 248, 248, 248, 161, 152,
+ 144, 137, 131, 134, 137, 140, 143, 148, 152, 157, 163, 168, 173, 179,
+ 186, 191, 196, 202, 209, 214, 219, 224, 230, 234, 238, 243, 248, 248,
+ 248, 248, 161, 152, 144, 137, 131, 134, 137, 140, 143, 148, 152, 157,
+ 163, 168, 173, 179, 186, 191, 196, 202, 209, 214, 219, 224, 230, 234,
+ 238, 243, 248, 248, 248, 248 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 25, 42, 46, 60, 42, 51, 55, 65, 46, 55, 75, 88, 60, 65, 88, 109,
+ /* Size 8 */
+ 27, 22, 41, 44, 48, 54, 63, 72, 22, 32, 42, 38, 39, 43, 50, 58, 41, 42,
+ 50, 49, 48, 51, 57, 64, 44, 38, 49, 56, 59, 62, 67, 73, 48, 39, 48, 59,
+ 67, 73, 78, 84, 54, 43, 51, 62, 73, 82, 90, 96, 63, 50, 57, 67, 78, 90,
+ 99, 107, 72, 58, 64, 73, 84, 96, 107, 117,
+ /* Size 16 */
+ 26, 23, 21, 27, 40, 41, 43, 44, 46, 49, 52, 56, 61, 65, 70, 70, 23, 24,
+ 25, 31, 40, 40, 39, 40, 42, 44, 47, 50, 54, 58, 62, 62, 21, 25, 31, 35,
+ 41, 39, 37, 37, 38, 40, 42, 45, 48, 52, 56, 56, 27, 31, 35, 39, 45, 43,
+ 41, 41, 42, 44, 46, 48, 51, 55, 58, 58, 40, 40, 41, 45, 49, 48, 47, 47,
+ 47, 48, 50, 52, 55, 58, 62, 62, 41, 40, 39, 43, 48, 49, 50, 51, 52, 53,
+ 54, 57, 59, 62, 66, 66, 43, 39, 37, 41, 47, 50, 54, 56, 57, 59, 60, 62,
+ 65, 68, 71, 71, 44, 40, 37, 41, 47, 51, 56, 58, 61, 63, 65, 67, 70, 73,
+ 76, 76, 46, 42, 38, 42, 47, 52, 57, 61, 65, 68, 71, 73, 76, 79, 82, 82,
+ 49, 44, 40, 44, 48, 53, 59, 63, 68, 71, 75, 78, 81, 84, 87, 87, 52, 47,
+ 42, 46, 50, 54, 60, 65, 71, 75, 80, 83, 87, 90, 93, 93, 56, 50, 45, 48,
+ 52, 57, 62, 67, 73, 78, 83, 87, 91, 95, 98, 98, 61, 54, 48, 51, 55, 59,
+ 65, 70, 76, 81, 87, 91, 96, 100, 104, 104, 65, 58, 52, 55, 58, 62, 68,
+ 73, 79, 84, 90, 95, 100, 104, 108, 108, 70, 62, 56, 58, 62, 66, 71, 76,
+ 82, 87, 93, 98, 104, 108, 113, 113, 70, 62, 56, 58, 62, 66, 71, 76, 82,
+ 87, 93, 98, 104, 108, 113, 113,
+ /* Size 32 */
+ 26, 24, 23, 22, 21, 23, 27, 32, 39, 40, 41, 41, 42, 43, 44, 44, 45, 47,
+ 48, 50, 52, 53, 55, 57, 60, 62, 64, 66, 69, 69, 69, 69, 24, 24, 23, 23,
+ 22, 25, 29, 33, 40, 40, 40, 40, 40, 41, 42, 42, 43, 44, 46, 47, 49, 50,
+ 52, 54, 56, 58, 60, 62, 65, 65, 65, 65, 23, 23, 24, 24, 25, 27, 30, 34,
+ 40, 40, 39, 39, 39, 39, 40, 40, 41, 42, 43, 45, 46, 47, 49, 51, 53, 55,
+ 57, 59, 61, 61, 61, 61, 22, 23, 24, 26, 27, 30, 32, 36, 40, 39, 39, 38,
+ 37, 38, 38, 39, 39, 40, 41, 42, 43, 45, 46, 48, 50, 52, 54, 56, 58, 58,
+ 58, 58, 21, 22, 25, 27, 31, 33, 35, 37, 40, 39, 38, 37, 36, 36, 37, 37,
+ 37, 38, 39, 40, 41, 43, 44, 46, 47, 49, 51, 53, 55, 55, 55, 55, 23, 25,
+ 27, 30, 33, 34, 37, 39, 42, 41, 40, 39, 38, 38, 39, 39, 39, 40, 41, 42,
+ 43, 44, 46, 47, 49, 50, 52, 54, 56, 56, 56, 56, 27, 29, 30, 32, 35, 37,
+ 39, 41, 44, 43, 42, 41, 40, 41, 41, 41, 41, 42, 43, 44, 45, 46, 47, 49,
+ 50, 52, 54, 56, 58, 58, 58, 58, 32, 33, 34, 36, 37, 39, 41, 43, 46, 45,
+ 44, 44, 43, 43, 43, 43, 43, 44, 45, 46, 47, 48, 49, 51, 52, 54, 55, 57,
+ 59, 59, 59, 59, 39, 40, 40, 40, 40, 42, 44, 46, 48, 48, 47, 47, 46, 46,
+ 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 54, 55, 57, 59, 61, 61, 61, 61,
+ 40, 40, 40, 39, 39, 41, 43, 45, 48, 48, 48, 48, 48, 48, 48, 48, 48, 49,
+ 50, 50, 51, 52, 53, 55, 56, 57, 59, 61, 63, 63, 63, 63, 41, 40, 39, 39,
+ 38, 40, 42, 44, 47, 48, 48, 49, 50, 50, 50, 50, 51, 51, 52, 53, 54, 55,
+ 56, 57, 58, 60, 61, 63, 65, 65, 65, 65, 41, 40, 39, 38, 37, 39, 41, 44,
+ 47, 48, 49, 50, 51, 52, 52, 53, 53, 54, 55, 56, 56, 57, 58, 60, 61, 62,
+ 64, 65, 67, 67, 67, 67, 42, 40, 39, 37, 36, 38, 40, 43, 46, 48, 50, 51,
+ 53, 54, 55, 56, 56, 57, 58, 59, 59, 60, 61, 62, 64, 65, 66, 68, 69, 69,
+ 69, 69, 43, 41, 39, 38, 36, 38, 41, 43, 46, 48, 50, 52, 54, 55, 56, 57,
+ 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 72, 72, 72, 72, 44, 42,
+ 40, 38, 37, 39, 41, 43, 46, 48, 50, 52, 55, 56, 57, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 69, 70, 71, 73, 74, 74, 74, 74, 44, 42, 40, 39, 37, 39,
+ 41, 43, 46, 48, 50, 53, 56, 57, 59, 60, 62, 63, 64, 66, 67, 68, 69, 70,
+ 72, 73, 74, 76, 77, 77, 77, 77, 45, 43, 41, 39, 37, 39, 41, 43, 46, 48,
+ 51, 53, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 72, 73, 75, 76, 77, 79,
+ 80, 80, 80, 80, 47, 44, 42, 40, 38, 40, 42, 44, 47, 49, 51, 54, 57, 59,
+ 61, 63, 65, 67, 68, 70, 72, 73, 74, 76, 77, 78, 80, 81, 83, 83, 83, 83,
+ 48, 46, 43, 41, 39, 41, 43, 45, 47, 50, 52, 55, 58, 60, 62, 64, 67, 68,
+ 70, 72, 74, 75, 77, 78, 80, 81, 83, 84, 86, 86, 86, 86, 50, 47, 45, 42,
+ 40, 42, 44, 46, 48, 50, 53, 56, 59, 61, 63, 66, 68, 70, 72, 74, 76, 77,
+ 79, 81, 82, 84, 85, 87, 89, 89, 89, 89, 52, 49, 46, 43, 41, 43, 45, 47,
+ 49, 51, 54, 56, 59, 62, 64, 67, 70, 72, 74, 76, 78, 80, 82, 83, 85, 87,
+ 88, 90, 92, 92, 92, 92, 53, 50, 47, 45, 43, 44, 46, 48, 50, 52, 55, 57,
+ 60, 63, 65, 68, 71, 73, 75, 77, 80, 82, 83, 85, 87, 89, 91, 92, 94, 94,
+ 94, 94, 55, 52, 49, 46, 44, 46, 47, 49, 51, 53, 56, 58, 61, 64, 66, 69,
+ 72, 74, 77, 79, 82, 83, 85, 87, 90, 91, 93, 95, 97, 97, 97, 97, 57, 54,
+ 51, 48, 46, 47, 49, 51, 52, 55, 57, 60, 62, 65, 67, 70, 73, 76, 78, 81,
+ 83, 85, 87, 90, 92, 94, 95, 97, 99, 99, 99, 99, 60, 56, 53, 50, 47, 49,
+ 50, 52, 54, 56, 58, 61, 64, 66, 69, 72, 75, 77, 80, 82, 85, 87, 90, 92,
+ 94, 96, 98, 100, 102, 102, 102, 102, 62, 58, 55, 52, 49, 50, 52, 54, 55,
+ 57, 60, 62, 65, 67, 70, 73, 76, 78, 81, 84, 87, 89, 91, 94, 96, 98, 100,
+ 102, 104, 104, 104, 104, 64, 60, 57, 54, 51, 52, 54, 55, 57, 59, 61, 64,
+ 66, 69, 71, 74, 77, 80, 83, 85, 88, 91, 93, 95, 98, 100, 102, 104, 107,
+ 107, 107, 107, 66, 62, 59, 56, 53, 54, 56, 57, 59, 61, 63, 65, 68, 70,
+ 73, 76, 79, 81, 84, 87, 90, 92, 95, 97, 100, 102, 104, 107, 109, 109,
+ 109, 109, 69, 65, 61, 58, 55, 56, 58, 59, 61, 63, 65, 67, 69, 72, 74,
+ 77, 80, 83, 86, 89, 92, 94, 97, 99, 102, 104, 107, 109, 111, 111, 111,
+ 111, 69, 65, 61, 58, 55, 56, 58, 59, 61, 63, 65, 67, 69, 72, 74, 77, 80,
+ 83, 86, 89, 92, 94, 97, 99, 102, 104, 107, 109, 111, 111, 111, 111, 69,
+ 65, 61, 58, 55, 56, 58, 59, 61, 63, 65, 67, 69, 72, 74, 77, 80, 83, 86,
+ 89, 92, 94, 97, 99, 102, 104, 107, 109, 111, 111, 111, 111, 69, 65, 61,
+ 58, 55, 56, 58, 59, 61, 63, 65, 67, 69, 72, 74, 77, 80, 83, 86, 89, 92,
+ 94, 97, 99, 102, 104, 107, 109, 111, 111, 111, 111 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 70, 116, 182, 70, 106, 148, 199, 116, 148, 207, 251, 182, 199, 251,
+ 292,
+ /* Size 8 */
+ 64, 48, 52, 69, 92, 120, 150, 179, 48, 55, 53, 62, 79, 102, 128, 155,
+ 52, 53, 74, 87, 101, 121, 144, 168, 69, 62, 87, 110, 129, 147, 166, 186,
+ 92, 79, 101, 129, 153, 172, 190, 207, 120, 102, 121, 147, 172, 194, 211,
+ 226, 150, 128, 144, 166, 190, 211, 228, 242, 179, 155, 168, 186, 207,
+ 226, 242, 255,
+ /* Size 16 */
+ 64, 55, 48, 50, 52, 59, 69, 79, 92, 104, 120, 133, 150, 163, 179, 179,
+ 55, 53, 51, 52, 52, 58, 65, 74, 85, 96, 110, 123, 138, 151, 166, 166,
+ 48, 51, 55, 54, 53, 57, 62, 70, 79, 89, 102, 113, 128, 140, 155, 155,
+ 50, 52, 54, 58, 62, 67, 72, 80, 89, 99, 111, 122, 135, 147, 161, 161,
+ 52, 52, 53, 62, 74, 80, 87, 93, 101, 110, 121, 131, 144, 155, 168, 168,
+ 59, 58, 57, 67, 80, 88, 97, 105, 113, 122, 133, 143, 154, 165, 177, 177,
+ 69, 65, 62, 72, 87, 97, 110, 119, 129, 137, 147, 156, 166, 176, 186,
+ 186, 79, 74, 70, 80, 93, 105, 119, 128, 140, 149, 158, 167, 177, 186,
+ 196, 196, 92, 85, 79, 89, 101, 113, 129, 140, 153, 162, 172, 181, 190,
+ 198, 207, 207, 104, 96, 89, 99, 110, 122, 137, 149, 162, 171, 182, 191,
+ 200, 208, 216, 216, 120, 110, 102, 111, 121, 133, 147, 158, 172, 182,
+ 194, 202, 211, 218, 226, 226, 133, 123, 113, 122, 131, 143, 156, 167,
+ 181, 191, 202, 210, 219, 227, 234, 234, 150, 138, 128, 135, 144, 154,
+ 166, 177, 190, 200, 211, 219, 228, 235, 242, 242, 163, 151, 140, 147,
+ 155, 165, 176, 186, 198, 208, 218, 227, 235, 242, 249, 249, 179, 166,
+ 155, 161, 168, 177, 186, 196, 207, 216, 226, 234, 242, 249, 255, 255,
+ 179, 166, 155, 161, 168, 177, 186, 196, 207, 216, 226, 234, 242, 249,
+ 255, 255,
+ /* Size 32 */
+ 64, 59, 55, 51, 48, 49, 50, 51, 52, 55, 59, 64, 69, 73, 79, 85, 92, 98,
+ 104, 112, 120, 126, 133, 141, 150, 156, 163, 170, 179, 179, 179, 179,
+ 59, 56, 54, 52, 50, 50, 51, 52, 52, 55, 59, 62, 67, 71, 76, 82, 89, 94,
+ 100, 107, 115, 121, 128, 135, 144, 150, 156, 164, 172, 172, 172, 172,
+ 55, 54, 53, 52, 51, 52, 52, 52, 52, 55, 58, 61, 65, 69, 74, 79, 85, 90,
+ 96, 103, 110, 116, 123, 130, 138, 144, 151, 158, 166, 166, 166, 166, 51,
+ 52, 52, 53, 53, 53, 53, 53, 53, 55, 58, 60, 63, 67, 72, 77, 82, 87, 93,
+ 99, 106, 112, 118, 125, 133, 139, 145, 152, 160, 160, 160, 160, 48, 50,
+ 51, 53, 55, 55, 54, 53, 53, 55, 57, 59, 62, 66, 70, 74, 79, 84, 89, 95,
+ 102, 107, 113, 120, 128, 134, 140, 147, 155, 155, 155, 155, 49, 50, 52,
+ 53, 55, 55, 56, 56, 57, 59, 61, 64, 67, 70, 74, 79, 84, 89, 94, 100,
+ 106, 112, 117, 124, 131, 137, 144, 150, 158, 158, 158, 158, 50, 51, 52,
+ 53, 54, 56, 58, 60, 62, 64, 67, 69, 72, 76, 80, 84, 89, 94, 99, 104,
+ 111, 116, 122, 128, 135, 141, 147, 154, 161, 161, 161, 161, 51, 52, 52,
+ 53, 53, 56, 60, 63, 67, 70, 73, 76, 79, 82, 86, 90, 95, 99, 104, 110,
+ 116, 121, 126, 133, 139, 145, 151, 157, 164, 164, 164, 164, 52, 52, 52,
+ 53, 53, 57, 62, 67, 74, 77, 80, 83, 87, 90, 93, 97, 101, 106, 110, 115,
+ 121, 126, 131, 137, 144, 149, 155, 161, 168, 168, 168, 168, 55, 55, 55,
+ 55, 55, 59, 64, 70, 77, 80, 84, 87, 92, 95, 99, 103, 107, 111, 116, 121,
+ 126, 131, 137, 142, 149, 154, 160, 166, 172, 172, 172, 172, 59, 59, 58,
+ 58, 57, 61, 67, 73, 80, 84, 88, 92, 97, 101, 105, 109, 113, 118, 122,
+ 127, 133, 137, 143, 148, 154, 159, 165, 170, 177, 177, 177, 177, 64, 62,
+ 61, 60, 59, 64, 69, 76, 83, 87, 92, 97, 103, 107, 111, 116, 121, 125,
+ 129, 134, 139, 144, 149, 154, 160, 165, 170, 176, 181, 181, 181, 181,
+ 69, 67, 65, 63, 62, 67, 72, 79, 87, 92, 97, 103, 110, 114, 119, 124,
+ 129, 133, 137, 142, 147, 151, 156, 161, 166, 171, 176, 181, 186, 186,
+ 186, 186, 73, 71, 69, 67, 66, 70, 76, 82, 90, 95, 101, 107, 114, 119,
+ 123, 129, 134, 138, 143, 147, 152, 157, 161, 166, 172, 176, 181, 186,
+ 191, 191, 191, 191, 79, 76, 74, 72, 70, 74, 80, 86, 93, 99, 105, 111,
+ 119, 123, 128, 134, 140, 144, 149, 153, 158, 163, 167, 172, 177, 182,
+ 186, 191, 196, 196, 196, 196, 85, 82, 79, 77, 74, 79, 84, 90, 97, 103,
+ 109, 116, 124, 129, 134, 140, 146, 150, 155, 160, 165, 169, 174, 178,
+ 183, 188, 192, 197, 202, 202, 202, 202, 92, 89, 85, 82, 79, 84, 89, 95,
+ 101, 107, 113, 121, 129, 134, 140, 146, 153, 157, 162, 167, 172, 176,
+ 181, 185, 190, 194, 198, 203, 207, 207, 207, 207, 98, 94, 90, 87, 84,
+ 89, 94, 99, 106, 111, 118, 125, 133, 138, 144, 150, 157, 162, 167, 172,
+ 177, 181, 186, 190, 195, 199, 203, 207, 212, 212, 212, 212, 104, 100,
+ 96, 93, 89, 94, 99, 104, 110, 116, 122, 129, 137, 143, 149, 155, 162,
+ 167, 171, 177, 182, 186, 191, 195, 200, 204, 208, 212, 216, 216, 216,
+ 216, 112, 107, 103, 99, 95, 100, 104, 110, 115, 121, 127, 134, 142, 147,
+ 153, 160, 167, 172, 177, 182, 188, 192, 196, 201, 205, 209, 213, 217,
+ 221, 221, 221, 221, 120, 115, 110, 106, 102, 106, 111, 116, 121, 126,
+ 133, 139, 147, 152, 158, 165, 172, 177, 182, 188, 194, 198, 202, 206,
+ 211, 215, 218, 222, 226, 226, 226, 226, 126, 121, 116, 112, 107, 112,
+ 116, 121, 126, 131, 137, 144, 151, 157, 163, 169, 176, 181, 186, 192,
+ 198, 202, 206, 211, 215, 219, 222, 226, 230, 230, 230, 230, 133, 128,
+ 123, 118, 113, 117, 122, 126, 131, 137, 143, 149, 156, 161, 167, 174,
+ 181, 186, 191, 196, 202, 206, 210, 215, 219, 223, 227, 230, 234, 234,
+ 234, 234, 141, 135, 130, 125, 120, 124, 128, 133, 137, 142, 148, 154,
+ 161, 166, 172, 178, 185, 190, 195, 201, 206, 211, 215, 219, 224, 227,
+ 231, 234, 238, 238, 238, 238, 150, 144, 138, 133, 128, 131, 135, 139,
+ 144, 149, 154, 160, 166, 172, 177, 183, 190, 195, 200, 205, 211, 215,
+ 219, 224, 228, 232, 235, 239, 242, 242, 242, 242, 156, 150, 144, 139,
+ 134, 137, 141, 145, 149, 154, 159, 165, 171, 176, 182, 188, 194, 199,
+ 204, 209, 215, 219, 223, 227, 232, 235, 238, 242, 246, 246, 246, 246,
+ 163, 156, 151, 145, 140, 144, 147, 151, 155, 160, 165, 170, 176, 181,
+ 186, 192, 198, 203, 208, 213, 218, 222, 227, 231, 235, 238, 242, 245,
+ 249, 249, 249, 249, 170, 164, 158, 152, 147, 150, 154, 157, 161, 166,
+ 170, 176, 181, 186, 191, 197, 203, 207, 212, 217, 222, 226, 230, 234,
+ 239, 242, 245, 249, 252, 252, 252, 252, 179, 172, 166, 160, 155, 158,
+ 161, 164, 168, 172, 177, 181, 186, 191, 196, 202, 207, 212, 216, 221,
+ 226, 230, 234, 238, 242, 246, 249, 252, 255, 255, 255, 255, 179, 172,
+ 166, 160, 155, 158, 161, 164, 168, 172, 177, 181, 186, 191, 196, 202,
+ 207, 212, 216, 221, 226, 230, 234, 238, 242, 246, 249, 252, 255, 255,
+ 255, 255, 179, 172, 166, 160, 155, 158, 161, 164, 168, 172, 177, 181,
+ 186, 191, 196, 202, 207, 212, 216, 221, 226, 230, 234, 238, 242, 246,
+ 249, 252, 255, 255, 255, 255, 179, 172, 166, 160, 155, 158, 161, 164,
+ 168, 172, 177, 181, 186, 191, 196, 202, 207, 212, 216, 221, 226, 230,
+ 234, 238, 242, 246, 249, 252, 255, 255, 255, 255 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 21, 23, 40, 65, 23, 36, 52, 72, 40, 52, 75, 93, 65, 72, 93, 111,
+ /* Size 8 */
+ 26, 19, 21, 28, 38, 50, 63, 77, 19, 22, 21, 25, 32, 42, 53, 66, 21, 21,
+ 30, 35, 42, 50, 61, 72, 28, 25, 35, 46, 54, 62, 71, 81, 38, 32, 42, 54,
+ 65, 74, 83, 92, 50, 42, 50, 62, 74, 85, 94, 101, 63, 53, 61, 71, 83, 94,
+ 103, 110, 77, 66, 72, 81, 92, 101, 110, 117,
+ /* Size 16 */
+ 25, 21, 18, 19, 20, 23, 26, 31, 36, 41, 48, 54, 61, 67, 74, 74, 21, 20,
+ 20, 20, 20, 22, 25, 29, 33, 38, 44, 49, 56, 61, 68, 68, 18, 20, 21, 21,
+ 20, 22, 24, 27, 31, 35, 40, 45, 51, 57, 63, 63, 19, 20, 21, 22, 24, 26,
+ 28, 31, 35, 39, 44, 49, 55, 60, 66, 66, 20, 20, 20, 24, 29, 31, 34, 37,
+ 40, 44, 48, 53, 58, 63, 69, 69, 23, 22, 22, 26, 31, 34, 38, 41, 45, 49,
+ 53, 58, 63, 68, 73, 73, 26, 25, 24, 28, 34, 38, 44, 47, 52, 55, 60, 64,
+ 68, 73, 78, 78, 31, 29, 27, 31, 37, 41, 47, 52, 57, 60, 65, 69, 74, 78,
+ 83, 83, 36, 33, 31, 35, 40, 45, 52, 57, 62, 67, 71, 75, 80, 84, 88, 88,
+ 41, 38, 35, 39, 44, 49, 55, 60, 67, 71, 76, 80, 84, 88, 92, 92, 48, 44,
+ 40, 44, 48, 53, 60, 65, 71, 76, 81, 85, 90, 94, 97, 97, 54, 49, 45, 49,
+ 53, 58, 64, 69, 75, 80, 85, 90, 94, 98, 101, 101, 61, 56, 51, 55, 58,
+ 63, 68, 74, 80, 84, 90, 94, 99, 102, 106, 106, 67, 61, 57, 60, 63, 68,
+ 73, 78, 84, 88, 94, 98, 102, 105, 109, 109, 74, 68, 63, 66, 69, 73, 78,
+ 83, 88, 92, 97, 101, 106, 109, 112, 112, 74, 68, 63, 66, 69, 73, 78, 83,
+ 88, 92, 97, 101, 106, 109, 112, 112,
+ /* Size 32 */
+ 24, 22, 21, 19, 18, 18, 19, 19, 19, 21, 22, 24, 26, 28, 30, 32, 35, 38,
+ 40, 43, 47, 50, 53, 56, 60, 63, 66, 69, 73, 73, 73, 73, 22, 21, 20, 19,
+ 19, 19, 19, 19, 19, 21, 22, 23, 25, 27, 29, 31, 34, 36, 39, 41, 45, 47,
+ 50, 53, 57, 60, 63, 66, 70, 70, 70, 70, 21, 20, 20, 19, 19, 19, 19, 19,
+ 20, 21, 22, 23, 25, 26, 28, 30, 33, 35, 37, 40, 43, 45, 48, 51, 55, 57,
+ 60, 63, 67, 67, 67, 67, 19, 19, 19, 20, 20, 20, 20, 20, 20, 21, 22, 23,
+ 24, 25, 27, 29, 31, 33, 36, 38, 41, 43, 46, 49, 52, 55, 58, 61, 64, 64,
+ 64, 64, 18, 19, 19, 20, 21, 20, 20, 20, 20, 21, 21, 22, 23, 25, 26, 28,
+ 30, 32, 34, 37, 39, 42, 44, 47, 50, 53, 56, 59, 62, 62, 62, 62, 18, 19,
+ 19, 20, 20, 21, 21, 21, 21, 22, 23, 24, 25, 27, 28, 30, 32, 34, 36, 38,
+ 41, 43, 46, 49, 52, 54, 57, 60, 63, 63, 63, 63, 19, 19, 19, 20, 20, 21,
+ 22, 22, 23, 24, 25, 26, 27, 29, 30, 32, 34, 36, 38, 40, 43, 45, 48, 50,
+ 53, 56, 59, 62, 65, 65, 65, 65, 19, 19, 19, 20, 20, 21, 22, 24, 25, 26,
+ 27, 29, 30, 31, 33, 35, 36, 38, 40, 43, 45, 47, 50, 52, 55, 58, 60, 63,
+ 66, 66, 66, 66, 19, 19, 20, 20, 20, 21, 23, 25, 28, 29, 30, 32, 33, 34,
+ 36, 37, 39, 41, 43, 45, 47, 49, 52, 54, 57, 59, 62, 65, 68, 68, 68, 68,
+ 21, 21, 21, 21, 21, 22, 24, 26, 29, 30, 32, 33, 35, 37, 38, 40, 42, 43,
+ 45, 47, 50, 52, 54, 57, 59, 62, 64, 67, 70, 70, 70, 70, 22, 22, 22, 22,
+ 21, 23, 25, 27, 30, 32, 34, 35, 37, 39, 40, 42, 44, 46, 48, 50, 52, 54,
+ 57, 59, 62, 64, 66, 69, 72, 72, 72, 72, 24, 23, 23, 23, 22, 24, 26, 29,
+ 32, 33, 35, 38, 40, 42, 43, 45, 47, 49, 51, 53, 55, 57, 59, 62, 64, 66,
+ 69, 71, 74, 74, 74, 74, 26, 25, 25, 24, 23, 25, 27, 30, 33, 35, 37, 40,
+ 43, 45, 46, 48, 51, 52, 54, 56, 58, 60, 62, 65, 67, 69, 71, 74, 76, 76,
+ 76, 76, 28, 27, 26, 25, 25, 27, 29, 31, 34, 37, 39, 42, 45, 46, 48, 51,
+ 53, 55, 57, 59, 61, 63, 65, 67, 70, 72, 74, 76, 79, 79, 79, 79, 30, 29,
+ 28, 27, 26, 28, 30, 33, 36, 38, 40, 43, 46, 48, 51, 53, 55, 57, 59, 61,
+ 64, 66, 68, 70, 72, 74, 76, 79, 81, 81, 81, 81, 32, 31, 30, 29, 28, 30,
+ 32, 35, 37, 40, 42, 45, 48, 51, 53, 55, 58, 60, 62, 64, 67, 69, 71, 73,
+ 75, 77, 79, 81, 83, 83, 83, 83, 35, 34, 33, 31, 30, 32, 34, 36, 39, 42,
+ 44, 47, 51, 53, 55, 58, 61, 63, 65, 67, 70, 72, 74, 76, 78, 80, 82, 84,
+ 86, 86, 86, 86, 38, 36, 35, 33, 32, 34, 36, 38, 41, 43, 46, 49, 52, 55,
+ 57, 60, 63, 65, 67, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 88, 88, 88,
+ 40, 39, 37, 36, 34, 36, 38, 40, 43, 45, 48, 51, 54, 57, 59, 62, 65, 67,
+ 70, 72, 74, 76, 78, 81, 83, 85, 86, 88, 91, 91, 91, 91, 43, 41, 40, 38,
+ 37, 38, 40, 43, 45, 47, 50, 53, 56, 59, 61, 64, 67, 70, 72, 74, 77, 79,
+ 81, 83, 85, 87, 89, 91, 93, 93, 93, 93, 47, 45, 43, 41, 39, 41, 43, 45,
+ 47, 50, 52, 55, 58, 61, 64, 67, 70, 72, 74, 77, 80, 82, 84, 86, 88, 90,
+ 92, 94, 95, 95, 95, 95, 50, 47, 45, 43, 42, 43, 45, 47, 49, 52, 54, 57,
+ 60, 63, 66, 69, 72, 74, 76, 79, 82, 84, 86, 88, 90, 92, 94, 95, 97, 97,
+ 97, 97, 53, 50, 48, 46, 44, 46, 48, 50, 52, 54, 57, 59, 62, 65, 68, 71,
+ 74, 76, 78, 81, 84, 86, 88, 90, 92, 94, 96, 97, 99, 99, 99, 99, 56, 53,
+ 51, 49, 47, 49, 50, 52, 54, 57, 59, 62, 65, 67, 70, 73, 76, 78, 81, 83,
+ 86, 88, 90, 92, 94, 96, 98, 100, 101, 101, 101, 101, 60, 57, 55, 52, 50,
+ 52, 53, 55, 57, 59, 62, 64, 67, 70, 72, 75, 78, 80, 83, 85, 88, 90, 92,
+ 94, 97, 98, 100, 102, 104, 104, 104, 104, 63, 60, 57, 55, 53, 54, 56,
+ 58, 59, 62, 64, 66, 69, 72, 74, 77, 80, 82, 85, 87, 90, 92, 94, 96, 98,
+ 100, 102, 103, 105, 105, 105, 105, 66, 63, 60, 58, 56, 57, 59, 60, 62,
+ 64, 66, 69, 71, 74, 76, 79, 82, 84, 86, 89, 92, 94, 96, 98, 100, 102,
+ 103, 105, 107, 107, 107, 107, 69, 66, 63, 61, 59, 60, 62, 63, 65, 67,
+ 69, 71, 74, 76, 79, 81, 84, 86, 88, 91, 94, 95, 97, 100, 102, 103, 105,
+ 107, 108, 108, 108, 108, 73, 70, 67, 64, 62, 63, 65, 66, 68, 70, 72, 74,
+ 76, 79, 81, 83, 86, 88, 91, 93, 95, 97, 99, 101, 104, 105, 107, 108,
+ 110, 110, 110, 110, 73, 70, 67, 64, 62, 63, 65, 66, 68, 70, 72, 74, 76,
+ 79, 81, 83, 86, 88, 91, 93, 95, 97, 99, 101, 104, 105, 107, 108, 110,
+ 110, 110, 110, 73, 70, 67, 64, 62, 63, 65, 66, 68, 70, 72, 74, 76, 79,
+ 81, 83, 86, 88, 91, 93, 95, 97, 99, 101, 104, 105, 107, 108, 110, 110,
+ 110, 110, 73, 70, 67, 64, 62, 63, 65, 66, 68, 70, 72, 74, 76, 79, 81,
+ 83, 86, 88, 91, 93, 95, 97, 99, 101, 104, 105, 107, 108, 110, 110, 110,
+ 110 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 101, 110, 139, 101, 122, 130, 149, 110, 130, 167, 190, 139, 149,
+ 190, 225,
+ /* Size 8 */
+ 64, 52, 94, 99, 106, 119, 134, 152, 52, 75, 96, 87, 89, 98, 110, 125,
+ 94, 96, 112, 108, 108, 113, 123, 136, 99, 87, 108, 122, 128, 134, 142,
+ 153, 106, 89, 108, 128, 143, 153, 162, 172, 119, 98, 113, 134, 153, 168,
+ 180, 191, 134, 110, 123, 142, 162, 180, 195, 207, 152, 125, 136, 153,
+ 172, 191, 207, 221,
+ /* Size 16 */
+ 64, 57, 52, 67, 94, 97, 99, 103, 106, 112, 119, 126, 134, 142, 152, 152,
+ 57, 59, 61, 74, 95, 94, 93, 95, 97, 102, 107, 114, 121, 129, 137, 137,
+ 52, 61, 75, 84, 96, 91, 87, 88, 89, 93, 98, 104, 110, 117, 125, 125, 67,
+ 74, 84, 92, 103, 99, 96, 97, 97, 101, 105, 110, 116, 123, 130, 130, 94,
+ 95, 96, 103, 112, 110, 108, 108, 108, 110, 113, 118, 123, 129, 136, 136,
+ 97, 94, 91, 99, 110, 112, 115, 116, 117, 120, 123, 127, 132, 138, 144,
+ 144, 99, 93, 87, 96, 108, 115, 122, 125, 128, 131, 134, 138, 142, 147,
+ 153, 153, 103, 95, 88, 97, 108, 116, 125, 130, 135, 139, 143, 147, 151,
+ 156, 162, 162, 106, 97, 89, 97, 108, 117, 128, 135, 143, 148, 153, 157,
+ 162, 167, 172, 172, 112, 102, 93, 101, 110, 120, 131, 139, 148, 154,
+ 160, 165, 171, 176, 181, 181, 119, 107, 98, 105, 113, 123, 134, 143,
+ 153, 160, 168, 174, 180, 185, 191, 191, 126, 114, 104, 110, 118, 127,
+ 138, 147, 157, 165, 174, 180, 187, 193, 199, 199, 134, 121, 110, 116,
+ 123, 132, 142, 151, 162, 171, 180, 187, 195, 201, 207, 207, 142, 129,
+ 117, 123, 129, 138, 147, 156, 167, 176, 185, 193, 201, 207, 214, 214,
+ 152, 137, 125, 130, 136, 144, 153, 162, 172, 181, 191, 199, 207, 214,
+ 221, 221, 152, 137, 125, 130, 136, 144, 153, 162, 172, 181, 191, 199,
+ 207, 214, 221, 221,
+ /* Size 32 */
+ 64, 60, 57, 54, 52, 58, 67, 78, 94, 95, 97, 98, 99, 101, 103, 104, 106,
+ 109, 112, 115, 119, 122, 126, 130, 134, 138, 142, 147, 152, 152, 152,
+ 152, 60, 59, 58, 57, 56, 62, 70, 80, 94, 95, 95, 95, 96, 97, 99, 100,
+ 101, 104, 107, 110, 113, 116, 120, 123, 127, 131, 135, 139, 144, 144,
+ 144, 144, 57, 58, 59, 60, 61, 67, 74, 83, 95, 94, 94, 93, 93, 94, 95,
+ 96, 97, 99, 102, 104, 107, 110, 114, 117, 121, 125, 129, 133, 137, 137,
+ 137, 137, 54, 57, 60, 63, 67, 72, 79, 86, 95, 94, 92, 91, 89, 90, 91,
+ 92, 93, 95, 97, 100, 102, 105, 109, 112, 115, 119, 123, 127, 131, 131,
+ 131, 131, 52, 56, 61, 67, 75, 79, 84, 89, 96, 93, 91, 89, 87, 87, 88,
+ 88, 89, 91, 93, 96, 98, 101, 104, 107, 110, 114, 117, 121, 125, 125,
+ 125, 125, 58, 62, 67, 72, 79, 83, 88, 93, 99, 97, 95, 93, 91, 92, 92,
+ 93, 93, 95, 97, 99, 101, 104, 107, 110, 113, 117, 120, 124, 128, 128,
+ 128, 128, 67, 70, 74, 79, 84, 88, 92, 97, 103, 101, 99, 98, 96, 96, 97,
+ 97, 97, 99, 101, 103, 105, 108, 110, 113, 116, 120, 123, 127, 130, 130,
+ 130, 130, 78, 80, 83, 86, 89, 93, 97, 102, 107, 106, 104, 103, 102, 102,
+ 102, 102, 102, 104, 106, 107, 109, 112, 114, 117, 120, 123, 126, 130,
+ 133, 133, 133, 133, 94, 94, 95, 95, 96, 99, 103, 107, 112, 111, 110,
+ 109, 108, 108, 108, 108, 108, 109, 110, 112, 113, 116, 118, 121, 123,
+ 126, 129, 133, 136, 136, 136, 136, 95, 95, 94, 94, 93, 97, 101, 106,
+ 111, 111, 111, 111, 111, 111, 112, 112, 112, 113, 115, 116, 118, 120,
+ 122, 125, 127, 130, 133, 137, 140, 140, 140, 140, 97, 95, 94, 92, 91,
+ 95, 99, 104, 110, 111, 112, 113, 115, 115, 116, 116, 117, 118, 120, 121,
+ 123, 125, 127, 129, 132, 135, 138, 141, 144, 144, 144, 144, 98, 95, 93,
+ 91, 89, 93, 98, 103, 109, 111, 113, 116, 118, 119, 120, 121, 122, 124,
+ 125, 127, 128, 130, 132, 134, 137, 139, 142, 145, 148, 148, 148, 148,
+ 99, 96, 93, 89, 87, 91, 96, 102, 108, 111, 115, 118, 122, 124, 125, 127,
+ 128, 130, 131, 132, 134, 136, 138, 140, 142, 144, 147, 150, 153, 153,
+ 153, 153, 101, 97, 94, 90, 87, 92, 96, 102, 108, 111, 115, 119, 124,
+ 125, 127, 129, 132, 133, 135, 136, 138, 140, 142, 144, 146, 149, 152,
+ 154, 157, 157, 157, 157, 103, 99, 95, 91, 88, 92, 97, 102, 108, 112,
+ 116, 120, 125, 127, 130, 133, 135, 137, 139, 141, 143, 145, 147, 149,
+ 151, 154, 156, 159, 162, 162, 162, 162, 104, 100, 96, 92, 88, 93, 97,
+ 102, 108, 112, 116, 121, 127, 129, 133, 136, 139, 141, 143, 145, 148,
+ 150, 152, 154, 156, 159, 161, 164, 167, 167, 167, 167, 106, 101, 97, 93,
+ 89, 93, 97, 102, 108, 112, 117, 122, 128, 132, 135, 139, 143, 145, 148,
+ 150, 153, 155, 157, 160, 162, 164, 167, 169, 172, 172, 172, 172, 109,
+ 104, 99, 95, 91, 95, 99, 104, 109, 113, 118, 124, 130, 133, 137, 141,
+ 145, 148, 151, 154, 157, 159, 161, 164, 166, 169, 171, 174, 176, 176,
+ 176, 176, 112, 107, 102, 97, 93, 97, 101, 106, 110, 115, 120, 125, 131,
+ 135, 139, 143, 148, 151, 154, 157, 160, 163, 165, 168, 171, 173, 176,
+ 178, 181, 181, 181, 181, 115, 110, 104, 100, 96, 99, 103, 107, 112, 116,
+ 121, 127, 132, 136, 141, 145, 150, 154, 157, 161, 164, 167, 170, 172,
+ 175, 178, 180, 183, 186, 186, 186, 186, 119, 113, 107, 102, 98, 101,
+ 105, 109, 113, 118, 123, 128, 134, 138, 143, 148, 153, 157, 160, 164,
+ 168, 171, 174, 177, 180, 183, 185, 188, 191, 191, 191, 191, 122, 116,
+ 110, 105, 101, 104, 108, 112, 116, 120, 125, 130, 136, 140, 145, 150,
+ 155, 159, 163, 167, 171, 174, 177, 180, 184, 186, 189, 192, 195, 195,
+ 195, 195, 126, 120, 114, 109, 104, 107, 110, 114, 118, 122, 127, 132,
+ 138, 142, 147, 152, 157, 161, 165, 170, 174, 177, 180, 184, 187, 190,
+ 193, 196, 199, 199, 199, 199, 130, 123, 117, 112, 107, 110, 113, 117,
+ 121, 125, 129, 134, 140, 144, 149, 154, 160, 164, 168, 172, 177, 180,
+ 184, 187, 191, 194, 197, 200, 203, 203, 203, 203, 134, 127, 121, 115,
+ 110, 113, 116, 120, 123, 127, 132, 137, 142, 146, 151, 156, 162, 166,
+ 171, 175, 180, 184, 187, 191, 195, 198, 201, 204, 207, 207, 207, 207,
+ 138, 131, 125, 119, 114, 117, 120, 123, 126, 130, 135, 139, 144, 149,
+ 154, 159, 164, 169, 173, 178, 183, 186, 190, 194, 198, 201, 204, 207,
+ 211, 211, 211, 211, 142, 135, 129, 123, 117, 120, 123, 126, 129, 133,
+ 138, 142, 147, 152, 156, 161, 167, 171, 176, 180, 185, 189, 193, 197,
+ 201, 204, 207, 211, 214, 214, 214, 214, 147, 139, 133, 127, 121, 124,
+ 127, 130, 133, 137, 141, 145, 150, 154, 159, 164, 169, 174, 178, 183,
+ 188, 192, 196, 200, 204, 207, 211, 214, 217, 217, 217, 217, 152, 144,
+ 137, 131, 125, 128, 130, 133, 136, 140, 144, 148, 153, 157, 162, 167,
+ 172, 176, 181, 186, 191, 195, 199, 203, 207, 211, 214, 217, 221, 221,
+ 221, 221, 152, 144, 137, 131, 125, 128, 130, 133, 136, 140, 144, 148,
+ 153, 157, 162, 167, 172, 176, 181, 186, 191, 195, 199, 203, 207, 211,
+ 214, 217, 221, 221, 221, 221, 152, 144, 137, 131, 125, 128, 130, 133,
+ 136, 140, 144, 148, 153, 157, 162, 167, 172, 176, 181, 186, 191, 195,
+ 199, 203, 207, 211, 214, 217, 221, 221, 221, 221, 152, 144, 137, 131,
+ 125, 128, 130, 133, 136, 140, 144, 148, 153, 157, 162, 167, 172, 176,
+ 181, 186, 191, 195, 199, 203, 207, 211, 214, 217, 221, 221, 221, 221 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 27, 44, 48, 61, 44, 53, 57, 66, 48, 57, 74, 86, 61, 66, 86, 103,
+ /* Size 8 */
+ 29, 23, 43, 46, 49, 56, 63, 72, 23, 34, 44, 40, 41, 45, 51, 59, 43, 44,
+ 52, 50, 50, 53, 58, 64, 46, 40, 50, 57, 60, 63, 67, 73, 49, 41, 50, 60,
+ 68, 73, 78, 83, 56, 45, 53, 63, 73, 81, 88, 93, 63, 51, 58, 67, 78, 88,
+ 96, 102, 72, 59, 64, 73, 83, 93, 102, 110,
+ /* Size 16 */
+ 28, 25, 23, 29, 42, 43, 45, 46, 48, 51, 54, 58, 62, 66, 70, 70, 25, 26,
+ 27, 33, 42, 42, 42, 43, 44, 46, 49, 52, 55, 59, 63, 63, 23, 27, 33, 37,
+ 43, 41, 39, 39, 40, 42, 44, 47, 50, 53, 57, 57, 29, 33, 37, 41, 46, 45,
+ 43, 44, 44, 46, 47, 50, 53, 56, 60, 60, 42, 42, 43, 46, 51, 50, 49, 49,
+ 49, 50, 51, 54, 56, 59, 63, 63, 43, 42, 41, 45, 50, 51, 52, 53, 53, 55,
+ 56, 58, 61, 63, 67, 67, 45, 42, 39, 43, 49, 52, 56, 57, 59, 60, 61, 63,
+ 65, 68, 71, 71, 46, 43, 39, 44, 49, 53, 57, 60, 62, 64, 66, 68, 70, 73,
+ 75, 75, 48, 44, 40, 44, 49, 53, 59, 62, 66, 69, 71, 73, 76, 78, 81, 81,
+ 51, 46, 42, 46, 50, 55, 60, 64, 69, 72, 75, 77, 80, 83, 85, 85, 54, 49,
+ 44, 47, 51, 56, 61, 66, 71, 75, 79, 82, 85, 88, 91, 91, 58, 52, 47, 50,
+ 54, 58, 63, 68, 73, 77, 82, 85, 89, 92, 95, 95, 62, 55, 50, 53, 56, 61,
+ 65, 70, 76, 80, 85, 89, 93, 96, 99, 99, 66, 59, 53, 56, 59, 63, 68, 73,
+ 78, 83, 88, 92, 96, 99, 103, 103, 70, 63, 57, 60, 63, 67, 71, 75, 81,
+ 85, 91, 95, 99, 103, 107, 107, 70, 63, 57, 60, 63, 67, 71, 75, 81, 85,
+ 91, 95, 99, 103, 107, 107,
+ /* Size 32 */
+ 28, 26, 25, 23, 22, 25, 29, 34, 41, 42, 43, 43, 44, 45, 46, 46, 47, 49,
+ 50, 52, 53, 55, 57, 59, 61, 63, 65, 67, 69, 69, 69, 69, 26, 26, 25, 25,
+ 24, 27, 31, 35, 42, 42, 42, 42, 42, 43, 44, 44, 45, 46, 48, 49, 50, 52,
+ 54, 55, 57, 59, 61, 63, 66, 66, 66, 66, 25, 25, 26, 26, 26, 29, 32, 37,
+ 42, 42, 41, 41, 41, 41, 42, 42, 43, 44, 45, 46, 48, 49, 51, 53, 54, 56,
+ 58, 60, 62, 62, 62, 62, 23, 25, 26, 28, 29, 32, 34, 38, 42, 41, 41, 40,
+ 39, 40, 40, 41, 41, 42, 43, 44, 45, 47, 48, 50, 52, 53, 55, 57, 59, 59,
+ 59, 59, 22, 24, 26, 29, 33, 35, 37, 39, 42, 41, 40, 39, 38, 38, 39, 39,
+ 39, 40, 41, 42, 43, 45, 46, 48, 49, 51, 53, 54, 56, 56, 56, 56, 25, 27,
+ 29, 32, 35, 37, 39, 41, 44, 43, 42, 41, 40, 40, 41, 41, 41, 42, 43, 44,
+ 45, 46, 48, 49, 51, 52, 54, 56, 58, 58, 58, 58, 29, 31, 32, 34, 37, 39,
+ 41, 43, 46, 45, 44, 43, 43, 43, 43, 43, 43, 44, 45, 46, 47, 48, 49, 51,
+ 52, 54, 55, 57, 59, 59, 59, 59, 34, 35, 37, 38, 39, 41, 43, 45, 48, 47,
+ 46, 46, 45, 45, 45, 45, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 57, 58,
+ 60, 60, 60, 60, 41, 42, 42, 42, 42, 44, 46, 48, 50, 49, 49, 49, 48, 48,
+ 48, 48, 48, 49, 49, 50, 51, 52, 53, 54, 55, 57, 58, 60, 62, 62, 62, 62,
+ 42, 42, 42, 41, 41, 43, 45, 47, 49, 50, 50, 50, 50, 50, 50, 50, 50, 51,
+ 51, 52, 53, 54, 55, 56, 57, 59, 60, 62, 64, 64, 64, 64, 43, 42, 41, 41,
+ 40, 42, 44, 46, 49, 50, 50, 51, 51, 52, 52, 52, 52, 53, 54, 54, 55, 56,
+ 57, 58, 60, 61, 62, 64, 65, 65, 65, 65, 43, 42, 41, 40, 39, 41, 43, 46,
+ 49, 50, 51, 52, 53, 54, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63,
+ 65, 66, 68, 68, 68, 68, 44, 42, 41, 39, 38, 40, 43, 45, 48, 50, 51, 53,
+ 55, 56, 56, 57, 58, 58, 59, 60, 61, 61, 62, 63, 64, 66, 67, 68, 70, 70,
+ 70, 70, 45, 43, 41, 40, 38, 40, 43, 45, 48, 50, 52, 54, 56, 57, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 72, 72, 72, 46, 44,
+ 42, 40, 39, 41, 43, 45, 48, 50, 52, 54, 56, 57, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 72, 73, 74, 74, 74, 74, 46, 44, 42, 41, 39, 41,
+ 43, 45, 48, 50, 52, 54, 57, 58, 60, 61, 63, 64, 65, 66, 67, 68, 70, 71,
+ 72, 73, 74, 75, 77, 77, 77, 77, 47, 45, 43, 41, 39, 41, 43, 45, 48, 50,
+ 52, 55, 58, 59, 61, 63, 65, 66, 67, 69, 70, 71, 72, 73, 75, 76, 77, 78,
+ 79, 79, 79, 79, 49, 46, 44, 42, 40, 42, 44, 46, 49, 51, 53, 56, 58, 60,
+ 62, 64, 66, 68, 69, 70, 72, 73, 74, 75, 77, 78, 79, 80, 82, 82, 82, 82,
+ 50, 48, 45, 43, 41, 43, 45, 47, 49, 51, 54, 56, 59, 61, 63, 65, 67, 69,
+ 70, 72, 74, 75, 76, 78, 79, 80, 81, 83, 84, 84, 84, 84, 52, 49, 46, 44,
+ 42, 44, 46, 48, 50, 52, 54, 57, 60, 62, 64, 66, 69, 70, 72, 74, 76, 77,
+ 78, 80, 81, 83, 84, 85, 87, 87, 87, 87, 53, 50, 48, 45, 43, 45, 47, 49,
+ 51, 53, 55, 58, 61, 63, 65, 67, 70, 72, 74, 76, 78, 79, 81, 82, 84, 85,
+ 86, 88, 89, 89, 89, 89, 55, 52, 49, 47, 45, 46, 48, 50, 52, 54, 56, 59,
+ 61, 64, 66, 68, 71, 73, 75, 77, 79, 81, 82, 84, 86, 87, 88, 90, 91, 91,
+ 91, 91, 57, 54, 51, 48, 46, 48, 49, 51, 53, 55, 57, 60, 62, 65, 67, 70,
+ 72, 74, 76, 78, 81, 82, 84, 86, 87, 89, 90, 92, 93, 93, 93, 93, 59, 55,
+ 53, 50, 48, 49, 51, 52, 54, 56, 58, 61, 63, 66, 68, 71, 73, 75, 78, 80,
+ 82, 84, 86, 87, 89, 91, 92, 94, 96, 96, 96, 96, 61, 57, 54, 52, 49, 51,
+ 52, 54, 55, 57, 60, 62, 64, 67, 69, 72, 75, 77, 79, 81, 84, 86, 87, 89,
+ 91, 93, 95, 96, 98, 98, 98, 98, 63, 59, 56, 53, 51, 52, 54, 55, 57, 59,
+ 61, 63, 66, 68, 70, 73, 76, 78, 80, 83, 85, 87, 89, 91, 93, 95, 96, 98,
+ 100, 100, 100, 100, 65, 61, 58, 55, 53, 54, 55, 57, 58, 60, 62, 65, 67,
+ 69, 72, 74, 77, 79, 81, 84, 86, 88, 90, 92, 95, 96, 98, 100, 102, 102,
+ 102, 102, 67, 63, 60, 57, 54, 56, 57, 58, 60, 62, 64, 66, 68, 71, 73,
+ 75, 78, 80, 83, 85, 88, 90, 92, 94, 96, 98, 100, 102, 103, 103, 103,
+ 103, 69, 66, 62, 59, 56, 58, 59, 60, 62, 64, 65, 68, 70, 72, 74, 77, 79,
+ 82, 84, 87, 89, 91, 93, 96, 98, 100, 102, 103, 105, 105, 105, 105, 69,
+ 66, 62, 59, 56, 58, 59, 60, 62, 64, 65, 68, 70, 72, 74, 77, 79, 82, 84,
+ 87, 89, 91, 93, 96, 98, 100, 102, 103, 105, 105, 105, 105, 69, 66, 62,
+ 59, 56, 58, 59, 60, 62, 64, 65, 68, 70, 72, 74, 77, 79, 82, 84, 87, 89,
+ 91, 93, 96, 98, 100, 102, 103, 105, 105, 105, 105, 69, 66, 62, 59, 56,
+ 58, 59, 60, 62, 64, 65, 68, 70, 72, 74, 77, 79, 82, 84, 87, 89, 91, 93,
+ 96, 98, 100, 102, 103, 105, 105, 105, 105 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 70, 112, 168, 70, 103, 139, 181, 112, 139, 188, 221, 168, 181, 221,
+ 251,
+ /* Size 8 */
+ 64, 49, 53, 68, 90, 115, 139, 163, 49, 56, 53, 62, 78, 99, 121, 144, 53,
+ 53, 74, 85, 98, 115, 135, 154, 68, 62, 85, 106, 122, 137, 153, 169, 90,
+ 78, 98, 122, 142, 158, 172, 185, 115, 99, 115, 137, 158, 174, 188, 199,
+ 139, 121, 135, 153, 172, 188, 200, 210, 163, 144, 154, 169, 185, 199,
+ 210, 220,
+ /* Size 16 */
+ 64, 55, 49, 51, 53, 59, 68, 78, 90, 101, 115, 126, 139, 150, 163, 163,
+ 55, 54, 52, 52, 53, 58, 65, 73, 84, 94, 106, 117, 130, 140, 153, 153,
+ 49, 52, 56, 54, 53, 57, 62, 69, 78, 87, 99, 109, 121, 132, 144, 144, 51,
+ 52, 54, 58, 62, 66, 72, 79, 87, 96, 106, 116, 128, 137, 149, 149, 53,
+ 53, 53, 62, 74, 79, 85, 91, 98, 106, 115, 124, 135, 144, 154, 154, 59,
+ 58, 57, 66, 79, 86, 94, 101, 109, 117, 125, 134, 143, 152, 161, 161, 68,
+ 65, 62, 72, 85, 94, 106, 114, 122, 129, 137, 145, 153, 160, 169, 169,
+ 78, 73, 69, 79, 91, 101, 114, 122, 131, 139, 147, 154, 162, 169, 176,
+ 176, 90, 84, 78, 87, 98, 109, 122, 131, 142, 149, 158, 164, 172, 178,
+ 185, 185, 101, 94, 87, 96, 106, 117, 129, 139, 149, 157, 166, 172, 179,
+ 185, 191, 191, 115, 106, 99, 106, 115, 125, 137, 147, 158, 166, 174,
+ 181, 188, 193, 199, 199, 126, 117, 109, 116, 124, 134, 145, 154, 164,
+ 172, 181, 187, 194, 199, 204, 204, 139, 130, 121, 128, 135, 143, 153,
+ 162, 172, 179, 188, 194, 200, 205, 210, 210, 150, 140, 132, 137, 144,
+ 152, 160, 169, 178, 185, 193, 199, 205, 210, 215, 215, 163, 153, 144,
+ 149, 154, 161, 169, 176, 185, 191, 199, 204, 210, 215, 220, 220, 163,
+ 153, 144, 149, 154, 161, 169, 176, 185, 191, 199, 204, 210, 215, 220,
+ 220,
+ /* Size 32 */
+ 64, 59, 55, 52, 49, 50, 51, 52, 53, 56, 59, 64, 68, 73, 78, 83, 90, 95,
+ 101, 107, 115, 120, 126, 132, 139, 145, 150, 156, 163, 163, 163, 163,
+ 59, 57, 54, 52, 50, 51, 51, 52, 53, 56, 59, 63, 67, 71, 75, 81, 87, 92,
+ 97, 103, 110, 115, 121, 127, 134, 140, 145, 151, 158, 158, 158, 158, 55,
+ 54, 54, 53, 52, 52, 52, 53, 53, 55, 58, 62, 65, 69, 73, 78, 84, 88, 94,
+ 99, 106, 111, 117, 123, 130, 135, 140, 146, 153, 153, 153, 153, 52, 52,
+ 53, 53, 54, 54, 53, 53, 53, 55, 58, 61, 63, 67, 71, 76, 81, 85, 90, 96,
+ 102, 107, 113, 119, 125, 130, 136, 142, 148, 148, 148, 148, 49, 50, 52,
+ 54, 56, 55, 54, 54, 53, 55, 57, 60, 62, 65, 69, 73, 78, 83, 87, 93, 99,
+ 104, 109, 115, 121, 126, 132, 137, 144, 144, 144, 144, 50, 51, 52, 54,
+ 55, 56, 56, 57, 57, 59, 62, 64, 67, 70, 74, 78, 82, 87, 91, 97, 102,
+ 107, 112, 118, 124, 129, 134, 140, 146, 146, 146, 146, 51, 51, 52, 53,
+ 54, 56, 58, 60, 62, 64, 66, 69, 72, 75, 79, 83, 87, 91, 96, 101, 106,
+ 111, 116, 122, 128, 132, 137, 143, 149, 149, 149, 149, 52, 52, 53, 53,
+ 54, 57, 60, 63, 67, 70, 72, 75, 78, 81, 84, 88, 92, 96, 101, 105, 111,
+ 115, 120, 125, 131, 136, 140, 146, 151, 151, 151, 151, 53, 53, 53, 53,
+ 53, 57, 62, 67, 74, 76, 79, 82, 85, 88, 91, 95, 98, 102, 106, 111, 115,
+ 120, 124, 129, 135, 139, 144, 149, 154, 154, 154, 154, 56, 56, 55, 55,
+ 55, 59, 64, 70, 76, 79, 82, 86, 89, 93, 96, 99, 103, 107, 111, 115, 120,
+ 124, 129, 134, 139, 143, 148, 152, 158, 158, 158, 158, 59, 59, 58, 58,
+ 57, 62, 66, 72, 79, 82, 86, 90, 94, 98, 101, 105, 109, 113, 117, 121,
+ 125, 129, 134, 138, 143, 147, 152, 156, 161, 161, 161, 161, 64, 63, 62,
+ 61, 60, 64, 69, 75, 82, 86, 90, 95, 100, 103, 107, 111, 115, 119, 122,
+ 127, 131, 135, 139, 143, 148, 152, 156, 160, 165, 165, 165, 165, 68, 67,
+ 65, 63, 62, 67, 72, 78, 85, 89, 94, 100, 106, 110, 114, 118, 122, 125,
+ 129, 133, 137, 141, 145, 149, 153, 157, 160, 165, 169, 169, 169, 169,
+ 73, 71, 69, 67, 65, 70, 75, 81, 88, 93, 98, 103, 110, 113, 118, 122,
+ 126, 130, 134, 138, 142, 145, 149, 153, 157, 161, 164, 168, 173, 173,
+ 173, 173, 78, 75, 73, 71, 69, 74, 79, 84, 91, 96, 101, 107, 114, 118,
+ 122, 126, 131, 135, 139, 142, 147, 150, 154, 158, 162, 165, 169, 172,
+ 176, 176, 176, 176, 83, 81, 78, 76, 73, 78, 83, 88, 95, 99, 105, 111,
+ 118, 122, 126, 131, 136, 140, 144, 148, 152, 155, 159, 163, 166, 170,
+ 173, 177, 180, 180, 180, 180, 90, 87, 84, 81, 78, 82, 87, 92, 98, 103,
+ 109, 115, 122, 126, 131, 136, 142, 146, 149, 153, 158, 161, 164, 168,
+ 172, 175, 178, 181, 185, 185, 185, 185, 95, 92, 88, 85, 83, 87, 91, 96,
+ 102, 107, 113, 119, 125, 130, 135, 140, 146, 149, 153, 157, 162, 165,
+ 168, 172, 175, 178, 181, 185, 188, 188, 188, 188, 101, 97, 94, 90, 87,
+ 91, 96, 101, 106, 111, 117, 122, 129, 134, 139, 144, 149, 153, 157, 161,
+ 166, 169, 172, 176, 179, 182, 185, 188, 191, 191, 191, 191, 107, 103,
+ 99, 96, 93, 97, 101, 105, 111, 115, 121, 127, 133, 138, 142, 148, 153,
+ 157, 161, 165, 170, 173, 176, 180, 183, 186, 189, 192, 195, 195, 195,
+ 195, 115, 110, 106, 102, 99, 102, 106, 111, 115, 120, 125, 131, 137,
+ 142, 147, 152, 158, 162, 166, 170, 174, 177, 181, 184, 188, 190, 193,
+ 196, 199, 199, 199, 199, 120, 115, 111, 107, 104, 107, 111, 115, 120,
+ 124, 129, 135, 141, 145, 150, 155, 161, 165, 169, 173, 177, 181, 184,
+ 187, 191, 193, 196, 199, 202, 202, 202, 202, 126, 121, 117, 113, 109,
+ 112, 116, 120, 124, 129, 134, 139, 145, 149, 154, 159, 164, 168, 172,
+ 176, 181, 184, 187, 190, 194, 196, 199, 202, 204, 204, 204, 204, 132,
+ 127, 123, 119, 115, 118, 122, 125, 129, 134, 138, 143, 149, 153, 158,
+ 163, 168, 172, 176, 180, 184, 187, 190, 194, 197, 199, 202, 205, 207,
+ 207, 207, 207, 139, 134, 130, 125, 121, 124, 128, 131, 135, 139, 143,
+ 148, 153, 157, 162, 166, 172, 175, 179, 183, 188, 191, 194, 197, 200,
+ 203, 205, 208, 210, 210, 210, 210, 145, 140, 135, 130, 126, 129, 132,
+ 136, 139, 143, 147, 152, 157, 161, 165, 170, 175, 178, 182, 186, 190,
+ 193, 196, 199, 203, 205, 208, 210, 213, 213, 213, 213, 150, 145, 140,
+ 136, 132, 134, 137, 140, 144, 148, 152, 156, 160, 164, 169, 173, 178,
+ 181, 185, 189, 193, 196, 199, 202, 205, 208, 210, 212, 215, 215, 215,
+ 215, 156, 151, 146, 142, 137, 140, 143, 146, 149, 152, 156, 160, 165,
+ 168, 172, 177, 181, 185, 188, 192, 196, 199, 202, 205, 208, 210, 212,
+ 215, 217, 217, 217, 217, 163, 158, 153, 148, 144, 146, 149, 151, 154,
+ 158, 161, 165, 169, 173, 176, 180, 185, 188, 191, 195, 199, 202, 204,
+ 207, 210, 213, 215, 217, 220, 220, 220, 220, 163, 158, 153, 148, 144,
+ 146, 149, 151, 154, 158, 161, 165, 169, 173, 176, 180, 185, 188, 191,
+ 195, 199, 202, 204, 207, 210, 213, 215, 217, 220, 220, 220, 220, 163,
+ 158, 153, 148, 144, 146, 149, 151, 154, 158, 161, 165, 169, 173, 176,
+ 180, 185, 188, 191, 195, 199, 202, 204, 207, 210, 213, 215, 217, 220,
+ 220, 220, 220, 163, 158, 153, 148, 144, 146, 149, 151, 154, 158, 161,
+ 165, 169, 173, 176, 180, 185, 188, 191, 195, 199, 202, 204, 207, 210,
+ 213, 215, 217, 220, 220, 220, 220 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 24, 26, 43, 67, 26, 39, 54, 72, 43, 54, 75, 91, 67, 72, 91, 105,
+ /* Size 8 */
+ 28, 21, 23, 30, 41, 53, 65, 77, 21, 25, 23, 27, 35, 45, 56, 68, 23, 23,
+ 33, 38, 45, 53, 63, 73, 30, 27, 38, 49, 56, 64, 72, 81, 41, 35, 45, 56,
+ 67, 75, 82, 89, 53, 45, 53, 64, 75, 84, 91, 97, 65, 56, 63, 72, 82, 91,
+ 98, 104, 77, 68, 73, 81, 89, 97, 104, 109,
+ /* Size 16 */
+ 27, 24, 21, 21, 22, 25, 29, 34, 39, 44, 51, 56, 63, 68, 75, 75, 24, 23,
+ 22, 22, 22, 25, 28, 32, 36, 41, 47, 52, 58, 63, 70, 70, 21, 22, 24, 23,
+ 23, 24, 27, 30, 34, 38, 43, 48, 54, 59, 65, 65, 21, 22, 23, 25, 26, 28,
+ 31, 34, 38, 42, 47, 52, 57, 62, 68, 68, 22, 22, 23, 26, 32, 34, 37, 40,
+ 43, 47, 51, 55, 61, 65, 70, 70, 25, 25, 24, 28, 34, 37, 41, 44, 48, 52,
+ 56, 60, 65, 69, 74, 74, 29, 28, 27, 31, 37, 41, 47, 50, 54, 58, 62, 65,
+ 70, 74, 78, 78, 34, 32, 30, 34, 40, 44, 50, 54, 59, 62, 67, 70, 74, 78,
+ 82, 82, 39, 36, 34, 38, 43, 48, 54, 59, 64, 68, 72, 76, 79, 83, 86, 86,
+ 44, 41, 38, 42, 47, 52, 58, 62, 68, 72, 76, 80, 83, 86, 90, 90, 51, 47,
+ 43, 47, 51, 56, 62, 67, 72, 76, 81, 84, 88, 91, 94, 94, 56, 52, 48, 52,
+ 55, 60, 65, 70, 76, 80, 84, 87, 91, 94, 97, 97, 63, 58, 54, 57, 61, 65,
+ 70, 74, 79, 83, 88, 91, 95, 97, 100, 100, 68, 63, 59, 62, 65, 69, 74,
+ 78, 83, 86, 91, 94, 97, 100, 103, 103, 75, 70, 65, 68, 70, 74, 78, 82,
+ 86, 90, 94, 97, 100, 103, 105, 105, 75, 70, 65, 68, 70, 74, 78, 82, 86,
+ 90, 94, 97, 100, 103, 105, 105,
+ /* Size 32 */
+ 27, 25, 23, 22, 20, 21, 21, 21, 22, 23, 25, 27, 29, 31, 33, 36, 39, 41,
+ 43, 46, 50, 52, 55, 58, 62, 64, 67, 70, 73, 73, 73, 73, 25, 24, 23, 22,
+ 21, 21, 21, 22, 22, 23, 25, 26, 28, 30, 32, 34, 37, 39, 42, 45, 48, 50,
+ 53, 56, 59, 62, 65, 67, 71, 71, 71, 71, 23, 23, 22, 22, 22, 22, 22, 22,
+ 22, 23, 24, 26, 27, 29, 31, 33, 36, 38, 40, 43, 46, 48, 51, 54, 57, 60,
+ 62, 65, 68, 68, 68, 68, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 24, 25,
+ 27, 28, 30, 32, 34, 36, 39, 41, 44, 46, 49, 52, 55, 57, 60, 63, 66, 66,
+ 66, 66, 20, 21, 22, 22, 23, 23, 23, 22, 22, 23, 24, 25, 26, 28, 29, 31,
+ 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 58, 61, 64, 64, 64, 64, 21, 21,
+ 22, 22, 23, 23, 23, 24, 24, 25, 26, 27, 28, 30, 31, 33, 35, 37, 39, 42,
+ 44, 46, 49, 52, 54, 57, 59, 62, 65, 65, 65, 65, 21, 21, 22, 22, 23, 23,
+ 24, 25, 26, 27, 28, 29, 30, 32, 33, 35, 37, 39, 41, 43, 46, 48, 51, 53,
+ 56, 58, 61, 63, 66, 66, 66, 66, 21, 22, 22, 22, 22, 24, 25, 27, 28, 29,
+ 30, 32, 33, 34, 36, 38, 40, 41, 43, 46, 48, 50, 52, 55, 58, 60, 62, 65,
+ 68, 68, 68, 68, 22, 22, 22, 22, 22, 24, 26, 28, 31, 32, 33, 35, 36, 38,
+ 39, 41, 42, 44, 46, 48, 50, 52, 54, 57, 59, 62, 64, 66, 69, 69, 69, 69,
+ 23, 23, 23, 23, 23, 25, 27, 29, 32, 34, 35, 37, 38, 40, 41, 43, 45, 46,
+ 48, 50, 52, 54, 57, 59, 61, 64, 66, 68, 71, 71, 71, 71, 25, 25, 24, 24,
+ 24, 26, 28, 30, 33, 35, 37, 39, 41, 42, 44, 45, 47, 49, 51, 53, 55, 57,
+ 59, 61, 64, 66, 68, 70, 73, 73, 73, 73, 27, 26, 26, 25, 25, 27, 29, 32,
+ 35, 37, 39, 41, 43, 45, 46, 48, 50, 52, 54, 56, 58, 60, 61, 64, 66, 68,
+ 70, 72, 74, 74, 74, 74, 29, 28, 27, 27, 26, 28, 30, 33, 36, 38, 41, 43,
+ 46, 48, 49, 51, 53, 55, 57, 59, 61, 62, 64, 66, 68, 70, 72, 74, 76, 76,
+ 76, 76, 31, 30, 29, 28, 28, 30, 32, 34, 38, 40, 42, 45, 48, 49, 51, 53,
+ 56, 57, 59, 61, 63, 65, 66, 68, 71, 72, 74, 76, 78, 78, 78, 78, 33, 32,
+ 31, 30, 29, 31, 33, 36, 39, 41, 44, 46, 49, 51, 53, 55, 58, 60, 61, 63,
+ 65, 67, 69, 71, 73, 75, 76, 78, 80, 80, 80, 80, 36, 34, 33, 32, 31, 33,
+ 35, 38, 41, 43, 45, 48, 51, 53, 55, 58, 60, 62, 64, 66, 68, 70, 71, 73,
+ 75, 77, 79, 80, 82, 82, 82, 82, 39, 37, 36, 34, 33, 35, 37, 40, 42, 45,
+ 47, 50, 53, 56, 58, 60, 63, 65, 67, 69, 71, 72, 74, 76, 78, 79, 81, 83,
+ 85, 85, 85, 85, 41, 39, 38, 36, 35, 37, 39, 41, 44, 46, 49, 52, 55, 57,
+ 60, 62, 65, 67, 69, 71, 73, 74, 76, 78, 80, 81, 83, 85, 86, 86, 86, 86,
+ 43, 42, 40, 39, 37, 39, 41, 43, 46, 48, 51, 54, 57, 59, 61, 64, 67, 69,
+ 70, 73, 75, 76, 78, 80, 82, 83, 85, 86, 88, 88, 88, 88, 46, 45, 43, 41,
+ 40, 42, 43, 46, 48, 50, 53, 56, 59, 61, 63, 66, 69, 71, 73, 75, 77, 79,
+ 80, 82, 84, 85, 87, 88, 90, 90, 90, 90, 50, 48, 46, 44, 43, 44, 46, 48,
+ 50, 52, 55, 58, 61, 63, 65, 68, 71, 73, 75, 77, 79, 81, 83, 84, 86, 87,
+ 89, 90, 92, 92, 92, 92, 52, 50, 48, 46, 45, 46, 48, 50, 52, 54, 57, 60,
+ 62, 65, 67, 70, 72, 74, 76, 79, 81, 82, 84, 86, 88, 89, 90, 92, 93, 93,
+ 93, 93, 55, 53, 51, 49, 47, 49, 51, 52, 54, 57, 59, 61, 64, 66, 69, 71,
+ 74, 76, 78, 80, 83, 84, 86, 88, 89, 91, 92, 94, 95, 95, 95, 95, 58, 56,
+ 54, 52, 50, 52, 53, 55, 57, 59, 61, 64, 66, 68, 71, 73, 76, 78, 80, 82,
+ 84, 86, 88, 89, 91, 92, 94, 95, 97, 97, 97, 97, 62, 59, 57, 55, 53, 54,
+ 56, 58, 59, 61, 64, 66, 68, 71, 73, 75, 78, 80, 82, 84, 86, 88, 89, 91,
+ 93, 94, 95, 97, 98, 98, 98, 98, 64, 62, 60, 57, 55, 57, 58, 60, 62, 64,
+ 66, 68, 70, 72, 75, 77, 79, 81, 83, 85, 87, 89, 91, 92, 94, 95, 97, 98,
+ 99, 99, 99, 99, 67, 65, 62, 60, 58, 59, 61, 62, 64, 66, 68, 70, 72, 74,
+ 76, 79, 81, 83, 85, 87, 89, 90, 92, 94, 95, 97, 98, 99, 101, 101, 101,
+ 101, 70, 67, 65, 63, 61, 62, 63, 65, 66, 68, 70, 72, 74, 76, 78, 80, 83,
+ 85, 86, 88, 90, 92, 94, 95, 97, 98, 99, 101, 102, 102, 102, 102, 73, 71,
+ 68, 66, 64, 65, 66, 68, 69, 71, 73, 74, 76, 78, 80, 82, 85, 86, 88, 90,
+ 92, 93, 95, 97, 98, 99, 101, 102, 103, 103, 103, 103, 73, 71, 68, 66,
+ 64, 65, 66, 68, 69, 71, 73, 74, 76, 78, 80, 82, 85, 86, 88, 90, 92, 93,
+ 95, 97, 98, 99, 101, 102, 103, 103, 103, 103, 73, 71, 68, 66, 64, 65,
+ 66, 68, 69, 71, 73, 74, 76, 78, 80, 82, 85, 86, 88, 90, 92, 93, 95, 97,
+ 98, 99, 101, 102, 103, 103, 103, 103, 73, 71, 68, 66, 64, 65, 66, 68,
+ 69, 71, 73, 74, 76, 78, 80, 82, 85, 86, 88, 90, 92, 93, 95, 97, 98, 99,
+ 101, 102, 103, 103, 103, 103 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 99, 107, 133, 99, 117, 124, 141, 107, 124, 156, 175, 133, 141, 175,
+ 203,
+ /* Size 8 */
+ 64, 52, 92, 97, 103, 114, 128, 143, 52, 74, 93, 85, 87, 96, 107, 120,
+ 92, 93, 108, 105, 104, 109, 118, 129, 97, 85, 105, 117, 122, 127, 134,
+ 143, 103, 87, 104, 122, 135, 144, 151, 159, 114, 96, 109, 127, 144, 156,
+ 166, 175, 128, 107, 118, 134, 151, 166, 178, 188, 143, 120, 129, 143,
+ 159, 175, 188, 198,
+ /* Size 16 */
+ 64, 57, 52, 66, 92, 94, 97, 100, 103, 108, 114, 121, 128, 135, 143, 143,
+ 57, 59, 61, 74, 93, 92, 91, 93, 95, 99, 104, 110, 116, 123, 130, 130,
+ 52, 61, 74, 83, 93, 89, 85, 86, 87, 91, 96, 101, 107, 113, 120, 120, 66,
+ 74, 83, 91, 100, 97, 94, 95, 95, 98, 102, 107, 112, 118, 124, 124, 92,
+ 93, 93, 100, 108, 106, 105, 104, 104, 107, 109, 114, 118, 123, 129, 129,
+ 94, 92, 89, 97, 106, 108, 111, 112, 113, 115, 118, 122, 126, 131, 136,
+ 136, 97, 91, 85, 94, 105, 111, 117, 120, 122, 125, 127, 131, 134, 139,
+ 143, 143, 100, 93, 86, 95, 104, 112, 120, 124, 129, 132, 135, 139, 142,
+ 146, 151, 151, 103, 95, 87, 95, 104, 113, 122, 129, 135, 139, 144, 147,
+ 151, 155, 159, 159, 108, 99, 91, 98, 107, 115, 125, 132, 139, 144, 150,
+ 154, 158, 162, 167, 167, 114, 104, 96, 102, 109, 118, 127, 135, 144,
+ 150, 156, 161, 166, 170, 175, 175, 121, 110, 101, 107, 114, 122, 131,
+ 139, 147, 154, 161, 166, 172, 176, 181, 181, 128, 116, 107, 112, 118,
+ 126, 134, 142, 151, 158, 166, 172, 178, 183, 188, 188, 135, 123, 113,
+ 118, 123, 131, 139, 146, 155, 162, 170, 176, 183, 188, 193, 193, 143,
+ 130, 120, 124, 129, 136, 143, 151, 159, 167, 175, 181, 188, 193, 198,
+ 198, 143, 130, 120, 124, 129, 136, 143, 151, 159, 167, 175, 181, 188,
+ 193, 198, 198,
+ /* Size 32 */
+ 64, 61, 57, 55, 52, 58, 66, 77, 92, 93, 94, 96, 97, 98, 100, 101, 103,
+ 106, 108, 111, 114, 117, 121, 124, 128, 131, 135, 139, 143, 143, 143,
+ 143, 61, 59, 58, 57, 56, 62, 70, 79, 92, 93, 93, 93, 94, 95, 96, 97, 99,
+ 101, 104, 106, 109, 112, 115, 118, 122, 125, 129, 132, 136, 136, 136,
+ 136, 57, 58, 59, 60, 61, 67, 74, 82, 93, 92, 92, 91, 91, 92, 93, 94, 95,
+ 97, 99, 101, 104, 107, 110, 113, 116, 119, 123, 126, 130, 130, 130, 130,
+ 55, 57, 60, 63, 67, 72, 78, 85, 93, 92, 90, 89, 88, 89, 89, 90, 91, 93,
+ 95, 97, 100, 102, 105, 108, 111, 114, 118, 121, 125, 125, 125, 125, 52,
+ 56, 61, 67, 74, 78, 83, 88, 93, 91, 89, 87, 85, 86, 86, 87, 87, 89, 91,
+ 93, 96, 98, 101, 104, 107, 110, 113, 116, 120, 120, 120, 120, 58, 62,
+ 67, 72, 78, 82, 86, 91, 97, 95, 93, 91, 89, 90, 90, 91, 91, 93, 95, 97,
+ 99, 101, 104, 106, 109, 112, 115, 119, 122, 122, 122, 122, 66, 70, 74,
+ 78, 83, 86, 91, 95, 100, 99, 97, 95, 94, 94, 95, 95, 95, 97, 98, 100,
+ 102, 104, 107, 109, 112, 115, 118, 121, 124, 124, 124, 124, 77, 79, 82,
+ 85, 88, 91, 95, 99, 104, 103, 101, 100, 99, 99, 99, 99, 100, 101, 102,
+ 104, 106, 108, 110, 113, 115, 118, 121, 124, 127, 127, 127, 127, 92, 92,
+ 93, 93, 93, 97, 100, 104, 108, 107, 106, 105, 105, 105, 104, 104, 104,
+ 106, 107, 108, 109, 111, 114, 116, 118, 121, 123, 126, 129, 129, 129,
+ 129, 93, 93, 92, 92, 91, 95, 99, 103, 107, 107, 107, 107, 108, 108, 108,
+ 108, 108, 110, 111, 112, 113, 115, 117, 120, 122, 124, 127, 130, 133,
+ 133, 133, 133, 94, 93, 92, 90, 89, 93, 97, 101, 106, 107, 108, 109, 111,
+ 111, 112, 112, 113, 114, 115, 116, 118, 120, 122, 124, 126, 128, 131,
+ 133, 136, 136, 136, 136, 96, 93, 91, 89, 87, 91, 95, 100, 105, 107, 109,
+ 112, 114, 115, 116, 116, 117, 119, 120, 121, 122, 124, 126, 128, 130,
+ 132, 135, 137, 140, 140, 140, 140, 97, 94, 91, 88, 85, 89, 94, 99, 105,
+ 108, 111, 114, 117, 118, 120, 121, 122, 124, 125, 126, 127, 129, 131,
+ 132, 134, 136, 139, 141, 143, 143, 143, 143, 98, 95, 92, 89, 86, 90, 94,
+ 99, 105, 108, 111, 115, 118, 120, 122, 124, 125, 127, 128, 130, 131,
+ 133, 134, 136, 138, 140, 142, 145, 147, 147, 147, 147, 100, 96, 93, 89,
+ 86, 90, 95, 99, 104, 108, 112, 116, 120, 122, 124, 126, 129, 130, 132,
+ 133, 135, 137, 139, 140, 142, 144, 146, 149, 151, 151, 151, 151, 101,
+ 97, 94, 90, 87, 91, 95, 99, 104, 108, 112, 116, 121, 124, 126, 129, 132,
+ 134, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 155, 155,
+ 155, 103, 99, 95, 91, 87, 91, 95, 100, 104, 108, 113, 117, 122, 125,
+ 129, 132, 135, 137, 139, 142, 144, 146, 147, 149, 151, 153, 155, 157,
+ 159, 159, 159, 159, 106, 101, 97, 93, 89, 93, 97, 101, 106, 110, 114,
+ 119, 124, 127, 130, 134, 137, 140, 142, 144, 147, 149, 151, 153, 155,
+ 157, 159, 161, 163, 163, 163, 163, 108, 104, 99, 95, 91, 95, 98, 102,
+ 107, 111, 115, 120, 125, 128, 132, 135, 139, 142, 144, 147, 150, 152,
+ 154, 156, 158, 160, 162, 164, 167, 167, 167, 167, 111, 106, 101, 97, 93,
+ 97, 100, 104, 108, 112, 116, 121, 126, 130, 133, 137, 142, 144, 147,
+ 150, 153, 155, 157, 160, 162, 164, 166, 168, 170, 170, 170, 170, 114,
+ 109, 104, 100, 96, 99, 102, 106, 109, 113, 118, 122, 127, 131, 135, 139,
+ 144, 147, 150, 153, 156, 159, 161, 164, 166, 168, 170, 172, 175, 175,
+ 175, 175, 117, 112, 107, 102, 98, 101, 104, 108, 111, 115, 120, 124,
+ 129, 133, 137, 141, 146, 149, 152, 155, 159, 161, 164, 166, 169, 171,
+ 173, 175, 178, 178, 178, 178, 121, 115, 110, 105, 101, 104, 107, 110,
+ 114, 117, 122, 126, 131, 134, 139, 143, 147, 151, 154, 157, 161, 164,
+ 166, 169, 172, 174, 176, 178, 181, 181, 181, 181, 124, 118, 113, 108,
+ 104, 106, 109, 113, 116, 120, 124, 128, 132, 136, 140, 145, 149, 153,
+ 156, 160, 164, 166, 169, 172, 175, 177, 179, 182, 184, 184, 184, 184,
+ 128, 122, 116, 111, 107, 109, 112, 115, 118, 122, 126, 130, 134, 138,
+ 142, 147, 151, 155, 158, 162, 166, 169, 172, 175, 178, 180, 183, 185,
+ 188, 188, 188, 188, 131, 125, 119, 114, 110, 112, 115, 118, 121, 124,
+ 128, 132, 136, 140, 144, 149, 153, 157, 160, 164, 168, 171, 174, 177,
+ 180, 183, 185, 188, 190, 190, 190, 190, 135, 129, 123, 118, 113, 115,
+ 118, 121, 123, 127, 131, 135, 139, 142, 146, 151, 155, 159, 162, 166,
+ 170, 173, 176, 179, 183, 185, 188, 190, 193, 193, 193, 193, 139, 132,
+ 126, 121, 116, 119, 121, 124, 126, 130, 133, 137, 141, 145, 149, 153,
+ 157, 161, 164, 168, 172, 175, 178, 182, 185, 188, 190, 193, 196, 196,
+ 196, 196, 143, 136, 130, 125, 120, 122, 124, 127, 129, 133, 136, 140,
+ 143, 147, 151, 155, 159, 163, 167, 170, 175, 178, 181, 184, 188, 190,
+ 193, 196, 198, 198, 198, 198, 143, 136, 130, 125, 120, 122, 124, 127,
+ 129, 133, 136, 140, 143, 147, 151, 155, 159, 163, 167, 170, 175, 178,
+ 181, 184, 188, 190, 193, 196, 198, 198, 198, 198, 143, 136, 130, 125,
+ 120, 122, 124, 127, 129, 133, 136, 140, 143, 147, 151, 155, 159, 163,
+ 167, 170, 175, 178, 181, 184, 188, 190, 193, 196, 198, 198, 198, 198,
+ 143, 136, 130, 125, 120, 122, 124, 127, 129, 133, 136, 140, 143, 147,
+ 151, 155, 159, 163, 167, 170, 175, 178, 181, 184, 188, 190, 193, 196,
+ 198, 198, 198, 198 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 29, 46, 49, 62, 46, 54, 58, 66, 49, 58, 74, 84, 62, 66, 84, 99,
+ /* Size 8 */
+ 31, 25, 45, 48, 51, 57, 64, 72, 25, 36, 46, 42, 43, 47, 53, 60, 45, 46,
+ 54, 52, 52, 54, 59, 65, 48, 42, 52, 59, 61, 64, 68, 73, 51, 43, 52, 61,
+ 68, 73, 77, 82, 57, 47, 54, 64, 73, 80, 86, 90, 64, 53, 59, 68, 77, 86,
+ 92, 98, 72, 60, 65, 73, 82, 90, 98, 104,
+ /* Size 16 */
+ 30, 27, 24, 31, 44, 45, 47, 48, 50, 53, 56, 59, 63, 66, 71, 71, 27, 28,
+ 29, 35, 44, 44, 43, 44, 45, 48, 50, 53, 57, 60, 64, 64, 24, 29, 35, 39,
+ 45, 43, 41, 41, 42, 44, 46, 49, 52, 55, 58, 58, 31, 35, 39, 43, 48, 47,
+ 45, 45, 46, 47, 49, 52, 54, 58, 61, 61, 44, 44, 45, 48, 52, 51, 51, 51,
+ 50, 52, 53, 55, 58, 60, 63, 63, 45, 44, 43, 47, 51, 53, 54, 54, 55, 56,
+ 57, 59, 62, 64, 67, 67, 47, 43, 41, 45, 51, 54, 57, 58, 60, 61, 62, 64,
+ 66, 68, 71, 71, 48, 44, 41, 45, 51, 54, 58, 61, 63, 65, 67, 68, 70, 73,
+ 75, 75, 50, 45, 42, 46, 50, 55, 60, 63, 67, 69, 71, 73, 75, 77, 80, 80,
+ 53, 48, 44, 47, 52, 56, 61, 65, 69, 72, 74, 77, 79, 81, 84, 84, 56, 50,
+ 46, 49, 53, 57, 62, 67, 71, 74, 78, 81, 83, 86, 88, 88, 59, 53, 49, 52,
+ 55, 59, 64, 68, 73, 77, 81, 83, 87, 89, 92, 92, 63, 57, 52, 54, 58, 62,
+ 66, 70, 75, 79, 83, 87, 90, 93, 95, 95, 66, 60, 55, 58, 60, 64, 68, 73,
+ 77, 81, 86, 89, 93, 95, 98, 98, 71, 64, 58, 61, 63, 67, 71, 75, 80, 84,
+ 88, 92, 95, 98, 102, 102, 71, 64, 58, 61, 63, 67, 71, 75, 80, 84, 88,
+ 92, 95, 98, 102, 102,
+ /* Size 32 */
+ 30, 28, 27, 25, 24, 27, 31, 36, 43, 44, 45, 45, 46, 47, 48, 48, 49, 50,
+ 52, 53, 55, 56, 58, 60, 62, 64, 65, 67, 70, 70, 70, 70, 28, 28, 27, 27,
+ 26, 29, 33, 37, 44, 44, 44, 44, 44, 45, 46, 46, 47, 48, 49, 51, 52, 54,
+ 55, 57, 59, 60, 62, 64, 66, 66, 66, 66, 27, 27, 28, 28, 28, 31, 35, 39,
+ 44, 44, 43, 43, 43, 43, 44, 44, 45, 46, 47, 48, 50, 51, 53, 54, 56, 58,
+ 59, 61, 63, 63, 63, 63, 25, 27, 28, 30, 31, 34, 37, 40, 44, 43, 43, 42,
+ 41, 42, 42, 43, 43, 44, 45, 46, 47, 49, 50, 52, 53, 55, 57, 58, 60, 60,
+ 60, 60, 24, 26, 28, 31, 35, 37, 39, 41, 44, 43, 42, 41, 40, 40, 41, 41,
+ 41, 42, 43, 44, 45, 47, 48, 49, 51, 53, 54, 56, 58, 58, 58, 58, 27, 29,
+ 31, 34, 37, 39, 41, 43, 46, 45, 44, 43, 42, 42, 43, 43, 43, 44, 45, 46,
+ 47, 48, 49, 51, 52, 54, 55, 57, 59, 59, 59, 59, 31, 33, 35, 37, 39, 41,
+ 43, 45, 48, 47, 46, 45, 45, 45, 45, 45, 45, 46, 47, 48, 49, 50, 51, 52,
+ 54, 55, 57, 58, 60, 60, 60, 60, 36, 37, 39, 40, 41, 43, 45, 47, 50, 49,
+ 48, 48, 47, 47, 47, 47, 47, 48, 49, 50, 50, 52, 53, 54, 55, 57, 58, 60,
+ 61, 61, 61, 61, 43, 44, 44, 44, 44, 46, 48, 50, 52, 51, 51, 50, 50, 50,
+ 50, 50, 50, 50, 51, 52, 52, 53, 54, 56, 57, 58, 60, 61, 63, 63, 63, 63,
+ 44, 44, 44, 43, 43, 45, 47, 49, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52,
+ 53, 54, 54, 55, 56, 58, 59, 60, 61, 63, 64, 64, 64, 64, 45, 44, 43, 43,
+ 42, 44, 46, 48, 51, 51, 52, 52, 53, 53, 53, 54, 54, 55, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 65, 66, 66, 66, 66, 45, 44, 43, 42, 41, 43, 45, 48,
+ 50, 51, 52, 53, 55, 55, 55, 56, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64,
+ 65, 67, 68, 68, 68, 68, 46, 44, 43, 41, 40, 42, 45, 47, 50, 51, 53, 55,
+ 56, 57, 58, 58, 59, 60, 60, 61, 62, 62, 63, 64, 65, 66, 68, 69, 70, 70,
+ 70, 70, 47, 45, 43, 42, 40, 42, 45, 47, 50, 51, 53, 55, 57, 58, 59, 60,
+ 61, 61, 62, 63, 64, 64, 65, 66, 67, 68, 70, 71, 72, 72, 72, 72, 48, 46,
+ 44, 42, 41, 43, 45, 47, 50, 52, 53, 55, 58, 59, 60, 61, 62, 63, 64, 65,
+ 66, 67, 67, 68, 69, 71, 72, 73, 74, 74, 74, 74, 48, 46, 44, 43, 41, 43,
+ 45, 47, 50, 52, 54, 56, 58, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 76, 76, 76, 49, 47, 45, 43, 41, 43, 45, 47, 50, 52,
+ 54, 56, 59, 61, 62, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 79, 79, 79, 79, 50, 48, 46, 44, 42, 44, 46, 48, 50, 52, 55, 57, 60, 61,
+ 63, 65, 67, 68, 69, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 80, 80, 80,
+ 52, 49, 47, 45, 43, 45, 47, 49, 51, 53, 55, 58, 60, 62, 64, 66, 68, 69,
+ 71, 72, 73, 75, 76, 77, 78, 79, 80, 81, 82, 82, 82, 82, 53, 51, 48, 46,
+ 44, 46, 48, 50, 52, 54, 56, 58, 61, 63, 65, 67, 69, 70, 72, 74, 75, 76,
+ 78, 79, 80, 81, 82, 83, 85, 85, 85, 85, 55, 52, 50, 47, 45, 47, 49, 50,
+ 52, 54, 57, 59, 62, 64, 66, 68, 70, 72, 73, 75, 77, 78, 80, 81, 82, 83,
+ 84, 86, 87, 87, 87, 87, 56, 54, 51, 49, 47, 48, 50, 52, 53, 55, 58, 60,
+ 62, 64, 67, 69, 71, 73, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 89, 89,
+ 89, 89, 58, 55, 53, 50, 48, 49, 51, 53, 54, 56, 59, 61, 63, 65, 67, 70,
+ 72, 74, 76, 78, 80, 81, 82, 84, 85, 87, 88, 89, 90, 90, 90, 90, 60, 57,
+ 54, 52, 49, 51, 52, 54, 56, 58, 60, 62, 64, 66, 68, 71, 73, 75, 77, 79,
+ 81, 82, 84, 85, 87, 88, 90, 91, 92, 92, 92, 92, 62, 59, 56, 53, 51, 52,
+ 54, 55, 57, 59, 61, 63, 65, 67, 69, 72, 74, 76, 78, 80, 82, 84, 85, 87,
+ 89, 90, 91, 93, 94, 94, 94, 94, 64, 60, 58, 55, 53, 54, 55, 57, 58, 60,
+ 62, 64, 66, 68, 71, 73, 75, 77, 79, 81, 83, 85, 87, 88, 90, 91, 93, 94,
+ 96, 96, 96, 96, 65, 62, 59, 57, 54, 55, 57, 58, 60, 61, 63, 65, 68, 70,
+ 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 91, 93, 94, 96, 97, 97, 97, 97,
+ 67, 64, 61, 58, 56, 57, 58, 60, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79,
+ 81, 83, 86, 87, 89, 91, 93, 94, 96, 97, 99, 99, 99, 99, 70, 66, 63, 60,
+ 58, 59, 60, 61, 63, 64, 66, 68, 70, 72, 74, 76, 79, 80, 82, 85, 87, 89,
+ 90, 92, 94, 96, 97, 99, 100, 100, 100, 100, 70, 66, 63, 60, 58, 59, 60,
+ 61, 63, 64, 66, 68, 70, 72, 74, 76, 79, 80, 82, 85, 87, 89, 90, 92, 94,
+ 96, 97, 99, 100, 100, 100, 100, 70, 66, 63, 60, 58, 59, 60, 61, 63, 64,
+ 66, 68, 70, 72, 74, 76, 79, 80, 82, 85, 87, 89, 90, 92, 94, 96, 97, 99,
+ 100, 100, 100, 100, 70, 66, 63, 60, 58, 59, 60, 61, 63, 64, 66, 68, 70,
+ 72, 74, 76, 79, 80, 82, 85, 87, 89, 90, 92, 94, 96, 97, 99, 100, 100,
+ 100, 100 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 69, 108, 155, 69, 100, 131, 166, 108, 131, 171, 197, 155, 166, 197,
+ 218,
+ /* Size 8 */
+ 64, 49, 53, 68, 88, 109, 130, 149, 49, 56, 54, 62, 77, 96, 115, 134, 53,
+ 54, 73, 83, 95, 110, 126, 142, 68, 62, 83, 102, 116, 128, 141, 154, 88,
+ 77, 95, 116, 132, 145, 156, 166, 109, 96, 110, 128, 145, 158, 168, 176,
+ 130, 115, 126, 141, 156, 168, 177, 185, 149, 134, 142, 154, 166, 176,
+ 185, 191,
+ /* Size 16 */
+ 64, 56, 49, 51, 53, 60, 68, 77, 88, 97, 109, 119, 130, 139, 149, 149,
+ 56, 54, 52, 53, 53, 59, 65, 73, 82, 91, 102, 111, 122, 131, 141, 141,
+ 49, 52, 56, 55, 54, 58, 62, 69, 77, 85, 96, 104, 115, 124, 134, 134, 51,
+ 53, 55, 58, 62, 66, 71, 78, 85, 93, 102, 111, 120, 128, 138, 138, 53,
+ 53, 54, 62, 73, 78, 83, 89, 95, 102, 110, 117, 126, 134, 142, 142, 60,
+ 59, 58, 66, 78, 84, 92, 98, 104, 111, 118, 125, 133, 140, 148, 148, 68,
+ 65, 62, 71, 83, 92, 102, 108, 116, 122, 128, 134, 141, 147, 154, 154,
+ 77, 73, 69, 78, 89, 98, 108, 115, 123, 129, 136, 142, 148, 153, 159,
+ 159, 88, 82, 77, 85, 95, 104, 116, 123, 132, 138, 145, 150, 156, 161,
+ 166, 166, 97, 91, 85, 93, 102, 111, 122, 129, 138, 144, 151, 156, 162,
+ 166, 171, 171, 109, 102, 96, 102, 110, 118, 128, 136, 145, 151, 158,
+ 163, 168, 172, 176, 176, 119, 111, 104, 111, 117, 125, 134, 142, 150,
+ 156, 163, 168, 173, 176, 180, 180, 130, 122, 115, 120, 126, 133, 141,
+ 148, 156, 162, 168, 173, 177, 181, 185, 185, 139, 131, 124, 128, 134,
+ 140, 147, 153, 161, 166, 172, 176, 181, 184, 188, 188, 149, 141, 134,
+ 138, 142, 148, 154, 159, 166, 171, 176, 180, 185, 188, 191, 191, 149,
+ 141, 134, 138, 142, 148, 154, 159, 166, 171, 176, 180, 185, 188, 191,
+ 191,
+ /* Size 32 */
+ 64, 60, 56, 52, 49, 50, 51, 52, 53, 56, 60, 64, 68, 72, 77, 82, 88, 92,
+ 97, 103, 109, 114, 119, 124, 130, 134, 139, 144, 149, 149, 149, 149, 60,
+ 57, 55, 53, 51, 51, 52, 53, 53, 56, 59, 63, 66, 70, 75, 79, 85, 89, 94,
+ 100, 106, 110, 115, 120, 126, 130, 135, 140, 145, 145, 145, 145, 56, 55,
+ 54, 53, 52, 53, 53, 53, 53, 56, 59, 62, 65, 69, 73, 77, 82, 86, 91, 96,
+ 102, 106, 111, 116, 122, 126, 131, 136, 141, 141, 141, 141, 52, 53, 53,
+ 54, 54, 54, 54, 54, 54, 56, 58, 61, 64, 67, 71, 75, 80, 84, 88, 93, 99,
+ 103, 108, 113, 118, 123, 127, 132, 137, 137, 137, 137, 49, 51, 52, 54,
+ 56, 55, 55, 54, 54, 56, 58, 60, 62, 65, 69, 73, 77, 81, 85, 90, 96, 100,
+ 104, 109, 115, 119, 124, 128, 134, 134, 134, 134, 50, 51, 53, 54, 55,
+ 56, 56, 57, 58, 60, 62, 64, 66, 70, 73, 77, 81, 85, 89, 94, 99, 103,
+ 107, 112, 118, 122, 126, 131, 136, 136, 136, 136, 51, 52, 53, 54, 55,
+ 56, 58, 60, 62, 64, 66, 69, 71, 74, 78, 81, 85, 89, 93, 97, 102, 106,
+ 111, 115, 120, 124, 128, 133, 138, 138, 138, 138, 52, 53, 53, 54, 54,
+ 57, 60, 63, 67, 69, 72, 74, 77, 80, 83, 86, 90, 94, 97, 101, 106, 110,
+ 114, 118, 123, 127, 131, 135, 140, 140, 140, 140, 53, 53, 53, 54, 54,
+ 58, 62, 67, 73, 75, 78, 80, 83, 86, 89, 92, 95, 99, 102, 106, 110, 114,
+ 117, 122, 126, 130, 134, 138, 142, 142, 142, 142, 56, 56, 56, 56, 56,
+ 60, 64, 69, 75, 78, 81, 84, 87, 90, 93, 96, 100, 103, 106, 110, 114,
+ 118, 121, 125, 130, 133, 137, 141, 145, 145, 145, 145, 60, 59, 59, 58,
+ 58, 62, 66, 72, 78, 81, 84, 88, 92, 95, 98, 101, 104, 108, 111, 115,
+ 118, 122, 125, 129, 133, 136, 140, 144, 148, 148, 148, 148, 64, 63, 62,
+ 61, 60, 64, 69, 74, 80, 84, 88, 92, 97, 100, 103, 106, 110, 113, 116,
+ 119, 123, 126, 130, 133, 137, 140, 143, 147, 151, 151, 151, 151, 68, 66,
+ 65, 64, 62, 66, 71, 77, 83, 87, 92, 97, 102, 105, 108, 112, 116, 119,
+ 122, 125, 128, 131, 134, 138, 141, 144, 147, 150, 154, 154, 154, 154,
+ 72, 70, 69, 67, 65, 70, 74, 80, 86, 90, 95, 100, 105, 108, 112, 116,
+ 119, 122, 125, 129, 132, 135, 138, 141, 144, 147, 150, 153, 156, 156,
+ 156, 156, 77, 75, 73, 71, 69, 73, 78, 83, 89, 93, 98, 103, 108, 112,
+ 115, 119, 123, 126, 129, 133, 136, 139, 142, 145, 148, 151, 153, 156,
+ 159, 159, 159, 159, 82, 79, 77, 75, 73, 77, 81, 86, 92, 96, 101, 106,
+ 112, 116, 119, 123, 128, 131, 134, 137, 140, 143, 146, 149, 152, 154,
+ 157, 160, 162, 162, 162, 162, 88, 85, 82, 80, 77, 81, 85, 90, 95, 100,
+ 104, 110, 116, 119, 123, 128, 132, 135, 138, 141, 145, 147, 150, 153,
+ 156, 158, 161, 163, 166, 166, 166, 166, 92, 89, 86, 84, 81, 85, 89, 94,
+ 99, 103, 108, 113, 119, 122, 126, 131, 135, 138, 141, 144, 148, 150,
+ 153, 156, 159, 161, 163, 166, 168, 168, 168, 168, 97, 94, 91, 88, 85,
+ 89, 93, 97, 102, 106, 111, 116, 122, 125, 129, 134, 138, 141, 144, 148,
+ 151, 154, 156, 159, 162, 164, 166, 168, 171, 171, 171, 171, 103, 100,
+ 96, 93, 90, 94, 97, 101, 106, 110, 115, 119, 125, 129, 133, 137, 141,
+ 144, 148, 151, 154, 157, 159, 162, 165, 167, 169, 171, 174, 174, 174,
+ 174, 109, 106, 102, 99, 96, 99, 102, 106, 110, 114, 118, 123, 128, 132,
+ 136, 140, 145, 148, 151, 154, 158, 160, 163, 165, 168, 170, 172, 174,
+ 176, 176, 176, 176, 114, 110, 106, 103, 100, 103, 106, 110, 114, 118,
+ 122, 126, 131, 135, 139, 143, 147, 150, 154, 157, 160, 163, 165, 168,
+ 170, 172, 174, 176, 178, 178, 178, 178, 119, 115, 111, 108, 104, 107,
+ 111, 114, 117, 121, 125, 130, 134, 138, 142, 146, 150, 153, 156, 159,
+ 163, 165, 168, 170, 173, 174, 176, 178, 180, 180, 180, 180, 124, 120,
+ 116, 113, 109, 112, 115, 118, 122, 125, 129, 133, 138, 141, 145, 149,
+ 153, 156, 159, 162, 165, 168, 170, 172, 175, 177, 179, 181, 183, 183,
+ 183, 183, 130, 126, 122, 118, 115, 118, 120, 123, 126, 130, 133, 137,
+ 141, 144, 148, 152, 156, 159, 162, 165, 168, 170, 173, 175, 177, 179,
+ 181, 183, 185, 185, 185, 185, 134, 130, 126, 123, 119, 122, 124, 127,
+ 130, 133, 136, 140, 144, 147, 151, 154, 158, 161, 164, 167, 170, 172,
+ 174, 177, 179, 181, 183, 185, 186, 186, 186, 186, 139, 135, 131, 127,
+ 124, 126, 128, 131, 134, 137, 140, 143, 147, 150, 153, 157, 161, 163,
+ 166, 169, 172, 174, 176, 179, 181, 183, 184, 186, 188, 188, 188, 188,
+ 144, 140, 136, 132, 128, 131, 133, 135, 138, 141, 144, 147, 150, 153,
+ 156, 160, 163, 166, 168, 171, 174, 176, 178, 181, 183, 185, 186, 188,
+ 190, 190, 190, 190, 149, 145, 141, 137, 134, 136, 138, 140, 142, 145,
+ 148, 151, 154, 156, 159, 162, 166, 168, 171, 174, 176, 178, 180, 183,
+ 185, 186, 188, 190, 191, 191, 191, 191, 149, 145, 141, 137, 134, 136,
+ 138, 140, 142, 145, 148, 151, 154, 156, 159, 162, 166, 168, 171, 174,
+ 176, 178, 180, 183, 185, 186, 188, 190, 191, 191, 191, 191, 149, 145,
+ 141, 137, 134, 136, 138, 140, 142, 145, 148, 151, 154, 156, 159, 162,
+ 166, 168, 171, 174, 176, 178, 180, 183, 185, 186, 188, 190, 191, 191,
+ 191, 191, 149, 145, 141, 137, 134, 136, 138, 140, 142, 145, 148, 151,
+ 154, 156, 159, 162, 166, 168, 171, 174, 176, 178, 180, 183, 185, 186,
+ 188, 190, 191, 191, 191, 191 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 26, 29, 46, 68, 29, 42, 57, 73, 46, 57, 75, 88, 68, 73, 88, 99,
+ /* Size 8 */
+ 31, 24, 26, 33, 44, 55, 67, 77, 24, 27, 26, 30, 38, 48, 58, 69, 26, 26,
+ 36, 41, 47, 55, 64, 73, 33, 30, 41, 51, 59, 65, 73, 80, 44, 38, 47, 59,
+ 68, 75, 81, 87, 55, 48, 55, 65, 75, 82, 88, 93, 67, 58, 64, 73, 81, 88,
+ 94, 98, 77, 69, 73, 80, 87, 93, 98, 102,
+ /* Size 16 */
+ 30, 26, 23, 24, 25, 28, 32, 37, 42, 47, 53, 58, 64, 69, 75, 75, 26, 25,
+ 25, 25, 25, 28, 31, 34, 39, 44, 49, 54, 60, 65, 70, 70, 23, 25, 26, 26,
+ 25, 27, 29, 33, 37, 41, 46, 51, 56, 61, 66, 66, 24, 25, 26, 27, 29, 31,
+ 34, 37, 41, 45, 50, 54, 59, 63, 69, 69, 25, 25, 25, 29, 35, 37, 40, 43,
+ 46, 49, 54, 58, 62, 66, 71, 71, 28, 28, 27, 31, 37, 40, 44, 47, 51, 54,
+ 58, 62, 66, 70, 74, 74, 32, 31, 29, 34, 40, 44, 50, 53, 57, 60, 63, 67,
+ 70, 74, 77, 77, 37, 34, 33, 37, 43, 47, 53, 57, 61, 64, 68, 71, 74, 77,
+ 81, 81, 42, 39, 37, 41, 46, 51, 57, 61, 66, 69, 72, 75, 79, 81, 84, 84,
+ 47, 44, 41, 45, 49, 54, 60, 64, 69, 72, 76, 79, 82, 84, 87, 87, 53, 49,
+ 46, 50, 54, 58, 63, 68, 72, 76, 80, 83, 85, 88, 90, 90, 58, 54, 51, 54,
+ 58, 62, 67, 71, 75, 79, 83, 85, 88, 90, 93, 93, 64, 60, 56, 59, 62, 66,
+ 70, 74, 79, 82, 85, 88, 91, 93, 95, 95, 69, 65, 61, 63, 66, 70, 74, 77,
+ 81, 84, 88, 90, 93, 95, 97, 97, 75, 70, 66, 69, 71, 74, 77, 81, 84, 87,
+ 90, 93, 95, 97, 99, 99, 75, 70, 66, 69, 71, 74, 77, 81, 84, 87, 90, 93,
+ 95, 97, 99, 99,
+ /* Size 32 */
+ 30, 28, 26, 24, 23, 23, 24, 24, 24, 26, 28, 30, 32, 34, 36, 39, 42, 44,
+ 46, 49, 52, 55, 57, 60, 63, 66, 68, 71, 73, 73, 73, 73, 28, 26, 25, 24,
+ 23, 24, 24, 24, 25, 26, 27, 29, 31, 33, 35, 37, 40, 42, 45, 47, 50, 53,
+ 55, 58, 61, 63, 66, 68, 71, 71, 71, 71, 26, 25, 25, 25, 24, 24, 24, 25,
+ 25, 26, 27, 29, 30, 32, 34, 36, 39, 41, 43, 46, 49, 51, 53, 56, 59, 61,
+ 64, 66, 69, 69, 69, 69, 24, 24, 25, 25, 25, 25, 25, 25, 25, 26, 27, 28,
+ 29, 31, 33, 35, 37, 39, 42, 44, 47, 49, 52, 54, 57, 59, 62, 64, 67, 67,
+ 67, 67, 23, 23, 24, 25, 26, 26, 25, 25, 25, 26, 27, 28, 29, 30, 32, 34,
+ 36, 38, 40, 43, 45, 48, 50, 52, 55, 58, 60, 62, 65, 65, 65, 65, 23, 24,
+ 24, 25, 26, 26, 26, 26, 27, 28, 29, 30, 31, 32, 34, 36, 38, 40, 42, 44,
+ 47, 49, 51, 54, 57, 59, 61, 64, 66, 66, 66, 66, 24, 24, 24, 25, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 35, 36, 38, 40, 42, 44, 46, 49, 51, 53, 55,
+ 58, 60, 62, 65, 67, 67, 67, 67, 24, 24, 25, 25, 25, 26, 28, 29, 31, 32,
+ 33, 35, 36, 37, 39, 41, 43, 44, 46, 48, 51, 53, 55, 57, 60, 62, 64, 66,
+ 69, 69, 69, 69, 24, 25, 25, 25, 25, 27, 29, 31, 34, 35, 36, 38, 39, 41,
+ 42, 44, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 70, 70, 70, 70,
+ 26, 26, 26, 26, 26, 28, 30, 32, 35, 37, 38, 40, 41, 43, 44, 46, 47, 49,
+ 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 71, 71, 71, 28, 27, 27, 27,
+ 27, 29, 31, 33, 36, 38, 40, 41, 43, 45, 46, 48, 50, 52, 53, 55, 57, 59,
+ 61, 63, 65, 67, 69, 71, 73, 73, 73, 73, 30, 29, 29, 28, 28, 30, 32, 35,
+ 38, 40, 41, 44, 46, 47, 49, 51, 53, 54, 56, 58, 60, 61, 63, 65, 67, 69,
+ 71, 72, 74, 74, 74, 74, 32, 31, 30, 29, 29, 31, 33, 36, 39, 41, 43, 46,
+ 49, 50, 52, 54, 56, 57, 59, 61, 62, 64, 66, 67, 69, 71, 72, 74, 76, 76,
+ 76, 76, 34, 33, 32, 31, 30, 32, 35, 37, 41, 43, 45, 47, 50, 52, 54, 56,
+ 58, 59, 61, 63, 64, 66, 68, 69, 71, 73, 74, 76, 78, 78, 78, 78, 36, 35,
+ 34, 33, 32, 34, 36, 39, 42, 44, 46, 49, 52, 54, 56, 58, 60, 61, 63, 65,
+ 66, 68, 70, 71, 73, 74, 76, 78, 79, 79, 79, 79, 39, 37, 36, 35, 34, 36,
+ 38, 41, 44, 46, 48, 51, 54, 56, 58, 60, 62, 64, 65, 67, 69, 70, 72, 73,
+ 75, 76, 78, 79, 81, 81, 81, 81, 42, 40, 39, 37, 36, 38, 40, 43, 45, 47,
+ 50, 53, 56, 58, 60, 62, 64, 66, 68, 69, 71, 73, 74, 76, 77, 79, 80, 81,
+ 83, 83, 83, 83, 44, 42, 41, 39, 38, 40, 42, 44, 47, 49, 52, 54, 57, 59,
+ 61, 64, 66, 68, 69, 71, 73, 74, 76, 77, 79, 80, 81, 83, 84, 84, 84, 84,
+ 46, 45, 43, 42, 40, 42, 44, 46, 49, 51, 53, 56, 59, 61, 63, 65, 68, 69,
+ 71, 73, 75, 76, 77, 79, 81, 82, 83, 84, 86, 86, 86, 86, 49, 47, 46, 44,
+ 43, 44, 46, 48, 51, 53, 55, 58, 61, 63, 65, 67, 69, 71, 73, 75, 77, 78,
+ 79, 81, 82, 83, 85, 86, 87, 87, 87, 87, 52, 50, 49, 47, 45, 47, 49, 51,
+ 53, 55, 57, 60, 62, 64, 66, 69, 71, 73, 75, 77, 78, 80, 81, 83, 84, 85,
+ 86, 88, 89, 89, 89, 89, 55, 53, 51, 49, 48, 49, 51, 53, 55, 57, 59, 61,
+ 64, 66, 68, 70, 73, 74, 76, 78, 80, 81, 82, 84, 85, 86, 88, 89, 90, 90,
+ 90, 90, 57, 55, 53, 52, 50, 51, 53, 55, 57, 59, 61, 63, 66, 68, 70, 72,
+ 74, 76, 77, 79, 81, 82, 84, 85, 87, 88, 89, 90, 91, 91, 91, 91, 60, 58,
+ 56, 54, 52, 54, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 76, 77, 79, 81,
+ 83, 84, 85, 87, 88, 89, 90, 91, 92, 92, 92, 92, 63, 61, 59, 57, 55, 57,
+ 58, 60, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 82, 84, 85, 87, 88,
+ 89, 90, 92, 93, 94, 94, 94, 94, 66, 63, 61, 59, 58, 59, 60, 62, 63, 65,
+ 67, 69, 71, 73, 74, 76, 79, 80, 82, 83, 85, 86, 88, 89, 90, 91, 93, 94,
+ 95, 95, 95, 95, 68, 66, 64, 62, 60, 61, 62, 64, 65, 67, 69, 71, 72, 74,
+ 76, 78, 80, 81, 83, 85, 86, 88, 89, 90, 92, 93, 94, 95, 96, 96, 96, 96,
+ 71, 68, 66, 64, 62, 64, 65, 66, 67, 69, 71, 72, 74, 76, 78, 79, 81, 83,
+ 84, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 97, 97, 97, 73, 71, 69, 67,
+ 65, 66, 67, 69, 70, 71, 73, 74, 76, 78, 79, 81, 83, 84, 86, 87, 89, 90,
+ 91, 92, 94, 95, 96, 97, 98, 98, 98, 98, 73, 71, 69, 67, 65, 66, 67, 69,
+ 70, 71, 73, 74, 76, 78, 79, 81, 83, 84, 86, 87, 89, 90, 91, 92, 94, 95,
+ 96, 97, 98, 98, 98, 98, 73, 71, 69, 67, 65, 66, 67, 69, 70, 71, 73, 74,
+ 76, 78, 79, 81, 83, 84, 86, 87, 89, 90, 91, 92, 94, 95, 96, 97, 98, 98,
+ 98, 98, 73, 71, 69, 67, 65, 66, 67, 69, 70, 71, 73, 74, 76, 78, 79, 81,
+ 83, 84, 86, 87, 89, 90, 91, 92, 94, 95, 96, 97, 98, 98, 98, 98 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 96, 104, 126, 96, 113, 119, 133, 104, 119, 146, 162, 126, 133, 162,
+ 183,
+ /* Size 8 */
+ 64, 53, 90, 94, 100, 110, 121, 134, 53, 73, 91, 84, 86, 93, 103, 115,
+ 90, 91, 104, 101, 101, 105, 113, 123, 94, 84, 101, 112, 117, 121, 127,
+ 135, 100, 86, 101, 117, 128, 135, 141, 148, 110, 93, 105, 121, 135, 145,
+ 153, 160, 121, 103, 113, 127, 141, 153, 163, 170, 134, 115, 123, 135,
+ 148, 160, 170, 179,
+ /* Size 16 */
+ 64, 58, 53, 66, 90, 92, 94, 97, 100, 105, 110, 115, 121, 127, 134, 134,
+ 58, 59, 61, 73, 90, 90, 89, 90, 92, 96, 101, 106, 112, 117, 123, 123,
+ 53, 61, 73, 81, 91, 87, 84, 85, 86, 89, 93, 98, 103, 109, 115, 115, 66,
+ 73, 81, 89, 97, 94, 92, 92, 93, 96, 99, 103, 108, 113, 119, 119, 90, 90,
+ 91, 97, 104, 103, 101, 101, 101, 103, 105, 109, 113, 118, 123, 123, 92,
+ 90, 87, 94, 103, 105, 107, 107, 108, 110, 113, 116, 120, 124, 128, 128,
+ 94, 89, 84, 92, 101, 107, 112, 115, 117, 119, 121, 124, 127, 131, 135,
+ 135, 97, 90, 85, 92, 101, 107, 115, 118, 122, 125, 128, 131, 134, 137,
+ 141, 141, 100, 92, 86, 93, 101, 108, 117, 122, 128, 131, 135, 138, 141,
+ 144, 148, 148, 105, 96, 89, 96, 103, 110, 119, 125, 131, 135, 140, 143,
+ 147, 150, 154, 154, 110, 101, 93, 99, 105, 113, 121, 128, 135, 140, 145,
+ 149, 153, 157, 160, 160, 115, 106, 98, 103, 109, 116, 124, 131, 138,
+ 143, 149, 153, 158, 161, 165, 165, 121, 112, 103, 108, 113, 120, 127,
+ 134, 141, 147, 153, 158, 163, 166, 170, 170, 127, 117, 109, 113, 118,
+ 124, 131, 137, 144, 150, 157, 161, 166, 170, 174, 174, 134, 123, 115,
+ 119, 123, 128, 135, 141, 148, 154, 160, 165, 170, 174, 179, 179, 134,
+ 123, 115, 119, 123, 128, 135, 141, 148, 154, 160, 165, 170, 174, 179,
+ 179,
+ /* Size 32 */
+ 64, 61, 58, 55, 53, 59, 66, 76, 90, 91, 92, 93, 94, 96, 97, 98, 100,
+ 102, 105, 107, 110, 112, 115, 118, 121, 124, 127, 131, 134, 134, 134,
+ 134, 61, 60, 59, 58, 57, 62, 70, 78, 90, 90, 91, 91, 91, 93, 94, 95, 96,
+ 98, 100, 103, 105, 108, 110, 113, 116, 119, 122, 125, 129, 129, 129,
+ 129, 58, 59, 59, 60, 61, 67, 73, 81, 90, 90, 90, 89, 89, 90, 90, 91, 92,
+ 94, 96, 98, 101, 103, 106, 109, 112, 114, 117, 120, 123, 123, 123, 123,
+ 55, 58, 60, 63, 67, 72, 77, 83, 91, 90, 88, 87, 86, 87, 87, 88, 89, 91,
+ 93, 95, 97, 99, 102, 104, 107, 110, 113, 116, 119, 119, 119, 119, 53,
+ 57, 61, 67, 73, 77, 81, 86, 91, 89, 87, 85, 84, 84, 85, 85, 86, 88, 89,
+ 91, 93, 95, 98, 100, 103, 106, 109, 111, 115, 115, 115, 115, 59, 62, 67,
+ 72, 77, 81, 85, 89, 94, 92, 91, 89, 87, 88, 88, 89, 89, 91, 92, 94, 96,
+ 98, 100, 103, 105, 108, 111, 114, 117, 117, 117, 117, 66, 70, 73, 77,
+ 81, 85, 89, 93, 97, 96, 94, 93, 92, 92, 92, 92, 93, 94, 96, 97, 99, 101,
+ 103, 105, 108, 110, 113, 116, 119, 119, 119, 119, 76, 78, 81, 83, 86,
+ 89, 93, 97, 101, 99, 98, 97, 96, 96, 96, 97, 97, 98, 99, 101, 102, 104,
+ 106, 108, 110, 113, 115, 118, 121, 121, 121, 121, 90, 90, 90, 91, 91,
+ 94, 97, 101, 104, 103, 103, 102, 101, 101, 101, 101, 101, 102, 103, 104,
+ 105, 107, 109, 111, 113, 115, 118, 120, 123, 123, 123, 123, 91, 90, 90,
+ 90, 89, 92, 96, 99, 103, 104, 104, 104, 104, 104, 104, 104, 105, 106,
+ 107, 108, 109, 111, 112, 114, 116, 118, 121, 123, 126, 126, 126, 126,
+ 92, 91, 90, 88, 87, 91, 94, 98, 103, 104, 105, 106, 107, 107, 107, 108,
+ 108, 109, 110, 112, 113, 114, 116, 118, 120, 122, 124, 126, 128, 128,
+ 128, 128, 93, 91, 89, 87, 85, 89, 93, 97, 102, 104, 106, 107, 109, 110,
+ 111, 112, 112, 113, 115, 116, 117, 118, 120, 121, 123, 125, 127, 129,
+ 131, 131, 131, 131, 94, 91, 89, 86, 84, 87, 92, 96, 101, 104, 107, 109,
+ 112, 113, 115, 116, 117, 118, 119, 120, 121, 122, 124, 125, 127, 129,
+ 131, 133, 135, 135, 135, 135, 96, 93, 90, 87, 84, 88, 92, 96, 101, 104,
+ 107, 110, 113, 115, 116, 118, 119, 121, 122, 123, 124, 126, 127, 129,
+ 130, 132, 134, 136, 138, 138, 138, 138, 97, 94, 90, 87, 85, 88, 92, 96,
+ 101, 104, 107, 111, 115, 116, 118, 120, 122, 123, 125, 126, 128, 129,
+ 131, 132, 134, 135, 137, 139, 141, 141, 141, 141, 98, 95, 91, 88, 85,
+ 89, 92, 97, 101, 104, 108, 112, 116, 118, 120, 122, 125, 126, 128, 130,
+ 131, 133, 134, 136, 137, 139, 141, 142, 144, 144, 144, 144, 100, 96, 92,
+ 89, 86, 89, 93, 97, 101, 105, 108, 112, 117, 119, 122, 125, 128, 130,
+ 131, 133, 135, 136, 138, 140, 141, 143, 144, 146, 148, 148, 148, 148,
+ 102, 98, 94, 91, 88, 91, 94, 98, 102, 106, 109, 113, 118, 121, 123, 126,
+ 130, 131, 133, 135, 137, 139, 141, 142, 144, 146, 147, 149, 151, 151,
+ 151, 151, 105, 100, 96, 93, 89, 92, 96, 99, 103, 107, 110, 115, 119,
+ 122, 125, 128, 131, 133, 135, 138, 140, 142, 143, 145, 147, 149, 150,
+ 152, 154, 154, 154, 154, 107, 103, 98, 95, 91, 94, 97, 101, 104, 108,
+ 112, 116, 120, 123, 126, 130, 133, 135, 138, 140, 143, 144, 146, 148,
+ 150, 152, 153, 155, 157, 157, 157, 157, 110, 105, 101, 97, 93, 96, 99,
+ 102, 105, 109, 113, 117, 121, 124, 128, 131, 135, 137, 140, 143, 145,
+ 147, 149, 151, 153, 155, 157, 158, 160, 160, 160, 160, 112, 108, 103,
+ 99, 95, 98, 101, 104, 107, 111, 114, 118, 122, 126, 129, 133, 136, 139,
+ 142, 144, 147, 149, 151, 153, 156, 157, 159, 161, 162, 162, 162, 162,
+ 115, 110, 106, 102, 98, 100, 103, 106, 109, 112, 116, 120, 124, 127,
+ 131, 134, 138, 141, 143, 146, 149, 151, 153, 156, 158, 160, 161, 163,
+ 165, 165, 165, 165, 118, 113, 109, 104, 100, 103, 105, 108, 111, 114,
+ 118, 121, 125, 129, 132, 136, 140, 142, 145, 148, 151, 153, 156, 158,
+ 160, 162, 164, 166, 168, 168, 168, 168, 121, 116, 112, 107, 103, 105,
+ 108, 110, 113, 116, 120, 123, 127, 130, 134, 137, 141, 144, 147, 150,
+ 153, 156, 158, 160, 163, 165, 166, 168, 170, 170, 170, 170, 124, 119,
+ 114, 110, 106, 108, 110, 113, 115, 118, 122, 125, 129, 132, 135, 139,
+ 143, 146, 149, 152, 155, 157, 160, 162, 165, 166, 168, 170, 172, 172,
+ 172, 172, 127, 122, 117, 113, 109, 111, 113, 115, 118, 121, 124, 127,
+ 131, 134, 137, 141, 144, 147, 150, 153, 157, 159, 161, 164, 166, 168,
+ 170, 172, 174, 174, 174, 174, 131, 125, 120, 116, 111, 114, 116, 118,
+ 120, 123, 126, 129, 133, 136, 139, 142, 146, 149, 152, 155, 158, 161,
+ 163, 166, 168, 170, 172, 174, 177, 177, 177, 177, 134, 129, 123, 119,
+ 115, 117, 119, 121, 123, 126, 128, 131, 135, 138, 141, 144, 148, 151,
+ 154, 157, 160, 162, 165, 168, 170, 172, 174, 177, 179, 179, 179, 179,
+ 134, 129, 123, 119, 115, 117, 119, 121, 123, 126, 128, 131, 135, 138,
+ 141, 144, 148, 151, 154, 157, 160, 162, 165, 168, 170, 172, 174, 177,
+ 179, 179, 179, 179, 134, 129, 123, 119, 115, 117, 119, 121, 123, 126,
+ 128, 131, 135, 138, 141, 144, 148, 151, 154, 157, 160, 162, 165, 168,
+ 170, 172, 174, 177, 179, 179, 179, 179, 134, 129, 123, 119, 115, 117,
+ 119, 121, 123, 126, 128, 131, 135, 138, 141, 144, 148, 151, 154, 157,
+ 160, 162, 165, 168, 170, 172, 174, 177, 179, 179, 179, 179 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 31, 47, 51, 63, 47, 56, 59, 67, 51, 59, 74, 82, 63, 67, 82, 95,
+ /* Size 8 */
+ 33, 27, 47, 50, 53, 58, 65, 72, 27, 38, 48, 44, 45, 49, 55, 61, 47, 48,
+ 55, 54, 53, 56, 60, 66, 50, 44, 54, 60, 62, 65, 68, 73, 53, 45, 53, 62,
+ 69, 73, 76, 80, 58, 49, 56, 65, 73, 79, 84, 88, 65, 55, 60, 68, 76, 84,
+ 89, 94, 72, 61, 66, 73, 80, 88, 94, 99,
+ /* Size 16 */
+ 32, 29, 26, 33, 46, 47, 48, 50, 51, 54, 57, 60, 63, 67, 70, 70, 29, 30,
+ 31, 37, 46, 46, 45, 46, 47, 50, 52, 55, 58, 61, 65, 65, 26, 31, 37, 41,
+ 47, 45, 43, 43, 44, 46, 48, 50, 53, 56, 60, 60, 33, 37, 41, 45, 50, 48,
+ 47, 47, 48, 49, 51, 53, 56, 59, 62, 62, 46, 46, 47, 50, 54, 53, 52, 52,
+ 52, 53, 55, 57, 59, 61, 64, 64, 47, 46, 45, 48, 53, 54, 55, 56, 56, 57,
+ 59, 60, 62, 65, 67, 67, 48, 45, 43, 47, 52, 55, 58, 60, 61, 62, 63, 65,
+ 67, 69, 71, 71, 50, 46, 43, 47, 52, 56, 60, 62, 64, 65, 67, 69, 70, 72,
+ 74, 74, 51, 47, 44, 48, 52, 56, 61, 64, 67, 69, 71, 73, 75, 76, 78, 78,
+ 54, 50, 46, 49, 53, 57, 62, 65, 69, 71, 74, 76, 78, 80, 82, 82, 57, 52,
+ 48, 51, 55, 59, 63, 67, 71, 74, 77, 79, 82, 84, 86, 86, 60, 55, 50, 53,
+ 57, 60, 65, 69, 73, 76, 79, 82, 84, 86, 89, 89, 63, 58, 53, 56, 59, 62,
+ 67, 70, 75, 78, 82, 84, 87, 89, 92, 92, 67, 61, 56, 59, 61, 65, 69, 72,
+ 76, 80, 84, 86, 89, 92, 94, 94, 70, 65, 60, 62, 64, 67, 71, 74, 78, 82,
+ 86, 89, 92, 94, 97, 97, 70, 65, 60, 62, 64, 67, 71, 74, 78, 82, 86, 89,
+ 92, 94, 97, 97,
+ /* Size 32 */
+ 32, 30, 29, 27, 26, 29, 33, 38, 45, 46, 47, 47, 48, 49, 49, 50, 51, 52,
+ 53, 55, 56, 58, 59, 61, 63, 64, 66, 68, 70, 70, 70, 70, 30, 30, 29, 29,
+ 28, 31, 35, 39, 46, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 54, 55,
+ 57, 58, 60, 61, 63, 65, 67, 67, 67, 67, 29, 29, 30, 30, 30, 33, 37, 41,
+ 46, 46, 45, 45, 45, 45, 46, 46, 47, 48, 49, 50, 51, 53, 54, 56, 57, 59,
+ 60, 62, 64, 64, 64, 64, 27, 29, 30, 32, 33, 36, 39, 42, 46, 45, 45, 44,
+ 43, 44, 44, 45, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 58, 59, 61, 61,
+ 61, 61, 26, 28, 30, 33, 37, 39, 41, 43, 46, 45, 44, 43, 42, 42, 43, 43,
+ 43, 44, 45, 46, 47, 48, 50, 51, 53, 54, 56, 57, 59, 59, 59, 59, 29, 31,
+ 33, 36, 39, 41, 43, 45, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 47, 48,
+ 49, 50, 51, 52, 54, 55, 57, 58, 60, 60, 60, 60, 33, 35, 37, 39, 41, 43,
+ 45, 47, 49, 49, 48, 47, 46, 47, 47, 47, 47, 48, 49, 49, 50, 51, 53, 54,
+ 55, 57, 58, 59, 61, 61, 61, 61, 38, 39, 41, 42, 43, 45, 47, 49, 51, 51,
+ 50, 49, 49, 49, 49, 49, 49, 50, 51, 51, 52, 53, 54, 55, 57, 58, 59, 61,
+ 62, 62, 62, 62, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 52, 52, 52, 52,
+ 52, 51, 51, 52, 53, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 63, 63, 63,
+ 46, 46, 46, 45, 45, 47, 49, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54,
+ 55, 55, 56, 57, 58, 59, 60, 61, 62, 64, 65, 65, 65, 65, 47, 46, 45, 45,
+ 44, 46, 48, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 57, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 67, 67, 67, 67, 47, 46, 45, 44, 43, 45, 47, 49,
+ 52, 53, 54, 55, 56, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65,
+ 66, 67, 68, 68, 68, 68, 48, 46, 45, 43, 42, 44, 46, 49, 52, 53, 54, 56,
+ 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 70,
+ 70, 70, 49, 47, 45, 44, 42, 44, 47, 49, 52, 53, 55, 56, 58, 59, 60, 61,
+ 62, 62, 63, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 72, 72, 72, 49, 47,
+ 46, 44, 43, 45, 47, 49, 52, 53, 55, 57, 59, 60, 61, 62, 63, 64, 64, 65,
+ 66, 67, 68, 69, 69, 70, 71, 72, 74, 74, 74, 74, 50, 48, 46, 45, 43, 45,
+ 47, 49, 51, 53, 55, 57, 59, 61, 62, 63, 65, 65, 66, 67, 68, 69, 70, 71,
+ 72, 72, 73, 74, 75, 75, 75, 75, 51, 49, 47, 45, 43, 45, 47, 49, 51, 53,
+ 55, 58, 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 76,
+ 77, 77, 77, 77, 52, 50, 48, 46, 44, 46, 48, 50, 52, 54, 56, 58, 61, 62,
+ 64, 65, 67, 68, 69, 70, 72, 72, 73, 74, 75, 76, 77, 78, 79, 79, 79, 79,
+ 53, 51, 49, 47, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 64, 66, 68, 69,
+ 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 81, 81, 81, 55, 52, 50, 48,
+ 46, 48, 49, 51, 53, 55, 57, 59, 62, 63, 65, 67, 69, 70, 72, 73, 75, 76,
+ 77, 78, 79, 80, 81, 82, 83, 83, 83, 83, 56, 54, 51, 49, 47, 49, 50, 52,
+ 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 73, 75, 76, 77, 78, 79, 81, 82,
+ 83, 84, 85, 85, 85, 85, 58, 55, 53, 50, 48, 50, 51, 53, 55, 57, 59, 61,
+ 63, 65, 67, 69, 71, 72, 74, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 86,
+ 86, 86, 59, 57, 54, 52, 50, 51, 53, 54, 56, 58, 60, 62, 64, 66, 68, 70,
+ 72, 73, 75, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 87, 87, 87, 61, 58,
+ 56, 53, 51, 52, 54, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 74, 76, 78,
+ 79, 81, 82, 83, 85, 86, 87, 88, 89, 89, 89, 89, 63, 60, 57, 55, 53, 54,
+ 55, 57, 58, 60, 62, 64, 66, 68, 69, 72, 74, 75, 77, 79, 81, 82, 83, 85,
+ 86, 87, 88, 89, 91, 91, 91, 91, 64, 61, 59, 56, 54, 55, 57, 58, 59, 61,
+ 63, 65, 67, 69, 70, 72, 75, 76, 78, 80, 82, 83, 84, 86, 87, 88, 89, 91,
+ 92, 92, 92, 92, 66, 63, 60, 58, 56, 57, 58, 59, 61, 62, 64, 66, 68, 70,
+ 71, 73, 76, 77, 79, 81, 83, 84, 85, 87, 88, 89, 91, 92, 93, 93, 93, 93,
+ 68, 65, 62, 59, 57, 58, 59, 61, 62, 64, 65, 67, 69, 71, 72, 74, 76, 78,
+ 80, 82, 84, 85, 86, 88, 89, 91, 92, 93, 94, 94, 94, 94, 70, 67, 64, 61,
+ 59, 60, 61, 62, 63, 65, 67, 68, 70, 72, 74, 75, 77, 79, 81, 83, 85, 86,
+ 87, 89, 91, 92, 93, 94, 96, 96, 96, 96, 70, 67, 64, 61, 59, 60, 61, 62,
+ 63, 65, 67, 68, 70, 72, 74, 75, 77, 79, 81, 83, 85, 86, 87, 89, 91, 92,
+ 93, 94, 96, 96, 96, 96, 70, 67, 64, 61, 59, 60, 61, 62, 63, 65, 67, 68,
+ 70, 72, 74, 75, 77, 79, 81, 83, 85, 86, 87, 89, 91, 92, 93, 94, 96, 96,
+ 96, 96, 70, 67, 64, 61, 59, 60, 61, 62, 63, 65, 67, 68, 70, 72, 74, 75,
+ 77, 79, 81, 83, 85, 86, 87, 89, 91, 92, 93, 94, 96, 96, 96, 96 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 69, 104, 143, 69, 97, 124, 152, 104, 124, 156, 176, 143, 152, 176,
+ 192,
+ /* Size 8 */
+ 64, 50, 54, 68, 86, 104, 122, 137, 50, 56, 54, 62, 76, 93, 109, 125, 54,
+ 54, 72, 82, 92, 105, 118, 131, 68, 62, 82, 98, 110, 120, 130, 140, 86,
+ 76, 92, 110, 123, 133, 142, 150, 104, 93, 105, 120, 133, 144, 151, 158,
+ 122, 109, 118, 130, 142, 151, 158, 164, 137, 125, 131, 140, 150, 158,
+ 164, 169,
+ /* Size 16 */
+ 64, 56, 50, 52, 54, 60, 68, 76, 86, 94, 104, 112, 122, 129, 137, 137,
+ 56, 55, 53, 54, 54, 59, 65, 72, 81, 89, 98, 106, 115, 122, 130, 130, 50,
+ 53, 56, 55, 54, 58, 62, 69, 76, 84, 93, 100, 109, 116, 125, 125, 52, 54,
+ 55, 59, 62, 66, 71, 77, 83, 90, 98, 105, 114, 120, 128, 128, 54, 54, 54,
+ 62, 72, 77, 82, 87, 92, 98, 105, 111, 118, 124, 131, 131, 60, 59, 58,
+ 66, 77, 82, 89, 94, 100, 106, 112, 118, 124, 130, 136, 136, 68, 65, 62,
+ 71, 82, 89, 98, 104, 110, 115, 120, 125, 130, 135, 140, 140, 76, 72, 69,
+ 77, 87, 94, 104, 109, 116, 121, 126, 131, 136, 140, 145, 145, 86, 81,
+ 76, 83, 92, 100, 110, 116, 123, 128, 133, 138, 142, 146, 150, 150, 94,
+ 89, 84, 90, 98, 106, 115, 121, 128, 133, 138, 142, 146, 150, 153, 153,
+ 104, 98, 93, 98, 105, 112, 120, 126, 133, 138, 144, 147, 151, 154, 158,
+ 158, 112, 106, 100, 105, 111, 118, 125, 131, 138, 142, 147, 151, 155,
+ 158, 161, 161, 122, 115, 109, 114, 118, 124, 130, 136, 142, 146, 151,
+ 155, 158, 161, 164, 164, 129, 122, 116, 120, 124, 130, 135, 140, 146,
+ 150, 154, 158, 161, 164, 166, 166, 137, 130, 125, 128, 131, 136, 140,
+ 145, 150, 153, 158, 161, 164, 166, 169, 169, 137, 130, 125, 128, 131,
+ 136, 140, 145, 150, 153, 158, 161, 164, 166, 169, 169,
+ /* Size 32 */
+ 64, 60, 56, 53, 50, 51, 52, 53, 54, 57, 60, 64, 68, 72, 76, 80, 86, 90,
+ 94, 99, 104, 108, 112, 117, 122, 125, 129, 132, 137, 137, 137, 137, 60,
+ 58, 55, 53, 52, 52, 53, 53, 54, 57, 59, 63, 66, 70, 74, 78, 83, 87, 91,
+ 96, 101, 105, 109, 113, 118, 122, 125, 129, 133, 133, 133, 133, 56, 55,
+ 55, 54, 53, 53, 54, 54, 54, 56, 59, 62, 65, 68, 72, 76, 81, 84, 89, 93,
+ 98, 102, 106, 110, 115, 118, 122, 126, 130, 130, 130, 130, 53, 53, 54,
+ 54, 55, 55, 55, 54, 54, 56, 59, 61, 64, 67, 70, 74, 78, 82, 86, 90, 95,
+ 99, 103, 107, 112, 115, 119, 123, 127, 127, 127, 127, 50, 52, 53, 55,
+ 56, 56, 55, 55, 54, 56, 58, 60, 62, 65, 69, 72, 76, 80, 84, 88, 93, 96,
+ 100, 104, 109, 113, 116, 120, 125, 125, 125, 125, 51, 52, 53, 55, 56,
+ 56, 57, 57, 58, 60, 62, 64, 66, 69, 72, 76, 80, 83, 87, 91, 95, 99, 103,
+ 107, 111, 115, 118, 122, 126, 126, 126, 126, 52, 53, 54, 55, 55, 57, 59,
+ 60, 62, 64, 66, 68, 71, 73, 77, 80, 83, 87, 90, 94, 98, 102, 105, 109,
+ 114, 117, 120, 124, 128, 128, 128, 128, 53, 53, 54, 54, 55, 57, 60, 63,
+ 67, 69, 71, 73, 76, 78, 81, 84, 88, 91, 94, 98, 101, 105, 108, 112, 116,
+ 119, 122, 126, 129, 129, 129, 129, 54, 54, 54, 54, 54, 58, 62, 67, 72,
+ 74, 77, 79, 82, 84, 87, 89, 92, 95, 98, 101, 105, 108, 111, 115, 118,
+ 121, 124, 128, 131, 131, 131, 131, 57, 57, 56, 56, 56, 60, 64, 69, 74,
+ 77, 79, 82, 85, 88, 90, 93, 96, 99, 102, 105, 108, 111, 114, 118, 121,
+ 124, 127, 130, 133, 133, 133, 133, 60, 59, 59, 59, 58, 62, 66, 71, 77,
+ 79, 82, 86, 89, 92, 94, 97, 100, 103, 106, 109, 112, 115, 118, 121, 124,
+ 127, 130, 133, 136, 136, 136, 136, 64, 63, 62, 61, 60, 64, 68, 73, 79,
+ 82, 86, 89, 93, 96, 99, 102, 105, 107, 110, 113, 116, 118, 121, 124,
+ 127, 130, 132, 135, 138, 138, 138, 138, 68, 66, 65, 64, 62, 66, 71, 76,
+ 82, 85, 89, 93, 98, 101, 104, 107, 110, 112, 115, 117, 120, 122, 125,
+ 128, 130, 133, 135, 138, 140, 140, 140, 140, 72, 70, 68, 67, 65, 69, 73,
+ 78, 84, 88, 92, 96, 101, 104, 106, 110, 113, 115, 118, 120, 123, 125,
+ 128, 130, 133, 135, 138, 140, 143, 143, 143, 143, 76, 74, 72, 70, 69,
+ 72, 77, 81, 87, 90, 94, 99, 104, 106, 109, 113, 116, 119, 121, 124, 126,
+ 129, 131, 133, 136, 138, 140, 142, 145, 145, 145, 145, 80, 78, 76, 74,
+ 72, 76, 80, 84, 89, 93, 97, 102, 107, 110, 113, 116, 120, 122, 124, 127,
+ 130, 132, 134, 136, 139, 141, 143, 145, 147, 147, 147, 147, 86, 83, 81,
+ 78, 76, 80, 83, 88, 92, 96, 100, 105, 110, 113, 116, 120, 123, 126, 128,
+ 131, 133, 135, 138, 140, 142, 144, 146, 148, 150, 150, 150, 150, 90, 87,
+ 84, 82, 80, 83, 87, 91, 95, 99, 103, 107, 112, 115, 119, 122, 126, 128,
+ 131, 133, 136, 138, 140, 142, 144, 146, 148, 150, 151, 151, 151, 151,
+ 94, 91, 89, 86, 84, 87, 90, 94, 98, 102, 106, 110, 115, 118, 121, 124,
+ 128, 131, 133, 136, 138, 140, 142, 144, 146, 148, 150, 152, 153, 153,
+ 153, 153, 99, 96, 93, 90, 88, 91, 94, 98, 101, 105, 109, 113, 117, 120,
+ 124, 127, 131, 133, 136, 138, 141, 143, 145, 147, 149, 150, 152, 154,
+ 155, 155, 155, 155, 104, 101, 98, 95, 93, 95, 98, 101, 105, 108, 112,
+ 116, 120, 123, 126, 130, 133, 136, 138, 141, 144, 145, 147, 149, 151,
+ 153, 154, 156, 158, 158, 158, 158, 108, 105, 102, 99, 96, 99, 102, 105,
+ 108, 111, 115, 118, 122, 125, 129, 132, 135, 138, 140, 143, 145, 147,
+ 149, 151, 153, 154, 156, 158, 159, 159, 159, 159, 112, 109, 106, 103,
+ 100, 103, 105, 108, 111, 114, 118, 121, 125, 128, 131, 134, 138, 140,
+ 142, 145, 147, 149, 151, 153, 155, 156, 158, 159, 161, 161, 161, 161,
+ 117, 113, 110, 107, 104, 107, 109, 112, 115, 118, 121, 124, 128, 130,
+ 133, 136, 140, 142, 144, 147, 149, 151, 153, 155, 157, 158, 159, 161,
+ 162, 162, 162, 162, 122, 118, 115, 112, 109, 111, 114, 116, 118, 121,
+ 124, 127, 130, 133, 136, 139, 142, 144, 146, 149, 151, 153, 155, 157,
+ 158, 160, 161, 162, 164, 164, 164, 164, 125, 122, 118, 115, 113, 115,
+ 117, 119, 121, 124, 127, 130, 133, 135, 138, 141, 144, 146, 148, 150,
+ 153, 154, 156, 158, 160, 161, 162, 164, 165, 165, 165, 165, 129, 125,
+ 122, 119, 116, 118, 120, 122, 124, 127, 130, 132, 135, 138, 140, 143,
+ 146, 148, 150, 152, 154, 156, 158, 159, 161, 162, 164, 165, 166, 166,
+ 166, 166, 132, 129, 126, 123, 120, 122, 124, 126, 128, 130, 133, 135,
+ 138, 140, 142, 145, 148, 150, 152, 154, 156, 158, 159, 161, 162, 164,
+ 165, 166, 167, 167, 167, 167, 137, 133, 130, 127, 125, 126, 128, 129,
+ 131, 133, 136, 138, 140, 143, 145, 147, 150, 151, 153, 155, 158, 159,
+ 161, 162, 164, 165, 166, 167, 169, 169, 169, 169, 137, 133, 130, 127,
+ 125, 126, 128, 129, 131, 133, 136, 138, 140, 143, 145, 147, 150, 151,
+ 153, 155, 158, 159, 161, 162, 164, 165, 166, 167, 169, 169, 169, 169,
+ 137, 133, 130, 127, 125, 126, 128, 129, 131, 133, 136, 138, 140, 143,
+ 145, 147, 150, 151, 153, 155, 158, 159, 161, 162, 164, 165, 166, 167,
+ 169, 169, 169, 169, 137, 133, 130, 127, 125, 126, 128, 129, 131, 133,
+ 136, 138, 140, 143, 145, 147, 150, 151, 153, 155, 158, 159, 161, 162,
+ 164, 165, 166, 167, 169, 169, 169, 169 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 29, 32, 48, 68, 32, 45, 58, 73, 48, 58, 75, 86, 68, 73, 86, 94,
+ /* Size 8 */
+ 34, 26, 28, 36, 46, 57, 67, 77, 26, 30, 29, 33, 41, 50, 60, 69, 28, 29,
+ 39, 44, 50, 57, 65, 73, 36, 33, 44, 53, 60, 66, 73, 79, 46, 41, 50, 60,
+ 68, 75, 80, 85, 57, 50, 57, 66, 75, 81, 86, 90, 67, 60, 65, 73, 80, 86,
+ 90, 94, 77, 69, 73, 79, 85, 90, 94, 97,
+ /* Size 16 */
+ 33, 29, 26, 27, 28, 31, 35, 39, 45, 50, 55, 60, 65, 70, 74, 74, 29, 28,
+ 27, 27, 28, 30, 34, 37, 42, 47, 52, 56, 62, 66, 71, 71, 26, 27, 29, 28,
+ 28, 30, 32, 35, 40, 44, 49, 53, 58, 62, 67, 67, 27, 27, 28, 30, 32, 34,
+ 37, 40, 44, 47, 52, 56, 61, 65, 69, 69, 28, 28, 28, 32, 37, 40, 43, 45,
+ 49, 52, 56, 59, 64, 67, 71, 71, 31, 30, 30, 34, 40, 43, 47, 50, 53, 56,
+ 60, 63, 67, 70, 74, 74, 35, 34, 32, 37, 43, 47, 52, 55, 59, 61, 65, 67,
+ 71, 74, 77, 77, 39, 37, 35, 40, 45, 50, 55, 58, 62, 65, 68, 71, 74, 77,
+ 79, 79, 45, 42, 40, 44, 49, 53, 59, 62, 66, 69, 72, 75, 78, 80, 82, 82,
+ 50, 47, 44, 47, 52, 56, 61, 65, 69, 72, 75, 78, 80, 82, 85, 85, 55, 52,
+ 49, 52, 56, 60, 65, 68, 72, 75, 79, 81, 83, 85, 87, 87, 60, 56, 53, 56,
+ 59, 63, 67, 71, 75, 78, 81, 83, 85, 87, 89, 89, 65, 62, 58, 61, 64, 67,
+ 71, 74, 78, 80, 83, 85, 88, 89, 91, 91, 70, 66, 62, 65, 67, 70, 74, 77,
+ 80, 82, 85, 87, 89, 91, 93, 93, 74, 71, 67, 69, 71, 74, 77, 79, 82, 85,
+ 87, 89, 91, 93, 94, 94, 74, 71, 67, 69, 71, 74, 77, 79, 82, 85, 87, 89,
+ 91, 93, 94, 94,
+ /* Size 32 */
+ 33, 30, 28, 27, 25, 26, 26, 27, 27, 29, 30, 32, 35, 37, 39, 41, 44, 47,
+ 49, 52, 55, 57, 59, 62, 65, 67, 69, 71, 73, 73, 73, 73, 30, 29, 28, 27,
+ 26, 26, 27, 27, 27, 29, 30, 32, 34, 36, 38, 40, 43, 45, 47, 50, 53, 55,
+ 57, 60, 63, 65, 67, 69, 71, 71, 71, 71, 28, 28, 28, 27, 27, 27, 27, 27,
+ 27, 29, 30, 31, 33, 35, 37, 39, 42, 44, 46, 48, 51, 53, 56, 58, 61, 63,
+ 65, 67, 70, 70, 70, 70, 27, 27, 27, 27, 28, 28, 28, 27, 27, 28, 30, 31,
+ 32, 34, 36, 38, 40, 42, 44, 47, 50, 52, 54, 56, 59, 61, 63, 65, 68, 68,
+ 68, 68, 25, 26, 27, 28, 29, 28, 28, 28, 27, 28, 29, 30, 32, 33, 35, 37,
+ 39, 41, 43, 45, 48, 50, 52, 55, 57, 59, 61, 64, 66, 66, 66, 66, 26, 26,
+ 27, 28, 28, 29, 29, 29, 29, 30, 31, 33, 34, 35, 37, 39, 41, 43, 45, 47,
+ 50, 52, 54, 56, 59, 61, 63, 65, 67, 67, 67, 67, 26, 27, 27, 28, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 38, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57,
+ 60, 62, 64, 66, 68, 68, 68, 68, 27, 27, 27, 27, 28, 29, 31, 32, 34, 35,
+ 36, 38, 39, 40, 42, 44, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
+ 69, 69, 69, 69, 27, 27, 27, 27, 27, 29, 32, 34, 37, 38, 39, 41, 42, 43,
+ 45, 46, 48, 49, 51, 53, 55, 57, 59, 61, 63, 64, 66, 68, 70, 70, 70, 70,
+ 29, 29, 29, 28, 28, 30, 33, 35, 38, 39, 41, 42, 44, 45, 47, 48, 50, 52,
+ 53, 55, 57, 59, 60, 62, 64, 66, 68, 69, 71, 71, 71, 71, 30, 30, 30, 30,
+ 29, 31, 34, 36, 39, 41, 42, 44, 46, 48, 49, 51, 52, 54, 55, 57, 59, 61,
+ 62, 64, 66, 68, 69, 71, 73, 73, 73, 73, 32, 32, 31, 31, 30, 33, 35, 38,
+ 41, 42, 44, 46, 49, 50, 52, 53, 55, 56, 58, 59, 61, 63, 64, 66, 68, 69,
+ 71, 72, 74, 74, 74, 74, 35, 34, 33, 32, 32, 34, 36, 39, 42, 44, 46, 49,
+ 51, 53, 54, 56, 58, 59, 60, 62, 64, 65, 66, 68, 70, 71, 72, 74, 75, 75,
+ 75, 75, 37, 36, 35, 34, 33, 35, 38, 40, 43, 45, 48, 50, 53, 54, 56, 58,
+ 59, 61, 62, 64, 65, 67, 68, 70, 71, 73, 74, 75, 77, 77, 77, 77, 39, 38,
+ 37, 36, 35, 37, 39, 42, 45, 47, 49, 52, 54, 56, 58, 59, 61, 63, 64, 66,
+ 67, 69, 70, 71, 73, 74, 75, 77, 78, 78, 78, 78, 41, 40, 39, 38, 37, 39,
+ 41, 44, 46, 48, 51, 53, 56, 58, 59, 61, 63, 65, 66, 68, 69, 71, 72, 73,
+ 75, 76, 77, 78, 80, 80, 80, 80, 44, 43, 42, 40, 39, 41, 43, 45, 48, 50,
+ 52, 55, 58, 59, 61, 63, 65, 67, 68, 70, 71, 73, 74, 75, 76, 78, 79, 80,
+ 81, 81, 81, 81, 47, 45, 44, 42, 41, 43, 45, 47, 49, 52, 54, 56, 59, 61,
+ 63, 65, 67, 68, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 82, 82, 82,
+ 49, 47, 46, 44, 43, 45, 47, 49, 51, 53, 55, 58, 60, 62, 64, 66, 68, 70,
+ 71, 73, 74, 75, 77, 78, 79, 80, 81, 82, 83, 83, 83, 83, 52, 50, 48, 47,
+ 45, 47, 49, 51, 53, 55, 57, 59, 62, 64, 66, 68, 70, 71, 73, 74, 76, 77,
+ 78, 79, 81, 82, 83, 84, 85, 85, 85, 85, 55, 53, 51, 50, 48, 50, 51, 53,
+ 55, 57, 59, 61, 64, 65, 67, 69, 71, 73, 74, 76, 77, 79, 80, 81, 82, 83,
+ 84, 85, 86, 86, 86, 86, 57, 55, 53, 52, 50, 52, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 74, 75, 77, 79, 80, 81, 82, 83, 84, 85, 86, 87, 87,
+ 87, 87, 59, 57, 56, 54, 52, 54, 55, 57, 59, 60, 62, 64, 66, 68, 70, 72,
+ 74, 75, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88, 88, 88, 88, 62, 60,
+ 58, 56, 55, 56, 57, 59, 61, 62, 64, 66, 68, 70, 71, 73, 75, 76, 78, 79,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 89, 89, 89, 65, 63, 61, 59, 57, 59,
+ 60, 61, 63, 64, 66, 68, 70, 71, 73, 75, 76, 78, 79, 81, 82, 83, 84, 85,
+ 86, 87, 88, 89, 90, 90, 90, 90, 67, 65, 63, 61, 59, 61, 62, 63, 64, 66,
+ 68, 69, 71, 73, 74, 76, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 90, 90, 90, 90, 69, 67, 65, 63, 61, 63, 64, 65, 66, 68, 69, 71, 72, 74,
+ 75, 77, 79, 80, 81, 83, 84, 85, 86, 87, 88, 89, 90, 90, 91, 91, 91, 91,
+ 71, 69, 67, 65, 64, 65, 66, 67, 68, 69, 71, 72, 74, 75, 77, 78, 80, 81,
+ 82, 84, 85, 86, 87, 88, 89, 90, 90, 91, 92, 92, 92, 92, 73, 71, 70, 68,
+ 66, 67, 68, 69, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 83, 85, 86, 87,
+ 88, 89, 90, 90, 91, 92, 93, 93, 93, 93, 73, 71, 70, 68, 66, 67, 68, 69,
+ 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 90,
+ 91, 92, 93, 93, 93, 93, 73, 71, 70, 68, 66, 67, 68, 69, 70, 71, 73, 74,
+ 75, 77, 78, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 90, 91, 92, 93, 93,
+ 93, 93, 73, 71, 70, 68, 66, 67, 68, 69, 70, 71, 73, 74, 75, 77, 78, 80,
+ 81, 82, 83, 85, 86, 87, 88, 89, 90, 90, 91, 92, 93, 93, 93, 93 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 94, 100, 120, 94, 108, 114, 126, 100, 114, 136, 149, 120, 126, 149,
+ 166,
+ /* Size 8 */
+ 64, 53, 88, 92, 97, 105, 115, 126, 53, 73, 89, 82, 84, 91, 100, 109, 88,
+ 89, 100, 98, 98, 102, 108, 116, 92, 82, 98, 107, 111, 115, 120, 126, 97,
+ 84, 98, 111, 121, 127, 132, 137, 105, 91, 102, 115, 127, 135, 142, 147,
+ 115, 100, 108, 120, 132, 142, 149, 155, 126, 109, 116, 126, 137, 147,
+ 155, 162,
+ /* Size 16 */
+ 64, 58, 53, 66, 88, 90, 92, 94, 97, 101, 105, 110, 115, 120, 126, 126,
+ 58, 60, 61, 72, 88, 87, 87, 88, 90, 94, 97, 102, 107, 112, 117, 117, 53,
+ 61, 73, 80, 89, 85, 82, 83, 84, 87, 91, 95, 100, 104, 109, 109, 66, 72,
+ 80, 87, 94, 92, 89, 90, 90, 93, 96, 100, 104, 108, 113, 113, 88, 88, 89,
+ 94, 100, 99, 98, 98, 98, 100, 102, 105, 108, 112, 116, 116, 90, 87, 85,
+ 92, 99, 101, 102, 103, 104, 106, 108, 111, 114, 117, 121, 121, 92, 87,
+ 82, 89, 98, 102, 107, 109, 111, 113, 115, 117, 120, 123, 126, 126, 94,
+ 88, 83, 90, 98, 103, 109, 113, 116, 118, 121, 123, 126, 128, 132, 132,
+ 97, 90, 84, 90, 98, 104, 111, 116, 121, 124, 127, 129, 132, 134, 137,
+ 137, 101, 94, 87, 93, 100, 106, 113, 118, 124, 127, 131, 134, 136, 139,
+ 142, 142, 105, 97, 91, 96, 102, 108, 115, 121, 127, 131, 135, 138, 142,
+ 144, 147, 147, 110, 102, 95, 100, 105, 111, 117, 123, 129, 134, 138,
+ 142, 145, 148, 151, 151, 115, 107, 100, 104, 108, 114, 120, 126, 132,
+ 136, 142, 145, 149, 152, 155, 155, 120, 112, 104, 108, 112, 117, 123,
+ 128, 134, 139, 144, 148, 152, 155, 158, 158, 126, 117, 109, 113, 116,
+ 121, 126, 132, 137, 142, 147, 151, 155, 158, 162, 162, 126, 117, 109,
+ 113, 116, 121, 126, 132, 137, 142, 147, 151, 155, 158, 162, 162,
+ /* Size 32 */
+ 64, 61, 58, 56, 53, 59, 66, 75, 88, 89, 90, 91, 92, 93, 94, 95, 97, 99,
+ 101, 103, 105, 108, 110, 113, 115, 118, 120, 123, 126, 126, 126, 126,
+ 61, 60, 59, 58, 57, 63, 69, 77, 88, 88, 89, 89, 89, 90, 91, 92, 93, 95,
+ 97, 99, 101, 103, 106, 108, 111, 113, 116, 118, 121, 121, 121, 121, 58,
+ 59, 60, 61, 61, 67, 72, 80, 88, 88, 87, 87, 87, 88, 88, 89, 90, 92, 94,
+ 95, 97, 100, 102, 104, 107, 109, 112, 114, 117, 117, 117, 117, 56, 58,
+ 61, 63, 67, 71, 76, 82, 89, 88, 86, 85, 84, 85, 86, 86, 87, 89, 90, 92,
+ 94, 96, 98, 101, 103, 105, 108, 110, 113, 113, 113, 113, 53, 57, 61, 67,
+ 73, 76, 80, 84, 89, 87, 85, 84, 82, 83, 83, 84, 84, 86, 87, 89, 91, 93,
+ 95, 97, 100, 102, 104, 107, 109, 109, 109, 109, 59, 63, 67, 71, 76, 80,
+ 83, 87, 92, 90, 88, 87, 86, 86, 86, 87, 87, 89, 90, 92, 93, 95, 97, 99,
+ 102, 104, 106, 109, 111, 111, 111, 111, 66, 69, 72, 76, 80, 83, 87, 90,
+ 94, 93, 92, 91, 89, 90, 90, 90, 90, 92, 93, 94, 96, 98, 100, 102, 104,
+ 106, 108, 110, 113, 113, 113, 113, 75, 77, 80, 82, 84, 87, 90, 94, 97,
+ 96, 95, 94, 93, 94, 94, 94, 94, 95, 96, 97, 99, 100, 102, 104, 106, 108,
+ 110, 112, 115, 115, 115, 115, 88, 88, 88, 89, 89, 92, 94, 97, 100, 100,
+ 99, 99, 98, 98, 98, 98, 98, 99, 100, 101, 102, 103, 105, 106, 108, 110,
+ 112, 114, 116, 116, 116, 116, 89, 88, 88, 88, 87, 90, 93, 96, 100, 100,
+ 100, 100, 100, 100, 100, 101, 101, 102, 103, 104, 105, 106, 108, 109,
+ 111, 113, 115, 117, 119, 119, 119, 119, 90, 89, 87, 86, 85, 88, 92, 95,
+ 99, 100, 101, 102, 102, 103, 103, 104, 104, 105, 106, 107, 108, 109,
+ 111, 112, 114, 116, 117, 119, 121, 121, 121, 121, 91, 89, 87, 85, 84,
+ 87, 91, 94, 99, 100, 102, 103, 105, 106, 106, 107, 108, 109, 109, 110,
+ 111, 113, 114, 115, 117, 118, 120, 122, 124, 124, 124, 124, 92, 89, 87,
+ 84, 82, 86, 89, 93, 98, 100, 102, 105, 107, 108, 109, 110, 111, 112,
+ 113, 114, 115, 116, 117, 119, 120, 122, 123, 125, 126, 126, 126, 126,
+ 93, 90, 88, 85, 83, 86, 90, 94, 98, 100, 103, 106, 108, 110, 111, 112,
+ 114, 115, 116, 117, 118, 119, 120, 121, 123, 124, 126, 127, 129, 129,
+ 129, 129, 94, 91, 88, 86, 83, 86, 90, 94, 98, 100, 103, 106, 109, 111,
+ 113, 114, 116, 117, 118, 119, 121, 122, 123, 124, 126, 127, 128, 130,
+ 132, 132, 132, 132, 95, 92, 89, 86, 84, 87, 90, 94, 98, 101, 104, 107,
+ 110, 112, 114, 116, 118, 119, 121, 122, 124, 125, 126, 127, 129, 130,
+ 131, 133, 134, 134, 134, 134, 97, 93, 90, 87, 84, 87, 90, 94, 98, 101,
+ 104, 108, 111, 114, 116, 118, 121, 122, 124, 125, 127, 128, 129, 130,
+ 132, 133, 134, 136, 137, 137, 137, 137, 99, 95, 92, 89, 86, 89, 92, 95,
+ 99, 102, 105, 109, 112, 115, 117, 119, 122, 124, 125, 127, 129, 130,
+ 131, 133, 134, 135, 137, 138, 139, 139, 139, 139, 101, 97, 94, 90, 87,
+ 90, 93, 96, 100, 103, 106, 109, 113, 116, 118, 121, 124, 125, 127, 129,
+ 131, 132, 134, 135, 136, 138, 139, 140, 142, 142, 142, 142, 103, 99, 95,
+ 92, 89, 92, 94, 97, 101, 104, 107, 110, 114, 117, 119, 122, 125, 127,
+ 129, 131, 133, 134, 136, 137, 139, 140, 142, 143, 144, 144, 144, 144,
+ 105, 101, 97, 94, 91, 93, 96, 99, 102, 105, 108, 111, 115, 118, 121,
+ 124, 127, 129, 131, 133, 135, 137, 138, 140, 142, 143, 144, 146, 147,
+ 147, 147, 147, 108, 103, 100, 96, 93, 95, 98, 100, 103, 106, 109, 113,
+ 116, 119, 122, 125, 128, 130, 132, 134, 137, 138, 140, 142, 143, 145,
+ 146, 148, 149, 149, 149, 149, 110, 106, 102, 98, 95, 97, 100, 102, 105,
+ 108, 111, 114, 117, 120, 123, 126, 129, 131, 134, 136, 138, 140, 142,
+ 143, 145, 147, 148, 149, 151, 151, 151, 151, 113, 108, 104, 101, 97, 99,
+ 102, 104, 106, 109, 112, 115, 119, 121, 124, 127, 130, 133, 135, 137,
+ 140, 142, 143, 145, 147, 149, 150, 151, 153, 153, 153, 153, 115, 111,
+ 107, 103, 100, 102, 104, 106, 108, 111, 114, 117, 120, 123, 126, 129,
+ 132, 134, 136, 139, 142, 143, 145, 147, 149, 151, 152, 154, 155, 155,
+ 155, 155, 118, 113, 109, 105, 102, 104, 106, 108, 110, 113, 116, 118,
+ 122, 124, 127, 130, 133, 135, 138, 140, 143, 145, 147, 149, 151, 152,
+ 154, 155, 157, 157, 157, 157, 120, 116, 112, 108, 104, 106, 108, 110,
+ 112, 115, 117, 120, 123, 126, 128, 131, 134, 137, 139, 142, 144, 146,
+ 148, 150, 152, 154, 155, 157, 158, 158, 158, 158, 123, 118, 114, 110,
+ 107, 109, 110, 112, 114, 117, 119, 122, 125, 127, 130, 133, 136, 138,
+ 140, 143, 146, 148, 149, 151, 154, 155, 157, 158, 160, 160, 160, 160,
+ 126, 121, 117, 113, 109, 111, 113, 115, 116, 119, 121, 124, 126, 129,
+ 132, 134, 137, 139, 142, 144, 147, 149, 151, 153, 155, 157, 158, 160,
+ 162, 162, 162, 162, 126, 121, 117, 113, 109, 111, 113, 115, 116, 119,
+ 121, 124, 126, 129, 132, 134, 137, 139, 142, 144, 147, 149, 151, 153,
+ 155, 157, 158, 160, 162, 162, 162, 162, 126, 121, 117, 113, 109, 111,
+ 113, 115, 116, 119, 121, 124, 126, 129, 132, 134, 137, 139, 142, 144,
+ 147, 149, 151, 153, 155, 157, 158, 160, 162, 162, 162, 162, 126, 121,
+ 117, 113, 109, 111, 113, 115, 116, 119, 121, 124, 126, 129, 132, 134,
+ 137, 139, 142, 144, 147, 149, 151, 153, 155, 157, 158, 160, 162, 162,
+ 162, 162 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 33, 49, 53, 64, 49, 57, 60, 67, 53, 60, 73, 81, 64, 67, 81, 91,
+ /* Size 8 */
+ 35, 29, 49, 51, 54, 59, 65, 72, 29, 40, 50, 46, 47, 51, 56, 62, 49, 50,
+ 56, 55, 55, 57, 61, 66, 51, 46, 55, 61, 63, 65, 68, 72, 54, 47, 55, 63,
+ 69, 72, 76, 79, 59, 51, 57, 65, 72, 78, 82, 85, 65, 56, 61, 68, 76, 82,
+ 86, 90, 72, 62, 66, 72, 79, 85, 90, 94,
+ /* Size 16 */
+ 34, 31, 28, 36, 48, 49, 50, 52, 53, 55, 58, 61, 64, 67, 70, 70, 31, 32,
+ 33, 39, 48, 48, 47, 48, 49, 51, 53, 56, 59, 62, 65, 65, 28, 33, 39, 43,
+ 49, 47, 45, 45, 46, 48, 50, 52, 55, 57, 61, 61, 36, 39, 43, 47, 52, 50,
+ 49, 49, 49, 51, 53, 55, 57, 60, 63, 63, 48, 48, 49, 52, 55, 55, 54, 54,
+ 54, 55, 56, 58, 60, 62, 65, 65, 49, 48, 47, 50, 55, 55, 56, 57, 57, 58,
+ 60, 61, 63, 65, 68, 68, 50, 47, 45, 49, 54, 56, 59, 61, 62, 63, 64, 65,
+ 67, 69, 71, 71, 52, 48, 45, 49, 54, 57, 61, 62, 64, 66, 67, 69, 70, 72,
+ 74, 74, 53, 49, 46, 49, 54, 57, 62, 64, 67, 69, 71, 72, 74, 76, 77, 77,
+ 55, 51, 48, 51, 55, 58, 63, 66, 69, 71, 73, 75, 77, 78, 80, 80, 58, 53,
+ 50, 53, 56, 60, 64, 67, 71, 73, 76, 78, 80, 82, 83, 83, 61, 56, 52, 55,
+ 58, 61, 65, 69, 72, 75, 78, 80, 82, 84, 86, 86, 64, 59, 55, 57, 60, 63,
+ 67, 70, 74, 77, 80, 82, 85, 86, 88, 88, 67, 62, 57, 60, 62, 65, 69, 72,
+ 76, 78, 82, 84, 86, 88, 90, 90, 70, 65, 61, 63, 65, 68, 71, 74, 77, 80,
+ 83, 86, 88, 90, 92, 92, 70, 65, 61, 63, 65, 68, 71, 74, 77, 80, 83, 86,
+ 88, 90, 92, 92,
+ /* Size 32 */
+ 34, 32, 31, 29, 28, 31, 35, 40, 47, 48, 48, 49, 50, 50, 51, 52, 52, 54,
+ 55, 56, 57, 59, 60, 62, 63, 65, 66, 68, 70, 70, 70, 70, 32, 32, 31, 31,
+ 30, 33, 37, 42, 47, 48, 48, 48, 48, 49, 49, 50, 50, 52, 53, 54, 55, 56,
+ 58, 59, 61, 62, 64, 65, 67, 67, 67, 67, 31, 31, 32, 32, 33, 35, 39, 43,
+ 48, 47, 47, 47, 47, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 60,
+ 61, 63, 64, 64, 64, 64, 29, 31, 32, 34, 35, 38, 41, 44, 48, 47, 47, 46,
+ 45, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 59, 60, 62, 62,
+ 62, 62, 28, 30, 33, 35, 39, 41, 43, 45, 48, 47, 46, 45, 44, 44, 45, 45,
+ 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 57, 58, 60, 60, 60, 60, 31, 33,
+ 35, 38, 41, 43, 45, 47, 50, 49, 48, 47, 46, 46, 47, 47, 47, 48, 49, 50,
+ 50, 52, 53, 54, 55, 57, 58, 59, 61, 61, 61, 61, 35, 37, 39, 41, 43, 45,
+ 47, 49, 51, 50, 50, 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 53, 54, 55,
+ 57, 58, 59, 60, 62, 62, 62, 62, 40, 42, 43, 44, 45, 47, 49, 51, 53, 52,
+ 52, 51, 51, 51, 51, 51, 51, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62,
+ 63, 63, 63, 63, 47, 47, 48, 48, 48, 50, 51, 53, 55, 54, 54, 54, 53, 53,
+ 53, 53, 53, 54, 54, 55, 55, 56, 57, 58, 59, 60, 61, 63, 64, 64, 64, 64,
+ 48, 48, 47, 47, 47, 49, 50, 52, 54, 54, 54, 54, 54, 55, 55, 55, 55, 55,
+ 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 65, 65, 65, 48, 48, 47, 47,
+ 46, 48, 50, 52, 54, 54, 55, 55, 56, 56, 56, 56, 57, 57, 58, 58, 59, 60,
+ 61, 61, 62, 63, 65, 66, 67, 67, 67, 67, 49, 48, 47, 46, 45, 47, 49, 51,
+ 54, 54, 55, 56, 57, 58, 58, 58, 59, 59, 60, 60, 61, 62, 63, 63, 64, 65,
+ 66, 67, 68, 68, 68, 68, 50, 48, 47, 45, 44, 46, 48, 51, 53, 54, 56, 57,
+ 59, 59, 60, 60, 61, 62, 62, 63, 63, 64, 65, 65, 66, 67, 68, 69, 70, 70,
+ 70, 70, 50, 49, 47, 46, 44, 46, 48, 51, 53, 55, 56, 58, 59, 60, 61, 62,
+ 62, 63, 63, 64, 65, 65, 66, 67, 68, 69, 69, 70, 71, 71, 71, 71, 51, 49,
+ 48, 46, 45, 47, 49, 51, 53, 55, 56, 58, 60, 61, 62, 63, 64, 64, 65, 66,
+ 66, 67, 68, 69, 69, 70, 71, 72, 73, 73, 73, 73, 52, 50, 48, 47, 45, 47,
+ 49, 51, 53, 55, 56, 58, 60, 62, 63, 64, 65, 66, 67, 67, 68, 69, 70, 70,
+ 71, 72, 73, 74, 75, 75, 75, 75, 52, 50, 49, 47, 45, 47, 49, 51, 53, 55,
+ 57, 59, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 72, 73, 74, 75, 76,
+ 76, 76, 76, 76, 54, 52, 50, 48, 46, 48, 50, 51, 54, 55, 57, 59, 62, 63,
+ 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 78, 78, 78,
+ 55, 53, 51, 49, 47, 49, 50, 52, 54, 56, 58, 60, 62, 63, 65, 67, 68, 69,
+ 70, 71, 73, 73, 74, 75, 76, 77, 78, 78, 79, 79, 79, 79, 56, 54, 52, 50,
+ 48, 50, 51, 53, 55, 56, 58, 60, 63, 64, 66, 67, 69, 70, 71, 73, 74, 75,
+ 76, 77, 77, 78, 79, 80, 81, 81, 81, 81, 57, 55, 53, 51, 49, 50, 52, 54,
+ 55, 57, 59, 61, 63, 65, 66, 68, 70, 71, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 82, 82, 82, 82, 59, 56, 54, 52, 50, 52, 53, 55, 56, 58, 60, 62,
+ 64, 65, 67, 69, 71, 72, 73, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 84,
+ 84, 84, 60, 58, 55, 53, 51, 53, 54, 56, 57, 59, 61, 63, 65, 66, 68, 70,
+ 72, 73, 74, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 85, 85, 85, 62, 59,
+ 57, 55, 53, 54, 55, 57, 58, 60, 61, 63, 65, 67, 69, 70, 72, 74, 75, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, 86, 86, 63, 61, 58, 56, 54, 55,
+ 57, 58, 59, 61, 62, 64, 66, 68, 69, 71, 73, 75, 76, 77, 79, 80, 81, 82,
+ 84, 85, 86, 86, 87, 87, 87, 87, 65, 62, 60, 57, 55, 57, 58, 59, 60, 62,
+ 63, 65, 67, 69, 70, 72, 74, 75, 77, 78, 80, 81, 82, 83, 85, 86, 86, 87,
+ 88, 88, 88, 88, 66, 64, 61, 59, 57, 58, 59, 60, 61, 63, 65, 66, 68, 69,
+ 71, 73, 75, 76, 78, 79, 81, 82, 83, 84, 86, 86, 87, 88, 89, 89, 89, 89,
+ 68, 65, 63, 60, 58, 59, 60, 62, 63, 64, 66, 67, 69, 70, 72, 74, 76, 77,
+ 78, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 90, 90, 90, 70, 67, 64, 62,
+ 60, 61, 62, 63, 64, 65, 67, 68, 70, 71, 73, 75, 76, 78, 79, 81, 82, 84,
+ 85, 86, 87, 88, 89, 90, 91, 91, 91, 91, 70, 67, 64, 62, 60, 61, 62, 63,
+ 64, 65, 67, 68, 70, 71, 73, 75, 76, 78, 79, 81, 82, 84, 85, 86, 87, 88,
+ 89, 90, 91, 91, 91, 91, 70, 67, 64, 62, 60, 61, 62, 63, 64, 65, 67, 68,
+ 70, 71, 73, 75, 76, 78, 79, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 91,
+ 91, 91, 70, 67, 64, 62, 60, 61, 62, 63, 64, 65, 67, 68, 70, 71, 73, 75,
+ 76, 78, 79, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 91, 91, 91 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 69, 100, 132, 69, 93, 117, 139, 100, 117, 142, 157, 132, 139, 157,
+ 169,
+ /* Size 8 */
+ 64, 51, 54, 67, 84, 100, 114, 126, 51, 57, 55, 62, 75, 89, 104, 116, 54,
+ 55, 71, 80, 89, 100, 111, 121, 67, 62, 80, 94, 104, 113, 121, 129, 84,
+ 75, 89, 104, 115, 123, 130, 136, 100, 89, 100, 113, 123, 131, 137, 142,
+ 114, 104, 111, 121, 130, 137, 142, 146, 126, 116, 121, 129, 136, 142,
+ 146, 150,
+ /* Size 16 */
+ 64, 57, 51, 53, 54, 60, 67, 75, 84, 91, 100, 106, 114, 119, 126, 126,
+ 57, 55, 54, 54, 55, 59, 65, 71, 79, 86, 94, 101, 108, 114, 121, 121, 51,
+ 54, 57, 56, 55, 58, 62, 68, 75, 82, 89, 96, 104, 109, 116, 116, 53, 54,
+ 56, 59, 62, 66, 70, 75, 82, 88, 94, 100, 107, 113, 119, 119, 54, 55, 55,
+ 62, 71, 76, 80, 84, 89, 94, 100, 105, 111, 116, 121, 121, 60, 59, 58,
+ 66, 76, 81, 87, 91, 96, 101, 106, 111, 116, 120, 125, 125, 67, 65, 62,
+ 70, 80, 87, 94, 99, 104, 108, 113, 116, 121, 125, 129, 129, 75, 71, 68,
+ 75, 84, 91, 99, 104, 109, 113, 118, 121, 125, 128, 132, 132, 84, 79, 75,
+ 82, 89, 96, 104, 109, 115, 119, 123, 126, 130, 133, 136, 136, 91, 86,
+ 82, 88, 94, 101, 108, 113, 119, 123, 127, 130, 133, 136, 139, 139, 100,
+ 94, 89, 94, 100, 106, 113, 118, 123, 127, 131, 134, 137, 139, 142, 142,
+ 106, 101, 96, 100, 105, 111, 116, 121, 126, 130, 134, 137, 140, 142,
+ 144, 144, 114, 108, 104, 107, 111, 116, 121, 125, 130, 133, 137, 140,
+ 142, 144, 146, 146, 119, 114, 109, 113, 116, 120, 125, 128, 133, 136,
+ 139, 142, 144, 146, 148, 148, 126, 121, 116, 119, 121, 125, 129, 132,
+ 136, 139, 142, 144, 146, 148, 150, 150, 126, 121, 116, 119, 121, 125,
+ 129, 132, 136, 139, 142, 144, 146, 148, 150, 150,
+ /* Size 32 */
+ 64, 60, 57, 54, 51, 52, 53, 54, 54, 57, 60, 64, 67, 71, 75, 79, 84, 87,
+ 91, 95, 100, 103, 106, 110, 114, 117, 119, 122, 126, 126, 126, 126, 60,
+ 58, 56, 54, 52, 53, 53, 54, 55, 57, 60, 63, 66, 69, 73, 77, 81, 85, 88,
+ 92, 97, 100, 103, 107, 111, 114, 117, 120, 123, 123, 123, 123, 57, 56,
+ 55, 55, 54, 54, 54, 54, 55, 57, 59, 62, 65, 68, 71, 75, 79, 82, 86, 90,
+ 94, 97, 101, 104, 108, 111, 114, 117, 121, 121, 121, 121, 54, 54, 55,
+ 55, 55, 55, 55, 55, 55, 57, 59, 61, 64, 66, 70, 73, 77, 80, 84, 88, 92,
+ 95, 98, 102, 106, 109, 112, 115, 118, 118, 118, 118, 51, 52, 54, 55, 57,
+ 56, 56, 56, 55, 57, 58, 60, 62, 65, 68, 71, 75, 78, 82, 85, 89, 93, 96,
+ 100, 104, 106, 109, 113, 116, 116, 116, 116, 52, 53, 54, 55, 56, 57, 57,
+ 58, 58, 60, 62, 64, 66, 69, 72, 75, 78, 81, 84, 88, 92, 95, 98, 102,
+ 105, 108, 111, 114, 117, 117, 117, 117, 53, 53, 54, 55, 56, 57, 59, 61,
+ 62, 64, 66, 68, 70, 73, 75, 78, 82, 84, 88, 91, 94, 97, 100, 104, 107,
+ 110, 113, 116, 119, 119, 119, 119, 54, 54, 54, 55, 56, 58, 61, 63, 67,
+ 68, 70, 72, 75, 77, 80, 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112,
+ 114, 117, 120, 120, 120, 120, 54, 55, 55, 55, 55, 58, 62, 67, 71, 73,
+ 76, 78, 80, 82, 84, 87, 89, 92, 94, 97, 100, 103, 105, 108, 111, 114,
+ 116, 119, 121, 121, 121, 121, 57, 57, 57, 57, 57, 60, 64, 68, 73, 76,
+ 78, 80, 83, 85, 88, 90, 92, 95, 97, 100, 103, 105, 108, 111, 113, 116,
+ 118, 121, 123, 123, 123, 123, 60, 60, 59, 59, 58, 62, 66, 70, 76, 78,
+ 81, 84, 87, 89, 91, 93, 96, 98, 101, 103, 106, 108, 111, 113, 116, 118,
+ 120, 122, 125, 125, 125, 125, 64, 63, 62, 61, 60, 64, 68, 72, 78, 80,
+ 84, 87, 90, 92, 95, 97, 100, 102, 104, 107, 109, 111, 113, 116, 118,
+ 120, 122, 124, 127, 127, 127, 127, 67, 66, 65, 64, 62, 66, 70, 75, 80,
+ 83, 87, 90, 94, 97, 99, 101, 104, 106, 108, 110, 113, 114, 116, 119,
+ 121, 123, 125, 127, 129, 129, 129, 129, 71, 69, 68, 66, 65, 69, 73, 77,
+ 82, 85, 89, 92, 97, 99, 101, 104, 107, 109, 111, 113, 115, 117, 119,
+ 121, 123, 125, 126, 128, 130, 130, 130, 130, 75, 73, 71, 70, 68, 72, 75,
+ 80, 84, 88, 91, 95, 99, 101, 104, 106, 109, 111, 113, 115, 118, 119,
+ 121, 123, 125, 127, 128, 130, 132, 132, 132, 132, 79, 77, 75, 73, 71,
+ 75, 78, 82, 87, 90, 93, 97, 101, 104, 106, 109, 112, 114, 116, 118, 120,
+ 122, 124, 126, 127, 129, 131, 132, 134, 134, 134, 134, 84, 81, 79, 77,
+ 75, 78, 82, 85, 89, 92, 96, 100, 104, 107, 109, 112, 115, 117, 119, 121,
+ 123, 125, 126, 128, 130, 131, 133, 134, 136, 136, 136, 136, 87, 85, 82,
+ 80, 78, 81, 84, 88, 92, 95, 98, 102, 106, 109, 111, 114, 117, 119, 121,
+ 123, 125, 127, 128, 130, 132, 133, 134, 136, 137, 137, 137, 137, 91, 88,
+ 86, 84, 82, 84, 88, 91, 94, 97, 101, 104, 108, 111, 113, 116, 119, 121,
+ 123, 125, 127, 129, 130, 132, 133, 135, 136, 137, 139, 139, 139, 139,
+ 95, 92, 90, 88, 85, 88, 91, 94, 97, 100, 103, 107, 110, 113, 115, 118,
+ 121, 123, 125, 127, 129, 131, 132, 134, 135, 136, 138, 139, 140, 140,
+ 140, 140, 100, 97, 94, 92, 89, 92, 94, 97, 100, 103, 106, 109, 113, 115,
+ 118, 120, 123, 125, 127, 129, 131, 133, 134, 135, 137, 138, 139, 141,
+ 142, 142, 142, 142, 103, 100, 97, 95, 93, 95, 97, 100, 103, 105, 108,
+ 111, 114, 117, 119, 122, 125, 127, 129, 131, 133, 134, 135, 137, 138,
+ 139, 141, 142, 143, 143, 143, 143, 106, 103, 101, 98, 96, 98, 100, 103,
+ 105, 108, 111, 113, 116, 119, 121, 124, 126, 128, 130, 132, 134, 135,
+ 137, 138, 140, 141, 142, 143, 144, 144, 144, 144, 110, 107, 104, 102,
+ 100, 102, 104, 106, 108, 111, 113, 116, 119, 121, 123, 126, 128, 130,
+ 132, 134, 135, 137, 138, 140, 141, 142, 143, 144, 145, 145, 145, 145,
+ 114, 111, 108, 106, 104, 105, 107, 109, 111, 113, 116, 118, 121, 123,
+ 125, 127, 130, 132, 133, 135, 137, 138, 140, 141, 142, 143, 144, 145,
+ 146, 146, 146, 146, 117, 114, 111, 109, 106, 108, 110, 112, 114, 116,
+ 118, 120, 123, 125, 127, 129, 131, 133, 135, 136, 138, 139, 141, 142,
+ 143, 144, 145, 146, 147, 147, 147, 147, 119, 117, 114, 112, 109, 111,
+ 113, 114, 116, 118, 120, 122, 125, 126, 128, 131, 133, 134, 136, 138,
+ 139, 141, 142, 143, 144, 145, 146, 147, 148, 148, 148, 148, 122, 120,
+ 117, 115, 113, 114, 116, 117, 119, 121, 122, 124, 127, 128, 130, 132,
+ 134, 136, 137, 139, 141, 142, 143, 144, 145, 146, 147, 148, 149, 149,
+ 149, 149, 126, 123, 121, 118, 116, 117, 119, 120, 121, 123, 125, 127,
+ 129, 130, 132, 134, 136, 137, 139, 140, 142, 143, 144, 145, 146, 147,
+ 148, 149, 150, 150, 150, 150, 126, 123, 121, 118, 116, 117, 119, 120,
+ 121, 123, 125, 127, 129, 130, 132, 134, 136, 137, 139, 140, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 150, 150, 150, 126, 123, 121, 118,
+ 116, 117, 119, 120, 121, 123, 125, 127, 129, 130, 132, 134, 136, 137,
+ 139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 150, 150, 150,
+ 126, 123, 121, 118, 116, 117, 119, 120, 121, 123, 125, 127, 129, 130,
+ 132, 134, 136, 137, 139, 140, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 150, 150, 150 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 32, 34, 51, 69, 34, 47, 60, 73, 51, 60, 75, 83, 69, 73, 83, 90,
+ /* Size 8 */
+ 37, 29, 31, 39, 49, 59, 68, 76, 29, 33, 31, 36, 44, 53, 61, 70, 31, 31,
+ 41, 47, 52, 59, 66, 73, 39, 36, 47, 56, 62, 67, 73, 78, 49, 44, 52, 62,
+ 69, 74, 79, 83, 59, 53, 59, 67, 74, 79, 83, 87, 68, 61, 66, 73, 79, 83,
+ 87, 90, 76, 70, 73, 78, 83, 87, 90, 92,
+ /* Size 16 */
+ 36, 32, 28, 29, 30, 34, 38, 42, 48, 52, 57, 61, 66, 70, 74, 74, 32, 31,
+ 30, 30, 30, 33, 36, 40, 45, 49, 54, 58, 63, 67, 71, 71, 28, 30, 32, 31,
+ 31, 33, 35, 38, 42, 46, 51, 55, 60, 64, 68, 68, 29, 30, 31, 33, 35, 37,
+ 40, 43, 46, 50, 54, 58, 62, 66, 69, 69, 30, 30, 31, 35, 40, 43, 45, 48,
+ 51, 54, 58, 61, 65, 68, 71, 71, 34, 33, 33, 37, 43, 46, 49, 52, 55, 58,
+ 61, 64, 68, 70, 73, 73, 38, 36, 35, 40, 45, 49, 54, 57, 60, 63, 65, 68,
+ 71, 73, 76, 76, 42, 40, 38, 43, 48, 52, 57, 60, 63, 66, 69, 71, 74, 76,
+ 78, 78, 48, 45, 42, 46, 51, 55, 60, 63, 67, 70, 72, 74, 77, 78, 80, 80,
+ 52, 49, 46, 50, 54, 58, 63, 66, 70, 72, 75, 77, 79, 81, 82, 82, 57, 54,
+ 51, 54, 58, 61, 65, 69, 72, 75, 77, 79, 81, 83, 84, 84, 61, 58, 55, 58,
+ 61, 64, 68, 71, 74, 77, 79, 81, 83, 84, 86, 86, 66, 63, 60, 62, 65, 68,
+ 71, 74, 77, 79, 81, 83, 85, 86, 87, 87, 70, 67, 64, 66, 68, 70, 73, 76,
+ 78, 81, 83, 84, 86, 87, 89, 89, 74, 71, 68, 69, 71, 73, 76, 78, 80, 82,
+ 84, 86, 87, 89, 90, 90, 74, 71, 68, 69, 71, 73, 76, 78, 80, 82, 84, 86,
+ 87, 89, 90, 90,
+ /* Size 32 */
+ 35, 33, 31, 30, 28, 28, 29, 29, 30, 31, 33, 35, 37, 39, 42, 44, 47, 49,
+ 51, 54, 57, 59, 61, 63, 65, 67, 69, 71, 73, 73, 73, 73, 33, 32, 31, 30,
+ 29, 29, 29, 30, 30, 31, 33, 35, 37, 39, 41, 43, 46, 48, 50, 52, 55, 57,
+ 59, 61, 64, 65, 67, 69, 71, 71, 71, 71, 31, 31, 30, 30, 30, 30, 30, 30,
+ 30, 31, 33, 34, 36, 38, 40, 42, 44, 46, 48, 51, 53, 55, 57, 60, 62, 64,
+ 66, 68, 70, 70, 70, 70, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 32, 34,
+ 35, 37, 39, 41, 43, 45, 47, 49, 52, 54, 56, 58, 61, 62, 64, 66, 68, 68,
+ 68, 68, 28, 29, 30, 30, 31, 31, 31, 31, 30, 31, 32, 33, 34, 36, 38, 40,
+ 42, 44, 46, 48, 50, 52, 54, 57, 59, 61, 63, 65, 67, 67, 67, 67, 28, 29,
+ 30, 30, 31, 31, 32, 32, 32, 33, 34, 35, 37, 38, 40, 42, 44, 46, 47, 50,
+ 52, 54, 56, 58, 60, 62, 64, 66, 68, 68, 68, 68, 29, 29, 30, 30, 31, 32,
+ 33, 33, 34, 35, 37, 38, 39, 40, 42, 44, 46, 47, 49, 51, 53, 55, 57, 59,
+ 61, 63, 65, 67, 68, 68, 68, 68, 29, 30, 30, 30, 31, 32, 33, 35, 37, 38,
+ 39, 40, 42, 43, 45, 46, 48, 50, 51, 53, 55, 57, 59, 60, 62, 64, 66, 67,
+ 69, 69, 69, 69, 30, 30, 30, 30, 30, 32, 34, 37, 40, 41, 42, 43, 45, 46,
+ 47, 49, 50, 52, 53, 55, 57, 58, 60, 62, 64, 65, 67, 68, 70, 70, 70, 70,
+ 31, 31, 31, 31, 31, 33, 35, 38, 41, 42, 44, 45, 47, 48, 49, 51, 52, 54,
+ 55, 57, 59, 60, 62, 63, 65, 67, 68, 70, 71, 71, 71, 71, 33, 33, 33, 32,
+ 32, 34, 37, 39, 42, 44, 45, 47, 49, 50, 51, 53, 54, 56, 57, 59, 60, 62,
+ 63, 65, 67, 68, 69, 71, 72, 72, 72, 72, 35, 35, 34, 34, 33, 35, 38, 40,
+ 43, 45, 47, 49, 51, 52, 54, 55, 57, 58, 59, 61, 62, 64, 65, 67, 68, 69,
+ 71, 72, 74, 74, 74, 74, 37, 37, 36, 35, 34, 37, 39, 42, 45, 47, 49, 51,
+ 53, 55, 56, 58, 59, 61, 62, 63, 65, 66, 67, 68, 70, 71, 72, 73, 75, 75,
+ 75, 75, 39, 39, 38, 37, 36, 38, 40, 43, 46, 48, 50, 52, 55, 56, 58, 59,
+ 61, 62, 63, 65, 66, 67, 69, 70, 71, 72, 73, 75, 76, 76, 76, 76, 42, 41,
+ 40, 39, 38, 40, 42, 45, 47, 49, 51, 54, 56, 58, 59, 61, 63, 64, 65, 66,
+ 68, 69, 70, 71, 73, 74, 75, 76, 77, 77, 77, 77, 44, 43, 42, 41, 40, 42,
+ 44, 46, 49, 51, 53, 55, 58, 59, 61, 63, 64, 66, 67, 68, 69, 71, 72, 73,
+ 74, 75, 76, 77, 78, 78, 78, 78, 47, 46, 44, 43, 42, 44, 46, 48, 50, 52,
+ 54, 57, 59, 61, 63, 64, 66, 67, 69, 70, 71, 72, 73, 74, 76, 76, 77, 78,
+ 79, 79, 79, 79, 49, 48, 46, 45, 44, 46, 47, 50, 52, 54, 56, 58, 61, 62,
+ 64, 66, 67, 69, 70, 71, 73, 73, 75, 76, 77, 78, 78, 79, 80, 80, 80, 80,
+ 51, 50, 48, 47, 46, 47, 49, 51, 53, 55, 57, 59, 62, 63, 65, 67, 69, 70,
+ 71, 72, 74, 75, 76, 77, 78, 79, 79, 80, 81, 81, 81, 81, 54, 52, 51, 49,
+ 48, 50, 51, 53, 55, 57, 59, 61, 63, 65, 66, 68, 70, 71, 72, 74, 75, 76,
+ 77, 78, 79, 80, 81, 81, 82, 82, 82, 82, 57, 55, 53, 52, 50, 52, 53, 55,
+ 57, 59, 60, 62, 65, 66, 68, 69, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81,
+ 82, 82, 83, 83, 83, 83, 59, 57, 55, 54, 52, 54, 55, 57, 58, 60, 62, 64,
+ 66, 67, 69, 71, 72, 73, 75, 76, 77, 78, 79, 80, 81, 82, 82, 83, 84, 84,
+ 84, 84, 61, 59, 57, 56, 54, 56, 57, 59, 60, 62, 63, 65, 67, 69, 70, 72,
+ 73, 75, 76, 77, 78, 79, 80, 81, 82, 83, 83, 84, 85, 85, 85, 85, 63, 61,
+ 60, 58, 57, 58, 59, 60, 62, 63, 65, 67, 68, 70, 71, 73, 74, 76, 77, 78,
+ 79, 80, 81, 82, 83, 83, 84, 85, 85, 85, 85, 85, 65, 64, 62, 61, 59, 60,
+ 61, 62, 64, 65, 67, 68, 70, 71, 73, 74, 76, 77, 78, 79, 80, 81, 82, 83,
+ 84, 84, 85, 86, 86, 86, 86, 86, 67, 65, 64, 62, 61, 62, 63, 64, 65, 67,
+ 68, 69, 71, 72, 74, 75, 76, 78, 79, 80, 81, 82, 83, 83, 84, 85, 86, 86,
+ 87, 87, 87, 87, 69, 67, 66, 64, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73,
+ 75, 76, 77, 78, 79, 81, 82, 82, 83, 84, 85, 86, 86, 87, 87, 87, 87, 87,
+ 71, 69, 68, 66, 65, 66, 67, 67, 68, 70, 71, 72, 73, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 86, 87, 87, 88, 88, 88, 88, 73, 71, 70, 68,
+ 67, 68, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+ 85, 85, 86, 87, 87, 88, 89, 89, 89, 89, 73, 71, 70, 68, 67, 68, 68, 69,
+ 70, 71, 72, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 85, 86, 87,
+ 87, 88, 89, 89, 89, 89, 73, 71, 70, 68, 67, 68, 68, 69, 70, 71, 72, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 85, 86, 87, 87, 88, 89, 89,
+ 89, 89, 73, 71, 70, 68, 67, 68, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78,
+ 79, 80, 81, 82, 83, 84, 85, 85, 86, 87, 87, 88, 89, 89, 89, 89 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 91, 97, 113, 91, 104, 108, 119, 97, 108, 127, 138, 113, 119, 138,
+ 151,
+ /* Size 8 */
+ 64, 54, 86, 89, 93, 101, 109, 118, 54, 72, 87, 81, 82, 88, 96, 104, 86,
+ 87, 97, 95, 94, 98, 103, 110, 89, 81, 95, 103, 106, 109, 113, 119, 93,
+ 82, 94, 106, 114, 119, 123, 127, 101, 88, 98, 109, 119, 126, 131, 135,
+ 109, 96, 103, 113, 123, 131, 137, 142, 118, 104, 110, 119, 127, 135,
+ 142, 147,
+ /* Size 16 */
+ 64, 58, 54, 66, 86, 87, 89, 91, 93, 97, 101, 105, 109, 114, 118, 118,
+ 58, 60, 62, 72, 86, 85, 85, 86, 88, 91, 94, 98, 102, 106, 111, 111, 54,
+ 62, 72, 79, 87, 84, 81, 81, 82, 85, 88, 92, 96, 100, 104, 104, 66, 72,
+ 79, 85, 91, 89, 87, 87, 88, 90, 93, 96, 100, 103, 107, 107, 86, 86, 87,
+ 91, 97, 96, 95, 94, 94, 96, 98, 100, 103, 107, 110, 110, 87, 85, 84, 89,
+ 96, 97, 98, 99, 100, 101, 103, 106, 108, 111, 114, 114, 89, 85, 81, 87,
+ 95, 98, 103, 104, 106, 108, 109, 111, 113, 116, 119, 119, 91, 86, 81,
+ 87, 94, 99, 104, 107, 110, 112, 114, 116, 118, 120, 123, 123, 93, 88,
+ 82, 88, 94, 100, 106, 110, 114, 116, 119, 121, 123, 125, 127, 127, 97,
+ 91, 85, 90, 96, 101, 108, 112, 116, 119, 122, 124, 127, 129, 131, 131,
+ 101, 94, 88, 93, 98, 103, 109, 114, 119, 122, 126, 128, 131, 133, 135,
+ 135, 105, 98, 92, 96, 100, 106, 111, 116, 121, 124, 128, 131, 134, 136,
+ 138, 138, 109, 102, 96, 100, 103, 108, 113, 118, 123, 127, 131, 134,
+ 137, 139, 142, 142, 114, 106, 100, 103, 107, 111, 116, 120, 125, 129,
+ 133, 136, 139, 142, 144, 144, 118, 111, 104, 107, 110, 114, 119, 123,
+ 127, 131, 135, 138, 142, 144, 147, 147, 118, 111, 104, 107, 110, 114,
+ 119, 123, 127, 131, 135, 138, 142, 144, 147, 147,
+ /* Size 32 */
+ 64, 61, 58, 56, 54, 59, 66, 75, 86, 86, 87, 88, 89, 90, 91, 92, 93, 95,
+ 97, 99, 101, 103, 105, 107, 109, 111, 114, 116, 118, 118, 118, 118, 61,
+ 60, 59, 58, 57, 63, 69, 76, 86, 86, 86, 87, 87, 88, 89, 90, 90, 92, 94,
+ 96, 97, 99, 101, 103, 106, 108, 110, 112, 114, 114, 114, 114, 58, 59,
+ 60, 61, 62, 66, 72, 78, 86, 86, 85, 85, 85, 85, 86, 87, 88, 89, 91, 92,
+ 94, 96, 98, 100, 102, 104, 106, 109, 111, 111, 111, 111, 56, 58, 61, 63,
+ 66, 71, 75, 80, 86, 85, 84, 84, 83, 83, 84, 84, 85, 86, 88, 89, 91, 93,
+ 95, 97, 99, 101, 103, 105, 108, 108, 108, 108, 54, 57, 62, 66, 72, 75,
+ 79, 83, 87, 85, 84, 82, 81, 81, 81, 82, 82, 84, 85, 87, 88, 90, 92, 94,
+ 96, 98, 100, 102, 104, 104, 104, 104, 59, 63, 66, 71, 75, 78, 82, 85,
+ 89, 88, 86, 85, 84, 84, 84, 85, 85, 86, 88, 89, 90, 92, 94, 96, 98, 100,
+ 102, 104, 106, 106, 106, 106, 66, 69, 72, 75, 79, 82, 85, 88, 91, 90,
+ 89, 88, 87, 87, 87, 88, 88, 89, 90, 91, 93, 94, 96, 98, 100, 101, 103,
+ 105, 107, 107, 107, 107, 75, 76, 78, 80, 83, 85, 88, 91, 94, 93, 92, 91,
+ 91, 91, 91, 91, 91, 92, 93, 94, 95, 97, 98, 100, 101, 103, 105, 107,
+ 109, 109, 109, 109, 86, 86, 86, 86, 87, 89, 91, 94, 97, 96, 96, 95, 95,
+ 94, 94, 94, 94, 95, 96, 97, 98, 99, 100, 102, 103, 105, 107, 109, 110,
+ 110, 110, 110, 86, 86, 86, 85, 85, 88, 90, 93, 96, 96, 96, 96, 96, 97,
+ 97, 97, 97, 98, 99, 99, 100, 102, 103, 104, 106, 107, 109, 111, 112,
+ 112, 112, 112, 87, 86, 85, 84, 84, 86, 89, 92, 96, 96, 97, 98, 98, 99,
+ 99, 100, 100, 101, 101, 102, 103, 104, 106, 107, 108, 110, 111, 113,
+ 114, 114, 114, 114, 88, 87, 85, 84, 82, 85, 88, 91, 95, 96, 98, 99, 101,
+ 101, 102, 102, 103, 104, 104, 105, 106, 107, 108, 109, 111, 112, 113,
+ 115, 116, 116, 116, 116, 89, 87, 85, 83, 81, 84, 87, 91, 95, 96, 98,
+ 101, 103, 104, 104, 105, 106, 107, 108, 108, 109, 110, 111, 112, 113,
+ 115, 116, 117, 119, 119, 119, 119, 90, 88, 85, 83, 81, 84, 87, 91, 94,
+ 97, 99, 101, 104, 105, 106, 107, 108, 109, 110, 110, 111, 112, 113, 114,
+ 116, 117, 118, 119, 121, 121, 121, 121, 91, 89, 86, 84, 81, 84, 87, 91,
+ 94, 97, 99, 102, 104, 106, 107, 108, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 122, 123, 123, 123, 123, 92, 90, 87, 84, 82, 85, 88,
+ 91, 94, 97, 100, 102, 105, 107, 108, 110, 112, 113, 114, 115, 116, 117,
+ 118, 119, 120, 122, 123, 124, 125, 125, 125, 125, 93, 90, 88, 85, 82,
+ 85, 88, 91, 94, 97, 100, 103, 106, 108, 110, 112, 114, 115, 116, 118,
+ 119, 120, 121, 122, 123, 124, 125, 126, 127, 127, 127, 127, 95, 92, 89,
+ 86, 84, 86, 89, 92, 95, 98, 101, 104, 107, 109, 111, 113, 115, 116, 118,
+ 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 129, 129, 129, 97, 94,
+ 91, 88, 85, 88, 90, 93, 96, 99, 101, 104, 108, 110, 112, 114, 116, 118,
+ 119, 121, 122, 123, 124, 126, 127, 128, 129, 130, 131, 131, 131, 131,
+ 99, 96, 92, 89, 87, 89, 91, 94, 97, 99, 102, 105, 108, 110, 113, 115,
+ 118, 119, 121, 122, 124, 125, 126, 128, 129, 130, 131, 132, 133, 133,
+ 133, 133, 101, 97, 94, 91, 88, 90, 93, 95, 98, 100, 103, 106, 109, 111,
+ 114, 116, 119, 120, 122, 124, 126, 127, 128, 130, 131, 132, 133, 134,
+ 135, 135, 135, 135, 103, 99, 96, 93, 90, 92, 94, 97, 99, 102, 104, 107,
+ 110, 112, 115, 117, 120, 122, 123, 125, 127, 128, 130, 131, 132, 133,
+ 134, 136, 137, 137, 137, 137, 105, 101, 98, 95, 92, 94, 96, 98, 100,
+ 103, 106, 108, 111, 113, 116, 118, 121, 123, 124, 126, 128, 130, 131,
+ 132, 134, 135, 136, 137, 138, 138, 138, 138, 107, 103, 100, 97, 94, 96,
+ 98, 100, 102, 104, 107, 109, 112, 114, 117, 119, 122, 124, 126, 128,
+ 130, 131, 132, 134, 135, 136, 138, 139, 140, 140, 140, 140, 109, 106,
+ 102, 99, 96, 98, 100, 101, 103, 106, 108, 111, 113, 116, 118, 120, 123,
+ 125, 127, 129, 131, 132, 134, 135, 137, 138, 139, 140, 142, 142, 142,
+ 142, 111, 108, 104, 101, 98, 100, 101, 103, 105, 107, 110, 112, 115,
+ 117, 119, 122, 124, 126, 128, 130, 132, 133, 135, 136, 138, 139, 140,
+ 142, 143, 143, 143, 143, 114, 110, 106, 103, 100, 102, 103, 105, 107,
+ 109, 111, 113, 116, 118, 120, 123, 125, 127, 129, 131, 133, 134, 136,
+ 138, 139, 140, 142, 143, 144, 144, 144, 144, 116, 112, 109, 105, 102,
+ 104, 105, 107, 109, 111, 113, 115, 117, 119, 122, 124, 126, 128, 130,
+ 132, 134, 136, 137, 139, 140, 142, 143, 144, 145, 145, 145, 145, 118,
+ 114, 111, 108, 104, 106, 107, 109, 110, 112, 114, 116, 119, 121, 123,
+ 125, 127, 129, 131, 133, 135, 137, 138, 140, 142, 143, 144, 145, 147,
+ 147, 147, 147, 118, 114, 111, 108, 104, 106, 107, 109, 110, 112, 114,
+ 116, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 138, 140, 142,
+ 143, 144, 145, 147, 147, 147, 147, 118, 114, 111, 108, 104, 106, 107,
+ 109, 110, 112, 114, 116, 119, 121, 123, 125, 127, 129, 131, 133, 135,
+ 137, 138, 140, 142, 143, 144, 145, 147, 147, 147, 147, 118, 114, 111,
+ 108, 104, 106, 107, 109, 110, 112, 114, 116, 119, 121, 123, 125, 127,
+ 129, 131, 133, 135, 137, 138, 140, 142, 143, 144, 145, 147, 147, 147,
+ 147 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 35, 51, 54, 64, 51, 58, 61, 67, 54, 61, 72, 79, 64, 67, 79, 87,
+ /* Size 8 */
+ 37, 31, 51, 53, 56, 60, 66, 71, 31, 42, 51, 48, 49, 52, 57, 63, 51, 51,
+ 58, 56, 56, 58, 62, 66, 53, 48, 56, 61, 64, 66, 68, 72, 56, 49, 56, 64,
+ 69, 72, 75, 77, 60, 52, 58, 66, 72, 76, 80, 83, 66, 57, 62, 68, 75, 80,
+ 84, 87, 71, 63, 66, 72, 77, 83, 87, 90,
+ /* Size 16 */
+ 37, 33, 31, 38, 50, 51, 52, 53, 55, 57, 59, 62, 64, 67, 70, 70, 33, 34,
+ 35, 41, 50, 50, 49, 50, 51, 53, 55, 57, 60, 63, 65, 65, 31, 35, 41, 45,
+ 50, 48, 47, 47, 48, 49, 51, 54, 56, 59, 61, 61, 38, 41, 45, 49, 53, 52,
+ 51, 51, 51, 53, 54, 56, 58, 61, 63, 63, 50, 50, 50, 53, 57, 56, 55, 55,
+ 55, 56, 57, 59, 61, 63, 65, 65, 51, 50, 48, 52, 56, 57, 58, 58, 58, 59,
+ 60, 62, 64, 66, 68, 68, 52, 49, 47, 51, 55, 58, 60, 61, 62, 63, 64, 66,
+ 67, 69, 70, 70, 53, 50, 47, 51, 55, 58, 61, 63, 65, 66, 67, 69, 70, 71,
+ 73, 73, 55, 51, 48, 51, 55, 58, 62, 65, 67, 69, 70, 72, 73, 75, 76, 76,
+ 57, 53, 49, 53, 56, 59, 63, 66, 69, 71, 73, 74, 76, 77, 78, 78, 59, 55,
+ 51, 54, 57, 60, 64, 67, 70, 73, 75, 77, 78, 80, 81, 81, 62, 57, 54, 56,
+ 59, 62, 66, 69, 72, 74, 77, 78, 80, 82, 83, 83, 64, 60, 56, 58, 61, 64,
+ 67, 70, 73, 76, 78, 80, 82, 84, 85, 85, 67, 63, 59, 61, 63, 66, 69, 71,
+ 75, 77, 80, 82, 84, 85, 87, 87, 70, 65, 61, 63, 65, 68, 70, 73, 76, 78,
+ 81, 83, 85, 87, 89, 89, 70, 65, 61, 63, 65, 68, 70, 73, 76, 78, 81, 83,
+ 85, 87, 89, 89,
+ /* Size 32 */
+ 36, 35, 33, 32, 30, 33, 37, 43, 49, 50, 50, 51, 51, 52, 53, 53, 54, 55,
+ 56, 57, 58, 60, 61, 62, 64, 65, 66, 68, 69, 69, 69, 69, 35, 34, 33, 33,
+ 32, 35, 39, 44, 49, 49, 50, 50, 50, 50, 51, 52, 52, 53, 54, 55, 56, 58,
+ 59, 60, 62, 63, 64, 66, 67, 67, 67, 67, 33, 33, 34, 34, 35, 38, 41, 45,
+ 49, 49, 49, 49, 49, 49, 49, 50, 50, 51, 52, 53, 54, 56, 57, 58, 59, 61,
+ 62, 63, 65, 65, 65, 65, 32, 33, 34, 36, 38, 40, 43, 46, 50, 49, 48, 48,
+ 47, 48, 48, 48, 49, 50, 51, 51, 52, 54, 55, 56, 57, 59, 60, 61, 63, 63,
+ 63, 63, 30, 32, 35, 38, 41, 43, 45, 47, 50, 49, 48, 47, 46, 46, 47, 47,
+ 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 61, 61, 61, 61, 33, 35,
+ 38, 40, 43, 45, 47, 49, 51, 50, 50, 49, 48, 48, 48, 49, 49, 50, 50, 51,
+ 52, 53, 54, 55, 57, 58, 59, 60, 62, 62, 62, 62, 37, 39, 41, 43, 45, 47,
+ 49, 51, 53, 52, 51, 51, 50, 50, 50, 50, 51, 51, 52, 53, 54, 54, 56, 57,
+ 58, 59, 60, 61, 63, 63, 63, 63, 43, 44, 45, 46, 47, 49, 51, 52, 54, 54,
+ 53, 53, 52, 52, 52, 52, 52, 53, 54, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 63, 63, 63, 49, 49, 49, 50, 50, 51, 53, 54, 56, 56, 55, 55, 55, 55,
+ 55, 55, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, 62, 63, 64, 64, 64, 64,
+ 50, 49, 49, 49, 49, 50, 52, 54, 56, 56, 56, 56, 56, 56, 56, 56, 56, 57,
+ 57, 58, 58, 59, 60, 61, 62, 63, 64, 65, 66, 66, 66, 66, 50, 50, 49, 48,
+ 48, 50, 51, 53, 55, 56, 56, 57, 57, 57, 57, 58, 58, 58, 59, 59, 60, 61,
+ 61, 62, 63, 64, 65, 66, 67, 67, 67, 67, 51, 50, 49, 48, 47, 49, 51, 53,
+ 55, 56, 57, 57, 58, 59, 59, 59, 60, 60, 61, 61, 62, 62, 63, 64, 65, 65,
+ 66, 67, 68, 68, 68, 68, 51, 50, 49, 47, 46, 48, 50, 52, 55, 56, 57, 58,
+ 60, 60, 61, 61, 62, 62, 63, 63, 64, 64, 65, 66, 66, 67, 68, 69, 70, 70,
+ 70, 70, 52, 50, 49, 48, 46, 48, 50, 52, 55, 56, 57, 59, 60, 61, 62, 62,
+ 63, 63, 64, 65, 65, 66, 66, 67, 68, 68, 69, 70, 71, 71, 71, 71, 53, 51,
+ 49, 48, 47, 48, 50, 52, 55, 56, 57, 59, 61, 62, 62, 63, 64, 65, 65, 66,
+ 67, 67, 68, 69, 69, 70, 71, 71, 72, 72, 72, 72, 53, 52, 50, 48, 47, 49,
+ 50, 52, 55, 56, 58, 59, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69, 69, 70,
+ 71, 71, 72, 73, 74, 74, 74, 74, 54, 52, 50, 49, 47, 49, 51, 52, 54, 56,
+ 58, 60, 62, 63, 64, 65, 67, 67, 68, 69, 70, 70, 71, 72, 72, 73, 74, 74,
+ 75, 75, 75, 75, 55, 53, 51, 50, 48, 50, 51, 53, 55, 57, 58, 60, 62, 63,
+ 65, 66, 67, 68, 69, 70, 71, 71, 72, 73, 74, 74, 75, 76, 76, 76, 76, 76,
+ 56, 54, 52, 51, 49, 50, 52, 54, 56, 57, 59, 61, 63, 64, 65, 67, 68, 69,
+ 70, 71, 72, 73, 73, 74, 75, 76, 76, 77, 78, 78, 78, 78, 57, 55, 53, 51,
+ 50, 51, 53, 54, 56, 58, 59, 61, 63, 65, 66, 67, 69, 70, 71, 72, 73, 74,
+ 75, 75, 76, 77, 78, 78, 79, 79, 79, 79, 58, 56, 54, 52, 51, 52, 54, 55,
+ 57, 58, 60, 62, 64, 65, 67, 68, 70, 71, 72, 73, 74, 75, 76, 77, 77, 78,
+ 79, 80, 80, 80, 80, 80, 60, 58, 56, 54, 52, 53, 54, 56, 57, 59, 61, 62,
+ 64, 66, 67, 69, 70, 71, 73, 74, 75, 76, 77, 78, 78, 79, 80, 81, 81, 81,
+ 81, 81, 61, 59, 57, 55, 53, 54, 56, 57, 58, 60, 61, 63, 65, 66, 68, 69,
+ 71, 72, 73, 75, 76, 77, 78, 78, 79, 80, 81, 82, 82, 82, 82, 82, 62, 60,
+ 58, 56, 54, 55, 57, 58, 59, 61, 62, 64, 66, 67, 69, 70, 72, 73, 74, 75,
+ 77, 78, 78, 79, 80, 81, 82, 83, 83, 83, 83, 83, 64, 62, 59, 57, 55, 57,
+ 58, 59, 60, 62, 63, 65, 66, 68, 69, 71, 72, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 84, 84, 84, 84, 65, 63, 61, 59, 57, 58, 59, 60, 61, 63,
+ 64, 65, 67, 68, 70, 71, 73, 74, 76, 77, 78, 79, 80, 81, 82, 83, 84, 84,
+ 85, 85, 85, 85, 66, 64, 62, 60, 58, 59, 60, 61, 62, 64, 65, 66, 68, 69,
+ 71, 72, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 84, 85, 86, 86, 86, 86,
+ 68, 66, 63, 61, 59, 60, 61, 62, 63, 65, 66, 67, 69, 70, 71, 73, 74, 76,
+ 77, 78, 80, 81, 82, 83, 84, 84, 85, 86, 87, 87, 87, 87, 69, 67, 65, 63,
+ 61, 62, 63, 63, 64, 66, 67, 68, 70, 71, 72, 74, 75, 76, 78, 79, 80, 81,
+ 82, 83, 84, 85, 86, 87, 88, 88, 88, 88, 69, 67, 65, 63, 61, 62, 63, 63,
+ 64, 66, 67, 68, 70, 71, 72, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85,
+ 86, 87, 88, 88, 88, 88, 69, 67, 65, 63, 61, 62, 63, 63, 64, 66, 67, 68,
+ 70, 71, 72, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 88,
+ 88, 88, 69, 67, 65, 63, 61, 62, 63, 63, 64, 66, 67, 68, 70, 71, 72, 74,
+ 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 88, 88, 88 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 68, 96, 122, 68, 90, 110, 128, 96, 110, 130, 142, 122, 128, 142,
+ 150,
+ /* Size 8 */
+ 64, 52, 55, 67, 81, 95, 107, 116, 52, 58, 56, 63, 74, 86, 98, 108, 55,
+ 56, 71, 78, 86, 95, 104, 113, 67, 63, 78, 91, 99, 106, 112, 118, 81, 74,
+ 86, 99, 108, 114, 119, 124, 95, 86, 95, 106, 114, 120, 125, 128, 107,
+ 98, 104, 112, 119, 125, 129, 132, 116, 108, 113, 118, 124, 128, 132,
+ 134,
+ /* Size 16 */
+ 64, 57, 52, 53, 55, 61, 67, 74, 81, 88, 95, 100, 107, 111, 116, 116, 57,
+ 56, 55, 55, 55, 60, 65, 71, 77, 84, 91, 96, 102, 107, 112, 112, 52, 55,
+ 58, 57, 56, 59, 63, 68, 74, 80, 86, 92, 98, 103, 108, 108, 53, 55, 57,
+ 59, 62, 66, 70, 74, 80, 85, 91, 96, 101, 106, 110, 110, 55, 55, 56, 62,
+ 71, 74, 78, 82, 86, 91, 95, 100, 104, 108, 113, 113, 61, 60, 59, 66, 74,
+ 79, 84, 88, 92, 96, 100, 104, 108, 112, 115, 115, 67, 65, 63, 70, 78,
+ 84, 91, 94, 99, 102, 106, 109, 112, 115, 118, 118, 74, 71, 68, 74, 82,
+ 88, 94, 98, 103, 106, 110, 112, 116, 118, 121, 121, 81, 77, 74, 80, 86,
+ 92, 99, 103, 108, 111, 114, 117, 119, 121, 124, 124, 88, 84, 80, 85, 91,
+ 96, 102, 106, 111, 114, 117, 119, 122, 124, 126, 126, 95, 91, 86, 91,
+ 95, 100, 106, 110, 114, 117, 120, 122, 125, 126, 128, 128, 100, 96, 92,
+ 96, 100, 104, 109, 112, 117, 119, 122, 124, 127, 128, 130, 130, 107,
+ 102, 98, 101, 104, 108, 112, 116, 119, 122, 125, 127, 129, 130, 132,
+ 132, 111, 107, 103, 106, 108, 112, 115, 118, 121, 124, 126, 128, 130,
+ 131, 133, 133, 116, 112, 108, 110, 113, 115, 118, 121, 124, 126, 128,
+ 130, 132, 133, 134, 134, 116, 112, 108, 110, 113, 115, 118, 121, 124,
+ 126, 128, 130, 132, 133, 134, 134,
+ /* Size 32 */
+ 64, 60, 57, 55, 52, 53, 53, 54, 55, 58, 61, 64, 67, 70, 74, 77, 81, 84,
+ 88, 91, 95, 98, 100, 103, 107, 109, 111, 113, 116, 116, 116, 116, 60,
+ 58, 57, 55, 53, 54, 54, 55, 55, 58, 60, 63, 66, 69, 72, 76, 79, 82, 86,
+ 89, 93, 95, 98, 101, 104, 107, 109, 111, 114, 114, 114, 114, 57, 57, 56,
+ 55, 55, 55, 55, 55, 55, 57, 60, 62, 65, 68, 71, 74, 77, 80, 84, 87, 91,
+ 93, 96, 99, 102, 105, 107, 109, 112, 112, 112, 112, 55, 55, 55, 56, 56,
+ 56, 56, 56, 56, 57, 59, 61, 64, 66, 69, 72, 76, 79, 82, 85, 88, 91, 94,
+ 97, 100, 103, 105, 108, 110, 110, 110, 110, 52, 53, 55, 56, 58, 57, 57,
+ 56, 56, 57, 59, 61, 63, 65, 68, 71, 74, 77, 80, 83, 86, 89, 92, 95, 98,
+ 101, 103, 106, 108, 108, 108, 108, 53, 54, 55, 56, 57, 58, 58, 58, 59,
+ 60, 62, 64, 66, 68, 71, 74, 77, 79, 82, 85, 89, 91, 94, 97, 100, 102,
+ 104, 107, 109, 109, 109, 109, 53, 54, 55, 56, 57, 58, 59, 61, 62, 64,
+ 66, 68, 70, 72, 74, 77, 80, 82, 85, 88, 91, 93, 96, 98, 101, 103, 106,
+ 108, 110, 110, 110, 110, 54, 55, 55, 56, 56, 58, 61, 63, 66, 68, 70, 72,
+ 74, 76, 78, 80, 83, 85, 88, 90, 93, 95, 98, 100, 103, 105, 107, 109,
+ 112, 112, 112, 112, 55, 55, 55, 56, 56, 59, 62, 66, 71, 73, 74, 76, 78,
+ 80, 82, 84, 86, 88, 91, 93, 95, 97, 100, 102, 104, 106, 108, 110, 113,
+ 113, 113, 113, 58, 58, 57, 57, 57, 60, 64, 68, 73, 74, 77, 79, 81, 83,
+ 85, 87, 89, 91, 93, 95, 98, 100, 102, 104, 106, 108, 110, 112, 114, 114,
+ 114, 114, 61, 60, 60, 59, 59, 62, 66, 70, 74, 77, 79, 81, 84, 86, 88,
+ 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 113, 115, 115,
+ 115, 115, 64, 63, 62, 61, 61, 64, 68, 72, 76, 79, 81, 84, 87, 89, 91,
+ 93, 95, 97, 99, 101, 103, 105, 106, 108, 110, 112, 113, 115, 117, 117,
+ 117, 117, 67, 66, 65, 64, 63, 66, 70, 74, 78, 81, 84, 87, 91, 92, 94,
+ 96, 99, 100, 102, 104, 106, 107, 109, 110, 112, 114, 115, 117, 118, 118,
+ 118, 118, 70, 69, 68, 66, 65, 68, 72, 76, 80, 83, 86, 89, 92, 94, 96,
+ 99, 101, 102, 104, 106, 108, 109, 111, 112, 114, 115, 117, 118, 119,
+ 119, 119, 119, 74, 72, 71, 69, 68, 71, 74, 78, 82, 85, 88, 91, 94, 96,
+ 98, 101, 103, 105, 106, 108, 110, 111, 112, 114, 116, 117, 118, 119,
+ 121, 121, 121, 121, 77, 76, 74, 72, 71, 74, 77, 80, 84, 87, 90, 93, 96,
+ 99, 101, 103, 105, 107, 108, 110, 112, 113, 114, 116, 117, 118, 120,
+ 121, 122, 122, 122, 122, 81, 79, 77, 76, 74, 77, 80, 83, 86, 89, 92, 95,
+ 99, 101, 103, 105, 108, 109, 111, 112, 114, 115, 117, 118, 119, 120,
+ 121, 122, 124, 124, 124, 124, 84, 82, 80, 79, 77, 79, 82, 85, 88, 91,
+ 94, 97, 100, 102, 105, 107, 109, 111, 112, 114, 115, 117, 118, 119, 120,
+ 121, 123, 124, 125, 125, 125, 125, 88, 86, 84, 82, 80, 82, 85, 88, 91,
+ 93, 96, 99, 102, 104, 106, 108, 111, 112, 114, 115, 117, 118, 119, 121,
+ 122, 123, 124, 125, 126, 126, 126, 126, 91, 89, 87, 85, 83, 85, 88, 90,
+ 93, 95, 98, 101, 104, 106, 108, 110, 112, 114, 115, 117, 119, 120, 121,
+ 122, 123, 124, 125, 126, 127, 127, 127, 127, 95, 93, 91, 88, 86, 89, 91,
+ 93, 95, 98, 100, 103, 106, 108, 110, 112, 114, 115, 117, 119, 120, 121,
+ 122, 123, 125, 125, 126, 127, 128, 128, 128, 128, 98, 95, 93, 91, 89,
+ 91, 93, 95, 97, 100, 102, 105, 107, 109, 111, 113, 115, 117, 118, 120,
+ 121, 122, 123, 124, 126, 126, 127, 128, 129, 129, 129, 129, 100, 98, 96,
+ 94, 92, 94, 96, 98, 100, 102, 104, 106, 109, 111, 112, 114, 117, 118,
+ 119, 121, 122, 123, 124, 125, 127, 127, 128, 129, 130, 130, 130, 130,
+ 103, 101, 99, 97, 95, 97, 98, 100, 102, 104, 106, 108, 110, 112, 114,
+ 116, 118, 119, 121, 122, 123, 124, 125, 127, 128, 128, 129, 130, 131,
+ 131, 131, 131, 107, 104, 102, 100, 98, 100, 101, 103, 104, 106, 108,
+ 110, 112, 114, 116, 117, 119, 120, 122, 123, 125, 126, 127, 128, 129,
+ 129, 130, 131, 132, 132, 132, 132, 109, 107, 105, 103, 101, 102, 103,
+ 105, 106, 108, 110, 112, 114, 115, 117, 118, 120, 121, 123, 124, 125,
+ 126, 127, 128, 129, 130, 131, 131, 132, 132, 132, 132, 111, 109, 107,
+ 105, 103, 104, 106, 107, 108, 110, 112, 113, 115, 117, 118, 120, 121,
+ 123, 124, 125, 126, 127, 128, 129, 130, 131, 131, 132, 133, 133, 133,
+ 133, 113, 111, 109, 108, 106, 107, 108, 109, 110, 112, 113, 115, 117,
+ 118, 119, 121, 122, 124, 125, 126, 127, 128, 129, 130, 131, 131, 132,
+ 133, 134, 134, 134, 134, 116, 114, 112, 110, 108, 109, 110, 112, 113,
+ 114, 115, 117, 118, 119, 121, 122, 124, 125, 126, 127, 128, 129, 130,
+ 131, 132, 132, 133, 134, 134, 134, 134, 134, 116, 114, 112, 110, 108,
+ 109, 110, 112, 113, 114, 115, 117, 118, 119, 121, 122, 124, 125, 126,
+ 127, 128, 129, 130, 131, 132, 132, 133, 134, 134, 134, 134, 134, 116,
+ 114, 112, 110, 108, 109, 110, 112, 113, 114, 115, 117, 118, 119, 121,
+ 122, 124, 125, 126, 127, 128, 129, 130, 131, 132, 132, 133, 134, 134,
+ 134, 134, 134, 116, 114, 112, 110, 108, 109, 110, 112, 113, 114, 115,
+ 117, 118, 119, 121, 122, 124, 125, 126, 127, 128, 129, 130, 131, 132,
+ 132, 133, 134, 134, 134, 134, 134 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 35, 37, 53, 69, 37, 50, 61, 72, 53, 61, 74, 81, 69, 72, 81, 86,
+ /* Size 8 */
+ 40, 32, 34, 42, 51, 60, 68, 75, 32, 36, 34, 39, 46, 55, 63, 70, 34, 34,
+ 44, 49, 54, 61, 67, 73, 42, 39, 49, 57, 63, 68, 72, 76, 51, 46, 54, 63,
+ 69, 74, 77, 80, 60, 55, 61, 68, 74, 78, 81, 84, 68, 63, 67, 72, 77, 81,
+ 84, 86, 75, 70, 73, 76, 80, 84, 86, 88,
+ /* Size 16 */
+ 39, 35, 31, 32, 33, 37, 41, 45, 50, 54, 59, 63, 67, 70, 73, 73, 35, 34,
+ 33, 33, 33, 36, 39, 43, 47, 51, 56, 60, 64, 67, 70, 70, 31, 33, 35, 34,
+ 34, 36, 38, 41, 45, 49, 53, 57, 61, 64, 68, 68, 32, 33, 34, 36, 38, 40,
+ 42, 45, 49, 52, 56, 59, 63, 66, 69, 69, 33, 33, 34, 38, 43, 45, 48, 50,
+ 53, 56, 59, 62, 65, 68, 71, 71, 37, 36, 36, 40, 45, 48, 52, 54, 57, 60,
+ 62, 65, 68, 70, 73, 73, 41, 39, 38, 42, 48, 52, 56, 59, 61, 64, 66, 68,
+ 71, 73, 75, 75, 45, 43, 41, 45, 50, 54, 59, 61, 64, 66, 69, 71, 73, 75,
+ 77, 77, 50, 47, 45, 49, 53, 57, 61, 64, 67, 70, 72, 74, 75, 77, 78, 78,
+ 54, 51, 49, 52, 56, 60, 64, 66, 70, 72, 74, 76, 77, 79, 80, 80, 59, 56,
+ 53, 56, 59, 62, 66, 69, 72, 74, 76, 78, 79, 80, 82, 82, 63, 60, 57, 59,
+ 62, 65, 68, 71, 74, 76, 78, 79, 81, 82, 83, 83, 67, 64, 61, 63, 65, 68,
+ 71, 73, 75, 77, 79, 81, 82, 83, 84, 84, 70, 67, 64, 66, 68, 70, 73, 75,
+ 77, 79, 80, 82, 83, 84, 85, 85, 73, 70, 68, 69, 71, 73, 75, 77, 78, 80,
+ 82, 83, 84, 85, 86, 86, 73, 70, 68, 69, 71, 73, 75, 77, 78, 80, 82, 83,
+ 84, 85, 86, 86,
+ /* Size 32 */
+ 38, 36, 34, 32, 31, 31, 32, 32, 33, 34, 36, 38, 40, 42, 44, 47, 49, 51,
+ 53, 56, 58, 60, 62, 64, 66, 67, 69, 71, 72, 72, 72, 72, 36, 35, 34, 33,
+ 32, 32, 32, 33, 33, 34, 36, 38, 40, 41, 43, 46, 48, 50, 52, 54, 57, 59,
+ 60, 62, 64, 66, 68, 69, 71, 71, 71, 71, 34, 34, 33, 33, 32, 33, 33, 33,
+ 33, 34, 36, 37, 39, 41, 42, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65,
+ 66, 68, 70, 70, 70, 70, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 37,
+ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 63, 65, 67, 68, 68,
+ 68, 68, 31, 32, 32, 33, 34, 34, 34, 33, 33, 34, 35, 36, 37, 39, 41, 43,
+ 45, 46, 48, 50, 53, 54, 56, 58, 60, 62, 64, 65, 67, 67, 67, 67, 31, 32,
+ 33, 33, 34, 34, 35, 35, 35, 36, 37, 38, 40, 41, 43, 44, 46, 48, 50, 52,
+ 54, 56, 57, 59, 61, 63, 64, 66, 68, 68, 68, 68, 32, 32, 33, 33, 34, 35,
+ 35, 36, 37, 38, 39, 41, 42, 43, 45, 46, 48, 50, 52, 53, 55, 57, 59, 61,
+ 62, 64, 65, 67, 69, 69, 69, 69, 32, 33, 33, 33, 33, 35, 36, 38, 40, 41,
+ 42, 43, 44, 46, 47, 49, 50, 52, 53, 55, 57, 58, 60, 62, 63, 65, 66, 68,
+ 69, 69, 69, 69, 33, 33, 33, 33, 33, 35, 37, 40, 43, 44, 45, 46, 47, 49,
+ 50, 51, 53, 54, 55, 57, 58, 60, 61, 63, 65, 66, 67, 69, 70, 70, 70, 70,
+ 34, 34, 34, 34, 34, 36, 38, 41, 44, 45, 46, 48, 49, 50, 52, 53, 54, 56,
+ 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 71, 71, 71, 36, 36, 36, 35,
+ 35, 37, 39, 42, 45, 46, 48, 49, 51, 52, 54, 55, 56, 58, 59, 60, 62, 63,
+ 64, 66, 67, 68, 69, 71, 72, 72, 72, 72, 38, 38, 37, 37, 36, 38, 41, 43,
+ 46, 48, 49, 51, 53, 54, 56, 57, 58, 60, 61, 62, 63, 65, 66, 67, 68, 69,
+ 71, 72, 73, 73, 73, 73, 40, 40, 39, 38, 37, 40, 42, 44, 47, 49, 51, 53,
+ 55, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 74,
+ 74, 74, 42, 41, 41, 40, 39, 41, 43, 46, 49, 50, 52, 54, 57, 58, 59, 61,
+ 62, 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 75, 75, 44, 43,
+ 42, 42, 41, 43, 45, 47, 50, 52, 54, 56, 58, 59, 61, 62, 64, 65, 66, 67,
+ 68, 69, 70, 71, 72, 73, 74, 75, 76, 76, 76, 76, 47, 46, 45, 44, 43, 44,
+ 46, 49, 51, 53, 55, 57, 59, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72,
+ 73, 74, 75, 76, 77, 77, 77, 77, 49, 48, 47, 46, 45, 46, 48, 50, 53, 54,
+ 56, 58, 61, 62, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 74, 75, 76, 77,
+ 78, 78, 78, 78, 51, 50, 49, 48, 46, 48, 50, 52, 54, 56, 58, 60, 62, 63,
+ 65, 66, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 78, 78, 78, 78,
+ 53, 52, 51, 50, 48, 50, 52, 53, 55, 57, 59, 61, 63, 64, 66, 67, 69, 70,
+ 71, 72, 73, 74, 75, 75, 76, 77, 78, 78, 79, 79, 79, 79, 56, 54, 53, 52,
+ 50, 52, 53, 55, 57, 59, 60, 62, 64, 65, 67, 68, 70, 71, 72, 73, 74, 75,
+ 76, 76, 77, 78, 79, 79, 80, 80, 80, 80, 58, 57, 55, 54, 53, 54, 55, 57,
+ 58, 60, 62, 63, 65, 67, 68, 69, 71, 72, 73, 74, 75, 76, 77, 77, 78, 79,
+ 79, 80, 81, 81, 81, 81, 60, 59, 57, 56, 54, 56, 57, 58, 60, 61, 63, 65,
+ 66, 68, 69, 70, 72, 73, 74, 75, 76, 77, 77, 78, 79, 79, 80, 81, 81, 81,
+ 81, 81, 62, 60, 59, 58, 56, 57, 59, 60, 61, 63, 64, 66, 67, 69, 70, 71,
+ 73, 74, 75, 76, 77, 77, 78, 79, 80, 80, 81, 81, 82, 82, 82, 82, 64, 62,
+ 61, 60, 58, 59, 61, 62, 63, 64, 66, 67, 69, 70, 71, 72, 74, 75, 75, 76,
+ 77, 78, 79, 80, 80, 81, 81, 82, 82, 82, 82, 82, 66, 64, 63, 62, 60, 61,
+ 62, 63, 65, 66, 67, 68, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 80,
+ 81, 82, 82, 83, 83, 83, 83, 83, 67, 66, 65, 63, 62, 63, 64, 65, 66, 67,
+ 68, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 79, 80, 81, 82, 82, 83, 83,
+ 84, 84, 84, 84, 69, 68, 66, 65, 64, 64, 65, 66, 67, 68, 69, 71, 72, 73,
+ 74, 75, 76, 77, 78, 79, 79, 80, 81, 81, 82, 83, 83, 84, 84, 84, 84, 84,
+ 71, 69, 68, 67, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
+ 78, 79, 80, 81, 81, 82, 83, 83, 84, 84, 84, 84, 84, 84, 72, 71, 70, 68,
+ 67, 68, 69, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 78, 79, 80, 81, 81,
+ 82, 82, 83, 84, 84, 84, 85, 85, 85, 85, 72, 71, 70, 68, 67, 68, 69, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 78, 79, 80, 81, 81, 82, 82, 83, 84,
+ 84, 84, 85, 85, 85, 85, 72, 71, 70, 68, 67, 68, 69, 69, 70, 71, 72, 73,
+ 74, 75, 76, 77, 78, 78, 79, 80, 81, 81, 82, 82, 83, 84, 84, 84, 85, 85,
+ 85, 85, 72, 71, 70, 68, 67, 68, 69, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 78, 79, 80, 81, 81, 82, 82, 83, 84, 84, 84, 85, 85, 85, 85 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 88, 93, 107, 88, 99, 103, 112, 93, 103, 119, 127, 107, 112, 127,
+ 137,
+ /* Size 8 */
+ 64, 54, 83, 87, 90, 97, 104, 111, 54, 71, 84, 79, 81, 86, 92, 100, 83,
+ 84, 93, 91, 91, 94, 99, 104, 87, 79, 91, 98, 101, 103, 107, 111, 90, 81,
+ 91, 101, 107, 111, 115, 118, 97, 86, 94, 103, 111, 117, 121, 124, 104,
+ 92, 99, 107, 115, 121, 126, 129, 111, 100, 104, 111, 118, 124, 129, 133,
+ /* Size 16 */
+ 64, 59, 54, 66, 83, 85, 87, 88, 90, 93, 97, 100, 104, 107, 111, 111, 59,
+ 60, 62, 71, 84, 83, 83, 84, 85, 88, 91, 94, 98, 101, 105, 105, 54, 62,
+ 71, 77, 84, 82, 79, 80, 81, 83, 86, 89, 92, 96, 100, 100, 66, 71, 77,
+ 83, 88, 87, 85, 85, 85, 87, 90, 92, 95, 99, 102, 102, 83, 84, 84, 88,
+ 93, 92, 91, 91, 91, 92, 94, 96, 99, 101, 104, 104, 85, 83, 82, 87, 92,
+ 93, 95, 95, 96, 97, 98, 100, 103, 105, 108, 108, 87, 83, 79, 85, 91, 95,
+ 98, 100, 101, 102, 103, 105, 107, 109, 111, 111, 88, 84, 80, 85, 91, 95,
+ 100, 102, 104, 106, 107, 109, 111, 113, 115, 115, 90, 85, 81, 85, 91,
+ 96, 101, 104, 107, 109, 111, 113, 115, 116, 118, 118, 93, 88, 83, 87,
+ 92, 97, 102, 106, 109, 112, 114, 116, 118, 120, 121, 121, 97, 91, 86,
+ 90, 94, 98, 103, 107, 111, 114, 117, 119, 121, 123, 124, 124, 100, 94,
+ 89, 92, 96, 100, 105, 109, 113, 116, 119, 121, 123, 125, 127, 127, 104,
+ 98, 92, 95, 99, 103, 107, 111, 115, 118, 121, 123, 126, 128, 129, 129,
+ 107, 101, 96, 99, 101, 105, 109, 113, 116, 120, 123, 125, 128, 129, 131,
+ 131, 111, 105, 100, 102, 104, 108, 111, 115, 118, 121, 124, 127, 129,
+ 131, 133, 133, 111, 105, 100, 102, 104, 108, 111, 115, 118, 121, 124,
+ 127, 129, 131, 133, 133,
+ /* Size 32 */
+ 64, 61, 59, 57, 54, 60, 66, 74, 83, 84, 85, 86, 87, 87, 88, 89, 90, 92,
+ 93, 95, 97, 98, 100, 102, 104, 105, 107, 109, 111, 111, 111, 111, 61,
+ 60, 60, 59, 58, 63, 68, 75, 84, 84, 84, 84, 85, 85, 86, 87, 88, 89, 90,
+ 92, 94, 95, 97, 99, 101, 102, 104, 106, 108, 108, 108, 108, 59, 60, 60,
+ 61, 62, 66, 71, 77, 84, 84, 83, 83, 83, 83, 84, 84, 85, 86, 88, 89, 91,
+ 92, 94, 96, 98, 99, 101, 103, 105, 105, 105, 105, 57, 59, 61, 64, 66,
+ 70, 74, 79, 84, 83, 82, 82, 81, 81, 82, 82, 83, 84, 85, 87, 88, 90, 91,
+ 93, 95, 97, 98, 100, 102, 102, 102, 102, 54, 58, 62, 66, 71, 74, 77, 81,
+ 84, 83, 82, 80, 79, 79, 80, 80, 81, 82, 83, 84, 86, 87, 89, 91, 92, 94,
+ 96, 98, 100, 100, 100, 100, 60, 63, 66, 70, 74, 77, 80, 83, 86, 85, 84,
+ 83, 82, 82, 82, 83, 83, 84, 85, 86, 88, 89, 91, 92, 94, 95, 97, 99, 101,
+ 101, 101, 101, 66, 68, 71, 74, 77, 80, 83, 85, 88, 88, 87, 86, 85, 85,
+ 85, 85, 85, 86, 87, 89, 90, 91, 92, 94, 95, 97, 99, 100, 102, 102, 102,
+ 102, 74, 75, 77, 79, 81, 83, 85, 88, 91, 90, 89, 89, 88, 88, 88, 88, 88,
+ 89, 90, 91, 92, 93, 94, 96, 97, 98, 100, 102, 103, 103, 103, 103, 83,
+ 84, 84, 84, 84, 86, 88, 91, 93, 93, 92, 92, 91, 91, 91, 91, 91, 92, 92,
+ 93, 94, 95, 96, 97, 99, 100, 101, 103, 104, 104, 104, 104, 84, 84, 84,
+ 83, 83, 85, 88, 90, 93, 93, 93, 93, 93, 93, 93, 93, 93, 94, 95, 95, 96,
+ 97, 98, 99, 101, 102, 103, 105, 106, 106, 106, 106, 85, 84, 83, 82, 82,
+ 84, 87, 89, 92, 93, 93, 94, 95, 95, 95, 95, 96, 96, 97, 98, 98, 99, 100,
+ 102, 103, 104, 105, 106, 108, 108, 108, 108, 86, 84, 83, 82, 80, 83, 86,
+ 89, 92, 93, 94, 95, 96, 97, 97, 98, 98, 99, 100, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 109, 109, 109, 87, 85, 83, 81, 79, 82, 85,
+ 88, 91, 93, 95, 96, 98, 99, 100, 100, 101, 102, 102, 103, 103, 104, 105,
+ 106, 107, 108, 109, 110, 111, 111, 111, 111, 87, 85, 83, 81, 79, 82, 85,
+ 88, 91, 93, 95, 97, 99, 100, 101, 102, 102, 103, 104, 105, 105, 106,
+ 107, 108, 109, 110, 111, 112, 113, 113, 113, 113, 88, 86, 84, 82, 80,
+ 82, 85, 88, 91, 93, 95, 97, 100, 101, 102, 103, 104, 105, 106, 106, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115, 115, 115, 115, 89, 87, 84, 82,
+ 80, 83, 85, 88, 91, 93, 95, 98, 100, 102, 103, 104, 106, 107, 107, 108,
+ 109, 110, 111, 112, 113, 114, 115, 115, 116, 116, 116, 116, 90, 88, 85,
+ 83, 81, 83, 85, 88, 91, 93, 96, 98, 101, 102, 104, 106, 107, 108, 109,
+ 110, 111, 112, 113, 114, 115, 116, 116, 117, 118, 118, 118, 118, 92, 89,
+ 86, 84, 82, 84, 86, 89, 92, 94, 96, 99, 102, 103, 105, 107, 108, 109,
+ 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 120, 120, 120,
+ 93, 90, 88, 85, 83, 85, 87, 90, 92, 95, 97, 100, 102, 104, 106, 107,
+ 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 120, 121, 121,
+ 121, 121, 95, 92, 89, 87, 84, 86, 89, 91, 93, 95, 98, 100, 103, 105,
+ 106, 108, 110, 112, 113, 114, 116, 116, 117, 118, 119, 120, 121, 122,
+ 123, 123, 123, 123, 97, 94, 91, 88, 86, 88, 90, 92, 94, 96, 98, 101,
+ 103, 105, 107, 109, 111, 113, 114, 116, 117, 118, 119, 120, 121, 122,
+ 123, 124, 124, 124, 124, 124, 98, 95, 92, 90, 87, 89, 91, 93, 95, 97,
+ 99, 102, 104, 106, 108, 110, 112, 114, 115, 116, 118, 119, 120, 121,
+ 122, 123, 124, 125, 126, 126, 126, 126, 100, 97, 94, 91, 89, 91, 92, 94,
+ 96, 98, 100, 103, 105, 107, 109, 111, 113, 114, 116, 117, 119, 120, 121,
+ 122, 123, 124, 125, 126, 127, 127, 127, 127, 102, 99, 96, 93, 91, 92,
+ 94, 96, 97, 99, 102, 104, 106, 108, 110, 112, 114, 115, 117, 118, 120,
+ 121, 122, 123, 125, 125, 126, 127, 128, 128, 128, 128, 104, 101, 98, 95,
+ 92, 94, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 116, 118,
+ 119, 121, 122, 123, 125, 126, 127, 128, 128, 129, 129, 129, 129, 105,
+ 102, 99, 97, 94, 95, 97, 98, 100, 102, 104, 106, 108, 110, 112, 114,
+ 116, 117, 119, 120, 122, 123, 124, 125, 127, 128, 128, 129, 130, 130,
+ 130, 130, 107, 104, 101, 98, 96, 97, 99, 100, 101, 103, 105, 107, 109,
+ 111, 113, 115, 116, 118, 120, 121, 123, 124, 125, 126, 128, 128, 129,
+ 130, 131, 131, 131, 131, 109, 106, 103, 100, 98, 99, 100, 102, 103, 105,
+ 106, 108, 110, 112, 114, 115, 117, 119, 120, 122, 124, 125, 126, 127,
+ 128, 129, 130, 131, 132, 132, 132, 132, 111, 108, 105, 102, 100, 101,
+ 102, 103, 104, 106, 108, 109, 111, 113, 115, 116, 118, 120, 121, 123,
+ 124, 126, 127, 128, 129, 130, 131, 132, 133, 133, 133, 133, 111, 108,
+ 105, 102, 100, 101, 102, 103, 104, 106, 108, 109, 111, 113, 115, 116,
+ 118, 120, 121, 123, 124, 126, 127, 128, 129, 130, 131, 132, 133, 133,
+ 133, 133, 111, 108, 105, 102, 100, 101, 102, 103, 104, 106, 108, 109,
+ 111, 113, 115, 116, 118, 120, 121, 123, 124, 126, 127, 128, 129, 130,
+ 131, 132, 133, 133, 133, 133, 111, 108, 105, 102, 100, 101, 102, 103,
+ 104, 106, 108, 109, 111, 113, 115, 116, 118, 120, 121, 123, 124, 126,
+ 127, 128, 129, 130, 131, 132, 133, 133, 133, 133 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 37, 52, 55, 64, 52, 59, 62, 67, 55, 62, 72, 77, 64, 67, 77, 84,
+ /* Size 8 */
+ 40, 33, 52, 54, 57, 61, 66, 71, 33, 44, 53, 49, 50, 54, 58, 63, 52, 53,
+ 59, 57, 57, 59, 63, 66, 54, 49, 57, 62, 64, 66, 68, 71, 57, 50, 57, 64,
+ 68, 71, 74, 76, 61, 54, 59, 66, 71, 75, 78, 80, 66, 58, 63, 68, 74, 78,
+ 81, 84, 71, 63, 66, 71, 76, 80, 84, 86,
+ /* Size 16 */
+ 39, 36, 33, 40, 51, 52, 53, 55, 56, 58, 60, 62, 65, 67, 70, 70, 36, 37,
+ 38, 43, 52, 51, 51, 52, 53, 54, 56, 58, 61, 63, 66, 66, 33, 38, 44, 47,
+ 52, 50, 49, 49, 50, 51, 53, 55, 57, 60, 62, 62, 40, 43, 47, 51, 55, 53,
+ 52, 52, 53, 54, 55, 57, 59, 61, 64, 64, 51, 52, 52, 55, 58, 57, 56, 56,
+ 56, 57, 58, 60, 61, 63, 65, 65, 52, 51, 50, 53, 57, 58, 59, 59, 59, 60,
+ 61, 63, 64, 66, 68, 68, 53, 51, 49, 52, 56, 59, 61, 62, 63, 64, 65, 66,
+ 67, 68, 70, 70, 55, 52, 49, 52, 56, 59, 62, 64, 65, 66, 67, 68, 70, 71,
+ 72, 72, 56, 53, 50, 53, 56, 59, 63, 65, 67, 69, 70, 71, 72, 73, 75, 75,
+ 58, 54, 51, 54, 57, 60, 64, 66, 69, 70, 72, 73, 74, 76, 77, 77, 60, 56,
+ 53, 55, 58, 61, 65, 67, 70, 72, 74, 75, 77, 78, 79, 79, 62, 58, 55, 57,
+ 60, 63, 66, 68, 71, 73, 75, 77, 78, 79, 81, 81, 65, 61, 57, 59, 61, 64,
+ 67, 70, 72, 74, 77, 78, 80, 81, 82, 82, 67, 63, 60, 61, 63, 66, 68, 71,
+ 73, 76, 78, 79, 81, 82, 84, 84, 70, 66, 62, 64, 65, 68, 70, 72, 75, 77,
+ 79, 81, 82, 84, 85, 85, 70, 66, 62, 64, 65, 68, 70, 72, 75, 77, 79, 81,
+ 82, 84, 85, 85,
+ /* Size 32 */
+ 39, 37, 35, 34, 33, 36, 40, 45, 51, 51, 52, 52, 53, 54, 54, 55, 55, 56,
+ 57, 58, 59, 61, 62, 63, 64, 65, 67, 68, 69, 69, 69, 69, 37, 36, 36, 35,
+ 35, 38, 41, 46, 51, 51, 51, 52, 52, 52, 53, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 66, 67, 67, 67, 67, 35, 36, 36, 37, 37, 40, 43, 47,
+ 51, 51, 51, 51, 50, 51, 51, 52, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+ 63, 64, 65, 65, 65, 65, 34, 35, 37, 38, 40, 42, 45, 48, 51, 51, 50, 50,
+ 49, 50, 50, 50, 51, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 63,
+ 63, 63, 33, 35, 37, 40, 43, 45, 47, 49, 52, 51, 50, 49, 48, 48, 49, 49,
+ 49, 50, 51, 52, 52, 53, 55, 56, 57, 58, 59, 60, 62, 62, 62, 62, 36, 38,
+ 40, 42, 45, 47, 49, 51, 53, 52, 51, 51, 50, 50, 50, 50, 51, 51, 52, 53,
+ 54, 55, 56, 57, 58, 59, 60, 61, 62, 62, 62, 62, 40, 41, 43, 45, 47, 49,
+ 50, 52, 54, 54, 53, 52, 52, 52, 52, 52, 52, 53, 54, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 63, 63, 63, 45, 46, 47, 48, 49, 51, 52, 54, 56, 55,
+ 55, 54, 54, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 64, 64, 64, 51, 51, 51, 51, 52, 53, 54, 56, 57, 57, 57, 56, 56, 56,
+ 56, 56, 56, 56, 57, 57, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 65, 65,
+ 51, 51, 51, 51, 51, 52, 54, 55, 57, 57, 57, 57, 57, 57, 57, 57, 57, 58,
+ 58, 59, 59, 60, 61, 61, 62, 63, 64, 65, 66, 66, 66, 66, 52, 51, 51, 50,
+ 50, 51, 53, 55, 57, 57, 57, 58, 58, 58, 59, 59, 59, 59, 60, 60, 61, 61,
+ 62, 63, 64, 64, 65, 66, 67, 67, 67, 67, 52, 52, 51, 50, 49, 51, 52, 54,
+ 56, 57, 58, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62, 63, 64, 64, 65, 66,
+ 66, 67, 68, 68, 68, 68, 53, 52, 50, 49, 48, 50, 52, 54, 56, 57, 58, 59,
+ 61, 61, 61, 62, 62, 63, 63, 64, 64, 65, 65, 66, 66, 67, 68, 69, 69, 69,
+ 69, 69, 54, 52, 51, 50, 48, 50, 52, 54, 56, 57, 58, 60, 61, 62, 62, 63,
+ 63, 64, 64, 65, 65, 66, 66, 67, 68, 68, 69, 70, 70, 70, 70, 70, 54, 53,
+ 51, 50, 49, 50, 52, 54, 56, 57, 59, 60, 61, 62, 63, 64, 64, 65, 66, 66,
+ 67, 67, 68, 68, 69, 70, 70, 71, 72, 72, 72, 72, 55, 53, 52, 50, 49, 50,
+ 52, 54, 56, 57, 59, 60, 62, 63, 64, 65, 66, 66, 67, 67, 68, 69, 69, 70,
+ 70, 71, 71, 72, 73, 73, 73, 73, 55, 54, 52, 51, 49, 51, 52, 54, 56, 57,
+ 59, 61, 62, 63, 64, 66, 67, 67, 68, 69, 69, 70, 70, 71, 72, 72, 73, 73,
+ 74, 74, 74, 74, 56, 55, 53, 51, 50, 51, 53, 55, 56, 58, 59, 61, 63, 64,
+ 65, 66, 67, 68, 69, 70, 70, 71, 71, 72, 73, 73, 74, 74, 75, 75, 75, 75,
+ 57, 56, 54, 52, 51, 52, 54, 55, 57, 58, 60, 61, 63, 64, 66, 67, 68, 69,
+ 70, 70, 71, 72, 72, 73, 74, 74, 75, 75, 76, 76, 76, 76, 58, 57, 55, 53,
+ 52, 53, 54, 56, 57, 59, 60, 62, 64, 65, 66, 67, 69, 70, 70, 71, 72, 73,
+ 73, 74, 75, 75, 76, 77, 77, 77, 77, 77, 59, 58, 56, 54, 52, 54, 55, 56,
+ 58, 59, 61, 62, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 76,
+ 77, 78, 78, 78, 78, 78, 61, 59, 57, 55, 53, 55, 56, 57, 59, 60, 61, 63,
+ 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 77, 78, 78, 79, 79,
+ 79, 79, 62, 60, 58, 56, 55, 56, 57, 58, 59, 61, 62, 64, 65, 66, 68, 69,
+ 70, 71, 72, 73, 75, 75, 76, 77, 77, 78, 79, 79, 80, 80, 80, 80, 63, 61,
+ 59, 57, 56, 57, 58, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 72, 73, 74,
+ 75, 76, 77, 77, 78, 79, 79, 80, 81, 81, 81, 81, 64, 62, 60, 58, 57, 58,
+ 59, 60, 61, 62, 64, 65, 66, 68, 69, 70, 72, 73, 74, 75, 76, 77, 77, 78,
+ 79, 80, 80, 81, 82, 82, 82, 82, 65, 63, 61, 60, 58, 59, 60, 61, 62, 63,
+ 64, 66, 67, 68, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 80, 81, 82,
+ 82, 82, 82, 82, 67, 64, 63, 61, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69,
+ 70, 71, 73, 74, 75, 76, 77, 78, 79, 79, 80, 81, 82, 82, 83, 83, 83, 83,
+ 68, 66, 64, 62, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74,
+ 75, 77, 78, 78, 79, 80, 81, 82, 82, 83, 84, 84, 84, 84, 69, 67, 65, 63,
+ 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 82, 83, 84, 84, 84, 84, 84, 69, 67, 65, 63, 62, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 82,
+ 83, 84, 84, 84, 84, 84, 69, 67, 65, 63, 62, 62, 63, 64, 65, 66, 67, 68,
+ 69, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 82, 83, 84, 84, 84,
+ 84, 84, 69, 67, 65, 63, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73,
+ 74, 75, 76, 77, 78, 79, 80, 81, 82, 82, 83, 84, 84, 84, 84, 84 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 68, 92, 113, 68, 87, 103, 117, 92, 103, 119, 128, 113, 117, 128,
+ 134,
+ /* Size 8 */
+ 64, 53, 56, 67, 79, 91, 100, 107, 53, 58, 56, 63, 73, 84, 93, 101, 56,
+ 56, 70, 77, 83, 91, 98, 105, 67, 63, 77, 87, 94, 99, 104, 109, 79, 73,
+ 83, 94, 101, 106, 110, 113, 91, 84, 91, 99, 106, 110, 114, 116, 100, 93,
+ 98, 104, 110, 114, 117, 119, 107, 101, 105, 109, 113, 116, 119, 121,
+ /* Size 16 */
+ 64, 58, 53, 54, 56, 61, 67, 72, 79, 84, 91, 95, 100, 103, 107, 107, 58,
+ 57, 55, 56, 56, 60, 65, 70, 76, 81, 87, 91, 96, 100, 104, 104, 53, 55,
+ 58, 57, 56, 59, 63, 67, 73, 78, 84, 88, 93, 97, 101, 101, 54, 56, 57,
+ 60, 63, 66, 69, 73, 78, 82, 87, 91, 96, 99, 103, 103, 56, 56, 56, 63,
+ 70, 73, 77, 80, 83, 87, 91, 94, 98, 101, 105, 105, 61, 60, 59, 66, 73,
+ 77, 81, 85, 88, 91, 95, 98, 101, 104, 107, 107, 67, 65, 63, 69, 77, 81,
+ 87, 90, 94, 96, 99, 102, 104, 106, 109, 109, 72, 70, 67, 73, 80, 85, 90,
+ 93, 97, 100, 102, 104, 107, 109, 111, 111, 79, 76, 73, 78, 83, 88, 94,
+ 97, 101, 103, 106, 108, 110, 111, 113, 113, 84, 81, 78, 82, 87, 91, 96,
+ 100, 103, 105, 108, 110, 112, 113, 115, 115, 91, 87, 84, 87, 91, 95, 99,
+ 102, 106, 108, 110, 112, 114, 115, 116, 116, 95, 91, 88, 91, 94, 98,
+ 102, 104, 108, 110, 112, 114, 115, 116, 118, 118, 100, 96, 93, 96, 98,
+ 101, 104, 107, 110, 112, 114, 115, 117, 118, 119, 119, 103, 100, 97, 99,
+ 101, 104, 106, 109, 111, 113, 115, 116, 118, 119, 120, 120, 107, 104,
+ 101, 103, 105, 107, 109, 111, 113, 115, 116, 118, 119, 120, 121, 121,
+ 107, 104, 101, 103, 105, 107, 109, 111, 113, 115, 116, 118, 119, 120,
+ 121, 121,
+ /* Size 32 */
+ 64, 61, 58, 55, 53, 54, 54, 55, 56, 58, 61, 64, 67, 70, 72, 76, 79, 82,
+ 84, 87, 91, 93, 95, 97, 100, 102, 103, 105, 107, 107, 107, 107, 61, 59,
+ 57, 56, 54, 55, 55, 56, 56, 58, 60, 63, 66, 68, 71, 74, 78, 80, 83, 86,
+ 89, 91, 93, 96, 98, 100, 102, 104, 106, 106, 106, 106, 58, 57, 57, 56,
+ 55, 56, 56, 56, 56, 58, 60, 62, 65, 67, 70, 73, 76, 78, 81, 84, 87, 89,
+ 91, 94, 96, 98, 100, 102, 104, 104, 104, 104, 55, 56, 56, 56, 57, 57,
+ 57, 56, 56, 58, 60, 62, 64, 66, 69, 71, 74, 77, 79, 82, 85, 87, 90, 92,
+ 95, 97, 99, 101, 103, 103, 103, 103, 53, 54, 55, 57, 58, 58, 57, 57, 56,
+ 58, 59, 61, 63, 65, 67, 70, 73, 75, 78, 81, 84, 86, 88, 91, 93, 95, 97,
+ 99, 101, 101, 101, 101, 54, 55, 56, 57, 58, 58, 59, 59, 59, 61, 62, 64,
+ 66, 68, 70, 73, 75, 77, 80, 82, 85, 87, 90, 92, 94, 96, 98, 100, 102,
+ 102, 102, 102, 54, 55, 56, 57, 57, 59, 60, 61, 63, 64, 66, 67, 69, 71,
+ 73, 75, 78, 80, 82, 84, 87, 89, 91, 93, 96, 97, 99, 101, 103, 103, 103,
+ 103, 55, 56, 56, 56, 57, 59, 61, 63, 66, 68, 69, 71, 73, 74, 76, 78, 80,
+ 82, 84, 87, 89, 91, 93, 95, 97, 99, 100, 102, 104, 104, 104, 104, 56,
+ 56, 56, 56, 56, 59, 63, 66, 70, 72, 73, 75, 77, 78, 80, 81, 83, 85, 87,
+ 89, 91, 93, 94, 96, 98, 100, 101, 103, 105, 105, 105, 105, 58, 58, 58,
+ 58, 58, 61, 64, 68, 72, 73, 75, 77, 79, 80, 82, 84, 86, 87, 89, 91, 93,
+ 94, 96, 98, 100, 101, 103, 104, 106, 106, 106, 106, 61, 60, 60, 60, 59,
+ 62, 66, 69, 73, 75, 77, 79, 81, 83, 85, 86, 88, 90, 91, 93, 95, 96, 98,
+ 99, 101, 102, 104, 105, 107, 107, 107, 107, 64, 63, 62, 62, 61, 64, 67,
+ 71, 75, 77, 79, 82, 84, 86, 87, 89, 91, 92, 94, 95, 97, 98, 100, 101,
+ 103, 104, 105, 106, 108, 108, 108, 108, 67, 66, 65, 64, 63, 66, 69, 73,
+ 77, 79, 81, 84, 87, 88, 90, 92, 94, 95, 96, 98, 99, 100, 102, 103, 104,
+ 105, 106, 108, 109, 109, 109, 109, 70, 68, 67, 66, 65, 68, 71, 74, 78,
+ 80, 83, 86, 88, 90, 92, 93, 95, 97, 98, 99, 101, 102, 103, 104, 106,
+ 107, 108, 109, 110, 110, 110, 110, 72, 71, 70, 69, 67, 70, 73, 76, 80,
+ 82, 85, 87, 90, 92, 93, 95, 97, 98, 100, 101, 102, 103, 104, 106, 107,
+ 108, 109, 110, 111, 111, 111, 111, 76, 74, 73, 71, 70, 73, 75, 78, 81,
+ 84, 86, 89, 92, 93, 95, 97, 99, 100, 101, 103, 104, 105, 106, 107, 108,
+ 109, 110, 111, 112, 112, 112, 112, 79, 78, 76, 74, 73, 75, 78, 80, 83,
+ 86, 88, 91, 94, 95, 97, 99, 101, 102, 103, 104, 106, 107, 108, 109, 110,
+ 110, 111, 112, 113, 113, 113, 113, 82, 80, 78, 77, 75, 77, 80, 82, 85,
+ 87, 90, 92, 95, 97, 98, 100, 102, 103, 104, 106, 107, 108, 109, 110,
+ 111, 111, 112, 113, 114, 114, 114, 114, 84, 83, 81, 79, 78, 80, 82, 84,
+ 87, 89, 91, 94, 96, 98, 100, 101, 103, 104, 105, 107, 108, 109, 110,
+ 111, 112, 112, 113, 114, 115, 115, 115, 115, 87, 86, 84, 82, 81, 82, 84,
+ 87, 89, 91, 93, 95, 98, 99, 101, 103, 104, 106, 107, 108, 109, 110, 111,
+ 112, 113, 113, 114, 115, 116, 116, 116, 116, 91, 89, 87, 85, 84, 85, 87,
+ 89, 91, 93, 95, 97, 99, 101, 102, 104, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 114, 115, 116, 116, 116, 116, 116, 93, 91, 89, 87, 86,
+ 87, 89, 91, 93, 94, 96, 98, 100, 102, 103, 105, 107, 108, 109, 110, 111,
+ 112, 113, 114, 114, 115, 116, 116, 117, 117, 117, 117, 95, 93, 91, 90,
+ 88, 90, 91, 93, 94, 96, 98, 100, 102, 103, 104, 106, 108, 109, 110, 111,
+ 112, 113, 114, 114, 115, 116, 116, 117, 118, 118, 118, 118, 97, 96, 94,
+ 92, 91, 92, 93, 95, 96, 98, 99, 101, 103, 104, 106, 107, 109, 110, 111,
+ 112, 113, 114, 114, 115, 116, 117, 117, 118, 118, 118, 118, 118, 100,
+ 98, 96, 95, 93, 94, 96, 97, 98, 100, 101, 103, 104, 106, 107, 108, 110,
+ 111, 112, 113, 114, 114, 115, 116, 117, 117, 118, 118, 119, 119, 119,
+ 119, 102, 100, 98, 97, 95, 96, 97, 99, 100, 101, 102, 104, 105, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 117, 118, 118, 119,
+ 119, 119, 119, 119, 103, 102, 100, 99, 97, 98, 99, 100, 101, 103, 104,
+ 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 116, 117, 118,
+ 118, 119, 119, 120, 120, 120, 120, 105, 104, 102, 101, 99, 100, 101,
+ 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 116, 117, 118, 118, 119, 119, 120, 120, 120, 120, 120, 107, 106, 104,
+ 103, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
+ 114, 115, 116, 116, 117, 118, 118, 119, 119, 120, 120, 121, 121, 121,
+ 121, 107, 106, 104, 103, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+ 110, 111, 112, 113, 114, 115, 116, 116, 117, 118, 118, 119, 119, 120,
+ 120, 121, 121, 121, 121, 107, 106, 104, 103, 101, 102, 103, 104, 105,
+ 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 116, 117, 118,
+ 118, 119, 119, 120, 120, 121, 121, 121, 121, 107, 106, 104, 103, 101,
+ 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+ 116, 116, 117, 118, 118, 119, 119, 120, 120, 121, 121, 121, 121 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 38, 40, 55, 69, 40, 52, 62, 72, 55, 62, 73, 79, 69, 72, 79, 83,
+ /* Size 8 */
+ 43, 35, 37, 45, 53, 62, 68, 74, 35, 39, 37, 42, 49, 56, 64, 70, 37, 37,
+ 47, 52, 56, 62, 67, 72, 45, 42, 52, 59, 64, 68, 72, 75, 53, 49, 56, 64,
+ 69, 73, 76, 78, 62, 56, 62, 68, 73, 76, 79, 81, 68, 64, 67, 72, 76, 79,
+ 81, 83, 74, 70, 72, 75, 78, 81, 83, 84,
+ /* Size 16 */
+ 42, 38, 34, 35, 36, 40, 44, 48, 52, 56, 60, 63, 67, 70, 72, 72, 38, 37,
+ 36, 36, 36, 39, 42, 46, 50, 54, 58, 61, 65, 67, 70, 70, 34, 36, 38, 37,
+ 37, 39, 41, 44, 48, 51, 55, 59, 62, 65, 68, 68, 35, 36, 37, 39, 41, 43,
+ 45, 48, 51, 54, 58, 61, 64, 67, 69, 69, 36, 36, 37, 41, 46, 48, 50, 53,
+ 55, 58, 61, 63, 66, 68, 70, 70, 40, 39, 39, 43, 48, 51, 54, 56, 59, 61,
+ 63, 66, 68, 70, 72, 72, 44, 42, 41, 45, 50, 54, 58, 60, 62, 64, 66, 68,
+ 70, 72, 74, 74, 48, 46, 44, 48, 53, 56, 60, 62, 65, 67, 69, 70, 72, 74,
+ 75, 75, 52, 50, 48, 51, 55, 59, 62, 65, 68, 69, 71, 73, 74, 75, 77, 77,
+ 56, 54, 51, 54, 58, 61, 64, 67, 69, 71, 73, 74, 76, 77, 78, 78, 60, 58,
+ 55, 58, 61, 63, 66, 69, 71, 73, 75, 76, 77, 78, 79, 79, 63, 61, 59, 61,
+ 63, 66, 68, 70, 73, 74, 76, 77, 78, 79, 80, 80, 67, 65, 62, 64, 66, 68,
+ 70, 72, 74, 76, 77, 78, 79, 80, 81, 81, 70, 67, 65, 67, 68, 70, 72, 74,
+ 75, 77, 78, 79, 80, 81, 82, 82, 72, 70, 68, 69, 70, 72, 74, 75, 77, 78,
+ 79, 80, 81, 82, 82, 82, 72, 70, 68, 69, 70, 72, 74, 75, 77, 78, 79, 80,
+ 81, 82, 82, 82,
+ /* Size 32 */
+ 41, 39, 37, 35, 34, 34, 35, 35, 36, 37, 39, 41, 43, 45, 47, 49, 52, 54,
+ 55, 57, 60, 61, 63, 65, 66, 68, 69, 70, 72, 72, 72, 72, 39, 38, 37, 36,
+ 35, 35, 35, 36, 36, 37, 39, 41, 43, 44, 46, 48, 51, 52, 54, 56, 58, 60,
+ 62, 63, 65, 66, 68, 69, 70, 70, 70, 70, 37, 37, 36, 36, 36, 36, 36, 36,
+ 36, 37, 39, 40, 42, 43, 45, 47, 49, 51, 53, 55, 57, 59, 60, 62, 64, 65,
+ 67, 68, 69, 69, 69, 69, 35, 36, 36, 36, 36, 36, 36, 36, 36, 37, 38, 40,
+ 41, 43, 44, 46, 48, 50, 52, 54, 56, 57, 59, 61, 63, 64, 65, 67, 68, 68,
+ 68, 68, 34, 35, 36, 36, 37, 37, 37, 37, 36, 37, 38, 39, 40, 42, 44, 45,
+ 47, 49, 51, 53, 55, 56, 58, 60, 62, 63, 64, 66, 67, 67, 67, 67, 34, 35,
+ 36, 36, 37, 37, 38, 38, 38, 39, 40, 41, 42, 44, 45, 47, 49, 51, 52, 54,
+ 56, 57, 59, 61, 62, 64, 65, 66, 68, 68, 68, 68, 35, 35, 36, 36, 37, 38,
+ 38, 39, 40, 41, 42, 44, 45, 46, 48, 49, 51, 52, 54, 55, 57, 59, 60, 62,
+ 63, 65, 66, 67, 69, 69, 69, 69, 35, 36, 36, 36, 37, 38, 39, 41, 43, 44,
+ 45, 46, 47, 48, 50, 51, 53, 54, 55, 57, 59, 60, 61, 63, 64, 65, 67, 68,
+ 69, 69, 69, 69, 36, 36, 36, 36, 36, 38, 40, 43, 45, 46, 48, 49, 50, 51,
+ 52, 53, 55, 56, 57, 58, 60, 61, 62, 64, 65, 66, 67, 69, 70, 70, 70, 70,
+ 37, 37, 37, 37, 37, 39, 41, 44, 46, 48, 49, 50, 52, 53, 54, 55, 56, 57,
+ 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 71, 71, 71, 71, 39, 39, 39, 38,
+ 38, 40, 42, 45, 48, 49, 50, 52, 53, 54, 56, 57, 58, 59, 60, 61, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 71, 71, 71, 41, 41, 40, 40, 39, 41, 44, 46,
+ 49, 50, 52, 53, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 72, 72, 72, 43, 43, 42, 41, 40, 42, 45, 47, 50, 52, 53, 55,
+ 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 73,
+ 73, 73, 45, 44, 43, 43, 42, 44, 46, 48, 51, 53, 54, 56, 58, 59, 61, 62,
+ 63, 64, 65, 66, 67, 68, 69, 69, 70, 71, 72, 73, 74, 74, 74, 74, 47, 46,
+ 45, 44, 44, 45, 48, 50, 52, 54, 56, 57, 59, 61, 62, 63, 64, 65, 66, 67,
+ 68, 69, 70, 71, 71, 72, 73, 74, 74, 74, 74, 74, 49, 48, 47, 46, 45, 47,
+ 49, 51, 53, 55, 57, 59, 61, 62, 63, 64, 66, 66, 67, 68, 69, 70, 71, 72,
+ 72, 73, 74, 74, 75, 75, 75, 75, 52, 51, 49, 48, 47, 49, 51, 53, 55, 56,
+ 58, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 71, 72, 73, 73, 74, 75, 75,
+ 76, 76, 76, 76, 54, 52, 51, 50, 49, 51, 52, 54, 56, 57, 59, 61, 63, 64,
+ 65, 66, 68, 69, 70, 70, 71, 72, 73, 73, 74, 75, 75, 76, 76, 76, 76, 76,
+ 55, 54, 53, 52, 51, 52, 54, 55, 57, 59, 60, 62, 64, 65, 66, 67, 69, 70,
+ 70, 71, 72, 73, 73, 74, 75, 75, 76, 77, 77, 77, 77, 77, 57, 56, 55, 54,
+ 53, 54, 55, 57, 58, 60, 61, 63, 65, 66, 67, 68, 70, 70, 71, 72, 73, 74,
+ 74, 75, 76, 76, 77, 77, 78, 78, 78, 78, 60, 58, 57, 56, 55, 56, 57, 59,
+ 60, 61, 63, 64, 66, 67, 68, 69, 71, 71, 72, 73, 74, 75, 75, 76, 76, 77,
+ 77, 78, 78, 78, 78, 78, 61, 60, 59, 57, 56, 57, 59, 60, 61, 62, 64, 65,
+ 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79,
+ 79, 79, 63, 62, 60, 59, 58, 59, 60, 61, 62, 64, 65, 66, 68, 69, 70, 71,
+ 72, 73, 73, 74, 75, 76, 76, 77, 77, 78, 78, 79, 79, 79, 79, 79, 65, 63,
+ 62, 61, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73, 73, 74, 75,
+ 76, 76, 77, 77, 78, 78, 79, 79, 80, 80, 80, 80, 66, 65, 64, 63, 62, 62,
+ 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 76, 77, 77, 78,
+ 79, 79, 79, 80, 80, 80, 80, 80, 68, 66, 65, 64, 63, 64, 65, 65, 66, 67,
+ 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 77, 78, 78, 79, 79, 80, 80,
+ 81, 81, 81, 81, 69, 68, 67, 65, 64, 65, 66, 67, 67, 68, 69, 70, 71, 72,
+ 73, 74, 75, 75, 76, 77, 77, 78, 78, 79, 79, 80, 80, 81, 81, 81, 81, 81,
+ 70, 69, 68, 67, 66, 66, 67, 68, 69, 69, 70, 71, 72, 73, 74, 74, 75, 76,
+ 77, 77, 78, 78, 79, 79, 80, 80, 81, 81, 81, 81, 81, 81, 72, 70, 69, 68,
+ 67, 68, 69, 69, 70, 71, 71, 72, 73, 74, 74, 75, 76, 76, 77, 78, 78, 79,
+ 79, 80, 80, 81, 81, 81, 82, 82, 82, 82, 72, 70, 69, 68, 67, 68, 69, 69,
+ 70, 71, 71, 72, 73, 74, 74, 75, 76, 76, 77, 78, 78, 79, 79, 80, 80, 81,
+ 81, 81, 82, 82, 82, 82, 72, 70, 69, 68, 67, 68, 69, 69, 70, 71, 71, 72,
+ 73, 74, 74, 75, 76, 76, 77, 78, 78, 79, 79, 80, 80, 81, 81, 81, 82, 82,
+ 82, 82, 72, 70, 69, 68, 67, 68, 69, 69, 70, 71, 71, 72, 73, 74, 74, 75,
+ 76, 76, 77, 78, 78, 79, 79, 80, 80, 81, 81, 81, 82, 82, 82, 82 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 86, 90, 101, 86, 95, 98, 105, 90, 98, 110, 117, 101, 105, 117, 125,
+ /* Size 8 */
+ 64, 55, 81, 84, 87, 92, 98, 104, 55, 71, 82, 77, 79, 83, 89, 95, 81, 82,
+ 89, 88, 88, 90, 94, 99, 84, 77, 88, 94, 96, 98, 101, 104, 87, 79, 88,
+ 96, 101, 104, 107, 110, 92, 83, 90, 98, 104, 109, 112, 115, 98, 89, 94,
+ 101, 107, 112, 116, 118, 104, 95, 99, 104, 110, 115, 118, 121,
+ /* Size 16 */
+ 64, 59, 55, 66, 81, 82, 84, 85, 87, 90, 92, 95, 98, 101, 104, 104, 59,
+ 61, 62, 70, 82, 81, 80, 82, 83, 85, 87, 90, 93, 96, 99, 99, 55, 62, 71,
+ 76, 82, 80, 77, 78, 79, 81, 83, 86, 89, 92, 95, 95, 66, 70, 76, 80, 86,
+ 84, 82, 83, 83, 85, 86, 89, 91, 94, 97, 97, 81, 82, 82, 86, 89, 89, 88,
+ 88, 88, 89, 90, 92, 94, 96, 99, 99, 82, 81, 80, 84, 89, 90, 91, 91, 92,
+ 93, 94, 96, 97, 99, 101, 101, 84, 80, 77, 82, 88, 91, 94, 95, 96, 97,
+ 98, 99, 101, 102, 104, 104, 85, 82, 78, 83, 88, 91, 95, 97, 98, 100,
+ 101, 102, 104, 105, 107, 107, 87, 83, 79, 83, 88, 92, 96, 98, 101, 103,
+ 104, 106, 107, 108, 110, 110, 90, 85, 81, 85, 89, 93, 97, 100, 103, 105,
+ 107, 108, 109, 111, 112, 112, 92, 87, 83, 86, 90, 94, 98, 101, 104, 107,
+ 109, 110, 112, 113, 115, 115, 95, 90, 86, 89, 92, 96, 99, 102, 106, 108,
+ 110, 112, 114, 115, 116, 116, 98, 93, 89, 91, 94, 97, 101, 104, 107,
+ 109, 112, 114, 116, 117, 118, 118, 101, 96, 92, 94, 96, 99, 102, 105,
+ 108, 111, 113, 115, 117, 118, 120, 120, 104, 99, 95, 97, 99, 101, 104,
+ 107, 110, 112, 115, 116, 118, 120, 121, 121, 104, 99, 95, 97, 99, 101,
+ 104, 107, 110, 112, 115, 116, 118, 120, 121, 121,
+ /* Size 32 */
+ 64, 62, 59, 57, 55, 60, 66, 73, 81, 82, 82, 83, 84, 85, 85, 86, 87, 88,
+ 90, 91, 92, 94, 95, 97, 98, 100, 101, 102, 104, 104, 104, 104, 62, 61,
+ 60, 59, 58, 63, 68, 74, 81, 82, 82, 82, 82, 83, 83, 84, 85, 86, 87, 88,
+ 90, 91, 93, 94, 96, 97, 98, 100, 101, 101, 101, 101, 59, 60, 61, 61, 62,
+ 66, 70, 76, 82, 81, 81, 81, 80, 81, 82, 82, 83, 84, 85, 86, 87, 89, 90,
+ 92, 93, 95, 96, 98, 99, 99, 99, 99, 57, 59, 61, 64, 66, 69, 73, 77, 82,
+ 81, 80, 80, 79, 79, 80, 80, 81, 82, 83, 84, 85, 87, 88, 89, 91, 92, 94,
+ 95, 97, 97, 97, 97, 55, 58, 62, 66, 71, 73, 76, 79, 82, 81, 80, 78, 77,
+ 78, 78, 78, 79, 80, 81, 82, 83, 84, 86, 87, 89, 90, 92, 93, 95, 95, 95,
+ 95, 60, 63, 66, 69, 73, 76, 78, 81, 84, 83, 82, 81, 80, 80, 80, 81, 81,
+ 82, 83, 84, 85, 86, 87, 89, 90, 91, 93, 94, 96, 96, 96, 96, 66, 68, 70,
+ 73, 76, 78, 80, 83, 86, 85, 84, 83, 82, 82, 83, 83, 83, 84, 85, 86, 86,
+ 88, 89, 90, 91, 93, 94, 95, 97, 97, 97, 97, 73, 74, 76, 77, 79, 81, 83,
+ 85, 87, 87, 86, 86, 85, 85, 85, 85, 85, 86, 87, 87, 88, 89, 90, 92, 93,
+ 94, 95, 96, 98, 98, 98, 98, 81, 81, 82, 82, 82, 84, 86, 87, 89, 89, 89,
+ 88, 88, 88, 88, 88, 88, 88, 89, 89, 90, 91, 92, 93, 94, 95, 96, 98, 99,
+ 99, 99, 99, 82, 82, 81, 81, 81, 83, 85, 87, 89, 89, 89, 89, 89, 89, 89,
+ 89, 90, 90, 91, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 100, 100, 100,
+ 82, 82, 81, 80, 80, 82, 84, 86, 89, 89, 90, 90, 91, 91, 91, 91, 92, 92,
+ 93, 93, 94, 95, 96, 96, 97, 98, 99, 100, 101, 101, 101, 101, 83, 82, 81,
+ 80, 78, 81, 83, 86, 88, 89, 90, 91, 92, 92, 93, 93, 94, 94, 95, 95, 96,
+ 97, 97, 98, 99, 100, 101, 102, 103, 103, 103, 103, 84, 82, 80, 79, 77,
+ 80, 82, 85, 88, 89, 91, 92, 94, 94, 95, 95, 96, 96, 97, 97, 98, 99, 99,
+ 100, 101, 102, 102, 103, 104, 104, 104, 104, 85, 83, 81, 79, 78, 80, 82,
+ 85, 88, 89, 91, 92, 94, 95, 96, 96, 97, 98, 98, 99, 99, 100, 101, 102,
+ 102, 103, 104, 105, 106, 106, 106, 106, 85, 83, 82, 80, 78, 80, 83, 85,
+ 88, 89, 91, 93, 95, 96, 97, 98, 98, 99, 100, 100, 101, 102, 102, 103,
+ 104, 105, 105, 106, 107, 107, 107, 107, 86, 84, 82, 80, 78, 81, 83, 85,
+ 88, 89, 91, 93, 95, 96, 98, 99, 100, 100, 101, 102, 103, 103, 104, 105,
+ 105, 106, 107, 108, 108, 108, 108, 108, 87, 85, 83, 81, 79, 81, 83, 85,
+ 88, 90, 92, 94, 96, 97, 98, 100, 101, 102, 103, 104, 104, 105, 106, 106,
+ 107, 108, 108, 109, 110, 110, 110, 110, 88, 86, 84, 82, 80, 82, 84, 86,
+ 88, 90, 92, 94, 96, 98, 99, 100, 102, 103, 104, 105, 105, 106, 107, 108,
+ 108, 109, 110, 110, 111, 111, 111, 111, 90, 87, 85, 83, 81, 83, 85, 87,
+ 89, 91, 93, 95, 97, 98, 100, 101, 103, 104, 105, 106, 107, 107, 108,
+ 109, 109, 110, 111, 111, 112, 112, 112, 112, 91, 88, 86, 84, 82, 84, 86,
+ 87, 89, 91, 93, 95, 97, 99, 100, 102, 104, 105, 106, 107, 108, 108, 109,
+ 110, 111, 111, 112, 113, 113, 113, 113, 113, 92, 90, 87, 85, 83, 85, 86,
+ 88, 90, 92, 94, 96, 98, 99, 101, 103, 104, 105, 107, 108, 109, 110, 110,
+ 111, 112, 113, 113, 114, 115, 115, 115, 115, 94, 91, 89, 87, 84, 86, 88,
+ 89, 91, 93, 95, 97, 99, 100, 102, 103, 105, 106, 107, 108, 110, 110,
+ 111, 112, 113, 114, 114, 115, 116, 116, 116, 116, 95, 93, 90, 88, 86,
+ 87, 89, 90, 92, 94, 96, 97, 99, 101, 102, 104, 106, 107, 108, 109, 110,
+ 111, 112, 113, 114, 114, 115, 116, 116, 116, 116, 116, 97, 94, 92, 89,
+ 87, 89, 90, 92, 93, 95, 96, 98, 100, 102, 103, 105, 106, 108, 109, 110,
+ 111, 112, 113, 114, 115, 115, 116, 117, 117, 117, 117, 117, 98, 96, 93,
+ 91, 89, 90, 91, 93, 94, 96, 97, 99, 101, 102, 104, 105, 107, 108, 109,
+ 111, 112, 113, 114, 115, 116, 116, 117, 118, 118, 118, 118, 118, 100,
+ 97, 95, 92, 90, 91, 93, 94, 95, 97, 98, 100, 102, 103, 105, 106, 108,
+ 109, 110, 111, 113, 114, 114, 115, 116, 117, 118, 118, 119, 119, 119,
+ 119, 101, 98, 96, 94, 92, 93, 94, 95, 96, 98, 99, 101, 102, 104, 105,
+ 107, 108, 110, 111, 112, 113, 114, 115, 116, 117, 118, 118, 119, 120,
+ 120, 120, 120, 102, 100, 98, 95, 93, 94, 95, 96, 98, 99, 100, 102, 103,
+ 105, 106, 108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 118, 119,
+ 120, 121, 121, 121, 121, 104, 101, 99, 97, 95, 96, 97, 98, 99, 100, 101,
+ 103, 104, 106, 107, 108, 110, 111, 112, 113, 115, 116, 116, 117, 118,
+ 119, 120, 121, 121, 121, 121, 121, 104, 101, 99, 97, 95, 96, 97, 98, 99,
+ 100, 101, 103, 104, 106, 107, 108, 110, 111, 112, 113, 115, 116, 116,
+ 117, 118, 119, 120, 121, 121, 121, 121, 121, 104, 101, 99, 97, 95, 96,
+ 97, 98, 99, 100, 101, 103, 104, 106, 107, 108, 110, 111, 112, 113, 115,
+ 116, 116, 117, 118, 119, 120, 121, 121, 121, 121, 121, 104, 101, 99, 97,
+ 95, 96, 97, 98, 99, 100, 101, 103, 104, 106, 107, 108, 110, 111, 112,
+ 113, 115, 116, 116, 117, 118, 119, 120, 121, 121, 121, 121, 121 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 40, 54, 57, 65, 54, 60, 62, 67, 57, 62, 71, 75, 65, 67, 75, 81,
+ /* Size 8 */
+ 42, 36, 54, 56, 58, 62, 66, 70, 36, 47, 54, 51, 52, 55, 59, 64, 54, 54,
+ 60, 59, 58, 60, 63, 66, 56, 51, 59, 63, 64, 66, 68, 70, 58, 52, 58, 64,
+ 68, 70, 72, 74, 62, 55, 60, 66, 70, 74, 76, 78, 66, 59, 63, 68, 72, 76,
+ 79, 81, 70, 64, 66, 70, 74, 78, 81, 83,
+ /* Size 16 */
+ 41, 38, 35, 43, 53, 54, 55, 56, 57, 59, 61, 63, 65, 67, 69, 69, 38, 39,
+ 40, 46, 53, 53, 53, 53, 54, 56, 57, 59, 62, 64, 66, 66, 35, 40, 46, 49,
+ 54, 52, 51, 51, 51, 53, 54, 56, 58, 60, 63, 63, 43, 46, 49, 53, 56, 55,
+ 54, 54, 54, 56, 57, 58, 60, 62, 64, 64, 53, 53, 54, 56, 59, 58, 58, 58,
+ 58, 58, 59, 61, 62, 64, 65, 65, 54, 53, 52, 55, 58, 59, 60, 60, 60, 61,
+ 62, 63, 64, 66, 67, 67, 55, 53, 51, 54, 58, 60, 62, 63, 63, 64, 65, 66,
+ 67, 68, 69, 69, 56, 53, 51, 54, 58, 60, 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 71, 57, 54, 51, 54, 58, 60, 63, 65, 67, 68, 69, 70, 71, 72, 73, 73,
+ 59, 56, 53, 56, 58, 61, 64, 66, 68, 70, 71, 72, 73, 74, 75, 75, 61, 57,
+ 54, 57, 59, 62, 65, 67, 69, 71, 73, 74, 75, 76, 77, 77, 63, 59, 56, 58,
+ 61, 63, 66, 68, 70, 72, 74, 75, 76, 77, 78, 78, 65, 62, 58, 60, 62, 64,
+ 67, 69, 71, 73, 75, 76, 78, 79, 80, 80, 67, 64, 60, 62, 64, 66, 68, 70,
+ 72, 74, 76, 77, 79, 80, 81, 81, 69, 66, 63, 64, 65, 67, 69, 71, 73, 75,
+ 77, 78, 80, 81, 82, 82, 69, 66, 63, 64, 65, 67, 69, 71, 73, 75, 77, 78,
+ 80, 81, 82, 82,
+ /* Size 32 */
+ 41, 39, 38, 37, 35, 38, 42, 47, 53, 53, 54, 54, 55, 55, 56, 56, 57, 58,
+ 59, 59, 60, 61, 62, 63, 65, 65, 66, 68, 69, 69, 69, 69, 39, 39, 38, 38,
+ 37, 40, 44, 48, 53, 53, 53, 53, 53, 54, 54, 55, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 64, 65, 66, 67, 67, 67, 67, 38, 38, 39, 39, 40, 42, 45, 49,
+ 53, 53, 53, 52, 52, 53, 53, 53, 54, 55, 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64, 65, 65, 65, 65, 37, 38, 39, 41, 42, 45, 47, 50, 53, 53, 52, 52,
+ 51, 51, 52, 52, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63, 64, 64,
+ 64, 64, 35, 37, 40, 42, 46, 47, 49, 51, 53, 52, 52, 51, 50, 50, 51, 51,
+ 51, 52, 53, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 62, 62, 62, 38, 40,
+ 42, 45, 47, 49, 51, 52, 54, 54, 53, 52, 52, 52, 52, 52, 52, 53, 54, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 63, 63, 63, 42, 44, 45, 47, 49, 51,
+ 52, 54, 56, 55, 55, 54, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 64, 64, 64, 47, 48, 49, 50, 51, 52, 54, 55, 57, 57,
+ 56, 56, 55, 55, 55, 55, 55, 56, 57, 57, 58, 58, 59, 60, 61, 62, 62, 63,
+ 64, 64, 64, 64, 53, 53, 53, 53, 53, 54, 56, 57, 58, 58, 58, 58, 57, 57,
+ 57, 57, 57, 58, 58, 58, 59, 60, 60, 61, 62, 62, 63, 64, 65, 65, 65, 65,
+ 53, 53, 53, 53, 52, 54, 55, 57, 58, 58, 58, 58, 58, 58, 58, 58, 59, 59,
+ 59, 60, 60, 61, 61, 62, 63, 64, 64, 65, 66, 66, 66, 66, 54, 53, 53, 52,
+ 52, 53, 55, 56, 58, 58, 59, 59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 62,
+ 63, 63, 64, 65, 65, 66, 67, 67, 67, 67, 54, 53, 52, 52, 51, 52, 54, 56,
+ 58, 58, 59, 60, 60, 61, 61, 61, 61, 62, 62, 63, 63, 63, 64, 65, 65, 66,
+ 66, 67, 68, 68, 68, 68, 55, 53, 52, 51, 50, 52, 53, 55, 57, 58, 59, 60,
+ 61, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 66, 66, 67, 68, 68, 69, 69,
+ 69, 69, 55, 54, 53, 51, 50, 52, 54, 55, 57, 58, 59, 61, 62, 62, 63, 63,
+ 64, 64, 65, 65, 65, 66, 66, 67, 67, 68, 69, 69, 70, 70, 70, 70, 56, 54,
+ 53, 52, 51, 52, 54, 55, 57, 58, 60, 61, 62, 63, 63, 64, 65, 65, 66, 66,
+ 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 71, 71, 56, 55, 53, 52, 51, 52,
+ 54, 55, 57, 58, 60, 61, 63, 63, 64, 65, 66, 66, 67, 67, 68, 68, 69, 69,
+ 70, 70, 71, 71, 72, 72, 72, 72, 57, 55, 54, 52, 51, 52, 54, 55, 57, 59,
+ 60, 61, 63, 64, 65, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72, 72,
+ 73, 73, 73, 73, 58, 56, 55, 53, 52, 53, 55, 56, 58, 59, 60, 62, 63, 64,
+ 65, 66, 67, 68, 68, 69, 70, 70, 71, 71, 72, 72, 73, 73, 74, 74, 74, 74,
+ 59, 57, 55, 54, 53, 54, 55, 57, 58, 59, 61, 62, 64, 65, 66, 67, 68, 68,
+ 69, 70, 70, 71, 71, 72, 73, 73, 73, 74, 74, 74, 74, 74, 59, 58, 56, 55,
+ 53, 54, 56, 57, 58, 60, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 71, 72,
+ 72, 73, 73, 74, 74, 75, 75, 75, 75, 75, 60, 59, 57, 56, 54, 55, 56, 58,
+ 59, 60, 61, 63, 64, 65, 67, 68, 69, 70, 70, 71, 72, 73, 73, 74, 74, 75,
+ 75, 76, 76, 76, 76, 76, 61, 60, 58, 56, 55, 56, 57, 58, 60, 61, 62, 63,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77,
+ 77, 77, 62, 61, 59, 57, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 68, 69,
+ 70, 71, 71, 72, 73, 74, 74, 75, 76, 76, 77, 77, 78, 78, 78, 78, 63, 62,
+ 60, 58, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73,
+ 74, 74, 75, 76, 76, 77, 77, 78, 78, 78, 78, 78, 65, 63, 61, 59, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 73, 74, 75, 76, 76,
+ 77, 77, 78, 78, 79, 79, 79, 79, 65, 64, 62, 60, 59, 60, 61, 62, 62, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 77, 78, 78, 79,
+ 79, 79, 79, 79, 66, 65, 63, 61, 60, 61, 62, 62, 63, 64, 65, 66, 68, 69,
+ 70, 71, 72, 73, 73, 74, 75, 76, 77, 77, 78, 78, 79, 79, 80, 80, 80, 80,
+ 68, 66, 64, 63, 61, 62, 63, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
+ 74, 75, 76, 76, 77, 78, 78, 79, 79, 80, 81, 81, 81, 81, 69, 67, 65, 64,
+ 62, 63, 64, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 74, 75, 76, 77,
+ 78, 78, 79, 79, 80, 81, 81, 81, 81, 81, 69, 67, 65, 64, 62, 63, 64, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 74, 75, 76, 77, 78, 78, 79, 79,
+ 80, 81, 81, 81, 81, 81, 69, 67, 65, 64, 62, 63, 64, 64, 65, 66, 67, 68,
+ 69, 70, 71, 72, 73, 74, 74, 75, 76, 77, 78, 78, 79, 79, 80, 81, 81, 81,
+ 81, 81, 69, 67, 65, 64, 62, 63, 64, 64, 65, 66, 67, 68, 69, 70, 71, 72,
+ 73, 74, 74, 75, 76, 77, 78, 78, 79, 79, 80, 81, 81, 81, 81, 81 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 67, 88, 105, 67, 84, 97, 108, 88, 97, 109, 116, 105, 108, 116, 120,
+ /* Size 8 */
+ 64, 54, 57, 66, 77, 86, 94, 99, 54, 59, 57, 63, 72, 81, 88, 95, 57, 57,
+ 69, 75, 80, 87, 92, 97, 66, 63, 75, 83, 89, 93, 97, 101, 77, 72, 80, 89,
+ 94, 98, 101, 104, 86, 81, 87, 93, 98, 102, 104, 106, 94, 88, 92, 97,
+ 101, 104, 106, 108, 99, 95, 97, 101, 104, 106, 108, 109,
+ /* Size 16 */
+ 64, 59, 54, 55, 57, 61, 66, 71, 77, 81, 86, 90, 94, 96, 99, 99, 59, 57,
+ 56, 57, 57, 61, 65, 69, 74, 79, 83, 87, 91, 94, 97, 97, 54, 56, 59, 58,
+ 57, 60, 63, 67, 72, 76, 81, 84, 88, 91, 95, 95, 55, 57, 58, 60, 63, 65,
+ 68, 72, 76, 79, 83, 87, 90, 93, 96, 96, 57, 57, 57, 63, 69, 72, 75, 78,
+ 80, 83, 87, 89, 92, 95, 97, 97, 61, 61, 60, 65, 72, 75, 79, 82, 84, 87,
+ 90, 92, 95, 97, 99, 99, 66, 65, 63, 68, 75, 79, 83, 86, 89, 91, 93, 95,
+ 97, 99, 101, 101, 71, 69, 67, 72, 78, 82, 86, 89, 91, 93, 95, 97, 99,
+ 100, 102, 102, 77, 74, 72, 76, 80, 84, 89, 91, 94, 96, 98, 100, 101,
+ 102, 104, 104, 81, 79, 76, 79, 83, 87, 91, 93, 96, 98, 100, 101, 103,
+ 104, 105, 105, 86, 83, 81, 83, 87, 90, 93, 95, 98, 100, 102, 103, 104,
+ 105, 106, 106, 90, 87, 84, 87, 89, 92, 95, 97, 100, 101, 103, 104, 105,
+ 106, 107, 107, 94, 91, 88, 90, 92, 95, 97, 99, 101, 103, 104, 105, 106,
+ 107, 108, 108, 96, 94, 91, 93, 95, 97, 99, 100, 102, 104, 105, 106, 107,
+ 108, 109, 109, 99, 97, 95, 96, 97, 99, 101, 102, 104, 105, 106, 107,
+ 108, 109, 109, 109, 99, 97, 95, 96, 97, 99, 101, 102, 104, 105, 106,
+ 107, 108, 109, 109, 109,
+ /* Size 32 */
+ 64, 61, 59, 56, 54, 55, 55, 56, 57, 59, 61, 64, 66, 69, 71, 74, 77, 79,
+ 81, 84, 86, 88, 90, 92, 94, 95, 96, 98, 99, 99, 99, 99, 61, 60, 58, 57,
+ 55, 56, 56, 56, 57, 59, 61, 63, 66, 68, 70, 73, 76, 78, 80, 82, 85, 87,
+ 88, 90, 92, 94, 95, 97, 98, 98, 98, 98, 59, 58, 57, 57, 56, 56, 57, 57,
+ 57, 59, 61, 63, 65, 67, 69, 72, 74, 76, 79, 81, 83, 85, 87, 89, 91, 92,
+ 94, 95, 97, 97, 97, 97, 56, 57, 57, 57, 58, 57, 57, 57, 57, 59, 60, 62,
+ 64, 66, 68, 70, 73, 75, 77, 79, 82, 84, 86, 88, 90, 91, 93, 94, 96, 96,
+ 96, 96, 54, 55, 56, 58, 59, 58, 58, 58, 57, 59, 60, 61, 63, 65, 67, 69,
+ 72, 74, 76, 78, 81, 82, 84, 86, 88, 90, 91, 93, 95, 95, 95, 95, 55, 56,
+ 56, 57, 58, 59, 59, 59, 60, 61, 63, 64, 65, 67, 69, 71, 74, 76, 78, 80,
+ 82, 84, 86, 87, 89, 91, 92, 94, 95, 95, 95, 95, 55, 56, 57, 57, 58, 59,
+ 60, 61, 63, 64, 65, 67, 68, 70, 72, 74, 76, 78, 79, 81, 83, 85, 87, 89,
+ 90, 92, 93, 95, 96, 96, 96, 96, 56, 56, 57, 57, 58, 59, 61, 64, 66, 67,
+ 68, 70, 71, 73, 75, 76, 78, 80, 81, 83, 85, 86, 88, 90, 91, 93, 94, 95,
+ 97, 97, 97, 97, 57, 57, 57, 57, 57, 60, 63, 66, 69, 71, 72, 73, 75, 76,
+ 78, 79, 80, 82, 83, 85, 87, 88, 89, 91, 92, 94, 95, 96, 97, 97, 97, 97,
+ 59, 59, 59, 59, 59, 61, 64, 67, 71, 72, 74, 75, 77, 78, 79, 81, 82, 84,
+ 85, 87, 88, 89, 91, 92, 93, 95, 96, 97, 98, 98, 98, 98, 61, 61, 61, 60,
+ 60, 63, 65, 68, 72, 74, 75, 77, 79, 80, 82, 83, 84, 86, 87, 88, 90, 91,
+ 92, 93, 95, 96, 97, 98, 99, 99, 99, 99, 64, 63, 63, 62, 61, 64, 67, 70,
+ 73, 75, 77, 79, 81, 82, 84, 85, 86, 88, 89, 90, 91, 92, 93, 95, 96, 97,
+ 98, 99, 100, 100, 100, 100, 66, 66, 65, 64, 63, 65, 68, 71, 75, 77, 79,
+ 81, 83, 85, 86, 87, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
+ 101, 101, 101, 101, 69, 68, 67, 66, 65, 67, 70, 73, 76, 78, 80, 82, 85,
+ 86, 87, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 100, 101, 101,
+ 101, 101, 71, 70, 69, 68, 67, 69, 72, 75, 78, 79, 82, 84, 86, 87, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 100, 101, 102, 102, 102,
+ 102, 74, 73, 72, 70, 69, 71, 74, 76, 79, 81, 83, 85, 87, 89, 90, 91, 93,
+ 94, 95, 96, 97, 98, 98, 99, 100, 101, 101, 102, 103, 103, 103, 103, 77,
+ 76, 74, 73, 72, 74, 76, 78, 80, 82, 84, 86, 89, 90, 91, 93, 94, 95, 96,
+ 97, 98, 99, 100, 100, 101, 102, 102, 103, 104, 104, 104, 104, 79, 78,
+ 76, 75, 74, 76, 78, 80, 82, 84, 86, 88, 90, 91, 92, 94, 95, 96, 97, 98,
+ 99, 100, 100, 101, 102, 102, 103, 104, 104, 104, 104, 104, 81, 80, 79,
+ 77, 76, 78, 79, 81, 83, 85, 87, 89, 91, 92, 93, 95, 96, 97, 98, 99, 100,
+ 101, 101, 102, 103, 103, 104, 104, 105, 105, 105, 105, 84, 82, 81, 79,
+ 78, 80, 81, 83, 85, 87, 88, 90, 92, 93, 94, 96, 97, 98, 99, 100, 101,
+ 101, 102, 103, 103, 104, 104, 105, 105, 105, 105, 105, 86, 85, 83, 82,
+ 81, 82, 83, 85, 87, 88, 90, 91, 93, 94, 95, 97, 98, 99, 100, 101, 102,
+ 102, 103, 103, 104, 105, 105, 106, 106, 106, 106, 106, 88, 87, 85, 84,
+ 82, 84, 85, 86, 88, 89, 91, 92, 94, 95, 96, 98, 99, 100, 101, 101, 102,
+ 103, 103, 104, 105, 105, 106, 106, 107, 107, 107, 107, 90, 88, 87, 86,
+ 84, 86, 87, 88, 89, 91, 92, 93, 95, 96, 97, 98, 100, 100, 101, 102, 103,
+ 103, 104, 105, 105, 106, 106, 107, 107, 107, 107, 107, 92, 90, 89, 88,
+ 86, 87, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 103,
+ 104, 105, 105, 106, 106, 107, 107, 107, 107, 107, 107, 94, 92, 91, 90,
+ 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 103,
+ 104, 105, 105, 106, 106, 107, 107, 108, 108, 108, 108, 108, 95, 94, 92,
+ 91, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 102, 103,
+ 104, 105, 105, 106, 106, 107, 107, 107, 108, 108, 108, 108, 108, 96, 95,
+ 94, 93, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 100, 101, 102, 103,
+ 104, 104, 105, 106, 106, 107, 107, 107, 108, 108, 109, 109, 109, 109,
+ 98, 97, 95, 94, 93, 94, 95, 95, 96, 97, 98, 99, 100, 100, 101, 102, 103,
+ 104, 104, 105, 106, 106, 107, 107, 108, 108, 108, 109, 109, 109, 109,
+ 109, 99, 98, 97, 96, 95, 95, 96, 97, 97, 98, 99, 100, 101, 101, 102,
+ 103, 104, 104, 105, 105, 106, 107, 107, 107, 108, 108, 109, 109, 109,
+ 109, 109, 109, 99, 98, 97, 96, 95, 95, 96, 97, 97, 98, 99, 100, 101,
+ 101, 102, 103, 104, 104, 105, 105, 106, 107, 107, 107, 108, 108, 109,
+ 109, 109, 109, 109, 109, 99, 98, 97, 96, 95, 95, 96, 97, 97, 98, 99,
+ 100, 101, 101, 102, 103, 104, 104, 105, 105, 106, 107, 107, 107, 108,
+ 108, 109, 109, 109, 109, 109, 109, 99, 98, 97, 96, 95, 95, 96, 97, 97,
+ 98, 99, 100, 101, 101, 102, 103, 104, 104, 105, 105, 106, 107, 107, 107,
+ 108, 108, 109, 109, 109, 109, 109, 109 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 41, 43, 57, 69, 43, 54, 63, 71, 57, 63, 72, 77, 69, 71, 77, 80,
+ /* Size 8 */
+ 46, 38, 40, 47, 55, 63, 68, 73, 38, 42, 41, 45, 51, 58, 64, 69, 40, 41,
+ 50, 54, 58, 63, 67, 71, 47, 45, 54, 60, 64, 68, 71, 74, 55, 51, 58, 64,
+ 69, 72, 74, 76, 63, 58, 63, 68, 72, 75, 77, 78, 68, 64, 67, 71, 74, 77,
+ 78, 80, 73, 69, 71, 74, 76, 78, 80, 81,
+ /* Size 16 */
+ 45, 41, 38, 38, 39, 43, 47, 50, 54, 58, 61, 64, 67, 69, 71, 71, 41, 40,
+ 39, 39, 40, 42, 45, 49, 52, 56, 59, 62, 65, 67, 70, 70, 38, 39, 41, 40,
+ 40, 42, 44, 47, 50, 54, 57, 60, 63, 65, 68, 68, 38, 39, 40, 42, 44, 46,
+ 48, 51, 54, 56, 59, 62, 65, 67, 69, 69, 39, 40, 40, 44, 49, 51, 53, 55,
+ 57, 59, 62, 64, 66, 68, 70, 70, 43, 42, 42, 46, 51, 53, 56, 58, 60, 62,
+ 64, 66, 68, 69, 71, 71, 47, 45, 44, 48, 53, 56, 59, 61, 63, 65, 67, 68,
+ 70, 71, 72, 72, 50, 49, 47, 51, 55, 58, 61, 63, 65, 67, 69, 70, 71, 72,
+ 74, 74, 54, 52, 50, 54, 57, 60, 63, 65, 68, 69, 71, 72, 73, 74, 75, 75,
+ 58, 56, 54, 56, 59, 62, 65, 67, 69, 70, 72, 73, 74, 75, 76, 76, 61, 59,
+ 57, 59, 62, 64, 67, 69, 71, 72, 73, 74, 75, 76, 77, 77, 64, 62, 60, 62,
+ 64, 66, 68, 70, 72, 73, 74, 75, 76, 77, 78, 78, 67, 65, 63, 65, 66, 68,
+ 70, 71, 73, 74, 75, 76, 77, 78, 78, 78, 69, 67, 65, 67, 68, 69, 71, 72,
+ 74, 75, 76, 77, 78, 78, 79, 79, 71, 70, 68, 69, 70, 71, 72, 74, 75, 76,
+ 77, 78, 78, 79, 79, 79, 71, 70, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78,
+ 78, 79, 79, 79,
+ /* Size 32 */
+ 44, 42, 40, 39, 37, 38, 38, 39, 39, 41, 42, 44, 46, 48, 50, 52, 54, 56,
+ 57, 59, 61, 62, 64, 65, 66, 68, 69, 70, 71, 71, 71, 71, 42, 41, 40, 39,
+ 38, 38, 39, 39, 39, 41, 42, 44, 45, 47, 49, 51, 53, 54, 56, 58, 60, 61,
+ 62, 64, 65, 67, 68, 69, 70, 70, 70, 70, 40, 40, 40, 39, 39, 39, 39, 39,
+ 39, 41, 42, 43, 45, 46, 48, 50, 52, 53, 55, 57, 59, 60, 61, 63, 64, 66,
+ 67, 68, 69, 69, 69, 69, 39, 39, 39, 39, 40, 40, 40, 39, 39, 40, 42, 43,
+ 44, 46, 47, 49, 51, 52, 54, 56, 58, 59, 60, 62, 63, 65, 66, 67, 68, 68,
+ 68, 68, 37, 38, 39, 40, 41, 40, 40, 40, 39, 40, 41, 42, 44, 45, 47, 48,
+ 50, 51, 53, 55, 57, 58, 59, 61, 63, 64, 65, 66, 67, 67, 67, 67, 38, 38,
+ 39, 40, 40, 41, 41, 41, 41, 42, 43, 44, 45, 47, 48, 50, 51, 53, 54, 56,
+ 58, 59, 60, 62, 63, 64, 65, 67, 68, 68, 68, 68, 38, 39, 39, 40, 40, 41,
+ 42, 43, 43, 44, 45, 46, 48, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63,
+ 64, 65, 66, 67, 68, 68, 68, 68, 39, 39, 39, 39, 40, 41, 43, 44, 46, 47,
+ 48, 49, 50, 51, 52, 53, 55, 56, 57, 59, 60, 61, 62, 63, 65, 66, 67, 68,
+ 69, 69, 69, 69, 39, 39, 39, 39, 39, 41, 43, 46, 48, 49, 50, 51, 52, 53,
+ 54, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 69, 69, 69,
+ 41, 41, 41, 40, 40, 42, 44, 47, 49, 50, 51, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 70, 70, 70, 42, 42, 42, 42,
+ 41, 43, 45, 48, 50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 71, 71, 71, 44, 44, 43, 43, 42, 44, 46, 49,
+ 51, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 66, 67, 68, 69,
+ 70, 70, 71, 71, 71, 71, 46, 45, 45, 44, 44, 45, 48, 50, 52, 54, 55, 57,
+ 59, 60, 61, 62, 63, 63, 64, 65, 66, 67, 67, 68, 69, 70, 70, 71, 72, 72,
+ 72, 72, 48, 47, 46, 46, 45, 47, 49, 51, 53, 55, 56, 58, 60, 61, 62, 63,
+ 64, 64, 65, 66, 67, 68, 68, 69, 70, 70, 71, 72, 72, 72, 72, 72, 50, 49,
+ 48, 47, 47, 48, 50, 52, 54, 56, 57, 59, 61, 62, 63, 64, 65, 66, 66, 67,
+ 68, 69, 69, 70, 71, 71, 72, 72, 73, 73, 73, 73, 52, 51, 50, 49, 48, 50,
+ 52, 53, 55, 57, 58, 60, 62, 63, 64, 65, 66, 67, 67, 68, 69, 69, 70, 71,
+ 71, 72, 72, 73, 74, 74, 74, 74, 54, 53, 52, 51, 50, 51, 53, 55, 56, 58,
+ 59, 61, 63, 64, 65, 66, 67, 68, 68, 69, 70, 70, 71, 72, 72, 73, 73, 74,
+ 74, 74, 74, 74, 56, 54, 53, 52, 51, 53, 54, 56, 58, 59, 60, 62, 63, 64,
+ 66, 67, 68, 68, 69, 70, 71, 71, 72, 72, 73, 73, 74, 74, 75, 75, 75, 75,
+ 57, 56, 55, 54, 53, 54, 56, 57, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 71, 72, 72, 73, 73, 74, 74, 75, 75, 75, 75, 75, 59, 58, 57, 56,
+ 55, 56, 57, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 71, 72, 72,
+ 73, 73, 74, 74, 75, 75, 76, 76, 76, 76, 61, 60, 59, 58, 57, 58, 59, 60,
+ 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 71, 72, 73, 73, 74, 74, 75, 75,
+ 75, 76, 76, 76, 76, 76, 62, 61, 60, 59, 58, 59, 60, 61, 62, 63, 64, 66,
+ 67, 68, 69, 69, 70, 71, 72, 72, 73, 74, 74, 75, 75, 75, 76, 76, 76, 76,
+ 76, 76, 64, 62, 61, 60, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 72, 73, 74, 74, 75, 75, 75, 76, 76, 76, 77, 77, 77, 77, 65, 64,
+ 63, 62, 61, 62, 63, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 72, 73, 73,
+ 74, 75, 75, 75, 76, 76, 77, 77, 77, 77, 77, 77, 66, 65, 64, 63, 63, 63,
+ 64, 65, 65, 66, 67, 68, 69, 70, 71, 71, 72, 73, 73, 74, 75, 75, 75, 76,
+ 76, 77, 77, 77, 78, 78, 78, 78, 68, 67, 66, 65, 64, 64, 65, 66, 66, 67,
+ 68, 69, 70, 70, 71, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 77, 78,
+ 78, 78, 78, 78, 69, 68, 67, 66, 65, 65, 66, 67, 67, 68, 69, 70, 70, 71,
+ 72, 72, 73, 74, 74, 75, 75, 76, 76, 77, 77, 77, 77, 78, 78, 78, 78, 78,
+ 70, 69, 68, 67, 66, 67, 67, 68, 68, 69, 70, 70, 71, 72, 72, 73, 74, 74,
+ 75, 75, 76, 76, 76, 77, 77, 78, 78, 78, 78, 78, 78, 78, 71, 70, 69, 68,
+ 67, 68, 68, 69, 69, 70, 71, 71, 72, 72, 73, 74, 74, 75, 75, 76, 76, 76,
+ 77, 77, 78, 78, 78, 78, 79, 79, 79, 79, 71, 70, 69, 68, 67, 68, 68, 69,
+ 69, 70, 71, 71, 72, 72, 73, 74, 74, 75, 75, 76, 76, 76, 77, 77, 78, 78,
+ 78, 78, 79, 79, 79, 79, 71, 70, 69, 68, 67, 68, 68, 69, 69, 70, 71, 71,
+ 72, 72, 73, 74, 74, 75, 75, 76, 76, 76, 77, 77, 78, 78, 78, 78, 79, 79,
+ 79, 79, 71, 70, 69, 68, 67, 68, 68, 69, 69, 70, 71, 71, 72, 72, 73, 74,
+ 74, 75, 75, 76, 76, 76, 77, 77, 78, 78, 78, 78, 79, 79, 79, 79 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 83, 86, 96, 83, 90, 93, 98, 86, 93, 103, 108, 96, 98, 108, 114,
+ /* Size 8 */
+ 64, 56, 79, 81, 84, 88, 93, 97, 56, 70, 80, 76, 77, 81, 85, 90, 79, 80,
+ 86, 84, 84, 86, 90, 93, 81, 76, 84, 89, 91, 93, 95, 98, 84, 77, 84, 91,
+ 95, 98, 100, 102, 88, 81, 86, 93, 98, 101, 104, 106, 93, 85, 90, 95,
+ 100, 104, 106, 108, 97, 90, 93, 98, 102, 106, 108, 111,
+ /* Size 16 */
+ 64, 60, 56, 66, 79, 80, 81, 82, 84, 86, 88, 90, 93, 95, 97, 97, 60, 61,
+ 62, 70, 79, 79, 78, 79, 80, 82, 84, 86, 89, 91, 94, 94, 56, 62, 70, 74,
+ 80, 78, 76, 76, 77, 79, 81, 83, 85, 88, 90, 90, 66, 70, 74, 78, 83, 81,
+ 80, 80, 80, 82, 83, 85, 87, 89, 92, 92, 79, 79, 80, 83, 86, 85, 84, 84,
+ 84, 85, 86, 88, 90, 91, 93, 93, 80, 79, 78, 81, 85, 86, 87, 87, 88, 88,
+ 89, 91, 92, 94, 95, 95, 81, 78, 76, 80, 84, 87, 89, 90, 91, 92, 93, 94,
+ 95, 96, 98, 98, 82, 79, 76, 80, 84, 87, 90, 92, 93, 94, 95, 96, 97, 98,
+ 100, 100, 84, 80, 77, 80, 84, 88, 91, 93, 95, 96, 98, 99, 100, 101, 102,
+ 102, 86, 82, 79, 82, 85, 88, 92, 94, 96, 98, 99, 101, 102, 103, 104,
+ 104, 88, 84, 81, 83, 86, 89, 93, 95, 98, 99, 101, 102, 104, 105, 106,
+ 106, 90, 86, 83, 85, 88, 91, 94, 96, 99, 101, 102, 104, 105, 106, 107,
+ 107, 93, 89, 85, 87, 90, 92, 95, 97, 100, 102, 104, 105, 106, 107, 108,
+ 108, 95, 91, 88, 89, 91, 94, 96, 98, 101, 103, 105, 106, 107, 108, 109,
+ 109, 97, 94, 90, 92, 93, 95, 98, 100, 102, 104, 106, 107, 108, 109, 111,
+ 111, 97, 94, 90, 92, 93, 95, 98, 100, 102, 104, 106, 107, 108, 109, 111,
+ 111,
+ /* Size 32 */
+ 64, 62, 60, 58, 56, 60, 66, 72, 79, 79, 80, 81, 81, 82, 82, 83, 84, 85,
+ 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 97, 97, 97, 62, 61, 60, 60,
+ 59, 63, 68, 73, 79, 79, 79, 80, 80, 80, 81, 81, 82, 83, 84, 85, 86, 87,
+ 88, 90, 91, 92, 93, 94, 95, 95, 95, 95, 60, 60, 61, 62, 62, 66, 70, 74,
+ 79, 79, 79, 79, 78, 79, 79, 80, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90,
+ 91, 92, 94, 94, 94, 94, 58, 60, 62, 64, 66, 69, 72, 75, 79, 79, 78, 78,
+ 77, 77, 78, 78, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 91, 92, 92,
+ 92, 92, 56, 59, 62, 66, 70, 72, 74, 77, 80, 79, 78, 77, 76, 76, 76, 77,
+ 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 90, 90, 90, 60, 63,
+ 66, 69, 72, 74, 76, 79, 81, 80, 79, 78, 78, 78, 78, 78, 79, 79, 80, 81,
+ 82, 83, 84, 85, 86, 87, 89, 90, 91, 91, 91, 91, 66, 68, 70, 72, 74, 76,
+ 78, 80, 83, 82, 81, 80, 80, 80, 80, 80, 80, 81, 82, 83, 83, 84, 85, 86,
+ 87, 88, 89, 91, 92, 92, 92, 92, 72, 73, 74, 75, 77, 79, 80, 82, 84, 84,
+ 83, 83, 82, 82, 82, 82, 82, 83, 83, 84, 85, 86, 87, 87, 88, 89, 90, 91,
+ 92, 92, 92, 92, 79, 79, 79, 79, 80, 81, 83, 84, 86, 85, 85, 85, 84, 84,
+ 84, 84, 84, 85, 85, 86, 86, 87, 88, 89, 90, 90, 91, 92, 93, 93, 93, 93,
+ 79, 79, 79, 79, 79, 80, 82, 84, 85, 85, 85, 85, 86, 86, 86, 86, 86, 86,
+ 87, 87, 88, 88, 89, 90, 91, 92, 93, 93, 94, 94, 94, 94, 80, 79, 79, 78,
+ 78, 79, 81, 83, 85, 85, 86, 86, 87, 87, 87, 87, 88, 88, 88, 89, 89, 90,
+ 91, 91, 92, 93, 94, 95, 95, 95, 95, 95, 81, 80, 79, 78, 77, 78, 80, 83,
+ 85, 85, 86, 87, 88, 88, 89, 89, 89, 90, 90, 91, 91, 92, 92, 93, 93, 94,
+ 95, 96, 96, 96, 96, 96, 81, 80, 78, 77, 76, 78, 80, 82, 84, 86, 87, 88,
+ 89, 90, 90, 91, 91, 91, 92, 92, 93, 93, 94, 94, 95, 96, 96, 97, 98, 98,
+ 98, 98, 82, 80, 79, 77, 76, 78, 80, 82, 84, 86, 87, 88, 90, 90, 91, 91,
+ 92, 92, 93, 93, 94, 94, 95, 95, 96, 97, 97, 98, 99, 99, 99, 99, 82, 81,
+ 79, 78, 76, 78, 80, 82, 84, 86, 87, 89, 90, 91, 92, 92, 93, 94, 94, 95,
+ 95, 96, 96, 97, 97, 98, 98, 99, 100, 100, 100, 100, 83, 81, 80, 78, 77,
+ 78, 80, 82, 84, 86, 87, 89, 91, 91, 92, 93, 94, 95, 95, 96, 96, 97, 97,
+ 98, 99, 99, 100, 100, 101, 101, 101, 101, 84, 82, 80, 78, 77, 79, 80,
+ 82, 84, 86, 88, 89, 91, 92, 93, 94, 95, 96, 96, 97, 98, 98, 99, 99, 100,
+ 100, 101, 101, 102, 102, 102, 102, 85, 83, 81, 79, 78, 79, 81, 83, 85,
+ 86, 88, 90, 91, 92, 94, 95, 96, 96, 97, 98, 99, 99, 100, 100, 101, 101,
+ 102, 102, 103, 103, 103, 103, 86, 84, 82, 80, 79, 80, 82, 83, 85, 87,
+ 88, 90, 92, 93, 94, 95, 96, 97, 98, 99, 99, 100, 101, 101, 102, 102,
+ 103, 103, 104, 104, 104, 104, 87, 85, 83, 81, 80, 81, 83, 84, 86, 87,
+ 89, 91, 92, 93, 95, 96, 97, 98, 99, 99, 100, 101, 101, 102, 103, 103,
+ 104, 104, 105, 105, 105, 105, 88, 86, 84, 82, 81, 82, 83, 85, 86, 88,
+ 89, 91, 93, 94, 95, 96, 98, 99, 99, 100, 101, 102, 102, 103, 104, 104,
+ 105, 105, 106, 106, 106, 106, 89, 87, 85, 83, 82, 83, 84, 86, 87, 88,
+ 90, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 102, 103, 104, 104, 105,
+ 105, 106, 106, 106, 106, 106, 90, 88, 86, 85, 83, 84, 85, 87, 88, 89,
+ 91, 92, 94, 95, 96, 97, 99, 100, 101, 101, 102, 103, 104, 104, 105, 105,
+ 106, 106, 107, 107, 107, 107, 92, 90, 88, 86, 84, 85, 86, 87, 89, 90,
+ 91, 93, 94, 95, 97, 98, 99, 100, 101, 102, 103, 104, 104, 105, 106, 106,
+ 107, 107, 108, 108, 108, 108, 93, 91, 89, 87, 85, 86, 87, 88, 90, 91,
+ 92, 93, 95, 96, 97, 99, 100, 101, 102, 103, 104, 104, 105, 106, 106,
+ 107, 107, 108, 108, 108, 108, 108, 94, 92, 90, 88, 86, 87, 88, 89, 90,
+ 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 105, 106, 107,
+ 107, 108, 108, 109, 109, 109, 109, 95, 93, 91, 89, 88, 89, 89, 90, 91,
+ 93, 94, 95, 96, 97, 98, 100, 101, 102, 103, 104, 105, 105, 106, 107,
+ 107, 108, 108, 109, 109, 109, 109, 109, 96, 94, 92, 91, 89, 90, 91, 91,
+ 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 106, 107,
+ 108, 108, 109, 109, 110, 110, 110, 110, 97, 95, 94, 92, 90, 91, 92, 92,
+ 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 106, 106, 107,
+ 108, 108, 109, 109, 110, 111, 111, 111, 111, 97, 95, 94, 92, 90, 91, 92,
+ 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 106, 106, 107,
+ 108, 108, 109, 109, 110, 111, 111, 111, 111, 97, 95, 94, 92, 90, 91, 92,
+ 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 106, 106, 107,
+ 108, 108, 109, 109, 110, 111, 111, 111, 111, 97, 95, 94, 92, 90, 91, 92,
+ 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 106, 106, 107,
+ 108, 108, 109, 109, 110, 111, 111, 111, 111 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 42, 56, 58, 65, 56, 61, 63, 67, 58, 63, 70, 74, 65, 67, 74, 78,
+ /* Size 8 */
+ 45, 39, 55, 57, 59, 62, 66, 69, 39, 49, 56, 53, 54, 57, 60, 64, 55, 56,
+ 61, 60, 60, 61, 63, 66, 57, 53, 60, 63, 65, 66, 68, 70, 59, 54, 60, 65,
+ 68, 70, 71, 73, 62, 57, 61, 66, 70, 72, 74, 76, 66, 60, 63, 68, 71, 74,
+ 76, 78, 69, 64, 66, 70, 73, 76, 78, 80,
+ /* Size 16 */
+ 44, 41, 38, 45, 55, 56, 56, 57, 58, 60, 62, 63, 65, 67, 69, 69, 41, 42,
+ 43, 48, 55, 55, 54, 55, 56, 57, 59, 60, 62, 64, 66, 66, 38, 43, 48, 52,
+ 55, 54, 52, 53, 53, 55, 56, 58, 59, 61, 63, 63, 45, 48, 52, 54, 57, 56,
+ 55, 56, 56, 57, 58, 60, 61, 63, 64, 64, 55, 55, 55, 57, 60, 59, 59, 59,
+ 59, 59, 60, 61, 63, 64, 66, 66, 56, 55, 54, 56, 59, 60, 61, 61, 61, 62,
+ 63, 64, 65, 66, 67, 67, 56, 54, 52, 55, 59, 61, 62, 63, 64, 64, 65, 66,
+ 67, 68, 69, 69, 57, 55, 53, 56, 59, 61, 63, 64, 65, 66, 67, 68, 69, 69,
+ 70, 70, 58, 56, 53, 56, 59, 61, 64, 65, 67, 68, 69, 70, 70, 71, 72, 72,
+ 60, 57, 55, 57, 59, 62, 64, 66, 68, 69, 70, 71, 72, 73, 73, 73, 62, 59,
+ 56, 58, 60, 63, 65, 67, 69, 70, 71, 72, 73, 74, 75, 75, 63, 60, 58, 60,
+ 61, 64, 66, 68, 70, 71, 72, 73, 74, 75, 76, 76, 65, 62, 59, 61, 63, 65,
+ 67, 69, 70, 72, 73, 74, 75, 76, 77, 77, 67, 64, 61, 63, 64, 66, 68, 69,
+ 71, 73, 74, 75, 76, 77, 78, 78, 69, 66, 63, 64, 66, 67, 69, 70, 72, 73,
+ 75, 76, 77, 78, 79, 79, 69, 66, 63, 64, 66, 67, 69, 70, 72, 73, 75, 76,
+ 77, 78, 79, 79,
+ /* Size 32 */
+ 44, 42, 41, 39, 38, 41, 45, 49, 54, 55, 55, 56, 56, 57, 57, 58, 58, 59,
+ 60, 60, 61, 62, 63, 64, 65, 66, 66, 67, 68, 68, 68, 68, 42, 42, 41, 41,
+ 40, 43, 46, 50, 55, 55, 55, 55, 55, 55, 56, 56, 57, 57, 58, 59, 60, 61,
+ 61, 62, 63, 64, 65, 66, 67, 67, 67, 67, 41, 41, 42, 42, 42, 45, 48, 51,
+ 55, 55, 54, 54, 54, 54, 55, 55, 55, 56, 57, 58, 58, 59, 60, 61, 62, 63,
+ 63, 64, 65, 65, 65, 65, 39, 41, 42, 43, 45, 47, 49, 52, 55, 54, 54, 53,
+ 53, 53, 54, 54, 54, 55, 55, 56, 57, 58, 59, 59, 60, 61, 62, 63, 64, 64,
+ 64, 64, 38, 40, 42, 45, 48, 49, 51, 53, 55, 54, 53, 53, 52, 52, 53, 53,
+ 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 63, 63, 63, 41, 43,
+ 45, 47, 49, 51, 53, 54, 56, 55, 55, 54, 54, 54, 54, 54, 54, 55, 55, 56,
+ 57, 57, 58, 59, 60, 61, 62, 62, 63, 63, 63, 63, 45, 46, 48, 49, 51, 53,
+ 54, 56, 57, 57, 56, 56, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60,
+ 61, 61, 62, 63, 64, 64, 64, 64, 49, 50, 51, 52, 53, 54, 56, 57, 58, 58,
+ 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 59, 59, 60, 61, 61, 62, 63, 64,
+ 64, 64, 64, 64, 54, 55, 55, 55, 55, 56, 57, 58, 59, 59, 59, 59, 58, 58,
+ 58, 58, 58, 59, 59, 59, 60, 60, 61, 62, 62, 63, 64, 64, 65, 65, 65, 65,
+ 55, 55, 55, 54, 54, 55, 57, 58, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60,
+ 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 66, 66, 66, 66, 55, 55, 54, 54,
+ 53, 55, 56, 57, 59, 59, 60, 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 63,
+ 63, 64, 64, 65, 65, 66, 67, 67, 67, 67, 56, 55, 54, 53, 53, 54, 56, 57,
+ 59, 59, 60, 60, 61, 61, 62, 62, 62, 62, 63, 63, 63, 64, 64, 65, 65, 66,
+ 66, 67, 67, 67, 67, 67, 56, 55, 54, 53, 52, 54, 55, 57, 58, 59, 60, 61,
+ 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 67, 67, 68, 68, 68,
+ 68, 68, 57, 55, 54, 53, 52, 54, 55, 57, 58, 59, 60, 61, 62, 63, 63, 64,
+ 64, 64, 65, 65, 66, 66, 66, 67, 67, 68, 68, 69, 69, 69, 69, 69, 57, 56,
+ 55, 54, 53, 54, 55, 57, 58, 59, 60, 62, 63, 63, 64, 64, 65, 65, 66, 66,
+ 66, 67, 67, 68, 68, 69, 69, 69, 70, 70, 70, 70, 58, 56, 55, 54, 53, 54,
+ 55, 57, 58, 59, 61, 62, 63, 64, 64, 65, 66, 66, 67, 67, 67, 68, 68, 69,
+ 69, 69, 70, 70, 71, 71, 71, 71, 58, 57, 55, 54, 53, 54, 56, 57, 58, 60,
+ 61, 62, 63, 64, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 70, 71, 71,
+ 72, 72, 72, 72, 59, 57, 56, 55, 54, 55, 56, 57, 59, 60, 61, 62, 64, 64,
+ 65, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 71, 72, 72, 72, 72, 72,
+ 60, 58, 57, 55, 54, 55, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 67, 68,
+ 69, 69, 70, 70, 71, 71, 71, 72, 72, 73, 73, 73, 73, 73, 60, 59, 58, 56,
+ 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 70, 71,
+ 71, 72, 72, 72, 73, 73, 74, 74, 74, 74, 61, 60, 58, 57, 56, 57, 58, 59,
+ 60, 61, 62, 63, 65, 66, 66, 67, 68, 69, 70, 70, 71, 71, 72, 72, 73, 73,
+ 74, 74, 74, 74, 74, 74, 62, 61, 59, 58, 56, 57, 58, 59, 60, 62, 63, 64,
+ 65, 66, 67, 68, 69, 69, 70, 71, 71, 72, 72, 73, 73, 74, 74, 74, 75, 75,
+ 75, 75, 63, 61, 60, 59, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+ 69, 70, 71, 71, 72, 72, 73, 73, 74, 74, 75, 75, 75, 75, 75, 75, 64, 62,
+ 61, 59, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 70, 71, 72,
+ 72, 73, 73, 74, 74, 75, 75, 76, 76, 76, 76, 76, 65, 63, 62, 60, 59, 60,
+ 61, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 71, 72, 73, 73, 74, 74,
+ 75, 75, 76, 76, 76, 76, 76, 76, 66, 64, 63, 61, 60, 61, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 69, 70, 71, 72, 72, 73, 74, 74, 75, 75, 76, 76, 76,
+ 77, 77, 77, 77, 66, 65, 63, 62, 61, 62, 62, 63, 64, 65, 65, 66, 67, 68,
+ 69, 70, 71, 71, 72, 73, 74, 74, 75, 75, 76, 76, 76, 77, 77, 77, 77, 77,
+ 67, 66, 64, 63, 62, 62, 63, 64, 64, 65, 66, 67, 68, 69, 69, 70, 71, 72,
+ 73, 73, 74, 74, 75, 76, 76, 76, 77, 77, 78, 78, 78, 78, 68, 67, 65, 64,
+ 63, 63, 64, 64, 65, 66, 67, 67, 68, 69, 70, 71, 72, 72, 73, 74, 74, 75,
+ 75, 76, 76, 77, 77, 78, 78, 78, 78, 78, 68, 67, 65, 64, 63, 63, 64, 64,
+ 65, 66, 67, 67, 68, 69, 70, 71, 72, 72, 73, 74, 74, 75, 75, 76, 76, 77,
+ 77, 78, 78, 78, 78, 78, 68, 67, 65, 64, 63, 63, 64, 64, 65, 66, 67, 67,
+ 68, 69, 70, 71, 72, 72, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 78,
+ 78, 78, 68, 67, 65, 64, 63, 63, 64, 64, 65, 66, 67, 67, 68, 69, 70, 71,
+ 72, 72, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 78, 78, 78 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 67, 84, 97, 67, 81, 91, 99, 84, 91, 100, 105, 97, 99, 105, 108,
+ /* Size 8 */
+ 64, 55, 58, 66, 75, 82, 88, 92, 55, 59, 58, 63, 70, 78, 84, 89, 58, 58,
+ 68, 73, 78, 82, 87, 91, 66, 63, 73, 80, 84, 87, 90, 93, 75, 70, 78, 84,
+ 88, 91, 93, 95, 82, 78, 82, 87, 91, 94, 96, 97, 88, 84, 87, 90, 93, 96,
+ 97, 98, 92, 89, 91, 93, 95, 97, 98, 99,
+ /* Size 16 */
+ 64, 59, 55, 56, 58, 62, 66, 70, 75, 78, 82, 85, 88, 90, 92, 92, 59, 58,
+ 57, 58, 58, 61, 65, 68, 73, 76, 80, 83, 86, 88, 90, 90, 55, 57, 59, 59,
+ 58, 60, 63, 67, 70, 74, 78, 81, 84, 86, 89, 89, 56, 58, 59, 61, 63, 65,
+ 68, 71, 74, 77, 80, 83, 85, 87, 90, 90, 58, 58, 58, 63, 68, 71, 73, 75,
+ 78, 80, 82, 85, 87, 89, 91, 91, 62, 61, 60, 65, 71, 73, 76, 78, 81, 83,
+ 85, 87, 89, 90, 92, 92, 66, 65, 63, 68, 73, 76, 80, 82, 84, 86, 87, 89,
+ 90, 92, 93, 93, 70, 68, 67, 71, 75, 78, 82, 84, 86, 88, 89, 91, 92, 93,
+ 94, 94, 75, 73, 70, 74, 78, 81, 84, 86, 88, 90, 91, 92, 93, 94, 95, 95,
+ 78, 76, 74, 77, 80, 83, 86, 88, 90, 91, 92, 93, 94, 95, 96, 96, 82, 80,
+ 78, 80, 82, 85, 87, 89, 91, 92, 94, 95, 96, 96, 97, 97, 85, 83, 81, 83,
+ 85, 87, 89, 91, 92, 93, 95, 96, 96, 97, 98, 98, 88, 86, 84, 85, 87, 89,
+ 90, 92, 93, 94, 96, 96, 97, 98, 98, 98, 90, 88, 86, 87, 89, 90, 92, 93,
+ 94, 95, 96, 97, 98, 98, 99, 99, 92, 90, 89, 90, 91, 92, 93, 94, 95, 96,
+ 97, 98, 98, 99, 99, 99, 92, 90, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
+ 98, 99, 99, 99,
+ /* Size 32 */
+ 64, 62, 59, 57, 55, 56, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 75, 77,
+ 78, 80, 82, 84, 85, 86, 88, 89, 90, 91, 92, 92, 92, 92, 62, 60, 59, 58,
+ 56, 57, 57, 57, 58, 60, 61, 63, 65, 67, 69, 71, 74, 75, 77, 79, 81, 82,
+ 84, 85, 87, 88, 89, 90, 91, 91, 91, 91, 59, 59, 58, 58, 57, 57, 58, 58,
+ 58, 59, 61, 63, 65, 66, 68, 70, 73, 74, 76, 78, 80, 81, 83, 84, 86, 87,
+ 88, 89, 90, 90, 90, 90, 57, 58, 58, 58, 58, 58, 58, 58, 58, 59, 61, 62,
+ 64, 66, 67, 69, 72, 73, 75, 77, 79, 80, 82, 83, 85, 86, 87, 88, 90, 90,
+ 90, 90, 55, 56, 57, 58, 59, 59, 59, 58, 58, 59, 60, 62, 63, 65, 67, 68,
+ 70, 72, 74, 76, 78, 79, 81, 82, 84, 85, 86, 87, 89, 89, 89, 89, 56, 57,
+ 57, 58, 59, 59, 60, 60, 60, 62, 63, 64, 65, 67, 69, 70, 72, 74, 75, 77,
+ 79, 80, 82, 83, 85, 86, 87, 88, 89, 89, 89, 89, 56, 57, 58, 58, 59, 60,
+ 61, 62, 63, 64, 65, 66, 68, 69, 71, 72, 74, 75, 77, 78, 80, 81, 83, 84,
+ 85, 86, 87, 89, 90, 90, 90, 90, 57, 57, 58, 58, 58, 60, 62, 64, 66, 67,
+ 68, 69, 70, 72, 73, 74, 76, 77, 78, 80, 81, 82, 84, 85, 86, 87, 88, 89,
+ 90, 90, 90, 90, 58, 58, 58, 58, 58, 60, 63, 66, 68, 70, 71, 72, 73, 74,
+ 75, 76, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, 91, 91, 91,
+ 60, 60, 59, 59, 59, 62, 64, 67, 70, 71, 72, 73, 75, 76, 77, 78, 79, 80,
+ 81, 82, 84, 85, 86, 87, 88, 89, 89, 90, 91, 91, 91, 91, 62, 61, 61, 61,
+ 60, 63, 65, 68, 71, 72, 73, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86,
+ 87, 88, 89, 89, 90, 91, 92, 92, 92, 92, 64, 63, 63, 62, 62, 64, 66, 69,
+ 72, 73, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 89, 90,
+ 91, 92, 92, 92, 92, 92, 66, 65, 65, 64, 63, 65, 68, 70, 73, 75, 76, 78,
+ 80, 81, 82, 83, 84, 85, 86, 87, 87, 88, 89, 90, 90, 91, 92, 92, 93, 93,
+ 93, 93, 68, 67, 66, 66, 65, 67, 69, 72, 74, 76, 77, 79, 81, 82, 83, 84,
+ 85, 86, 87, 87, 88, 89, 90, 90, 91, 92, 92, 93, 94, 94, 94, 94, 70, 69,
+ 68, 67, 67, 69, 71, 73, 75, 77, 78, 80, 82, 83, 84, 85, 86, 87, 88, 88,
+ 89, 90, 91, 91, 92, 92, 93, 94, 94, 94, 94, 94, 72, 71, 70, 69, 68, 70,
+ 72, 74, 76, 78, 80, 81, 83, 84, 85, 86, 87, 88, 89, 89, 90, 91, 91, 92,
+ 93, 93, 94, 94, 95, 95, 95, 95, 75, 74, 73, 72, 70, 72, 74, 76, 78, 79,
+ 81, 82, 84, 85, 86, 87, 88, 89, 90, 90, 91, 92, 92, 93, 93, 94, 94, 95,
+ 95, 95, 95, 95, 77, 75, 74, 73, 72, 74, 75, 77, 79, 80, 82, 83, 85, 86,
+ 87, 88, 89, 90, 90, 91, 92, 92, 93, 93, 94, 94, 95, 95, 96, 96, 96, 96,
+ 78, 77, 76, 75, 74, 75, 77, 78, 80, 81, 83, 84, 86, 87, 88, 89, 90, 90,
+ 91, 92, 92, 93, 93, 94, 94, 95, 95, 96, 96, 96, 96, 96, 80, 79, 78, 77,
+ 76, 77, 78, 80, 81, 82, 84, 85, 87, 87, 88, 89, 90, 91, 92, 92, 93, 94,
+ 94, 95, 95, 95, 96, 96, 97, 97, 97, 97, 82, 81, 80, 79, 78, 79, 80, 81,
+ 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 92, 93, 94, 94, 95, 95, 96, 96,
+ 96, 97, 97, 97, 97, 97, 84, 82, 81, 80, 79, 80, 81, 82, 83, 85, 86, 87,
+ 88, 89, 90, 91, 92, 92, 93, 94, 94, 95, 95, 96, 96, 96, 97, 97, 97, 97,
+ 97, 97, 85, 84, 83, 82, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 91,
+ 92, 93, 93, 94, 95, 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 98, 86, 85,
+ 84, 83, 82, 83, 84, 85, 86, 87, 88, 89, 90, 90, 91, 92, 93, 93, 94, 95,
+ 95, 96, 96, 96, 97, 97, 97, 98, 98, 98, 98, 98, 88, 87, 86, 85, 84, 85,
+ 85, 86, 87, 88, 89, 89, 90, 91, 92, 93, 93, 94, 94, 95, 96, 96, 96, 97,
+ 97, 97, 98, 98, 98, 98, 98, 98, 89, 88, 87, 86, 85, 86, 86, 87, 88, 89,
+ 89, 90, 91, 92, 92, 93, 94, 94, 95, 95, 96, 96, 97, 97, 97, 98, 98, 98,
+ 99, 99, 99, 99, 90, 89, 88, 87, 86, 87, 87, 88, 89, 89, 90, 91, 92, 92,
+ 93, 94, 94, 95, 95, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 99, 99,
+ 91, 90, 89, 88, 87, 88, 89, 89, 90, 90, 91, 92, 92, 93, 94, 94, 95, 95,
+ 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 99, 99, 99, 92, 91, 90, 90,
+ 89, 89, 90, 90, 91, 91, 92, 92, 93, 94, 94, 95, 95, 96, 96, 97, 97, 97,
+ 98, 98, 98, 99, 99, 99, 99, 99, 99, 99, 92, 91, 90, 90, 89, 89, 90, 90,
+ 91, 91, 92, 92, 93, 94, 94, 95, 95, 96, 96, 97, 97, 97, 98, 98, 98, 99,
+ 99, 99, 99, 99, 99, 99, 92, 91, 90, 90, 89, 89, 90, 90, 91, 91, 92, 92,
+ 93, 94, 94, 95, 95, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 99, 99,
+ 99, 99, 92, 91, 90, 90, 89, 89, 90, 90, 91, 91, 92, 92, 93, 94, 94, 95,
+ 95, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 99, 99, 99, 99 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 44, 46, 58, 68, 46, 56, 64, 70, 58, 64, 71, 74, 68, 70, 74, 77,
+ /* Size 8 */
+ 49, 42, 44, 50, 57, 63, 68, 71, 42, 45, 44, 48, 54, 60, 65, 69, 44, 44,
+ 52, 56, 60, 63, 67, 70, 50, 48, 56, 61, 65, 68, 70, 72, 57, 54, 60, 65,
+ 68, 71, 73, 74, 63, 60, 63, 68, 71, 73, 74, 76, 68, 65, 67, 70, 73, 74,
+ 76, 77, 71, 69, 70, 72, 74, 76, 77, 78,
+ /* Size 16 */
+ 48, 44, 41, 42, 43, 46, 49, 53, 56, 59, 62, 65, 67, 69, 70, 70, 44, 43,
+ 43, 43, 43, 45, 48, 51, 55, 57, 61, 63, 65, 67, 69, 69, 41, 43, 44, 44,
+ 43, 45, 47, 50, 53, 56, 59, 61, 64, 66, 68, 68, 42, 43, 44, 45, 47, 49,
+ 51, 53, 56, 58, 61, 63, 65, 67, 68, 68, 43, 43, 43, 47, 51, 53, 55, 57,
+ 59, 61, 63, 64, 66, 68, 69, 69, 46, 45, 45, 49, 53, 55, 58, 59, 61, 63,
+ 64, 66, 68, 69, 70, 70, 49, 48, 47, 51, 55, 58, 61, 62, 64, 65, 67, 68,
+ 69, 70, 71, 71, 53, 51, 50, 53, 57, 59, 62, 64, 66, 67, 68, 69, 70, 71,
+ 72, 72, 56, 55, 53, 56, 59, 61, 64, 66, 67, 69, 70, 71, 72, 72, 73, 73,
+ 59, 57, 56, 58, 61, 63, 65, 67, 69, 70, 71, 72, 72, 73, 74, 74, 62, 61,
+ 59, 61, 63, 64, 67, 68, 70, 71, 72, 73, 73, 74, 75, 75, 65, 63, 61, 63,
+ 64, 66, 68, 69, 71, 72, 73, 73, 74, 75, 75, 75, 67, 65, 64, 65, 66, 68,
+ 69, 70, 72, 72, 73, 74, 75, 75, 76, 76, 69, 67, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 75, 76, 76, 76, 70, 69, 68, 68, 69, 70, 71, 72, 73, 74,
+ 75, 75, 76, 76, 76, 76, 70, 69, 68, 68, 69, 70, 71, 72, 73, 74, 75, 75,
+ 76, 76, 76, 76,
+ /* Size 32 */
+ 47, 46, 44, 42, 41, 41, 42, 42, 43, 44, 46, 47, 49, 51, 52, 54, 56, 57,
+ 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 70, 70, 70, 46, 44, 43, 42,
+ 41, 42, 42, 42, 43, 44, 45, 47, 48, 50, 52, 53, 55, 56, 58, 59, 61, 62,
+ 63, 64, 66, 66, 67, 68, 69, 69, 69, 69, 44, 43, 43, 43, 42, 42, 42, 43,
+ 43, 44, 45, 46, 48, 49, 51, 52, 54, 56, 57, 58, 60, 61, 62, 64, 65, 66,
+ 67, 68, 68, 68, 68, 68, 42, 42, 43, 43, 43, 43, 43, 43, 43, 44, 45, 46,
+ 47, 49, 50, 52, 53, 55, 56, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 68,
+ 68, 68, 41, 41, 42, 43, 44, 44, 43, 43, 43, 44, 45, 46, 47, 48, 49, 51,
+ 53, 54, 55, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 67, 67, 67, 41, 42,
+ 42, 43, 44, 44, 44, 44, 45, 46, 46, 47, 48, 50, 51, 52, 54, 55, 56, 58,
+ 59, 60, 61, 63, 64, 65, 66, 67, 68, 68, 68, 68, 42, 42, 42, 43, 43, 44,
+ 45, 46, 47, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 68, 68, 68, 42, 42, 43, 43, 43, 44, 46, 47, 49, 50,
+ 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 67,
+ 68, 68, 68, 68, 43, 43, 43, 43, 43, 45, 47, 49, 51, 52, 53, 54, 55, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 66, 67, 68, 69, 69, 69, 69,
+ 44, 44, 44, 44, 44, 46, 47, 50, 52, 53, 54, 55, 56, 57, 58, 58, 59, 60,
+ 61, 62, 63, 64, 65, 65, 66, 67, 68, 68, 69, 69, 69, 69, 46, 45, 45, 45,
+ 45, 46, 48, 50, 53, 54, 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65,
+ 65, 66, 67, 68, 68, 69, 70, 70, 70, 70, 47, 47, 46, 46, 46, 47, 49, 51,
+ 54, 55, 56, 57, 59, 59, 60, 61, 62, 63, 63, 64, 65, 66, 66, 67, 68, 68,
+ 69, 70, 70, 70, 70, 70, 49, 48, 48, 47, 47, 48, 50, 52, 55, 56, 57, 59,
+ 60, 61, 62, 63, 63, 64, 65, 65, 66, 67, 67, 68, 69, 69, 70, 70, 71, 71,
+ 71, 71, 51, 50, 49, 49, 48, 50, 51, 53, 55, 57, 58, 59, 61, 62, 63, 63,
+ 64, 65, 65, 66, 67, 67, 68, 69, 69, 70, 70, 71, 71, 71, 71, 71, 52, 52,
+ 51, 50, 49, 51, 53, 54, 56, 58, 59, 60, 62, 63, 63, 64, 65, 66, 66, 67,
+ 68, 68, 69, 69, 70, 70, 71, 71, 72, 72, 72, 72, 54, 53, 52, 52, 51, 52,
+ 54, 56, 57, 58, 60, 61, 63, 63, 64, 65, 66, 67, 67, 68, 68, 69, 69, 70,
+ 70, 71, 71, 72, 72, 72, 72, 72, 56, 55, 54, 53, 53, 54, 55, 57, 58, 59,
+ 61, 62, 63, 64, 65, 66, 67, 67, 68, 69, 69, 70, 70, 71, 71, 71, 72, 72,
+ 72, 72, 72, 72, 57, 56, 56, 55, 54, 55, 56, 58, 59, 60, 61, 63, 64, 65,
+ 66, 67, 67, 68, 69, 69, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 73,
+ 59, 58, 57, 56, 55, 56, 58, 59, 60, 61, 62, 63, 65, 65, 66, 67, 68, 69,
+ 69, 70, 70, 71, 71, 71, 72, 72, 73, 73, 73, 73, 73, 73, 60, 59, 58, 58,
+ 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 69, 70, 70, 71, 71,
+ 72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 62, 61, 60, 59, 58, 59, 60, 61,
+ 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 70, 71, 71, 72, 72, 72, 73, 73,
+ 73, 74, 74, 74, 74, 74, 63, 62, 61, 60, 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 67, 68, 69, 70, 70, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 74,
+ 74, 74, 64, 63, 62, 61, 61, 61, 62, 63, 64, 65, 65, 66, 67, 68, 69, 69,
+ 70, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75, 75, 75, 75, 65, 64,
+ 64, 63, 62, 63, 63, 64, 65, 65, 66, 67, 68, 69, 69, 70, 71, 71, 71, 72,
+ 72, 73, 73, 73, 74, 74, 74, 75, 75, 75, 75, 75, 66, 66, 65, 64, 63, 64,
+ 64, 65, 66, 66, 67, 68, 69, 69, 70, 70, 71, 71, 72, 72, 73, 73, 73, 74,
+ 74, 74, 75, 75, 75, 75, 75, 75, 67, 66, 66, 65, 64, 65, 65, 66, 66, 67,
+ 68, 68, 69, 70, 70, 71, 71, 72, 72, 73, 73, 73, 74, 74, 74, 75, 75, 75,
+ 75, 75, 75, 75, 68, 67, 67, 66, 65, 66, 66, 67, 67, 68, 68, 69, 70, 70,
+ 71, 71, 72, 72, 73, 73, 73, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75,
+ 69, 68, 68, 67, 66, 67, 67, 67, 68, 68, 69, 70, 70, 71, 71, 72, 72, 72,
+ 73, 73, 74, 74, 74, 75, 75, 75, 75, 75, 76, 76, 76, 76, 70, 69, 68, 68,
+ 67, 68, 68, 68, 69, 69, 70, 70, 71, 71, 72, 72, 72, 73, 73, 74, 74, 74,
+ 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 70, 69, 68, 68, 67, 68, 68, 68,
+ 69, 69, 70, 70, 71, 71, 72, 72, 72, 73, 73, 74, 74, 74, 75, 75, 75, 75,
+ 75, 76, 76, 76, 76, 76, 70, 69, 68, 68, 67, 68, 68, 68, 69, 69, 70, 70,
+ 71, 71, 72, 72, 72, 73, 73, 74, 74, 74, 75, 75, 75, 75, 75, 76, 76, 76,
+ 76, 76, 70, 69, 68, 68, 67, 68, 68, 68, 69, 69, 70, 70, 71, 71, 72, 72,
+ 72, 73, 73, 74, 74, 74, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 80, 83, 90, 80, 86, 88, 92, 83, 88, 96, 99, 90, 92, 99, 104,
+ /* Size 8 */
+ 64, 57, 77, 78, 80, 84, 88, 91, 57, 69, 77, 74, 75, 78, 82, 86, 77, 77,
+ 82, 81, 81, 82, 85, 88, 78, 74, 81, 85, 86, 87, 89, 91, 80, 75, 81, 86,
+ 89, 91, 93, 95, 84, 78, 82, 87, 91, 94, 96, 97, 88, 82, 85, 89, 93, 96,
+ 98, 99, 91, 86, 88, 91, 95, 97, 99, 101,
+ /* Size 16 */
+ 64, 60, 57, 65, 77, 77, 78, 79, 80, 82, 84, 86, 88, 89, 91, 91, 60, 61,
+ 62, 69, 77, 76, 76, 77, 78, 79, 81, 83, 85, 86, 88, 88, 57, 62, 69, 73,
+ 77, 75, 74, 74, 75, 76, 78, 80, 82, 84, 86, 86, 65, 69, 73, 76, 80, 78,
+ 77, 78, 78, 79, 80, 82, 83, 85, 87, 87, 77, 77, 77, 80, 82, 82, 81, 81,
+ 81, 82, 82, 84, 85, 87, 88, 88, 77, 76, 75, 78, 82, 82, 83, 83, 83, 84,
+ 85, 86, 87, 88, 90, 90, 78, 76, 74, 77, 81, 83, 85, 85, 86, 87, 87, 88,
+ 89, 90, 91, 91, 79, 77, 74, 78, 81, 83, 85, 87, 88, 89, 89, 90, 91, 92,
+ 93, 93, 80, 78, 75, 78, 81, 83, 86, 88, 89, 90, 91, 92, 93, 94, 95, 95,
+ 82, 79, 76, 79, 82, 84, 87, 89, 90, 92, 93, 94, 94, 95, 96, 96, 84, 81,
+ 78, 80, 82, 85, 87, 89, 91, 93, 94, 95, 96, 97, 97, 97, 86, 83, 80, 82,
+ 84, 86, 88, 90, 92, 94, 95, 96, 97, 98, 98, 98, 88, 85, 82, 83, 85, 87,
+ 89, 91, 93, 94, 96, 97, 98, 99, 99, 99, 89, 86, 84, 85, 87, 88, 90, 92,
+ 94, 95, 97, 98, 99, 99, 100, 100, 91, 88, 86, 87, 88, 90, 91, 93, 95,
+ 96, 97, 98, 99, 100, 101, 101, 91, 88, 86, 87, 88, 90, 91, 93, 95, 96,
+ 97, 98, 99, 100, 101, 101,
+ /* Size 32 */
+ 64, 62, 60, 59, 57, 61, 65, 70, 77, 77, 77, 78, 78, 79, 79, 80, 80, 81,
+ 82, 83, 84, 85, 86, 87, 88, 88, 89, 90, 91, 91, 91, 91, 62, 61, 61, 60,
+ 60, 63, 67, 72, 77, 77, 77, 77, 77, 78, 78, 79, 79, 80, 81, 81, 82, 83,
+ 84, 85, 86, 87, 88, 89, 90, 90, 90, 90, 60, 61, 61, 62, 62, 65, 69, 73,
+ 77, 77, 76, 76, 76, 76, 77, 77, 78, 78, 79, 80, 81, 82, 83, 84, 85, 85,
+ 86, 87, 88, 88, 88, 88, 59, 60, 62, 64, 66, 68, 71, 74, 77, 76, 76, 75,
+ 75, 75, 76, 76, 76, 77, 78, 79, 79, 80, 81, 82, 83, 84, 85, 86, 87, 87,
+ 87, 87, 57, 60, 62, 66, 69, 71, 73, 75, 77, 76, 75, 75, 74, 74, 74, 75,
+ 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, 86, 86, 61, 63,
+ 65, 68, 71, 73, 74, 76, 78, 78, 77, 76, 76, 76, 76, 76, 76, 77, 78, 78,
+ 79, 80, 81, 82, 82, 83, 84, 85, 86, 86, 86, 86, 65, 67, 69, 71, 73, 74,
+ 76, 78, 80, 79, 78, 78, 77, 77, 78, 78, 78, 78, 79, 80, 80, 81, 82, 82,
+ 83, 84, 85, 86, 87, 87, 87, 87, 70, 72, 73, 74, 75, 76, 78, 79, 81, 80,
+ 80, 80, 79, 79, 79, 79, 79, 80, 80, 81, 81, 82, 83, 83, 84, 85, 86, 87,
+ 87, 87, 87, 87, 77, 77, 77, 77, 77, 78, 80, 81, 82, 82, 82, 81, 81, 81,
+ 81, 81, 81, 81, 82, 82, 82, 83, 84, 84, 85, 86, 87, 87, 88, 88, 88, 88,
+ 77, 77, 77, 76, 76, 78, 79, 80, 82, 82, 82, 82, 82, 82, 82, 82, 82, 83,
+ 83, 83, 84, 84, 85, 85, 86, 87, 87, 88, 89, 89, 89, 89, 77, 77, 76, 76,
+ 75, 77, 78, 80, 82, 82, 82, 83, 83, 83, 83, 83, 83, 84, 84, 85, 85, 85,
+ 86, 87, 87, 88, 88, 89, 90, 90, 90, 90, 78, 77, 76, 75, 75, 76, 78, 80,
+ 81, 82, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 88, 88, 89,
+ 89, 90, 90, 90, 90, 90, 78, 77, 76, 75, 74, 76, 77, 79, 81, 82, 83, 84,
+ 85, 85, 85, 86, 86, 87, 87, 87, 87, 88, 88, 89, 89, 90, 90, 91, 91, 91,
+ 91, 91, 79, 78, 76, 75, 74, 76, 77, 79, 81, 82, 83, 84, 85, 86, 86, 87,
+ 87, 87, 88, 88, 88, 89, 89, 90, 90, 91, 91, 92, 92, 92, 92, 92, 79, 78,
+ 77, 76, 74, 76, 78, 79, 81, 82, 83, 84, 85, 86, 87, 87, 88, 88, 89, 89,
+ 89, 90, 90, 91, 91, 92, 92, 92, 93, 93, 93, 93, 80, 79, 77, 76, 75, 76,
+ 78, 79, 81, 82, 83, 85, 86, 87, 87, 88, 89, 89, 89, 90, 90, 91, 91, 92,
+ 92, 92, 93, 93, 94, 94, 94, 94, 80, 79, 78, 76, 75, 76, 78, 79, 81, 82,
+ 83, 85, 86, 87, 88, 89, 89, 90, 90, 91, 91, 92, 92, 93, 93, 93, 94, 94,
+ 95, 95, 95, 95, 81, 80, 78, 77, 76, 77, 78, 80, 81, 83, 84, 85, 87, 87,
+ 88, 89, 90, 90, 91, 91, 92, 92, 93, 93, 94, 94, 94, 95, 95, 95, 95, 95,
+ 82, 81, 79, 78, 76, 78, 79, 80, 82, 83, 84, 86, 87, 88, 89, 89, 90, 91,
+ 92, 92, 93, 93, 94, 94, 94, 95, 95, 96, 96, 96, 96, 96, 83, 81, 80, 79,
+ 77, 78, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, 91, 92, 93, 93, 94,
+ 94, 95, 95, 95, 96, 96, 97, 97, 97, 97, 84, 82, 81, 79, 78, 79, 80, 81,
+ 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 93, 94, 94, 95, 95, 96, 96,
+ 97, 97, 97, 97, 97, 97, 85, 83, 82, 80, 79, 80, 81, 82, 83, 84, 85, 87,
+ 88, 89, 90, 91, 92, 92, 93, 94, 94, 95, 95, 96, 96, 97, 97, 97, 98, 98,
+ 98, 98, 86, 84, 83, 81, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
+ 92, 93, 94, 94, 95, 95, 96, 96, 97, 97, 98, 98, 98, 98, 98, 98, 87, 85,
+ 84, 82, 81, 82, 82, 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 93, 94, 95,
+ 95, 96, 96, 97, 97, 98, 98, 98, 99, 99, 99, 99, 88, 86, 85, 83, 82, 82,
+ 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 96, 97, 97,
+ 98, 98, 99, 99, 99, 99, 99, 99, 88, 87, 85, 84, 83, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 92, 93, 94, 95, 95, 96, 97, 97, 98, 98, 99, 99, 99,
+ 100, 100, 100, 100, 89, 88, 86, 85, 84, 84, 85, 86, 87, 87, 88, 89, 90,
+ 91, 92, 93, 94, 94, 95, 96, 97, 97, 98, 98, 99, 99, 99, 100, 100, 100,
+ 100, 100, 90, 89, 87, 86, 85, 85, 86, 87, 87, 88, 89, 90, 91, 92, 92,
+ 93, 94, 95, 96, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 101,
+ 101, 91, 90, 88, 87, 86, 86, 87, 87, 88, 89, 90, 90, 91, 92, 93, 94, 95,
+ 95, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 101, 101, 101, 91,
+ 90, 88, 87, 86, 86, 87, 87, 88, 89, 90, 90, 91, 92, 93, 94, 95, 95, 96,
+ 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 101, 101, 101, 91, 90, 88,
+ 87, 86, 86, 87, 87, 88, 89, 90, 90, 91, 92, 93, 94, 95, 95, 96, 97, 97,
+ 98, 98, 99, 99, 100, 100, 101, 101, 101, 101, 101, 91, 90, 88, 87, 86,
+ 86, 87, 87, 88, 89, 90, 90, 91, 92, 93, 94, 95, 95, 96, 97, 97, 98, 98,
+ 99, 99, 100, 100, 101, 101, 101, 101, 101 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 45, 57, 59, 65, 57, 62, 63, 67, 59, 63, 69, 72, 65, 67, 72, 76,
+ /* Size 8 */
+ 47, 42, 57, 59, 60, 63, 66, 69, 42, 51, 57, 55, 56, 58, 61, 64, 57, 57,
+ 61, 61, 61, 62, 64, 66, 59, 55, 61, 64, 65, 66, 67, 69, 60, 56, 61, 65,
+ 67, 69, 70, 71, 63, 58, 62, 66, 69, 71, 72, 74, 66, 61, 64, 67, 70, 72,
+ 74, 75, 69, 64, 66, 69, 71, 74, 75, 77,
+ /* Size 16 */
+ 47, 44, 41, 48, 56, 57, 58, 59, 60, 61, 62, 64, 65, 67, 68, 68, 44, 45,
+ 46, 50, 57, 56, 56, 57, 57, 58, 60, 61, 63, 64, 66, 66, 41, 46, 51, 54,
+ 57, 56, 54, 55, 55, 56, 57, 59, 60, 62, 63, 63, 48, 50, 54, 56, 59, 58,
+ 57, 57, 57, 58, 59, 60, 62, 63, 64, 64, 56, 57, 57, 59, 61, 60, 60, 60,
+ 60, 60, 61, 62, 63, 64, 65, 65, 57, 56, 56, 58, 60, 61, 61, 62, 62, 62,
+ 63, 64, 65, 66, 67, 67, 58, 56, 54, 57, 60, 61, 63, 63, 64, 65, 65, 66,
+ 66, 67, 68, 68, 59, 57, 55, 57, 60, 62, 63, 64, 65, 66, 67, 67, 68, 69,
+ 69, 69, 60, 57, 55, 57, 60, 62, 64, 65, 67, 67, 68, 69, 69, 70, 71, 71,
+ 61, 58, 56, 58, 60, 62, 65, 66, 67, 68, 69, 70, 71, 71, 72, 72, 62, 60,
+ 57, 59, 61, 63, 65, 67, 68, 69, 70, 71, 72, 72, 73, 73, 64, 61, 59, 60,
+ 62, 64, 66, 67, 69, 70, 71, 72, 73, 73, 74, 74, 65, 63, 60, 62, 63, 65,
+ 66, 68, 69, 71, 72, 73, 73, 74, 75, 75, 67, 64, 62, 63, 64, 66, 67, 69,
+ 70, 71, 72, 73, 74, 75, 75, 75, 68, 66, 63, 64, 65, 67, 68, 69, 71, 72,
+ 73, 74, 75, 75, 76, 76, 68, 66, 63, 64, 65, 67, 68, 69, 71, 72, 73, 74,
+ 75, 75, 76, 76,
+ /* Size 32 */
+ 46, 45, 44, 42, 41, 44, 47, 51, 56, 56, 57, 57, 58, 58, 58, 59, 59, 60,
+ 61, 61, 62, 63, 63, 64, 65, 65, 66, 67, 68, 68, 68, 68, 45, 45, 44, 44,
+ 43, 46, 49, 52, 56, 56, 56, 57, 57, 57, 57, 58, 58, 59, 59, 60, 61, 61,
+ 62, 63, 64, 64, 65, 66, 66, 66, 66, 66, 44, 44, 44, 45, 45, 48, 50, 53,
+ 56, 56, 56, 56, 56, 56, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 62, 63,
+ 64, 65, 65, 65, 65, 65, 42, 44, 45, 46, 48, 50, 52, 54, 56, 56, 56, 55,
+ 55, 55, 55, 56, 56, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63, 63, 64, 64,
+ 64, 64, 41, 43, 45, 48, 50, 52, 53, 55, 57, 56, 55, 55, 54, 54, 54, 55,
+ 55, 55, 56, 57, 57, 58, 59, 59, 60, 61, 62, 62, 63, 63, 63, 63, 44, 46,
+ 48, 50, 52, 53, 54, 56, 57, 57, 56, 56, 55, 56, 56, 56, 56, 56, 57, 57,
+ 58, 59, 59, 60, 61, 61, 62, 63, 64, 64, 64, 64, 47, 49, 50, 52, 53, 54,
+ 56, 57, 58, 58, 58, 57, 57, 57, 57, 57, 57, 58, 58, 58, 59, 60, 60, 61,
+ 61, 62, 63, 63, 64, 64, 64, 64, 51, 52, 53, 54, 55, 56, 57, 58, 59, 59,
+ 59, 58, 58, 58, 58, 58, 58, 59, 59, 59, 60, 60, 61, 62, 62, 63, 63, 64,
+ 65, 65, 65, 65, 56, 56, 56, 56, 57, 57, 58, 59, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 65, 65, 65,
+ 56, 56, 56, 56, 56, 57, 58, 59, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61,
+ 61, 61, 62, 62, 63, 63, 64, 64, 65, 65, 66, 66, 66, 66, 57, 56, 56, 56,
+ 55, 56, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63,
+ 64, 64, 64, 65, 65, 66, 66, 66, 66, 66, 57, 57, 56, 55, 55, 56, 57, 58,
+ 60, 60, 61, 61, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 65, 65, 66,
+ 66, 67, 67, 67, 67, 67, 58, 57, 56, 55, 54, 55, 57, 58, 60, 60, 61, 62,
+ 63, 63, 63, 63, 64, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 68, 68,
+ 68, 68, 58, 57, 56, 55, 54, 56, 57, 58, 60, 60, 61, 62, 63, 63, 64, 64,
+ 64, 65, 65, 65, 65, 66, 66, 66, 67, 67, 68, 68, 68, 68, 68, 68, 58, 57,
+ 56, 55, 54, 56, 57, 58, 60, 60, 61, 62, 63, 64, 64, 64, 65, 65, 66, 66,
+ 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 69, 69, 59, 58, 57, 56, 55, 56,
+ 57, 58, 60, 60, 61, 62, 63, 64, 64, 65, 66, 66, 66, 67, 67, 67, 68, 68,
+ 68, 69, 69, 69, 70, 70, 70, 70, 59, 58, 57, 56, 55, 56, 57, 58, 60, 61,
+ 62, 63, 64, 64, 65, 66, 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 70, 70,
+ 70, 70, 70, 70, 60, 59, 58, 56, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65,
+ 65, 66, 67, 67, 67, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71, 71,
+ 61, 59, 58, 57, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 66, 67, 67,
+ 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71, 71, 71, 61, 60, 59, 58,
+ 57, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 68, 69, 69, 70,
+ 70, 70, 71, 71, 71, 72, 72, 72, 72, 72, 62, 61, 59, 58, 57, 58, 59, 60,
+ 61, 62, 63, 64, 65, 65, 66, 67, 68, 68, 69, 69, 70, 70, 71, 71, 71, 72,
+ 72, 72, 72, 72, 72, 72, 63, 61, 60, 59, 58, 59, 60, 60, 61, 62, 63, 64,
+ 65, 66, 67, 67, 68, 69, 69, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73,
+ 73, 73, 63, 62, 61, 60, 59, 59, 60, 61, 62, 63, 64, 64, 65, 66, 67, 68,
+ 68, 69, 69, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 73, 73, 73, 64, 63,
+ 62, 60, 59, 60, 61, 62, 62, 63, 64, 65, 66, 66, 67, 68, 69, 69, 70, 70,
+ 71, 71, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 65, 64, 62, 61, 60, 61,
+ 61, 62, 63, 64, 64, 65, 66, 67, 68, 68, 69, 70, 70, 71, 71, 72, 72, 73,
+ 73, 73, 74, 74, 74, 74, 74, 74, 65, 64, 63, 62, 61, 61, 62, 63, 63, 64,
+ 65, 66, 66, 67, 68, 69, 69, 70, 70, 71, 72, 72, 72, 73, 73, 74, 74, 74,
+ 74, 74, 74, 74, 66, 65, 64, 63, 62, 62, 63, 63, 64, 65, 65, 66, 67, 68,
+ 68, 69, 70, 70, 71, 71, 72, 72, 73, 73, 74, 74, 74, 74, 75, 75, 75, 75,
+ 67, 66, 65, 63, 62, 63, 63, 64, 65, 65, 66, 67, 67, 68, 69, 69, 70, 71,
+ 71, 72, 72, 73, 73, 73, 74, 74, 74, 75, 75, 75, 75, 75, 68, 66, 65, 64,
+ 63, 64, 64, 65, 65, 66, 66, 67, 68, 68, 69, 70, 70, 71, 71, 72, 72, 73,
+ 73, 74, 74, 74, 75, 75, 75, 75, 75, 75, 68, 66, 65, 64, 63, 64, 64, 65,
+ 65, 66, 66, 67, 68, 68, 69, 70, 70, 71, 71, 72, 72, 73, 73, 74, 74, 74,
+ 75, 75, 75, 75, 75, 75, 68, 66, 65, 64, 63, 64, 64, 65, 65, 66, 66, 67,
+ 68, 68, 69, 70, 70, 71, 71, 72, 72, 73, 73, 74, 74, 74, 75, 75, 75, 75,
+ 75, 75, 68, 66, 65, 64, 63, 64, 64, 65, 65, 66, 66, 67, 68, 68, 69, 70,
+ 70, 71, 71, 72, 72, 73, 73, 74, 74, 74, 75, 75, 75, 75, 75, 75 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 67, 80, 89, 67, 77, 85, 91, 80, 85, 92, 95, 89, 91, 95, 97,
+ /* Size 8 */
+ 64, 57, 59, 66, 73, 78, 82, 85, 57, 60, 59, 63, 69, 75, 80, 83, 59, 59,
+ 68, 71, 75, 78, 82, 84, 66, 63, 71, 77, 80, 82, 84, 86, 73, 69, 75, 80,
+ 83, 85, 86, 88, 78, 75, 78, 82, 85, 87, 88, 89, 82, 80, 82, 84, 86, 88,
+ 89, 90, 85, 83, 84, 86, 88, 89, 90, 91,
+ /* Size 16 */
+ 64, 60, 57, 58, 59, 62, 66, 69, 73, 75, 78, 80, 82, 84, 85, 85, 60, 59,
+ 58, 59, 59, 62, 64, 68, 71, 74, 77, 79, 81, 83, 84, 84, 57, 58, 60, 60,
+ 59, 61, 63, 66, 69, 72, 75, 77, 80, 81, 83, 83, 58, 59, 60, 61, 63, 65,
+ 67, 69, 72, 74, 77, 79, 81, 82, 84, 84, 59, 59, 59, 63, 68, 69, 71, 73,
+ 75, 77, 78, 80, 82, 83, 84, 84, 62, 62, 61, 65, 69, 72, 74, 75, 77, 79,
+ 80, 82, 83, 84, 85, 85, 66, 64, 63, 67, 71, 74, 77, 78, 80, 81, 82, 83,
+ 84, 85, 86, 86, 69, 68, 66, 69, 73, 75, 78, 80, 81, 82, 83, 84, 85, 86,
+ 87, 87, 73, 71, 69, 72, 75, 77, 80, 81, 83, 84, 85, 86, 86, 87, 88, 88,
+ 75, 74, 72, 74, 77, 79, 81, 82, 84, 85, 86, 86, 87, 88, 88, 88, 78, 77,
+ 75, 77, 78, 80, 82, 83, 85, 86, 87, 87, 88, 88, 89, 89, 80, 79, 77, 79,
+ 80, 82, 83, 84, 86, 86, 87, 88, 89, 89, 89, 89, 82, 81, 80, 81, 82, 83,
+ 84, 85, 86, 87, 88, 89, 89, 89, 90, 90, 84, 83, 81, 82, 83, 84, 85, 86,
+ 87, 88, 88, 89, 89, 90, 90, 90, 85, 84, 83, 84, 84, 85, 86, 87, 88, 88,
+ 89, 89, 90, 90, 91, 91, 85, 84, 83, 84, 84, 85, 86, 87, 88, 88, 89, 89,
+ 90, 90, 91, 91,
+ /* Size 32 */
+ 64, 62, 60, 58, 57, 57, 58, 58, 59, 60, 62, 64, 66, 67, 69, 71, 73, 74,
+ 75, 77, 78, 79, 80, 81, 82, 83, 84, 85, 85, 85, 85, 85, 62, 61, 60, 59,
+ 57, 58, 58, 58, 59, 60, 62, 63, 65, 67, 68, 70, 72, 73, 75, 76, 77, 78,
+ 79, 81, 82, 82, 83, 84, 85, 85, 85, 85, 60, 60, 59, 59, 58, 58, 59, 59,
+ 59, 60, 62, 63, 64, 66, 68, 69, 71, 72, 74, 75, 77, 78, 79, 80, 81, 82,
+ 83, 83, 84, 84, 84, 84, 58, 59, 59, 59, 59, 59, 59, 59, 59, 60, 61, 63,
+ 64, 65, 67, 68, 70, 71, 73, 74, 76, 77, 78, 79, 80, 81, 82, 83, 84, 84,
+ 84, 84, 57, 57, 58, 59, 60, 60, 60, 59, 59, 60, 61, 62, 63, 65, 66, 68,
+ 69, 71, 72, 73, 75, 76, 77, 78, 80, 80, 81, 82, 83, 83, 83, 83, 57, 58,
+ 58, 59, 60, 60, 60, 61, 61, 62, 63, 64, 65, 66, 68, 69, 71, 72, 73, 74,
+ 76, 77, 78, 79, 80, 81, 82, 83, 83, 83, 83, 83, 58, 58, 59, 59, 60, 60,
+ 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 77, 78, 79, 80,
+ 81, 81, 82, 83, 84, 84, 84, 84, 58, 58, 59, 59, 59, 61, 62, 64, 65, 66,
+ 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 78, 79, 80, 81, 82, 83, 83,
+ 84, 84, 84, 84, 59, 59, 59, 59, 59, 61, 63, 65, 68, 69, 69, 70, 71, 72,
+ 73, 74, 75, 76, 77, 77, 78, 79, 80, 81, 82, 82, 83, 84, 84, 84, 84, 84,
+ 60, 60, 60, 60, 60, 62, 64, 66, 69, 69, 70, 71, 73, 73, 74, 75, 76, 77,
+ 78, 78, 79, 80, 81, 82, 82, 83, 84, 84, 85, 85, 85, 85, 62, 62, 62, 61,
+ 61, 63, 65, 67, 69, 70, 72, 73, 74, 75, 75, 76, 77, 78, 79, 79, 80, 81,
+ 82, 82, 83, 84, 84, 85, 85, 85, 85, 85, 64, 63, 63, 63, 62, 64, 66, 68,
+ 70, 71, 73, 74, 75, 76, 77, 78, 78, 79, 80, 80, 81, 82, 82, 83, 84, 84,
+ 85, 85, 86, 86, 86, 86, 66, 65, 64, 64, 63, 65, 67, 69, 71, 73, 74, 75,
+ 77, 77, 78, 79, 80, 80, 81, 81, 82, 83, 83, 84, 84, 85, 85, 86, 86, 86,
+ 86, 86, 67, 67, 66, 65, 65, 66, 68, 70, 72, 73, 75, 76, 77, 78, 79, 80,
+ 80, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 86, 86, 86, 86, 69, 68,
+ 68, 67, 66, 68, 69, 71, 73, 74, 75, 77, 78, 79, 80, 80, 81, 82, 82, 83,
+ 83, 84, 84, 85, 85, 86, 86, 86, 87, 87, 87, 87, 71, 70, 69, 68, 68, 69,
+ 71, 72, 74, 75, 76, 78, 79, 80, 80, 81, 82, 83, 83, 84, 84, 85, 85, 85,
+ 86, 86, 87, 87, 87, 87, 87, 87, 73, 72, 71, 70, 69, 71, 72, 73, 75, 76,
+ 77, 78, 80, 80, 81, 82, 83, 83, 84, 84, 85, 85, 86, 86, 86, 87, 87, 87,
+ 88, 88, 88, 88, 74, 73, 72, 71, 71, 72, 73, 74, 76, 77, 78, 79, 80, 81,
+ 82, 83, 83, 84, 84, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 88, 88,
+ 75, 75, 74, 73, 72, 73, 74, 75, 77, 78, 79, 80, 81, 82, 82, 83, 84, 84,
+ 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 88, 88, 88, 77, 76, 75, 74,
+ 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 84, 85, 85, 86, 86, 87,
+ 87, 87, 88, 88, 88, 88, 89, 89, 89, 89, 78, 77, 77, 76, 75, 76, 77, 78,
+ 78, 79, 80, 81, 82, 83, 83, 84, 85, 85, 86, 86, 87, 87, 87, 88, 88, 88,
+ 88, 89, 89, 89, 89, 89, 79, 78, 78, 77, 76, 77, 78, 78, 79, 80, 81, 82,
+ 83, 83, 84, 85, 85, 86, 86, 87, 87, 87, 88, 88, 88, 88, 89, 89, 89, 89,
+ 89, 89, 80, 79, 79, 78, 77, 78, 79, 79, 80, 81, 82, 82, 83, 84, 84, 85,
+ 86, 86, 86, 87, 87, 88, 88, 88, 89, 89, 89, 89, 89, 89, 89, 89, 81, 81,
+ 80, 79, 78, 79, 80, 80, 81, 82, 82, 83, 84, 84, 85, 85, 86, 86, 87, 87,
+ 88, 88, 88, 88, 89, 89, 89, 89, 90, 90, 90, 90, 82, 82, 81, 80, 80, 80,
+ 81, 81, 82, 82, 83, 84, 84, 85, 85, 86, 86, 87, 87, 88, 88, 88, 89, 89,
+ 89, 89, 89, 90, 90, 90, 90, 90, 83, 82, 82, 81, 80, 81, 81, 82, 82, 83,
+ 84, 84, 85, 85, 86, 86, 87, 87, 87, 88, 88, 88, 89, 89, 89, 89, 90, 90,
+ 90, 90, 90, 90, 84, 83, 83, 82, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86,
+ 86, 87, 87, 87, 88, 88, 88, 89, 89, 89, 89, 90, 90, 90, 90, 90, 90, 90,
+ 85, 84, 83, 83, 82, 83, 83, 83, 84, 84, 85, 85, 86, 86, 86, 87, 87, 88,
+ 88, 88, 89, 89, 89, 89, 90, 90, 90, 90, 90, 90, 90, 90, 85, 85, 84, 84,
+ 83, 83, 84, 84, 84, 85, 85, 86, 86, 86, 87, 87, 88, 88, 88, 89, 89, 89,
+ 89, 90, 90, 90, 90, 90, 91, 91, 91, 91, 85, 85, 84, 84, 83, 83, 84, 84,
+ 84, 85, 85, 86, 86, 86, 87, 87, 88, 88, 88, 89, 89, 89, 89, 90, 90, 90,
+ 90, 90, 91, 91, 91, 91, 85, 85, 84, 84, 83, 83, 84, 84, 84, 85, 85, 86,
+ 86, 86, 87, 87, 88, 88, 88, 89, 89, 89, 89, 90, 90, 90, 90, 90, 91, 91,
+ 91, 91, 85, 85, 84, 84, 83, 83, 84, 84, 84, 85, 85, 86, 86, 86, 87, 87,
+ 88, 88, 88, 89, 89, 89, 89, 90, 90, 90, 90, 90, 91, 91, 91, 91 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 48, 50, 60, 68, 50, 58, 64, 69, 60, 64, 70, 72, 68, 69, 72, 74,
+ /* Size 8 */
+ 52, 45, 47, 53, 59, 64, 68, 70, 45, 48, 47, 51, 56, 61, 65, 68, 47, 47,
+ 55, 58, 61, 64, 67, 69, 53, 51, 58, 62, 65, 67, 69, 71, 59, 56, 61, 65,
+ 68, 70, 71, 72, 64, 61, 64, 67, 70, 71, 72, 73, 68, 65, 67, 69, 71, 72,
+ 73, 74, 70, 68, 69, 71, 72, 73, 74, 75,
+ /* Size 16 */
+ 51, 48, 45, 46, 47, 49, 52, 55, 58, 61, 63, 65, 67, 68, 69, 69, 48, 47,
+ 46, 46, 47, 49, 51, 54, 57, 59, 62, 63, 65, 67, 68, 68, 45, 46, 48, 47,
+ 47, 48, 50, 53, 55, 58, 60, 62, 64, 66, 67, 67, 46, 46, 47, 49, 50, 52,
+ 53, 55, 58, 60, 62, 63, 65, 66, 68, 68, 47, 47, 47, 50, 54, 56, 57, 59,
+ 60, 62, 63, 65, 66, 67, 68, 68, 49, 49, 48, 52, 56, 57, 59, 61, 62, 63,
+ 65, 66, 67, 68, 69, 69, 52, 51, 50, 53, 57, 59, 62, 63, 64, 65, 66, 67,
+ 68, 69, 70, 70, 55, 54, 53, 55, 59, 61, 63, 64, 66, 67, 68, 68, 69, 70,
+ 71, 71, 58, 57, 55, 58, 60, 62, 64, 66, 67, 68, 69, 69, 70, 71, 71, 71,
+ 61, 59, 58, 60, 62, 63, 65, 67, 68, 69, 70, 70, 71, 71, 72, 72, 63, 62,
+ 60, 62, 63, 65, 66, 68, 69, 70, 70, 71, 71, 72, 72, 72, 65, 63, 62, 63,
+ 65, 66, 67, 68, 69, 70, 71, 71, 72, 72, 73, 73, 67, 65, 64, 65, 66, 67,
+ 68, 69, 70, 71, 71, 72, 72, 73, 73, 73, 68, 67, 66, 66, 67, 68, 69, 70,
+ 71, 71, 72, 72, 73, 73, 73, 73, 69, 68, 67, 68, 68, 69, 70, 71, 71, 72,
+ 72, 73, 73, 73, 74, 74, 69, 68, 67, 68, 68, 69, 70, 71, 71, 72, 72, 73,
+ 73, 73, 74, 74,
+ /* Size 32 */
+ 51, 49, 47, 46, 45, 45, 45, 46, 46, 48, 49, 50, 52, 53, 55, 56, 58, 59,
+ 60, 61, 63, 64, 64, 65, 66, 67, 68, 68, 69, 69, 69, 69, 49, 48, 47, 46,
+ 45, 46, 46, 46, 46, 48, 49, 50, 52, 53, 54, 56, 57, 58, 59, 61, 62, 63,
+ 64, 65, 66, 66, 67, 68, 68, 68, 68, 68, 47, 47, 47, 46, 46, 46, 46, 46,
+ 46, 47, 49, 50, 51, 52, 54, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+ 66, 67, 68, 68, 68, 68, 46, 46, 46, 47, 47, 47, 47, 47, 46, 47, 48, 49,
+ 50, 52, 53, 54, 56, 57, 58, 59, 61, 61, 62, 63, 64, 65, 66, 67, 67, 67,
+ 67, 67, 45, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 51, 52, 54,
+ 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 65, 66, 67, 67, 67, 67, 45, 46,
+ 46, 47, 47, 47, 48, 48, 48, 49, 50, 51, 51, 53, 54, 55, 56, 57, 58, 59,
+ 61, 61, 62, 63, 64, 65, 66, 66, 67, 67, 67, 67, 45, 46, 46, 47, 47, 48,
+ 48, 49, 50, 51, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 65, 66, 67, 67, 67, 67, 67, 46, 46, 46, 47, 47, 48, 49, 50, 52, 52,
+ 53, 54, 55, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 64, 65, 66, 66, 67,
+ 68, 68, 68, 68, 46, 46, 46, 46, 47, 48, 50, 52, 54, 54, 55, 56, 57, 57,
+ 58, 59, 60, 60, 61, 62, 63, 64, 64, 65, 66, 66, 67, 67, 68, 68, 68, 68,
+ 48, 48, 47, 47, 47, 49, 51, 52, 54, 55, 56, 57, 58, 59, 59, 60, 61, 61,
+ 62, 63, 64, 64, 65, 66, 66, 67, 67, 68, 68, 68, 68, 68, 49, 49, 49, 48,
+ 48, 50, 51, 53, 55, 56, 57, 58, 59, 60, 60, 61, 62, 62, 63, 64, 64, 65,
+ 66, 66, 67, 67, 68, 68, 69, 69, 69, 69, 50, 50, 50, 49, 49, 51, 52, 54,
+ 56, 57, 58, 59, 60, 61, 61, 62, 63, 63, 64, 65, 65, 66, 66, 67, 67, 68,
+ 68, 69, 69, 69, 69, 69, 52, 52, 51, 50, 50, 51, 53, 55, 57, 58, 59, 60,
+ 61, 62, 63, 63, 64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 69, 69,
+ 69, 69, 53, 53, 52, 52, 51, 53, 54, 56, 57, 59, 60, 61, 62, 63, 63, 64,
+ 65, 65, 66, 66, 67, 67, 67, 68, 68, 69, 69, 69, 70, 70, 70, 70, 55, 54,
+ 54, 53, 52, 54, 55, 57, 58, 59, 60, 61, 63, 63, 64, 65, 65, 66, 66, 67,
+ 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 70, 70, 56, 56, 55, 54, 54, 55,
+ 56, 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 67, 67, 68, 68, 68, 69,
+ 69, 70, 70, 70, 70, 70, 70, 70, 58, 57, 56, 56, 55, 56, 57, 58, 60, 61,
+ 62, 63, 64, 65, 65, 66, 67, 67, 67, 68, 68, 69, 69, 69, 70, 70, 70, 71,
+ 71, 71, 71, 71, 59, 58, 58, 57, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
+ 66, 66, 67, 67, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71, 71, 71,
+ 60, 59, 59, 58, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 66, 67, 67, 68,
+ 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71, 71, 71, 71, 61, 61, 60, 59,
+ 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 67, 67, 68, 68, 69, 69, 70, 70,
+ 70, 70, 71, 71, 71, 71, 72, 72, 72, 72, 63, 62, 61, 61, 60, 61, 61, 62,
+ 63, 64, 64, 65, 66, 67, 67, 68, 68, 69, 69, 70, 70, 70, 70, 71, 71, 71,
+ 71, 72, 72, 72, 72, 72, 64, 63, 62, 61, 61, 61, 62, 63, 64, 64, 65, 66,
+ 66, 67, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71, 72, 72, 72, 72,
+ 72, 72, 64, 64, 63, 62, 62, 62, 63, 64, 64, 65, 66, 66, 67, 67, 68, 68,
+ 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 72, 72, 72, 72, 72, 65, 65,
+ 64, 63, 63, 63, 64, 64, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 70,
+ 71, 71, 71, 72, 72, 72, 72, 72, 72, 72, 72, 72, 66, 66, 65, 64, 64, 64,
+ 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72,
+ 72, 72, 72, 73, 73, 73, 73, 73, 67, 66, 66, 65, 65, 65, 65, 66, 66, 67,
+ 67, 68, 68, 69, 69, 70, 70, 70, 71, 71, 71, 71, 72, 72, 72, 72, 73, 73,
+ 73, 73, 73, 73, 68, 67, 66, 66, 65, 66, 66, 66, 67, 67, 68, 68, 69, 69,
+ 69, 70, 70, 71, 71, 71, 71, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73,
+ 68, 68, 67, 67, 66, 66, 67, 67, 67, 68, 68, 69, 69, 69, 70, 70, 71, 71,
+ 71, 71, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73, 73, 69, 68, 68, 67,
+ 67, 67, 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72,
+ 72, 72, 73, 73, 73, 73, 73, 73, 73, 73, 69, 68, 68, 67, 67, 67, 67, 68,
+ 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 72, 72, 73, 73,
+ 73, 73, 73, 73, 73, 73, 69, 68, 68, 67, 67, 67, 67, 68, 68, 68, 69, 69,
+ 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73,
+ 73, 73, 69, 68, 68, 67, 67, 67, 67, 68, 68, 68, 69, 69, 69, 70, 70, 70,
+ 71, 71, 71, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73, 73 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 77, 79, 85, 77, 81, 83, 86, 79, 83, 89, 91, 85, 86, 91, 94,
+ /* Size 8 */
+ 64, 58, 74, 76, 77, 80, 83, 85, 58, 68, 75, 72, 73, 75, 78, 81, 74, 75,
+ 78, 78, 78, 79, 81, 83, 76, 72, 78, 80, 82, 83, 84, 85, 77, 73, 78, 82,
+ 84, 85, 87, 88, 80, 75, 79, 83, 85, 87, 89, 90, 83, 78, 81, 84, 87, 89,
+ 90, 91, 85, 81, 83, 85, 88, 90, 91, 92,
+ /* Size 16 */
+ 64, 61, 58, 65, 74, 75, 76, 76, 77, 78, 80, 81, 83, 84, 85, 85, 61, 62,
+ 63, 68, 74, 74, 74, 74, 75, 76, 77, 79, 80, 82, 83, 83, 58, 63, 68, 71,
+ 75, 73, 72, 72, 73, 74, 75, 77, 78, 80, 81, 81, 65, 68, 71, 74, 76, 76,
+ 75, 75, 75, 76, 77, 78, 79, 81, 82, 82, 74, 74, 75, 76, 78, 78, 78, 78,
+ 78, 78, 79, 80, 81, 82, 83, 83, 75, 74, 73, 76, 78, 79, 79, 79, 79, 80,
+ 81, 81, 82, 83, 84, 84, 76, 74, 72, 75, 78, 79, 80, 81, 82, 82, 83, 83,
+ 84, 85, 85, 85, 76, 74, 72, 75, 78, 79, 81, 82, 83, 83, 84, 85, 85, 86,
+ 86, 86, 77, 75, 73, 75, 78, 79, 82, 83, 84, 85, 85, 86, 87, 87, 88, 88,
+ 78, 76, 74, 76, 78, 80, 82, 83, 85, 85, 86, 87, 88, 88, 89, 89, 80, 77,
+ 75, 77, 79, 81, 83, 84, 85, 86, 87, 88, 89, 89, 90, 90, 81, 79, 77, 78,
+ 80, 81, 83, 85, 86, 87, 88, 89, 89, 90, 90, 90, 83, 80, 78, 79, 81, 82,
+ 84, 85, 87, 88, 89, 89, 90, 91, 91, 91, 84, 82, 80, 81, 82, 83, 85, 86,
+ 87, 88, 89, 90, 91, 91, 92, 92, 85, 83, 81, 82, 83, 84, 85, 86, 88, 89,
+ 90, 90, 91, 92, 92, 92, 85, 83, 81, 82, 83, 84, 85, 86, 88, 89, 90, 90,
+ 91, 92, 92, 92,
+ /* Size 32 */
+ 64, 62, 61, 59, 58, 61, 65, 69, 74, 74, 75, 75, 76, 76, 76, 77, 77, 78,
+ 78, 79, 80, 80, 81, 82, 83, 83, 84, 85, 85, 85, 85, 85, 62, 62, 61, 61,
+ 60, 63, 67, 70, 74, 74, 74, 75, 75, 75, 75, 76, 76, 77, 77, 78, 79, 79,
+ 80, 81, 81, 82, 83, 83, 84, 84, 84, 84, 61, 61, 62, 62, 63, 65, 68, 71,
+ 74, 74, 74, 74, 74, 74, 74, 75, 75, 76, 76, 77, 77, 78, 79, 80, 80, 81,
+ 82, 82, 83, 83, 83, 83, 59, 61, 62, 64, 65, 67, 70, 72, 74, 74, 74, 73,
+ 73, 73, 73, 74, 74, 74, 75, 76, 76, 77, 78, 78, 79, 80, 81, 81, 82, 82,
+ 82, 82, 58, 60, 63, 65, 68, 70, 71, 73, 75, 74, 73, 73, 72, 72, 72, 73,
+ 73, 73, 74, 75, 75, 76, 77, 77, 78, 79, 80, 80, 81, 81, 81, 81, 61, 63,
+ 65, 67, 70, 71, 72, 74, 76, 75, 74, 74, 73, 74, 74, 74, 74, 74, 75, 76,
+ 76, 77, 77, 78, 79, 79, 80, 81, 81, 81, 81, 81, 65, 67, 68, 70, 71, 72,
+ 74, 75, 76, 76, 76, 75, 75, 75, 75, 75, 75, 76, 76, 76, 77, 78, 78, 79,
+ 79, 80, 81, 81, 82, 82, 82, 82, 69, 70, 71, 72, 73, 74, 75, 76, 77, 77,
+ 77, 76, 76, 76, 76, 76, 76, 77, 77, 77, 78, 78, 79, 79, 80, 81, 81, 82,
+ 82, 82, 82, 82, 74, 74, 74, 74, 75, 76, 76, 77, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 79, 79, 80, 80, 81, 81, 82, 82, 83, 83, 83, 83,
+ 74, 74, 74, 74, 74, 75, 76, 77, 78, 78, 78, 78, 78, 78, 78, 78, 78, 79,
+ 79, 79, 80, 80, 81, 81, 81, 82, 82, 83, 83, 83, 83, 83, 75, 74, 74, 74,
+ 73, 74, 76, 77, 78, 78, 79, 79, 79, 79, 79, 79, 79, 80, 80, 80, 81, 81,
+ 81, 82, 82, 83, 83, 84, 84, 84, 84, 84, 75, 75, 74, 73, 73, 74, 75, 76,
+ 78, 78, 79, 79, 80, 80, 80, 80, 81, 81, 81, 81, 82, 82, 82, 83, 83, 83,
+ 84, 84, 85, 85, 85, 85, 76, 75, 74, 73, 72, 73, 75, 76, 78, 78, 79, 80,
+ 80, 81, 81, 81, 82, 82, 82, 82, 83, 83, 83, 83, 84, 84, 85, 85, 85, 85,
+ 85, 85, 76, 75, 74, 73, 72, 74, 75, 76, 78, 78, 79, 80, 81, 81, 81, 82,
+ 82, 82, 83, 83, 83, 84, 84, 84, 84, 85, 85, 86, 86, 86, 86, 86, 76, 75,
+ 74, 73, 72, 74, 75, 76, 78, 78, 79, 80, 81, 81, 82, 82, 83, 83, 83, 84,
+ 84, 84, 85, 85, 85, 85, 86, 86, 86, 86, 86, 86, 77, 76, 75, 74, 73, 74,
+ 75, 76, 78, 78, 79, 80, 81, 82, 82, 83, 83, 84, 84, 84, 85, 85, 85, 86,
+ 86, 86, 86, 87, 87, 87, 87, 87, 77, 76, 75, 74, 73, 74, 75, 76, 78, 78,
+ 79, 81, 82, 82, 83, 83, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, 87,
+ 88, 88, 88, 88, 78, 77, 76, 74, 73, 74, 76, 77, 78, 79, 80, 81, 82, 82,
+ 83, 84, 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 88, 88, 88,
+ 78, 77, 76, 75, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 83, 84, 85, 85,
+ 85, 86, 86, 87, 87, 87, 88, 88, 88, 88, 89, 89, 89, 89, 79, 78, 77, 76,
+ 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 84, 85, 85, 86, 86, 87, 87,
+ 87, 88, 88, 88, 89, 89, 89, 89, 89, 89, 80, 79, 77, 76, 75, 76, 77, 78,
+ 79, 80, 81, 82, 83, 83, 84, 85, 85, 86, 86, 87, 87, 88, 88, 88, 89, 89,
+ 89, 89, 90, 90, 90, 90, 80, 79, 78, 77, 76, 77, 78, 78, 79, 80, 81, 82,
+ 83, 84, 84, 85, 86, 86, 87, 87, 88, 88, 88, 89, 89, 89, 89, 90, 90, 90,
+ 90, 90, 81, 80, 79, 78, 77, 77, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85,
+ 86, 86, 87, 87, 88, 88, 89, 89, 89, 90, 90, 90, 90, 90, 90, 90, 82, 81,
+ 80, 78, 77, 78, 79, 79, 80, 81, 82, 83, 83, 84, 85, 86, 86, 87, 87, 88,
+ 88, 89, 89, 89, 90, 90, 90, 90, 91, 91, 91, 91, 83, 81, 80, 79, 78, 79,
+ 79, 80, 81, 81, 82, 83, 84, 84, 85, 86, 87, 87, 88, 88, 89, 89, 89, 90,
+ 90, 90, 91, 91, 91, 91, 91, 91, 83, 82, 81, 80, 79, 79, 80, 81, 81, 82,
+ 83, 83, 84, 85, 85, 86, 87, 87, 88, 88, 89, 89, 90, 90, 90, 91, 91, 91,
+ 91, 91, 91, 91, 84, 83, 82, 81, 80, 80, 81, 81, 82, 82, 83, 84, 85, 85,
+ 86, 86, 87, 88, 88, 89, 89, 89, 90, 90, 91, 91, 91, 91, 92, 92, 92, 92,
+ 85, 83, 82, 81, 80, 81, 81, 82, 82, 83, 84, 84, 85, 86, 86, 87, 87, 88,
+ 88, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 92, 92, 85, 84, 83, 82,
+ 81, 81, 82, 82, 83, 83, 84, 85, 85, 86, 86, 87, 88, 88, 89, 89, 90, 90,
+ 90, 91, 91, 91, 92, 92, 92, 92, 92, 92, 85, 84, 83, 82, 81, 81, 82, 82,
+ 83, 83, 84, 85, 85, 86, 86, 87, 88, 88, 89, 89, 90, 90, 90, 91, 91, 91,
+ 92, 92, 92, 92, 92, 92, 85, 84, 83, 82, 81, 81, 82, 82, 83, 83, 84, 85,
+ 85, 86, 86, 87, 88, 88, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 92,
+ 92, 92, 85, 84, 83, 82, 81, 81, 82, 82, 83, 83, 84, 85, 85, 86, 86, 87,
+ 88, 88, 89, 89, 90, 90, 90, 91, 91, 91, 92, 92, 92, 92, 92, 92 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 48, 59, 60, 65, 59, 62, 64, 66, 60, 64, 68, 70, 65, 66, 70, 73,
+ /* Size 8 */
+ 50, 45, 59, 60, 61, 63, 66, 68, 45, 54, 59, 57, 57, 59, 62, 64, 59, 59,
+ 62, 61, 61, 62, 64, 66, 60, 57, 61, 64, 65, 66, 67, 68, 61, 57, 61, 65,
+ 67, 68, 69, 70, 63, 59, 62, 66, 68, 70, 71, 72, 66, 62, 64, 67, 69, 71,
+ 72, 73, 68, 64, 66, 68, 70, 72, 73, 74,
+ /* Size 16 */
+ 50, 47, 45, 51, 58, 59, 59, 60, 61, 62, 63, 64, 65, 66, 67, 67, 47, 48,
+ 49, 53, 58, 58, 58, 58, 59, 60, 61, 62, 63, 64, 65, 65, 45, 49, 53, 56,
+ 58, 57, 56, 57, 57, 58, 59, 60, 61, 63, 64, 64, 51, 53, 56, 58, 60, 59,
+ 59, 59, 59, 60, 60, 61, 62, 63, 65, 65, 58, 58, 58, 60, 62, 61, 61, 61,
+ 61, 61, 62, 63, 63, 64, 65, 65, 59, 58, 57, 59, 61, 62, 62, 62, 62, 63,
+ 63, 64, 65, 66, 66, 66, 59, 58, 56, 59, 61, 62, 63, 64, 64, 65, 65, 66,
+ 66, 67, 67, 67, 60, 58, 57, 59, 61, 62, 64, 64, 65, 66, 66, 67, 67, 68,
+ 68, 68, 61, 59, 57, 59, 61, 62, 64, 65, 66, 67, 67, 68, 68, 69, 69, 69,
+ 62, 60, 58, 60, 61, 63, 65, 66, 67, 68, 68, 69, 69, 70, 70, 70, 63, 61,
+ 59, 60, 62, 63, 65, 66, 67, 68, 69, 70, 70, 71, 71, 71, 64, 62, 60, 61,
+ 63, 64, 66, 67, 68, 69, 70, 70, 71, 71, 72, 72, 65, 63, 61, 62, 63, 65,
+ 66, 67, 68, 69, 70, 71, 71, 72, 72, 72, 66, 64, 63, 63, 64, 66, 67, 68,
+ 69, 70, 71, 71, 72, 72, 73, 73, 67, 65, 64, 65, 65, 66, 67, 68, 69, 70,
+ 71, 72, 72, 73, 73, 73, 67, 65, 64, 65, 65, 66, 67, 68, 69, 70, 71, 72,
+ 72, 73, 73, 73,
+ /* Size 32 */
+ 49, 48, 47, 46, 45, 47, 50, 54, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61,
+ 61, 62, 62, 63, 64, 64, 65, 65, 66, 66, 67, 67, 67, 67, 48, 48, 47, 47,
+ 46, 49, 52, 55, 58, 58, 58, 58, 58, 58, 59, 59, 59, 60, 60, 61, 61, 62,
+ 63, 63, 64, 64, 65, 65, 66, 66, 66, 66, 47, 47, 48, 48, 48, 50, 53, 55,
+ 58, 58, 58, 58, 57, 58, 58, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 63,
+ 64, 65, 65, 65, 65, 65, 46, 47, 48, 49, 51, 52, 54, 56, 58, 58, 57, 57,
+ 57, 57, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 64, 64, 64,
+ 64, 64, 45, 46, 48, 51, 53, 54, 55, 57, 58, 58, 57, 57, 56, 56, 56, 57,
+ 57, 57, 58, 58, 59, 59, 60, 60, 61, 62, 62, 63, 63, 63, 63, 63, 47, 49,
+ 50, 52, 54, 55, 56, 58, 59, 58, 58, 58, 57, 57, 57, 57, 58, 58, 58, 59,
+ 59, 60, 60, 61, 62, 62, 63, 63, 64, 64, 64, 64, 50, 52, 53, 54, 55, 56,
+ 57, 59, 60, 59, 59, 59, 58, 58, 58, 58, 59, 59, 59, 60, 60, 61, 61, 62,
+ 62, 63, 63, 64, 64, 64, 64, 64, 54, 55, 55, 56, 57, 58, 59, 60, 60, 60,
+ 60, 60, 59, 59, 59, 60, 60, 60, 60, 60, 61, 61, 62, 62, 63, 63, 64, 64,
+ 65, 65, 65, 65, 58, 58, 58, 58, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 62, 62, 62, 63, 63, 64, 64, 65, 65, 65, 65, 65,
+ 58, 58, 58, 58, 58, 58, 59, 60, 61, 61, 61, 61, 61, 61, 61, 61, 61, 62,
+ 62, 62, 62, 63, 63, 63, 64, 64, 65, 65, 66, 66, 66, 66, 58, 58, 58, 57,
+ 57, 58, 59, 60, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63,
+ 64, 64, 64, 65, 65, 66, 66, 66, 66, 66, 59, 58, 58, 57, 57, 58, 59, 60,
+ 61, 61, 62, 62, 62, 63, 63, 63, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65,
+ 66, 66, 67, 67, 67, 67, 59, 58, 57, 57, 56, 57, 58, 59, 61, 61, 62, 62,
+ 63, 63, 63, 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67,
+ 67, 67, 59, 58, 58, 57, 56, 57, 58, 59, 61, 61, 62, 63, 63, 64, 64, 64,
+ 64, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 68, 68, 68, 68, 60, 59,
+ 58, 57, 56, 57, 58, 59, 61, 61, 62, 63, 63, 64, 64, 65, 65, 65, 65, 66,
+ 66, 66, 66, 67, 67, 67, 67, 68, 68, 68, 68, 68, 60, 59, 58, 57, 57, 57,
+ 58, 60, 61, 61, 62, 63, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 67,
+ 68, 68, 68, 68, 69, 69, 69, 69, 60, 59, 58, 58, 57, 58, 59, 60, 61, 61,
+ 62, 63, 64, 64, 65, 65, 66, 66, 67, 67, 67, 67, 68, 68, 68, 68, 69, 69,
+ 69, 69, 69, 69, 61, 60, 59, 58, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65,
+ 65, 66, 66, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69,
+ 61, 60, 59, 59, 58, 58, 59, 60, 61, 62, 63, 63, 64, 65, 65, 66, 67, 67,
+ 67, 68, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 70, 62, 61, 60, 59,
+ 58, 59, 60, 60, 61, 62, 63, 64, 65, 65, 66, 66, 67, 67, 68, 68, 68, 69,
+ 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 62, 61, 61, 60, 59, 59, 60, 61,
+ 62, 62, 63, 64, 65, 65, 66, 67, 67, 68, 68, 68, 69, 69, 69, 70, 70, 70,
+ 70, 70, 71, 71, 71, 71, 63, 62, 61, 60, 59, 60, 61, 61, 62, 63, 63, 64,
+ 65, 66, 66, 67, 67, 68, 68, 69, 69, 69, 70, 70, 70, 70, 71, 71, 71, 71,
+ 71, 71, 64, 63, 62, 61, 60, 60, 61, 62, 62, 63, 64, 65, 65, 66, 66, 67,
+ 68, 68, 68, 69, 69, 70, 70, 70, 70, 71, 71, 71, 71, 71, 71, 71, 64, 63,
+ 62, 61, 60, 61, 62, 62, 63, 63, 64, 65, 66, 66, 67, 67, 68, 68, 69, 69,
+ 70, 70, 70, 70, 71, 71, 71, 71, 72, 72, 72, 72, 65, 64, 63, 62, 61, 62,
+ 62, 63, 63, 64, 64, 65, 66, 66, 67, 68, 68, 69, 69, 69, 70, 70, 70, 71,
+ 71, 71, 71, 72, 72, 72, 72, 72, 65, 64, 63, 63, 62, 62, 63, 63, 64, 64,
+ 65, 65, 66, 67, 67, 68, 68, 69, 69, 70, 70, 70, 71, 71, 71, 71, 72, 72,
+ 72, 72, 72, 72, 66, 65, 64, 63, 62, 63, 63, 64, 64, 65, 65, 66, 66, 67,
+ 67, 68, 69, 69, 69, 70, 70, 71, 71, 71, 71, 72, 72, 72, 72, 72, 72, 72,
+ 66, 65, 65, 64, 63, 63, 64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69,
+ 70, 70, 70, 71, 71, 71, 72, 72, 72, 72, 73, 73, 73, 73, 67, 66, 65, 64,
+ 63, 64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 69, 70, 70, 71, 71,
+ 71, 72, 72, 72, 72, 73, 73, 73, 73, 73, 67, 66, 65, 64, 63, 64, 64, 65,
+ 65, 66, 66, 67, 67, 68, 68, 69, 69, 69, 70, 70, 71, 71, 71, 72, 72, 72,
+ 72, 73, 73, 73, 73, 73, 67, 66, 65, 64, 63, 64, 64, 65, 65, 66, 66, 67,
+ 67, 68, 68, 69, 69, 69, 70, 70, 71, 71, 71, 72, 72, 72, 72, 73, 73, 73,
+ 73, 73, 67, 66, 65, 64, 63, 64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69,
+ 69, 69, 70, 70, 71, 71, 71, 72, 72, 72, 72, 73, 73, 73, 73, 73 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 66, 76, 82, 66, 74, 79, 83, 76, 79, 84, 86, 82, 83, 86, 87,
+ /* Size 8 */
+ 64, 58, 60, 65, 71, 75, 77, 79, 58, 61, 60, 63, 68, 72, 75, 78, 60, 60,
+ 67, 69, 72, 75, 77, 79, 65, 63, 69, 73, 75, 77, 79, 80, 71, 68, 72, 75,
+ 78, 79, 80, 81, 75, 72, 75, 77, 79, 80, 81, 82, 77, 75, 77, 79, 80, 81,
+ 82, 82, 79, 78, 79, 80, 81, 82, 82, 83,
+ /* Size 16 */
+ 64, 61, 58, 59, 60, 62, 65, 68, 71, 72, 75, 76, 77, 78, 79, 79, 61, 60,
+ 60, 60, 60, 62, 64, 67, 69, 71, 73, 75, 76, 77, 79, 79, 58, 60, 61, 61,
+ 60, 62, 63, 66, 68, 70, 72, 74, 75, 77, 78, 78, 59, 60, 61, 62, 63, 65,
+ 66, 68, 70, 72, 73, 75, 76, 77, 78, 78, 60, 60, 60, 63, 67, 68, 69, 71,
+ 72, 73, 75, 76, 77, 78, 79, 79, 62, 62, 62, 65, 68, 70, 71, 73, 74, 75,
+ 76, 77, 78, 79, 79, 79, 65, 64, 63, 66, 69, 71, 73, 74, 75, 76, 77, 78,
+ 79, 79, 80, 80, 68, 67, 66, 68, 71, 73, 74, 75, 77, 77, 78, 79, 79, 80,
+ 80, 80, 71, 69, 68, 70, 72, 74, 75, 77, 78, 78, 79, 80, 80, 80, 81, 81,
+ 72, 71, 70, 72, 73, 75, 76, 77, 78, 79, 80, 80, 81, 81, 81, 81, 75, 73,
+ 72, 73, 75, 76, 77, 78, 79, 80, 80, 81, 81, 81, 82, 82, 76, 75, 74, 75,
+ 76, 77, 78, 79, 80, 80, 81, 81, 81, 82, 82, 82, 77, 76, 75, 76, 77, 78,
+ 79, 79, 80, 81, 81, 81, 82, 82, 82, 82, 78, 77, 77, 77, 78, 79, 79, 80,
+ 80, 81, 81, 82, 82, 82, 83, 83, 79, 79, 78, 78, 79, 79, 80, 80, 81, 81,
+ 82, 82, 82, 83, 83, 83, 79, 79, 78, 78, 79, 79, 80, 80, 81, 81, 82, 82,
+ 82, 83, 83, 83,
+ /* Size 32 */
+ 64, 62, 61, 60, 58, 59, 59, 59, 60, 61, 62, 64, 65, 67, 68, 69, 71, 71,
+ 72, 73, 75, 75, 76, 77, 77, 78, 78, 79, 79, 79, 79, 79, 62, 61, 61, 60,
+ 59, 59, 59, 60, 60, 61, 62, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75,
+ 75, 76, 77, 77, 78, 78, 79, 79, 79, 79, 61, 61, 60, 60, 60, 60, 60, 60,
+ 60, 61, 62, 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 76, 77,
+ 77, 78, 79, 79, 79, 79, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 62, 63,
+ 64, 65, 66, 67, 69, 70, 71, 72, 73, 73, 74, 75, 76, 76, 77, 78, 78, 78,
+ 78, 78, 58, 59, 60, 60, 61, 61, 61, 60, 60, 61, 62, 63, 63, 64, 66, 67,
+ 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 77, 78, 78, 78, 78, 59, 59,
+ 60, 60, 61, 61, 61, 61, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
+ 73, 73, 74, 75, 76, 76, 77, 77, 78, 78, 78, 78, 59, 59, 60, 60, 61, 61,
+ 62, 63, 63, 64, 65, 66, 66, 67, 68, 69, 70, 71, 72, 72, 73, 74, 75, 75,
+ 76, 77, 77, 78, 78, 78, 78, 78, 59, 60, 60, 60, 60, 61, 63, 64, 65, 66,
+ 66, 67, 68, 69, 69, 70, 71, 72, 72, 73, 74, 75, 75, 76, 77, 77, 77, 78,
+ 78, 78, 78, 78, 60, 60, 60, 60, 60, 62, 63, 65, 67, 67, 68, 69, 69, 70,
+ 71, 71, 72, 73, 73, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 79, 79,
+ 61, 61, 61, 61, 61, 62, 64, 66, 67, 68, 69, 70, 70, 71, 72, 72, 73, 73,
+ 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 79, 79, 79, 62, 62, 62, 62,
+ 62, 63, 65, 66, 68, 69, 70, 71, 71, 72, 73, 73, 74, 74, 75, 75, 76, 76,
+ 77, 77, 78, 78, 79, 79, 79, 79, 79, 79, 64, 64, 63, 63, 63, 64, 66, 67,
+ 69, 70, 71, 71, 72, 73, 73, 74, 75, 75, 76, 76, 76, 77, 77, 78, 78, 79,
+ 79, 79, 80, 80, 80, 80, 65, 65, 64, 64, 63, 65, 66, 68, 69, 70, 71, 72,
+ 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80,
+ 80, 80, 67, 66, 65, 65, 64, 66, 67, 69, 70, 71, 72, 73, 74, 74, 75, 75,
+ 76, 76, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 80, 80, 80, 68, 67,
+ 67, 66, 66, 67, 68, 69, 71, 72, 73, 73, 74, 75, 75, 76, 77, 77, 77, 78,
+ 78, 78, 79, 79, 79, 80, 80, 80, 80, 80, 80, 80, 69, 69, 68, 67, 67, 68,
+ 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 77, 77, 78, 78, 79, 79, 79, 79,
+ 80, 80, 80, 80, 81, 81, 81, 81, 71, 70, 69, 69, 68, 69, 70, 71, 72, 73,
+ 74, 75, 75, 76, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 80, 80, 81,
+ 81, 81, 81, 81, 71, 71, 70, 70, 69, 70, 71, 72, 73, 73, 74, 75, 76, 76,
+ 77, 77, 78, 78, 79, 79, 79, 80, 80, 80, 80, 80, 81, 81, 81, 81, 81, 81,
+ 72, 72, 71, 71, 70, 71, 72, 72, 73, 74, 75, 76, 76, 77, 77, 78, 78, 79,
+ 79, 79, 80, 80, 80, 80, 81, 81, 81, 81, 81, 81, 81, 81, 73, 73, 72, 72,
+ 71, 72, 72, 73, 74, 75, 75, 76, 77, 77, 78, 78, 79, 79, 79, 80, 80, 80,
+ 80, 81, 81, 81, 81, 81, 82, 82, 82, 82, 75, 74, 73, 73, 72, 73, 73, 74,
+ 75, 75, 76, 76, 77, 78, 78, 79, 79, 79, 80, 80, 80, 80, 81, 81, 81, 81,
+ 81, 82, 82, 82, 82, 82, 75, 75, 74, 73, 73, 73, 74, 75, 75, 76, 76, 77,
+ 78, 78, 78, 79, 79, 80, 80, 80, 80, 81, 81, 81, 81, 81, 82, 82, 82, 82,
+ 82, 82, 76, 75, 75, 74, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79,
+ 80, 80, 80, 80, 81, 81, 81, 81, 81, 82, 82, 82, 82, 82, 82, 82, 77, 76,
+ 76, 75, 75, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 79, 80, 80, 80, 81,
+ 81, 81, 81, 81, 82, 82, 82, 82, 82, 82, 82, 82, 77, 77, 76, 76, 75, 76,
+ 76, 77, 77, 77, 78, 78, 79, 79, 79, 80, 80, 80, 81, 81, 81, 81, 81, 82,
+ 82, 82, 82, 82, 82, 82, 82, 82, 78, 77, 77, 76, 76, 76, 77, 77, 77, 78,
+ 78, 79, 79, 79, 80, 80, 80, 80, 81, 81, 81, 81, 82, 82, 82, 82, 82, 82,
+ 82, 82, 82, 82, 78, 78, 77, 77, 77, 77, 77, 77, 78, 78, 79, 79, 79, 80,
+ 80, 80, 80, 81, 81, 81, 81, 82, 82, 82, 82, 82, 82, 82, 83, 83, 83, 83,
+ 79, 78, 78, 78, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 80, 81, 81,
+ 81, 81, 82, 82, 82, 82, 82, 82, 82, 83, 83, 83, 83, 83, 79, 79, 79, 78,
+ 78, 78, 78, 78, 79, 79, 79, 80, 80, 80, 80, 81, 81, 81, 81, 82, 82, 82,
+ 82, 82, 82, 82, 83, 83, 83, 83, 83, 83, 79, 79, 79, 78, 78, 78, 78, 78,
+ 79, 79, 79, 80, 80, 80, 80, 81, 81, 81, 81, 82, 82, 82, 82, 82, 82, 82,
+ 83, 83, 83, 83, 83, 83, 79, 79, 79, 78, 78, 78, 78, 78, 79, 79, 79, 80,
+ 80, 80, 80, 81, 81, 81, 81, 82, 82, 82, 82, 82, 82, 82, 83, 83, 83, 83,
+ 83, 83, 79, 79, 79, 78, 78, 78, 78, 78, 79, 79, 79, 80, 80, 80, 80, 81,
+ 81, 81, 81, 82, 82, 82, 82, 82, 82, 82, 83, 83, 83, 83, 83, 83 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 51, 53, 61, 67, 53, 60, 65, 68, 61, 65, 68, 70, 67, 68, 70, 71,
+ /* Size 8 */
+ 55, 49, 51, 56, 61, 64, 67, 69, 49, 52, 51, 54, 58, 62, 65, 67, 51, 51,
+ 57, 60, 62, 64, 66, 68, 56, 54, 60, 63, 65, 67, 68, 69, 61, 58, 62, 65,
+ 67, 68, 69, 70, 64, 62, 64, 67, 68, 70, 70, 71, 67, 65, 66, 68, 69, 70,
+ 71, 71, 69, 67, 68, 69, 70, 71, 71, 72,
+ /* Size 16 */
+ 54, 51, 49, 50, 50, 53, 55, 58, 60, 62, 64, 65, 66, 67, 68, 68, 51, 51,
+ 50, 50, 51, 52, 54, 57, 59, 61, 63, 64, 65, 66, 67, 67, 49, 50, 51, 51,
+ 51, 52, 54, 56, 58, 60, 61, 63, 64, 65, 67, 67, 50, 50, 51, 52, 53, 55,
+ 56, 58, 59, 61, 63, 64, 65, 66, 67, 67, 50, 51, 51, 53, 57, 58, 59, 60,
+ 61, 63, 64, 65, 66, 67, 67, 67, 53, 52, 52, 55, 58, 59, 61, 62, 63, 64,
+ 65, 66, 67, 67, 68, 68, 55, 54, 54, 56, 59, 61, 63, 63, 64, 65, 66, 67,
+ 67, 68, 69, 69, 58, 57, 56, 58, 60, 62, 63, 64, 65, 66, 67, 67, 68, 68,
+ 69, 69, 60, 59, 58, 59, 61, 63, 64, 65, 66, 67, 68, 68, 69, 69, 69, 69,
+ 62, 61, 60, 61, 63, 64, 65, 66, 67, 68, 68, 69, 69, 69, 70, 70, 64, 63,
+ 61, 63, 64, 65, 66, 67, 68, 68, 69, 69, 70, 70, 70, 70, 65, 64, 63, 64,
+ 65, 66, 67, 67, 68, 69, 69, 70, 70, 70, 70, 70, 66, 65, 64, 65, 66, 67,
+ 67, 68, 69, 69, 70, 70, 70, 71, 71, 71, 67, 66, 65, 66, 67, 67, 68, 68,
+ 69, 69, 70, 70, 71, 71, 71, 71, 68, 67, 67, 67, 67, 68, 69, 69, 69, 70,
+ 70, 70, 71, 71, 71, 71, 68, 67, 67, 67, 67, 68, 69, 69, 69, 70, 70, 70,
+ 71, 71, 71, 71,
+ /* Size 32 */
+ 54, 52, 51, 50, 49, 49, 49, 50, 50, 51, 53, 54, 55, 56, 57, 58, 60, 61,
+ 61, 62, 63, 64, 65, 65, 66, 66, 67, 67, 68, 68, 68, 68, 52, 52, 51, 50,
+ 49, 50, 50, 50, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 63,
+ 64, 65, 65, 66, 66, 67, 67, 67, 67, 67, 51, 51, 51, 50, 50, 50, 50, 50,
+ 50, 51, 52, 53, 54, 55, 56, 57, 59, 59, 60, 61, 62, 63, 64, 64, 65, 65,
+ 66, 67, 67, 67, 67, 67, 50, 50, 50, 50, 51, 51, 50, 50, 50, 51, 52, 53,
+ 54, 55, 56, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 65, 66, 66, 67, 67,
+ 67, 67, 49, 49, 50, 51, 51, 51, 51, 51, 50, 51, 52, 53, 53, 54, 55, 56,
+ 57, 58, 59, 60, 61, 62, 63, 63, 64, 65, 65, 66, 66, 66, 66, 66, 49, 50,
+ 50, 51, 51, 51, 51, 52, 52, 52, 53, 54, 55, 55, 56, 57, 58, 59, 60, 61,
+ 62, 62, 63, 64, 64, 65, 65, 66, 67, 67, 67, 67, 49, 50, 50, 50, 51, 51,
+ 52, 53, 53, 54, 55, 55, 56, 57, 58, 58, 59, 60, 61, 61, 62, 63, 63, 64,
+ 65, 65, 66, 66, 67, 67, 67, 67, 50, 50, 50, 50, 51, 52, 53, 54, 55, 55,
+ 56, 57, 57, 58, 59, 59, 60, 61, 61, 62, 63, 63, 64, 65, 65, 66, 66, 66,
+ 67, 67, 67, 67, 50, 50, 50, 50, 50, 52, 53, 55, 56, 57, 58, 58, 59, 59,
+ 60, 60, 61, 62, 62, 63, 63, 64, 64, 65, 65, 66, 66, 67, 67, 67, 67, 67,
+ 51, 51, 51, 51, 51, 52, 54, 55, 57, 58, 58, 59, 60, 60, 61, 61, 62, 62,
+ 63, 63, 64, 64, 65, 65, 66, 66, 67, 67, 67, 67, 67, 67, 53, 52, 52, 52,
+ 52, 53, 55, 56, 58, 58, 59, 60, 60, 61, 62, 62, 63, 63, 64, 64, 65, 65,
+ 65, 66, 66, 67, 67, 67, 68, 68, 68, 68, 54, 53, 53, 53, 53, 54, 55, 57,
+ 58, 59, 60, 61, 61, 62, 62, 63, 63, 64, 64, 65, 65, 65, 66, 66, 67, 67,
+ 67, 68, 68, 68, 68, 68, 55, 55, 54, 54, 53, 55, 56, 57, 59, 60, 60, 61,
+ 62, 63, 63, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68, 68,
+ 68, 68, 56, 56, 55, 55, 54, 55, 57, 58, 59, 60, 61, 62, 63, 63, 64, 64,
+ 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 57, 57,
+ 56, 56, 55, 56, 58, 59, 60, 61, 62, 62, 63, 64, 64, 65, 65, 65, 66, 66,
+ 67, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 58, 58, 57, 57, 56, 57,
+ 58, 59, 60, 61, 62, 63, 64, 64, 65, 65, 66, 66, 66, 67, 67, 67, 67, 68,
+ 68, 68, 68, 69, 69, 69, 69, 69, 60, 59, 59, 58, 57, 58, 59, 60, 61, 62,
+ 63, 63, 64, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69,
+ 69, 69, 69, 69, 61, 60, 59, 59, 58, 59, 60, 61, 62, 62, 63, 64, 65, 65,
+ 65, 66, 66, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69,
+ 61, 61, 60, 60, 59, 60, 61, 61, 62, 63, 64, 64, 65, 65, 66, 66, 67, 67,
+ 67, 68, 68, 68, 68, 69, 69, 69, 69, 69, 70, 70, 70, 70, 62, 62, 61, 61,
+ 60, 61, 61, 62, 63, 63, 64, 65, 65, 66, 66, 67, 67, 67, 68, 68, 68, 68,
+ 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 63, 63, 62, 62, 61, 62, 62, 63,
+ 63, 64, 65, 65, 66, 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 69, 69, 69,
+ 70, 70, 70, 70, 70, 70, 64, 63, 63, 62, 62, 62, 63, 63, 64, 64, 65, 65,
+ 66, 66, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70,
+ 70, 70, 65, 64, 64, 63, 63, 63, 63, 64, 64, 65, 65, 66, 66, 67, 67, 67,
+ 68, 68, 68, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 65, 65,
+ 64, 64, 63, 64, 64, 65, 65, 65, 66, 66, 67, 67, 67, 68, 68, 68, 69, 69,
+ 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 70, 66, 65, 65, 65, 64, 64,
+ 65, 65, 65, 66, 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 69, 69, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70, 66, 66, 65, 65, 65, 65, 65, 66, 66, 66,
+ 67, 67, 67, 68, 68, 68, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70,
+ 71, 71, 71, 71, 67, 66, 66, 66, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68,
+ 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 71, 71, 71, 71, 71,
+ 67, 67, 67, 66, 66, 66, 66, 66, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69,
+ 69, 70, 70, 70, 70, 70, 70, 70, 71, 71, 71, 71, 71, 71, 68, 67, 67, 67,
+ 66, 67, 67, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70, 70,
+ 70, 70, 70, 71, 71, 71, 71, 71, 71, 71, 68, 67, 67, 67, 66, 67, 67, 67,
+ 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 71,
+ 71, 71, 71, 71, 71, 71, 68, 67, 67, 67, 66, 67, 67, 67, 67, 67, 68, 68,
+ 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 71, 71, 71, 71, 71,
+ 71, 71, 68, 67, 67, 67, 66, 67, 67, 67, 67, 67, 68, 68, 68, 68, 69, 69,
+ 69, 69, 70, 70, 70, 70, 70, 70, 70, 71, 71, 71, 71, 71, 71, 71 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 74, 75, 79, 74, 77, 78, 80, 75, 78, 82, 84, 79, 80, 84, 86,
+ /* Size 8 */
+ 64, 59, 72, 73, 74, 76, 78, 80, 59, 67, 72, 70, 71, 72, 75, 77, 72, 72,
+ 75, 74, 74, 75, 76, 78, 73, 70, 74, 76, 77, 78, 79, 80, 74, 71, 74, 77,
+ 79, 80, 80, 81, 76, 72, 75, 78, 80, 81, 82, 82, 78, 75, 76, 79, 80, 82,
+ 83, 83, 80, 77, 78, 80, 81, 82, 83, 84,
+ /* Size 16 */
+ 64, 62, 59, 65, 72, 72, 73, 73, 74, 75, 76, 77, 78, 79, 80, 80, 62, 62,
+ 63, 67, 72, 72, 71, 72, 72, 73, 74, 75, 76, 77, 78, 78, 59, 63, 67, 70,
+ 72, 71, 70, 70, 71, 72, 72, 74, 75, 76, 77, 77, 65, 67, 70, 71, 73, 73,
+ 72, 72, 72, 73, 74, 75, 75, 76, 77, 77, 72, 72, 72, 73, 75, 74, 74, 74,
+ 74, 75, 75, 76, 76, 77, 78, 78, 72, 72, 71, 73, 74, 75, 75, 75, 76, 76,
+ 76, 77, 77, 78, 79, 79, 73, 71, 70, 72, 74, 75, 76, 77, 77, 77, 78, 78,
+ 79, 79, 80, 80, 73, 72, 70, 72, 74, 75, 77, 77, 78, 78, 79, 79, 79, 80,
+ 80, 80, 74, 72, 71, 72, 74, 76, 77, 78, 79, 79, 80, 80, 80, 81, 81, 81,
+ 75, 73, 72, 73, 75, 76, 77, 78, 79, 80, 80, 81, 81, 81, 82, 82, 76, 74,
+ 72, 74, 75, 76, 78, 79, 80, 80, 81, 81, 82, 82, 82, 82, 77, 75, 74, 75,
+ 76, 77, 78, 79, 80, 81, 81, 82, 82, 83, 83, 83, 78, 76, 75, 75, 76, 77,
+ 79, 79, 80, 81, 82, 82, 83, 83, 83, 83, 79, 77, 76, 76, 77, 78, 79, 80,
+ 81, 81, 82, 83, 83, 83, 84, 84, 80, 78, 77, 77, 78, 79, 80, 80, 81, 82,
+ 82, 83, 83, 84, 84, 84, 80, 78, 77, 77, 78, 79, 80, 80, 81, 82, 82, 83,
+ 83, 84, 84, 84,
+ /* Size 32 */
+ 64, 63, 62, 60, 59, 62, 65, 68, 72, 72, 72, 72, 73, 73, 73, 74, 74, 74,
+ 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80, 80, 80, 80, 63, 62, 62, 61,
+ 61, 63, 66, 69, 72, 72, 72, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75, 75,
+ 76, 76, 77, 77, 78, 78, 79, 79, 79, 79, 62, 62, 62, 63, 63, 65, 67, 69,
+ 72, 72, 72, 72, 71, 72, 72, 72, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77,
+ 77, 78, 78, 78, 78, 78, 60, 61, 63, 64, 65, 67, 68, 70, 72, 72, 71, 71,
+ 71, 71, 71, 71, 71, 72, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 77,
+ 77, 77, 59, 61, 63, 65, 67, 68, 70, 71, 72, 72, 71, 71, 70, 70, 70, 71,
+ 71, 71, 72, 72, 72, 73, 74, 74, 75, 75, 76, 76, 77, 77, 77, 77, 62, 63,
+ 65, 67, 68, 69, 70, 72, 73, 72, 72, 72, 71, 71, 71, 71, 72, 72, 72, 73,
+ 73, 74, 74, 75, 75, 75, 76, 76, 77, 77, 77, 77, 65, 66, 67, 68, 70, 70,
+ 71, 72, 73, 73, 73, 72, 72, 72, 72, 72, 72, 73, 73, 73, 74, 74, 75, 75,
+ 75, 76, 76, 77, 77, 77, 77, 77, 68, 69, 69, 70, 71, 72, 72, 73, 74, 74,
+ 74, 73, 73, 73, 73, 73, 73, 74, 74, 74, 74, 75, 75, 76, 76, 76, 77, 77,
+ 78, 78, 78, 78, 72, 72, 72, 72, 72, 73, 73, 74, 75, 75, 74, 74, 74, 74,
+ 74, 74, 74, 74, 75, 75, 75, 75, 76, 76, 76, 77, 77, 78, 78, 78, 78, 78,
+ 72, 72, 72, 72, 72, 72, 73, 74, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+ 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, 78, 78, 78, 72, 72, 72, 71,
+ 71, 72, 73, 74, 74, 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 77,
+ 77, 77, 77, 78, 78, 78, 79, 79, 79, 79, 72, 72, 72, 71, 71, 72, 72, 73,
+ 74, 75, 75, 75, 76, 76, 76, 76, 76, 76, 77, 77, 77, 77, 77, 78, 78, 78,
+ 79, 79, 79, 79, 79, 79, 73, 72, 71, 71, 70, 71, 72, 73, 74, 75, 75, 76,
+ 76, 76, 77, 77, 77, 77, 77, 78, 78, 78, 78, 78, 79, 79, 79, 79, 80, 80,
+ 80, 80, 73, 72, 72, 71, 70, 71, 72, 73, 74, 75, 75, 76, 76, 77, 77, 77,
+ 77, 78, 78, 78, 78, 78, 79, 79, 79, 79, 80, 80, 80, 80, 80, 80, 73, 73,
+ 72, 71, 70, 71, 72, 73, 74, 75, 75, 76, 77, 77, 77, 78, 78, 78, 78, 78,
+ 79, 79, 79, 79, 79, 80, 80, 80, 80, 80, 80, 80, 74, 73, 72, 71, 71, 71,
+ 72, 73, 74, 75, 75, 76, 77, 77, 78, 78, 78, 78, 79, 79, 79, 79, 80, 80,
+ 80, 80, 80, 81, 81, 81, 81, 81, 74, 73, 72, 71, 71, 72, 72, 73, 74, 75,
+ 76, 76, 77, 77, 78, 78, 79, 79, 79, 79, 80, 80, 80, 80, 80, 81, 81, 81,
+ 81, 81, 81, 81, 74, 74, 73, 72, 71, 72, 73, 74, 74, 75, 76, 76, 77, 78,
+ 78, 78, 79, 79, 79, 80, 80, 80, 80, 81, 81, 81, 81, 81, 82, 82, 82, 82,
+ 75, 74, 73, 72, 72, 72, 73, 74, 75, 75, 76, 77, 77, 78, 78, 79, 79, 79,
+ 80, 80, 80, 80, 81, 81, 81, 81, 81, 82, 82, 82, 82, 82, 75, 74, 74, 73,
+ 72, 73, 73, 74, 75, 75, 76, 77, 78, 78, 78, 79, 79, 80, 80, 80, 81, 81,
+ 81, 81, 81, 82, 82, 82, 82, 82, 82, 82, 76, 75, 74, 73, 72, 73, 74, 74,
+ 75, 76, 76, 77, 78, 78, 79, 79, 80, 80, 80, 81, 81, 81, 81, 82, 82, 82,
+ 82, 82, 82, 82, 82, 82, 76, 75, 75, 74, 73, 74, 74, 75, 75, 76, 77, 77,
+ 78, 78, 79, 79, 80, 80, 80, 81, 81, 81, 82, 82, 82, 82, 82, 83, 83, 83,
+ 83, 83, 77, 76, 75, 74, 74, 74, 75, 75, 76, 76, 77, 77, 78, 79, 79, 80,
+ 80, 80, 81, 81, 81, 82, 82, 82, 82, 82, 83, 83, 83, 83, 83, 83, 77, 76,
+ 76, 75, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80, 80, 81, 81, 81,
+ 82, 82, 82, 82, 83, 83, 83, 83, 83, 83, 83, 83, 78, 77, 76, 75, 75, 75,
+ 75, 76, 76, 77, 77, 78, 79, 79, 79, 80, 80, 81, 81, 81, 82, 82, 82, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83, 78, 77, 77, 76, 75, 75, 76, 76, 77, 77,
+ 78, 78, 79, 79, 80, 80, 81, 81, 81, 82, 82, 82, 82, 83, 83, 83, 83, 83,
+ 84, 84, 84, 84, 79, 78, 77, 76, 76, 76, 76, 77, 77, 78, 78, 79, 79, 80,
+ 80, 80, 81, 81, 81, 82, 82, 82, 83, 83, 83, 83, 83, 84, 84, 84, 84, 84,
+ 79, 78, 78, 77, 76, 76, 77, 77, 78, 78, 78, 79, 79, 80, 80, 81, 81, 81,
+ 82, 82, 82, 83, 83, 83, 83, 83, 84, 84, 84, 84, 84, 84, 80, 79, 78, 77,
+ 77, 77, 77, 78, 78, 78, 79, 79, 80, 80, 80, 81, 81, 82, 82, 82, 82, 83,
+ 83, 83, 83, 84, 84, 84, 84, 84, 84, 84, 80, 79, 78, 77, 77, 77, 77, 78,
+ 78, 78, 79, 79, 80, 80, 80, 81, 81, 82, 82, 82, 82, 83, 83, 83, 83, 84,
+ 84, 84, 84, 84, 84, 84, 80, 79, 78, 77, 77, 77, 77, 78, 78, 78, 79, 79,
+ 80, 80, 80, 81, 81, 82, 82, 82, 82, 83, 83, 83, 83, 84, 84, 84, 84, 84,
+ 84, 84, 80, 79, 78, 77, 77, 77, 77, 78, 78, 78, 79, 79, 80, 80, 80, 81,
+ 81, 82, 82, 82, 82, 83, 83, 83, 83, 84, 84, 84, 84, 84, 84, 84 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 52, 60, 61, 65, 60, 63, 64, 66, 61, 64, 67, 69, 65, 66, 69, 71,
+ /* Size 8 */
+ 53, 49, 60, 61, 62, 64, 65, 67, 49, 56, 60, 59, 59, 61, 63, 64, 60, 60,
+ 63, 62, 62, 63, 64, 66, 61, 59, 62, 64, 65, 65, 66, 67, 62, 59, 62, 65,
+ 66, 67, 68, 69, 64, 61, 63, 65, 67, 68, 69, 70, 65, 63, 64, 66, 68, 69,
+ 70, 71, 67, 64, 66, 67, 69, 70, 71, 71,
+ /* Size 16 */
+ 53, 51, 49, 54, 60, 60, 61, 61, 62, 62, 63, 64, 65, 66, 67, 67, 51, 51,
+ 52, 56, 60, 60, 59, 60, 60, 61, 62, 63, 64, 64, 65, 65, 49, 52, 56, 58,
+ 60, 59, 58, 58, 59, 59, 60, 61, 62, 63, 64, 64, 54, 56, 58, 59, 61, 61,
+ 60, 60, 60, 61, 61, 62, 63, 64, 65, 65, 60, 60, 60, 61, 62, 62, 62, 62,
+ 62, 62, 63, 63, 64, 64, 65, 65, 60, 60, 59, 61, 62, 62, 63, 63, 63, 63,
+ 64, 64, 65, 65, 66, 66, 61, 59, 58, 60, 62, 63, 64, 64, 64, 65, 65, 65,
+ 66, 66, 67, 67, 61, 60, 58, 60, 62, 63, 64, 64, 65, 65, 66, 66, 67, 67,
+ 67, 67, 62, 60, 59, 60, 62, 63, 64, 65, 66, 66, 67, 67, 67, 68, 68, 68,
+ 62, 61, 59, 61, 62, 63, 65, 65, 66, 67, 67, 68, 68, 68, 69, 69, 63, 62,
+ 60, 61, 63, 64, 65, 66, 67, 67, 68, 68, 69, 69, 69, 69, 64, 63, 61, 62,
+ 63, 64, 65, 66, 67, 68, 68, 69, 69, 69, 70, 70, 65, 64, 62, 63, 64, 65,
+ 66, 67, 67, 68, 69, 69, 69, 70, 70, 70, 66, 64, 63, 64, 64, 65, 66, 67,
+ 68, 68, 69, 69, 70, 70, 70, 70, 67, 65, 64, 65, 65, 66, 67, 67, 68, 69,
+ 69, 70, 70, 70, 71, 71, 67, 65, 64, 65, 65, 66, 67, 67, 68, 69, 69, 70,
+ 70, 70, 71, 71,
+ /* Size 32 */
+ 53, 52, 51, 50, 49, 51, 53, 56, 59, 60, 60, 60, 60, 61, 61, 61, 61, 62,
+ 62, 63, 63, 63, 64, 64, 65, 65, 66, 66, 66, 66, 66, 66, 52, 51, 51, 50,
+ 50, 52, 54, 57, 59, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 62, 62, 63,
+ 63, 64, 64, 64, 65, 65, 66, 66, 66, 66, 51, 51, 51, 51, 52, 54, 55, 57,
+ 60, 59, 59, 59, 59, 59, 60, 60, 60, 60, 61, 61, 62, 62, 62, 63, 63, 64,
+ 64, 65, 65, 65, 65, 65, 50, 50, 51, 52, 54, 55, 56, 58, 60, 59, 59, 59,
+ 59, 59, 59, 59, 59, 60, 60, 60, 61, 61, 62, 62, 63, 63, 63, 64, 64, 64,
+ 64, 64, 49, 50, 52, 54, 55, 56, 58, 59, 60, 59, 59, 58, 58, 58, 58, 58,
+ 59, 59, 59, 60, 60, 61, 61, 61, 62, 62, 63, 63, 64, 64, 64, 64, 51, 52,
+ 54, 55, 56, 57, 58, 59, 60, 60, 60, 59, 59, 59, 59, 59, 59, 60, 60, 60,
+ 61, 61, 61, 62, 62, 63, 63, 64, 64, 64, 64, 64, 53, 54, 55, 56, 58, 58,
+ 59, 60, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 62, 62, 62,
+ 63, 63, 64, 64, 64, 64, 64, 64, 56, 57, 57, 58, 59, 59, 60, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 62, 62, 62, 63, 63, 63, 64, 64,
+ 65, 65, 65, 65, 59, 59, 60, 60, 60, 60, 61, 61, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 65,
+ 60, 60, 59, 59, 59, 60, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62,
+ 63, 63, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 65, 65, 60, 60, 59, 59,
+ 59, 60, 60, 61, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 64,
+ 64, 64, 64, 65, 65, 65, 66, 66, 66, 66, 60, 60, 59, 59, 58, 59, 60, 61,
+ 62, 62, 62, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 65, 65, 65, 65,
+ 65, 66, 66, 66, 66, 66, 60, 60, 59, 59, 58, 59, 60, 61, 62, 62, 62, 63,
+ 63, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66,
+ 66, 66, 61, 60, 59, 59, 58, 59, 60, 61, 62, 62, 63, 63, 64, 64, 64, 64,
+ 64, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 61, 60,
+ 60, 59, 58, 59, 60, 61, 62, 62, 63, 63, 64, 64, 64, 65, 65, 65, 65, 65,
+ 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 61, 60, 60, 59, 58, 59,
+ 60, 61, 62, 62, 63, 63, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 66, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67, 61, 61, 60, 59, 59, 59, 60, 61, 62, 62,
+ 63, 63, 64, 64, 65, 65, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 68,
+ 68, 68, 68, 68, 62, 61, 60, 60, 59, 60, 60, 61, 62, 62, 63, 64, 64, 65,
+ 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68,
+ 62, 61, 61, 60, 59, 60, 61, 61, 62, 63, 63, 64, 64, 65, 65, 66, 66, 66,
+ 66, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 63, 62, 61, 60,
+ 60, 60, 61, 61, 62, 63, 63, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 67,
+ 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 63, 62, 62, 61, 60, 61, 61, 62,
+ 62, 63, 63, 64, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68, 68, 68, 68,
+ 69, 69, 69, 69, 69, 69, 63, 63, 62, 61, 61, 61, 62, 62, 63, 63, 64, 64,
+ 65, 65, 66, 66, 67, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69,
+ 69, 69, 64, 63, 62, 62, 61, 61, 62, 62, 63, 63, 64, 65, 65, 65, 66, 66,
+ 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 64, 64,
+ 63, 62, 61, 62, 62, 63, 63, 64, 64, 65, 65, 66, 66, 67, 67, 67, 68, 68,
+ 68, 68, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 65, 64, 63, 63, 62, 62,
+ 63, 63, 64, 64, 64, 65, 65, 66, 66, 67, 67, 67, 68, 68, 68, 69, 69, 69,
+ 69, 69, 70, 70, 70, 70, 70, 70, 65, 64, 64, 63, 62, 63, 63, 63, 64, 64,
+ 65, 65, 66, 66, 66, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70,
+ 70, 70, 70, 70, 66, 65, 64, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66,
+ 67, 67, 67, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
+ 66, 65, 65, 64, 63, 64, 64, 64, 65, 65, 65, 66, 66, 67, 67, 67, 68, 68,
+ 68, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 66, 66, 65, 64,
+ 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68, 69, 69, 69,
+ 69, 70, 70, 70, 70, 70, 70, 70, 70, 70, 66, 66, 65, 64, 64, 64, 64, 65,
+ 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 66, 66, 65, 64, 64, 64, 64, 65, 65, 65, 66, 66,
+ 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 66, 66, 65, 64, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67,
+ 68, 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 70 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 65, 72, 76, 65, 71, 74, 76, 72, 74, 77, 78, 76, 76, 78, 79,
+ /* Size 8 */
+ 64, 60, 61, 65, 68, 71, 73, 74, 60, 62, 61, 64, 67, 69, 71, 73, 61, 61,
+ 66, 68, 69, 71, 72, 73, 65, 64, 68, 70, 71, 72, 73, 74, 68, 67, 69, 71,
+ 73, 74, 74, 75, 71, 69, 71, 72, 74, 74, 75, 75, 73, 71, 72, 73, 74, 75,
+ 75, 76, 74, 73, 73, 74, 75, 75, 76, 76,
+ /* Size 16 */
+ 64, 62, 60, 60, 61, 63, 65, 67, 68, 70, 71, 72, 73, 73, 74, 74, 62, 61,
+ 61, 61, 61, 63, 64, 66, 68, 69, 70, 71, 72, 73, 73, 73, 60, 61, 62, 62,
+ 61, 62, 64, 65, 67, 68, 69, 70, 71, 72, 73, 73, 60, 61, 62, 63, 64, 65,
+ 66, 67, 68, 69, 70, 71, 72, 73, 73, 73, 61, 61, 61, 64, 66, 67, 68, 68,
+ 69, 70, 71, 72, 72, 73, 73, 73, 63, 63, 62, 65, 67, 68, 69, 70, 70, 71,
+ 72, 72, 73, 73, 74, 74, 65, 64, 64, 66, 68, 69, 70, 71, 71, 72, 72, 73,
+ 73, 74, 74, 74, 67, 66, 65, 67, 68, 70, 71, 71, 72, 73, 73, 73, 74, 74,
+ 74, 74, 68, 68, 67, 68, 69, 70, 71, 72, 73, 73, 74, 74, 74, 74, 75, 75,
+ 70, 69, 68, 69, 70, 71, 72, 73, 73, 74, 74, 74, 75, 75, 75, 75, 71, 70,
+ 69, 70, 71, 72, 72, 73, 74, 74, 74, 75, 75, 75, 75, 75, 72, 71, 70, 71,
+ 72, 72, 73, 73, 74, 74, 75, 75, 75, 75, 75, 75, 73, 72, 71, 72, 72, 73,
+ 73, 74, 74, 75, 75, 75, 75, 75, 76, 76, 73, 73, 72, 73, 73, 73, 74, 74,
+ 74, 75, 75, 75, 75, 76, 76, 76, 74, 73, 73, 73, 73, 74, 74, 74, 75, 75,
+ 75, 75, 76, 76, 76, 76, 74, 73, 73, 73, 73, 74, 74, 74, 75, 75, 75, 75,
+ 76, 76, 76, 76,
+ /* Size 32 */
+ 64, 63, 62, 61, 60, 60, 60, 61, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69,
+ 70, 70, 71, 71, 72, 72, 73, 73, 73, 74, 74, 74, 74, 74, 63, 62, 62, 61,
+ 60, 61, 61, 61, 61, 62, 63, 64, 65, 65, 66, 67, 68, 69, 69, 70, 70, 71,
+ 71, 72, 72, 73, 73, 73, 74, 74, 74, 74, 62, 62, 61, 61, 61, 61, 61, 61,
+ 61, 62, 63, 63, 64, 65, 66, 67, 68, 68, 69, 69, 70, 71, 71, 72, 72, 72,
+ 73, 73, 73, 73, 73, 73, 61, 61, 61, 61, 61, 61, 61, 61, 61, 62, 63, 63,
+ 64, 65, 65, 66, 67, 68, 68, 69, 70, 70, 71, 71, 72, 72, 72, 73, 73, 73,
+ 73, 73, 60, 60, 61, 61, 62, 62, 62, 61, 61, 62, 62, 63, 64, 64, 65, 66,
+ 67, 67, 68, 69, 69, 70, 70, 71, 71, 72, 72, 73, 73, 73, 73, 73, 60, 61,
+ 61, 61, 62, 62, 62, 62, 62, 63, 63, 64, 65, 65, 66, 67, 67, 68, 69, 69,
+ 70, 70, 71, 71, 72, 72, 72, 73, 73, 73, 73, 73, 60, 61, 61, 61, 62, 62,
+ 63, 63, 64, 64, 65, 65, 66, 66, 67, 67, 68, 69, 69, 70, 70, 71, 71, 71,
+ 72, 72, 73, 73, 73, 73, 73, 73, 61, 61, 61, 61, 61, 62, 63, 64, 65, 65,
+ 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73,
+ 73, 73, 73, 73, 61, 61, 61, 61, 61, 62, 64, 65, 66, 66, 67, 67, 68, 68,
+ 68, 69, 69, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73,
+ 62, 62, 62, 62, 62, 63, 64, 65, 66, 67, 67, 68, 68, 69, 69, 69, 70, 70,
+ 71, 71, 71, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 63, 63, 63, 63,
+ 62, 63, 65, 66, 67, 67, 68, 68, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72,
+ 72, 73, 73, 73, 73, 74, 74, 74, 74, 74, 64, 64, 63, 63, 63, 64, 65, 66,
+ 67, 68, 68, 69, 69, 70, 70, 71, 71, 71, 72, 72, 72, 72, 73, 73, 73, 73,
+ 74, 74, 74, 74, 74, 74, 65, 65, 64, 64, 64, 65, 66, 67, 68, 68, 69, 69,
+ 70, 70, 71, 71, 71, 72, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 74,
+ 74, 74, 66, 65, 65, 65, 64, 65, 66, 67, 68, 69, 69, 70, 70, 71, 71, 71,
+ 72, 72, 72, 73, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 74, 74, 67, 66,
+ 66, 65, 65, 66, 67, 68, 68, 69, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73,
+ 73, 73, 73, 74, 74, 74, 74, 74, 74, 74, 74, 74, 67, 67, 67, 66, 66, 67,
+ 67, 68, 69, 69, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74,
+ 74, 74, 74, 74, 75, 75, 75, 75, 68, 68, 68, 67, 67, 67, 68, 69, 69, 70,
+ 70, 71, 71, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 74, 75,
+ 75, 75, 75, 75, 69, 69, 68, 68, 67, 68, 69, 69, 70, 70, 71, 71, 72, 72,
+ 72, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75,
+ 70, 69, 69, 68, 68, 69, 69, 70, 70, 71, 71, 72, 72, 72, 73, 73, 73, 73,
+ 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 70, 70, 69, 69,
+ 69, 69, 70, 70, 71, 71, 71, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 74,
+ 74, 75, 75, 75, 75, 75, 75, 75, 75, 75, 71, 70, 70, 70, 69, 70, 70, 71,
+ 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75,
+ 75, 75, 75, 75, 75, 75, 71, 71, 71, 70, 70, 70, 71, 71, 71, 72, 72, 72,
+ 73, 73, 73, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+ 75, 75, 72, 71, 71, 71, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 73, 74,
+ 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 72, 72,
+ 72, 71, 71, 71, 71, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 74, 75,
+ 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 73, 72, 72, 72, 71, 72,
+ 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75,
+ 75, 75, 75, 75, 76, 76, 76, 76, 73, 73, 72, 72, 72, 72, 72, 72, 73, 73,
+ 73, 73, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 76,
+ 76, 76, 76, 76, 73, 73, 73, 72, 72, 72, 73, 73, 73, 73, 73, 74, 74, 74,
+ 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76,
+ 74, 73, 73, 73, 73, 73, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 75, 75,
+ 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76, 76, 74, 74, 73, 73,
+ 73, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75,
+ 75, 75, 76, 76, 76, 76, 76, 76, 76, 76, 74, 74, 73, 73, 73, 73, 73, 73,
+ 73, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 75, 76, 76,
+ 76, 76, 76, 76, 76, 76, 74, 74, 73, 73, 73, 73, 73, 73, 73, 74, 74, 74,
+ 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76,
+ 76, 76, 74, 74, 73, 73, 73, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 75,
+ 75, 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76, 76, 76 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 55, 57, 62, 66, 57, 61, 65, 67, 62, 65, 67, 68, 66, 67, 68, 69,
+ /* Size 8 */
+ 58, 54, 55, 59, 62, 64, 66, 67, 54, 56, 55, 57, 60, 63, 65, 66, 55, 55,
+ 60, 61, 63, 64, 66, 67, 59, 57, 61, 64, 65, 66, 67, 68, 62, 60, 63, 65,
+ 66, 67, 68, 68, 64, 63, 64, 66, 67, 68, 68, 69, 66, 65, 66, 67, 68, 68,
+ 69, 69, 67, 66, 67, 68, 68, 69, 69, 69,
+ /* Size 16 */
+ 57, 55, 53, 54, 55, 56, 58, 60, 62, 63, 64, 65, 66, 66, 67, 67, 55, 55,
+ 54, 55, 55, 56, 58, 59, 61, 62, 63, 64, 65, 66, 66, 66, 53, 54, 55, 55,
+ 55, 56, 57, 58, 60, 61, 63, 63, 64, 65, 66, 66, 54, 55, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66, 66, 66, 55, 55, 55, 57, 59, 60, 61, 62,
+ 62, 63, 64, 65, 65, 66, 66, 66, 56, 56, 56, 58, 60, 61, 62, 63, 63, 64,
+ 65, 65, 66, 66, 67, 67, 58, 58, 57, 59, 61, 62, 63, 64, 65, 65, 66, 66,
+ 66, 67, 67, 67, 60, 59, 58, 60, 62, 63, 64, 65, 65, 66, 66, 66, 67, 67,
+ 67, 67, 62, 61, 60, 61, 62, 63, 65, 65, 66, 66, 67, 67, 67, 67, 68, 68,
+ 63, 62, 61, 62, 63, 64, 65, 66, 66, 67, 67, 67, 67, 68, 68, 68, 64, 63,
+ 63, 63, 64, 65, 66, 66, 67, 67, 67, 68, 68, 68, 68, 68, 65, 64, 63, 64,
+ 65, 65, 66, 66, 67, 67, 68, 68, 68, 68, 68, 68, 66, 65, 64, 65, 65, 66,
+ 66, 67, 67, 67, 68, 68, 68, 68, 68, 68, 66, 66, 65, 66, 66, 66, 67, 67,
+ 67, 68, 68, 68, 68, 68, 69, 69, 67, 66, 66, 66, 66, 67, 67, 67, 68, 68,
+ 68, 68, 68, 69, 69, 69, 67, 66, 66, 66, 66, 67, 67, 67, 68, 68, 68, 68,
+ 68, 69, 69, 69,
+ /* Size 32 */
+ 57, 56, 55, 54, 53, 54, 54, 54, 54, 55, 56, 57, 58, 59, 60, 60, 61, 62,
+ 63, 63, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 67, 56, 56, 55, 54,
+ 54, 54, 54, 54, 54, 55, 56, 57, 58, 58, 59, 60, 61, 62, 62, 63, 63, 64,
+ 64, 65, 65, 65, 66, 66, 66, 66, 66, 66, 55, 55, 55, 54, 54, 54, 54, 54,
+ 55, 55, 56, 57, 57, 58, 59, 60, 61, 61, 62, 62, 63, 63, 64, 64, 65, 65,
+ 65, 66, 66, 66, 66, 66, 54, 54, 54, 55, 55, 55, 55, 55, 55, 55, 56, 56,
+ 57, 58, 59, 59, 60, 61, 61, 62, 63, 63, 64, 64, 65, 65, 65, 66, 66, 66,
+ 66, 66, 53, 54, 54, 55, 55, 55, 55, 55, 55, 55, 56, 56, 57, 57, 58, 59,
+ 60, 60, 61, 62, 62, 63, 63, 64, 64, 65, 65, 65, 66, 66, 66, 66, 54, 54,
+ 54, 55, 55, 55, 55, 56, 56, 56, 57, 57, 58, 58, 59, 60, 60, 61, 61, 62,
+ 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 66, 54, 54, 54, 55, 55, 55,
+ 56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 63, 64, 64,
+ 65, 65, 65, 66, 66, 66, 66, 66, 54, 54, 54, 55, 55, 56, 56, 57, 58, 58,
+ 59, 59, 60, 60, 61, 61, 62, 62, 62, 63, 63, 64, 64, 65, 65, 65, 65, 66,
+ 66, 66, 66, 66, 54, 54, 55, 55, 55, 56, 57, 58, 59, 59, 60, 60, 61, 61,
+ 61, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 66, 66, 66,
+ 55, 55, 55, 55, 55, 56, 57, 58, 59, 60, 60, 61, 61, 62, 62, 62, 63, 63,
+ 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 56, 56, 56, 56,
+ 56, 57, 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65,
+ 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 57, 57, 57, 56, 56, 57, 58, 59,
+ 60, 61, 61, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66,
+ 66, 66, 67, 67, 67, 67, 58, 58, 57, 57, 57, 58, 59, 60, 61, 61, 62, 62,
+ 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 67, 67, 67,
+ 67, 67, 59, 58, 58, 58, 57, 58, 59, 60, 61, 62, 62, 63, 63, 64, 64, 64,
+ 65, 65, 65, 65, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 60, 59,
+ 59, 59, 58, 59, 60, 61, 61, 62, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66,
+ 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 60, 60, 60, 59, 59, 60,
+ 60, 61, 62, 62, 63, 63, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 66, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67, 61, 61, 61, 60, 60, 60, 61, 62, 62, 63,
+ 63, 64, 64, 65, 65, 65, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 62, 62, 61, 61, 60, 61, 61, 62, 63, 63, 64, 64, 65, 65,
+ 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68,
+ 63, 62, 62, 61, 61, 61, 62, 62, 63, 63, 64, 64, 65, 65, 65, 66, 66, 66,
+ 66, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 63, 63, 62, 62,
+ 62, 62, 63, 63, 63, 64, 64, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67,
+ 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 64, 63, 63, 63, 62, 63, 63, 63,
+ 64, 64, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 68, 68,
+ 68, 68, 68, 68, 68, 68, 64, 64, 63, 63, 63, 63, 63, 64, 64, 64, 65, 65,
+ 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 65, 64, 64, 64, 63, 64, 64, 64, 64, 65, 65, 65, 66, 66, 66, 66,
+ 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 65, 65,
+ 64, 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67,
+ 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 65, 65, 65, 65, 64, 64,
+ 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 68, 68, 68, 66, 65, 65, 65, 65, 65, 65, 65, 65, 66,
+ 66, 66, 66, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 66, 66, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 67,
+ 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+ 66, 66, 66, 66, 65, 65, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67,
+ 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 67, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 67, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 68, 67, 66, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67,
+ 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 67, 66, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67,
+ 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 71, 72, 74, 71, 73, 73, 75, 72, 73, 76, 77, 74, 75, 77, 78,
+ /* Size 8 */
+ 64, 61, 69, 70, 71, 72, 73, 74, 61, 66, 69, 68, 69, 70, 71, 72, 69, 69,
+ 71, 71, 71, 71, 72, 73, 70, 68, 71, 72, 73, 73, 74, 74, 71, 69, 71, 73,
+ 74, 74, 75, 75, 72, 70, 71, 73, 74, 75, 75, 76, 73, 71, 72, 74, 75, 75,
+ 76, 76, 74, 72, 73, 74, 75, 76, 76, 77,
+ /* Size 16 */
+ 64, 62, 61, 65, 69, 70, 70, 70, 71, 71, 72, 72, 73, 74, 74, 74, 62, 63,
+ 63, 66, 69, 69, 69, 69, 70, 70, 71, 71, 72, 73, 73, 73, 61, 63, 66, 68,
+ 69, 69, 68, 68, 69, 69, 70, 70, 71, 72, 72, 72, 65, 66, 68, 69, 70, 70,
+ 69, 70, 70, 70, 70, 71, 72, 72, 73, 73, 69, 69, 69, 70, 71, 71, 71, 71,
+ 71, 71, 71, 72, 72, 73, 73, 73, 70, 69, 69, 70, 71, 71, 71, 72, 72, 72,
+ 72, 72, 73, 73, 74, 74, 70, 69, 68, 69, 71, 71, 72, 72, 73, 73, 73, 73,
+ 74, 74, 74, 74, 70, 69, 68, 70, 71, 72, 72, 73, 73, 73, 74, 74, 74, 74,
+ 75, 75, 71, 70, 69, 70, 71, 72, 73, 73, 74, 74, 74, 74, 75, 75, 75, 75,
+ 71, 70, 69, 70, 71, 72, 73, 73, 74, 74, 75, 75, 75, 75, 76, 76, 72, 71,
+ 70, 70, 71, 72, 73, 74, 74, 75, 75, 75, 75, 76, 76, 76, 72, 71, 70, 71,
+ 72, 72, 73, 74, 74, 75, 75, 75, 76, 76, 76, 76, 73, 72, 71, 72, 72, 73,
+ 74, 74, 75, 75, 75, 76, 76, 76, 76, 76, 74, 73, 72, 72, 73, 73, 74, 74,
+ 75, 75, 76, 76, 76, 76, 77, 77, 74, 73, 72, 73, 73, 74, 74, 75, 75, 76,
+ 76, 76, 76, 77, 77, 77, 74, 73, 72, 73, 73, 74, 74, 75, 75, 76, 76, 76,
+ 76, 77, 77, 77,
+ /* Size 32 */
+ 64, 63, 62, 61, 61, 63, 65, 67, 69, 69, 70, 70, 70, 70, 70, 70, 71, 71,
+ 71, 72, 72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 74, 74, 63, 63, 62, 62,
+ 62, 64, 65, 67, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 71, 71, 71, 72,
+ 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 62, 62, 63, 63, 63, 65, 66, 68,
+ 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 71, 71, 71, 72, 72, 72,
+ 73, 73, 73, 73, 73, 73, 61, 62, 63, 64, 65, 66, 67, 68, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 72, 73, 73,
+ 73, 73, 61, 62, 63, 65, 66, 67, 68, 69, 69, 69, 69, 68, 68, 68, 68, 68,
+ 69, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 72, 72, 72, 63, 64,
+ 65, 66, 67, 68, 68, 69, 70, 70, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70,
+ 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 73, 65, 65, 66, 67, 68, 68,
+ 69, 70, 70, 70, 70, 70, 69, 70, 70, 70, 70, 70, 70, 70, 70, 71, 71, 71,
+ 72, 72, 72, 72, 73, 73, 73, 73, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71,
+ 70, 70, 70, 70, 70, 70, 70, 70, 71, 71, 71, 71, 71, 72, 72, 72, 72, 73,
+ 73, 73, 73, 73, 69, 69, 69, 69, 69, 70, 70, 71, 71, 71, 71, 71, 71, 71,
+ 71, 71, 71, 71, 71, 71, 71, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73,
+ 69, 69, 69, 69, 69, 70, 70, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+ 71, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73, 73, 70, 69, 69, 69,
+ 69, 69, 70, 70, 71, 71, 71, 71, 71, 72, 72, 72, 72, 72, 72, 72, 72, 72,
+ 72, 73, 73, 73, 73, 73, 74, 74, 74, 74, 70, 69, 69, 69, 68, 69, 70, 70,
+ 71, 71, 71, 72, 72, 72, 72, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73,
+ 74, 74, 74, 74, 74, 74, 70, 69, 69, 69, 68, 69, 69, 70, 71, 71, 71, 72,
+ 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74,
+ 74, 74, 70, 70, 69, 69, 68, 69, 70, 70, 71, 71, 72, 72, 72, 72, 73, 73,
+ 73, 73, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 70, 70,
+ 69, 69, 68, 69, 70, 70, 71, 71, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73,
+ 74, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 70, 70, 69, 69, 68, 69,
+ 70, 70, 71, 71, 72, 72, 72, 73, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74,
+ 74, 74, 75, 75, 75, 75, 75, 75, 71, 70, 70, 69, 69, 69, 70, 70, 71, 71,
+ 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75,
+ 75, 75, 75, 75, 71, 70, 70, 69, 69, 69, 70, 70, 71, 71, 72, 72, 73, 73,
+ 73, 73, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+ 71, 71, 70, 70, 69, 70, 70, 71, 71, 71, 72, 72, 73, 73, 73, 74, 74, 74,
+ 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 72, 71, 70, 70,
+ 69, 70, 70, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75,
+ 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 72, 71, 71, 70, 70, 70, 70, 71,
+ 71, 72, 72, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 76,
+ 76, 76, 76, 76, 76, 76, 72, 72, 71, 71, 70, 70, 71, 71, 72, 72, 72, 73,
+ 73, 73, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76,
+ 76, 76, 72, 72, 71, 71, 70, 71, 71, 71, 72, 72, 72, 73, 73, 74, 74, 74,
+ 74, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76, 76, 76, 76, 73, 72,
+ 72, 71, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75, 75, 75, 75,
+ 75, 75, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 73, 73, 72, 72, 71, 71,
+ 72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 75, 75, 76, 76, 76,
+ 76, 76, 76, 76, 76, 76, 76, 76, 73, 73, 72, 72, 71, 72, 72, 72, 72, 73,
+ 73, 73, 74, 74, 74, 74, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76, 76, 76,
+ 77, 77, 77, 77, 74, 73, 73, 72, 72, 72, 72, 72, 73, 73, 73, 74, 74, 74,
+ 74, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76, 76, 77, 77, 77, 77, 77,
+ 74, 73, 73, 72, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 75, 75, 75, 75,
+ 75, 76, 76, 76, 76, 76, 76, 76, 77, 77, 77, 77, 77, 77, 74, 74, 73, 73,
+ 72, 73, 73, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 75, 76, 76, 76, 76,
+ 76, 76, 76, 77, 77, 77, 77, 77, 77, 77, 74, 74, 73, 73, 72, 73, 73, 73,
+ 73, 73, 74, 74, 74, 74, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76, 76, 77,
+ 77, 77, 77, 77, 77, 77, 74, 74, 73, 73, 72, 73, 73, 73, 73, 73, 74, 74,
+ 74, 74, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76, 76, 77, 77, 77, 77, 77,
+ 77, 77, 74, 74, 73, 73, 72, 73, 73, 73, 73, 73, 74, 74, 74, 74, 75, 75,
+ 75, 75, 76, 76, 76, 76, 76, 76, 76, 77, 77, 77, 77, 77, 77, 77 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 55, 61, 62, 65, 61, 63, 64, 65, 62, 64, 66, 67, 65, 65, 67, 68,
+ /* Size 8 */
+ 57, 53, 61, 62, 63, 64, 65, 66, 53, 59, 62, 60, 61, 62, 63, 64, 61, 62,
+ 63, 63, 63, 63, 64, 65, 62, 60, 63, 64, 65, 65, 66, 66, 63, 61, 63, 65,
+ 66, 66, 67, 67, 64, 62, 63, 65, 66, 67, 67, 68, 65, 63, 64, 66, 67, 67,
+ 68, 68, 66, 64, 65, 66, 67, 68, 68, 69,
+ /* Size 16 */
+ 56, 55, 53, 57, 61, 61, 62, 62, 62, 63, 64, 64, 65, 65, 66, 66, 55, 55,
+ 56, 58, 61, 61, 61, 61, 61, 62, 63, 63, 64, 64, 65, 65, 53, 56, 58, 60,
+ 61, 61, 60, 60, 61, 61, 62, 62, 63, 63, 64, 64, 57, 58, 60, 61, 62, 62,
+ 61, 61, 62, 62, 62, 63, 63, 64, 64, 64, 61, 61, 61, 62, 63, 63, 63, 63,
+ 63, 63, 63, 64, 64, 64, 65, 65, 61, 61, 61, 62, 63, 63, 63, 63, 63, 64,
+ 64, 64, 65, 65, 65, 65, 62, 61, 60, 61, 63, 63, 64, 64, 64, 64, 65, 65,
+ 65, 65, 66, 66, 62, 61, 60, 61, 63, 63, 64, 64, 65, 65, 65, 65, 66, 66,
+ 66, 66, 62, 61, 61, 62, 63, 63, 64, 65, 65, 66, 66, 66, 66, 66, 67, 67,
+ 63, 62, 61, 62, 63, 64, 64, 65, 66, 66, 66, 66, 67, 67, 67, 67, 64, 63,
+ 62, 62, 63, 64, 65, 65, 66, 66, 67, 67, 67, 67, 67, 67, 64, 63, 62, 63,
+ 64, 64, 65, 65, 66, 66, 67, 67, 67, 68, 68, 68, 65, 64, 63, 63, 64, 65,
+ 65, 66, 66, 67, 67, 67, 68, 68, 68, 68, 65, 64, 63, 64, 64, 65, 65, 66,
+ 66, 67, 67, 68, 68, 68, 68, 68, 66, 65, 64, 64, 65, 65, 66, 66, 67, 67,
+ 67, 68, 68, 68, 68, 68, 66, 65, 64, 64, 65, 65, 66, 66, 67, 67, 67, 68,
+ 68, 68, 68, 68,
+ /* Size 32 */
+ 56, 55, 55, 54, 53, 55, 57, 59, 61, 61, 61, 61, 62, 62, 62, 62, 62, 63,
+ 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 55, 55, 55, 54,
+ 54, 56, 57, 59, 61, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 63, 63, 63,
+ 63, 64, 64, 64, 65, 65, 65, 65, 65, 65, 55, 55, 55, 55, 55, 57, 58, 60,
+ 61, 61, 61, 61, 61, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63, 64, 64,
+ 64, 64, 65, 65, 65, 65, 54, 54, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61,
+ 60, 61, 61, 61, 61, 61, 61, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64,
+ 64, 64, 53, 54, 55, 57, 58, 59, 60, 60, 61, 61, 61, 60, 60, 60, 60, 60,
+ 60, 61, 61, 61, 61, 62, 62, 62, 63, 63, 63, 64, 64, 64, 64, 64, 55, 56,
+ 57, 58, 59, 60, 60, 61, 62, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 62,
+ 62, 62, 62, 63, 63, 63, 64, 64, 64, 64, 64, 64, 57, 57, 58, 59, 60, 60,
+ 61, 61, 62, 62, 62, 61, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 63, 63,
+ 63, 64, 64, 64, 64, 64, 64, 64, 59, 59, 60, 60, 60, 61, 61, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64,
+ 64, 64, 64, 64, 61, 61, 61, 61, 61, 62, 62, 62, 63, 63, 63, 63, 62, 62,
+ 62, 62, 62, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 65, 65, 65, 65,
+ 61, 61, 61, 61, 61, 61, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 61, 61, 61, 61,
+ 61, 61, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64,
+ 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 61, 61, 61, 61, 60, 61, 61, 62,
+ 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 62, 61, 61, 60, 60, 61, 61, 62, 62, 63, 63, 63,
+ 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66,
+ 66, 66, 62, 61, 61, 61, 60, 61, 61, 62, 62, 63, 63, 63, 64, 64, 64, 64,
+ 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 62, 62,
+ 61, 61, 60, 61, 61, 62, 62, 63, 63, 64, 64, 64, 64, 64, 65, 65, 65, 65,
+ 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 62, 62, 61, 61, 60, 61,
+ 61, 62, 62, 63, 63, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 62, 62, 61, 61, 60, 61, 61, 62, 62, 63,
+ 63, 64, 64, 64, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 67, 67, 67, 67, 63, 62, 62, 61, 61, 61, 62, 62, 63, 63, 63, 64, 64, 64,
+ 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67,
+ 63, 62, 62, 61, 61, 61, 62, 62, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66,
+ 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 63, 63, 62, 62,
+ 61, 62, 62, 62, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 66, 66, 66,
+ 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 63, 63, 62, 62, 61, 62, 62, 63,
+ 63, 63, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 64, 63, 63, 62, 62, 62, 62, 63, 63, 64, 64, 64,
+ 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 67, 64, 63, 63, 63, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66,
+ 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 64, 64,
+ 63, 63, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 67,
+ 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 65, 64, 64, 63, 63, 63,
+ 63, 64, 64, 64, 64, 65, 65, 65, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67,
+ 67, 68, 68, 68, 68, 68, 68, 68, 65, 64, 64, 63, 63, 63, 64, 64, 64, 64,
+ 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68,
+ 68, 68, 68, 68, 65, 65, 64, 64, 63, 64, 64, 64, 64, 65, 65, 65, 65, 66,
+ 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68,
+ 65, 65, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 67,
+ 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 66, 65, 65, 64,
+ 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67,
+ 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 66, 65, 65, 64, 64, 64, 64, 64,
+ 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 68, 66, 65, 65, 64, 64, 64, 64, 64, 65, 65, 65, 65,
+ 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 66, 65, 65, 64, 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66,
+ 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 65, 68, 70, 65, 67, 69, 70, 68, 69, 70, 71, 70, 70, 71, 71,
+ /* Size 8 */
+ 64, 62, 62, 64, 66, 67, 68, 69, 62, 63, 63, 64, 65, 67, 68, 68, 62, 63,
+ 65, 66, 67, 67, 68, 69, 64, 64, 66, 67, 68, 68, 69, 69, 66, 65, 67, 68,
+ 68, 69, 69, 69, 67, 67, 67, 68, 69, 69, 69, 69, 68, 68, 68, 69, 69, 69,
+ 69, 69, 69, 68, 69, 69, 69, 69, 69, 70,
+ /* Size 16 */
+ 64, 63, 62, 62, 62, 63, 64, 65, 66, 67, 67, 68, 68, 68, 69, 69, 63, 63,
+ 62, 62, 63, 63, 64, 65, 66, 66, 67, 67, 68, 68, 69, 69, 62, 62, 63, 63,
+ 63, 63, 64, 65, 65, 66, 67, 67, 68, 68, 68, 68, 62, 62, 63, 63, 64, 64,
+ 65, 65, 66, 67, 67, 67, 68, 68, 68, 68, 62, 63, 63, 64, 65, 65, 66, 66,
+ 67, 67, 67, 68, 68, 68, 69, 69, 63, 63, 63, 64, 65, 66, 66, 67, 67, 67,
+ 68, 68, 68, 68, 69, 69, 64, 64, 64, 65, 66, 66, 67, 67, 68, 68, 68, 68,
+ 69, 69, 69, 69, 65, 65, 65, 65, 66, 67, 67, 68, 68, 68, 68, 69, 69, 69,
+ 69, 69, 66, 66, 65, 66, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69,
+ 67, 66, 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 67, 67,
+ 67, 67, 67, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 68, 67, 67, 67,
+ 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 68, 68, 68, 68, 68, 68,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 68, 68, 68, 68, 68, 68, 69, 69,
+ 69, 69, 69, 69, 69, 69, 70, 70, 69, 69, 68, 68, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 70, 70, 70, 69, 69, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 70, 70, 70,
+ /* Size 32 */
+ 64, 63, 63, 62, 62, 62, 62, 62, 62, 63, 63, 64, 64, 65, 65, 66, 66, 66,
+ 67, 67, 67, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 63, 63, 63, 62,
+ 62, 62, 62, 62, 62, 63, 63, 64, 64, 65, 65, 66, 66, 66, 67, 67, 67, 67,
+ 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 63, 63, 63, 62, 62, 62, 62, 62,
+ 63, 63, 63, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 67, 68, 68, 68,
+ 68, 68, 69, 69, 69, 69, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 64,
+ 64, 64, 65, 65, 66, 66, 66, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 65, 65,
+ 65, 66, 66, 66, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 62, 62,
+ 62, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67,
+ 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 62, 62, 62, 63, 63, 63,
+ 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 67, 67, 67, 67, 67, 68,
+ 68, 68, 68, 68, 68, 68, 68, 68, 62, 62, 62, 63, 63, 63, 63, 64, 64, 65,
+ 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 62, 62, 63, 63, 63, 63, 64, 64, 65, 65, 65, 66, 66, 66,
+ 66, 66, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69,
+ 63, 63, 63, 63, 63, 63, 64, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67,
+ 67, 67, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 63, 63, 63, 63,
+ 63, 64, 64, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 68, 68, 68,
+ 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 64, 64, 64, 64, 63, 64, 65, 65,
+ 66, 66, 66, 66, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+ 69, 69, 69, 69, 69, 69, 64, 64, 64, 64, 64, 64, 65, 65, 66, 66, 66, 67,
+ 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69,
+ 69, 69, 65, 65, 65, 64, 64, 65, 65, 66, 66, 66, 67, 67, 67, 67, 67, 68,
+ 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 65, 65,
+ 65, 65, 65, 65, 65, 66, 66, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68,
+ 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 66, 66, 65, 65, 65, 65,
+ 66, 66, 66, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 66, 66, 66, 66, 65, 66, 66, 66, 67, 67,
+ 67, 67, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 68, 68, 68,
+ 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 67, 67, 66, 66, 66, 66, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 67, 67, 67, 67,
+ 66, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 67, 67, 67, 67, 67, 67, 67, 67,
+ 67, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 68, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68,
+ 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 68, 68, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 68, 68,
+ 68, 68, 67, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 68, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 70, 70, 70, 70, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70,
+ 69, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 69, 69, 69, 68,
+ 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 69, 69, 69, 68, 68, 68, 68, 68,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70,
+ 70, 70, 70, 70, 70, 70, 69, 69, 69, 68, 68, 68, 68, 68, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70,
+ 70, 70, 69, 69, 69, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 59, 60, 63, 65, 60, 63, 64, 65, 63, 64, 66, 66, 65, 65, 66, 66,
+ /* Size 8 */
+ 61, 59, 59, 61, 63, 64, 65, 66, 59, 60, 59, 61, 62, 64, 65, 65, 59, 59,
+ 62, 63, 64, 64, 65, 65, 61, 61, 63, 64, 65, 65, 65, 66, 63, 62, 64, 65,
+ 65, 66, 66, 66, 64, 64, 64, 65, 66, 66, 66, 66, 65, 65, 65, 65, 66, 66,
+ 66, 66, 66, 65, 65, 66, 66, 66, 66, 67,
+ /* Size 16 */
+ 61, 60, 58, 59, 59, 60, 61, 62, 63, 63, 64, 64, 65, 65, 65, 65, 60, 59,
+ 59, 59, 59, 60, 61, 62, 62, 63, 64, 64, 65, 65, 65, 65, 58, 59, 60, 59,
+ 59, 60, 60, 61, 62, 63, 63, 64, 64, 65, 65, 65, 59, 59, 59, 60, 60, 61,
+ 61, 62, 63, 63, 64, 64, 65, 65, 65, 65, 59, 59, 59, 60, 62, 62, 63, 63,
+ 63, 64, 64, 64, 65, 65, 65, 65, 60, 60, 60, 61, 62, 63, 63, 63, 64, 64,
+ 64, 65, 65, 65, 65, 65, 61, 61, 60, 61, 63, 63, 64, 64, 64, 65, 65, 65,
+ 65, 65, 66, 66, 62, 62, 61, 62, 63, 63, 64, 64, 65, 65, 65, 65, 65, 66,
+ 66, 66, 63, 62, 62, 63, 63, 64, 64, 65, 65, 65, 65, 65, 66, 66, 66, 66,
+ 63, 63, 63, 63, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 64, 64,
+ 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 64, 64, 64, 64,
+ 64, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 65, 65, 64, 65, 65, 65,
+ 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 65, 65, 65, 65, 65, 65, 65, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66,
+ /* Size 32 */
+ 61, 60, 59, 59, 58, 59, 59, 59, 59, 60, 60, 61, 61, 61, 62, 62, 63, 63,
+ 63, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 60, 60, 59, 59,
+ 59, 59, 59, 59, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 63, 63, 64, 64,
+ 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 60, 60, 61, 61, 62, 62, 62, 63, 63, 63, 64, 64, 64, 64, 65, 65,
+ 65, 65, 65, 65, 65, 65, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60,
+ 61, 61, 61, 62, 62, 62, 63, 63, 63, 64, 64, 64, 64, 65, 65, 65, 65, 65,
+ 65, 65, 58, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 61, 61, 62,
+ 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 59, 59,
+ 59, 59, 59, 59, 60, 60, 60, 60, 60, 61, 61, 61, 62, 62, 62, 63, 63, 63,
+ 63, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 59, 59, 59, 59, 59, 60,
+ 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64,
+ 64, 65, 65, 65, 65, 65, 65, 65, 59, 59, 59, 59, 59, 60, 60, 60, 61, 61,
+ 61, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 65, 65, 65, 65,
+ 65, 65, 65, 65, 59, 59, 59, 59, 59, 60, 60, 61, 62, 62, 62, 62, 62, 63,
+ 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65,
+ 60, 59, 59, 59, 59, 60, 61, 61, 62, 62, 62, 62, 63, 63, 63, 63, 63, 64,
+ 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 60, 60, 60, 60,
+ 60, 60, 61, 61, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64,
+ 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 61, 60, 60, 60, 60, 61, 61, 62,
+ 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 61, 61, 61, 61, 60, 61, 61, 62, 62, 63, 63, 63,
+ 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 61, 61, 61, 61, 61, 61, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64,
+ 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 62, 62,
+ 62, 61, 61, 62, 62, 62, 63, 63, 63, 64, 64, 64, 64, 64, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 62, 62, 62, 62, 62, 62,
+ 62, 63, 63, 63, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 66, 66, 66, 66, 66, 66, 63, 63, 62, 62, 62, 62, 63, 63, 63, 63,
+ 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66,
+ 66, 66, 66, 66, 63, 63, 63, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 64, 63, 63, 63,
+ 63, 63, 63, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 64, 64, 64, 63, 63, 63, 64, 64,
+ 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 64, 64, 64, 64, 63, 64, 64, 64, 64, 64, 64, 65,
+ 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65,
+ 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 65, 64,
+ 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 65, 65, 65, 64, 64, 64,
+ 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 65, 65, 65, 65, 64, 64, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 67, 68, 69, 67, 68, 69, 69, 68, 69, 70, 70, 69, 69, 70, 71,
+ /* Size 8 */
+ 64, 62, 67, 67, 67, 68, 68, 69, 62, 65, 67, 66, 66, 67, 68, 68, 67, 67,
+ 68, 67, 67, 68, 68, 69, 67, 66, 67, 68, 68, 68, 69, 69, 67, 66, 67, 68,
+ 69, 69, 69, 69, 68, 67, 68, 68, 69, 69, 70, 70, 68, 68, 68, 69, 69, 70,
+ 70, 70, 69, 68, 69, 69, 69, 70, 70, 70,
+ /* Size 16 */
+ 64, 63, 62, 64, 67, 67, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 63, 63,
+ 64, 65, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 69, 69, 62, 64, 65, 66,
+ 67, 66, 66, 66, 66, 67, 67, 67, 68, 68, 68, 68, 64, 65, 66, 67, 67, 67,
+ 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 67, 67, 67, 67, 68, 67, 67, 67,
+ 67, 68, 68, 68, 68, 68, 69, 69, 67, 67, 66, 67, 67, 68, 68, 68, 68, 68,
+ 68, 68, 68, 69, 69, 69, 67, 67, 66, 67, 67, 68, 68, 68, 68, 68, 68, 69,
+ 69, 69, 69, 69, 67, 67, 66, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 69,
+ 69, 69, 67, 67, 66, 67, 67, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69,
+ 68, 67, 67, 67, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 70, 70, 68, 67,
+ 67, 67, 68, 68, 68, 69, 69, 69, 69, 69, 70, 70, 70, 70, 68, 68, 67, 68,
+ 68, 68, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 68, 68, 68, 68, 68, 68,
+ 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 69, 68, 68, 68, 68, 69, 69, 69,
+ 69, 69, 70, 70, 70, 70, 70, 70, 69, 69, 68, 68, 69, 69, 69, 69, 69, 70,
+ 70, 70, 70, 70, 70, 70, 69, 69, 68, 68, 69, 69, 69, 69, 69, 70, 70, 70,
+ 70, 70, 70, 70,
+ /* Size 32 */
+ 64, 64, 63, 63, 62, 63, 64, 65, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
+ 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 64, 63, 63, 63,
+ 63, 64, 65, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68,
+ 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 63, 63, 63, 63, 64, 64, 65, 66,
+ 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68,
+ 68, 68, 69, 69, 69, 69, 63, 63, 63, 64, 64, 65, 66, 66, 67, 67, 67, 66,
+ 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 62, 63, 64, 64, 65, 66, 66, 66, 67, 67, 66, 66, 66, 66, 66, 66,
+ 66, 66, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 63, 64,
+ 64, 65, 66, 66, 66, 67, 67, 67, 67, 67, 66, 66, 67, 67, 67, 67, 67, 67,
+ 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 64, 65, 65, 66, 66, 66,
+ 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68,
+ 68, 68, 68, 68, 68, 68, 68, 68, 65, 66, 66, 66, 66, 67, 67, 67, 67, 67,
+ 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 67, 67, 67, 67,
+ 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69,
+ 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 67, 67, 67, 67,
+ 66, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 67, 67, 67, 66, 66, 67, 67, 67,
+ 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69,
+ 69, 69, 69, 69, 69, 69, 67, 67, 67, 66, 66, 66, 67, 67, 67, 68, 68, 68,
+ 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 67, 67, 67, 66, 66, 66, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68,
+ 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 67, 67,
+ 67, 66, 66, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 67, 67, 67, 67, 66, 67,
+ 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 67, 67, 67, 67, 66, 67, 67, 67, 67, 68,
+ 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 67, 67, 67, 67, 66, 67, 67, 67, 67, 68, 68, 68, 68, 68,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 68, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 68, 67, 67, 67,
+ 67, 67, 67, 67, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 68, 68, 67, 67, 67, 67, 67, 67,
+ 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 68, 68, 68, 67, 67, 67, 67, 68, 68, 68, 68, 68,
+ 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 68, 68, 68, 67, 67, 67, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 68, 68,
+ 68, 68, 67, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69,
+ 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 68, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70, 69, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+ 68, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 69, 68, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
+ 69, 69, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69,
+ 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 69, 69, 69, 68,
+ 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 69, 69, 69, 68, 68, 68, 68, 68,
+ 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 70, 70, 70, 70, 69, 69, 69, 68, 68, 68, 68, 68, 69, 69, 69, 69,
+ 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
+ 70, 70, 69, 69, 69, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69,
+ 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 59, 63, 63, 64, 63, 64, 64, 65, 63, 64, 65, 66, 64, 65, 66, 66,
+ /* Size 8 */
+ 60, 58, 63, 63, 63, 64, 65, 65, 58, 61, 63, 62, 62, 63, 64, 64, 63, 63,
+ 64, 64, 64, 64, 64, 65, 63, 62, 64, 64, 64, 65, 65, 65, 63, 62, 64, 64,
+ 65, 65, 65, 66, 64, 63, 64, 65, 65, 65, 66, 66, 65, 64, 64, 65, 65, 66,
+ 66, 66, 65, 64, 65, 65, 66, 66, 66, 66,
+ /* Size 16 */
+ 60, 59, 58, 60, 63, 63, 63, 63, 63, 64, 64, 64, 64, 65, 65, 65, 59, 59,
+ 60, 61, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 58, 60, 61, 62,
+ 63, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 60, 61, 62, 62, 63, 63,
+ 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 63, 63, 63, 63, 64, 63, 63, 63,
+ 63, 63, 64, 64, 64, 64, 64, 64, 63, 63, 62, 63, 63, 64, 64, 64, 64, 64,
+ 64, 64, 64, 65, 65, 65, 63, 63, 62, 63, 63, 64, 64, 64, 64, 64, 64, 65,
+ 65, 65, 65, 65, 63, 63, 62, 63, 63, 64, 64, 64, 64, 65, 65, 65, 65, 65,
+ 65, 65, 63, 63, 62, 63, 63, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65,
+ 64, 63, 63, 63, 63, 64, 64, 65, 65, 65, 65, 65, 65, 65, 66, 66, 64, 63,
+ 63, 63, 64, 64, 64, 65, 65, 65, 65, 65, 66, 66, 66, 66, 64, 64, 63, 63,
+ 64, 64, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 64, 64, 63, 64, 64, 64,
+ 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 65, 64, 64, 64, 64, 65, 65, 65,
+ 65, 65, 66, 66, 66, 66, 66, 66, 65, 64, 64, 64, 64, 65, 65, 65, 65, 66,
+ 66, 66, 66, 66, 66, 66, 65, 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66,
+ 66, 66, 66, 66,
+ /* Size 32 */
+ 60, 59, 59, 59, 58, 59, 60, 61, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 59, 59, 59, 59,
+ 59, 60, 61, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64,
+ 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 59, 59, 59, 59, 60, 60, 61, 62,
+ 63, 63, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 59, 59, 59, 60, 60, 61, 61, 62, 63, 62, 62, 62,
+ 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64,
+ 64, 64, 58, 59, 60, 60, 61, 61, 62, 62, 63, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 59, 60,
+ 60, 61, 61, 62, 62, 62, 63, 63, 63, 62, 62, 62, 62, 62, 62, 63, 63, 63,
+ 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 60, 61, 61, 61, 62, 62,
+ 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 61, 62, 62, 62, 62, 62, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 63, 63, 63, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 62, 62,
+ 62, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 63, 63, 62, 62, 62, 62, 63, 63,
+ 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 65, 65, 65, 65, 65, 65, 63, 63, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65,
+ 65, 65, 63, 63, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 63, 63,
+ 63, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 63, 63, 63, 62, 62, 62,
+ 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65, 63, 63, 63, 62, 62, 62, 63, 63, 63, 63,
+ 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 63, 63, 63, 63, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64,
+ 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+ 63, 63, 63, 63, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 64, 63, 63, 63,
+ 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 64, 64, 63, 63, 63, 63, 63, 63,
+ 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+ 66, 66, 66, 66, 66, 66, 64, 64, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64,
+ 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66,
+ 66, 66, 64, 64, 64, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 64, 64,
+ 64, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 64, 64, 64, 64, 63, 64,
+ 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 65, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 65, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 65, 65, 64, 64,
+ 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 65, 65, 64, 64, 64, 64, 64, 64,
+ 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 65, 65, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 65, 65, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 8 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 16 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64,
+ /* Size 32 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 8 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 16 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64,
+ /* Size 32 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 8 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 16 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64,
+ /* Size 32 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 8 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 16 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64,
+ /* Size 32 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } } }
+};
+
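+/* Editorial sketch (not part of the upstream aom import): each
+   wt_matrix_ref[qm_level][plane][mode] row declared below packs the
+   4x4, 8x8, 16x16 and 32x32 weight matrices back to back, in that
+   order, which is why the innermost dimension is
+   4*4 + 8*8 + 16*16 + 32*32 = 1360 entries. The helper here is a
+   hypothetical illustration of how a caller could locate the matrix
+   for one transform size; its name and signature are assumptions
+   inferred from the "Size N" comments above, not an aom API. */
+static inline const uint16_t *qm_for_size_sketch(
+    const uint16_t *row, /* one [1360] row, e.g. wt_matrix_ref[l][p][m] */
+    int size) {          /* transform size: 4, 8, 16 or 32 */
+  int offset = 0;
+  if (size > 4) offset += 4 * 4;    /* skip past the 4x4 matrix */
+  if (size > 8) offset += 8 * 8;    /* skip past the 8x8 matrix */
+  if (size > 16) offset += 16 * 16; /* skip past the 16x16 matrix */
+  return row + offset;
+}
+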
+static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][2][4 * 4 + 8 * 8 + 16 * 16 +
+ 32 * 32] = {
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 58, 33, 19, 58, 37, 25, 17, 33, 25, 16, 12, 19, 17, 12, 10,
+ /* Size 8 */
+ 64, 87, 80, 59, 42, 31, 24, 19, 87, 75, 79, 66, 50, 38, 29, 23, 80, 79,
+ 54, 46, 38, 31, 25, 20, 59, 66, 46, 34, 29, 24, 21, 18, 42, 50, 38, 29,
+ 23, 20, 17, 15, 31, 38, 31, 24, 20, 17, 15, 13, 24, 29, 25, 21, 17, 15,
+ 13, 12, 19, 23, 20, 18, 15, 13, 12, 11,
+ /* Size 16 */
+ 64, 76, 87, 84, 80, 70, 59, 51, 42, 37, 31, 27, 24, 21, 19, 19, 76, 79,
+ 81, 81, 80, 71, 63, 55, 46, 40, 34, 30, 26, 23, 21, 21, 87, 81, 75, 77,
+ 79, 73, 66, 58, 50, 44, 38, 33, 29, 26, 23, 23, 84, 81, 77, 72, 67, 61,
+ 56, 50, 44, 39, 34, 31, 27, 24, 21, 21, 80, 80, 79, 67, 54, 50, 46, 42,
+ 38, 34, 31, 28, 25, 23, 20, 20, 70, 71, 73, 61, 50, 45, 40, 37, 33, 30,
+ 28, 25, 23, 21, 19, 19, 59, 63, 66, 56, 46, 40, 34, 31, 29, 26, 24, 22,
+ 21, 19, 18, 18, 51, 55, 58, 50, 42, 37, 31, 29, 26, 24, 22, 20, 19, 18,
+ 16, 16, 42, 46, 50, 44, 38, 33, 29, 26, 23, 21, 20, 18, 17, 16, 15, 15,
+ 37, 40, 44, 39, 34, 30, 26, 24, 21, 20, 18, 17, 16, 15, 14, 14, 31, 34,
+ 38, 34, 31, 28, 24, 22, 20, 18, 17, 16, 15, 14, 13, 13, 27, 30, 33, 31,
+ 28, 25, 22, 20, 18, 17, 16, 15, 14, 13, 13, 13, 24, 26, 29, 27, 25, 23,
+ 21, 19, 17, 16, 15, 14, 13, 13, 12, 12, 21, 23, 26, 24, 23, 21, 19, 18,
+ 16, 15, 14, 13, 13, 12, 12, 12, 19, 21, 23, 21, 20, 19, 18, 16, 15, 14,
+ 13, 13, 12, 12, 11, 11, 19, 21, 23, 21, 20, 19, 18, 16, 15, 14, 13, 13,
+ 12, 12, 11, 11,
+ /* Size 32 */
+ 64, 70, 76, 82, 87, 86, 84, 82, 80, 75, 70, 65, 59, 55, 51, 47, 42, 40,
+ 37, 34, 31, 29, 27, 26, 24, 22, 21, 20, 19, 19, 19, 19, 70, 74, 77, 81,
+ 84, 83, 82, 81, 80, 75, 71, 66, 61, 57, 53, 49, 44, 41, 39, 36, 33, 31,
+ 29, 27, 25, 24, 22, 21, 20, 20, 20, 20, 76, 77, 79, 80, 81, 81, 81, 80,
+ 80, 75, 71, 67, 63, 59, 55, 51, 46, 43, 40, 37, 34, 32, 30, 28, 26, 25,
+ 23, 22, 21, 21, 21, 21, 82, 81, 80, 79, 78, 79, 79, 79, 79, 76, 72, 68,
+ 65, 61, 56, 52, 48, 45, 42, 39, 36, 34, 32, 30, 27, 26, 25, 23, 22, 22,
+ 22, 22, 87, 84, 81, 78, 75, 76, 77, 78, 79, 76, 73, 70, 66, 62, 58, 54,
+ 50, 47, 44, 41, 38, 36, 33, 31, 29, 27, 26, 24, 23, 23, 23, 23, 86, 83,
+ 81, 79, 76, 75, 75, 74, 73, 70, 67, 64, 61, 58, 54, 51, 47, 44, 42, 39,
+ 36, 34, 32, 30, 28, 26, 25, 23, 22, 22, 22, 22, 84, 82, 81, 79, 77, 75,
+ 72, 69, 67, 64, 61, 59, 56, 53, 50, 47, 44, 42, 39, 37, 34, 32, 31, 29,
+ 27, 25, 24, 23, 21, 21, 21, 21, 82, 81, 80, 79, 78, 74, 69, 65, 61, 58,
+ 56, 53, 51, 48, 46, 44, 41, 39, 37, 35, 33, 31, 29, 28, 26, 25, 23, 22,
+ 21, 21, 21, 21, 80, 80, 80, 79, 79, 73, 67, 61, 54, 52, 50, 48, 46, 44,
+ 42, 40, 38, 36, 34, 33, 31, 29, 28, 26, 25, 24, 23, 22, 20, 20, 20, 20,
+ 75, 75, 75, 76, 76, 70, 64, 58, 52, 50, 47, 45, 43, 41, 39, 37, 36, 34,
+ 32, 31, 29, 28, 27, 25, 24, 23, 22, 21, 20, 20, 20, 20, 70, 71, 71, 72,
+ 73, 67, 61, 56, 50, 47, 45, 42, 40, 38, 37, 35, 33, 32, 30, 29, 28, 26,
+ 25, 24, 23, 22, 21, 20, 19, 19, 19, 19, 65, 66, 67, 68, 70, 64, 59, 53,
+ 48, 45, 42, 40, 37, 36, 34, 32, 31, 30, 28, 27, 26, 25, 24, 23, 22, 21,
+ 20, 19, 18, 18, 18, 18, 59, 61, 63, 65, 66, 61, 56, 51, 46, 43, 40, 37,
+ 34, 33, 31, 30, 29, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 18, 18, 18,
+ 18, 18, 55, 57, 59, 61, 62, 58, 53, 48, 44, 41, 38, 36, 33, 31, 30, 29,
+ 27, 26, 25, 24, 23, 22, 21, 21, 20, 19, 18, 18, 17, 17, 17, 17, 51, 53,
+ 55, 56, 58, 54, 50, 46, 42, 39, 37, 34, 31, 30, 29, 27, 26, 25, 24, 23,
+ 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 16, 16, 47, 49, 51, 52, 54, 51,
+ 47, 44, 40, 37, 35, 32, 30, 29, 27, 26, 24, 24, 23, 22, 21, 20, 19, 19,
+ 18, 18, 17, 16, 16, 16, 16, 16, 42, 44, 46, 48, 50, 47, 44, 41, 38, 36,
+ 33, 31, 29, 27, 26, 24, 23, 22, 21, 21, 20, 19, 18, 18, 17, 17, 16, 16,
+ 15, 15, 15, 15, 40, 41, 43, 45, 47, 44, 42, 39, 36, 34, 32, 30, 27, 26,
+ 25, 24, 22, 21, 21, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 15,
+ 37, 39, 40, 42, 44, 42, 39, 37, 34, 32, 30, 28, 26, 25, 24, 23, 21, 21,
+ 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 34, 36, 37, 39,
+ 41, 39, 37, 35, 33, 31, 29, 27, 25, 24, 23, 22, 21, 20, 19, 18, 18, 17,
+ 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 31, 33, 34, 36, 38, 36, 34, 33,
+ 31, 29, 28, 26, 24, 23, 22, 21, 20, 19, 18, 18, 17, 16, 16, 15, 15, 15,
+ 14, 14, 13, 13, 13, 13, 29, 31, 32, 34, 36, 34, 32, 31, 29, 28, 26, 25,
+ 23, 22, 21, 20, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13,
+ 13, 13, 27, 29, 30, 32, 33, 32, 31, 29, 28, 27, 25, 24, 22, 21, 20, 19,
+ 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 26, 27,
+ 28, 30, 31, 30, 29, 28, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 17, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 24, 25, 26, 27, 29, 28,
+ 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14,
+ 13, 13, 13, 12, 12, 12, 12, 12, 22, 24, 25, 26, 27, 26, 25, 25, 24, 23,
+ 22, 21, 20, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12,
+ 12, 12, 12, 12, 21, 22, 23, 25, 26, 25, 24, 23, 23, 22, 21, 20, 19, 18,
+ 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12,
+ 20, 21, 22, 23, 24, 23, 23, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 19, 20, 21, 22,
+ 23, 22, 21, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13,
+ 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 19, 20, 21, 22, 23, 22, 21, 21,
+ 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 19, 20, 21, 22, 23, 22, 21, 21, 20, 20, 19, 18,
+ 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11,
+ 11, 11, 19, 20, 21, 22, 23, 22, 21, 21, 20, 20, 19, 18, 18, 17, 16, 16,
+ 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 249, 225, 124, 69, 225, 139, 91, 60, 124, 91, 57, 42, 69, 60, 42, 32,
+ /* Size 8 */
+ 206, 285, 261, 191, 134, 96, 71, 55, 285, 245, 257, 214, 161, 118, 88,
+ 68, 261, 257, 174, 145, 119, 95, 75, 60, 191, 214, 145, 107, 87, 73, 61,
+ 51, 134, 161, 119, 87, 69, 58, 50, 43, 96, 118, 95, 73, 58, 48, 42, 37,
+ 71, 88, 75, 61, 50, 42, 36, 32, 55, 68, 60, 51, 43, 37, 32, 29,
+ /* Size 16 */
+ 217, 259, 300, 287, 275, 237, 200, 171, 141, 121, 101, 88, 75, 66, 57,
+ 57, 259, 269, 279, 275, 272, 243, 213, 184, 155, 134, 113, 98, 84, 74,
+ 64, 64, 300, 279, 257, 264, 270, 248, 226, 197, 169, 147, 124, 109, 93,
+ 82, 71, 71, 287, 275, 264, 245, 227, 208, 189, 168, 147, 130, 112, 99,
+ 86, 77, 67, 67, 275, 272, 270, 227, 183, 168, 152, 139, 125, 113, 100,
+ 90, 79, 71, 63, 63, 237, 243, 248, 208, 168, 150, 132, 120, 109, 98, 88,
+ 80, 72, 65, 58, 58, 200, 213, 226, 189, 152, 132, 113, 102, 92, 84, 77,
+ 70, 64, 59, 54, 54, 171, 184, 197, 168, 139, 120, 102, 92, 82, 75, 69,
+ 63, 58, 54, 49, 49, 141, 155, 169, 147, 125, 109, 92, 82, 73, 67, 61,
+ 56, 52, 49, 45, 45, 121, 134, 147, 130, 113, 98, 84, 75, 67, 61, 56, 52,
+ 48, 45, 42, 42, 101, 113, 124, 112, 100, 88, 77, 69, 61, 56, 50, 47, 44,
+ 41, 39, 39, 88, 98, 109, 99, 90, 80, 70, 63, 56, 52, 47, 44, 41, 39, 36,
+ 36, 75, 84, 93, 86, 79, 72, 64, 58, 52, 48, 44, 41, 38, 36, 34, 34, 66,
+ 74, 82, 77, 71, 65, 59, 54, 49, 45, 41, 39, 36, 34, 32, 32, 57, 64, 71,
+ 67, 63, 58, 54, 49, 45, 42, 39, 36, 34, 32, 31, 31, 57, 64, 71, 67, 63,
+ 58, 54, 49, 45, 42, 39, 36, 34, 32, 31, 31,
+ /* Size 32 */
+ 223, 244, 265, 287, 308, 301, 295, 288, 282, 263, 244, 225, 206, 190,
+ 175, 160, 145, 134, 124, 114, 104, 97, 90, 83, 77, 72, 68, 63, 59, 59,
+ 59, 59, 244, 257, 271, 284, 297, 293, 289, 285, 281, 264, 246, 229, 212,
+ 197, 182, 167, 152, 141, 131, 120, 110, 103, 95, 88, 81, 77, 72, 67, 62,
+ 62, 62, 62, 265, 271, 276, 281, 286, 284, 283, 281, 280, 264, 249, 234,
+ 219, 204, 189, 174, 159, 148, 137, 127, 116, 108, 101, 93, 86, 81, 76,
+ 71, 66, 66, 66, 66, 287, 284, 281, 278, 275, 276, 277, 278, 278, 265,
+ 252, 238, 225, 210, 196, 181, 166, 155, 144, 133, 122, 114, 106, 98, 91,
+ 85, 80, 75, 69, 69, 69, 69, 308, 297, 286, 275, 264, 267, 271, 274, 277,
+ 266, 254, 243, 231, 217, 203, 188, 174, 162, 151, 139, 128, 120, 111,
+ 103, 95, 90, 84, 78, 73, 73, 73, 73, 301, 293, 284, 276, 267, 264, 261,
+ 258, 255, 244, 234, 223, 213, 200, 187, 175, 162, 152, 142, 132, 121,
+ 114, 107, 99, 92, 87, 81, 76, 71, 71, 71, 71, 295, 289, 283, 277, 271,
+ 261, 252, 242, 233, 223, 213, 203, 194, 183, 172, 162, 151, 142, 133,
+ 124, 115, 108, 102, 95, 88, 83, 79, 74, 69, 69, 69, 69, 288, 285, 281,
+ 278, 274, 258, 242, 226, 210, 201, 193, 184, 175, 166, 157, 149, 140,
+ 132, 124, 117, 109, 103, 97, 91, 85, 80, 76, 71, 67, 67, 67, 67, 282,
+ 281, 280, 278, 277, 255, 233, 210, 188, 180, 172, 164, 156, 149, 142,
+ 135, 129, 122, 116, 109, 103, 97, 92, 87, 81, 77, 73, 69, 65, 65, 65,
+ 65, 263, 264, 264, 265, 266, 244, 223, 201, 180, 171, 163, 154, 146,
+ 139, 133, 126, 120, 114, 108, 103, 97, 92, 87, 82, 77, 74, 70, 66, 62,
+ 62, 62, 62, 244, 246, 249, 252, 254, 234, 213, 193, 172, 163, 154, 145,
+ 136, 130, 124, 118, 111, 106, 101, 96, 91, 86, 82, 78, 73, 70, 67, 63,
+ 60, 60, 60, 60, 225, 229, 234, 238, 243, 223, 203, 184, 164, 154, 145,
+ 135, 126, 120, 114, 109, 103, 98, 94, 89, 85, 81, 77, 73, 70, 67, 64,
+ 61, 57, 57, 57, 57, 206, 212, 219, 225, 231, 213, 194, 175, 156, 146,
+ 136, 126, 116, 110, 105, 100, 94, 90, 87, 83, 79, 76, 72, 69, 66, 63,
+ 60, 58, 55, 55, 55, 55, 190, 197, 204, 210, 217, 200, 183, 166, 149,
+ 139, 130, 120, 110, 105, 100, 95, 89, 86, 82, 78, 75, 72, 69, 66, 63,
+ 60, 58, 55, 53, 53, 53, 53, 175, 182, 189, 196, 203, 187, 172, 157, 142,
+ 133, 124, 114, 105, 100, 95, 90, 84, 81, 77, 74, 71, 68, 65, 62, 60, 57,
+ 55, 53, 51, 51, 51, 51, 160, 167, 174, 181, 188, 175, 162, 149, 135,
+ 126, 118, 109, 100, 95, 90, 84, 79, 76, 73, 70, 66, 64, 61, 59, 57, 55,
+ 53, 51, 49, 49, 49, 49, 145, 152, 159, 166, 174, 162, 151, 140, 129,
+ 120, 111, 103, 94, 89, 84, 79, 74, 71, 68, 65, 62, 60, 58, 56, 53, 52,
+ 50, 48, 46, 46, 46, 46, 134, 141, 148, 155, 162, 152, 142, 132, 122,
+ 114, 106, 98, 90, 86, 81, 76, 71, 68, 66, 63, 60, 58, 55, 53, 51, 50,
+ 48, 46, 45, 45, 45, 45, 124, 131, 137, 144, 151, 142, 133, 124, 116,
+ 108, 101, 94, 87, 82, 77, 73, 68, 66, 63, 60, 57, 55, 53, 51, 49, 48,
+ 46, 45, 43, 43, 43, 43, 114, 120, 127, 133, 139, 132, 124, 117, 109,
+ 103, 96, 89, 83, 78, 74, 70, 65, 63, 60, 57, 54, 53, 51, 49, 47, 46, 44,
+ 43, 41, 41, 41, 41, 104, 110, 116, 122, 128, 121, 115, 109, 103, 97, 91,
+ 85, 79, 75, 71, 66, 62, 60, 57, 54, 52, 50, 48, 47, 45, 44, 42, 41, 40,
+ 40, 40, 40, 97, 103, 108, 114, 120, 114, 108, 103, 97, 92, 86, 81, 76,
+ 72, 68, 64, 60, 58, 55, 53, 50, 48, 47, 45, 43, 42, 41, 40, 38, 38, 38,
+ 38, 90, 95, 101, 106, 111, 107, 102, 97, 92, 87, 82, 77, 72, 69, 65, 61,
+ 58, 55, 53, 51, 48, 47, 45, 44, 42, 41, 40, 38, 37, 37, 37, 37, 83, 88,
+ 93, 98, 103, 99, 95, 91, 87, 82, 78, 73, 69, 66, 62, 59, 56, 53, 51, 49,
+ 47, 45, 44, 42, 40, 39, 38, 37, 36, 36, 36, 36, 77, 81, 86, 91, 95, 92,
+ 88, 85, 81, 77, 73, 70, 66, 63, 60, 57, 53, 51, 49, 47, 45, 43, 42, 40,
+ 39, 38, 37, 36, 35, 35, 35, 35, 72, 77, 81, 85, 90, 87, 83, 80, 77, 74,
+ 70, 67, 63, 60, 57, 55, 52, 50, 48, 46, 44, 42, 41, 39, 38, 37, 36, 35,
+ 34, 34, 34, 34, 68, 72, 76, 80, 84, 81, 79, 76, 73, 70, 67, 64, 60, 58,
+ 55, 53, 50, 48, 46, 44, 42, 41, 40, 38, 37, 36, 35, 34, 33, 33, 33, 33,
+ 63, 67, 71, 75, 78, 76, 74, 71, 69, 66, 63, 61, 58, 55, 53, 51, 48, 46,
+ 45, 43, 41, 40, 38, 37, 36, 35, 34, 33, 32, 32, 32, 32, 59, 62, 66, 69,
+ 73, 71, 69, 67, 65, 62, 60, 57, 55, 53, 51, 49, 46, 45, 43, 41, 40, 38,
+ 37, 36, 35, 34, 33, 32, 31, 31, 31, 31, 59, 62, 66, 69, 73, 71, 69, 67,
+ 65, 62, 60, 57, 55, 53, 51, 49, 46, 45, 43, 41, 40, 38, 37, 36, 35, 34,
+ 33, 32, 31, 31, 31, 31, 59, 62, 66, 69, 73, 71, 69, 67, 65, 62, 60, 57,
+ 55, 53, 51, 49, 46, 45, 43, 41, 40, 38, 37, 36, 35, 34, 33, 32, 31, 31,
+ 31, 31, 59, 62, 66, 69, 73, 71, 69, 67, 65, 62, 60, 57, 55, 53, 51, 49,
+ 46, 45, 43, 41, 40, 38, 37, 36, 35, 34, 33, 32, 31, 31, 31, 31 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 39, 35, 27, 39, 31, 29, 25, 35, 29, 21, 18, 27, 25, 18, 15,
+ /* Size 8 */
+ 64, 81, 42, 39, 36, 32, 28, 24, 81, 54, 41, 46, 44, 40, 35, 30, 42, 41,
+ 34, 36, 36, 34, 31, 27, 39, 46, 36, 31, 29, 28, 26, 24, 36, 44, 36, 29,
+ 26, 24, 22, 20, 32, 40, 34, 28, 24, 21, 19, 18, 28, 35, 31, 26, 22, 19,
+ 17, 16, 24, 30, 27, 24, 20, 18, 16, 15,
+ /* Size 16 */
+ 64, 72, 81, 61, 42, 41, 39, 38, 36, 34, 32, 30, 28, 26, 24, 24, 72, 70,
+ 67, 54, 42, 42, 43, 42, 40, 38, 36, 34, 31, 29, 27, 27, 81, 67, 54, 48,
+ 41, 44, 46, 45, 44, 42, 40, 37, 35, 32, 30, 30, 61, 54, 48, 43, 38, 39,
+ 41, 40, 40, 39, 37, 35, 33, 31, 29, 29, 42, 42, 41, 38, 34, 35, 36, 36,
+ 36, 35, 34, 32, 31, 29, 27, 27, 41, 42, 44, 39, 35, 34, 33, 33, 33, 32,
+ 31, 30, 28, 27, 25, 25, 39, 43, 46, 41, 36, 33, 31, 30, 29, 29, 28, 27,
+ 26, 25, 24, 24, 38, 42, 45, 40, 36, 33, 30, 29, 27, 27, 26, 25, 24, 23,
+ 22, 22, 36, 40, 44, 40, 36, 33, 29, 27, 26, 25, 24, 23, 22, 21, 20, 20,
+ 34, 38, 42, 39, 35, 32, 29, 27, 25, 23, 22, 21, 21, 20, 19, 19, 32, 36,
+ 40, 37, 34, 31, 28, 26, 24, 22, 21, 20, 19, 19, 18, 18, 30, 34, 37, 35,
+ 32, 30, 27, 25, 23, 21, 20, 19, 18, 18, 17, 17, 28, 31, 35, 33, 31, 28,
+ 26, 24, 22, 21, 19, 18, 17, 17, 16, 16, 26, 29, 32, 31, 29, 27, 25, 23,
+ 21, 20, 19, 18, 17, 16, 15, 15, 24, 27, 30, 29, 27, 25, 24, 22, 20, 19,
+ 18, 17, 16, 15, 15, 15, 24, 27, 30, 29, 27, 25, 24, 22, 20, 19, 18, 17,
+ 16, 15, 15, 15,
+ /* Size 32 */
+ 64, 68, 72, 76, 81, 71, 61, 52, 42, 41, 41, 40, 39, 39, 38, 37, 36, 35,
+ 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 24, 24, 24, 68, 70, 71, 73,
+ 74, 66, 58, 50, 42, 42, 41, 41, 41, 40, 40, 39, 38, 37, 36, 35, 34, 33,
+ 32, 31, 29, 28, 27, 26, 25, 25, 25, 25, 72, 71, 70, 69, 67, 61, 54, 48,
+ 42, 42, 42, 42, 43, 42, 42, 41, 40, 39, 38, 37, 36, 35, 34, 32, 31, 30,
+ 29, 28, 27, 27, 27, 27, 76, 73, 69, 65, 61, 56, 51, 46, 41, 42, 43, 44,
+ 44, 44, 43, 43, 42, 41, 40, 39, 38, 37, 36, 34, 33, 32, 31, 30, 29, 29,
+ 29, 29, 81, 74, 67, 61, 54, 51, 48, 44, 41, 42, 44, 45, 46, 46, 45, 45,
+ 44, 43, 42, 41, 40, 39, 37, 36, 35, 34, 32, 31, 30, 30, 30, 30, 71, 66,
+ 61, 56, 51, 48, 45, 42, 39, 40, 41, 42, 43, 43, 43, 43, 42, 41, 40, 39,
+ 38, 37, 36, 35, 34, 33, 32, 30, 29, 29, 29, 29, 61, 58, 54, 51, 48, 45,
+ 43, 40, 38, 38, 39, 40, 41, 41, 40, 40, 40, 39, 39, 38, 37, 36, 35, 34,
+ 33, 32, 31, 30, 29, 29, 29, 29, 52, 50, 48, 46, 44, 42, 40, 38, 36, 37,
+ 37, 38, 38, 38, 38, 38, 38, 37, 37, 36, 35, 34, 33, 33, 32, 31, 30, 29,
+ 28, 28, 28, 28, 42, 42, 42, 41, 41, 39, 38, 36, 34, 35, 35, 35, 36, 36,
+ 36, 36, 36, 35, 35, 34, 34, 33, 32, 31, 31, 30, 29, 28, 27, 27, 27, 27,
+ 41, 42, 42, 42, 42, 40, 38, 37, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34,
+ 33, 33, 32, 32, 31, 30, 29, 29, 28, 27, 26, 26, 26, 26, 41, 41, 42, 43,
+ 44, 41, 39, 37, 35, 35, 34, 34, 33, 33, 33, 33, 33, 32, 32, 31, 31, 30,
+ 30, 29, 28, 28, 27, 26, 25, 25, 25, 25, 40, 41, 42, 44, 45, 42, 40, 38,
+ 35, 35, 34, 33, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 27, 26,
+ 26, 25, 25, 25, 25, 25, 39, 41, 43, 44, 46, 43, 41, 38, 36, 35, 33, 32,
+ 31, 31, 30, 30, 29, 29, 29, 28, 28, 27, 27, 26, 26, 25, 25, 24, 24, 24,
+ 24, 24, 39, 40, 42, 44, 46, 43, 41, 38, 36, 34, 33, 32, 31, 30, 29, 29,
+ 28, 28, 28, 27, 27, 26, 26, 25, 25, 24, 24, 23, 23, 23, 23, 23, 38, 40,
+ 42, 43, 45, 43, 40, 38, 36, 34, 33, 32, 30, 29, 29, 28, 27, 27, 27, 26,
+ 26, 25, 25, 24, 24, 23, 23, 23, 22, 22, 22, 22, 37, 39, 41, 43, 45, 43,
+ 40, 38, 36, 34, 33, 31, 30, 29, 28, 27, 27, 26, 26, 25, 25, 24, 24, 23,
+ 23, 23, 22, 22, 21, 21, 21, 21, 36, 38, 40, 42, 44, 42, 40, 38, 36, 34,
+ 33, 31, 29, 28, 27, 27, 26, 25, 25, 24, 24, 23, 23, 22, 22, 22, 21, 21,
+ 20, 20, 20, 20, 35, 37, 39, 41, 43, 41, 39, 37, 35, 34, 32, 30, 29, 28,
+ 27, 26, 25, 25, 24, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20,
+ 34, 36, 38, 40, 42, 40, 39, 37, 35, 33, 32, 30, 29, 28, 27, 26, 25, 24,
+ 23, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 33, 35, 37, 39,
+ 41, 39, 38, 36, 34, 33, 31, 30, 28, 27, 26, 25, 24, 23, 23, 22, 22, 21,
+ 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 32, 34, 36, 38, 40, 38, 37, 35,
+ 34, 32, 31, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 21, 20, 20, 19, 19,
+ 19, 18, 18, 18, 18, 18, 31, 33, 35, 37, 39, 37, 36, 34, 33, 32, 30, 29,
+ 27, 26, 25, 24, 23, 23, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17,
+ 17, 17, 30, 32, 34, 36, 37, 36, 35, 33, 32, 31, 30, 28, 27, 26, 25, 24,
+ 23, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 29, 31,
+ 32, 34, 36, 35, 34, 33, 31, 30, 29, 28, 26, 25, 24, 23, 22, 22, 21, 20,
+ 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 28, 29, 31, 33, 35, 34,
+ 33, 32, 31, 29, 28, 27, 26, 25, 24, 23, 22, 21, 21, 20, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 16, 16, 16, 27, 28, 30, 32, 34, 33, 32, 31, 30, 29,
+ 28, 26, 25, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16,
+ 16, 16, 16, 16, 26, 27, 29, 31, 32, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+ 23, 22, 21, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15,
+ 25, 26, 28, 30, 31, 30, 30, 29, 28, 27, 26, 25, 24, 23, 23, 22, 21, 20,
+ 20, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 24, 25, 27, 29,
+ 30, 29, 29, 28, 27, 26, 25, 25, 24, 23, 22, 21, 20, 20, 19, 19, 18, 17,
+ 17, 16, 16, 16, 15, 15, 15, 15, 15, 15, 24, 25, 27, 29, 30, 29, 29, 28,
+ 27, 26, 25, 25, 24, 23, 22, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16,
+ 15, 15, 15, 15, 15, 15, 24, 25, 27, 29, 30, 29, 29, 28, 27, 26, 25, 25,
+ 24, 23, 22, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15,
+ 15, 15, 24, 25, 27, 29, 30, 29, 29, 28, 27, 26, 25, 25, 24, 23, 22, 21,
+ 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 15 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 175, 103, 93, 70, 103, 83, 76, 64, 93, 76, 55, 46, 70, 64, 46, 36,
+ /* Size 8 */
+ 162, 205, 104, 97, 90, 78, 67, 57, 205, 136, 102, 115, 111, 99, 86, 73,
+ 104, 102, 84, 88, 88, 83, 75, 66, 97, 115, 88, 75, 71, 67, 62, 56, 90,
+ 111, 88, 71, 62, 56, 52, 48, 78, 99, 83, 67, 56, 49, 45, 41, 67, 86, 75,
+ 62, 52, 45, 40, 36, 57, 73, 66, 56, 48, 41, 36, 33,
+ /* Size 16 */
+ 168, 190, 213, 160, 108, 104, 101, 97, 93, 87, 81, 75, 69, 64, 59, 59,
+ 190, 184, 177, 142, 107, 108, 110, 107, 104, 98, 92, 85, 79, 73, 67, 67,
+ 213, 177, 141, 123, 106, 112, 119, 117, 115, 109, 103, 96, 89, 82, 76,
+ 76, 160, 142, 123, 110, 96, 101, 105, 104, 103, 99, 94, 89, 83, 77, 72,
+ 72, 108, 107, 106, 96, 87, 89, 91, 91, 91, 89, 86, 82, 77, 73, 68, 68,
+ 104, 108, 112, 101, 89, 87, 85, 84, 82, 80, 78, 74, 71, 67, 63, 63, 101,
+ 110, 119, 105, 91, 85, 78, 76, 74, 72, 70, 67, 64, 61, 58, 58, 97, 107,
+ 117, 104, 91, 84, 76, 72, 69, 66, 64, 62, 59, 57, 54, 54, 93, 104, 115,
+ 103, 91, 82, 74, 69, 64, 61, 58, 56, 54, 52, 50, 50, 87, 98, 109, 99,
+ 89, 80, 72, 66, 61, 58, 55, 52, 50, 48, 46, 46, 81, 92, 103, 94, 86, 78,
+ 70, 64, 58, 55, 51, 49, 46, 45, 43, 43, 75, 85, 96, 89, 82, 74, 67, 62,
+ 56, 52, 49, 46, 44, 42, 40, 40, 69, 79, 89, 83, 77, 71, 64, 59, 54, 50,
+ 46, 44, 41, 40, 38, 38, 64, 73, 82, 77, 73, 67, 61, 57, 52, 48, 45, 42,
+ 40, 38, 36, 36, 59, 67, 76, 72, 68, 63, 58, 54, 50, 46, 43, 40, 38, 36,
+ 34, 34, 59, 67, 76, 72, 68, 63, 58, 54, 50, 46, 43, 40, 38, 36, 34, 34,
+ /* Size 32 */
+ 171, 182, 194, 205, 217, 190, 163, 137, 110, 108, 106, 104, 103, 101,
+ 99, 97, 95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 63, 60, 60, 60, 60,
+ 182, 186, 190, 194, 198, 176, 154, 132, 109, 109, 108, 108, 107, 105,
+ 104, 102, 100, 97, 94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 64, 64,
+ 64, 194, 190, 187, 184, 180, 162, 144, 127, 109, 110, 110, 111, 112,
+ 110, 109, 107, 106, 103, 100, 97, 94, 90, 87, 84, 80, 78, 75, 72, 69,
+ 69, 69, 69, 205, 194, 184, 173, 162, 148, 135, 122, 108, 110, 112, 114,
+ 116, 115, 114, 113, 111, 108, 105, 102, 99, 96, 92, 89, 85, 82, 79, 76,
+ 73, 73, 73, 73, 217, 198, 180, 162, 144, 135, 126, 117, 108, 111, 114,
+ 118, 121, 120, 119, 118, 117, 114, 111, 108, 104, 101, 97, 94, 90, 87,
+ 84, 80, 77, 77, 77, 77, 190, 176, 162, 148, 135, 127, 119, 111, 103,
+ 106, 108, 111, 114, 113, 112, 112, 111, 108, 106, 103, 100, 97, 94, 91,
+ 87, 84, 81, 78, 75, 75, 75, 75, 163, 154, 144, 135, 126, 119, 112, 105,
+ 98, 100, 103, 105, 107, 106, 106, 105, 105, 103, 101, 98, 96, 93, 90,
+ 87, 85, 82, 79, 76, 73, 73, 73, 73, 137, 132, 127, 122, 117, 111, 105,
+ 99, 94, 95, 97, 98, 100, 100, 99, 99, 99, 97, 95, 94, 92, 89, 87, 84,
+ 82, 79, 76, 74, 71, 71, 71, 71, 110, 109, 109, 108, 108, 103, 98, 94,
+ 89, 90, 91, 92, 93, 93, 93, 93, 93, 92, 90, 89, 87, 85, 83, 81, 79, 76,
+ 74, 72, 69, 69, 69, 69, 108, 109, 110, 110, 111, 106, 100, 95, 90, 90,
+ 90, 90, 89, 89, 89, 89, 89, 87, 86, 85, 83, 81, 79, 77, 75, 73, 71, 69,
+ 67, 67, 67, 67, 106, 108, 110, 112, 114, 108, 103, 97, 91, 90, 89, 87,
+ 86, 86, 85, 85, 84, 83, 82, 80, 79, 77, 76, 74, 72, 70, 68, 66, 64, 64,
+ 64, 64, 104, 108, 111, 114, 118, 111, 105, 98, 92, 90, 87, 85, 83, 82,
+ 81, 80, 79, 78, 77, 76, 75, 74, 72, 70, 69, 67, 65, 64, 62, 62, 62, 62,
+ 103, 107, 112, 116, 121, 114, 107, 100, 93, 89, 86, 83, 80, 78, 77, 76,
+ 75, 74, 73, 72, 71, 70, 68, 67, 66, 64, 63, 61, 60, 60, 60, 60, 101,
+ 105, 110, 115, 120, 113, 106, 100, 93, 89, 86, 82, 78, 77, 75, 74, 72,
+ 71, 70, 69, 68, 67, 65, 64, 63, 62, 60, 59, 57, 57, 57, 57, 99, 104,
+ 109, 114, 119, 112, 106, 99, 93, 89, 85, 81, 77, 75, 74, 72, 70, 69, 68,
+ 66, 65, 64, 63, 61, 60, 59, 58, 56, 55, 55, 55, 55, 97, 102, 107, 113,
+ 118, 112, 105, 99, 93, 89, 85, 80, 76, 74, 72, 70, 67, 66, 65, 64, 62,
+ 61, 60, 59, 58, 56, 55, 54, 53, 53, 53, 53, 95, 100, 106, 111, 117, 111,
+ 105, 99, 93, 89, 84, 79, 75, 72, 70, 67, 65, 64, 62, 61, 59, 58, 57, 56,
+ 55, 54, 53, 52, 51, 51, 51, 51, 92, 97, 103, 108, 114, 108, 103, 97, 92,
+ 87, 83, 78, 74, 71, 69, 66, 64, 62, 61, 59, 57, 56, 55, 54, 53, 52, 51,
+ 50, 49, 49, 49, 49, 89, 94, 100, 105, 111, 106, 101, 95, 90, 86, 82, 77,
+ 73, 70, 68, 65, 62, 61, 59, 57, 56, 55, 53, 52, 51, 50, 49, 48, 47, 47,
+ 47, 47, 86, 91, 97, 102, 108, 103, 98, 94, 89, 85, 80, 76, 72, 69, 66,
+ 64, 61, 59, 57, 56, 54, 53, 52, 50, 49, 48, 47, 46, 45, 45, 45, 45, 83,
+ 88, 94, 99, 104, 100, 96, 92, 87, 83, 79, 75, 71, 68, 65, 62, 59, 57,
+ 56, 54, 52, 51, 50, 48, 47, 46, 45, 44, 44, 44, 44, 44, 80, 85, 90, 96,
+ 101, 97, 93, 89, 85, 81, 77, 74, 70, 67, 64, 61, 58, 56, 55, 53, 51, 50,
+ 48, 47, 46, 45, 44, 43, 42, 42, 42, 42, 77, 82, 87, 92, 97, 94, 90, 87,
+ 83, 79, 76, 72, 68, 65, 63, 60, 57, 55, 53, 52, 50, 48, 47, 46, 45, 44,
+ 43, 42, 41, 41, 41, 41, 74, 79, 84, 89, 94, 91, 87, 84, 81, 77, 74, 70,
+ 67, 64, 61, 59, 56, 54, 52, 50, 48, 47, 46, 45, 43, 42, 42, 41, 40, 40,
+ 40, 40, 71, 76, 80, 85, 90, 87, 85, 82, 79, 75, 72, 69, 66, 63, 60, 58,
+ 55, 53, 51, 49, 47, 46, 45, 43, 42, 41, 40, 39, 38, 38, 38, 38, 68, 73,
+ 78, 82, 87, 84, 82, 79, 76, 73, 70, 67, 64, 62, 59, 56, 54, 52, 50, 48,
+ 46, 45, 44, 42, 41, 40, 39, 38, 37, 37, 37, 37, 65, 70, 75, 79, 84, 81,
+ 79, 76, 74, 71, 68, 65, 63, 60, 58, 55, 53, 51, 49, 47, 45, 44, 43, 42,
+ 40, 39, 38, 37, 37, 37, 37, 37, 63, 67, 72, 76, 80, 78, 76, 74, 72, 69,
+ 66, 64, 61, 59, 56, 54, 52, 50, 48, 46, 44, 43, 42, 41, 39, 38, 37, 37,
+ 36, 36, 36, 36, 60, 64, 69, 73, 77, 75, 73, 71, 69, 67, 64, 62, 60, 57,
+ 55, 53, 51, 49, 47, 45, 44, 42, 41, 40, 38, 37, 37, 36, 35, 35, 35, 35,
+ 60, 64, 69, 73, 77, 75, 73, 71, 69, 67, 64, 62, 60, 57, 55, 53, 51, 49,
+ 47, 45, 44, 42, 41, 40, 38, 37, 37, 36, 35, 35, 35, 35, 60, 64, 69, 73,
+ 77, 75, 73, 71, 69, 67, 64, 62, 60, 57, 55, 53, 51, 49, 47, 45, 44, 42,
+ 41, 40, 38, 37, 37, 36, 35, 35, 35, 35, 60, 64, 69, 73, 77, 75, 73, 71,
+ 69, 67, 64, 62, 60, 57, 55, 53, 51, 49, 47, 45, 44, 42, 41, 40, 38, 37,
+ 37, 36, 35, 35, 35, 35 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 58, 34, 21, 58, 38, 26, 19, 34, 26, 18, 14, 21, 19, 14, 12,
+ /* Size 8 */
+ 64, 86, 80, 59, 43, 33, 25, 21, 86, 75, 78, 66, 51, 39, 30, 24, 80, 78,
+ 55, 46, 39, 32, 27, 22, 59, 66, 46, 36, 30, 26, 23, 20, 43, 51, 39, 30,
+ 25, 22, 19, 17, 33, 39, 32, 26, 22, 19, 17, 16, 25, 30, 27, 23, 19, 17,
+ 16, 14, 21, 24, 22, 20, 17, 16, 14, 14,
+ /* Size 16 */
+ 64, 75, 86, 83, 80, 70, 59, 51, 43, 38, 33, 29, 25, 23, 21, 21, 75, 78,
+ 81, 80, 79, 71, 63, 55, 47, 41, 36, 32, 28, 25, 23, 23, 86, 81, 75, 77,
+ 78, 72, 66, 59, 51, 45, 39, 35, 30, 27, 24, 24, 83, 80, 77, 72, 67, 61,
+ 56, 51, 45, 40, 36, 32, 29, 26, 23, 23, 80, 79, 78, 67, 55, 51, 46, 43,
+ 39, 36, 32, 29, 27, 24, 22, 22, 70, 71, 72, 61, 51, 46, 41, 38, 35, 32,
+ 29, 27, 25, 23, 21, 21, 59, 63, 66, 56, 46, 41, 36, 33, 30, 28, 26, 24,
+ 23, 21, 20, 20, 51, 55, 59, 51, 43, 38, 33, 30, 28, 26, 24, 22, 21, 20,
+ 19, 19, 43, 47, 51, 45, 39, 35, 30, 28, 25, 23, 22, 21, 19, 18, 17, 17,
+ 38, 41, 45, 40, 36, 32, 28, 26, 23, 22, 20, 19, 18, 17, 17, 17, 33, 36,
+ 39, 36, 32, 29, 26, 24, 22, 20, 19, 18, 17, 16, 16, 16, 29, 32, 35, 32,
+ 29, 27, 24, 22, 21, 19, 18, 17, 16, 16, 15, 15, 25, 28, 30, 29, 27, 25,
+ 23, 21, 19, 18, 17, 16, 16, 15, 14, 14, 23, 25, 27, 26, 24, 23, 21, 20,
+ 18, 17, 16, 16, 15, 14, 14, 14, 21, 23, 24, 23, 22, 21, 20, 19, 17, 17,
+ 16, 15, 14, 14, 14, 14, 21, 23, 24, 23, 22, 21, 20, 19, 17, 17, 16, 15,
+ 14, 14, 14, 14,
+ /* Size 32 */
+ 64, 70, 75, 81, 86, 85, 83, 81, 80, 75, 70, 64, 59, 55, 51, 47, 43, 41,
+ 38, 35, 33, 31, 29, 27, 25, 24, 23, 22, 21, 21, 21, 21, 70, 73, 77, 80,
+ 84, 82, 81, 80, 79, 75, 70, 66, 61, 57, 53, 49, 45, 43, 40, 37, 34, 32,
+ 30, 29, 27, 25, 24, 23, 22, 22, 22, 22, 75, 77, 78, 79, 81, 80, 80, 79,
+ 79, 75, 71, 67, 63, 59, 55, 51, 47, 44, 41, 39, 36, 34, 32, 30, 28, 27,
+ 25, 24, 23, 23, 23, 23, 81, 80, 79, 78, 78, 78, 78, 78, 79, 75, 72, 68,
+ 65, 61, 57, 53, 49, 46, 43, 40, 37, 35, 33, 31, 29, 28, 26, 25, 24, 24,
+ 24, 24, 86, 84, 81, 78, 75, 76, 77, 77, 78, 75, 72, 69, 66, 62, 59, 55,
+ 51, 48, 45, 42, 39, 37, 35, 33, 30, 29, 27, 26, 24, 24, 24, 24, 85, 82,
+ 80, 78, 76, 75, 74, 73, 72, 70, 67, 64, 61, 58, 55, 51, 48, 45, 43, 40,
+ 37, 35, 33, 31, 29, 28, 27, 25, 24, 24, 24, 24, 83, 81, 80, 78, 77, 74,
+ 72, 69, 67, 64, 61, 59, 56, 54, 51, 48, 45, 43, 40, 38, 36, 34, 32, 30,
+ 29, 27, 26, 25, 23, 23, 23, 23, 81, 80, 79, 78, 77, 73, 69, 65, 61, 58,
+ 56, 54, 51, 49, 47, 44, 42, 40, 38, 36, 34, 32, 31, 29, 28, 26, 25, 24,
+ 23, 23, 23, 23, 80, 79, 79, 79, 78, 72, 67, 61, 55, 53, 51, 48, 46, 45,
+ 43, 41, 39, 37, 36, 34, 32, 31, 29, 28, 27, 26, 24, 23, 22, 22, 22, 22,
+ 75, 75, 75, 75, 75, 70, 64, 58, 53, 50, 48, 46, 44, 42, 40, 39, 37, 35,
+ 34, 32, 31, 29, 28, 27, 26, 25, 24, 23, 22, 22, 22, 22, 70, 70, 71, 72,
+ 72, 67, 61, 56, 51, 48, 46, 43, 41, 39, 38, 36, 35, 33, 32, 31, 29, 28,
+ 27, 26, 25, 24, 23, 22, 21, 21, 21, 21, 64, 66, 67, 68, 69, 64, 59, 54,
+ 48, 46, 43, 41, 38, 37, 35, 34, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23,
+ 22, 21, 20, 20, 20, 20, 59, 61, 63, 65, 66, 61, 56, 51, 46, 44, 41, 38,
+ 36, 34, 33, 32, 30, 29, 28, 27, 26, 25, 24, 23, 23, 22, 21, 20, 20, 20,
+ 20, 20, 55, 57, 59, 61, 62, 58, 54, 49, 45, 42, 39, 37, 34, 33, 32, 30,
+ 29, 28, 27, 26, 25, 24, 23, 23, 22, 21, 20, 20, 19, 19, 19, 19, 51, 53,
+ 55, 57, 59, 55, 51, 47, 43, 40, 38, 35, 33, 32, 30, 29, 28, 27, 26, 25,
+ 24, 23, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 47, 49, 51, 53, 55, 51,
+ 48, 44, 41, 39, 36, 34, 32, 30, 29, 28, 26, 25, 24, 24, 23, 22, 21, 21,
+ 20, 20, 19, 19, 18, 18, 18, 18, 43, 45, 47, 49, 51, 48, 45, 42, 39, 37,
+ 35, 32, 30, 29, 28, 26, 25, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18,
+ 17, 17, 17, 17, 41, 43, 44, 46, 48, 45, 43, 40, 37, 35, 33, 31, 29, 28,
+ 27, 25, 24, 23, 23, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 17, 17,
+ 38, 40, 41, 43, 45, 43, 40, 38, 36, 34, 32, 30, 28, 27, 26, 24, 23, 23,
+ 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 35, 37, 39, 40,
+ 42, 40, 38, 36, 34, 32, 31, 29, 27, 26, 25, 24, 22, 22, 21, 20, 20, 19,
+ 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 33, 34, 36, 37, 39, 37, 36, 34,
+ 32, 31, 29, 28, 26, 25, 24, 23, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17,
+ 16, 16, 16, 16, 16, 16, 31, 32, 34, 35, 37, 35, 34, 32, 31, 29, 28, 27,
+ 25, 24, 23, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15,
+ 15, 15, 29, 30, 32, 33, 35, 33, 32, 31, 29, 28, 27, 26, 24, 23, 22, 21,
+ 21, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 27, 29,
+ 30, 31, 33, 31, 30, 29, 28, 27, 26, 25, 23, 23, 22, 21, 20, 19, 19, 18,
+ 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 15, 25, 27, 28, 29, 30, 29,
+ 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 24, 25, 27, 28, 29, 28, 27, 26, 26, 25,
+ 24, 23, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 14,
+ 14, 14, 14, 14, 23, 24, 25, 26, 27, 27, 26, 25, 24, 24, 23, 22, 21, 20,
+ 20, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14,
+ 22, 23, 24, 25, 26, 25, 25, 24, 23, 23, 22, 21, 20, 20, 19, 19, 18, 17,
+ 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 21, 22, 23, 24,
+ 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16, 16, 15,
+ 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 21, 22, 23, 24, 24, 24, 23, 23,
+ 22, 22, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14,
+ 14, 14, 14, 14, 14, 14, 21, 22, 23, 24, 24, 24, 23, 23, 22, 22, 21, 20,
+ 20, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14,
+ 14, 14, 21, 22, 23, 24, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18,
+ 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 217, 196, 112, 65, 196, 124, 84, 58, 112, 84, 55, 43, 65, 58, 43, 34,
+ /* Size 8 */
+ 180, 246, 226, 167, 120, 88, 67, 53, 246, 212, 222, 187, 142, 107, 81,
+ 64, 226, 222, 153, 128, 107, 87, 71, 58, 167, 187, 128, 97, 81, 69, 59,
+ 50, 120, 142, 107, 81, 65, 56, 49, 44, 88, 107, 87, 69, 56, 48, 43, 39,
+ 67, 81, 71, 59, 49, 43, 38, 35, 53, 64, 58, 50, 44, 39, 35, 32,
+ /* Size 16 */
+ 188, 223, 257, 246, 236, 205, 174, 150, 125, 108, 92, 81, 70, 63, 56,
+ 56, 223, 231, 239, 237, 234, 209, 185, 161, 137, 119, 102, 90, 78, 70,
+ 62, 62, 257, 239, 221, 227, 232, 214, 195, 172, 148, 130, 111, 98, 85,
+ 76, 67, 67, 246, 237, 227, 211, 196, 180, 165, 148, 130, 116, 101, 90,
+ 80, 72, 64, 64, 236, 234, 232, 196, 160, 147, 134, 123, 112, 102, 91,
+ 83, 74, 67, 61, 61, 205, 209, 214, 180, 147, 133, 118, 108, 98, 90, 82,
+ 75, 68, 62, 57, 57, 174, 185, 195, 165, 134, 118, 102, 93, 84, 78, 72,
+ 67, 61, 57, 53, 53, 150, 161, 172, 148, 123, 108, 93, 85, 76, 71, 65,
+ 61, 56, 53, 49, 49, 125, 137, 148, 130, 112, 98, 84, 76, 68, 64, 59, 55,
+ 51, 49, 46, 46, 108, 119, 130, 116, 102, 90, 78, 71, 64, 59, 54, 51, 48,
+ 46, 43, 43, 92, 102, 111, 101, 91, 82, 72, 65, 59, 54, 50, 47, 45, 42,
+ 40, 40, 81, 90, 98, 90, 83, 75, 67, 61, 55, 51, 47, 45, 42, 40, 38, 38,
+ 70, 78, 85, 80, 74, 68, 61, 56, 51, 48, 45, 42, 40, 38, 36, 36, 63, 70,
+ 76, 72, 67, 62, 57, 53, 49, 46, 42, 40, 38, 37, 35, 35, 56, 62, 67, 64,
+ 61, 57, 53, 49, 46, 43, 40, 38, 36, 35, 34, 34, 56, 62, 67, 64, 61, 57,
+ 53, 49, 46, 43, 40, 38, 36, 35, 34, 34,
+ /* Size 32 */
+ 193, 210, 228, 245, 263, 258, 252, 247, 241, 226, 210, 194, 178, 166,
+ 153, 141, 128, 120, 111, 103, 94, 89, 83, 77, 72, 68, 65, 61, 57, 57,
+ 57, 57, 210, 221, 232, 243, 254, 250, 247, 244, 240, 226, 212, 198, 184,
+ 171, 159, 146, 134, 125, 117, 108, 99, 93, 87, 82, 76, 72, 68, 64, 60,
+ 60, 60, 60, 228, 232, 236, 240, 245, 243, 242, 241, 239, 227, 214, 202,
+ 189, 177, 165, 152, 140, 131, 122, 113, 104, 98, 92, 86, 80, 75, 71, 67,
+ 63, 63, 63, 63, 245, 243, 240, 238, 236, 236, 237, 238, 238, 227, 216,
+ 205, 194, 182, 170, 158, 146, 137, 127, 118, 109, 103, 96, 90, 83, 79,
+ 75, 70, 66, 66, 66, 66, 263, 254, 245, 236, 227, 229, 232, 235, 238,
+ 228, 219, 209, 200, 188, 176, 164, 152, 142, 133, 123, 114, 107, 101,
+ 94, 87, 83, 78, 73, 69, 69, 69, 69, 258, 250, 243, 236, 229, 227, 224,
+ 222, 219, 210, 202, 193, 184, 174, 163, 153, 143, 134, 126, 117, 109,
+ 103, 97, 90, 84, 80, 76, 71, 67, 67, 67, 67, 252, 247, 242, 237, 232,
+ 224, 216, 208, 201, 193, 185, 177, 169, 160, 151, 142, 133, 126, 118,
+ 111, 104, 98, 93, 87, 81, 77, 73, 69, 65, 65, 65, 65, 247, 244, 241,
+ 238, 235, 222, 208, 195, 182, 175, 168, 160, 153, 146, 139, 131, 124,
+ 118, 111, 105, 98, 93, 88, 83, 78, 75, 71, 67, 64, 64, 64, 64, 241, 240,
+ 239, 238, 238, 219, 201, 182, 164, 157, 151, 144, 137, 132, 126, 120,
+ 115, 109, 104, 99, 93, 89, 84, 80, 76, 72, 69, 65, 62, 62, 62, 62, 226,
+ 226, 227, 227, 228, 210, 193, 175, 157, 150, 143, 136, 129, 124, 118,
+ 113, 108, 103, 98, 93, 88, 84, 80, 76, 72, 69, 66, 63, 60, 60, 60, 60,
+ 210, 212, 214, 216, 219, 202, 185, 168, 151, 143, 136, 128, 121, 116,
+ 111, 106, 101, 96, 92, 88, 83, 80, 76, 73, 69, 66, 64, 61, 58, 58, 58,
+ 58, 194, 198, 202, 205, 209, 193, 177, 160, 144, 136, 128, 120, 112,
+ 108, 103, 98, 94, 90, 86, 82, 79, 75, 72, 69, 66, 64, 61, 59, 56, 56,
+ 56, 56, 178, 184, 189, 194, 200, 184, 169, 153, 137, 129, 121, 112, 104,
+ 100, 95, 91, 86, 83, 80, 77, 74, 71, 68, 66, 63, 61, 58, 56, 54, 54, 54,
+ 54, 166, 171, 177, 182, 188, 174, 160, 146, 132, 124, 116, 108, 100, 95,
+ 91, 87, 82, 79, 76, 73, 70, 68, 65, 63, 60, 58, 56, 54, 52, 52, 52, 52,
+ 153, 159, 165, 170, 176, 163, 151, 139, 126, 118, 111, 103, 95, 91, 87,
+ 82, 78, 75, 73, 70, 67, 65, 62, 60, 58, 56, 54, 52, 50, 50, 50, 50, 141,
+ 146, 152, 158, 164, 153, 142, 131, 120, 113, 106, 98, 91, 87, 82, 78,
+ 74, 71, 69, 66, 63, 61, 59, 57, 55, 54, 52, 50, 49, 49, 49, 49, 128,
+ 134, 140, 146, 152, 143, 133, 124, 115, 108, 101, 94, 86, 82, 78, 74,
+ 70, 68, 65, 63, 60, 58, 56, 54, 53, 51, 50, 48, 47, 47, 47, 47, 120,
+ 125, 131, 137, 142, 134, 126, 118, 109, 103, 96, 90, 83, 79, 75, 71, 68,
+ 65, 63, 60, 58, 56, 54, 53, 51, 50, 48, 47, 45, 45, 45, 45, 111, 117,
+ 122, 127, 133, 126, 118, 111, 104, 98, 92, 86, 80, 76, 73, 69, 65, 63,
+ 60, 58, 56, 54, 52, 51, 49, 48, 47, 45, 44, 44, 44, 44, 103, 108, 113,
+ 118, 123, 117, 111, 105, 99, 93, 88, 82, 77, 73, 70, 66, 63, 60, 58, 56,
+ 53, 52, 50, 49, 47, 46, 45, 44, 43, 43, 43, 43, 94, 99, 104, 109, 114,
+ 109, 104, 98, 93, 88, 83, 79, 74, 70, 67, 63, 60, 58, 56, 53, 51, 50,
+ 48, 47, 46, 44, 43, 42, 41, 41, 41, 41, 89, 93, 98, 103, 107, 103, 98,
+ 93, 89, 84, 80, 75, 71, 68, 65, 61, 58, 56, 54, 52, 50, 49, 47, 46, 44,
+ 43, 42, 41, 40, 40, 40, 40, 83, 87, 92, 96, 101, 97, 93, 88, 84, 80, 76,
+ 72, 68, 65, 62, 59, 56, 54, 52, 50, 48, 47, 46, 44, 43, 42, 41, 40, 39,
+ 39, 39, 39, 77, 82, 86, 90, 94, 90, 87, 83, 80, 76, 73, 69, 66, 63, 60,
+ 57, 54, 53, 51, 49, 47, 46, 44, 43, 42, 41, 40, 39, 38, 38, 38, 38, 72,
+ 76, 80, 83, 87, 84, 81, 78, 76, 72, 69, 66, 63, 60, 58, 55, 53, 51, 49,
+ 47, 46, 44, 43, 42, 41, 40, 39, 38, 37, 37, 37, 37, 68, 72, 75, 79, 83,
+ 80, 77, 75, 72, 69, 66, 64, 61, 58, 56, 54, 51, 50, 48, 46, 44, 43, 42,
+ 41, 40, 39, 38, 37, 37, 37, 37, 37, 65, 68, 71, 75, 78, 76, 73, 71, 69,
+ 66, 64, 61, 58, 56, 54, 52, 50, 48, 47, 45, 43, 42, 41, 40, 39, 38, 37,
+ 37, 36, 36, 36, 36, 61, 64, 67, 70, 73, 71, 69, 67, 65, 63, 61, 59, 56,
+ 54, 52, 50, 48, 47, 45, 44, 42, 41, 40, 39, 38, 37, 37, 36, 35, 35, 35,
+ 35, 57, 60, 63, 66, 69, 67, 65, 64, 62, 60, 58, 56, 54, 52, 50, 49, 47,
+ 45, 44, 43, 41, 40, 39, 38, 37, 37, 36, 35, 34, 34, 34, 34, 57, 60, 63,
+ 66, 69, 67, 65, 64, 62, 60, 58, 56, 54, 52, 50, 49, 47, 45, 44, 43, 41,
+ 40, 39, 38, 37, 37, 36, 35, 34, 34, 34, 34, 57, 60, 63, 66, 69, 67, 65,
+ 64, 62, 60, 58, 56, 54, 52, 50, 49, 47, 45, 44, 43, 41, 40, 39, 38, 37,
+ 37, 36, 35, 34, 34, 34, 34, 57, 60, 63, 66, 69, 67, 65, 64, 62, 60, 58,
+ 56, 54, 52, 50, 49, 47, 45, 44, 43, 41, 40, 39, 38, 37, 37, 36, 35, 34,
+ 34, 34, 34 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 39, 36, 28, 39, 32, 30, 26, 36, 30, 23, 20, 28, 26, 20, 16,
+ /* Size 8 */
+ 64, 80, 43, 40, 37, 33, 29, 25, 80, 55, 42, 47, 45, 41, 36, 31, 43, 42,
+ 35, 37, 37, 35, 32, 29, 40, 47, 37, 32, 31, 29, 27, 25, 37, 45, 37, 31,
+ 27, 25, 24, 22, 33, 41, 35, 29, 25, 23, 21, 20, 29, 36, 32, 27, 24, 21,
+ 19, 18, 25, 31, 29, 25, 22, 20, 18, 17,
+ /* Size 16 */
+ 64, 72, 80, 61, 43, 41, 40, 39, 37, 35, 33, 31, 29, 27, 25, 25, 72, 70,
+ 67, 55, 42, 43, 43, 42, 41, 39, 37, 35, 32, 30, 28, 28, 80, 67, 55, 48,
+ 42, 44, 47, 46, 45, 43, 41, 38, 36, 34, 31, 31, 61, 55, 48, 43, 39, 40,
+ 42, 41, 41, 39, 38, 36, 34, 32, 30, 30, 43, 42, 42, 39, 35, 36, 37, 37,
+ 37, 36, 35, 33, 32, 30, 29, 29, 41, 43, 44, 40, 36, 35, 34, 34, 34, 33,
+ 32, 31, 30, 28, 27, 27, 40, 43, 47, 42, 37, 34, 32, 31, 31, 30, 29, 28,
+ 27, 26, 25, 25, 39, 42, 46, 41, 37, 34, 31, 30, 29, 28, 27, 26, 25, 25,
+ 24, 24, 37, 41, 45, 41, 37, 34, 31, 29, 27, 26, 25, 24, 24, 23, 22, 22,
+ 35, 39, 43, 39, 36, 33, 30, 28, 26, 25, 24, 23, 22, 22, 21, 21, 33, 37,
+ 41, 38, 35, 32, 29, 27, 25, 24, 23, 22, 21, 20, 20, 20, 31, 35, 38, 36,
+ 33, 31, 28, 26, 24, 23, 22, 21, 20, 19, 19, 19, 29, 32, 36, 34, 32, 30,
+ 27, 25, 24, 22, 21, 20, 19, 18, 18, 18, 27, 30, 34, 32, 30, 28, 26, 25,
+ 23, 22, 20, 19, 18, 18, 17, 17, 25, 28, 31, 30, 29, 27, 25, 24, 22, 21,
+ 20, 19, 18, 17, 17, 17, 25, 28, 31, 30, 29, 27, 25, 24, 22, 21, 20, 19,
+ 18, 17, 17, 17,
+ /* Size 32 */
+ 64, 68, 72, 76, 80, 71, 61, 52, 43, 42, 41, 41, 40, 39, 39, 38, 37, 36,
+ 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 25, 25, 25, 68, 69, 71, 72,
+ 74, 66, 58, 50, 43, 42, 42, 42, 42, 41, 41, 40, 39, 38, 37, 36, 35, 34,
+ 33, 32, 31, 30, 29, 28, 27, 27, 27, 27, 72, 71, 70, 68, 67, 61, 55, 49,
+ 42, 43, 43, 43, 43, 43, 42, 42, 41, 40, 39, 38, 37, 36, 35, 34, 32, 31,
+ 30, 29, 28, 28, 28, 28, 76, 72, 68, 65, 61, 56, 52, 47, 42, 43, 44, 44,
+ 45, 45, 44, 44, 43, 42, 41, 40, 39, 38, 37, 35, 34, 33, 32, 31, 30, 30,
+ 30, 30, 80, 74, 67, 61, 55, 51, 48, 45, 42, 43, 44, 45, 47, 46, 46, 46,
+ 45, 44, 43, 42, 41, 40, 38, 37, 36, 35, 34, 33, 31, 31, 31, 31, 71, 66,
+ 61, 56, 51, 49, 46, 43, 40, 41, 42, 43, 44, 44, 44, 43, 43, 42, 41, 40,
+ 39, 38, 37, 36, 35, 34, 33, 32, 31, 31, 31, 31, 61, 58, 55, 52, 48, 46,
+ 43, 41, 39, 39, 40, 41, 42, 42, 41, 41, 41, 40, 39, 39, 38, 37, 36, 35,
+ 34, 33, 32, 31, 30, 30, 30, 30, 52, 50, 49, 47, 45, 43, 41, 39, 37, 38,
+ 38, 39, 39, 39, 39, 39, 39, 38, 38, 37, 36, 36, 35, 34, 33, 32, 31, 30,
+ 29, 29, 29, 29, 43, 43, 42, 42, 42, 40, 39, 37, 35, 36, 36, 36, 37, 37,
+ 37, 37, 37, 36, 36, 35, 35, 34, 33, 33, 32, 31, 30, 29, 29, 29, 29, 29,
+ 42, 42, 43, 43, 43, 41, 39, 38, 36, 36, 36, 36, 36, 36, 35, 35, 35, 35,
+ 34, 34, 33, 33, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 41, 42, 43, 44,
+ 44, 42, 40, 38, 36, 36, 35, 35, 34, 34, 34, 34, 34, 33, 33, 32, 32, 31,
+ 31, 30, 30, 29, 28, 28, 27, 27, 27, 27, 41, 42, 43, 44, 45, 43, 41, 39,
+ 36, 36, 35, 34, 33, 33, 33, 32, 32, 32, 31, 31, 31, 30, 30, 29, 28, 28,
+ 27, 27, 26, 26, 26, 26, 40, 42, 43, 45, 47, 44, 42, 39, 37, 36, 34, 33,
+ 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 25, 25,
+ 25, 25, 39, 41, 43, 45, 46, 44, 42, 39, 37, 36, 34, 33, 32, 31, 31, 30,
+ 30, 29, 29, 29, 28, 28, 27, 27, 26, 26, 25, 25, 24, 24, 24, 24, 39, 41,
+ 42, 44, 46, 44, 41, 39, 37, 35, 34, 33, 31, 31, 30, 29, 29, 28, 28, 28,
+ 27, 27, 26, 26, 25, 25, 25, 24, 24, 24, 24, 24, 38, 40, 42, 44, 46, 43,
+ 41, 39, 37, 35, 34, 32, 31, 30, 29, 29, 28, 27, 27, 27, 26, 26, 25, 25,
+ 25, 24, 24, 23, 23, 23, 23, 23, 37, 39, 41, 43, 45, 43, 41, 39, 37, 35,
+ 34, 32, 31, 30, 29, 28, 27, 27, 26, 26, 25, 25, 24, 24, 24, 23, 23, 22,
+ 22, 22, 22, 22, 36, 38, 40, 42, 44, 42, 40, 38, 36, 35, 33, 32, 30, 29,
+ 28, 27, 27, 26, 26, 25, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21,
+ 35, 37, 39, 41, 43, 41, 39, 38, 36, 34, 33, 31, 30, 29, 28, 27, 26, 26,
+ 25, 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 34, 36, 38, 40,
+ 42, 40, 39, 37, 35, 34, 32, 31, 30, 29, 28, 27, 26, 25, 24, 24, 23, 23,
+ 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 33, 35, 37, 39, 41, 39, 38, 36,
+ 35, 33, 32, 31, 29, 28, 27, 26, 25, 24, 24, 23, 23, 22, 22, 21, 21, 21,
+ 20, 20, 20, 20, 20, 20, 32, 34, 36, 38, 40, 38, 37, 36, 34, 33, 31, 30,
+ 29, 28, 27, 26, 25, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19,
+ 19, 19, 31, 33, 35, 37, 38, 37, 36, 35, 33, 32, 31, 30, 28, 27, 26, 25,
+ 24, 24, 23, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 30, 32,
+ 34, 35, 37, 36, 35, 34, 33, 31, 30, 29, 28, 27, 26, 25, 24, 23, 23, 22,
+ 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 29, 31, 32, 34, 36, 35,
+ 34, 33, 32, 31, 30, 28, 27, 26, 25, 25, 24, 23, 22, 22, 21, 20, 20, 20,
+ 19, 19, 18, 18, 18, 18, 18, 18, 28, 30, 31, 33, 35, 34, 33, 32, 31, 30,
+ 29, 28, 27, 26, 25, 24, 23, 23, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18,
+ 18, 18, 18, 18, 27, 29, 30, 32, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25,
+ 25, 24, 23, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17,
+ 26, 28, 29, 31, 33, 32, 31, 30, 29, 28, 28, 27, 26, 25, 24, 23, 22, 22,
+ 21, 21, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 25, 27, 28, 30,
+ 31, 31, 30, 29, 29, 28, 27, 26, 25, 24, 24, 23, 22, 21, 21, 20, 20, 19,
+ 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 25, 27, 28, 30, 31, 31, 30, 29,
+ 29, 28, 27, 26, 25, 24, 24, 23, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18,
+ 17, 17, 17, 17, 17, 17, 25, 27, 28, 30, 31, 31, 30, 29, 29, 28, 27, 26,
+ 25, 24, 24, 23, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 17, 17, 25, 27, 28, 30, 31, 31, 30, 29, 29, 28, 27, 26, 25, 24, 24, 23,
+ 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 162, 98, 89, 68, 98, 80, 74, 63, 89, 74, 55, 47, 68, 63, 47, 38,
+ /* Size 8 */
+ 151, 190, 99, 93, 86, 76, 66, 57, 190, 128, 97, 108, 105, 94, 82, 71,
+ 99, 97, 81, 84, 85, 80, 72, 64, 93, 108, 84, 73, 69, 66, 61, 56, 86,
+ 105, 85, 69, 61, 56, 52, 49, 76, 94, 80, 66, 56, 50, 46, 43, 66, 82, 72,
+ 61, 52, 46, 41, 38, 57, 71, 64, 56, 49, 43, 38, 35,
+ /* Size 16 */
+ 156, 176, 196, 149, 102, 99, 96, 92, 89, 83, 78, 73, 68, 63, 58, 58,
+ 176, 170, 164, 133, 101, 103, 104, 101, 99, 93, 88, 82, 76, 71, 66, 66,
+ 196, 164, 132, 116, 100, 106, 112, 110, 108, 103, 97, 91, 85, 79, 73,
+ 73, 149, 133, 116, 104, 92, 96, 99, 99, 98, 94, 90, 85, 80, 75, 70, 70,
+ 102, 101, 100, 92, 84, 85, 87, 87, 87, 85, 82, 79, 75, 71, 67, 67, 99,
+ 103, 106, 96, 85, 83, 81, 80, 79, 77, 75, 72, 69, 66, 62, 62, 96, 104,
+ 112, 99, 87, 81, 76, 74, 71, 70, 68, 66, 63, 61, 58, 58, 92, 101, 110,
+ 99, 87, 80, 74, 70, 67, 65, 63, 61, 59, 56, 54, 54, 89, 99, 108, 98, 87,
+ 79, 71, 67, 63, 60, 58, 56, 54, 52, 50, 50, 83, 93, 103, 94, 85, 77, 70,
+ 65, 60, 57, 55, 53, 51, 49, 47, 47, 78, 88, 97, 90, 82, 75, 68, 63, 58,
+ 55, 51, 49, 47, 46, 44, 44, 73, 82, 91, 85, 79, 72, 66, 61, 56, 53, 49,
+ 47, 45, 43, 42, 42, 68, 76, 85, 80, 75, 69, 63, 59, 54, 51, 47, 45, 43,
+ 41, 39, 39, 63, 71, 79, 75, 71, 66, 61, 56, 52, 49, 46, 43, 41, 39, 38,
+ 38, 58, 66, 73, 70, 67, 62, 58, 54, 50, 47, 44, 42, 39, 38, 36, 36, 58,
+ 66, 73, 70, 67, 62, 58, 54, 50, 47, 44, 42, 39, 38, 36, 36,
+ /* Size 32 */
+ 158, 169, 179, 189, 199, 175, 152, 128, 104, 102, 101, 99, 97, 96, 94,
+ 92, 90, 88, 85, 82, 79, 77, 74, 71, 69, 66, 64, 62, 59, 59, 59, 59, 169,
+ 172, 176, 179, 183, 163, 143, 123, 103, 103, 102, 102, 101, 100, 98, 97,
+ 95, 93, 90, 87, 84, 82, 79, 76, 73, 71, 68, 66, 63, 63, 63, 63, 179,
+ 176, 173, 170, 167, 151, 135, 119, 103, 104, 104, 105, 106, 104, 103,
+ 102, 100, 97, 95, 92, 89, 86, 83, 81, 78, 75, 72, 70, 67, 67, 67, 67,
+ 189, 179, 170, 160, 150, 138, 126, 114, 102, 104, 106, 108, 110, 109,
+ 107, 106, 105, 102, 100, 97, 94, 91, 88, 85, 82, 79, 76, 74, 71, 71, 71,
+ 71, 199, 183, 167, 150, 134, 126, 118, 110, 102, 105, 108, 111, 114,
+ 113, 112, 111, 110, 107, 105, 102, 99, 96, 93, 90, 86, 84, 81, 78, 75,
+ 75, 75, 75, 175, 163, 151, 138, 126, 119, 112, 105, 98, 100, 103, 105,
+ 107, 107, 106, 106, 105, 102, 100, 98, 95, 92, 90, 87, 84, 81, 78, 76,
+ 73, 73, 73, 73, 152, 143, 135, 126, 118, 112, 106, 100, 94, 95, 97, 99,
+ 101, 101, 100, 100, 100, 98, 96, 94, 91, 89, 86, 84, 81, 79, 76, 74, 71,
+ 71, 71, 71, 128, 123, 119, 114, 110, 105, 100, 95, 89, 91, 92, 94, 95,
+ 95, 95, 94, 94, 93, 91, 89, 88, 85, 83, 81, 79, 76, 74, 72, 69, 69, 69,
+ 69, 104, 103, 103, 102, 102, 98, 94, 89, 85, 86, 87, 88, 89, 89, 89, 89,
+ 89, 88, 86, 85, 84, 82, 80, 78, 76, 74, 72, 70, 68, 68, 68, 68, 102,
+ 103, 104, 104, 105, 100, 95, 91, 86, 86, 86, 86, 86, 85, 85, 85, 85, 84,
+ 83, 81, 80, 78, 77, 75, 73, 71, 69, 67, 66, 66, 66, 66, 101, 102, 104,
+ 106, 108, 103, 97, 92, 87, 86, 85, 84, 83, 82, 82, 81, 81, 80, 79, 78,
+ 77, 75, 73, 72, 70, 69, 67, 65, 63, 63, 63, 63, 99, 102, 105, 108, 111,
+ 105, 99, 94, 88, 86, 84, 82, 80, 79, 78, 78, 77, 76, 75, 74, 73, 71, 70,
+ 69, 67, 66, 64, 63, 61, 61, 61, 61, 97, 101, 106, 110, 114, 107, 101,
+ 95, 89, 86, 83, 80, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 64,
+ 63, 62, 60, 59, 59, 59, 59, 96, 100, 104, 109, 113, 107, 101, 95, 89,
+ 85, 82, 79, 76, 74, 73, 72, 70, 69, 68, 68, 67, 65, 64, 63, 62, 61, 60,
+ 58, 57, 57, 57, 57, 94, 98, 103, 107, 112, 106, 100, 95, 89, 85, 82, 78,
+ 75, 73, 72, 70, 68, 67, 66, 65, 64, 63, 62, 61, 60, 58, 57, 56, 55, 55,
+ 55, 55, 92, 97, 102, 106, 111, 106, 100, 94, 89, 85, 81, 78, 74, 72, 70,
+ 68, 66, 65, 64, 63, 61, 60, 59, 58, 57, 56, 55, 54, 53, 53, 53, 53, 90,
+ 95, 100, 105, 110, 105, 100, 94, 89, 85, 81, 77, 73, 70, 68, 66, 64, 63,
+ 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 51, 51, 51, 88, 93, 97, 102,
+ 107, 102, 98, 93, 88, 84, 80, 76, 72, 69, 67, 65, 63, 61, 60, 59, 57,
+ 56, 55, 54, 53, 52, 51, 50, 49, 49, 49, 49, 85, 90, 95, 100, 105, 100,
+ 96, 91, 86, 83, 79, 75, 71, 68, 66, 64, 61, 60, 58, 57, 56, 55, 53, 52,
+ 51, 51, 50, 49, 48, 48, 48, 48, 82, 87, 92, 97, 102, 98, 94, 89, 85, 81,
+ 78, 74, 70, 68, 65, 63, 60, 59, 57, 55, 54, 53, 52, 51, 50, 49, 48, 47,
+ 46, 46, 46, 46, 79, 84, 89, 94, 99, 95, 91, 88, 84, 80, 77, 73, 69, 67,
+ 64, 61, 59, 57, 56, 54, 52, 51, 50, 49, 48, 47, 46, 46, 45, 45, 45, 45,
+ 77, 82, 86, 91, 96, 92, 89, 85, 82, 78, 75, 71, 68, 65, 63, 60, 58, 56,
+ 55, 53, 51, 50, 49, 48, 47, 46, 45, 44, 44, 44, 44, 44, 74, 79, 83, 88,
+ 93, 90, 86, 83, 80, 77, 73, 70, 67, 64, 62, 59, 57, 55, 53, 52, 50, 49,
+ 48, 47, 46, 45, 44, 43, 42, 42, 42, 42, 71, 76, 81, 85, 90, 87, 84, 81,
+ 78, 75, 72, 69, 66, 63, 61, 58, 56, 54, 52, 51, 49, 48, 47, 46, 45, 44,
+ 43, 42, 41, 41, 41, 41, 69, 73, 78, 82, 86, 84, 81, 79, 76, 73, 70, 67,
+ 64, 62, 60, 57, 55, 53, 51, 50, 48, 47, 46, 45, 43, 43, 42, 41, 40, 40,
+ 40, 40, 66, 71, 75, 79, 84, 81, 79, 76, 74, 71, 69, 66, 63, 61, 58, 56,
+ 54, 52, 51, 49, 47, 46, 45, 44, 43, 42, 41, 40, 39, 39, 39, 39, 64, 68,
+ 72, 76, 81, 78, 76, 74, 72, 69, 67, 64, 62, 60, 57, 55, 53, 51, 50, 48,
+ 46, 45, 44, 43, 42, 41, 40, 39, 38, 38, 38, 38, 62, 66, 70, 74, 78, 76,
+ 74, 72, 70, 67, 65, 63, 60, 58, 56, 54, 52, 50, 49, 47, 46, 44, 43, 42,
+ 41, 40, 39, 38, 38, 38, 38, 38, 59, 63, 67, 71, 75, 73, 71, 69, 68, 66,
+ 63, 61, 59, 57, 55, 53, 51, 49, 48, 46, 45, 44, 42, 41, 40, 39, 38, 38,
+ 37, 37, 37, 37, 59, 63, 67, 71, 75, 73, 71, 69, 68, 66, 63, 61, 59, 57,
+ 55, 53, 51, 49, 48, 46, 45, 44, 42, 41, 40, 39, 38, 38, 37, 37, 37, 37,
+ 59, 63, 67, 71, 75, 73, 71, 69, 68, 66, 63, 61, 59, 57, 55, 53, 51, 49,
+ 48, 46, 45, 44, 42, 41, 40, 39, 38, 38, 37, 37, 37, 37, 59, 63, 67, 71,
+ 75, 73, 71, 69, 68, 66, 63, 61, 59, 57, 55, 53, 51, 49, 48, 46, 45, 44,
+ 42, 41, 40, 39, 38, 38, 37, 37, 37, 37 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 58, 35, 23, 58, 39, 28, 21, 35, 28, 20, 16, 23, 21, 16, 14,
+ /* Size 8 */
+ 64, 85, 79, 60, 44, 34, 27, 23, 85, 74, 78, 66, 52, 40, 32, 26, 79, 78,
+ 55, 47, 40, 34, 29, 24, 60, 66, 47, 37, 32, 28, 25, 22, 44, 52, 40, 32,
+ 27, 24, 22, 20, 34, 40, 34, 28, 24, 21, 19, 18, 27, 32, 29, 25, 22, 19,
+ 18, 17, 23, 26, 24, 22, 20, 18, 17, 16,
+ /* Size 16 */
+ 64, 75, 85, 82, 79, 69, 60, 52, 44, 39, 34, 31, 27, 25, 23, 23, 75, 77,
+ 80, 79, 78, 71, 63, 55, 48, 43, 37, 33, 30, 27, 25, 25, 85, 80, 74, 76,
+ 78, 72, 66, 59, 52, 46, 40, 36, 32, 29, 26, 26, 82, 79, 76, 71, 66, 62,
+ 57, 51, 46, 42, 37, 34, 30, 28, 25, 25, 79, 78, 78, 66, 55, 51, 47, 44,
+ 40, 37, 34, 31, 29, 26, 24, 24, 69, 71, 72, 62, 51, 47, 42, 39, 36, 33,
+ 31, 29, 27, 25, 23, 23, 60, 63, 66, 57, 47, 42, 37, 34, 32, 30, 28, 26,
+ 25, 23, 22, 22, 52, 55, 59, 51, 44, 39, 34, 32, 29, 28, 26, 24, 23, 22,
+ 21, 21, 44, 48, 52, 46, 40, 36, 32, 29, 27, 25, 24, 23, 22, 21, 20, 20,
+ 39, 43, 46, 42, 37, 33, 30, 28, 25, 24, 22, 21, 20, 20, 19, 19, 34, 37,
+ 40, 37, 34, 31, 28, 26, 24, 22, 21, 20, 19, 19, 18, 18, 31, 33, 36, 34,
+ 31, 29, 26, 24, 23, 21, 20, 19, 19, 18, 17, 17, 27, 30, 32, 30, 29, 27,
+ 25, 23, 22, 20, 19, 19, 18, 17, 17, 17, 25, 27, 29, 28, 26, 25, 23, 22,
+ 21, 20, 19, 18, 17, 17, 16, 16, 23, 25, 26, 25, 24, 23, 22, 21, 20, 19,
+ 18, 17, 17, 16, 16, 16, 23, 25, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+ 17, 16, 16, 16,
+ /* Size 32 */
+ 64, 69, 75, 80, 85, 84, 82, 80, 79, 74, 69, 64, 60, 56, 52, 48, 44, 42,
+ 39, 37, 34, 32, 31, 29, 27, 26, 25, 24, 23, 23, 23, 23, 69, 73, 76, 79,
+ 83, 82, 81, 80, 78, 74, 70, 66, 61, 58, 54, 50, 46, 44, 41, 38, 36, 34,
+ 32, 30, 29, 27, 26, 25, 24, 24, 24, 24, 75, 76, 77, 79, 80, 79, 79, 79,
+ 78, 74, 71, 67, 63, 59, 55, 52, 48, 45, 43, 40, 37, 35, 33, 32, 30, 28,
+ 27, 26, 25, 25, 25, 25, 80, 79, 79, 78, 77, 77, 77, 78, 78, 75, 71, 68,
+ 65, 61, 57, 54, 50, 47, 44, 41, 39, 37, 35, 33, 31, 30, 28, 27, 26, 26,
+ 26, 26, 85, 83, 80, 77, 74, 75, 76, 77, 78, 75, 72, 69, 66, 63, 59, 55,
+ 52, 49, 46, 43, 40, 38, 36, 34, 32, 31, 29, 28, 26, 26, 26, 26, 84, 82,
+ 79, 77, 75, 74, 74, 73, 72, 69, 67, 64, 61, 58, 55, 52, 49, 46, 44, 41,
+ 39, 37, 35, 33, 31, 30, 29, 27, 26, 26, 26, 26, 82, 81, 79, 77, 76, 74,
+ 71, 69, 66, 64, 62, 59, 57, 54, 51, 49, 46, 44, 42, 39, 37, 35, 34, 32,
+ 30, 29, 28, 27, 25, 25, 25, 25, 80, 80, 79, 78, 77, 73, 69, 65, 61, 59,
+ 56, 54, 52, 50, 48, 45, 43, 41, 39, 37, 35, 34, 32, 31, 29, 28, 27, 26,
+ 25, 25, 25, 25, 79, 78, 78, 78, 78, 72, 66, 61, 55, 53, 51, 49, 47, 46,
+ 44, 42, 40, 39, 37, 36, 34, 33, 31, 30, 29, 27, 26, 25, 24, 24, 24, 24,
+ 74, 74, 74, 75, 75, 69, 64, 59, 53, 51, 49, 47, 45, 43, 41, 40, 38, 37,
+ 35, 34, 32, 31, 30, 29, 28, 27, 26, 25, 24, 24, 24, 24, 69, 70, 71, 71,
+ 72, 67, 62, 56, 51, 49, 47, 44, 42, 41, 39, 38, 36, 35, 33, 32, 31, 30,
+ 29, 28, 27, 26, 25, 24, 23, 23, 23, 23, 64, 66, 67, 68, 69, 64, 59, 54,
+ 49, 47, 44, 42, 40, 38, 37, 35, 34, 33, 32, 31, 29, 28, 28, 27, 26, 25,
+ 24, 23, 23, 23, 23, 23, 60, 61, 63, 65, 66, 61, 57, 52, 47, 45, 42, 40,
+ 37, 36, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 25, 24, 23, 23, 22, 22,
+ 22, 22, 56, 58, 59, 61, 63, 58, 54, 50, 46, 43, 41, 38, 36, 34, 33, 32,
+ 31, 30, 29, 28, 27, 26, 25, 25, 24, 23, 23, 22, 21, 21, 21, 21, 52, 54,
+ 55, 57, 59, 55, 51, 48, 44, 41, 39, 37, 34, 33, 32, 31, 29, 28, 28, 27,
+ 26, 25, 24, 24, 23, 23, 22, 21, 21, 21, 21, 21, 48, 50, 52, 54, 55, 52,
+ 49, 45, 42, 40, 38, 35, 33, 32, 31, 29, 28, 27, 26, 26, 25, 24, 24, 23,
+ 22, 22, 21, 21, 20, 20, 20, 20, 44, 46, 48, 50, 52, 49, 46, 43, 40, 38,
+ 36, 34, 32, 31, 29, 28, 27, 26, 25, 25, 24, 23, 23, 22, 22, 21, 21, 20,
+ 20, 20, 20, 20, 42, 44, 45, 47, 49, 46, 44, 41, 39, 37, 35, 33, 31, 30,
+ 28, 27, 26, 25, 25, 24, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19,
+ 39, 41, 43, 44, 46, 44, 42, 39, 37, 35, 33, 32, 30, 29, 28, 26, 25, 25,
+ 24, 23, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 37, 38, 40, 41,
+ 43, 41, 39, 37, 36, 34, 32, 31, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21,
+ 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 34, 36, 37, 39, 40, 39, 37, 35,
+ 34, 32, 31, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 21, 20, 20, 19, 19,
+ 19, 18, 18, 18, 18, 18, 32, 34, 35, 37, 38, 37, 35, 34, 33, 31, 30, 28,
+ 27, 26, 25, 24, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 18,
+ 18, 18, 31, 32, 33, 35, 36, 35, 34, 32, 31, 30, 29, 28, 26, 25, 24, 24,
+ 23, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 29, 30,
+ 32, 33, 34, 33, 32, 31, 30, 29, 28, 27, 25, 25, 24, 23, 22, 22, 21, 20,
+ 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 27, 29, 30, 31, 32, 31,
+ 30, 29, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 20, 19, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 17, 17, 26, 27, 28, 30, 31, 30, 29, 28, 27, 27,
+ 26, 25, 24, 23, 23, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17,
+ 17, 17, 17, 17, 25, 26, 27, 28, 29, 29, 28, 27, 26, 26, 25, 24, 23, 23,
+ 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16,
+ 24, 25, 26, 27, 28, 27, 27, 26, 25, 25, 24, 23, 23, 22, 21, 21, 20, 20,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 23, 24, 25, 26,
+ 26, 26, 25, 25, 24, 24, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18,
+ 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 23, 24, 25, 26, 26, 26, 25, 25,
+ 24, 24, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17,
+ 16, 16, 16, 16, 16, 16, 23, 24, 25, 26, 26, 26, 25, 25, 24, 24, 23, 23,
+ 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16,
+ 16, 16, 23, 24, 25, 26, 26, 26, 25, 25, 24, 24, 23, 23, 22, 21, 21, 20,
+ 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 192, 174, 103, 63, 174, 113, 79, 57, 103, 79, 55, 44, 63, 57, 44, 37,
+ /* Size 8 */
+ 160, 216, 199, 149, 109, 82, 65, 53, 216, 187, 195, 166, 128, 98, 77,
+ 62, 199, 195, 137, 116, 98, 81, 67, 57, 149, 166, 116, 90, 76, 66, 57,
+ 50, 109, 128, 98, 76, 63, 55, 49, 45, 82, 98, 81, 66, 55, 48, 44, 40,
+ 65, 77, 67, 57, 49, 44, 40, 37, 53, 62, 57, 50, 45, 40, 37, 35,
+ /* Size 16 */
+ 167, 195, 224, 216, 207, 181, 155, 134, 113, 99, 85, 76, 67, 61, 55, 55,
+ 195, 202, 209, 207, 205, 184, 164, 143, 123, 108, 94, 84, 74, 67, 60,
+ 60, 224, 209, 194, 199, 203, 188, 172, 153, 133, 117, 102, 91, 80, 72,
+ 65, 65, 216, 207, 199, 186, 173, 160, 147, 132, 118, 106, 93, 84, 75,
+ 68, 62, 62, 207, 205, 203, 173, 143, 132, 121, 112, 102, 94, 85, 78, 70,
+ 65, 59, 59, 181, 184, 188, 160, 132, 120, 107, 99, 91, 84, 77, 71, 65,
+ 60, 56, 56, 155, 164, 172, 147, 121, 107, 94, 86, 79, 74, 69, 64, 60,
+ 56, 53, 53, 134, 143, 153, 132, 112, 99, 86, 79, 72, 68, 63, 59, 56, 53,
+ 50, 50, 113, 123, 133, 118, 102, 91, 79, 72, 66, 62, 57, 54, 51, 49, 47,
+ 47, 99, 108, 117, 106, 94, 84, 74, 68, 62, 58, 54, 51, 48, 46, 44, 44,
+ 85, 94, 102, 93, 85, 77, 69, 63, 57, 54, 50, 48, 46, 44, 42, 42, 76, 84,
+ 91, 84, 78, 71, 64, 59, 54, 51, 48, 46, 44, 42, 40, 40, 67, 74, 80, 75,
+ 70, 65, 60, 56, 51, 48, 46, 44, 42, 40, 39, 39, 61, 67, 72, 68, 65, 60,
+ 56, 53, 49, 46, 44, 42, 40, 39, 38, 38, 55, 60, 65, 62, 59, 56, 53, 50,
+ 47, 44, 42, 40, 39, 38, 36, 36, 55, 60, 65, 62, 59, 56, 53, 50, 47, 44,
+ 42, 40, 39, 38, 36, 36,
+ /* Size 32 */
+ 170, 185, 200, 214, 229, 225, 220, 215, 211, 198, 185, 171, 158, 147,
+ 137, 126, 116, 109, 101, 94, 87, 83, 78, 73, 69, 66, 62, 59, 56, 56, 56,
+ 56, 185, 194, 203, 212, 221, 219, 216, 213, 210, 198, 186, 174, 163,
+ 152, 142, 131, 121, 113, 106, 99, 91, 87, 82, 77, 72, 69, 65, 62, 59,
+ 59, 59, 59, 200, 203, 207, 210, 214, 213, 212, 210, 209, 199, 188, 178,
+ 167, 157, 146, 136, 126, 118, 111, 103, 96, 90, 85, 80, 75, 72, 68, 65,
+ 61, 61, 61, 61, 214, 212, 210, 208, 206, 207, 207, 208, 209, 199, 190,
+ 181, 172, 161, 151, 141, 131, 123, 115, 108, 100, 94, 89, 84, 78, 75,
+ 71, 67, 64, 64, 64, 64, 229, 221, 214, 206, 199, 201, 203, 205, 208,
+ 200, 192, 184, 176, 166, 156, 146, 136, 128, 120, 112, 104, 98, 93, 87,
+ 81, 78, 74, 70, 66, 66, 66, 66, 225, 219, 213, 207, 201, 199, 197, 194,
+ 192, 185, 178, 170, 163, 154, 145, 137, 128, 121, 114, 107, 100, 95, 89,
+ 84, 79, 75, 72, 68, 65, 65, 65, 65, 220, 216, 212, 207, 203, 197, 190,
+ 183, 177, 170, 163, 157, 150, 142, 135, 128, 120, 114, 108, 102, 95, 91,
+ 86, 81, 77, 73, 70, 67, 63, 63, 63, 63, 215, 213, 210, 208, 205, 194,
+ 183, 172, 161, 155, 149, 143, 137, 131, 125, 118, 112, 107, 102, 96, 91,
+ 87, 83, 78, 74, 71, 68, 65, 62, 62, 62, 62, 211, 210, 209, 209, 208,
+ 192, 177, 161, 146, 140, 135, 129, 124, 119, 114, 109, 105, 100, 96, 91,
+ 87, 83, 79, 75, 72, 69, 66, 63, 60, 60, 60, 60, 198, 198, 199, 199, 200,
+ 185, 170, 155, 140, 134, 128, 123, 117, 112, 108, 103, 99, 95, 91, 87,
+ 82, 79, 76, 72, 69, 66, 64, 61, 59, 59, 59, 59, 185, 186, 188, 190, 192,
+ 178, 163, 149, 135, 128, 122, 116, 110, 105, 101, 97, 93, 89, 86, 82,
+ 78, 75, 72, 69, 66, 64, 62, 59, 57, 57, 57, 57, 171, 174, 178, 181, 184,
+ 170, 157, 143, 129, 123, 116, 109, 103, 99, 95, 91, 87, 84, 81, 77, 74,
+ 72, 69, 66, 64, 62, 60, 57, 55, 55, 55, 55, 158, 163, 167, 172, 176,
+ 163, 150, 137, 124, 117, 110, 103, 96, 92, 88, 85, 81, 78, 75, 73, 70,
+ 68, 66, 63, 61, 59, 57, 55, 54, 54, 54, 54, 147, 152, 157, 161, 166,
+ 154, 142, 131, 119, 112, 105, 99, 92, 88, 85, 81, 77, 75, 72, 70, 67,
+ 65, 63, 61, 59, 57, 55, 54, 52, 52, 52, 52, 137, 142, 146, 151, 156,
+ 145, 135, 125, 114, 108, 101, 95, 88, 85, 81, 78, 74, 72, 69, 67, 64,
+ 62, 61, 59, 57, 55, 54, 52, 51, 51, 51, 51, 126, 131, 136, 141, 146,
+ 137, 128, 118, 109, 103, 97, 91, 85, 81, 78, 74, 71, 68, 66, 64, 61, 60,
+ 58, 56, 55, 53, 52, 50, 49, 49, 49, 49, 116, 121, 126, 131, 136, 128,
+ 120, 112, 105, 99, 93, 87, 81, 77, 74, 71, 67, 65, 63, 61, 59, 57, 56,
+ 54, 52, 51, 50, 49, 48, 48, 48, 48, 109, 113, 118, 123, 128, 121, 114,
+ 107, 100, 95, 89, 84, 78, 75, 72, 68, 65, 63, 61, 59, 57, 55, 54, 52,
+ 51, 50, 49, 48, 46, 46, 46, 46, 101, 106, 111, 115, 120, 114, 108, 102,
+ 96, 91, 86, 81, 75, 72, 69, 66, 63, 61, 59, 57, 55, 54, 52, 51, 49, 48,
+ 47, 46, 45, 45, 45, 45, 94, 99, 103, 108, 112, 107, 102, 96, 91, 87, 82,
+ 77, 73, 70, 67, 64, 61, 59, 57, 55, 53, 52, 51, 49, 48, 47, 46, 45, 44,
+ 44, 44, 44, 87, 91, 96, 100, 104, 100, 95, 91, 87, 82, 78, 74, 70, 67,
+ 64, 61, 59, 57, 55, 53, 51, 50, 49, 48, 47, 46, 45, 44, 43, 43, 43, 43,
+ 83, 87, 90, 94, 98, 95, 91, 87, 83, 79, 75, 72, 68, 65, 62, 60, 57, 55,
+ 54, 52, 50, 49, 48, 47, 45, 45, 44, 43, 42, 42, 42, 42, 78, 82, 85, 89,
+ 93, 89, 86, 83, 79, 76, 72, 69, 66, 63, 61, 58, 56, 54, 52, 51, 49, 48,
+ 47, 46, 44, 44, 43, 42, 41, 41, 41, 41, 73, 77, 80, 84, 87, 84, 81, 78,
+ 75, 72, 69, 66, 63, 61, 59, 56, 54, 52, 51, 49, 48, 47, 46, 45, 43, 43,
+ 42, 41, 40, 40, 40, 40, 69, 72, 75, 78, 81, 79, 77, 74, 72, 69, 66, 64,
+ 61, 59, 57, 55, 52, 51, 49, 48, 47, 45, 44, 43, 42, 42, 41, 40, 40, 40,
+ 40, 40, 66, 69, 72, 75, 78, 75, 73, 71, 69, 66, 64, 62, 59, 57, 55, 53,
+ 51, 50, 48, 47, 46, 45, 44, 43, 42, 41, 40, 40, 39, 39, 39, 39, 62, 65,
+ 68, 71, 74, 72, 70, 68, 66, 64, 62, 60, 57, 55, 54, 52, 50, 49, 47, 46,
+ 45, 44, 43, 42, 41, 40, 40, 39, 38, 38, 38, 38, 59, 62, 65, 67, 70, 68,
+ 67, 65, 63, 61, 59, 57, 55, 54, 52, 50, 49, 48, 46, 45, 44, 43, 42, 41,
+ 40, 40, 39, 38, 38, 38, 38, 38, 56, 59, 61, 64, 66, 65, 63, 62, 60, 59,
+ 57, 55, 54, 52, 51, 49, 48, 46, 45, 44, 43, 42, 41, 40, 40, 39, 38, 38,
+ 37, 37, 37, 37, 56, 59, 61, 64, 66, 65, 63, 62, 60, 59, 57, 55, 54, 52,
+ 51, 49, 48, 46, 45, 44, 43, 42, 41, 40, 40, 39, 38, 38, 37, 37, 37, 37,
+ 56, 59, 61, 64, 66, 65, 63, 62, 60, 59, 57, 55, 54, 52, 51, 49, 48, 46,
+ 45, 44, 43, 42, 41, 40, 40, 39, 38, 38, 37, 37, 37, 37, 56, 59, 61, 64,
+ 66, 65, 63, 62, 60, 59, 57, 55, 54, 52, 51, 49, 48, 46, 45, 44, 43, 42,
+ 41, 40, 40, 39, 38, 38, 37, 37, 37, 37 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 40, 37, 29, 40, 34, 32, 27, 37, 32, 25, 22, 29, 27, 22, 18,
+ /* Size 8 */
+ 64, 79, 44, 41, 39, 35, 31, 27, 79, 55, 43, 47, 46, 42, 37, 33, 44, 43,
+ 37, 38, 38, 36, 33, 30, 41, 47, 38, 34, 32, 31, 29, 27, 39, 46, 38, 32,
+ 29, 27, 25, 24, 35, 42, 36, 31, 27, 24, 23, 21, 31, 37, 33, 29, 25, 23,
+ 21, 20, 27, 33, 30, 27, 24, 21, 20, 19,
+ /* Size 16 */
+ 64, 72, 79, 62, 44, 42, 41, 40, 39, 37, 35, 33, 31, 29, 27, 27, 72, 69,
+ 67, 55, 43, 44, 44, 43, 42, 40, 38, 36, 34, 32, 30, 30, 79, 67, 55, 49,
+ 43, 45, 47, 47, 46, 44, 42, 39, 37, 35, 33, 33, 62, 55, 49, 44, 40, 41,
+ 43, 42, 42, 41, 39, 37, 35, 33, 31, 31, 44, 43, 43, 40, 37, 37, 38, 38,
+ 38, 37, 36, 35, 33, 32, 30, 30, 42, 44, 45, 41, 37, 37, 36, 35, 35, 34,
+ 33, 32, 31, 30, 28, 28, 41, 44, 47, 43, 38, 36, 34, 33, 32, 31, 31, 30,
+ 29, 28, 27, 27, 40, 43, 47, 42, 38, 35, 33, 32, 30, 29, 29, 28, 27, 26,
+ 25, 25, 39, 42, 46, 42, 38, 35, 32, 30, 29, 28, 27, 26, 25, 25, 24, 24,
+ 37, 40, 44, 41, 37, 34, 31, 29, 28, 27, 26, 25, 24, 23, 23, 23, 35, 38,
+ 42, 39, 36, 33, 31, 29, 27, 26, 24, 24, 23, 22, 21, 21, 33, 36, 39, 37,
+ 35, 32, 30, 28, 26, 25, 24, 23, 22, 21, 21, 21, 31, 34, 37, 35, 33, 31,
+ 29, 27, 25, 24, 23, 22, 21, 20, 20, 20, 29, 32, 35, 33, 32, 30, 28, 26,
+ 25, 23, 22, 21, 20, 20, 19, 19, 27, 30, 33, 31, 30, 28, 27, 25, 24, 23,
+ 21, 21, 20, 19, 19, 19, 27, 30, 33, 31, 30, 28, 27, 25, 24, 23, 21, 21,
+ 20, 19, 19, 19,
+ /* Size 32 */
+ 64, 68, 72, 75, 79, 70, 62, 53, 44, 43, 42, 42, 41, 41, 40, 39, 39, 38,
+ 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 27, 27, 27, 68, 69, 71, 72,
+ 73, 66, 58, 51, 43, 43, 43, 43, 43, 42, 42, 41, 40, 39, 38, 37, 36, 35,
+ 34, 33, 32, 31, 30, 29, 28, 28, 28, 28, 72, 71, 69, 68, 67, 61, 55, 49,
+ 43, 44, 44, 44, 44, 44, 43, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+ 32, 31, 30, 30, 30, 30, 75, 72, 68, 65, 61, 57, 52, 48, 43, 44, 44, 45,
+ 46, 45, 45, 45, 44, 43, 42, 41, 40, 39, 38, 37, 35, 34, 33, 32, 31, 31,
+ 31, 31, 79, 73, 67, 61, 55, 52, 49, 46, 43, 44, 45, 46, 47, 47, 47, 46,
+ 46, 45, 44, 43, 42, 41, 39, 38, 37, 36, 35, 34, 33, 33, 33, 33, 70, 66,
+ 61, 57, 52, 49, 47, 44, 41, 42, 43, 44, 45, 45, 44, 44, 44, 43, 42, 41,
+ 40, 39, 38, 37, 36, 35, 34, 33, 32, 32, 32, 32, 62, 58, 55, 52, 49, 47,
+ 44, 42, 40, 40, 41, 42, 43, 42, 42, 42, 42, 41, 41, 40, 39, 38, 37, 36,
+ 35, 34, 33, 32, 31, 31, 31, 31, 53, 51, 49, 48, 46, 44, 42, 40, 38, 39,
+ 39, 40, 40, 40, 40, 40, 40, 39, 39, 38, 38, 37, 36, 35, 34, 33, 32, 32,
+ 31, 31, 31, 31, 44, 43, 43, 43, 43, 41, 40, 38, 37, 37, 37, 38, 38, 38,
+ 38, 38, 38, 38, 37, 37, 36, 35, 35, 34, 33, 32, 32, 31, 30, 30, 30, 30,
+ 43, 43, 44, 44, 44, 42, 40, 39, 37, 37, 37, 37, 37, 37, 37, 37, 37, 36,
+ 36, 35, 35, 34, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 42, 43, 44, 44,
+ 45, 43, 41, 39, 37, 37, 37, 36, 36, 36, 35, 35, 35, 35, 34, 34, 33, 33,
+ 32, 32, 31, 30, 30, 29, 28, 28, 28, 28, 42, 43, 44, 45, 46, 44, 42, 40,
+ 38, 37, 36, 35, 35, 34, 34, 34, 33, 33, 33, 32, 32, 31, 31, 30, 30, 29,
+ 29, 28, 28, 28, 28, 28, 41, 43, 44, 46, 47, 45, 43, 40, 38, 37, 36, 35,
+ 34, 33, 33, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 27,
+ 27, 27, 41, 42, 44, 45, 47, 45, 42, 40, 38, 37, 36, 34, 33, 33, 32, 32,
+ 31, 31, 30, 30, 30, 29, 29, 28, 28, 27, 27, 27, 26, 26, 26, 26, 40, 42,
+ 43, 45, 47, 44, 42, 40, 38, 37, 35, 34, 33, 32, 32, 31, 30, 30, 29, 29,
+ 29, 28, 28, 27, 27, 27, 26, 26, 25, 25, 25, 25, 39, 41, 43, 45, 46, 44,
+ 42, 40, 38, 37, 35, 34, 32, 32, 31, 30, 29, 29, 29, 28, 28, 27, 27, 27,
+ 26, 26, 25, 25, 25, 25, 25, 25, 39, 40, 42, 44, 46, 44, 42, 40, 38, 37,
+ 35, 33, 32, 31, 30, 29, 29, 28, 28, 27, 27, 26, 26, 26, 25, 25, 25, 24,
+ 24, 24, 24, 24, 38, 39, 41, 43, 45, 43, 41, 39, 38, 36, 35, 33, 32, 31,
+ 30, 29, 28, 28, 27, 27, 26, 26, 25, 25, 25, 24, 24, 24, 23, 23, 23, 23,
+ 37, 38, 40, 42, 44, 42, 41, 39, 37, 36, 34, 33, 31, 30, 29, 29, 28, 27,
+ 27, 26, 26, 25, 25, 24, 24, 24, 23, 23, 23, 23, 23, 23, 36, 37, 39, 41,
+ 43, 41, 40, 38, 37, 35, 34, 32, 31, 30, 29, 28, 27, 27, 26, 26, 25, 25,
+ 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 35, 36, 38, 40, 42, 40, 39, 38,
+ 36, 35, 33, 32, 31, 30, 29, 28, 27, 26, 26, 25, 24, 24, 24, 23, 23, 22,
+ 22, 22, 21, 21, 21, 21, 34, 35, 37, 39, 41, 39, 38, 37, 35, 34, 33, 31,
+ 30, 29, 28, 27, 26, 26, 25, 25, 24, 24, 23, 23, 22, 22, 22, 21, 21, 21,
+ 21, 21, 33, 34, 36, 38, 39, 38, 37, 36, 35, 33, 32, 31, 30, 29, 28, 27,
+ 26, 25, 25, 24, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 32, 33,
+ 35, 37, 38, 37, 36, 35, 34, 33, 32, 30, 29, 28, 27, 27, 26, 25, 24, 24,
+ 23, 23, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 31, 32, 34, 35, 37, 36,
+ 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 25, 24, 23, 23, 22, 22, 21,
+ 21, 21, 20, 20, 20, 20, 20, 20, 30, 31, 33, 34, 36, 35, 34, 33, 32, 31,
+ 30, 29, 28, 27, 27, 26, 25, 24, 24, 23, 22, 22, 22, 21, 21, 20, 20, 20,
+ 19, 19, 19, 19, 29, 30, 32, 33, 35, 34, 33, 32, 32, 31, 30, 29, 28, 27,
+ 26, 25, 25, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19,
+ 28, 29, 31, 32, 34, 33, 32, 32, 31, 30, 29, 28, 27, 27, 26, 25, 24, 24,
+ 23, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 27, 28, 30, 31,
+ 33, 32, 31, 31, 30, 29, 28, 28, 27, 26, 25, 25, 24, 23, 23, 22, 21, 21,
+ 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 27, 28, 30, 31, 33, 32, 31, 31,
+ 30, 29, 28, 28, 27, 26, 25, 25, 24, 23, 23, 22, 21, 21, 21, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 27, 28, 30, 31, 33, 32, 31, 31, 30, 29, 28, 28,
+ 27, 26, 25, 25, 24, 23, 23, 22, 21, 21, 21, 20, 20, 19, 19, 19, 19, 19,
+ 19, 19, 27, 28, 30, 31, 33, 32, 31, 31, 30, 29, 28, 28, 27, 26, 25, 25,
+ 24, 23, 23, 22, 21, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 152, 94, 86, 67, 94, 77, 72, 62, 86, 72, 55, 48, 67, 62, 48, 40,
+ /* Size 8 */
+ 141, 176, 95, 89, 83, 74, 65, 57, 176, 120, 93, 103, 100, 90, 80, 70,
+ 95, 93, 79, 81, 82, 77, 71, 64, 89, 103, 81, 71, 68, 65, 61, 56, 83,
+ 100, 82, 68, 60, 56, 53, 49, 74, 90, 77, 65, 56, 50, 47, 44, 65, 80, 71,
+ 61, 53, 47, 43, 40, 57, 70, 64, 56, 49, 44, 40, 37,
+ /* Size 16 */
+ 145, 163, 181, 139, 97, 94, 91, 88, 85, 81, 76, 71, 66, 62, 58, 58, 163,
+ 158, 152, 124, 96, 98, 99, 96, 94, 89, 84, 79, 74, 70, 65, 65, 181, 152,
+ 124, 110, 95, 101, 106, 104, 103, 98, 93, 87, 82, 77, 72, 72, 139, 124,
+ 110, 99, 88, 91, 95, 94, 93, 90, 86, 82, 77, 73, 69, 69, 97, 96, 95, 88,
+ 81, 82, 84, 84, 84, 82, 80, 76, 73, 69, 65, 65, 94, 98, 101, 91, 82, 80,
+ 79, 78, 77, 75, 73, 70, 68, 65, 62, 62, 91, 99, 106, 95, 84, 79, 74, 72,
+ 70, 68, 67, 65, 63, 60, 58, 58, 88, 96, 104, 94, 84, 78, 72, 69, 66, 64,
+ 62, 60, 58, 56, 54, 54, 85, 94, 103, 93, 84, 77, 70, 66, 62, 60, 58, 56,
+ 54, 52, 51, 51, 81, 89, 98, 90, 82, 75, 68, 64, 60, 57, 55, 53, 51, 50,
+ 48, 48, 76, 84, 93, 86, 80, 73, 67, 62, 58, 55, 52, 50, 48, 47, 45, 45,
+ 71, 79, 87, 82, 76, 70, 65, 60, 56, 53, 50, 48, 46, 45, 43, 43, 66, 74,
+ 82, 77, 73, 68, 63, 58, 54, 51, 48, 46, 44, 43, 41, 41, 62, 70, 77, 73,
+ 69, 65, 60, 56, 52, 50, 47, 45, 43, 41, 40, 40, 58, 65, 72, 69, 65, 62,
+ 58, 54, 51, 48, 45, 43, 41, 40, 38, 38, 58, 65, 72, 69, 65, 62, 58, 54,
+ 51, 48, 45, 43, 41, 40, 38, 38,
+ /* Size 32 */
+ 147, 156, 165, 175, 184, 163, 141, 120, 99, 97, 96, 94, 93, 91, 90, 88,
+ 87, 84, 82, 79, 77, 75, 72, 70, 67, 65, 63, 61, 59, 59, 59, 59, 156,
+ 160, 163, 166, 169, 151, 134, 116, 98, 98, 97, 97, 97, 95, 94, 92, 91,
+ 89, 86, 84, 81, 79, 76, 74, 71, 69, 67, 65, 62, 62, 62, 62, 165, 163,
+ 160, 157, 155, 140, 126, 112, 98, 98, 99, 100, 100, 99, 98, 97, 95, 93,
+ 91, 88, 86, 83, 80, 78, 75, 73, 71, 68, 66, 66, 66, 66, 175, 166, 157,
+ 149, 140, 129, 119, 108, 97, 99, 101, 102, 104, 103, 102, 101, 100, 97,
+ 95, 93, 90, 87, 85, 82, 79, 77, 74, 72, 69, 69, 69, 69, 184, 169, 155,
+ 140, 126, 118, 111, 104, 97, 100, 102, 105, 108, 107, 106, 105, 104,
+ 102, 99, 97, 94, 92, 89, 86, 83, 81, 78, 75, 73, 73, 73, 73, 163, 151,
+ 140, 129, 118, 112, 106, 99, 93, 95, 98, 100, 102, 101, 101, 100, 100,
+ 97, 95, 93, 91, 88, 86, 83, 81, 78, 76, 74, 71, 71, 71, 71, 141, 134,
+ 126, 119, 111, 106, 100, 95, 89, 91, 93, 95, 96, 96, 96, 95, 95, 93, 91,
+ 89, 88, 85, 83, 81, 79, 76, 74, 72, 70, 70, 70, 70, 120, 116, 112, 108,
+ 104, 99, 95, 90, 86, 87, 88, 89, 91, 91, 90, 90, 90, 89, 87, 86, 84, 82,
+ 80, 78, 76, 74, 72, 70, 68, 68, 68, 68, 99, 98, 98, 97, 97, 93, 89, 86,
+ 82, 83, 84, 84, 85, 85, 85, 85, 85, 84, 83, 82, 81, 79, 77, 76, 74, 72,
+ 70, 68, 66, 66, 66, 66, 97, 98, 98, 99, 100, 95, 91, 87, 83, 83, 83, 83,
+ 82, 82, 82, 82, 82, 81, 80, 79, 78, 76, 74, 73, 71, 70, 68, 66, 64, 64,
+ 64, 64, 96, 97, 99, 101, 102, 98, 93, 88, 84, 83, 82, 81, 80, 79, 79,
+ 79, 78, 77, 76, 75, 74, 73, 71, 70, 69, 67, 66, 64, 63, 63, 63, 63, 94,
+ 97, 100, 102, 105, 100, 95, 89, 84, 83, 81, 79, 77, 77, 76, 75, 75, 74,
+ 73, 72, 71, 70, 69, 67, 66, 65, 63, 62, 61, 61, 61, 61, 93, 97, 100,
+ 104, 108, 102, 96, 91, 85, 82, 80, 77, 75, 74, 73, 72, 71, 70, 69, 68,
+ 68, 67, 66, 65, 64, 62, 61, 60, 59, 59, 59, 59, 91, 95, 99, 103, 107,
+ 101, 96, 91, 85, 82, 79, 77, 74, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63,
+ 62, 61, 60, 59, 58, 57, 57, 57, 57, 90, 94, 98, 102, 106, 101, 96, 90,
+ 85, 82, 79, 76, 73, 71, 70, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58,
+ 57, 56, 55, 55, 55, 55, 88, 92, 97, 101, 105, 100, 95, 90, 85, 82, 79,
+ 75, 72, 70, 68, 67, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+ 53, 53, 53, 87, 91, 95, 100, 104, 100, 95, 90, 85, 82, 78, 75, 71, 69,
+ 67, 65, 63, 62, 61, 60, 58, 58, 57, 56, 55, 54, 53, 52, 52, 52, 52, 52,
+ 84, 89, 93, 97, 102, 97, 93, 89, 84, 81, 77, 74, 70, 68, 66, 64, 62, 61,
+ 59, 58, 57, 56, 55, 54, 53, 53, 52, 51, 50, 50, 50, 50, 82, 86, 91, 95,
+ 99, 95, 91, 87, 83, 80, 76, 73, 69, 67, 65, 63, 61, 59, 58, 57, 56, 55,
+ 54, 53, 52, 51, 50, 50, 49, 49, 49, 49, 79, 84, 88, 93, 97, 93, 89, 86,
+ 82, 79, 75, 72, 68, 66, 64, 62, 60, 58, 57, 56, 54, 53, 52, 51, 50, 50,
+ 49, 48, 47, 47, 47, 47, 77, 81, 86, 90, 94, 91, 88, 84, 81, 78, 74, 71,
+ 68, 65, 63, 61, 58, 57, 56, 54, 53, 52, 51, 50, 49, 48, 47, 47, 46, 46,
+ 46, 46, 75, 79, 83, 87, 92, 88, 85, 82, 79, 76, 73, 70, 67, 64, 62, 60,
+ 58, 56, 55, 53, 52, 51, 50, 49, 48, 47, 46, 46, 45, 45, 45, 45, 72, 76,
+ 80, 85, 89, 86, 83, 80, 77, 74, 71, 69, 66, 63, 61, 59, 57, 55, 54, 52,
+ 51, 50, 49, 48, 47, 46, 45, 45, 44, 44, 44, 44, 70, 74, 78, 82, 86, 83,
+ 81, 78, 76, 73, 70, 67, 65, 62, 60, 58, 56, 54, 53, 51, 50, 49, 48, 47,
+ 46, 45, 44, 44, 43, 43, 43, 43, 67, 71, 75, 79, 83, 81, 79, 76, 74, 71,
+ 69, 66, 64, 61, 59, 57, 55, 53, 52, 50, 49, 48, 47, 46, 45, 44, 43, 43,
+ 42, 42, 42, 42, 65, 69, 73, 77, 81, 78, 76, 74, 72, 70, 67, 65, 62, 60,
+ 58, 56, 54, 53, 51, 50, 48, 47, 46, 45, 44, 43, 43, 42, 41, 41, 41, 41,
+ 63, 67, 71, 74, 78, 76, 74, 72, 70, 68, 66, 63, 61, 59, 57, 55, 53, 52,
+ 50, 49, 47, 46, 45, 44, 43, 43, 42, 41, 40, 40, 40, 40, 61, 65, 68, 72,
+ 75, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56, 54, 52, 51, 50, 48, 47, 46,
+ 45, 44, 43, 42, 41, 40, 40, 40, 40, 40, 59, 62, 66, 69, 73, 71, 70, 68,
+ 66, 64, 63, 61, 59, 57, 55, 53, 52, 50, 49, 47, 46, 45, 44, 43, 42, 41,
+ 40, 40, 39, 39, 39, 39, 59, 62, 66, 69, 73, 71, 70, 68, 66, 64, 63, 61,
+ 59, 57, 55, 53, 52, 50, 49, 47, 46, 45, 44, 43, 42, 41, 40, 40, 39, 39,
+ 39, 39, 59, 62, 66, 69, 73, 71, 70, 68, 66, 64, 63, 61, 59, 57, 55, 53,
+ 52, 50, 49, 47, 46, 45, 44, 43, 42, 41, 40, 40, 39, 39, 39, 39, 59, 62,
+ 66, 69, 73, 71, 70, 68, 66, 64, 63, 61, 59, 57, 55, 53, 52, 50, 49, 47,
+ 46, 45, 44, 43, 42, 41, 40, 40, 39, 39, 39, 39 } } },
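The nesting that repeats through these tables is visible in the comments: each quantizer-matrix entry holds a luma pair and a chroma pair, each pair an inter and an intra matrix, and each matrix lists its 4x4, 8x8, 16x16 and 32x32 surfaces back to back as flattened row-major arrays (16, 64, 256 and 1024 weights respectively, all of which fit in 8 bits). A minimal C sketch of that layout and of a per-coefficient lookup; the type and function names (qm_matrix_t, qm_entry_t, qm_weight_at) are illustrative and do not come from the imported source:

    #include <stdint.h>

    /* One matrix: the four transform-size surfaces seen above, each
       stored as a flattened row-major array of 8-bit weights. */
    typedef struct {
      const uint8_t *size4;   /* 4x4   -> 16 weights   */
      const uint8_t *size8;   /* 8x8   -> 64 weights   */
      const uint8_t *size16;  /* 16x16 -> 256 weights  */
      const uint8_t *size32;  /* 32x32 -> 1024 weights */
    } qm_matrix_t;

    typedef struct { qm_matrix_t inter, intra; } qm_plane_t;
    typedef struct { qm_plane_t luma, chroma; } qm_entry_t;

    /* Weight for coefficient (row, col) of an NxN surface, assuming
       the row-major flattening used in the arrays above. */
    static uint8_t qm_weight_at(const uint8_t *m, int n, int row, int col) {
      return m[row * n + col];
    }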
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 59, 37, 24, 59, 40, 29, 23, 37, 29, 22, 19, 24, 23, 19, 16,
+ /* Size 8 */
+ 64, 84, 78, 60, 45, 36, 29, 25, 84, 74, 77, 66, 52, 41, 34, 28, 78, 77,
+ 56, 48, 42, 36, 30, 27, 60, 66, 48, 39, 34, 30, 27, 24, 45, 52, 42, 34,
+ 29, 26, 24, 22, 36, 41, 36, 30, 26, 23, 22, 21, 29, 34, 30, 27, 24, 22,
+ 20, 19, 25, 28, 27, 24, 22, 21, 19, 19,
+ /* Size 16 */
+ 64, 74, 84, 81, 78, 69, 60, 53, 45, 41, 36, 33, 29, 27, 25, 25, 74, 77,
+ 79, 78, 77, 70, 63, 56, 49, 44, 39, 35, 32, 29, 27, 27, 84, 79, 74, 75,
+ 77, 71, 66, 59, 52, 47, 41, 38, 34, 31, 28, 28, 81, 78, 75, 71, 66, 62,
+ 57, 52, 47, 43, 38, 35, 32, 30, 28, 28, 78, 77, 77, 66, 56, 52, 48, 45,
+ 42, 39, 36, 33, 30, 29, 27, 27, 69, 70, 71, 62, 52, 48, 43, 40, 38, 35,
+ 33, 31, 29, 27, 25, 25, 60, 63, 66, 57, 48, 43, 39, 36, 34, 32, 30, 28,
+ 27, 26, 24, 24, 53, 56, 59, 52, 45, 40, 36, 34, 31, 30, 28, 27, 25, 24,
+ 23, 23, 45, 49, 52, 47, 42, 38, 34, 31, 29, 27, 26, 25, 24, 23, 22, 22,
+ 41, 44, 47, 43, 39, 35, 32, 30, 27, 26, 25, 24, 23, 22, 21, 21, 36, 39,
+ 41, 38, 36, 33, 30, 28, 26, 25, 23, 23, 22, 21, 21, 21, 33, 35, 38, 35,
+ 33, 31, 28, 27, 25, 24, 23, 22, 21, 21, 20, 20, 29, 32, 34, 32, 30, 29,
+ 27, 25, 24, 23, 22, 21, 20, 20, 19, 19, 27, 29, 31, 30, 29, 27, 26, 24,
+ 23, 22, 21, 21, 20, 20, 19, 19, 25, 27, 28, 28, 27, 25, 24, 23, 22, 21,
+ 21, 20, 19, 19, 19, 19, 25, 27, 28, 28, 27, 25, 24, 23, 22, 21, 21, 20,
+ 19, 19, 19, 19,
+ /* Size 32 */
+ 64, 69, 74, 79, 84, 83, 81, 80, 78, 73, 69, 64, 60, 56, 53, 49, 45, 43,
+ 41, 38, 36, 34, 33, 31, 29, 28, 27, 26, 25, 25, 25, 25, 69, 72, 75, 78,
+ 82, 81, 80, 79, 78, 74, 70, 66, 61, 58, 54, 51, 47, 45, 42, 40, 37, 35,
+ 34, 32, 30, 29, 28, 27, 26, 26, 26, 26, 74, 75, 77, 78, 79, 79, 78, 78,
+ 77, 74, 70, 67, 63, 59, 56, 52, 49, 46, 44, 41, 39, 37, 35, 33, 32, 30,
+ 29, 28, 27, 27, 27, 27, 79, 78, 78, 77, 76, 77, 77, 77, 77, 74, 71, 68,
+ 65, 61, 58, 54, 51, 48, 45, 43, 40, 38, 36, 35, 33, 31, 30, 29, 28, 28,
+ 28, 28, 84, 82, 79, 76, 74, 75, 75, 76, 77, 74, 71, 69, 66, 63, 59, 56,
+ 52, 50, 47, 44, 41, 40, 38, 36, 34, 32, 31, 30, 28, 28, 28, 28, 83, 81,
+ 79, 77, 75, 74, 73, 72, 72, 69, 67, 64, 62, 59, 56, 53, 50, 47, 45, 42,
+ 40, 38, 36, 35, 33, 32, 30, 29, 28, 28, 28, 28, 81, 80, 78, 77, 75, 73,
+ 71, 69, 66, 64, 62, 59, 57, 55, 52, 50, 47, 45, 43, 41, 38, 37, 35, 34,
+ 32, 31, 30, 29, 28, 28, 28, 28, 80, 79, 78, 77, 76, 72, 69, 65, 61, 59,
+ 57, 55, 53, 51, 48, 46, 44, 43, 41, 39, 37, 36, 34, 33, 31, 30, 29, 28,
+ 27, 27, 27, 27, 78, 78, 77, 77, 77, 72, 66, 61, 56, 54, 52, 50, 48, 47,
+ 45, 43, 42, 40, 39, 37, 36, 34, 33, 32, 30, 29, 29, 28, 27, 27, 27, 27,
+ 73, 74, 74, 74, 74, 69, 64, 59, 54, 52, 50, 48, 46, 44, 43, 41, 40, 38,
+ 37, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 26, 26, 26, 69, 70, 70, 71,
+ 71, 67, 62, 57, 52, 50, 48, 46, 43, 42, 40, 39, 38, 36, 35, 34, 33, 32,
+ 31, 30, 29, 28, 27, 26, 25, 25, 25, 25, 64, 66, 67, 68, 69, 64, 59, 55,
+ 50, 48, 46, 43, 41, 40, 38, 37, 36, 35, 33, 32, 31, 30, 30, 29, 28, 27,
+ 26, 26, 25, 25, 25, 25, 60, 61, 63, 65, 66, 62, 57, 53, 48, 46, 43, 41,
+ 39, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 28, 27, 26, 26, 25, 24, 24,
+ 24, 24, 56, 58, 59, 61, 63, 59, 55, 51, 47, 44, 42, 40, 37, 36, 35, 34,
+ 32, 32, 31, 30, 29, 28, 27, 27, 26, 25, 25, 24, 24, 24, 24, 24, 53, 54,
+ 56, 58, 59, 56, 52, 48, 45, 43, 40, 38, 36, 35, 34, 32, 31, 30, 30, 29,
+ 28, 27, 27, 26, 25, 25, 24, 24, 23, 23, 23, 23, 49, 51, 52, 54, 56, 53,
+ 50, 46, 43, 41, 39, 37, 35, 34, 32, 31, 30, 29, 28, 28, 27, 26, 26, 25,
+ 25, 24, 24, 23, 23, 23, 23, 23, 45, 47, 49, 51, 52, 50, 47, 44, 42, 40,
+ 38, 36, 34, 32, 31, 30, 29, 28, 27, 27, 26, 25, 25, 24, 24, 23, 23, 23,
+ 22, 22, 22, 22, 43, 45, 46, 48, 50, 47, 45, 43, 40, 38, 36, 35, 33, 32,
+ 30, 29, 28, 27, 27, 26, 25, 25, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22,
+ 41, 42, 44, 45, 47, 45, 43, 41, 39, 37, 35, 33, 32, 31, 30, 28, 27, 27,
+ 26, 25, 25, 24, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 38, 40, 41, 43,
+ 44, 42, 41, 39, 37, 35, 34, 32, 31, 30, 29, 28, 27, 26, 25, 25, 24, 24,
+ 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 36, 37, 39, 40, 41, 40, 38, 37,
+ 36, 34, 33, 31, 30, 29, 28, 27, 26, 25, 25, 24, 23, 23, 23, 22, 22, 22,
+ 21, 21, 21, 21, 21, 21, 34, 35, 37, 38, 40, 38, 37, 36, 34, 33, 32, 30,
+ 29, 28, 27, 26, 25, 25, 24, 24, 23, 23, 22, 22, 21, 21, 21, 21, 20, 20,
+ 20, 20, 33, 34, 35, 36, 38, 36, 35, 34, 33, 32, 31, 30, 28, 27, 27, 26,
+ 25, 24, 24, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 31, 32,
+ 33, 35, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 24, 23, 23,
+ 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 20, 29, 30, 32, 33, 34, 33,
+ 32, 31, 30, 30, 29, 28, 27, 26, 25, 25, 24, 23, 23, 22, 22, 21, 21, 21,
+ 20, 20, 20, 20, 19, 19, 19, 19, 28, 29, 30, 31, 32, 32, 31, 30, 29, 29,
+ 28, 27, 26, 25, 25, 24, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19,
+ 19, 19, 19, 19, 27, 28, 29, 30, 31, 30, 30, 29, 29, 28, 27, 26, 26, 25,
+ 24, 24, 23, 23, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19,
+ 26, 27, 28, 29, 30, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22,
+ 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 25, 26, 27, 28,
+ 28, 28, 28, 27, 27, 26, 25, 25, 24, 24, 23, 23, 22, 22, 21, 21, 21, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 25, 26, 27, 28, 28, 28, 28, 27,
+ 27, 26, 25, 25, 24, 24, 23, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 25, 26, 27, 28, 28, 28, 28, 27, 27, 26, 25, 25,
+ 24, 24, 23, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 25, 26, 27, 28, 28, 28, 28, 27, 27, 26, 25, 25, 24, 24, 23, 23,
+ 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 171, 157, 95, 62, 157, 104, 75, 57, 95, 75, 54, 45, 62, 57, 45, 39,
+ /* Size 8 */
+ 144, 192, 177, 135, 101, 78, 63, 53, 192, 167, 174, 149, 117, 91, 73,
+ 61, 177, 174, 125, 107, 92, 77, 65, 56, 135, 149, 107, 84, 73, 64, 57,
+ 51, 101, 117, 92, 73, 62, 55, 50, 46, 78, 91, 77, 64, 55, 49, 45, 42,
+ 63, 73, 65, 57, 50, 45, 42, 39, 53, 61, 56, 51, 46, 42, 39, 38,
+ /* Size 16 */
+ 149, 174, 199, 191, 184, 162, 140, 122, 104, 92, 81, 73, 65, 60, 55, 55,
+ 174, 180, 186, 184, 182, 165, 147, 130, 113, 100, 88, 79, 70, 65, 59,
+ 59, 199, 186, 173, 177, 181, 168, 154, 138, 121, 108, 95, 85, 76, 69,
+ 63, 63, 191, 184, 177, 166, 155, 144, 133, 120, 108, 98, 87, 80, 72, 66,
+ 61, 61, 184, 182, 181, 155, 129, 120, 111, 103, 95, 88, 80, 74, 68, 63,
+ 58, 58, 162, 165, 168, 144, 120, 110, 99, 92, 85, 79, 73, 68, 63, 59,
+ 55, 55, 140, 147, 154, 133, 111, 99, 88, 81, 75, 71, 66, 63, 59, 56, 53,
+ 53, 122, 130, 138, 120, 103, 92, 81, 75, 70, 66, 62, 58, 55, 53, 50, 50,
+ 104, 113, 121, 108, 95, 85, 75, 70, 64, 60, 57, 54, 52, 50, 48, 48, 92,
+ 100, 108, 98, 88, 79, 71, 66, 60, 57, 54, 51, 49, 47, 46, 46, 81, 88,
+ 95, 87, 80, 73, 66, 62, 57, 54, 51, 49, 47, 45, 44, 44, 73, 79, 85, 80,
+ 74, 68, 63, 58, 54, 51, 49, 47, 45, 44, 42, 42, 65, 70, 76, 72, 68, 63,
+ 59, 55, 52, 49, 47, 45, 43, 42, 41, 41, 60, 65, 69, 66, 63, 59, 56, 53,
+ 50, 47, 45, 44, 42, 41, 40, 40, 55, 59, 63, 61, 58, 55, 53, 50, 48, 46,
+ 44, 42, 41, 40, 39, 39, 55, 59, 63, 61, 58, 55, 53, 50, 48, 46, 44, 42,
+ 41, 40, 39, 39,
+ /* Size 32 */
+ 152, 165, 177, 190, 202, 198, 195, 191, 187, 176, 165, 153, 142, 133,
+ 124, 115, 106, 100, 94, 88, 82, 78, 74, 70, 66, 64, 61, 58, 56, 56, 56,
+ 56, 165, 173, 180, 188, 196, 193, 191, 189, 186, 176, 166, 156, 146,
+ 137, 128, 119, 111, 104, 98, 92, 86, 82, 77, 73, 69, 66, 63, 61, 58, 58,
+ 58, 58, 177, 180, 183, 186, 189, 188, 187, 187, 186, 177, 168, 159, 150,
+ 141, 132, 124, 115, 108, 102, 96, 89, 85, 80, 76, 72, 69, 66, 63, 60,
+ 60, 60, 60, 190, 188, 186, 185, 183, 183, 184, 184, 185, 177, 169, 161,
+ 154, 145, 136, 128, 119, 112, 106, 99, 93, 88, 84, 79, 74, 71, 68, 65,
+ 62, 62, 62, 62, 202, 196, 189, 183, 176, 178, 180, 182, 184, 177, 171,
+ 164, 157, 149, 140, 132, 123, 117, 110, 103, 96, 92, 87, 82, 77, 74, 71,
+ 67, 64, 64, 64, 64, 198, 193, 188, 183, 178, 177, 175, 173, 171, 165,
+ 159, 152, 146, 139, 131, 124, 117, 111, 105, 99, 93, 88, 84, 80, 75, 72,
+ 69, 66, 63, 63, 63, 63, 195, 191, 187, 184, 180, 175, 169, 164, 158,
+ 152, 147, 141, 135, 129, 123, 116, 110, 105, 99, 94, 89, 85, 81, 77, 73,
+ 70, 67, 65, 62, 62, 62, 62, 191, 189, 187, 184, 182, 173, 164, 154, 145,
+ 140, 134, 129, 124, 119, 114, 109, 103, 99, 94, 90, 85, 82, 78, 75, 71,
+ 68, 66, 63, 61, 61, 61, 61, 187, 186, 186, 185, 184, 171, 158, 145, 132,
+ 127, 122, 118, 113, 109, 105, 101, 97, 93, 89, 85, 82, 78, 75, 72, 69,
+ 67, 64, 62, 59, 59, 59, 59, 176, 176, 177, 177, 177, 165, 152, 140, 127,
+ 122, 117, 112, 107, 103, 99, 96, 92, 88, 85, 81, 78, 75, 72, 70, 67, 64,
+ 62, 60, 58, 58, 58, 58, 165, 166, 168, 169, 171, 159, 147, 134, 122,
+ 117, 112, 106, 101, 97, 94, 90, 87, 84, 81, 78, 75, 72, 69, 67, 64, 62,
+ 60, 58, 56, 56, 56, 56, 153, 156, 159, 161, 164, 152, 141, 129, 118,
+ 112, 106, 101, 95, 92, 88, 85, 82, 79, 76, 74, 71, 69, 67, 64, 62, 60,
+ 59, 57, 55, 55, 55, 55, 142, 146, 150, 154, 157, 146, 135, 124, 113,
+ 107, 101, 95, 89, 86, 83, 80, 77, 74, 72, 70, 68, 66, 64, 62, 60, 58,
+ 57, 55, 54, 54, 54, 54, 133, 137, 141, 145, 149, 139, 129, 119, 109,
+ 103, 97, 92, 86, 83, 80, 77, 74, 72, 69, 67, 65, 63, 62, 60, 58, 57, 55,
+ 54, 52, 52, 52, 52, 124, 128, 132, 136, 140, 131, 123, 114, 105, 99, 94,
+ 88, 83, 80, 77, 74, 71, 69, 67, 65, 63, 61, 59, 58, 56, 55, 54, 52, 51,
+ 51, 51, 51, 115, 119, 124, 128, 132, 124, 116, 109, 101, 96, 90, 85, 80,
+ 77, 74, 71, 68, 66, 64, 62, 60, 59, 57, 56, 54, 53, 52, 51, 50, 50, 50,
+ 50, 106, 111, 115, 119, 123, 117, 110, 103, 97, 92, 87, 82, 77, 74, 71,
+ 68, 65, 63, 61, 60, 58, 57, 55, 54, 53, 52, 51, 50, 48, 48, 48, 48, 100,
+ 104, 108, 112, 117, 111, 105, 99, 93, 88, 84, 79, 74, 72, 69, 66, 63,
+ 61, 60, 58, 56, 55, 54, 53, 51, 50, 49, 48, 47, 47, 47, 47, 94, 98, 102,
+ 106, 110, 105, 99, 94, 89, 85, 81, 76, 72, 69, 67, 64, 61, 60, 58, 56,
+ 55, 54, 52, 51, 50, 49, 48, 47, 46, 46, 46, 46, 88, 92, 96, 99, 103, 99,
+ 94, 90, 85, 81, 78, 74, 70, 67, 65, 62, 60, 58, 56, 55, 53, 52, 51, 50,
+ 49, 48, 47, 46, 46, 46, 46, 46, 82, 86, 89, 93, 96, 93, 89, 85, 82, 78,
+ 75, 71, 68, 65, 63, 60, 58, 56, 55, 53, 52, 51, 50, 49, 48, 47, 46, 45,
+ 45, 45, 45, 45, 78, 82, 85, 88, 92, 88, 85, 82, 78, 75, 72, 69, 66, 63,
+ 61, 59, 57, 55, 54, 52, 51, 50, 49, 48, 47, 46, 45, 45, 44, 44, 44, 44,
+ 74, 77, 80, 84, 87, 84, 81, 78, 75, 72, 69, 67, 64, 62, 59, 57, 55, 54,
+ 52, 51, 50, 49, 48, 47, 46, 45, 44, 44, 43, 43, 43, 43, 70, 73, 76, 79,
+ 82, 80, 77, 75, 72, 70, 67, 64, 62, 60, 58, 56, 54, 53, 51, 50, 49, 48,
+ 47, 46, 45, 44, 44, 43, 42, 42, 42, 42, 66, 69, 72, 74, 77, 75, 73, 71,
+ 69, 67, 64, 62, 60, 58, 56, 54, 53, 51, 50, 49, 48, 47, 46, 45, 44, 44,
+ 43, 42, 42, 42, 42, 42, 64, 66, 69, 71, 74, 72, 70, 68, 67, 64, 62, 60,
+ 58, 57, 55, 53, 52, 50, 49, 48, 47, 46, 45, 44, 44, 43, 42, 42, 41, 41,
+ 41, 41, 61, 63, 66, 68, 71, 69, 67, 66, 64, 62, 60, 59, 57, 55, 54, 52,
+ 51, 49, 48, 47, 46, 45, 44, 44, 43, 42, 42, 41, 41, 41, 41, 41, 58, 61,
+ 63, 65, 67, 66, 65, 63, 62, 60, 58, 57, 55, 54, 52, 51, 50, 48, 47, 46,
+ 45, 45, 44, 43, 42, 42, 41, 41, 40, 40, 40, 40, 56, 58, 60, 62, 64, 63,
+ 62, 61, 59, 58, 56, 55, 54, 52, 51, 50, 48, 47, 46, 46, 45, 44, 43, 42,
+ 42, 41, 41, 40, 40, 40, 40, 40, 56, 58, 60, 62, 64, 63, 62, 61, 59, 58,
+ 56, 55, 54, 52, 51, 50, 48, 47, 46, 46, 45, 44, 43, 42, 42, 41, 41, 40,
+ 40, 40, 40, 40, 56, 58, 60, 62, 64, 63, 62, 61, 59, 58, 56, 55, 54, 52,
+ 51, 50, 48, 47, 46, 46, 45, 44, 43, 42, 42, 41, 41, 40, 40, 40, 40, 40,
+ 56, 58, 60, 62, 64, 63, 62, 61, 59, 58, 56, 55, 54, 52, 51, 50, 48, 47,
+ 46, 46, 45, 44, 43, 42, 42, 41, 41, 40, 40, 40, 40, 40 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 41, 38, 31, 41, 35, 33, 29, 38, 33, 26, 23, 31, 29, 23, 20,
+ /* Size 8 */
+ 64, 79, 45, 42, 40, 36, 32, 29, 79, 55, 44, 48, 47, 43, 38, 34, 45, 44,
+ 38, 39, 39, 37, 35, 32, 42, 48, 39, 35, 33, 32, 31, 29, 40, 47, 39, 33,
+ 30, 28, 27, 26, 36, 43, 37, 32, 28, 26, 25, 23, 32, 38, 35, 31, 27, 25,
+ 23, 22, 29, 34, 32, 29, 26, 23, 22, 21,
+ /* Size 16 */
+ 64, 71, 79, 62, 45, 43, 42, 41, 40, 38, 36, 34, 32, 30, 29, 29, 71, 69,
+ 67, 56, 44, 45, 45, 44, 43, 41, 39, 37, 35, 33, 31, 31, 79, 67, 55, 50,
+ 44, 46, 48, 47, 47, 45, 43, 41, 38, 36, 34, 34, 62, 56, 50, 45, 41, 42,
+ 44, 43, 43, 42, 40, 38, 37, 35, 33, 33, 45, 44, 44, 41, 38, 39, 39, 39,
+ 39, 38, 37, 36, 35, 33, 32, 32, 43, 45, 46, 42, 39, 38, 37, 37, 36, 36,
+ 35, 34, 33, 31, 30, 30, 42, 45, 48, 44, 39, 37, 35, 34, 33, 33, 32, 31,
+ 31, 30, 29, 29, 41, 44, 47, 43, 39, 37, 34, 33, 32, 31, 30, 30, 29, 28,
+ 27, 27, 40, 43, 47, 43, 39, 36, 33, 32, 30, 29, 28, 28, 27, 26, 26, 26,
+ 38, 41, 45, 42, 38, 36, 33, 31, 29, 28, 27, 27, 26, 25, 25, 25, 36, 39,
+ 43, 40, 37, 35, 32, 30, 28, 27, 26, 25, 25, 24, 23, 23, 34, 37, 41, 38,
+ 36, 34, 31, 30, 28, 27, 25, 25, 24, 23, 23, 23, 32, 35, 38, 37, 35, 33,
+ 31, 29, 27, 26, 25, 24, 23, 22, 22, 22, 30, 33, 36, 35, 33, 31, 30, 28,
+ 26, 25, 24, 23, 22, 22, 21, 21, 29, 31, 34, 33, 32, 30, 29, 27, 26, 25,
+ 23, 23, 22, 21, 21, 21, 29, 31, 34, 33, 32, 30, 29, 27, 26, 25, 23, 23,
+ 22, 21, 21, 21,
+ /* Size 32 */
+ 64, 68, 71, 75, 79, 70, 62, 53, 45, 44, 43, 43, 42, 42, 41, 40, 40, 39,
+ 38, 37, 36, 35, 34, 33, 32, 31, 30, 30, 29, 29, 29, 29, 68, 69, 70, 72,
+ 73, 66, 59, 52, 44, 44, 44, 44, 44, 43, 43, 42, 42, 41, 40, 39, 38, 37,
+ 36, 35, 34, 33, 32, 31, 30, 30, 30, 30, 71, 70, 69, 68, 67, 61, 56, 50,
+ 44, 44, 45, 45, 45, 45, 44, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34,
+ 33, 32, 31, 31, 31, 31, 75, 72, 68, 65, 61, 57, 53, 48, 44, 45, 45, 46,
+ 47, 46, 46, 45, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 33,
+ 33, 33, 79, 73, 67, 61, 55, 52, 50, 47, 44, 45, 46, 47, 48, 48, 47, 47,
+ 47, 46, 45, 44, 43, 42, 41, 40, 38, 37, 36, 35, 34, 34, 34, 34, 70, 66,
+ 61, 57, 52, 50, 47, 45, 42, 43, 44, 45, 46, 46, 45, 45, 45, 44, 43, 42,
+ 42, 41, 39, 38, 37, 36, 36, 35, 34, 34, 34, 34, 62, 59, 56, 53, 50, 47,
+ 45, 43, 41, 42, 42, 43, 44, 43, 43, 43, 43, 42, 42, 41, 40, 39, 38, 37,
+ 37, 36, 35, 34, 33, 33, 33, 33, 53, 52, 50, 48, 47, 45, 43, 41, 39, 40,
+ 40, 41, 41, 41, 41, 41, 41, 41, 40, 39, 39, 38, 37, 36, 36, 35, 34, 33,
+ 32, 32, 32, 32, 45, 44, 44, 44, 44, 42, 41, 39, 38, 38, 39, 39, 39, 39,
+ 39, 39, 39, 39, 38, 38, 37, 37, 36, 35, 35, 34, 33, 32, 32, 32, 32, 32,
+ 44, 44, 44, 45, 45, 43, 42, 40, 38, 38, 38, 38, 38, 38, 38, 38, 38, 37,
+ 37, 37, 36, 36, 35, 34, 34, 33, 32, 32, 31, 31, 31, 31, 43, 44, 45, 45,
+ 46, 44, 42, 40, 39, 38, 38, 37, 37, 37, 37, 37, 36, 36, 36, 35, 35, 34,
+ 34, 33, 33, 32, 31, 31, 30, 30, 30, 30, 43, 44, 45, 46, 47, 45, 43, 41,
+ 39, 38, 37, 37, 36, 36, 35, 35, 35, 35, 34, 34, 34, 33, 33, 32, 32, 31,
+ 30, 30, 29, 29, 29, 29, 42, 44, 45, 47, 48, 46, 44, 41, 39, 38, 37, 36,
+ 35, 35, 34, 34, 33, 33, 33, 33, 32, 32, 31, 31, 31, 30, 30, 29, 29, 29,
+ 29, 29, 42, 43, 45, 46, 48, 46, 43, 41, 39, 38, 37, 36, 35, 34, 34, 33,
+ 33, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 41, 43,
+ 44, 46, 47, 45, 43, 41, 39, 38, 37, 35, 34, 34, 33, 32, 32, 31, 31, 31,
+ 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 27, 27, 40, 42, 44, 45, 47, 45,
+ 43, 41, 39, 38, 37, 35, 34, 33, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28,
+ 28, 28, 27, 27, 26, 26, 26, 26, 40, 42, 43, 45, 47, 45, 43, 41, 39, 38,
+ 36, 35, 33, 33, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 27, 26, 26,
+ 26, 26, 26, 26, 39, 41, 42, 44, 46, 44, 42, 41, 39, 37, 36, 35, 33, 32,
+ 31, 31, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 26, 25, 25, 25, 25, 25,
+ 38, 40, 41, 43, 45, 43, 42, 40, 38, 37, 36, 34, 33, 32, 31, 30, 29, 29,
+ 28, 28, 27, 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 37, 39, 40, 42,
+ 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, 31, 30, 29, 28, 28, 27, 27, 26,
+ 26, 26, 25, 25, 25, 24, 24, 24, 24, 24, 36, 38, 39, 41, 43, 42, 40, 39,
+ 37, 36, 35, 34, 32, 31, 30, 29, 28, 28, 27, 27, 26, 26, 25, 25, 25, 24,
+ 24, 24, 23, 23, 23, 23, 35, 37, 38, 40, 42, 41, 39, 38, 37, 36, 34, 33,
+ 32, 31, 30, 29, 28, 28, 27, 26, 26, 25, 25, 25, 24, 24, 24, 23, 23, 23,
+ 23, 23, 34, 36, 37, 39, 41, 39, 38, 37, 36, 35, 34, 33, 31, 30, 30, 29,
+ 28, 27, 27, 26, 25, 25, 25, 24, 24, 24, 23, 23, 23, 23, 23, 23, 33, 35,
+ 36, 38, 40, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 27, 26, 26,
+ 25, 25, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 32, 34, 35, 37, 38, 37,
+ 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 26, 25, 25, 24, 24, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 31, 33, 34, 36, 37, 36, 36, 35, 34, 33,
+ 32, 31, 30, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 23, 23, 22, 22, 22,
+ 22, 22, 22, 22, 30, 32, 33, 35, 36, 36, 35, 34, 33, 32, 31, 30, 30, 29,
+ 28, 27, 26, 26, 25, 25, 24, 24, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21,
+ 30, 31, 32, 34, 35, 35, 34, 33, 32, 32, 31, 30, 29, 28, 28, 27, 26, 25,
+ 25, 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 29, 30, 31, 33,
+ 34, 34, 33, 32, 32, 31, 30, 29, 29, 28, 27, 26, 26, 25, 25, 24, 23, 23,
+ 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 29, 30, 31, 33, 34, 34, 33, 32,
+ 32, 31, 30, 29, 29, 28, 27, 26, 26, 25, 25, 24, 23, 23, 23, 22, 22, 22,
+ 21, 21, 21, 21, 21, 21, 29, 30, 31, 33, 34, 34, 33, 32, 32, 31, 30, 29,
+ 29, 28, 27, 26, 26, 25, 25, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21,
+ 21, 21, 29, 30, 31, 33, 34, 34, 33, 32, 32, 31, 30, 29, 29, 28, 27, 26,
+ 26, 25, 25, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 142, 90, 83, 66, 90, 75, 71, 62, 83, 71, 55, 49, 66, 62, 49, 41,
+ /* Size 8 */
+ 132, 163, 91, 85, 80, 72, 64, 57, 163, 113, 89, 98, 95, 87, 77, 68, 91,
+ 89, 76, 79, 79, 75, 69, 63, 85, 98, 79, 70, 67, 64, 60, 56, 80, 95, 79,
+ 67, 60, 56, 53, 50, 72, 87, 75, 64, 56, 51, 48, 45, 64, 77, 69, 60, 53,
+ 48, 44, 42, 57, 68, 63, 56, 50, 45, 42, 39,
+ /* Size 16 */
+ 136, 152, 168, 130, 93, 90, 88, 85, 82, 78, 74, 70, 65, 62, 58, 58, 152,
+ 147, 142, 117, 92, 93, 94, 92, 90, 86, 81, 77, 72, 68, 64, 64, 168, 142,
+ 116, 104, 91, 96, 101, 99, 98, 93, 89, 84, 79, 75, 70, 70, 130, 117,
+ 104, 94, 85, 88, 91, 90, 90, 86, 83, 79, 75, 71, 67, 67, 93, 92, 91, 85,
+ 78, 80, 81, 81, 81, 79, 77, 74, 71, 68, 65, 65, 90, 93, 96, 88, 80, 78,
+ 76, 76, 75, 73, 71, 69, 67, 64, 61, 61, 88, 94, 101, 91, 81, 76, 72, 70,
+ 68, 67, 66, 64, 62, 60, 58, 58, 85, 92, 99, 90, 81, 76, 70, 68, 65, 63,
+ 62, 60, 58, 56, 55, 55, 82, 90, 98, 90, 81, 75, 68, 65, 61, 60, 58, 56,
+ 54, 53, 51, 51, 78, 86, 93, 86, 79, 73, 67, 63, 60, 57, 55, 53, 52, 50,
+ 49, 49, 74, 81, 89, 83, 77, 71, 66, 62, 58, 55, 52, 51, 49, 48, 47, 47,
+ 70, 77, 84, 79, 74, 69, 64, 60, 56, 53, 51, 49, 47, 46, 45, 45, 65, 72,
+ 79, 75, 71, 67, 62, 58, 54, 52, 49, 47, 46, 44, 43, 43, 62, 68, 75, 71,
+ 68, 64, 60, 56, 53, 50, 48, 46, 44, 43, 42, 42, 58, 64, 70, 67, 65, 61,
+ 58, 55, 51, 49, 47, 45, 43, 42, 40, 40, 58, 64, 70, 67, 65, 61, 58, 55,
+ 51, 49, 47, 45, 43, 42, 40, 40,
+ /* Size 32 */
+ 137, 146, 154, 162, 170, 151, 132, 113, 94, 93, 92, 90, 89, 88, 86, 85,
+ 83, 81, 79, 77, 75, 73, 71, 68, 66, 64, 63, 61, 59, 59, 59, 59, 146,
+ 148, 151, 154, 157, 141, 125, 110, 94, 93, 93, 93, 92, 91, 90, 89, 87,
+ 85, 83, 81, 79, 76, 74, 72, 70, 68, 66, 64, 62, 62, 62, 62, 154, 151,
+ 149, 146, 144, 131, 119, 106, 93, 94, 94, 95, 96, 94, 93, 92, 91, 89,
+ 87, 85, 83, 80, 78, 76, 73, 71, 69, 67, 65, 65, 65, 65, 162, 154, 146,
+ 139, 131, 122, 112, 103, 93, 94, 96, 97, 99, 98, 97, 96, 95, 93, 91, 89,
+ 86, 84, 82, 79, 77, 75, 72, 70, 68, 68, 68, 68, 170, 157, 144, 131, 118,
+ 112, 105, 99, 93, 95, 97, 100, 102, 101, 101, 100, 99, 97, 95, 93, 90,
+ 88, 85, 83, 80, 78, 76, 73, 71, 71, 71, 71, 151, 141, 131, 122, 112,
+ 106, 100, 95, 89, 91, 93, 95, 97, 97, 96, 95, 95, 93, 91, 89, 87, 85,
+ 83, 81, 78, 76, 74, 72, 70, 70, 70, 70, 132, 125, 119, 112, 105, 100,
+ 96, 91, 86, 87, 89, 91, 92, 92, 91, 91, 91, 89, 88, 86, 84, 82, 80, 78,
+ 76, 74, 72, 70, 68, 68, 68, 68, 113, 110, 106, 103, 99, 95, 91, 87, 83,
+ 84, 85, 86, 87, 87, 87, 87, 87, 85, 84, 83, 81, 80, 78, 76, 74, 72, 70,
+ 69, 67, 67, 67, 67, 94, 94, 93, 93, 93, 89, 86, 83, 79, 80, 81, 81, 82,
+ 82, 82, 82, 82, 81, 80, 79, 78, 77, 75, 74, 72, 70, 69, 67, 65, 65, 65,
+ 65, 93, 93, 94, 94, 95, 91, 87, 84, 80, 80, 80, 80, 80, 80, 79, 79, 79,
+ 78, 77, 76, 75, 74, 73, 71, 70, 68, 67, 65, 64, 64, 64, 64, 92, 93, 94,
+ 96, 97, 93, 89, 85, 81, 80, 79, 78, 77, 77, 77, 76, 76, 75, 74, 73, 72,
+ 71, 70, 69, 67, 66, 65, 63, 62, 62, 62, 62, 90, 93, 95, 97, 100, 95, 91,
+ 86, 81, 80, 78, 77, 75, 74, 74, 73, 73, 72, 71, 70, 69, 68, 67, 66, 65,
+ 64, 63, 61, 60, 60, 60, 60, 89, 92, 96, 99, 102, 97, 92, 87, 82, 80, 77,
+ 75, 73, 72, 71, 70, 69, 69, 68, 67, 67, 66, 65, 64, 63, 62, 61, 60, 59,
+ 59, 59, 59, 88, 91, 94, 98, 101, 97, 92, 87, 82, 80, 77, 74, 72, 71, 70,
+ 69, 68, 67, 66, 65, 64, 64, 63, 62, 61, 60, 59, 58, 57, 57, 57, 57, 86,
+ 90, 93, 97, 101, 96, 91, 87, 82, 79, 77, 74, 71, 70, 68, 67, 66, 65, 64,
+ 63, 62, 62, 61, 60, 59, 58, 57, 56, 55, 55, 55, 55, 85, 89, 92, 96, 100,
+ 95, 91, 87, 82, 79, 76, 73, 70, 69, 67, 66, 64, 63, 62, 61, 60, 60, 59,
+ 58, 57, 56, 55, 55, 54, 54, 54, 54, 83, 87, 91, 95, 99, 95, 91, 87, 82,
+ 79, 76, 73, 69, 68, 66, 64, 62, 61, 60, 59, 58, 58, 57, 56, 55, 54, 54,
+ 53, 52, 52, 52, 52, 81, 85, 89, 93, 97, 93, 89, 85, 81, 78, 75, 72, 69,
+ 67, 65, 63, 61, 60, 59, 58, 57, 56, 55, 55, 54, 53, 52, 52, 51, 51, 51,
+ 51, 79, 83, 87, 91, 95, 91, 88, 84, 80, 77, 74, 71, 68, 66, 64, 62, 60,
+ 59, 58, 57, 56, 55, 54, 53, 53, 52, 51, 50, 50, 50, 50, 50, 77, 81, 85,
+ 89, 93, 89, 86, 83, 79, 76, 73, 70, 67, 65, 63, 61, 59, 58, 57, 56, 54,
+ 54, 53, 52, 51, 50, 50, 49, 48, 48, 48, 48, 75, 79, 83, 86, 90, 87, 84,
+ 81, 78, 75, 72, 69, 67, 64, 62, 60, 58, 57, 56, 54, 53, 52, 52, 51, 50,
+ 49, 48, 48, 47, 47, 47, 47, 73, 76, 80, 84, 88, 85, 82, 80, 77, 74, 71,
+ 68, 66, 64, 62, 60, 58, 56, 55, 54, 52, 52, 51, 50, 49, 48, 48, 47, 46,
+ 46, 46, 46, 71, 74, 78, 82, 85, 83, 80, 78, 75, 73, 70, 67, 65, 63, 61,
+ 59, 57, 55, 54, 53, 52, 51, 50, 49, 48, 47, 47, 46, 45, 45, 45, 45, 68,
+ 72, 76, 79, 83, 81, 78, 76, 74, 71, 69, 66, 64, 62, 60, 58, 56, 55, 53,
+ 52, 51, 50, 49, 48, 47, 46, 46, 45, 44, 44, 44, 44, 66, 70, 73, 77, 80,
+ 78, 76, 74, 72, 70, 67, 65, 63, 61, 59, 57, 55, 54, 53, 51, 50, 49, 48,
+ 47, 46, 45, 45, 44, 44, 44, 44, 44, 64, 68, 71, 75, 78, 76, 74, 72, 70,
+ 68, 66, 64, 62, 60, 58, 56, 54, 53, 52, 50, 49, 48, 47, 46, 45, 45, 44,
+ 44, 43, 43, 43, 43, 63, 66, 69, 72, 76, 74, 72, 70, 69, 67, 65, 63, 61,
+ 59, 57, 55, 54, 52, 51, 50, 48, 48, 47, 46, 45, 44, 44, 43, 42, 42, 42,
+ 42, 61, 64, 67, 70, 73, 72, 70, 69, 67, 65, 63, 61, 60, 58, 56, 55, 53,
+ 52, 50, 49, 48, 47, 46, 45, 44, 44, 43, 42, 42, 42, 42, 42, 59, 62, 65,
+ 68, 71, 70, 68, 67, 65, 64, 62, 60, 59, 57, 55, 54, 52, 51, 50, 48, 47,
+ 46, 45, 44, 44, 43, 42, 42, 41, 41, 41, 41, 59, 62, 65, 68, 71, 70, 68,
+ 67, 65, 64, 62, 60, 59, 57, 55, 54, 52, 51, 50, 48, 47, 46, 45, 44, 44,
+ 43, 42, 42, 41, 41, 41, 41, 59, 62, 65, 68, 71, 70, 68, 67, 65, 64, 62,
+ 60, 59, 57, 55, 54, 52, 51, 50, 48, 47, 46, 45, 44, 44, 43, 42, 42, 41,
+ 41, 41, 41, 59, 62, 65, 68, 71, 70, 68, 67, 65, 64, 62, 60, 59, 57, 55,
+ 54, 52, 51, 50, 48, 47, 46, 45, 44, 44, 43, 42, 42, 41, 41, 41,
+ 41 } } },
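Given that nesting, picking the right surface reduces to two flags before the size lookup. A short follow-on sketch, reusing the hypothetical qm_entry_t/qm_matrix_t types above; nothing here mirrors an actual function in the imported library:

    /* Pick inter vs. intra and luma vs. chroma; both choices are
       explicit levels of the table nesting shown in this file. */
    static const qm_matrix_t *qm_select(const qm_entry_t *e, int is_chroma,
                                        int is_inter) {
      const qm_plane_t *p = is_chroma ? &e->chroma : &e->luma;
      return is_inter ? &p->inter : &p->intra;
    }

    /* Example: weight for coefficient (2, 3) of the 8x8 intra luma
       surface of some entry e:
       uint8_t w = qm_weight_at(qm_select(&e, 0, 0)->size8, 8, 2, 3); */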
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 59, 38, 26, 59, 41, 31, 25, 38, 31, 24, 21, 26, 25, 21, 19,
+ /* Size 8 */
+ 64, 83, 77, 60, 47, 37, 31, 28, 83, 73, 76, 66, 53, 43, 36, 31, 77, 76,
+ 56, 49, 43, 37, 32, 29, 60, 66, 49, 40, 35, 32, 29, 27, 47, 53, 43, 35,
+ 31, 28, 26, 25, 37, 43, 37, 32, 28, 26, 24, 23, 31, 36, 32, 29, 26, 24,
+ 23, 22, 28, 31, 29, 27, 25, 23, 22, 21,
+ /* Size 16 */
+ 64, 73, 83, 80, 77, 69, 60, 53, 47, 42, 37, 34, 31, 30, 28, 28, 73, 76,
+ 78, 77, 77, 70, 63, 56, 50, 45, 40, 37, 34, 31, 29, 29, 83, 78, 73, 75,
+ 76, 71, 66, 59, 53, 48, 43, 39, 36, 33, 31, 31, 80, 77, 75, 70, 66, 62,
+ 58, 53, 48, 44, 40, 37, 34, 32, 30, 30, 77, 77, 76, 66, 56, 53, 49, 46,
+ 43, 40, 37, 35, 32, 31, 29, 29, 69, 70, 71, 62, 53, 49, 45, 42, 39, 37,
+ 35, 33, 31, 29, 28, 28, 60, 63, 66, 58, 49, 45, 40, 38, 35, 34, 32, 31,
+ 29, 28, 27, 27, 53, 56, 59, 53, 46, 42, 38, 35, 33, 32, 30, 29, 28, 27,
+ 26, 26, 47, 50, 53, 48, 43, 39, 35, 33, 31, 30, 28, 27, 26, 26, 25, 25,
+ 42, 45, 48, 44, 40, 37, 34, 32, 30, 28, 27, 26, 25, 25, 24, 24, 37, 40,
+ 43, 40, 37, 35, 32, 30, 28, 27, 26, 25, 24, 24, 23, 23, 34, 37, 39, 37,
+ 35, 33, 31, 29, 27, 26, 25, 24, 24, 23, 23, 23, 31, 34, 36, 34, 32, 31,
+ 29, 28, 26, 25, 24, 24, 23, 23, 22, 22, 30, 31, 33, 32, 31, 29, 28, 27,
+ 26, 25, 24, 23, 23, 22, 22, 22, 28, 29, 31, 30, 29, 28, 27, 26, 25, 24,
+ 23, 23, 22, 22, 21, 21, 28, 29, 31, 30, 29, 28, 27, 26, 25, 24, 23, 23,
+ 22, 22, 21, 21,
+ /* Size 32 */
+ 64, 69, 73, 78, 83, 81, 80, 79, 77, 73, 69, 64, 60, 57, 53, 50, 47, 44,
+ 42, 40, 37, 36, 34, 33, 31, 30, 30, 29, 28, 28, 28, 28, 69, 72, 75, 78,
+ 80, 80, 79, 78, 77, 73, 69, 65, 62, 58, 55, 52, 48, 46, 44, 41, 39, 37,
+ 36, 34, 33, 31, 30, 29, 28, 28, 28, 28, 73, 75, 76, 77, 78, 78, 77, 77,
+ 77, 73, 70, 66, 63, 60, 56, 53, 50, 47, 45, 43, 40, 38, 37, 35, 34, 32,
+ 31, 30, 29, 29, 29, 29, 78, 78, 77, 76, 76, 76, 76, 76, 76, 73, 70, 67,
+ 64, 61, 58, 55, 51, 49, 46, 44, 41, 40, 38, 36, 35, 33, 32, 31, 30, 30,
+ 30, 30, 83, 80, 78, 76, 73, 74, 75, 75, 76, 74, 71, 68, 66, 63, 59, 56,
+ 53, 50, 48, 45, 43, 41, 39, 37, 36, 34, 33, 32, 31, 31, 31, 31, 81, 80,
+ 78, 76, 74, 73, 72, 72, 71, 69, 66, 64, 62, 59, 56, 53, 51, 48, 46, 44,
+ 41, 40, 38, 36, 35, 34, 33, 31, 30, 30, 30, 30, 80, 79, 77, 76, 75, 72,
+ 70, 68, 66, 64, 62, 60, 58, 55, 53, 50, 48, 46, 44, 42, 40, 39, 37, 36,
+ 34, 33, 32, 31, 30, 30, 30, 30, 79, 78, 77, 76, 75, 72, 68, 65, 61, 59,
+ 57, 55, 53, 51, 49, 47, 46, 44, 42, 40, 39, 37, 36, 35, 33, 32, 31, 30,
+ 29, 29, 29, 29, 77, 77, 77, 76, 76, 71, 66, 61, 56, 54, 53, 51, 49, 48,
+ 46, 45, 43, 42, 40, 39, 37, 36, 35, 34, 32, 32, 31, 30, 29, 29, 29, 29,
+ 73, 73, 73, 73, 74, 69, 64, 59, 54, 53, 51, 49, 47, 45, 44, 43, 41, 40,
+ 39, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 28, 28, 28, 69, 69, 70, 70,
+ 71, 66, 62, 57, 53, 51, 49, 47, 45, 43, 42, 41, 39, 38, 37, 36, 35, 34,
+ 33, 32, 31, 30, 29, 29, 28, 28, 28, 28, 64, 65, 66, 67, 68, 64, 60, 55,
+ 51, 49, 47, 45, 42, 41, 40, 39, 37, 36, 35, 34, 33, 32, 32, 31, 30, 29,
+ 29, 28, 27, 27, 27, 27, 60, 62, 63, 64, 66, 62, 58, 53, 49, 47, 45, 42,
+ 40, 39, 38, 37, 35, 35, 34, 33, 32, 31, 31, 30, 29, 28, 28, 27, 27, 27,
+ 27, 27, 57, 58, 60, 61, 63, 59, 55, 51, 48, 45, 43, 41, 39, 38, 37, 35,
+ 34, 33, 33, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 26, 26, 26, 53, 55,
+ 56, 58, 59, 56, 53, 49, 46, 44, 42, 40, 38, 37, 35, 34, 33, 32, 32, 31,
+ 30, 30, 29, 28, 28, 27, 27, 26, 26, 26, 26, 26, 50, 52, 53, 55, 56, 53,
+ 50, 47, 45, 43, 41, 39, 37, 35, 34, 33, 32, 31, 31, 30, 29, 29, 28, 28,
+ 27, 27, 26, 26, 25, 25, 25, 25, 47, 48, 50, 51, 53, 51, 48, 46, 43, 41,
+ 39, 37, 35, 34, 33, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 26, 26, 25,
+ 25, 25, 25, 25, 44, 46, 47, 49, 50, 48, 46, 44, 42, 40, 38, 36, 35, 33,
+ 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 26, 25, 25, 25, 24, 24, 24, 24,
+ 42, 44, 45, 46, 48, 46, 44, 42, 40, 39, 37, 35, 34, 33, 32, 31, 30, 29,
+ 28, 28, 27, 27, 26, 26, 25, 25, 25, 24, 24, 24, 24, 24, 40, 41, 43, 44,
+ 45, 44, 42, 40, 39, 37, 36, 34, 33, 32, 31, 30, 29, 28, 28, 27, 27, 26,
+ 26, 25, 25, 25, 24, 24, 24, 24, 24, 24, 37, 39, 40, 41, 43, 41, 40, 39,
+ 37, 36, 35, 33, 32, 31, 30, 29, 28, 28, 27, 27, 26, 26, 25, 25, 24, 24,
+ 24, 24, 23, 23, 23, 23, 36, 37, 38, 40, 41, 40, 39, 37, 36, 35, 34, 32,
+ 31, 30, 30, 29, 28, 27, 27, 26, 26, 25, 25, 24, 24, 24, 24, 23, 23, 23,
+ 23, 23, 34, 36, 37, 38, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28,
+ 27, 27, 26, 26, 25, 25, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 33, 34,
+ 35, 36, 37, 36, 36, 35, 34, 33, 32, 31, 30, 29, 28, 28, 27, 26, 26, 25,
+ 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 31, 33, 34, 35, 36, 35,
+ 34, 33, 32, 32, 31, 30, 29, 28, 28, 27, 26, 26, 25, 25, 24, 24, 24, 23,
+ 23, 23, 23, 22, 22, 22, 22, 22, 30, 31, 32, 33, 34, 34, 33, 32, 32, 31,
+ 30, 29, 28, 28, 27, 27, 26, 25, 25, 25, 24, 24, 23, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 30, 30, 31, 32, 33, 33, 32, 31, 31, 30, 29, 29, 28, 27,
+ 27, 26, 26, 25, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 29, 29, 30, 31, 32, 31, 31, 30, 30, 29, 29, 28, 27, 27, 26, 26, 25, 25,
+ 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 29, 30,
+ 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 25, 25, 24, 24, 24, 23, 23,
+ 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 28, 28, 29, 30, 31, 30, 30, 29,
+ 29, 28, 28, 27, 27, 26, 26, 25, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22,
+ 22, 22, 21, 21, 21, 21, 28, 28, 29, 30, 31, 30, 30, 29, 29, 28, 28, 27,
+ 27, 26, 26, 25, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 28, 28, 29, 30, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 25,
+ 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 155, 142, 90, 61, 142, 97, 72, 56, 90, 72, 54, 46, 61, 56, 46, 41,
+ /* Size 8 */
+ 131, 172, 159, 123, 94, 74, 62, 53, 172, 151, 157, 135, 108, 86, 70, 60,
+ 159, 157, 115, 99, 86, 74, 64, 56, 123, 135, 99, 80, 70, 63, 56, 51, 94,
+ 108, 86, 70, 61, 55, 50, 47, 74, 86, 74, 63, 55, 50, 46, 44, 62, 70, 64,
+ 56, 50, 46, 44, 42, 53, 60, 56, 51, 47, 44, 42, 40,
+ /* Size 16 */
+ 136, 157, 177, 171, 165, 146, 127, 112, 97, 87, 77, 70, 64, 59, 55, 55,
+ 157, 162, 167, 165, 163, 148, 133, 119, 104, 93, 83, 75, 68, 63, 58, 58,
+ 177, 167, 156, 159, 162, 151, 140, 126, 111, 100, 89, 81, 73, 67, 62,
+ 62, 171, 165, 159, 150, 140, 131, 121, 111, 100, 91, 83, 76, 69, 65, 60,
+ 60, 165, 163, 162, 140, 118, 110, 103, 96, 89, 83, 76, 71, 66, 62, 58,
+ 58, 146, 148, 151, 131, 110, 102, 93, 87, 81, 76, 71, 66, 62, 59, 55,
+ 55, 127, 133, 140, 121, 103, 93, 83, 78, 72, 68, 65, 61, 58, 56, 53, 53,
+ 112, 119, 126, 111, 96, 87, 78, 72, 67, 64, 61, 58, 55, 53, 51, 51, 97,
+ 104, 111, 100, 89, 81, 72, 67, 63, 60, 57, 54, 52, 50, 49, 49, 87, 93,
+ 100, 91, 83, 76, 68, 64, 60, 57, 54, 52, 50, 49, 47, 47, 77, 83, 89, 83,
+ 76, 71, 65, 61, 57, 54, 51, 50, 48, 47, 45, 45, 70, 75, 81, 76, 71, 66,
+ 61, 58, 54, 52, 50, 48, 46, 45, 44, 44, 64, 68, 73, 69, 66, 62, 58, 55,
+ 52, 50, 48, 46, 45, 44, 43, 43, 59, 63, 67, 65, 62, 59, 56, 53, 50, 49,
+ 47, 45, 44, 43, 42, 42, 55, 58, 62, 60, 58, 55, 53, 51, 49, 47, 45, 44,
+ 43, 42, 41, 41, 55, 58, 62, 60, 58, 55, 53, 51, 49, 47, 45, 44, 43, 42,
+ 41, 41,
+ /* Size 32 */
+ 138, 148, 159, 170, 180, 177, 174, 171, 167, 158, 148, 139, 129, 122,
+ 114, 106, 99, 94, 88, 83, 78, 75, 71, 68, 65, 62, 60, 58, 56, 56, 56,
+ 56, 148, 155, 162, 168, 175, 173, 171, 169, 167, 158, 150, 141, 132,
+ 125, 117, 110, 102, 97, 92, 86, 81, 78, 74, 71, 67, 65, 62, 60, 58, 58,
+ 58, 58, 159, 162, 164, 167, 169, 169, 168, 167, 166, 159, 151, 143, 136,
+ 128, 121, 113, 106, 100, 95, 90, 84, 80, 77, 73, 69, 67, 64, 62, 59, 59,
+ 59, 59, 170, 168, 167, 165, 164, 164, 165, 165, 166, 159, 152, 146, 139,
+ 132, 124, 117, 110, 104, 98, 93, 87, 83, 79, 76, 72, 69, 66, 64, 61, 61,
+ 61, 61, 180, 175, 169, 164, 158, 160, 162, 163, 165, 159, 154, 148, 142,
+ 135, 128, 120, 113, 107, 102, 96, 90, 86, 82, 78, 74, 71, 68, 66, 63,
+ 63, 63, 63, 177, 173, 169, 164, 160, 159, 157, 155, 154, 149, 143, 138,
+ 133, 126, 120, 114, 108, 102, 97, 92, 87, 83, 80, 76, 72, 70, 67, 64,
+ 62, 62, 62, 62, 174, 171, 168, 165, 162, 157, 152, 147, 143, 138, 133,
+ 128, 123, 118, 113, 107, 102, 97, 93, 88, 84, 81, 77, 74, 70, 68, 66,
+ 63, 61, 61, 61, 61, 171, 169, 167, 165, 163, 155, 147, 139, 132, 127,
+ 123, 118, 114, 109, 105, 101, 96, 92, 89, 85, 81, 78, 75, 72, 69, 66,
+ 64, 62, 60, 60, 60, 60, 167, 167, 166, 166, 165, 154, 143, 132, 120,
+ 116, 112, 108, 104, 101, 98, 94, 91, 87, 84, 81, 78, 75, 72, 70, 67, 65,
+ 63, 61, 59, 59, 59, 59, 158, 158, 159, 159, 159, 149, 138, 127, 116,
+ 112, 108, 104, 99, 96, 93, 90, 86, 83, 81, 78, 75, 72, 70, 67, 65, 63,
+ 61, 59, 57, 57, 57, 57, 148, 150, 151, 152, 154, 143, 133, 123, 112,
+ 108, 103, 99, 94, 91, 88, 85, 82, 79, 77, 74, 72, 70, 67, 65, 63, 61,
+ 60, 58, 56, 56, 56, 56, 139, 141, 143, 146, 148, 138, 128, 118, 108,
+ 104, 99, 94, 89, 86, 83, 81, 78, 76, 73, 71, 69, 67, 65, 63, 61, 60, 58,
+ 57, 55, 55, 55, 55, 129, 132, 136, 139, 142, 133, 123, 114, 104, 99, 94,
+ 89, 84, 81, 79, 76, 74, 72, 70, 68, 66, 64, 62, 61, 59, 58, 57, 55, 54,
+ 54, 54, 54, 122, 125, 128, 132, 135, 126, 118, 109, 101, 96, 91, 86, 81,
+ 79, 76, 74, 71, 69, 67, 66, 64, 62, 61, 59, 58, 56, 55, 54, 53, 53, 53,
+ 53, 114, 117, 121, 124, 128, 120, 113, 105, 98, 93, 88, 83, 79, 76, 74,
+ 71, 69, 67, 65, 63, 62, 60, 59, 57, 56, 55, 54, 53, 52, 52, 52, 52, 106,
+ 110, 113, 117, 120, 114, 107, 101, 94, 90, 85, 81, 76, 74, 71, 69, 66,
+ 64, 63, 61, 60, 58, 57, 56, 55, 54, 53, 52, 51, 51, 51, 51, 99, 102,
+ 106, 110, 113, 108, 102, 96, 91, 86, 82, 78, 74, 71, 69, 66, 64, 62, 61,
+ 59, 57, 56, 55, 54, 53, 52, 51, 50, 49, 49, 49, 49, 94, 97, 100, 104,
+ 107, 102, 97, 92, 87, 83, 79, 76, 72, 69, 67, 64, 62, 61, 59, 58, 56,
+ 55, 54, 53, 52, 51, 50, 49, 49, 49, 49, 49, 88, 92, 95, 98, 102, 97, 93,
+ 89, 84, 81, 77, 73, 70, 67, 65, 63, 61, 59, 58, 56, 55, 54, 53, 52, 51,
+ 50, 49, 49, 48, 48, 48, 48, 83, 86, 90, 93, 96, 92, 88, 85, 81, 78, 74,
+ 71, 68, 66, 63, 61, 59, 58, 56, 55, 54, 53, 52, 51, 50, 49, 48, 48, 47,
+ 47, 47, 47, 78, 81, 84, 87, 90, 87, 84, 81, 78, 75, 72, 69, 66, 64, 62,
+ 60, 57, 56, 55, 54, 52, 51, 50, 50, 49, 48, 47, 47, 46, 46, 46, 46, 75,
+ 78, 80, 83, 86, 83, 81, 78, 75, 72, 70, 67, 64, 62, 60, 58, 56, 55, 54,
+ 53, 51, 51, 50, 49, 48, 47, 47, 46, 46, 46, 46, 46, 71, 74, 77, 79, 82,
+ 80, 77, 75, 72, 70, 67, 65, 62, 61, 59, 57, 55, 54, 53, 52, 50, 50, 49,
+ 48, 47, 47, 46, 45, 45, 45, 45, 45, 68, 71, 73, 76, 78, 76, 74, 72, 70,
+ 67, 65, 63, 61, 59, 57, 56, 54, 53, 52, 51, 50, 49, 48, 47, 47, 46, 45,
+ 45, 44, 44, 44, 44, 65, 67, 69, 72, 74, 72, 70, 69, 67, 65, 63, 61, 59,
+ 58, 56, 55, 53, 52, 51, 50, 49, 48, 47, 47, 46, 45, 45, 44, 44, 44, 44,
+ 44, 62, 65, 67, 69, 71, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 54, 52,
+ 51, 50, 49, 48, 47, 47, 46, 45, 45, 44, 44, 43, 43, 43, 43, 60, 62, 64,
+ 66, 68, 67, 66, 64, 63, 61, 60, 58, 57, 55, 54, 53, 51, 50, 49, 48, 47,
+ 47, 46, 45, 45, 44, 44, 43, 43, 43, 43, 43, 58, 60, 62, 64, 66, 64, 63,
+ 62, 61, 59, 58, 57, 55, 54, 53, 52, 50, 49, 49, 48, 47, 46, 45, 45, 44,
+ 44, 43, 43, 42, 42, 42, 42, 56, 58, 59, 61, 63, 62, 61, 60, 59, 57, 56,
+ 55, 54, 53, 52, 51, 49, 49, 48, 47, 46, 46, 45, 44, 44, 43, 43, 42, 42,
+ 42, 42, 42, 56, 58, 59, 61, 63, 62, 61, 60, 59, 57, 56, 55, 54, 53, 52,
+ 51, 49, 49, 48, 47, 46, 46, 45, 44, 44, 43, 43, 42, 42, 42, 42, 42, 56,
+ 58, 59, 61, 63, 62, 61, 60, 59, 57, 56, 55, 54, 53, 52, 51, 49, 49, 48,
+ 47, 46, 46, 45, 44, 44, 43, 43, 42, 42, 42, 42, 42, 56, 58, 59, 61, 63,
+ 62, 61, 60, 59, 57, 56, 55, 54, 53, 52, 51, 49, 49, 48, 47, 46, 46, 45,
+ 44, 44, 43, 43, 42, 42, 42, 42, 42 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 42, 40, 32, 42, 36, 34, 31, 40, 34, 28, 25, 32, 31, 25, 22,
+ /* Size 8 */
+ 64, 78, 46, 43, 41, 37, 34, 31, 78, 56, 45, 49, 48, 44, 40, 36, 46, 45,
+ 39, 40, 41, 39, 36, 33, 43, 49, 40, 36, 35, 34, 32, 30, 41, 48, 41, 35,
+ 32, 30, 29, 28, 37, 44, 39, 34, 30, 28, 27, 26, 34, 40, 36, 32, 29, 27,
+ 25, 24, 31, 36, 33, 30, 28, 26, 24, 23,
+ /* Size 16 */
+ 64, 71, 78, 62, 46, 45, 43, 42, 41, 39, 37, 36, 34, 32, 31, 31, 71, 69,
+ 67, 56, 45, 46, 46, 45, 44, 43, 41, 39, 37, 35, 33, 33, 78, 67, 56, 50,
+ 45, 47, 49, 48, 48, 46, 44, 42, 40, 38, 36, 36, 62, 56, 50, 46, 42, 43,
+ 45, 44, 44, 43, 41, 40, 38, 36, 35, 35, 46, 45, 45, 42, 39, 40, 40, 40,
+ 41, 40, 39, 38, 36, 35, 33, 33, 45, 46, 47, 43, 40, 39, 38, 38, 38, 37,
+ 36, 35, 34, 33, 32, 32, 43, 46, 49, 45, 40, 38, 36, 36, 35, 34, 34, 33,
+ 32, 31, 30, 30, 42, 45, 48, 44, 40, 38, 36, 35, 34, 33, 32, 31, 31, 30,
+ 29, 29, 41, 44, 48, 44, 41, 38, 35, 34, 32, 31, 30, 30, 29, 28, 28, 28,
+ 39, 43, 46, 43, 40, 37, 34, 33, 31, 30, 29, 29, 28, 27, 27, 27, 37, 41,
+ 44, 41, 39, 36, 34, 32, 30, 29, 28, 27, 27, 26, 26, 26, 36, 39, 42, 40,
+ 38, 35, 33, 31, 30, 29, 27, 27, 26, 25, 25, 25, 34, 37, 40, 38, 36, 34,
+ 32, 31, 29, 28, 27, 26, 25, 25, 24, 24, 32, 35, 38, 36, 35, 33, 31, 30,
+ 28, 27, 26, 25, 25, 24, 23, 23, 31, 33, 36, 35, 33, 32, 30, 29, 28, 27,
+ 26, 25, 24, 23, 23, 23, 31, 33, 36, 35, 33, 32, 30, 29, 28, 27, 26, 25,
+ 24, 23, 23, 23,
+ /* Size 32 */
+ 64, 67, 71, 74, 78, 70, 62, 54, 46, 45, 45, 44, 43, 43, 42, 42, 41, 40,
+ 39, 38, 37, 36, 36, 35, 34, 33, 32, 31, 31, 31, 31, 31, 67, 69, 70, 71,
+ 72, 66, 59, 52, 45, 45, 45, 45, 45, 44, 44, 43, 43, 42, 41, 40, 39, 38,
+ 37, 36, 35, 34, 34, 33, 32, 32, 32, 32, 71, 70, 69, 68, 67, 61, 56, 51,
+ 45, 46, 46, 46, 46, 46, 45, 45, 44, 43, 43, 42, 41, 40, 39, 38, 37, 36,
+ 35, 34, 33, 33, 33, 33, 74, 71, 68, 65, 61, 57, 53, 49, 45, 46, 46, 47,
+ 48, 47, 47, 46, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 34,
+ 34, 34, 78, 72, 67, 61, 56, 53, 50, 48, 45, 46, 47, 48, 49, 49, 48, 48,
+ 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 36, 36, 36, 70, 66,
+ 61, 57, 53, 51, 48, 46, 44, 44, 45, 46, 47, 47, 46, 46, 46, 45, 44, 44,
+ 43, 42, 41, 40, 39, 38, 37, 36, 35, 35, 35, 35, 62, 59, 56, 53, 50, 48,
+ 46, 44, 42, 43, 43, 44, 45, 45, 44, 44, 44, 43, 43, 42, 41, 41, 40, 39,
+ 38, 37, 36, 35, 35, 35, 35, 35, 54, 52, 51, 49, 48, 46, 44, 42, 41, 41,
+ 42, 42, 43, 43, 42, 42, 42, 42, 41, 41, 40, 39, 39, 38, 37, 36, 36, 35,
+ 34, 34, 34, 34, 46, 45, 45, 45, 45, 44, 42, 41, 39, 40, 40, 40, 40, 40,
+ 40, 41, 41, 40, 40, 39, 39, 38, 38, 37, 36, 35, 35, 34, 33, 33, 33, 33,
+ 45, 45, 46, 46, 46, 44, 43, 41, 40, 40, 40, 39, 39, 39, 39, 39, 39, 39,
+ 38, 38, 38, 37, 36, 36, 35, 35, 34, 33, 33, 33, 33, 33, 45, 45, 46, 46,
+ 47, 45, 43, 42, 40, 40, 39, 39, 38, 38, 38, 38, 38, 37, 37, 37, 36, 36,
+ 35, 35, 34, 34, 33, 32, 32, 32, 32, 32, 44, 45, 46, 47, 48, 46, 44, 42,
+ 40, 39, 39, 38, 37, 37, 37, 37, 36, 36, 36, 35, 35, 35, 34, 34, 33, 33,
+ 32, 32, 31, 31, 31, 31, 43, 45, 46, 48, 49, 47, 45, 43, 40, 39, 38, 37,
+ 36, 36, 36, 35, 35, 35, 34, 34, 34, 33, 33, 33, 32, 32, 31, 31, 30, 30,
+ 30, 30, 43, 44, 46, 47, 49, 47, 45, 43, 40, 39, 38, 37, 36, 36, 35, 35,
+ 34, 34, 34, 33, 33, 33, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 42, 44,
+ 45, 47, 48, 46, 44, 42, 40, 39, 38, 37, 36, 35, 35, 34, 34, 33, 33, 32,
+ 32, 32, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 42, 43, 45, 46, 48, 46,
+ 44, 42, 41, 39, 38, 37, 35, 35, 34, 33, 33, 32, 32, 32, 31, 31, 31, 30,
+ 30, 29, 29, 29, 28, 28, 28, 28, 41, 43, 44, 46, 48, 46, 44, 42, 41, 39,
+ 38, 36, 35, 34, 34, 33, 32, 32, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28,
+ 28, 28, 28, 28, 40, 42, 43, 45, 47, 45, 43, 42, 40, 39, 37, 36, 35, 34,
+ 33, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 27,
+ 39, 41, 43, 44, 46, 44, 43, 41, 40, 38, 37, 36, 34, 34, 33, 32, 31, 31,
+ 30, 30, 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27, 27, 38, 40, 42, 43,
+ 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, 32, 31, 30, 30, 29, 29, 28,
+ 28, 28, 27, 27, 27, 26, 26, 26, 26, 26, 37, 39, 41, 42, 44, 43, 41, 40,
+ 39, 38, 36, 35, 34, 33, 32, 31, 30, 30, 29, 29, 28, 28, 27, 27, 27, 26,
+ 26, 26, 26, 26, 26, 26, 36, 38, 40, 41, 43, 42, 41, 39, 38, 37, 36, 35,
+ 33, 33, 32, 31, 30, 29, 29, 28, 28, 27, 27, 27, 26, 26, 26, 25, 25, 25,
+ 25, 25, 36, 37, 39, 40, 42, 41, 40, 39, 38, 36, 35, 34, 33, 32, 31, 31,
+ 30, 29, 29, 28, 27, 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 35, 36,
+ 38, 39, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 29, 28, 28,
+ 27, 27, 26, 26, 26, 25, 25, 25, 24, 24, 24, 24, 34, 35, 37, 38, 40, 39,
+ 38, 37, 36, 35, 34, 33, 32, 31, 31, 30, 29, 28, 28, 27, 27, 26, 26, 26,
+ 25, 25, 25, 24, 24, 24, 24, 24, 33, 34, 36, 37, 39, 38, 37, 36, 35, 35,
+ 34, 33, 32, 31, 30, 29, 29, 28, 28, 27, 26, 26, 26, 25, 25, 25, 24, 24,
+ 24, 24, 24, 24, 32, 34, 35, 36, 38, 37, 36, 36, 35, 34, 33, 32, 31, 31,
+ 30, 29, 28, 28, 27, 27, 26, 26, 25, 25, 25, 24, 24, 24, 23, 23, 23, 23,
+ 31, 33, 34, 35, 37, 36, 35, 35, 34, 33, 32, 32, 31, 30, 29, 29, 28, 28,
+ 27, 26, 26, 25, 25, 25, 24, 24, 24, 23, 23, 23, 23, 23, 31, 32, 33, 34,
+ 36, 35, 35, 34, 33, 33, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 26, 25,
+ 25, 24, 24, 24, 23, 23, 23, 23, 23, 23, 31, 32, 33, 34, 36, 35, 35, 34,
+ 33, 33, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 26, 25, 25, 24, 24, 24,
+ 23, 23, 23, 23, 23, 23, 31, 32, 33, 34, 36, 35, 35, 34, 33, 33, 32, 31,
+ 30, 30, 29, 28, 28, 27, 27, 26, 26, 25, 25, 24, 24, 24, 23, 23, 23, 23,
+ 23, 23, 31, 32, 33, 34, 36, 35, 35, 34, 33, 33, 32, 31, 30, 30, 29, 28,
+ 28, 27, 27, 26, 26, 25, 25, 24, 24, 24, 23, 23, 23, 23, 23, 23 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 133, 87, 80, 65, 87, 73, 69, 61, 80, 69, 56, 50, 65, 61, 50, 43,
+ /* Size 8 */
+ 124, 152, 87, 83, 78, 70, 63, 57, 152, 107, 86, 94, 91, 84, 75, 67, 87,
+ 86, 74, 77, 77, 73, 68, 62, 83, 94, 77, 69, 66, 63, 60, 56, 78, 91, 77,
+ 66, 60, 56, 54, 51, 70, 84, 73, 63, 56, 52, 49, 47, 63, 75, 68, 60, 54,
+ 49, 46, 44, 57, 67, 62, 56, 51, 47, 44, 41,
+ /* Size 16 */
+ 127, 141, 155, 122, 89, 87, 84, 82, 80, 76, 72, 68, 65, 61, 58, 58, 141,
+ 137, 133, 111, 88, 89, 90, 88, 87, 83, 79, 75, 71, 67, 63, 63, 155, 133,
+ 110, 99, 88, 92, 96, 95, 93, 90, 86, 81, 77, 73, 69, 69, 122, 111, 99,
+ 90, 82, 85, 87, 87, 86, 83, 80, 77, 73, 70, 66, 66, 89, 88, 88, 82, 76,
+ 77, 78, 79, 79, 77, 75, 72, 70, 67, 64, 64, 87, 89, 92, 85, 77, 76, 74,
+ 74, 73, 71, 70, 68, 66, 63, 61, 61, 84, 90, 96, 87, 78, 74, 70, 69, 67,
+ 66, 65, 63, 62, 60, 58, 58, 82, 88, 95, 87, 79, 74, 69, 67, 64, 63, 61,
+ 60, 58, 57, 55, 55, 80, 87, 93, 86, 79, 73, 67, 64, 61, 59, 58, 56, 55,
+ 54, 52, 52, 76, 83, 90, 83, 77, 71, 66, 63, 59, 57, 55, 54, 53, 51, 50,
+ 50, 72, 79, 86, 80, 75, 70, 65, 61, 58, 55, 53, 52, 50, 49, 48, 48, 68,
+ 75, 81, 77, 72, 68, 63, 60, 56, 54, 52, 50, 49, 47, 46, 46, 65, 71, 77,
+ 73, 70, 66, 62, 58, 55, 53, 50, 49, 47, 46, 45, 45, 61, 67, 73, 70, 67,
+ 63, 60, 57, 54, 51, 49, 47, 46, 45, 43, 43, 58, 63, 69, 66, 64, 61, 58,
+ 55, 52, 50, 48, 46, 45, 43, 42, 42, 58, 63, 69, 66, 64, 61, 58, 55, 52,
+ 50, 48, 46, 45, 43, 42, 42,
+ /* Size 32 */
+ 129, 136, 143, 150, 157, 141, 124, 107, 90, 89, 88, 87, 86, 84, 83, 82,
+ 81, 79, 77, 75, 73, 71, 69, 67, 65, 64, 62, 60, 59, 59, 59, 59, 136,
+ 138, 141, 143, 146, 132, 118, 104, 90, 90, 89, 89, 88, 87, 86, 85, 84,
+ 82, 80, 78, 76, 74, 72, 71, 69, 67, 65, 63, 62, 62, 62, 62, 143, 141,
+ 139, 137, 134, 123, 112, 101, 89, 90, 90, 91, 91, 90, 89, 89, 88, 86,
+ 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 64, 64, 64, 150, 143, 137,
+ 130, 123, 114, 106, 98, 89, 90, 92, 93, 94, 93, 93, 92, 91, 89, 87, 85,
+ 83, 81, 79, 77, 75, 73, 71, 69, 67, 67, 67, 67, 157, 146, 134, 123, 111,
+ 106, 100, 94, 89, 91, 93, 95, 97, 97, 96, 95, 95, 93, 91, 89, 87, 85,
+ 82, 80, 78, 76, 74, 72, 70, 70, 70, 70, 141, 132, 123, 114, 106, 101,
+ 96, 91, 86, 88, 89, 91, 93, 92, 92, 91, 91, 89, 87, 86, 84, 82, 80, 78,
+ 76, 74, 72, 70, 68, 68, 68, 68, 124, 118, 112, 106, 100, 96, 91, 87, 83,
+ 84, 86, 87, 88, 88, 88, 87, 87, 86, 84, 83, 81, 80, 78, 76, 74, 72, 71,
+ 69, 67, 67, 67, 67, 107, 104, 101, 98, 94, 91, 87, 84, 80, 81, 82, 83,
+ 84, 84, 84, 83, 83, 82, 81, 80, 79, 77, 76, 74, 72, 71, 69, 68, 66, 66,
+ 66, 66, 90, 90, 89, 89, 89, 86, 83, 80, 77, 78, 78, 79, 79, 79, 80, 80,
+ 80, 79, 78, 77, 76, 75, 73, 72, 71, 69, 68, 66, 65, 65, 65, 65, 89, 90,
+ 90, 90, 91, 88, 84, 81, 78, 78, 77, 77, 77, 77, 77, 77, 77, 76, 75, 74,
+ 73, 72, 71, 70, 69, 67, 66, 64, 63, 63, 63, 63, 88, 89, 90, 92, 93, 89,
+ 86, 82, 78, 77, 77, 76, 75, 75, 75, 74, 74, 73, 72, 72, 71, 70, 69, 68,
+ 66, 65, 64, 63, 62, 62, 62, 62, 87, 89, 91, 93, 95, 91, 87, 83, 79, 77,
+ 76, 75, 73, 73, 72, 72, 71, 70, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61,
+ 60, 60, 60, 60, 86, 88, 91, 94, 97, 93, 88, 84, 79, 77, 75, 73, 71, 70,
+ 70, 69, 68, 68, 67, 66, 66, 65, 64, 63, 62, 61, 60, 59, 59, 59, 59, 59,
+ 84, 87, 90, 93, 97, 92, 88, 84, 79, 77, 75, 73, 70, 69, 68, 68, 67, 66,
+ 65, 65, 64, 63, 62, 61, 61, 60, 59, 58, 57, 57, 57, 57, 83, 86, 89, 93,
+ 96, 92, 88, 84, 80, 77, 75, 72, 70, 68, 67, 66, 65, 64, 64, 63, 62, 61,
+ 60, 60, 59, 58, 57, 57, 56, 56, 56, 56, 82, 85, 89, 92, 95, 91, 87, 83,
+ 80, 77, 74, 72, 69, 68, 66, 65, 63, 63, 62, 61, 60, 59, 59, 58, 57, 57,
+ 56, 55, 54, 54, 54, 54, 81, 84, 88, 91, 95, 91, 87, 83, 80, 77, 74, 71,
+ 68, 67, 65, 63, 62, 61, 60, 59, 58, 58, 57, 56, 56, 55, 54, 54, 53, 53,
+ 53, 53, 79, 82, 86, 89, 93, 89, 86, 82, 79, 76, 73, 70, 68, 66, 64, 63,
+ 61, 60, 59, 58, 57, 57, 56, 55, 54, 54, 53, 52, 52, 52, 52, 52, 77, 80,
+ 84, 87, 91, 87, 84, 81, 78, 75, 72, 70, 67, 65, 64, 62, 60, 59, 58, 57,
+ 56, 55, 55, 54, 53, 53, 52, 51, 51, 51, 51, 51, 75, 78, 82, 85, 89, 86,
+ 83, 80, 77, 74, 72, 69, 66, 65, 63, 61, 59, 58, 57, 56, 55, 54, 53, 53,
+ 52, 51, 51, 50, 50, 50, 50, 50, 73, 76, 80, 83, 87, 84, 81, 79, 76, 73,
+ 71, 68, 66, 64, 62, 60, 58, 57, 56, 55, 54, 53, 52, 52, 51, 50, 50, 49,
+ 48, 48, 48, 48, 71, 74, 78, 81, 85, 82, 80, 77, 75, 72, 70, 67, 65, 63,
+ 61, 59, 58, 57, 55, 54, 53, 52, 52, 51, 50, 49, 49, 48, 48, 48, 48, 48,
+ 69, 72, 76, 79, 82, 80, 78, 76, 73, 71, 69, 66, 64, 62, 60, 59, 57, 56,
+ 55, 53, 52, 52, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 67, 71, 74, 77,
+ 80, 78, 76, 74, 72, 70, 68, 65, 63, 61, 60, 58, 56, 55, 54, 53, 52, 51,
+ 50, 49, 48, 48, 47, 47, 46, 46, 46, 46, 65, 69, 72, 75, 78, 76, 74, 72,
+ 71, 69, 66, 64, 62, 61, 59, 57, 56, 54, 53, 52, 51, 50, 49, 48, 48, 47,
+ 46, 46, 45, 45, 45, 45, 64, 67, 70, 73, 76, 74, 72, 71, 69, 67, 65, 63,
+ 61, 60, 58, 57, 55, 54, 53, 51, 50, 49, 49, 48, 47, 46, 46, 45, 45, 45,
+ 45, 45, 62, 65, 68, 71, 74, 72, 71, 69, 68, 66, 64, 62, 60, 59, 57, 56,
+ 54, 53, 52, 51, 50, 49, 48, 47, 46, 46, 45, 45, 44, 44, 44, 44, 60, 63,
+ 66, 69, 72, 70, 69, 68, 66, 64, 63, 61, 59, 58, 57, 55, 54, 52, 51, 50,
+ 49, 48, 47, 47, 46, 45, 45, 44, 43, 43, 43, 43, 59, 62, 64, 67, 70, 68,
+ 67, 66, 65, 63, 62, 60, 59, 57, 56, 54, 53, 52, 51, 50, 48, 48, 47, 46,
+ 45, 45, 44, 43, 43, 43, 43, 43, 59, 62, 64, 67, 70, 68, 67, 66, 65, 63,
+ 62, 60, 59, 57, 56, 54, 53, 52, 51, 50, 48, 48, 47, 46, 45, 45, 44, 43,
+ 43, 43, 43, 43, 59, 62, 64, 67, 70, 68, 67, 66, 65, 63, 62, 60, 59, 57,
+ 56, 54, 53, 52, 51, 50, 48, 48, 47, 46, 45, 45, 44, 43, 43, 43, 43, 43,
+ 59, 62, 64, 67, 70, 68, 67, 66, 65, 63, 62, 60, 59, 57, 56, 54, 53, 52,
+ 51, 50, 48, 48, 47, 46, 45, 45, 44, 43, 43, 43, 43, 43 } } },
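Each Size-N block carries exactly N*N weights, so the four surfaces of one matrix total 16 + 64 + 256 + 1024 = 1360 values. If the surfaces were packed into one contiguous buffer in the order they appear here, the start offsets would be fixed; a sketch of that arithmetic only, not of the storage scheme the library itself uses:

    /* Start offset of the NxN surface in a hypothetical packed buffer
       ordered 4x4, 8x8, 16x16, 32x32 (as the blocks appear above). */
    static int qm_surface_offset(int n) {
      switch (n) {
        case 4:  return 0;              /* weights   0..15   */
        case 8:  return 16;             /* weights  16..79   */
        case 16: return 16 + 64;        /* weights  80..335  */
        case 32: return 16 + 64 + 256;  /* weights 336..1359 */
        default: return -1;             /* unsupported size  */
      }
    }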
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 59, 40, 29, 59, 42, 33, 27, 40, 33, 26, 23, 29, 27, 23, 21,
+ /* Size 8 */
+ 64, 82, 76, 60, 48, 39, 34, 30, 82, 73, 75, 66, 54, 44, 38, 33, 76, 75,
+ 57, 50, 44, 39, 35, 31, 60, 66, 50, 42, 37, 34, 31, 29, 48, 54, 44, 37,
+ 33, 31, 29, 27, 39, 44, 39, 34, 31, 29, 27, 26, 34, 38, 35, 31, 29, 27,
+ 26, 25, 30, 33, 31, 29, 27, 26, 25, 24,
+ /* Size 16 */
+ 64, 73, 82, 79, 76, 68, 60, 54, 48, 44, 39, 36, 34, 32, 30, 30, 73, 75,
+ 77, 76, 76, 69, 63, 57, 51, 46, 42, 39, 36, 34, 31, 31, 82, 77, 73, 74,
+ 75, 71, 66, 60, 54, 49, 44, 41, 38, 35, 33, 33, 79, 76, 74, 70, 66, 62,
+ 58, 54, 49, 45, 42, 39, 36, 34, 32, 32, 76, 76, 75, 66, 57, 53, 50, 47,
+ 44, 42, 39, 37, 35, 33, 31, 31, 68, 69, 71, 62, 53, 50, 46, 43, 41, 39,
+ 37, 35, 33, 32, 30, 30, 60, 63, 66, 58, 50, 46, 42, 40, 37, 36, 34, 33,
+ 31, 30, 29, 29, 54, 57, 60, 54, 47, 43, 40, 37, 35, 34, 32, 31, 30, 29,
+ 28, 28, 48, 51, 54, 49, 44, 41, 37, 35, 33, 32, 31, 30, 29, 28, 27, 27,
+ 44, 46, 49, 45, 42, 39, 36, 34, 32, 31, 30, 29, 28, 27, 27, 27, 39, 42,
+ 44, 42, 39, 37, 34, 32, 31, 30, 29, 28, 27, 27, 26, 26, 36, 39, 41, 39,
+ 37, 35, 33, 31, 30, 29, 28, 27, 26, 26, 25, 25, 34, 36, 38, 36, 35, 33,
+ 31, 30, 29, 28, 27, 26, 26, 25, 25, 25, 32, 34, 35, 34, 33, 32, 30, 29,
+ 28, 27, 27, 26, 25, 25, 25, 25, 30, 31, 33, 32, 31, 30, 29, 28, 27, 27,
+ 26, 25, 25, 25, 24, 24, 30, 31, 33, 32, 31, 30, 29, 28, 27, 27, 26, 25,
+ 25, 25, 24, 24,
+ /* Size 32 */
+ 64, 68, 73, 77, 82, 80, 79, 78, 76, 72, 68, 64, 60, 57, 54, 51, 48, 46,
+ 44, 41, 39, 38, 36, 35, 34, 33, 32, 31, 30, 30, 30, 30, 68, 71, 74, 77,
+ 79, 79, 78, 77, 76, 72, 69, 65, 62, 59, 56, 52, 49, 47, 45, 43, 41, 39,
+ 38, 36, 35, 34, 33, 32, 31, 31, 31, 31, 73, 74, 75, 76, 77, 77, 76, 76,
+ 76, 73, 69, 66, 63, 60, 57, 54, 51, 49, 46, 44, 42, 40, 39, 37, 36, 35,
+ 34, 32, 31, 31, 31, 31, 77, 77, 76, 75, 75, 75, 75, 75, 76, 73, 70, 67,
+ 64, 61, 58, 55, 52, 50, 48, 45, 43, 41, 40, 38, 37, 35, 34, 33, 32, 32,
+ 32, 32, 82, 79, 77, 75, 73, 73, 74, 75, 75, 73, 71, 68, 66, 63, 60, 57,
+ 54, 51, 49, 47, 44, 43, 41, 39, 38, 36, 35, 34, 33, 33, 33, 33, 80, 79,
+ 77, 75, 73, 73, 72, 71, 71, 68, 66, 64, 62, 59, 57, 54, 51, 49, 47, 45,
+ 43, 41, 40, 38, 37, 36, 35, 34, 32, 32, 32, 32, 79, 78, 76, 75, 74, 72,
+ 70, 68, 66, 64, 62, 60, 58, 56, 54, 51, 49, 47, 45, 44, 42, 40, 39, 37,
+ 36, 35, 34, 33, 32, 32, 32, 32, 78, 77, 76, 75, 75, 71, 68, 65, 61, 60,
+ 58, 56, 54, 52, 50, 49, 47, 45, 44, 42, 40, 39, 38, 37, 35, 34, 33, 33,
+ 32, 32, 32, 32, 76, 76, 76, 76, 75, 71, 66, 61, 57, 55, 53, 52, 50, 49,
+ 47, 46, 44, 43, 42, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 31, 31, 31,
+ 72, 72, 73, 73, 73, 68, 64, 60, 55, 53, 52, 50, 48, 47, 45, 44, 43, 41,
+ 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 31, 31, 31, 31, 68, 69, 69, 70,
+ 71, 66, 62, 58, 53, 52, 50, 48, 46, 45, 43, 42, 41, 40, 39, 38, 37, 36,
+ 35, 34, 33, 32, 32, 31, 30, 30, 30, 30, 64, 65, 66, 67, 68, 64, 60, 56,
+ 52, 50, 48, 46, 44, 43, 41, 40, 39, 38, 37, 36, 35, 35, 34, 33, 32, 32,
+ 31, 30, 30, 30, 30, 30, 60, 62, 63, 64, 66, 62, 58, 54, 50, 48, 46, 44,
+ 42, 41, 40, 38, 37, 37, 36, 35, 34, 33, 33, 32, 31, 31, 30, 30, 29, 29,
+ 29, 29, 57, 59, 60, 61, 63, 59, 56, 52, 49, 47, 45, 43, 41, 40, 38, 37,
+ 36, 36, 35, 34, 33, 33, 32, 31, 31, 30, 30, 29, 29, 29, 29, 29, 54, 56,
+ 57, 58, 60, 57, 54, 50, 47, 45, 43, 41, 40, 38, 37, 36, 35, 35, 34, 33,
+ 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 28, 28, 51, 52, 54, 55, 57, 54,
+ 51, 49, 46, 44, 42, 40, 38, 37, 36, 35, 34, 34, 33, 32, 32, 31, 31, 30,
+ 30, 29, 29, 28, 28, 28, 28, 28, 48, 49, 51, 52, 54, 51, 49, 47, 44, 43,
+ 41, 39, 37, 36, 35, 34, 33, 33, 32, 31, 31, 30, 30, 29, 29, 28, 28, 28,
+ 27, 27, 27, 27, 46, 47, 49, 50, 51, 49, 47, 45, 43, 41, 40, 38, 37, 36,
+ 35, 34, 33, 32, 31, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27,
+ 44, 45, 46, 48, 49, 47, 45, 44, 42, 40, 39, 37, 36, 35, 34, 33, 32, 31,
+ 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27, 27, 41, 43, 44, 45,
+ 47, 45, 44, 42, 40, 39, 38, 36, 35, 34, 33, 32, 31, 31, 30, 30, 29, 29,
+ 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 39, 41, 42, 43, 44, 43, 42, 40,
+ 39, 38, 37, 35, 34, 33, 32, 32, 31, 30, 30, 29, 29, 28, 28, 27, 27, 27,
+ 27, 26, 26, 26, 26, 26, 38, 39, 40, 41, 43, 41, 40, 39, 38, 37, 36, 35,
+ 33, 33, 32, 31, 30, 30, 29, 29, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26,
+ 26, 26, 36, 38, 39, 40, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 31,
+ 30, 29, 29, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 25, 25, 25, 35, 36,
+ 37, 38, 39, 38, 37, 37, 36, 35, 34, 33, 32, 31, 31, 30, 29, 29, 28, 28,
+ 27, 27, 27, 26, 26, 26, 26, 25, 25, 25, 25, 25, 34, 35, 36, 37, 38, 37,
+ 36, 35, 35, 34, 33, 32, 31, 31, 30, 30, 29, 28, 28, 28, 27, 27, 26, 26,
+ 26, 26, 25, 25, 25, 25, 25, 25, 33, 34, 35, 35, 36, 36, 35, 34, 34, 33,
+ 32, 32, 31, 30, 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 25, 25,
+ 25, 25, 25, 25, 32, 33, 34, 34, 35, 35, 34, 33, 33, 32, 32, 31, 30, 30,
+ 29, 29, 28, 28, 27, 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 25, 25,
+ 31, 32, 32, 33, 34, 34, 33, 33, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27,
+ 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 24, 24, 30, 31, 31, 32,
+ 33, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 27, 26, 26, 26,
+ 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 30, 31, 31, 32, 33, 32, 32, 32,
+ 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 27, 26, 26, 26, 25, 25, 25, 25,
+ 25, 24, 24, 24, 24, 24, 30, 31, 31, 32, 33, 32, 32, 32, 31, 31, 30, 30,
+ 29, 29, 28, 28, 27, 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 24,
+ 24, 24, 30, 31, 31, 32, 33, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28,
+ 27, 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 141, 130, 85, 60, 130, 91, 70, 56, 85, 70, 55, 48, 60, 56, 48, 43,
+ /* Size 8 */
+ 120, 155, 144, 113, 88, 72, 61, 53, 155, 137, 143, 124, 100, 82, 68, 59,
+ 144, 143, 106, 93, 82, 71, 63, 56, 113, 124, 93, 77, 68, 62, 56, 52, 88,
+ 100, 82, 68, 60, 55, 51, 48, 72, 82, 71, 62, 55, 51, 48, 46, 61, 68, 63,
+ 56, 51, 48, 45, 44, 53, 59, 56, 52, 48, 46, 44, 42,
+ /* Size 16 */
+ 124, 142, 160, 154, 149, 133, 117, 104, 91, 82, 74, 68, 63, 59, 55, 55,
+ 142, 146, 150, 149, 148, 135, 122, 110, 97, 88, 79, 73, 66, 62, 58, 58,
+ 160, 150, 141, 144, 147, 137, 128, 115, 103, 94, 84, 77, 70, 66, 61, 61,
+ 154, 149, 144, 136, 128, 120, 112, 103, 94, 86, 79, 73, 67, 63, 59, 59,
+ 149, 148, 147, 128, 109, 103, 96, 90, 84, 79, 73, 69, 64, 61, 58, 58,
+ 133, 135, 137, 120, 103, 95, 87, 82, 77, 73, 68, 65, 61, 58, 56, 56,
+ 117, 122, 128, 112, 96, 87, 79, 74, 70, 67, 63, 61, 58, 56, 53, 53, 104,
+ 110, 115, 103, 90, 82, 74, 70, 66, 63, 60, 58, 55, 53, 52, 52, 91, 97,
+ 103, 94, 84, 77, 70, 66, 62, 59, 57, 55, 53, 51, 50, 50, 82, 88, 94, 86,
+ 79, 73, 67, 63, 59, 57, 54, 53, 51, 50, 48, 48, 74, 79, 84, 79, 73, 68,
+ 63, 60, 57, 54, 52, 51, 49, 48, 47, 47, 68, 73, 77, 73, 69, 65, 61, 58,
+ 55, 53, 51, 49, 48, 47, 46, 46, 63, 66, 70, 67, 64, 61, 58, 55, 53, 51,
+ 49, 48, 47, 46, 45, 45, 59, 62, 66, 63, 61, 58, 56, 53, 51, 50, 48, 47,
+ 46, 45, 44, 44, 55, 58, 61, 59, 58, 56, 53, 52, 50, 48, 47, 46, 45, 44,
+ 44, 44, 55, 58, 61, 59, 58, 56, 53, 52, 50, 48, 47, 46, 45, 44, 44, 44,
+ /* Size 32 */
+ 126, 135, 144, 153, 162, 159, 157, 154, 151, 143, 135, 127, 118, 112,
+ 105, 99, 92, 88, 84, 79, 75, 72, 69, 66, 63, 62, 60, 58, 56, 56, 56, 56,
+ 135, 141, 146, 152, 157, 156, 154, 152, 151, 143, 136, 129, 121, 115,
+ 108, 102, 96, 91, 87, 82, 78, 75, 71, 68, 65, 63, 61, 59, 57, 57, 57,
+ 57, 144, 146, 148, 151, 153, 152, 151, 151, 150, 144, 137, 130, 124,
+ 118, 111, 105, 99, 94, 89, 85, 80, 77, 74, 71, 67, 65, 63, 61, 59, 59,
+ 59, 59, 153, 152, 151, 149, 148, 148, 149, 149, 150, 144, 138, 132, 127,
+ 120, 114, 108, 102, 97, 92, 87, 83, 79, 76, 73, 69, 67, 65, 63, 60, 60,
+ 60, 60, 162, 157, 153, 148, 143, 145, 146, 148, 149, 144, 139, 134, 130,
+ 123, 117, 111, 105, 100, 95, 90, 85, 82, 78, 75, 71, 69, 67, 64, 62, 62,
+ 62, 62, 159, 156, 152, 148, 145, 143, 142, 141, 139, 135, 130, 126, 121,
+ 116, 111, 105, 100, 96, 91, 87, 83, 79, 76, 73, 70, 68, 65, 63, 61, 61,
+ 61, 61, 157, 154, 151, 149, 146, 142, 138, 134, 130, 126, 122, 118, 113,
+ 109, 104, 100, 95, 91, 88, 84, 80, 77, 74, 71, 68, 66, 64, 62, 60, 60,
+ 60, 60, 154, 152, 151, 149, 148, 141, 134, 127, 120, 117, 113, 109, 105,
+ 102, 98, 94, 90, 87, 84, 81, 77, 75, 72, 69, 67, 65, 63, 61, 59, 59, 59,
+ 59, 151, 151, 150, 150, 149, 139, 130, 120, 111, 108, 104, 101, 97, 94,
+ 91, 89, 86, 83, 80, 77, 75, 72, 70, 68, 65, 64, 62, 60, 58, 58, 58, 58,
+ 143, 143, 144, 144, 144, 135, 126, 117, 108, 104, 100, 97, 93, 90, 87,
+ 85, 82, 79, 77, 74, 72, 70, 68, 66, 64, 62, 61, 59, 57, 57, 57, 57, 135,
+ 136, 137, 138, 139, 130, 122, 113, 104, 100, 96, 93, 89, 86, 84, 81, 78,
+ 76, 74, 72, 69, 68, 66, 64, 62, 61, 59, 58, 56, 56, 56, 56, 127, 129,
+ 130, 132, 134, 126, 118, 109, 101, 97, 93, 88, 84, 82, 80, 77, 75, 73,
+ 71, 69, 67, 65, 64, 62, 60, 59, 58, 57, 55, 55, 55, 55, 118, 121, 124,
+ 127, 130, 121, 113, 105, 97, 93, 89, 84, 80, 78, 76, 73, 71, 69, 68, 66,
+ 64, 63, 62, 60, 59, 58, 57, 55, 54, 54, 54, 54, 112, 115, 118, 120, 123,
+ 116, 109, 102, 94, 90, 86, 82, 78, 76, 73, 71, 69, 67, 66, 64, 63, 61,
+ 60, 59, 58, 56, 55, 54, 53, 53, 53, 53, 105, 108, 111, 114, 117, 111,
+ 104, 98, 91, 87, 84, 80, 76, 73, 71, 69, 67, 65, 64, 62, 61, 60, 59, 57,
+ 56, 55, 54, 53, 52, 52, 52, 52, 99, 102, 105, 108, 111, 105, 100, 94,
+ 89, 85, 81, 77, 73, 71, 69, 67, 65, 63, 62, 60, 59, 58, 57, 56, 55, 54,
+ 53, 52, 51, 51, 51, 51, 92, 96, 99, 102, 105, 100, 95, 90, 86, 82, 78,
+ 75, 71, 69, 67, 65, 63, 61, 60, 59, 57, 56, 55, 55, 54, 53, 52, 51, 51,
+ 51, 51, 51, 88, 91, 94, 97, 100, 96, 91, 87, 83, 79, 76, 73, 69, 67, 65,
+ 63, 61, 60, 59, 57, 56, 55, 54, 54, 53, 52, 51, 51, 50, 50, 50, 50, 84,
+ 87, 89, 92, 95, 91, 88, 84, 80, 77, 74, 71, 68, 66, 64, 62, 60, 59, 58,
+ 56, 55, 54, 53, 53, 52, 51, 50, 50, 49, 49, 49, 49, 79, 82, 85, 87, 90,
+ 87, 84, 81, 77, 74, 72, 69, 66, 64, 62, 60, 59, 57, 56, 55, 54, 53, 52,
+ 52, 51, 50, 50, 49, 48, 48, 48, 48, 75, 78, 80, 83, 85, 83, 80, 77, 75,
+ 72, 69, 67, 64, 63, 61, 59, 57, 56, 55, 54, 53, 52, 51, 51, 50, 49, 49,
+ 48, 48, 48, 48, 48, 72, 75, 77, 79, 82, 79, 77, 75, 72, 70, 68, 65, 63,
+ 61, 60, 58, 56, 55, 54, 53, 52, 51, 51, 50, 49, 49, 48, 48, 47, 47, 47,
+ 47, 69, 71, 74, 76, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 59, 57, 55,
+ 54, 53, 52, 51, 51, 50, 49, 49, 48, 48, 47, 47, 47, 47, 47, 66, 68, 71,
+ 73, 75, 73, 71, 69, 68, 66, 64, 62, 60, 59, 57, 56, 55, 54, 53, 52, 51,
+ 50, 49, 49, 48, 48, 47, 47, 46, 46, 46, 46, 63, 65, 67, 69, 71, 70, 68,
+ 67, 65, 64, 62, 60, 59, 58, 56, 55, 54, 53, 52, 51, 50, 49, 49, 48, 47,
+ 47, 47, 46, 46, 46, 46, 46, 62, 63, 65, 67, 69, 68, 66, 65, 64, 62, 61,
+ 59, 58, 56, 55, 54, 53, 52, 51, 50, 49, 49, 48, 48, 47, 47, 46, 46, 45,
+ 45, 45, 45, 60, 61, 63, 65, 67, 65, 64, 63, 62, 61, 59, 58, 57, 55, 54,
+ 53, 52, 51, 50, 50, 49, 48, 48, 47, 47, 46, 46, 45, 45, 45, 45, 45, 58,
+ 59, 61, 63, 64, 63, 62, 61, 60, 59, 58, 57, 55, 54, 53, 52, 51, 51, 50,
+ 49, 48, 48, 47, 47, 46, 46, 45, 45, 45, 45, 45, 45, 56, 57, 59, 60, 62,
+ 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 51, 50, 49, 48, 48, 47, 47,
+ 46, 46, 45, 45, 45, 44, 44, 44, 44, 56, 57, 59, 60, 62, 61, 60, 59, 58,
+ 57, 56, 55, 54, 53, 52, 51, 51, 50, 49, 48, 48, 47, 47, 46, 46, 45, 45,
+ 45, 44, 44, 44, 44, 56, 57, 59, 60, 62, 61, 60, 59, 58, 57, 56, 55, 54,
+ 53, 52, 51, 51, 50, 49, 48, 48, 47, 47, 46, 46, 45, 45, 45, 44, 44, 44,
+ 44, 56, 57, 59, 60, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 51,
+ 50, 49, 48, 48, 47, 47, 46, 46, 45, 45, 45, 44, 44, 44, 44 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 44, 41, 34, 44, 38, 36, 33, 41, 36, 30, 27, 34, 33, 27, 25,
+ /* Size 8 */
+ 64, 77, 47, 45, 42, 39, 36, 33, 77, 56, 46, 50, 49, 45, 41, 37, 47, 46,
+ 41, 42, 42, 40, 38, 35, 45, 50, 42, 38, 37, 36, 34, 32, 42, 49, 42, 37,
+ 34, 32, 31, 30, 39, 45, 40, 36, 32, 30, 29, 28, 36, 41, 38, 34, 31, 29,
+ 27, 26, 33, 37, 35, 32, 30, 28, 26, 25,
+ /* Size 16 */
+ 64, 71, 77, 62, 47, 46, 45, 43, 42, 41, 39, 37, 36, 34, 33, 33, 71, 69,
+ 67, 57, 46, 47, 47, 46, 46, 44, 42, 40, 38, 37, 35, 35, 77, 67, 56, 51,
+ 46, 48, 50, 49, 49, 47, 45, 43, 41, 39, 37, 37, 62, 57, 51, 47, 43, 45,
+ 46, 46, 45, 44, 43, 41, 40, 38, 36, 36, 47, 46, 46, 43, 41, 41, 42, 42,
+ 42, 41, 40, 39, 38, 37, 35, 35, 46, 47, 48, 45, 41, 41, 40, 40, 39, 39,
+ 38, 37, 36, 35, 34, 34, 45, 47, 50, 46, 42, 40, 38, 37, 37, 36, 36, 35,
+ 34, 33, 32, 32, 43, 46, 49, 46, 42, 40, 37, 36, 35, 35, 34, 33, 33, 32,
+ 31, 31, 42, 46, 49, 45, 42, 39, 37, 35, 34, 33, 32, 32, 31, 30, 30, 30,
+ 41, 44, 47, 44, 41, 39, 36, 35, 33, 32, 31, 31, 30, 29, 29, 29, 39, 42,
+ 45, 43, 40, 38, 36, 34, 32, 31, 30, 30, 29, 28, 28, 28, 37, 40, 43, 41,
+ 39, 37, 35, 33, 32, 31, 30, 29, 28, 28, 27, 27, 36, 38, 41, 40, 38, 36,
+ 34, 33, 31, 30, 29, 28, 27, 27, 26, 26, 34, 37, 39, 38, 37, 35, 33, 32,
+ 30, 29, 28, 28, 27, 26, 26, 26, 33, 35, 37, 36, 35, 34, 32, 31, 30, 29,
+ 28, 27, 26, 26, 25, 25, 33, 35, 37, 36, 35, 34, 32, 31, 30, 29, 28, 27,
+ 26, 26, 25, 25,
+ /* Size 32 */
+ 64, 67, 71, 74, 77, 69, 62, 54, 47, 46, 46, 45, 45, 44, 43, 43, 42, 42,
+ 41, 40, 39, 38, 37, 36, 36, 35, 34, 33, 33, 33, 33, 33, 67, 68, 70, 71,
+ 72, 66, 59, 53, 47, 46, 46, 46, 46, 45, 45, 44, 44, 43, 42, 41, 40, 40,
+ 39, 38, 37, 36, 35, 35, 34, 34, 34, 34, 71, 70, 69, 68, 67, 62, 57, 51,
+ 46, 47, 47, 47, 47, 47, 46, 46, 46, 45, 44, 43, 42, 41, 40, 39, 38, 38,
+ 37, 36, 35, 35, 35, 35, 74, 71, 68, 65, 61, 58, 54, 50, 46, 47, 47, 48,
+ 49, 48, 48, 47, 47, 46, 45, 44, 44, 43, 42, 41, 40, 39, 38, 37, 36, 36,
+ 36, 36, 77, 72, 67, 61, 56, 54, 51, 49, 46, 47, 48, 49, 50, 50, 49, 49,
+ 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 37, 37, 37, 69, 66,
+ 62, 58, 54, 51, 49, 47, 45, 46, 46, 47, 48, 48, 47, 47, 47, 46, 45, 45,
+ 44, 43, 42, 41, 40, 39, 39, 38, 37, 37, 37, 37, 62, 59, 57, 54, 51, 49,
+ 47, 45, 43, 44, 45, 45, 46, 46, 46, 45, 45, 45, 44, 43, 43, 42, 41, 40,
+ 40, 39, 38, 37, 36, 36, 36, 36, 54, 53, 51, 50, 49, 47, 45, 44, 42, 43,
+ 43, 43, 44, 44, 44, 44, 44, 43, 43, 42, 42, 41, 40, 39, 39, 38, 37, 36,
+ 36, 36, 36, 36, 47, 47, 46, 46, 46, 45, 43, 42, 41, 41, 41, 42, 42, 42,
+ 42, 42, 42, 42, 41, 41, 40, 40, 39, 38, 38, 37, 37, 36, 35, 35, 35, 35,
+ 46, 46, 47, 47, 47, 46, 44, 43, 41, 41, 41, 41, 41, 41, 41, 41, 41, 40,
+ 40, 40, 39, 39, 38, 37, 37, 36, 36, 35, 34, 34, 34, 34, 46, 46, 47, 47,
+ 48, 46, 45, 43, 41, 41, 41, 40, 40, 40, 40, 40, 39, 39, 39, 38, 38, 37,
+ 37, 36, 36, 35, 35, 34, 34, 34, 34, 34, 45, 46, 47, 48, 49, 47, 45, 43,
+ 42, 41, 40, 40, 39, 39, 39, 38, 38, 38, 37, 37, 37, 36, 36, 36, 35, 35,
+ 34, 34, 33, 33, 33, 33, 45, 46, 47, 49, 50, 48, 46, 44, 42, 41, 40, 39,
+ 38, 38, 37, 37, 37, 36, 36, 36, 36, 35, 35, 35, 34, 34, 33, 33, 32, 32,
+ 32, 32, 44, 45, 47, 48, 50, 48, 46, 44, 42, 41, 40, 39, 38, 37, 37, 36,
+ 36, 36, 35, 35, 35, 34, 34, 34, 33, 33, 33, 32, 32, 32, 32, 32, 43, 45,
+ 46, 48, 49, 47, 46, 44, 42, 41, 40, 39, 37, 37, 36, 36, 35, 35, 35, 34,
+ 34, 34, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 43, 44, 46, 47, 49, 47,
+ 45, 44, 42, 41, 40, 38, 37, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 42, 44, 46, 47, 49, 47, 45, 44, 42, 41,
+ 39, 38, 37, 36, 35, 35, 34, 34, 33, 33, 32, 32, 32, 31, 31, 31, 30, 30,
+ 30, 30, 30, 30, 42, 43, 45, 46, 48, 46, 45, 43, 42, 40, 39, 38, 36, 36,
+ 35, 34, 34, 33, 33, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29,
+ 41, 42, 44, 45, 47, 45, 44, 43, 41, 40, 39, 37, 36, 35, 35, 34, 33, 33,
+ 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 40, 41, 43, 44,
+ 46, 45, 43, 42, 41, 40, 38, 37, 36, 35, 34, 34, 33, 32, 32, 31, 31, 30,
+ 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 39, 40, 42, 44, 45, 44, 43, 42,
+ 40, 39, 38, 37, 36, 35, 34, 33, 32, 32, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 28, 28, 28, 28, 28, 38, 40, 41, 43, 44, 43, 42, 41, 40, 39, 37, 36,
+ 35, 34, 34, 33, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28,
+ 28, 28, 37, 39, 40, 42, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 33,
+ 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27, 36, 38,
+ 39, 41, 42, 41, 40, 39, 38, 37, 36, 36, 35, 34, 33, 32, 31, 31, 30, 30,
+ 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27, 27, 36, 37, 38, 40, 41, 40,
+ 40, 39, 38, 37, 36, 35, 34, 33, 33, 32, 31, 31, 30, 29, 29, 29, 28, 28,
+ 27, 27, 27, 27, 26, 26, 26, 26, 35, 36, 38, 39, 40, 39, 39, 38, 37, 36,
+ 35, 35, 34, 33, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 27, 26,
+ 26, 26, 26, 26, 34, 35, 37, 38, 39, 39, 38, 37, 37, 36, 35, 34, 33, 33,
+ 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 26, 26,
+ 33, 35, 36, 37, 38, 38, 37, 36, 36, 35, 34, 34, 33, 32, 32, 31, 30, 30,
+ 29, 29, 28, 28, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 33, 34, 35, 36,
+ 37, 37, 36, 36, 35, 34, 34, 33, 32, 32, 31, 31, 30, 29, 29, 28, 28, 28,
+ 27, 27, 26, 26, 26, 26, 25, 25, 25, 25, 33, 34, 35, 36, 37, 37, 36, 36,
+ 35, 34, 34, 33, 32, 32, 31, 31, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26,
+ 26, 26, 25, 25, 25, 25, 33, 34, 35, 36, 37, 37, 36, 36, 35, 34, 34, 33,
+ 32, 32, 31, 31, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 26, 26, 25, 25,
+ 25, 25, 33, 34, 35, 36, 37, 37, 36, 36, 35, 34, 34, 33, 32, 32, 31, 31,
+ 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 26, 26, 25, 25, 25, 25 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 125, 84, 78, 64, 84, 72, 68, 61, 78, 68, 56, 51, 64, 61, 51, 45,
+ /* Size 8 */
+ 117, 141, 84, 80, 76, 69, 63, 57, 141, 102, 83, 90, 88, 81, 73, 66, 84,
+ 83, 73, 75, 75, 72, 67, 62, 80, 90, 75, 68, 65, 63, 60, 57, 76, 88, 75,
+ 65, 60, 57, 54, 52, 69, 81, 72, 63, 57, 53, 50, 48, 63, 73, 67, 60, 54,
+ 50, 47, 45, 57, 66, 62, 57, 52, 48, 45, 43,
+ /* Size 16 */
+ 119, 132, 144, 115, 86, 84, 82, 79, 77, 74, 71, 67, 64, 61, 58, 58, 132,
+ 128, 124, 105, 85, 86, 87, 85, 83, 80, 77, 73, 69, 66, 63, 63, 144, 124,
+ 104, 94, 84, 88, 92, 91, 89, 86, 83, 79, 75, 71, 68, 68, 115, 105, 94,
+ 87, 79, 82, 84, 83, 83, 80, 78, 75, 72, 69, 65, 65, 86, 85, 84, 79, 74,
+ 75, 76, 76, 76, 75, 73, 71, 68, 66, 63, 63, 84, 86, 88, 82, 75, 74, 73,
+ 72, 71, 70, 69, 67, 65, 63, 61, 61, 82, 87, 92, 84, 76, 73, 69, 68, 66,
+ 65, 64, 63, 61, 60, 58, 58, 79, 85, 91, 83, 76, 72, 68, 66, 64, 62, 61,
+ 60, 58, 57, 56, 56, 77, 83, 89, 83, 76, 71, 66, 64, 61, 59, 58, 57, 55,
+ 54, 53, 53, 74, 80, 86, 80, 75, 70, 65, 62, 59, 58, 56, 55, 53, 52, 51,
+ 51, 71, 77, 83, 78, 73, 69, 64, 61, 58, 56, 54, 53, 51, 50, 49, 49, 67,
+ 73, 79, 75, 71, 67, 63, 60, 57, 55, 53, 51, 50, 49, 48, 48, 64, 69, 75,
+ 72, 68, 65, 61, 58, 55, 53, 51, 50, 48, 47, 46, 46, 61, 66, 71, 69, 66,
+ 63, 60, 57, 54, 52, 50, 49, 47, 46, 45, 45, 58, 63, 68, 65, 63, 61, 58,
+ 56, 53, 51, 49, 48, 46, 45, 44, 44, 58, 63, 68, 65, 63, 61, 58, 56, 53,
+ 51, 49, 48, 46, 45, 44, 44,
+ /* Size 32 */
+ 120, 127, 133, 140, 146, 131, 116, 101, 87, 86, 85, 84, 82, 81, 80, 79,
+ 78, 76, 75, 73, 71, 70, 68, 66, 65, 63, 62, 60, 59, 59, 59, 59, 127,
+ 129, 131, 134, 136, 123, 111, 99, 86, 86, 86, 85, 85, 84, 83, 82, 81,
+ 80, 78, 76, 74, 73, 71, 69, 67, 66, 64, 63, 61, 61, 61, 61, 133, 131,
+ 129, 127, 126, 116, 106, 96, 86, 86, 87, 87, 88, 87, 86, 85, 84, 83, 81,
+ 79, 77, 76, 74, 72, 70, 69, 67, 65, 64, 64, 64, 64, 140, 134, 127, 121,
+ 115, 108, 101, 93, 86, 87, 88, 89, 90, 89, 89, 88, 87, 86, 84, 82, 81,
+ 79, 77, 75, 73, 71, 70, 68, 66, 66, 66, 66, 146, 136, 126, 115, 105,
+ 100, 95, 90, 85, 87, 89, 91, 93, 92, 92, 91, 90, 89, 87, 85, 84, 82, 80,
+ 78, 76, 74, 72, 70, 68, 68, 68, 68, 131, 123, 116, 108, 100, 96, 92, 87,
+ 83, 84, 86, 87, 89, 88, 88, 88, 87, 86, 84, 83, 81, 79, 78, 76, 74, 72,
+ 71, 69, 67, 67, 67, 67, 116, 111, 106, 101, 95, 92, 88, 84, 80, 81, 83,
+ 84, 85, 85, 84, 84, 84, 83, 81, 80, 79, 77, 76, 74, 72, 71, 69, 68, 66,
+ 66, 66, 66, 101, 99, 96, 93, 90, 87, 84, 81, 78, 78, 79, 80, 81, 81, 81,
+ 81, 81, 80, 79, 78, 76, 75, 74, 72, 71, 69, 68, 67, 65, 65, 65, 65, 87,
+ 86, 86, 86, 85, 83, 80, 78, 75, 75, 76, 77, 77, 77, 77, 77, 77, 76, 76,
+ 75, 74, 73, 72, 70, 69, 68, 67, 65, 64, 64, 64, 64, 86, 86, 86, 87, 87,
+ 84, 81, 78, 75, 75, 75, 75, 75, 75, 75, 75, 75, 74, 73, 73, 72, 71, 70,
+ 69, 67, 66, 65, 64, 63, 63, 63, 63, 85, 86, 87, 88, 89, 86, 83, 79, 76,
+ 75, 75, 74, 73, 73, 73, 73, 72, 72, 71, 70, 70, 69, 68, 67, 66, 65, 63,
+ 62, 61, 61, 61, 61, 84, 85, 87, 89, 91, 87, 84, 80, 77, 75, 74, 73, 72,
+ 71, 71, 70, 70, 69, 68, 68, 67, 66, 66, 65, 64, 63, 62, 61, 60, 60, 60,
+ 60, 82, 85, 88, 90, 93, 89, 85, 81, 77, 75, 73, 72, 70, 69, 68, 68, 67,
+ 67, 66, 65, 65, 64, 63, 63, 62, 61, 60, 59, 59, 59, 59, 59, 81, 84, 87,
+ 89, 92, 88, 85, 81, 77, 75, 73, 71, 69, 68, 67, 67, 66, 65, 65, 64, 63,
+ 63, 62, 61, 61, 60, 59, 58, 57, 57, 57, 57, 80, 83, 86, 89, 92, 88, 84,
+ 81, 77, 75, 73, 71, 68, 67, 66, 65, 64, 64, 63, 62, 62, 61, 60, 60, 59,
+ 58, 58, 57, 56, 56, 56, 56, 79, 82, 85, 88, 91, 88, 84, 81, 77, 75, 73,
+ 70, 68, 67, 65, 64, 63, 62, 62, 61, 60, 59, 59, 58, 58, 57, 56, 56, 55,
+ 55, 55, 55, 78, 81, 84, 87, 90, 87, 84, 81, 77, 75, 72, 70, 67, 66, 64,
+ 63, 62, 61, 60, 59, 58, 58, 57, 57, 56, 55, 55, 54, 54, 54, 54, 54, 76,
+ 80, 83, 86, 89, 86, 83, 80, 76, 74, 72, 69, 67, 65, 64, 62, 61, 60, 59,
+ 58, 57, 57, 56, 56, 55, 54, 54, 53, 53, 53, 53, 53, 75, 78, 81, 84, 87,
+ 84, 81, 79, 76, 73, 71, 68, 66, 65, 63, 62, 60, 59, 58, 57, 56, 56, 55,
+ 55, 54, 53, 53, 52, 52, 52, 52, 52, 73, 76, 79, 82, 85, 83, 80, 78, 75,
+ 73, 70, 68, 65, 64, 62, 61, 59, 58, 57, 56, 55, 55, 54, 54, 53, 52, 52,
+ 51, 51, 51, 51, 51, 71, 74, 77, 81, 84, 81, 79, 76, 74, 72, 70, 67, 65,
+ 63, 62, 60, 58, 57, 56, 55, 54, 54, 53, 52, 52, 51, 51, 50, 50, 50, 50,
+ 50, 70, 73, 76, 79, 82, 79, 77, 75, 73, 71, 69, 66, 64, 63, 61, 59, 58,
+ 57, 56, 55, 54, 53, 52, 52, 51, 51, 50, 50, 49, 49, 49, 49, 68, 71, 74,
+ 77, 80, 78, 76, 74, 72, 70, 68, 66, 63, 62, 60, 59, 57, 56, 55, 54, 53,
+ 52, 52, 51, 50, 50, 49, 49, 48, 48, 48, 48, 66, 69, 72, 75, 78, 76, 74,
+ 72, 70, 69, 67, 65, 63, 61, 60, 58, 57, 56, 55, 54, 52, 52, 51, 50, 50,
+ 49, 49, 48, 48, 48, 48, 48, 65, 67, 70, 73, 76, 74, 72, 71, 69, 67, 66,
+ 64, 62, 61, 59, 58, 56, 55, 54, 53, 52, 51, 50, 50, 49, 48, 48, 47, 47,
+ 47, 47, 47, 63, 66, 69, 71, 74, 72, 71, 69, 68, 66, 65, 63, 61, 60, 58,
+ 57, 55, 54, 53, 52, 51, 51, 50, 49, 48, 48, 47, 47, 46, 46, 46, 46, 62,
+ 64, 67, 70, 72, 71, 69, 68, 67, 65, 63, 62, 60, 59, 58, 56, 55, 54, 53,
+ 52, 51, 50, 49, 49, 48, 47, 47, 46, 46, 46, 46, 46, 60, 63, 65, 68, 70,
+ 69, 68, 67, 65, 64, 62, 61, 59, 58, 57, 56, 54, 53, 52, 51, 50, 50, 49,
+ 48, 47, 47, 46, 46, 45, 45, 45, 45, 59, 61, 64, 66, 68, 67, 66, 65, 64,
+ 63, 61, 60, 59, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 48, 47, 46, 46,
+ 45, 45, 45, 45, 45, 59, 61, 64, 66, 68, 67, 66, 65, 64, 63, 61, 60, 59,
+ 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 48, 47, 46, 46, 45, 45, 45, 45,
+ 45, 59, 61, 64, 66, 68, 67, 66, 65, 64, 63, 61, 60, 59, 57, 56, 55, 54,
+ 53, 52, 51, 50, 49, 48, 48, 47, 46, 46, 45, 45, 45, 45, 45, 59, 61, 64,
+ 66, 68, 67, 66, 65, 64, 63, 61, 60, 59, 57, 56, 55, 54, 53, 52, 51, 50,
+ 49, 48, 48, 47, 46, 46, 45, 45, 45, 45, 45 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 60, 41, 31, 60, 44, 35, 29, 41, 35, 29, 26, 31, 29, 26, 24,
+ /* Size 8 */
+ 64, 80, 75, 61, 49, 41, 36, 33, 80, 72, 74, 66, 55, 46, 40, 35, 75, 74,
+ 57, 51, 46, 41, 37, 34, 61, 66, 51, 43, 39, 36, 34, 32, 49, 55, 46, 39,
+ 36, 33, 32, 30, 41, 46, 41, 36, 33, 31, 30, 29, 36, 40, 37, 34, 32, 30,
+ 29, 28, 33, 35, 34, 32, 30, 29, 28, 27,
+ /* Size 16 */
+ 64, 72, 80, 78, 75, 68, 61, 55, 49, 45, 41, 39, 36, 34, 33, 33, 72, 74,
+ 76, 75, 75, 69, 63, 57, 52, 48, 43, 41, 38, 36, 34, 34, 80, 76, 72, 73,
+ 74, 70, 66, 60, 55, 50, 46, 43, 40, 37, 35, 35, 78, 75, 73, 69, 66, 62,
+ 58, 54, 50, 47, 43, 41, 38, 36, 34, 34, 75, 75, 74, 66, 57, 54, 51, 49,
+ 46, 43, 41, 39, 37, 35, 34, 34, 68, 69, 70, 62, 54, 51, 47, 45, 43, 41,
+ 39, 37, 35, 34, 33, 33, 61, 63, 66, 58, 51, 47, 43, 41, 39, 38, 36, 35,
+ 34, 33, 32, 32, 55, 57, 60, 54, 49, 45, 41, 39, 37, 36, 35, 34, 33, 32,
+ 31, 31, 49, 52, 55, 50, 46, 43, 39, 37, 36, 34, 33, 32, 32, 31, 30, 30,
+ 45, 48, 50, 47, 43, 41, 38, 36, 34, 33, 32, 31, 31, 30, 30, 30, 41, 43,
+ 46, 43, 41, 39, 36, 35, 33, 32, 31, 31, 30, 29, 29, 29, 39, 41, 43, 41,
+ 39, 37, 35, 34, 32, 31, 31, 30, 29, 29, 28, 28, 36, 38, 40, 38, 37, 35,
+ 34, 33, 32, 31, 30, 29, 29, 28, 28, 28, 34, 36, 37, 36, 35, 34, 33, 32,
+ 31, 30, 29, 29, 28, 28, 28, 28, 33, 34, 35, 34, 34, 33, 32, 31, 30, 30,
+ 29, 28, 28, 28, 27, 27, 33, 34, 35, 34, 34, 33, 32, 31, 30, 30, 29, 28,
+ 28, 28, 27, 27,
+ /* Size 32 */
+ 64, 68, 72, 76, 80, 79, 78, 77, 75, 72, 68, 64, 61, 58, 55, 52, 49, 47,
+ 45, 43, 41, 40, 39, 37, 36, 35, 34, 33, 33, 33, 33, 33, 68, 71, 73, 76,
+ 78, 77, 77, 76, 75, 72, 69, 65, 62, 59, 56, 53, 50, 48, 46, 44, 42, 41,
+ 40, 38, 37, 36, 35, 34, 33, 33, 33, 33, 72, 73, 74, 75, 76, 76, 75, 75,
+ 75, 72, 69, 66, 63, 60, 57, 55, 52, 50, 48, 46, 43, 42, 41, 39, 38, 37,
+ 36, 35, 34, 34, 34, 34, 76, 76, 75, 75, 74, 74, 74, 74, 75, 72, 70, 67,
+ 64, 62, 59, 56, 53, 51, 49, 47, 45, 43, 42, 40, 39, 38, 37, 36, 35, 35,
+ 35, 35, 80, 78, 76, 74, 72, 72, 73, 74, 74, 72, 70, 68, 66, 63, 60, 57,
+ 55, 52, 50, 48, 46, 44, 43, 41, 40, 38, 37, 36, 35, 35, 35, 35, 79, 77,
+ 76, 74, 72, 72, 71, 71, 70, 68, 66, 64, 62, 60, 57, 55, 52, 50, 48, 47,
+ 45, 43, 42, 40, 39, 38, 37, 36, 35, 35, 35, 35, 78, 77, 75, 74, 73, 71,
+ 69, 68, 66, 64, 62, 60, 58, 56, 54, 52, 50, 49, 47, 45, 43, 42, 41, 39,
+ 38, 37, 36, 35, 34, 34, 34, 34, 77, 76, 75, 74, 74, 71, 68, 65, 62, 60,
+ 58, 56, 55, 53, 51, 50, 48, 47, 45, 44, 42, 41, 40, 39, 38, 37, 36, 35,
+ 34, 34, 34, 34, 75, 75, 75, 75, 74, 70, 66, 62, 57, 56, 54, 53, 51, 50,
+ 49, 47, 46, 45, 43, 42, 41, 40, 39, 38, 37, 36, 35, 35, 34, 34, 34, 34,
+ 72, 72, 72, 72, 72, 68, 64, 60, 56, 54, 53, 51, 49, 48, 47, 46, 44, 43,
+ 42, 41, 40, 39, 38, 37, 36, 35, 35, 34, 33, 33, 33, 33, 68, 69, 69, 70,
+ 70, 66, 62, 58, 54, 53, 51, 49, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38,
+ 37, 36, 35, 35, 34, 33, 33, 33, 33, 33, 64, 65, 66, 67, 68, 64, 60, 56,
+ 53, 51, 49, 47, 45, 44, 43, 42, 41, 40, 39, 38, 38, 37, 36, 35, 35, 34,
+ 33, 33, 32, 32, 32, 32, 61, 62, 63, 64, 66, 62, 58, 55, 51, 49, 47, 45,
+ 43, 42, 41, 40, 39, 39, 38, 37, 36, 36, 35, 35, 34, 33, 33, 32, 32, 32,
+ 32, 32, 58, 59, 60, 62, 63, 60, 56, 53, 50, 48, 46, 44, 42, 41, 40, 39,
+ 38, 38, 37, 36, 36, 35, 34, 34, 33, 33, 32, 32, 31, 31, 31, 31, 55, 56,
+ 57, 59, 60, 57, 54, 51, 49, 47, 45, 43, 41, 40, 39, 38, 37, 37, 36, 35,
+ 35, 34, 34, 33, 33, 32, 32, 31, 31, 31, 31, 31, 52, 53, 55, 56, 57, 55,
+ 52, 50, 47, 46, 44, 42, 40, 39, 38, 37, 37, 36, 35, 35, 34, 34, 33, 33,
+ 32, 32, 31, 31, 31, 31, 31, 31, 49, 50, 52, 53, 55, 52, 50, 48, 46, 44,
+ 43, 41, 39, 38, 37, 37, 36, 35, 34, 34, 33, 33, 32, 32, 32, 31, 31, 31,
+ 30, 30, 30, 30, 47, 48, 50, 51, 52, 50, 49, 47, 45, 43, 42, 40, 39, 38,
+ 37, 36, 35, 34, 34, 33, 33, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30,
+ 45, 46, 48, 49, 50, 48, 47, 45, 43, 42, 41, 39, 38, 37, 36, 35, 34, 34,
+ 33, 33, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 43, 44, 46, 47,
+ 48, 47, 45, 44, 42, 41, 40, 38, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31,
+ 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 41, 42, 43, 45, 46, 45, 43, 42,
+ 41, 40, 39, 38, 36, 36, 35, 34, 33, 33, 32, 32, 31, 31, 31, 30, 30, 30,
+ 29, 29, 29, 29, 29, 29, 40, 41, 42, 43, 44, 43, 42, 41, 40, 39, 38, 37,
+ 36, 35, 34, 34, 33, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29,
+ 29, 29, 39, 40, 41, 42, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 34, 33,
+ 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 37, 38,
+ 39, 40, 41, 40, 39, 39, 38, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31, 31,
+ 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 36, 37, 38, 39, 40, 39,
+ 38, 38, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31, 31, 30, 30, 30, 29, 29,
+ 29, 29, 28, 28, 28, 28, 28, 28, 35, 36, 37, 38, 38, 38, 37, 37, 36, 35,
+ 35, 34, 33, 33, 32, 32, 31, 31, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 28, 28, 34, 35, 36, 37, 37, 37, 36, 36, 35, 35, 34, 33, 33, 32,
+ 32, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28,
+ 33, 34, 35, 36, 36, 36, 35, 35, 35, 34, 33, 33, 32, 32, 31, 31, 31, 30,
+ 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 33, 33, 34, 35,
+ 35, 35, 34, 34, 34, 33, 33, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 33, 33, 34, 35, 35, 35, 34, 34,
+ 34, 33, 33, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28,
+ 28, 27, 27, 27, 27, 27, 33, 33, 34, 35, 35, 35, 34, 34, 34, 33, 33, 32,
+ 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27,
+ 27, 27, 33, 33, 34, 35, 35, 35, 34, 34, 34, 33, 33, 32, 32, 31, 31, 31,
+ 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 129, 119, 81, 60, 119, 86, 68, 56, 81, 68, 55, 49, 60, 56, 49, 45,
+ /* Size 8 */
+ 111, 141, 132, 105, 84, 70, 60, 54, 141, 125, 130, 114, 94, 78, 67, 59,
+ 132, 130, 99, 88, 78, 69, 62, 56, 105, 114, 88, 74, 66, 61, 56, 53, 84,
+ 94, 78, 66, 59, 55, 52, 50, 70, 78, 69, 61, 55, 52, 49, 47, 60, 67, 62,
+ 56, 52, 49, 47, 46, 54, 59, 56, 53, 50, 47, 46, 44,
+ /* Size 16 */
+ 114, 129, 144, 140, 135, 122, 108, 97, 86, 79, 71, 67, 62, 59, 55, 55,
+ 129, 133, 137, 135, 134, 123, 113, 102, 91, 84, 76, 70, 65, 62, 58, 58,
+ 144, 137, 129, 131, 134, 125, 117, 107, 96, 88, 80, 74, 68, 64, 60, 60,
+ 140, 135, 131, 124, 118, 111, 104, 96, 88, 82, 76, 71, 66, 62, 59, 59,
+ 135, 134, 134, 118, 102, 96, 90, 85, 80, 76, 71, 67, 63, 60, 58, 58,
+ 122, 123, 125, 111, 96, 89, 83, 79, 74, 71, 67, 64, 61, 58, 56, 56, 108,
+ 113, 117, 104, 90, 83, 76, 72, 68, 65, 63, 60, 58, 56, 54, 54, 97, 102,
+ 107, 96, 85, 79, 72, 68, 65, 62, 60, 58, 56, 54, 53, 53, 86, 91, 96, 88,
+ 80, 74, 68, 65, 61, 59, 57, 55, 54, 52, 51, 51, 79, 84, 88, 82, 76, 71,
+ 65, 62, 59, 57, 55, 53, 52, 51, 50, 50, 71, 76, 80, 76, 71, 67, 63, 60,
+ 57, 55, 53, 52, 50, 50, 49, 49, 67, 70, 74, 71, 67, 64, 60, 58, 55, 53,
+ 52, 51, 49, 49, 48, 48, 62, 65, 68, 66, 63, 61, 58, 56, 54, 52, 50, 49,
+ 48, 48, 47, 47, 59, 62, 64, 62, 60, 58, 56, 54, 52, 51, 50, 49, 48, 47,
+ 46, 46, 55, 58, 60, 59, 58, 56, 54, 53, 51, 50, 49, 48, 47, 46, 46, 46,
+ 55, 58, 60, 59, 58, 56, 54, 53, 51, 50, 49, 48, 47, 46, 46, 46,
+ /* Size 32 */
+ 116, 123, 131, 139, 146, 144, 142, 139, 137, 130, 123, 116, 109, 104,
+ 98, 93, 87, 84, 80, 76, 72, 70, 68, 65, 63, 61, 59, 58, 56, 56, 56, 56,
+ 123, 128, 133, 138, 142, 141, 140, 138, 137, 130, 124, 118, 112, 106,
+ 101, 95, 90, 86, 82, 78, 75, 72, 69, 67, 64, 63, 61, 59, 57, 57, 57, 57,
+ 131, 133, 135, 137, 138, 138, 137, 137, 136, 131, 125, 120, 114, 109,
+ 103, 98, 93, 89, 85, 81, 77, 74, 71, 69, 66, 64, 62, 61, 59, 59, 59, 59,
+ 139, 138, 137, 136, 134, 135, 135, 135, 136, 131, 126, 121, 116, 111,
+ 106, 100, 95, 91, 87, 83, 79, 76, 73, 71, 68, 66, 64, 62, 60, 60, 60,
+ 60, 146, 142, 138, 134, 130, 132, 133, 134, 135, 131, 127, 123, 119,
+ 113, 108, 103, 98, 94, 89, 85, 81, 78, 75, 72, 69, 67, 65, 63, 61, 61,
+ 61, 61, 144, 141, 138, 135, 132, 131, 129, 128, 127, 123, 120, 116, 112,
+ 107, 103, 98, 94, 90, 86, 83, 79, 76, 73, 71, 68, 66, 64, 62, 61, 61,
+ 61, 61, 142, 140, 137, 135, 133, 129, 126, 123, 119, 116, 112, 109, 105,
+ 101, 97, 93, 90, 86, 83, 80, 77, 74, 72, 69, 67, 65, 63, 62, 60, 60, 60,
+ 60, 139, 138, 137, 135, 134, 128, 123, 117, 111, 108, 105, 101, 98, 95,
+ 92, 89, 86, 83, 80, 77, 74, 72, 70, 68, 66, 64, 62, 61, 59, 59, 59, 59,
+ 137, 137, 136, 136, 135, 127, 119, 111, 103, 100, 97, 94, 91, 89, 86,
+ 84, 81, 79, 77, 74, 72, 70, 68, 66, 64, 63, 61, 60, 58, 58, 58, 58, 130,
+ 130, 131, 131, 131, 123, 116, 108, 100, 97, 94, 91, 88, 85, 83, 81, 78,
+ 76, 74, 72, 70, 68, 66, 65, 63, 62, 60, 59, 57, 57, 57, 57, 123, 124,
+ 125, 126, 127, 120, 112, 105, 97, 94, 91, 87, 84, 82, 80, 77, 75, 73,
+ 71, 70, 68, 66, 65, 63, 61, 60, 59, 58, 57, 57, 57, 57, 116, 118, 120,
+ 121, 123, 116, 109, 101, 94, 91, 87, 84, 80, 78, 76, 74, 72, 71, 69, 67,
+ 66, 64, 63, 61, 60, 59, 58, 57, 56, 56, 56, 56, 109, 112, 114, 116, 119,
+ 112, 105, 98, 91, 88, 84, 80, 77, 75, 73, 71, 69, 68, 66, 65, 63, 62,
+ 61, 60, 59, 58, 57, 56, 55, 55, 55, 55, 104, 106, 109, 111, 113, 107,
+ 101, 95, 89, 85, 82, 78, 75, 73, 71, 69, 67, 66, 65, 63, 62, 61, 60, 59,
+ 58, 57, 56, 55, 54, 54, 54, 54, 98, 101, 103, 106, 108, 103, 97, 92, 86,
+ 83, 80, 76, 73, 71, 69, 67, 65, 64, 63, 62, 60, 59, 58, 57, 56, 56, 55,
+ 54, 53, 53, 53, 53, 93, 95, 98, 100, 103, 98, 93, 89, 84, 81, 77, 74,
+ 71, 69, 67, 65, 64, 62, 61, 60, 59, 58, 57, 56, 55, 55, 54, 53, 52, 52,
+ 52, 52, 87, 90, 93, 95, 98, 94, 90, 86, 81, 78, 75, 72, 69, 67, 65, 64,
+ 62, 61, 60, 59, 57, 57, 56, 55, 54, 54, 53, 52, 52, 52, 52, 52, 84, 86,
+ 89, 91, 94, 90, 86, 83, 79, 76, 73, 71, 68, 66, 64, 62, 61, 60, 59, 58,
+ 56, 56, 55, 54, 53, 53, 52, 52, 51, 51, 51, 51, 80, 82, 85, 87, 89, 86,
+ 83, 80, 77, 74, 71, 69, 66, 65, 63, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+ 53, 52, 52, 51, 50, 50, 50, 50, 76, 78, 81, 83, 85, 83, 80, 77, 74, 72,
+ 70, 67, 65, 63, 62, 60, 59, 58, 57, 56, 55, 54, 53, 53, 52, 51, 51, 50,
+ 50, 50, 50, 50, 72, 75, 77, 79, 81, 79, 77, 74, 72, 70, 68, 66, 63, 62,
+ 60, 59, 57, 56, 56, 55, 54, 53, 52, 52, 51, 51, 50, 50, 49, 49, 49, 49,
+ 70, 72, 74, 76, 78, 76, 74, 72, 70, 68, 66, 64, 62, 61, 59, 58, 57, 56,
+ 55, 54, 53, 52, 52, 51, 51, 50, 50, 49, 49, 49, 49, 49, 68, 69, 71, 73,
+ 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 57, 56, 55, 54, 53, 52, 52,
+ 51, 51, 50, 50, 49, 49, 48, 48, 48, 48, 65, 67, 69, 71, 72, 71, 69, 68,
+ 66, 65, 63, 61, 60, 59, 57, 56, 55, 54, 53, 53, 52, 51, 51, 50, 50, 49,
+ 49, 48, 48, 48, 48, 48, 63, 64, 66, 68, 69, 68, 67, 66, 64, 63, 61, 60,
+ 59, 58, 56, 55, 54, 53, 53, 52, 51, 51, 50, 50, 49, 49, 48, 48, 47, 47,
+ 47, 47, 61, 63, 64, 66, 67, 66, 65, 64, 63, 62, 60, 59, 58, 57, 56, 55,
+ 54, 53, 52, 51, 51, 50, 50, 49, 49, 48, 48, 48, 47, 47, 47, 47, 59, 61,
+ 62, 64, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 52, 51,
+ 50, 50, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 58, 59, 61, 62, 63, 62,
+ 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 52, 51, 50, 50, 49, 49, 48,
+ 48, 48, 47, 47, 47, 47, 47, 47, 56, 57, 59, 60, 61, 61, 60, 59, 58, 57,
+ 57, 56, 55, 54, 53, 52, 52, 51, 50, 50, 49, 49, 48, 48, 47, 47, 47, 47,
+ 46, 46, 46, 46, 56, 57, 59, 60, 61, 61, 60, 59, 58, 57, 57, 56, 55, 54,
+ 53, 52, 52, 51, 50, 50, 49, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46,
+ 56, 57, 59, 60, 61, 61, 60, 59, 58, 57, 57, 56, 55, 54, 53, 52, 52, 51,
+ 50, 50, 49, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 56, 57, 59, 60,
+ 61, 61, 60, 59, 58, 57, 57, 56, 55, 54, 53, 52, 52, 51, 50, 50, 49, 49,
+ 48, 48, 47, 47, 47, 47, 46, 46, 46, 46 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 45, 42, 36, 45, 40, 38, 35, 42, 38, 32, 30, 36, 35, 30, 27,
+ /* Size 8 */
+ 64, 76, 48, 46, 44, 41, 37, 35, 76, 57, 47, 51, 50, 46, 43, 39, 48, 47,
+ 42, 43, 43, 42, 40, 37, 46, 51, 43, 40, 39, 38, 36, 35, 44, 50, 43, 39,
+ 36, 34, 33, 32, 41, 46, 42, 38, 34, 33, 31, 30, 37, 43, 40, 36, 33, 31,
+ 30, 29, 35, 39, 37, 35, 32, 30, 29, 28,
+ /* Size 16 */
+ 64, 70, 76, 62, 48, 47, 46, 45, 44, 42, 41, 39, 37, 36, 35, 35, 70, 68,
+ 66, 57, 48, 48, 48, 48, 47, 45, 44, 42, 40, 39, 37, 37, 76, 66, 57, 52,
+ 47, 49, 51, 50, 50, 48, 46, 45, 43, 41, 39, 39, 62, 57, 52, 48, 45, 46,
+ 47, 47, 47, 45, 44, 43, 41, 40, 38, 38, 48, 48, 47, 45, 42, 43, 43, 43,
+ 43, 43, 42, 41, 40, 38, 37, 37, 47, 48, 49, 46, 43, 42, 42, 41, 41, 40,
+ 40, 39, 38, 37, 36, 36, 46, 48, 51, 47, 43, 42, 40, 39, 39, 38, 38, 37,
+ 36, 35, 35, 35, 45, 48, 50, 47, 43, 41, 39, 38, 37, 37, 36, 35, 35, 34,
+ 33, 33, 44, 47, 50, 47, 43, 41, 39, 37, 36, 35, 34, 34, 33, 33, 32, 32,
+ 42, 45, 48, 45, 43, 40, 38, 37, 35, 34, 34, 33, 32, 32, 31, 31, 41, 44,
+ 46, 44, 42, 40, 38, 36, 34, 34, 33, 32, 31, 31, 30, 30, 39, 42, 45, 43,
+ 41, 39, 37, 35, 34, 33, 32, 31, 31, 30, 30, 30, 37, 40, 43, 41, 40, 38,
+ 36, 35, 33, 32, 31, 31, 30, 29, 29, 29, 36, 39, 41, 40, 38, 37, 35, 34,
+ 33, 32, 31, 30, 29, 29, 28, 28, 35, 37, 39, 38, 37, 36, 35, 33, 32, 31,
+ 30, 30, 29, 28, 28, 28, 35, 37, 39, 38, 37, 36, 35, 33, 32, 31, 30, 30,
+ 29, 28, 28, 28,
+ /* Size 32 */
+ 64, 67, 70, 73, 76, 69, 62, 55, 48, 47, 47, 46, 46, 45, 45, 44, 44, 43,
+ 42, 41, 41, 40, 39, 38, 37, 37, 36, 35, 35, 35, 35, 35, 67, 68, 69, 70,
+ 71, 65, 60, 54, 48, 48, 47, 47, 47, 47, 46, 46, 45, 44, 44, 43, 42, 41,
+ 40, 40, 39, 38, 37, 37, 36, 36, 36, 36, 70, 69, 68, 67, 66, 62, 57, 52,
+ 48, 48, 48, 48, 48, 48, 48, 47, 47, 46, 45, 44, 44, 43, 42, 41, 40, 39,
+ 39, 38, 37, 37, 37, 37, 73, 70, 67, 65, 62, 58, 55, 51, 47, 48, 49, 49,
+ 50, 49, 49, 49, 48, 47, 47, 46, 45, 44, 43, 42, 41, 41, 40, 39, 38, 38,
+ 38, 38, 76, 71, 66, 62, 57, 54, 52, 50, 47, 48, 49, 50, 51, 51, 50, 50,
+ 50, 49, 48, 47, 46, 46, 45, 44, 43, 42, 41, 40, 39, 39, 39, 39, 69, 65,
+ 62, 58, 54, 52, 50, 48, 46, 47, 47, 48, 49, 49, 49, 48, 48, 47, 47, 46,
+ 45, 44, 44, 43, 42, 41, 40, 39, 39, 39, 39, 39, 62, 60, 57, 55, 52, 50,
+ 48, 47, 45, 45, 46, 46, 47, 47, 47, 47, 47, 46, 45, 45, 44, 43, 43, 42,
+ 41, 40, 40, 39, 38, 38, 38, 38, 55, 54, 52, 51, 50, 48, 47, 45, 44, 44,
+ 44, 45, 45, 45, 45, 45, 45, 45, 44, 44, 43, 42, 42, 41, 40, 40, 39, 38,
+ 38, 38, 38, 38, 48, 48, 48, 47, 47, 46, 45, 44, 42, 43, 43, 43, 43, 43,
+ 43, 43, 43, 43, 43, 42, 42, 41, 41, 40, 40, 39, 38, 38, 37, 37, 37, 37,
+ 47, 48, 48, 48, 48, 47, 45, 44, 43, 43, 43, 42, 42, 42, 42, 42, 42, 42,
+ 42, 41, 41, 40, 40, 39, 39, 38, 38, 37, 36, 36, 36, 36, 47, 47, 48, 49,
+ 49, 47, 46, 44, 43, 43, 42, 42, 42, 41, 41, 41, 41, 41, 40, 40, 40, 39,
+ 39, 38, 38, 37, 37, 36, 36, 36, 36, 36, 46, 47, 48, 49, 50, 48, 46, 45,
+ 43, 42, 42, 41, 41, 40, 40, 40, 40, 40, 39, 39, 39, 38, 38, 37, 37, 37,
+ 36, 36, 35, 35, 35, 35, 46, 47, 48, 50, 51, 49, 47, 45, 43, 42, 42, 41,
+ 40, 40, 39, 39, 39, 38, 38, 38, 38, 37, 37, 37, 36, 36, 35, 35, 35, 35,
+ 35, 35, 45, 47, 48, 49, 51, 49, 47, 45, 43, 42, 41, 40, 40, 39, 39, 38,
+ 38, 38, 37, 37, 37, 36, 36, 36, 35, 35, 35, 34, 34, 34, 34, 34, 45, 46,
+ 48, 49, 50, 49, 47, 45, 43, 42, 41, 40, 39, 39, 38, 38, 37, 37, 37, 36,
+ 36, 36, 35, 35, 35, 34, 34, 34, 33, 33, 33, 33, 44, 46, 47, 49, 50, 48,
+ 47, 45, 43, 42, 41, 40, 39, 38, 38, 37, 37, 36, 36, 36, 35, 35, 35, 34,
+ 34, 34, 33, 33, 33, 33, 33, 33, 44, 45, 47, 48, 50, 48, 47, 45, 43, 42,
+ 41, 40, 39, 38, 37, 37, 36, 36, 35, 35, 34, 34, 34, 34, 33, 33, 33, 32,
+ 32, 32, 32, 32, 43, 44, 46, 47, 49, 47, 46, 45, 43, 42, 41, 40, 38, 38,
+ 37, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 42, 44, 45, 47, 48, 47, 45, 44, 43, 42, 40, 39, 38, 37, 37, 36, 35, 35,
+ 34, 34, 34, 33, 33, 33, 32, 32, 32, 32, 31, 31, 31, 31, 41, 43, 44, 46,
+ 47, 46, 45, 44, 42, 41, 40, 39, 38, 37, 36, 36, 35, 34, 34, 33, 33, 33,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 41, 42, 44, 45, 46, 45, 44, 43,
+ 42, 41, 40, 39, 38, 37, 36, 35, 34, 34, 34, 33, 33, 32, 32, 32, 31, 31,
+ 31, 31, 30, 30, 30, 30, 40, 41, 43, 44, 46, 44, 43, 42, 41, 40, 39, 38,
+ 37, 36, 36, 35, 34, 34, 33, 33, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 39, 40, 42, 43, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 35,
+ 34, 33, 33, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 38, 40,
+ 41, 42, 44, 43, 42, 41, 40, 39, 38, 37, 37, 36, 35, 34, 34, 33, 33, 32,
+ 32, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 37, 39, 40, 41, 43, 42,
+ 41, 40, 40, 39, 38, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31, 31, 31, 30,
+ 30, 30, 29, 29, 29, 29, 29, 29, 37, 38, 39, 41, 42, 41, 40, 40, 39, 38,
+ 37, 37, 36, 35, 34, 34, 33, 33, 32, 32, 31, 31, 30, 30, 30, 29, 29, 29,
+ 29, 29, 29, 29, 36, 37, 39, 40, 41, 40, 40, 39, 38, 38, 37, 36, 35, 35,
+ 34, 33, 33, 32, 32, 31, 31, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28,
+ 35, 37, 38, 39, 40, 39, 39, 38, 38, 37, 36, 36, 35, 34, 34, 33, 32, 32,
+ 32, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 35, 36, 37, 38,
+ 39, 39, 38, 38, 37, 36, 36, 35, 35, 34, 33, 33, 32, 32, 31, 31, 30, 30,
+ 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 35, 36, 37, 38, 39, 39, 38, 38,
+ 37, 36, 36, 35, 35, 34, 33, 33, 32, 32, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 28, 28, 28, 28, 28, 35, 36, 37, 38, 39, 39, 38, 38, 37, 36, 36, 35,
+ 35, 34, 33, 33, 32, 32, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28,
+ 28, 28, 35, 36, 37, 38, 39, 39, 38, 38, 37, 36, 36, 35, 35, 34, 33, 33,
+ 32, 32, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 117, 81, 76, 64, 81, 71, 67, 61, 76, 67, 57, 52, 64, 61, 52, 47,
+ /* Size 8 */
+ 110, 131, 81, 77, 74, 68, 62, 57, 131, 97, 80, 86, 84, 78, 72, 65, 81,
+ 80, 71, 73, 73, 70, 66, 62, 77, 86, 73, 67, 64, 63, 60, 57, 74, 84, 73,
+ 64, 60, 57, 55, 53, 68, 78, 70, 63, 57, 54, 51, 50, 62, 72, 66, 60, 55,
+ 51, 49, 47, 57, 65, 62, 57, 53, 50, 47, 45,
+ /* Size 16 */
+ 112, 123, 134, 108, 83, 81, 79, 77, 75, 72, 69, 66, 64, 61, 59, 59, 123,
+ 120, 116, 99, 82, 83, 83, 82, 81, 78, 75, 71, 68, 65, 63, 63, 134, 116,
+ 99, 90, 81, 85, 88, 87, 86, 83, 80, 77, 73, 70, 67, 67, 108, 99, 90, 84,
+ 77, 79, 81, 81, 80, 78, 76, 73, 70, 68, 65, 65, 83, 82, 81, 77, 72, 73,
+ 74, 74, 74, 73, 72, 70, 67, 65, 63, 63, 81, 83, 85, 79, 73, 72, 71, 71,
+ 70, 69, 68, 66, 64, 62, 61, 61, 79, 83, 88, 81, 74, 71, 68, 67, 66, 65,
+ 64, 62, 61, 60, 58, 58, 77, 82, 87, 81, 74, 71, 67, 65, 63, 62, 61, 60,
+ 59, 57, 56, 56, 75, 81, 86, 80, 74, 70, 66, 63, 61, 59, 58, 57, 56, 55,
+ 54, 54, 72, 78, 83, 78, 73, 69, 65, 62, 59, 58, 56, 55, 54, 53, 52, 52,
+ 69, 75, 80, 76, 72, 68, 64, 61, 58, 56, 55, 54, 52, 51, 51, 51, 66, 71,
+ 77, 73, 70, 66, 62, 60, 57, 55, 54, 52, 51, 50, 49, 49, 64, 68, 73, 70,
+ 67, 64, 61, 59, 56, 54, 52, 51, 50, 49, 48, 48, 61, 65, 70, 68, 65, 62,
+ 60, 57, 55, 53, 51, 50, 49, 48, 47, 47, 59, 63, 67, 65, 63, 61, 58, 56,
+ 54, 52, 51, 49, 48, 47, 46, 46, 59, 63, 67, 65, 63, 61, 58, 56, 54, 52,
+ 51, 49, 48, 47, 46, 46,
+ /* Size 32 */
+ 113, 119, 124, 130, 135, 122, 109, 96, 83, 82, 82, 81, 80, 79, 78, 77,
+ 76, 74, 73, 71, 70, 69, 67, 66, 64, 63, 62, 60, 59, 59, 59, 59, 119,
+ 121, 123, 124, 126, 116, 105, 94, 83, 83, 83, 82, 82, 81, 80, 79, 79,
+ 77, 76, 74, 73, 71, 70, 68, 67, 65, 64, 63, 61, 61, 61, 61, 124, 123,
+ 121, 119, 118, 109, 100, 91, 83, 83, 84, 84, 84, 84, 83, 82, 81, 80, 78,
+ 77, 75, 74, 72, 71, 69, 68, 66, 65, 63, 63, 63, 63, 130, 124, 119, 114,
+ 109, 102, 96, 89, 83, 84, 84, 85, 86, 86, 85, 85, 84, 83, 81, 80, 78,
+ 76, 75, 73, 71, 70, 68, 67, 65, 65, 65, 65, 135, 126, 118, 109, 100, 95,
+ 91, 87, 82, 84, 85, 87, 89, 88, 88, 87, 87, 85, 84, 82, 81, 79, 77, 76,
+ 74, 72, 71, 69, 67, 67, 67, 67, 122, 116, 109, 102, 95, 92, 88, 84, 80,
+ 81, 83, 84, 85, 85, 85, 84, 84, 83, 81, 80, 79, 77, 76, 74, 72, 71, 69,
+ 68, 66, 66, 66, 66, 109, 105, 100, 96, 91, 88, 84, 81, 78, 79, 80, 81,
+ 82, 82, 81, 81, 81, 80, 79, 78, 77, 75, 74, 72, 71, 70, 68, 67, 65, 65,
+ 65, 65, 96, 94, 91, 89, 87, 84, 81, 78, 75, 76, 77, 78, 78, 78, 78, 78,
+ 78, 77, 76, 75, 74, 73, 72, 71, 70, 68, 67, 66, 65, 65, 65, 65, 83, 83,
+ 83, 83, 82, 80, 78, 75, 73, 74, 74, 75, 75, 75, 75, 75, 75, 74, 74, 73,
+ 72, 71, 70, 69, 68, 67, 66, 65, 64, 64, 64, 64, 82, 83, 83, 84, 84, 81,
+ 79, 76, 74, 74, 73, 73, 73, 73, 73, 73, 73, 72, 72, 71, 70, 69, 68, 68,
+ 67, 66, 64, 63, 62, 62, 62, 62, 82, 83, 84, 84, 85, 83, 80, 77, 74, 73,
+ 73, 72, 72, 72, 71, 71, 71, 70, 70, 69, 68, 68, 67, 66, 65, 64, 63, 62,
+ 61, 61, 61, 61, 81, 82, 84, 85, 87, 84, 81, 78, 75, 73, 72, 71, 70, 70,
+ 69, 69, 69, 68, 67, 67, 66, 66, 65, 64, 63, 63, 62, 61, 60, 60, 60, 60,
+ 80, 82, 84, 86, 89, 85, 82, 78, 75, 73, 72, 70, 69, 68, 67, 67, 66, 66,
+ 65, 65, 64, 64, 63, 62, 62, 61, 60, 60, 59, 59, 59, 59, 79, 81, 84, 86,
+ 88, 85, 82, 78, 75, 73, 72, 70, 68, 67, 67, 66, 65, 65, 64, 63, 63, 62,
+ 62, 61, 60, 60, 59, 58, 58, 58, 58, 58, 78, 80, 83, 85, 88, 85, 81, 78,
+ 75, 73, 71, 69, 67, 67, 66, 65, 64, 63, 63, 62, 62, 61, 60, 60, 59, 59,
+ 58, 57, 57, 57, 57, 57, 77, 79, 82, 85, 87, 84, 81, 78, 75, 73, 71, 69,
+ 67, 66, 65, 64, 63, 62, 61, 61, 60, 60, 59, 58, 58, 57, 57, 56, 56, 56,
+ 56, 56, 76, 79, 81, 84, 87, 84, 81, 78, 75, 73, 71, 69, 66, 65, 64, 63,
+ 61, 61, 60, 59, 59, 58, 58, 57, 57, 56, 56, 55, 54, 54, 54, 54, 74, 77,
+ 80, 83, 85, 83, 80, 77, 74, 72, 70, 68, 66, 65, 63, 62, 61, 60, 59, 59,
+ 58, 57, 57, 56, 56, 55, 55, 54, 54, 54, 54, 54, 73, 76, 78, 81, 84, 81,
+ 79, 76, 74, 72, 70, 67, 65, 64, 63, 61, 60, 59, 59, 58, 57, 56, 56, 55,
+ 55, 54, 54, 53, 53, 53, 53, 53, 71, 74, 77, 80, 82, 80, 78, 75, 73, 71,
+ 69, 67, 65, 63, 62, 61, 59, 59, 58, 57, 56, 56, 55, 54, 54, 53, 53, 52,
+ 52, 52, 52, 52, 70, 73, 75, 78, 81, 79, 77, 74, 72, 70, 68, 66, 64, 63,
+ 62, 60, 59, 58, 57, 56, 55, 55, 54, 53, 53, 52, 52, 51, 51, 51, 51, 51,
+ 69, 71, 74, 76, 79, 77, 75, 73, 71, 69, 68, 66, 64, 62, 61, 60, 58, 57,
+ 56, 56, 55, 54, 53, 53, 52, 52, 51, 51, 50, 50, 50, 50, 67, 70, 72, 75,
+ 77, 76, 74, 72, 70, 68, 67, 65, 63, 62, 60, 59, 58, 57, 56, 55, 54, 53,
+ 53, 52, 52, 51, 51, 50, 50, 50, 50, 50, 66, 68, 71, 73, 76, 74, 72, 71,
+ 69, 68, 66, 64, 62, 61, 60, 58, 57, 56, 55, 54, 53, 53, 52, 52, 51, 51,
+ 50, 50, 49, 49, 49, 49, 64, 67, 69, 71, 74, 72, 71, 70, 68, 67, 65, 63,
+ 62, 60, 59, 58, 57, 56, 55, 54, 53, 52, 52, 51, 50, 50, 49, 49, 49, 49,
+ 49, 49, 63, 65, 68, 70, 72, 71, 70, 68, 67, 66, 64, 63, 61, 60, 59, 57,
+ 56, 55, 54, 53, 52, 52, 51, 51, 50, 49, 49, 49, 48, 48, 48, 48, 62, 64,
+ 66, 68, 71, 69, 68, 67, 66, 64, 63, 62, 60, 59, 58, 57, 56, 55, 54, 53,
+ 52, 51, 51, 50, 49, 49, 49, 48, 48, 48, 48, 48, 60, 63, 65, 67, 69, 68,
+ 67, 66, 65, 63, 62, 61, 60, 58, 57, 56, 55, 54, 53, 52, 51, 51, 50, 50,
+ 49, 49, 48, 48, 47, 47, 47, 47, 59, 61, 63, 65, 67, 66, 65, 65, 64, 62,
+ 61, 60, 59, 58, 57, 56, 54, 54, 53, 52, 51, 50, 50, 49, 49, 48, 48, 47,
+ 47, 47, 47, 47, 59, 61, 63, 65, 67, 66, 65, 65, 64, 62, 61, 60, 59, 58,
+ 57, 56, 54, 54, 53, 52, 51, 50, 50, 49, 49, 48, 48, 47, 47, 47, 47, 47,
+ 59, 61, 63, 65, 67, 66, 65, 65, 64, 62, 61, 60, 59, 58, 57, 56, 54, 54,
+ 53, 52, 51, 50, 50, 49, 49, 48, 48, 47, 47, 47, 47, 47, 59, 61, 63, 65,
+ 67, 66, 65, 65, 64, 62, 61, 60, 59, 58, 57, 56, 54, 54, 53, 52, 51, 50,
+ 50, 49, 49, 48, 48, 47, 47, 47, 47, 47 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 60, 43, 33, 60, 45, 37, 32, 43, 37, 31, 29, 33, 32, 29, 27,
+ /* Size 8 */
+ 64, 79, 74, 61, 50, 43, 38, 35, 79, 71, 73, 66, 55, 47, 42, 38, 74, 73,
+ 58, 52, 48, 43, 39, 36, 61, 66, 52, 45, 42, 39, 37, 35, 50, 55, 48, 42,
+ 38, 36, 34, 33, 43, 47, 43, 39, 36, 34, 33, 32, 38, 42, 39, 37, 34, 33,
+ 32, 31, 35, 38, 36, 35, 33, 32, 31, 31,
+ /* Size 16 */
+ 64, 71, 79, 77, 74, 68, 61, 56, 50, 47, 43, 41, 38, 37, 35, 35, 71, 73,
+ 75, 74, 74, 69, 63, 58, 53, 49, 45, 43, 40, 38, 37, 37, 79, 75, 71, 72,
+ 73, 70, 66, 60, 55, 51, 47, 45, 42, 40, 38, 38, 77, 74, 72, 69, 66, 62,
+ 59, 55, 51, 48, 45, 43, 40, 39, 37, 37, 74, 74, 73, 66, 58, 55, 52, 50,
+ 48, 45, 43, 41, 39, 38, 36, 36, 68, 69, 70, 62, 55, 52, 49, 47, 45, 43,
+ 41, 39, 38, 37, 36, 36, 61, 63, 66, 59, 52, 49, 45, 43, 42, 40, 39, 38,
+ 37, 36, 35, 35, 56, 58, 60, 55, 50, 47, 43, 42, 40, 39, 37, 36, 35, 35,
+ 34, 34, 50, 53, 55, 51, 48, 45, 42, 40, 38, 37, 36, 35, 34, 34, 33, 33,
+ 47, 49, 51, 48, 45, 43, 40, 39, 37, 36, 35, 34, 34, 33, 33, 33, 43, 45,
+ 47, 45, 43, 41, 39, 37, 36, 35, 34, 33, 33, 32, 32, 32, 41, 43, 45, 43,
+ 41, 39, 38, 36, 35, 34, 33, 33, 32, 32, 32, 32, 38, 40, 42, 40, 39, 38,
+ 37, 35, 34, 34, 33, 32, 32, 31, 31, 31, 37, 38, 40, 39, 38, 37, 36, 35,
+ 34, 33, 32, 32, 31, 31, 31, 31, 35, 37, 38, 37, 36, 36, 35, 34, 33, 33,
+ 32, 32, 31, 31, 31, 31, 35, 37, 38, 37, 36, 36, 35, 34, 33, 33, 32, 32,
+ 31, 31, 31, 31,
+ /* Size 32 */
+ 64, 68, 71, 75, 79, 78, 77, 75, 74, 71, 68, 64, 61, 58, 56, 53, 50, 49,
+ 47, 45, 43, 42, 41, 40, 38, 38, 37, 36, 35, 35, 35, 35, 68, 70, 72, 75,
+ 77, 76, 76, 75, 74, 71, 68, 65, 62, 59, 57, 54, 52, 50, 48, 46, 44, 43,
+ 42, 40, 39, 38, 38, 37, 36, 36, 36, 36, 71, 72, 73, 74, 75, 75, 74, 74,
+ 74, 71, 69, 66, 63, 61, 58, 55, 53, 51, 49, 47, 45, 44, 43, 41, 40, 39,
+ 38, 37, 37, 37, 37, 37, 75, 75, 74, 74, 73, 73, 73, 74, 74, 71, 69, 67,
+ 64, 62, 59, 57, 54, 52, 50, 48, 46, 45, 44, 42, 41, 40, 39, 38, 37, 37,
+ 37, 37, 79, 77, 75, 73, 71, 72, 72, 73, 73, 72, 70, 68, 66, 63, 60, 58,
+ 55, 53, 51, 49, 47, 46, 45, 43, 42, 41, 40, 39, 38, 38, 38, 38, 78, 76,
+ 75, 73, 72, 71, 71, 70, 70, 68, 66, 64, 62, 60, 58, 56, 53, 52, 50, 48,
+ 46, 45, 44, 42, 41, 40, 39, 38, 37, 37, 37, 37, 77, 76, 74, 73, 72, 71,
+ 69, 67, 66, 64, 62, 61, 59, 57, 55, 53, 51, 50, 48, 47, 45, 44, 43, 42,
+ 40, 40, 39, 38, 37, 37, 37, 37, 75, 75, 74, 74, 73, 70, 67, 65, 62, 60,
+ 59, 57, 56, 54, 53, 51, 49, 48, 47, 45, 44, 43, 42, 41, 40, 39, 38, 37,
+ 37, 37, 37, 37, 74, 74, 74, 74, 73, 70, 66, 62, 58, 56, 55, 54, 52, 51,
+ 50, 49, 48, 46, 45, 44, 43, 42, 41, 40, 39, 39, 38, 37, 36, 36, 36, 36,
+ 71, 71, 71, 71, 72, 68, 64, 60, 56, 55, 54, 52, 51, 49, 48, 47, 46, 45,
+ 44, 43, 42, 41, 40, 39, 39, 38, 37, 37, 36, 36, 36, 36, 68, 68, 69, 69,
+ 70, 66, 62, 59, 55, 54, 52, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40,
+ 39, 39, 38, 37, 37, 36, 36, 36, 36, 36, 64, 65, 66, 67, 68, 64, 61, 57,
+ 54, 52, 50, 49, 47, 46, 45, 44, 43, 42, 41, 41, 40, 39, 39, 38, 37, 37,
+ 36, 36, 35, 35, 35, 35, 61, 62, 63, 64, 66, 62, 59, 56, 52, 51, 49, 47,
+ 45, 44, 43, 42, 42, 41, 40, 39, 39, 38, 38, 37, 37, 36, 36, 35, 35, 35,
+ 35, 35, 58, 59, 61, 62, 63, 60, 57, 54, 51, 49, 48, 46, 44, 43, 42, 42,
+ 41, 40, 39, 39, 38, 38, 37, 37, 36, 36, 35, 35, 34, 34, 34, 34, 56, 57,
+ 58, 59, 60, 58, 55, 53, 50, 48, 47, 45, 43, 42, 42, 41, 40, 39, 39, 38,
+ 37, 37, 36, 36, 35, 35, 35, 34, 34, 34, 34, 34, 53, 54, 55, 57, 58, 56,
+ 53, 51, 49, 47, 46, 44, 42, 42, 41, 40, 39, 38, 38, 37, 37, 36, 36, 35,
+ 35, 35, 34, 34, 34, 34, 34, 34, 50, 52, 53, 54, 55, 53, 51, 49, 48, 46,
+ 45, 43, 42, 41, 40, 39, 38, 38, 37, 36, 36, 36, 35, 35, 34, 34, 34, 33,
+ 33, 33, 33, 33, 49, 50, 51, 52, 53, 52, 50, 48, 46, 45, 44, 42, 41, 40,
+ 39, 38, 38, 37, 36, 36, 35, 35, 35, 34, 34, 34, 33, 33, 33, 33, 33, 33,
+ 47, 48, 49, 50, 51, 50, 48, 47, 45, 44, 43, 41, 40, 39, 39, 38, 37, 36,
+ 36, 36, 35, 35, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 45, 46, 47, 48,
+ 49, 48, 47, 45, 44, 43, 42, 41, 39, 39, 38, 37, 36, 36, 36, 35, 35, 34,
+ 34, 34, 33, 33, 33, 33, 32, 32, 32, 32, 43, 44, 45, 46, 47, 46, 45, 44,
+ 43, 42, 41, 40, 39, 38, 37, 37, 36, 35, 35, 35, 34, 34, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 42, 43, 44, 45, 46, 45, 44, 43, 42, 41, 40, 39,
+ 38, 38, 37, 36, 36, 35, 35, 34, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 41, 42, 43, 44, 45, 44, 43, 42, 41, 40, 39, 39, 38, 37, 36, 36,
+ 35, 35, 34, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40,
+ 41, 42, 43, 42, 42, 41, 40, 39, 39, 38, 37, 37, 36, 35, 35, 34, 34, 34,
+ 33, 33, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 38, 39, 40, 41, 42, 41,
+ 40, 40, 39, 39, 38, 37, 37, 36, 35, 35, 34, 34, 34, 33, 33, 33, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 38, 38, 39, 40, 41, 40, 40, 39, 39, 38,
+ 37, 37, 36, 36, 35, 35, 34, 34, 33, 33, 33, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 31, 37, 38, 38, 39, 40, 39, 39, 38, 38, 37, 37, 36, 36, 35,
+ 35, 34, 34, 33, 33, 33, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 36, 37, 37, 38, 39, 38, 38, 37, 37, 37, 36, 36, 35, 35, 34, 34, 33, 33,
+ 33, 33, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 35, 36, 37, 37,
+ 38, 37, 37, 37, 36, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 35, 36, 37, 37, 38, 37, 37, 37,
+ 36, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 35, 36, 37, 37, 38, 37, 37, 37, 36, 36, 36, 35,
+ 35, 34, 34, 34, 33, 33, 33, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 35, 36, 37, 37, 38, 37, 37, 37, 36, 36, 36, 35, 35, 34, 34, 34,
+ 33, 33, 33, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 118, 110, 77, 59, 110, 82, 67, 57, 77, 67, 56, 51, 59, 57, 51, 47,
+ /* Size 8 */
+ 103, 128, 121, 98, 80, 68, 60, 55, 128, 115, 119, 106, 89, 75, 65, 59,
+ 121, 119, 93, 83, 75, 68, 61, 56, 98, 106, 83, 71, 65, 61, 57, 54, 80,
+ 89, 75, 65, 59, 56, 53, 51, 68, 75, 68, 61, 56, 53, 51, 49, 60, 65, 61,
+ 57, 53, 51, 49, 48, 55, 59, 56, 54, 51, 49, 48, 47,
+ /* Size 16 */
+ 106, 118, 131, 127, 123, 112, 100, 91, 82, 76, 69, 65, 61, 59, 56, 56,
+ 118, 122, 125, 124, 123, 113, 104, 95, 86, 80, 73, 69, 64, 61, 58, 58,
+ 131, 125, 118, 120, 122, 115, 108, 99, 91, 84, 77, 72, 67, 64, 60, 60,
+ 127, 124, 120, 114, 108, 103, 97, 90, 84, 78, 73, 69, 65, 62, 59, 59,
+ 123, 123, 122, 108, 95, 90, 85, 81, 77, 73, 69, 66, 63, 60, 58, 58, 112,
+ 113, 115, 103, 90, 85, 79, 76, 72, 69, 66, 63, 60, 58, 56, 56, 100, 104,
+ 108, 97, 85, 79, 73, 70, 67, 64, 62, 60, 58, 56, 55, 55, 91, 95, 99, 90,
+ 81, 76, 70, 67, 64, 62, 60, 58, 56, 55, 54, 54, 82, 86, 91, 84, 77, 72,
+ 67, 64, 61, 59, 57, 56, 54, 53, 52, 52, 76, 80, 84, 78, 73, 69, 64, 62,
+ 59, 57, 55, 54, 53, 52, 51, 51, 69, 73, 77, 73, 69, 66, 62, 60, 57, 55,
+ 54, 53, 52, 51, 50, 50, 65, 69, 72, 69, 66, 63, 60, 58, 56, 54, 53, 52,
+ 51, 50, 49, 49, 61, 64, 67, 65, 63, 60, 58, 56, 54, 53, 52, 51, 50, 49,
+ 49, 49, 59, 61, 64, 62, 60, 58, 56, 55, 53, 52, 51, 50, 49, 49, 48, 48,
+ 56, 58, 60, 59, 58, 56, 55, 54, 52, 51, 50, 49, 49, 48, 48, 48, 56, 58,
+ 60, 59, 58, 56, 55, 54, 52, 51, 50, 49, 49, 48, 48, 48,
+ /* Size 32 */
+ 107, 113, 120, 126, 133, 131, 129, 127, 125, 119, 113, 107, 102, 97, 92,
+ 88, 83, 80, 77, 73, 70, 68, 66, 64, 62, 61, 59, 58, 57, 57, 57, 57, 113,
+ 117, 121, 125, 129, 128, 127, 126, 125, 119, 114, 109, 104, 99, 94, 90,
+ 85, 82, 79, 75, 72, 70, 68, 66, 64, 62, 61, 59, 58, 58, 58, 58, 120,
+ 121, 123, 125, 126, 126, 125, 125, 124, 119, 115, 110, 105, 101, 96, 92,
+ 87, 84, 81, 77, 74, 72, 69, 67, 65, 63, 62, 60, 59, 59, 59, 59, 126,
+ 125, 125, 124, 123, 123, 123, 124, 124, 120, 116, 112, 107, 103, 98, 94,
+ 90, 86, 83, 79, 76, 73, 71, 69, 66, 65, 63, 61, 60, 60, 60, 60, 133,
+ 129, 126, 123, 119, 120, 121, 122, 123, 120, 116, 113, 109, 105, 101,
+ 96, 92, 88, 85, 81, 78, 75, 73, 70, 68, 66, 64, 63, 61, 61, 61, 61, 131,
+ 128, 126, 123, 120, 119, 118, 118, 117, 113, 110, 107, 104, 100, 96, 92,
+ 88, 85, 82, 79, 76, 74, 71, 69, 67, 65, 64, 62, 60, 60, 60, 60, 129,
+ 127, 125, 123, 121, 118, 116, 113, 110, 107, 104, 101, 98, 95, 91, 88,
+ 85, 82, 79, 77, 74, 72, 70, 68, 66, 64, 63, 61, 60, 60, 60, 60, 127,
+ 126, 125, 124, 122, 118, 113, 108, 103, 100, 98, 95, 92, 89, 87, 84, 81,
+ 79, 77, 74, 72, 70, 68, 66, 65, 63, 62, 60, 59, 59, 59, 59, 125, 125,
+ 124, 124, 123, 117, 110, 103, 96, 94, 91, 89, 86, 84, 82, 80, 78, 76,
+ 74, 72, 70, 68, 67, 65, 63, 62, 61, 60, 58, 58, 58, 58, 119, 119, 119,
+ 120, 120, 113, 107, 100, 94, 91, 88, 86, 83, 81, 79, 77, 75, 74, 72, 70,
+ 68, 67, 65, 64, 62, 61, 60, 59, 58, 58, 58, 58, 113, 114, 115, 116, 116,
+ 110, 104, 98, 91, 88, 86, 83, 80, 78, 76, 75, 73, 71, 70, 68, 66, 65,
+ 64, 62, 61, 60, 59, 58, 57, 57, 57, 57, 107, 109, 110, 112, 113, 107,
+ 101, 95, 89, 86, 83, 80, 77, 75, 74, 72, 70, 69, 67, 66, 65, 63, 62, 61,
+ 60, 59, 58, 57, 56, 56, 56, 56, 102, 104, 105, 107, 109, 104, 98, 92,
+ 86, 83, 80, 77, 74, 72, 71, 69, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58,
+ 57, 56, 55, 55, 55, 55, 97, 99, 101, 103, 105, 100, 95, 89, 84, 81, 78,
+ 75, 72, 71, 69, 68, 66, 65, 64, 63, 61, 61, 60, 59, 58, 57, 56, 56, 55,
+ 55, 55, 55, 92, 94, 96, 98, 101, 96, 91, 87, 82, 79, 76, 74, 71, 69, 68,
+ 66, 64, 63, 62, 61, 60, 59, 59, 58, 57, 56, 56, 55, 54, 54, 54, 54, 88,
+ 90, 92, 94, 96, 92, 88, 84, 80, 77, 75, 72, 69, 68, 66, 64, 63, 62, 61,
+ 60, 59, 58, 57, 57, 56, 55, 55, 54, 53, 53, 53, 53, 83, 85, 87, 90, 92,
+ 88, 85, 81, 78, 75, 73, 70, 67, 66, 64, 63, 61, 60, 60, 59, 58, 57, 56,
+ 56, 55, 54, 54, 53, 53, 53, 53, 53, 80, 82, 84, 86, 88, 85, 82, 79, 76,
+ 74, 71, 69, 66, 65, 63, 62, 60, 60, 59, 58, 57, 56, 56, 55, 54, 54, 53,
+ 53, 52, 52, 52, 52, 77, 79, 81, 83, 85, 82, 79, 77, 74, 72, 70, 67, 65,
+ 64, 62, 61, 60, 59, 58, 57, 56, 55, 55, 54, 54, 53, 53, 52, 52, 52, 52,
+ 52, 73, 75, 77, 79, 81, 79, 77, 74, 72, 70, 68, 66, 64, 63, 61, 60, 59,
+ 58, 57, 56, 55, 55, 54, 54, 53, 53, 52, 52, 51, 51, 51, 51, 70, 72, 74,
+ 76, 78, 76, 74, 72, 70, 68, 66, 65, 63, 61, 60, 59, 58, 57, 56, 55, 54,
+ 54, 53, 53, 52, 52, 52, 51, 51, 51, 51, 51, 68, 70, 72, 73, 75, 74, 72,
+ 70, 68, 67, 65, 63, 62, 61, 59, 58, 57, 56, 55, 55, 54, 53, 53, 52, 52,
+ 52, 51, 51, 50, 50, 50, 50, 66, 68, 69, 71, 73, 71, 70, 68, 67, 65, 64,
+ 62, 61, 60, 59, 57, 56, 56, 55, 54, 53, 53, 52, 52, 51, 51, 51, 50, 50,
+ 50, 50, 50, 64, 66, 67, 69, 70, 69, 68, 66, 65, 64, 62, 61, 60, 59, 58,
+ 57, 56, 55, 54, 54, 53, 52, 52, 51, 51, 51, 50, 50, 50, 50, 50, 50, 62,
+ 64, 65, 66, 68, 67, 66, 65, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 54,
+ 53, 52, 52, 51, 51, 51, 50, 50, 50, 49, 49, 49, 49, 61, 62, 63, 65, 66,
+ 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 54, 53, 53, 52, 52, 51,
+ 51, 50, 50, 50, 49, 49, 49, 49, 49, 59, 61, 62, 63, 64, 64, 63, 62, 61,
+ 60, 59, 58, 57, 56, 56, 55, 54, 53, 53, 52, 52, 51, 51, 50, 50, 50, 49,
+ 49, 49, 49, 49, 49, 58, 59, 60, 61, 63, 62, 61, 60, 60, 59, 58, 57, 56,
+ 56, 55, 54, 53, 53, 52, 52, 51, 51, 50, 50, 50, 49, 49, 49, 49, 49, 49,
+ 49, 57, 58, 59, 60, 61, 60, 60, 59, 58, 58, 57, 56, 55, 55, 54, 53, 53,
+ 52, 52, 51, 51, 50, 50, 50, 49, 49, 49, 49, 48, 48, 48, 48, 57, 58, 59,
+ 60, 61, 60, 60, 59, 58, 58, 57, 56, 55, 55, 54, 53, 53, 52, 52, 51, 51,
+ 50, 50, 50, 49, 49, 49, 49, 48, 48, 48, 48, 57, 58, 59, 60, 61, 60, 60,
+ 59, 58, 58, 57, 56, 55, 55, 54, 53, 53, 52, 52, 51, 51, 50, 50, 50, 49,
+ 49, 49, 49, 48, 48, 48, 48, 57, 58, 59, 60, 61, 60, 60, 59, 58, 58, 57,
+ 56, 55, 55, 54, 53, 53, 52, 52, 51, 51, 50, 50, 50, 49, 49, 49, 49, 48,
+ 48, 48, 48 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 46, 44, 38, 46, 41, 40, 37, 44, 40, 35, 32, 38, 37, 32, 30,
+ /* Size 8 */
+ 64, 75, 49, 47, 45, 42, 40, 37, 75, 57, 49, 52, 51, 48, 44, 41, 49, 49,
+ 44, 45, 45, 44, 42, 39, 47, 52, 45, 42, 41, 40, 38, 37, 45, 51, 45, 41,
+ 38, 37, 36, 35, 42, 48, 44, 40, 37, 35, 34, 33, 40, 44, 42, 38, 36, 34,
+ 33, 32, 37, 41, 39, 37, 35, 33, 32, 31,
+ /* Size 16 */
+ 64, 70, 75, 62, 49, 48, 47, 46, 45, 44, 42, 41, 40, 38, 37, 37, 70, 68,
+ 66, 58, 49, 49, 50, 49, 48, 47, 45, 44, 42, 40, 39, 39, 75, 66, 57, 53,
+ 49, 50, 52, 51, 51, 49, 48, 46, 44, 43, 41, 41, 62, 58, 53, 50, 46, 47,
+ 48, 48, 48, 47, 46, 44, 43, 42, 40, 40, 49, 49, 49, 46, 44, 44, 45, 45,
+ 45, 44, 44, 43, 42, 40, 39, 39, 48, 49, 50, 47, 44, 44, 43, 43, 43, 42,
+ 42, 41, 40, 39, 38, 38, 47, 50, 52, 48, 45, 43, 42, 41, 41, 40, 40, 39,
+ 38, 38, 37, 37, 46, 49, 51, 48, 45, 43, 41, 40, 39, 39, 38, 38, 37, 36,
+ 36, 36, 45, 48, 51, 48, 45, 43, 41, 39, 38, 37, 37, 36, 36, 35, 35, 35,
+ 44, 47, 49, 47, 44, 42, 40, 39, 37, 37, 36, 35, 35, 34, 34, 34, 42, 45,
+ 48, 46, 44, 42, 40, 38, 37, 36, 35, 34, 34, 33, 33, 33, 41, 44, 46, 44,
+ 43, 41, 39, 38, 36, 35, 34, 34, 33, 33, 32, 32, 40, 42, 44, 43, 42, 40,
+ 38, 37, 36, 35, 34, 33, 33, 32, 32, 32, 38, 40, 43, 42, 40, 39, 38, 36,
+ 35, 34, 33, 33, 32, 32, 31, 31, 37, 39, 41, 40, 39, 38, 37, 36, 35, 34,
+ 33, 32, 32, 31, 31, 31, 37, 39, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
+ 32, 31, 31, 31,
+ /* Size 32 */
+ 64, 67, 70, 72, 75, 69, 62, 56, 49, 49, 48, 48, 47, 47, 46, 46, 45, 45,
+ 44, 43, 42, 42, 41, 40, 40, 39, 38, 38, 37, 37, 37, 37, 67, 68, 69, 70,
+ 71, 65, 60, 54, 49, 49, 49, 49, 48, 48, 48, 47, 47, 46, 45, 45, 44, 43,
+ 42, 41, 41, 40, 39, 39, 38, 38, 38, 38, 70, 69, 68, 67, 66, 62, 58, 53,
+ 49, 49, 49, 49, 50, 49, 49, 48, 48, 47, 47, 46, 45, 44, 44, 43, 42, 41,
+ 40, 40, 39, 39, 39, 39, 72, 70, 67, 64, 62, 59, 55, 52, 49, 49, 50, 50,
+ 51, 50, 50, 50, 49, 49, 48, 47, 46, 46, 45, 44, 43, 42, 42, 41, 40, 40,
+ 40, 40, 75, 71, 66, 62, 57, 55, 53, 51, 49, 49, 50, 51, 52, 52, 51, 51,
+ 51, 50, 49, 49, 48, 47, 46, 45, 44, 44, 43, 42, 41, 41, 41, 41, 69, 65,
+ 62, 59, 55, 53, 51, 49, 47, 48, 49, 49, 50, 50, 50, 50, 49, 49, 48, 47,
+ 47, 46, 45, 44, 44, 43, 42, 41, 41, 41, 41, 41, 62, 60, 58, 55, 53, 51,
+ 50, 48, 46, 47, 47, 48, 48, 48, 48, 48, 48, 47, 47, 46, 46, 45, 44, 44,
+ 43, 42, 42, 41, 40, 40, 40, 40, 56, 54, 53, 52, 51, 49, 48, 47, 45, 46,
+ 46, 46, 47, 47, 47, 47, 46, 46, 46, 45, 45, 44, 43, 43, 42, 42, 41, 40,
+ 40, 40, 40, 40, 49, 49, 49, 49, 49, 47, 46, 45, 44, 44, 44, 45, 45, 45,
+ 45, 45, 45, 45, 44, 44, 44, 43, 43, 42, 42, 41, 40, 40, 39, 39, 39, 39,
+ 49, 49, 49, 49, 49, 48, 47, 46, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 43, 43, 43, 42, 42, 41, 41, 40, 40, 39, 39, 39, 39, 39, 48, 49, 49, 50,
+ 50, 49, 47, 46, 44, 44, 44, 44, 43, 43, 43, 43, 43, 43, 42, 42, 42, 41,
+ 41, 40, 40, 39, 39, 38, 38, 38, 38, 38, 48, 49, 49, 50, 51, 49, 48, 46,
+ 45, 44, 44, 43, 43, 42, 42, 42, 42, 41, 41, 41, 41, 40, 40, 39, 39, 39,
+ 38, 38, 37, 37, 37, 37, 47, 48, 50, 51, 52, 50, 48, 47, 45, 44, 43, 43,
+ 42, 41, 41, 41, 41, 40, 40, 40, 40, 39, 39, 39, 38, 38, 38, 37, 37, 37,
+ 37, 37, 47, 48, 49, 50, 52, 50, 48, 47, 45, 44, 43, 42, 41, 41, 41, 40,
+ 40, 40, 39, 39, 39, 39, 38, 38, 38, 37, 37, 37, 36, 36, 36, 36, 46, 48,
+ 49, 50, 51, 50, 48, 47, 45, 44, 43, 42, 41, 41, 40, 40, 39, 39, 39, 38,
+ 38, 38, 38, 37, 37, 37, 36, 36, 36, 36, 36, 36, 46, 47, 48, 50, 51, 50,
+ 48, 47, 45, 44, 43, 42, 41, 40, 40, 39, 39, 38, 38, 38, 37, 37, 37, 37,
+ 36, 36, 36, 35, 35, 35, 35, 35, 45, 47, 48, 49, 51, 49, 48, 46, 45, 44,
+ 43, 42, 41, 40, 39, 39, 38, 38, 37, 37, 37, 37, 36, 36, 36, 35, 35, 35,
+ 35, 35, 35, 35, 45, 46, 47, 49, 50, 49, 47, 46, 45, 44, 43, 41, 40, 40,
+ 39, 38, 38, 37, 37, 37, 36, 36, 36, 36, 35, 35, 35, 34, 34, 34, 34, 34,
+ 44, 45, 47, 48, 49, 48, 47, 46, 44, 43, 42, 41, 40, 39, 39, 38, 37, 37,
+ 37, 36, 36, 36, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34, 43, 45, 46, 47,
+ 49, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 38, 37, 37, 36, 36, 35, 35,
+ 35, 35, 34, 34, 34, 34, 33, 33, 33, 33, 42, 44, 45, 46, 48, 47, 46, 45,
+ 44, 43, 42, 41, 40, 39, 38, 37, 37, 36, 36, 35, 35, 35, 34, 34, 34, 34,
+ 33, 33, 33, 33, 33, 33, 42, 43, 44, 46, 47, 46, 45, 44, 43, 42, 41, 40,
+ 39, 39, 38, 37, 37, 36, 36, 35, 35, 34, 34, 34, 34, 33, 33, 33, 33, 33,
+ 33, 33, 41, 42, 44, 45, 46, 45, 44, 43, 43, 42, 41, 40, 39, 38, 38, 37,
+ 36, 36, 35, 35, 34, 34, 34, 34, 33, 33, 33, 33, 32, 32, 32, 32, 40, 41,
+ 43, 44, 45, 44, 44, 43, 42, 41, 40, 39, 39, 38, 37, 37, 36, 36, 35, 35,
+ 34, 34, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 40, 41, 42, 43, 44, 44,
+ 43, 42, 42, 41, 40, 39, 38, 38, 37, 36, 36, 35, 35, 34, 34, 34, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 39, 40, 41, 42, 44, 43, 42, 42, 41, 40,
+ 39, 39, 38, 37, 37, 36, 35, 35, 35, 34, 34, 33, 33, 33, 32, 32, 32, 32,
+ 31, 31, 31, 31, 38, 39, 40, 42, 43, 42, 42, 41, 40, 40, 39, 38, 38, 37,
+ 36, 36, 35, 35, 34, 34, 33, 33, 33, 32, 32, 32, 32, 31, 31, 31, 31, 31,
+ 38, 39, 40, 41, 42, 41, 41, 40, 40, 39, 38, 38, 37, 37, 36, 35, 35, 34,
+ 34, 34, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 37, 38, 39, 40,
+ 41, 41, 40, 40, 39, 39, 38, 37, 37, 36, 36, 35, 35, 34, 34, 33, 33, 33,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 37, 38, 39, 40, 41, 41, 40, 40,
+ 39, 39, 38, 37, 37, 36, 36, 35, 35, 34, 34, 33, 33, 33, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 37, 38, 39, 40, 41, 41, 40, 40, 39, 39, 38, 37,
+ 37, 36, 36, 35, 35, 34, 34, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31,
+ 31, 31, 37, 38, 39, 40, 41, 41, 40, 40, 39, 39, 38, 37, 37, 36, 36, 35,
+ 35, 34, 34, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 110, 78, 74, 64, 78, 69, 66, 61, 74, 66, 57, 53, 64, 61, 53, 49,
+ /* Size 8 */
+ 103, 122, 78, 75, 72, 67, 62, 58, 122, 92, 77, 83, 81, 76, 70, 65, 78,
+ 77, 70, 71, 71, 69, 66, 62, 75, 83, 71, 66, 64, 62, 60, 58, 72, 81, 71,
+ 64, 60, 58, 56, 54, 67, 76, 69, 62, 58, 55, 53, 51, 62, 70, 66, 60, 56,
+ 53, 50, 49, 58, 65, 62, 58, 54, 51, 49, 47,
+ /* Size 16 */
+ 105, 115, 124, 102, 80, 78, 77, 75, 73, 71, 68, 66, 63, 61, 59, 59, 115,
+ 112, 109, 94, 79, 80, 80, 79, 78, 75, 73, 70, 67, 65, 62, 62, 124, 109,
+ 94, 86, 79, 82, 84, 83, 83, 80, 77, 74, 72, 69, 66, 66, 102, 94, 86, 81,
+ 75, 77, 78, 78, 78, 76, 74, 71, 69, 67, 64, 64, 80, 79, 79, 75, 71, 72,
+ 73, 73, 73, 71, 70, 68, 67, 65, 63, 63, 78, 80, 82, 77, 72, 71, 70, 69,
+ 69, 68, 67, 65, 64, 62, 61, 61, 77, 80, 84, 78, 73, 70, 67, 66, 65, 64,
+ 63, 62, 61, 60, 59, 59, 75, 79, 83, 78, 73, 69, 66, 64, 63, 62, 61, 60,
+ 59, 58, 57, 57, 73, 78, 83, 78, 73, 69, 65, 63, 61, 60, 59, 58, 57, 56,
+ 55, 55, 71, 75, 80, 76, 71, 68, 64, 62, 60, 58, 57, 56, 55, 54, 53, 53,
+ 68, 73, 77, 74, 70, 67, 63, 61, 59, 57, 55, 54, 53, 53, 52, 52, 66, 70,
+ 74, 71, 68, 65, 62, 60, 58, 56, 54, 53, 52, 52, 51, 51, 63, 67, 72, 69,
+ 67, 64, 61, 59, 57, 55, 53, 52, 51, 51, 50, 50, 61, 65, 69, 67, 65, 62,
+ 60, 58, 56, 54, 53, 52, 51, 50, 49, 49, 59, 62, 66, 64, 63, 61, 59, 57,
+ 55, 53, 52, 51, 50, 49, 48, 48, 59, 62, 66, 64, 63, 61, 59, 57, 55, 53,
+ 52, 51, 50, 49, 48, 48,
+ /* Size 32 */
+ 106, 111, 116, 121, 126, 114, 103, 92, 80, 80, 79, 78, 77, 76, 76, 75,
+ 74, 73, 71, 70, 69, 68, 66, 65, 64, 63, 62, 60, 59, 59, 59, 59, 111,
+ 113, 114, 116, 118, 108, 99, 90, 80, 80, 80, 79, 79, 79, 78, 77, 76, 75,
+ 74, 72, 71, 70, 69, 67, 66, 65, 64, 62, 61, 61, 61, 61, 116, 114, 113,
+ 112, 110, 103, 95, 87, 80, 80, 81, 81, 81, 81, 80, 79, 79, 77, 76, 75,
+ 73, 72, 71, 69, 68, 67, 65, 64, 63, 63, 63, 63, 121, 116, 112, 107, 102,
+ 97, 91, 85, 80, 81, 81, 82, 83, 83, 82, 82, 81, 80, 78, 77, 76, 74, 73,
+ 71, 70, 69, 67, 66, 65, 65, 65, 65, 126, 118, 110, 102, 95, 91, 87, 83,
+ 79, 81, 82, 84, 85, 85, 84, 84, 83, 82, 81, 79, 78, 77, 75, 74, 72, 71,
+ 69, 68, 67, 67, 67, 67, 114, 108, 103, 97, 91, 87, 84, 81, 77, 79, 80,
+ 81, 82, 82, 81, 81, 81, 80, 79, 77, 76, 75, 74, 72, 71, 70, 68, 67, 66,
+ 66, 66, 66, 103, 99, 95, 91, 87, 84, 81, 78, 75, 76, 77, 78, 79, 79, 79,
+ 79, 78, 77, 76, 75, 75, 73, 72, 71, 70, 68, 67, 66, 65, 65, 65, 65, 92,
+ 90, 87, 85, 83, 81, 78, 76, 74, 74, 75, 75, 76, 76, 76, 76, 76, 75, 74,
+ 73, 73, 72, 71, 70, 68, 67, 66, 65, 64, 64, 64, 64, 80, 80, 80, 80, 79,
+ 77, 75, 74, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73, 72, 72, 71, 70, 69,
+ 68, 67, 66, 65, 64, 63, 63, 63, 63, 80, 80, 80, 81, 81, 79, 76, 74, 72,
+ 72, 72, 72, 72, 72, 72, 71, 71, 71, 70, 70, 69, 68, 68, 67, 66, 65, 64,
+ 63, 62, 62, 62, 62, 79, 80, 81, 81, 82, 80, 77, 75, 72, 72, 71, 71, 70,
+ 70, 70, 70, 69, 69, 68, 68, 67, 67, 66, 65, 64, 64, 63, 62, 61, 61, 61,
+ 61, 78, 79, 81, 82, 84, 81, 78, 75, 73, 72, 71, 70, 69, 69, 68, 68, 68,
+ 67, 67, 66, 66, 65, 64, 64, 63, 62, 62, 61, 60, 60, 60, 60, 77, 79, 81,
+ 83, 85, 82, 79, 76, 73, 72, 70, 69, 68, 67, 67, 66, 66, 65, 65, 64, 64,
+ 63, 63, 62, 62, 61, 60, 60, 59, 59, 59, 59, 76, 79, 81, 83, 85, 82, 79,
+ 76, 73, 72, 70, 69, 67, 66, 66, 65, 65, 64, 64, 63, 63, 62, 62, 61, 61,
+ 60, 59, 59, 58, 58, 58, 58, 76, 78, 80, 82, 84, 81, 79, 76, 73, 72, 70,
+ 68, 67, 66, 65, 64, 64, 63, 62, 62, 61, 61, 60, 60, 59, 59, 58, 58, 57,
+ 57, 57, 57, 75, 77, 79, 82, 84, 81, 79, 76, 73, 71, 70, 68, 66, 65, 64,
+ 63, 62, 62, 61, 61, 60, 60, 59, 59, 58, 58, 57, 57, 56, 56, 56, 56, 74,
+ 76, 79, 81, 83, 81, 78, 76, 73, 71, 69, 68, 66, 65, 64, 62, 61, 61, 60,
+ 60, 59, 59, 58, 58, 57, 57, 56, 56, 55, 55, 55, 55, 73, 75, 77, 80, 82,
+ 80, 77, 75, 73, 71, 69, 67, 65, 64, 63, 62, 61, 60, 60, 59, 58, 58, 57,
+ 57, 56, 56, 55, 55, 55, 55, 55, 55, 71, 74, 76, 78, 81, 79, 76, 74, 72,
+ 70, 68, 67, 65, 64, 62, 61, 60, 60, 59, 58, 58, 57, 57, 56, 56, 55, 55,
+ 54, 54, 54, 54, 54, 70, 72, 75, 77, 79, 77, 75, 73, 72, 70, 68, 66, 64,
+ 63, 62, 61, 60, 59, 58, 57, 57, 56, 56, 55, 55, 54, 54, 54, 53, 53, 53,
+ 53, 69, 71, 73, 76, 78, 76, 75, 73, 71, 69, 67, 66, 64, 63, 61, 60, 59,
+ 58, 58, 57, 56, 55, 55, 54, 54, 54, 53, 53, 52, 52, 52, 52, 68, 70, 72,
+ 74, 77, 75, 73, 72, 70, 68, 67, 65, 63, 62, 61, 60, 59, 58, 57, 56, 55,
+ 55, 54, 54, 53, 53, 53, 52, 52, 52, 52, 52, 66, 69, 71, 73, 75, 74, 72,
+ 71, 69, 68, 66, 64, 63, 62, 60, 59, 58, 57, 57, 56, 55, 54, 54, 53, 53,
+ 52, 52, 52, 51, 51, 51, 51, 65, 67, 69, 71, 74, 72, 71, 70, 68, 67, 65,
+ 64, 62, 61, 60, 59, 58, 57, 56, 55, 54, 54, 53, 53, 52, 52, 52, 51, 51,
+ 51, 51, 51, 64, 66, 68, 70, 72, 71, 70, 68, 67, 66, 64, 63, 62, 61, 59,
+ 58, 57, 56, 56, 55, 54, 53, 53, 52, 52, 51, 51, 51, 50, 50, 50, 50, 63,
+ 65, 67, 69, 71, 70, 68, 67, 66, 65, 64, 62, 61, 60, 59, 58, 57, 56, 55,
+ 54, 54, 53, 52, 52, 51, 51, 51, 50, 50, 50, 50, 50, 62, 64, 65, 67, 69,
+ 68, 67, 66, 65, 64, 63, 62, 60, 59, 58, 57, 56, 55, 55, 54, 53, 53, 52,
+ 52, 51, 51, 50, 50, 49, 49, 49, 49, 60, 62, 64, 66, 68, 67, 66, 65, 64,
+ 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 54, 53, 52, 52, 51, 51, 50, 50,
+ 49, 49, 49, 49, 49, 59, 61, 63, 65, 67, 66, 65, 64, 63, 62, 61, 60, 59,
+ 58, 57, 56, 55, 55, 54, 53, 52, 52, 51, 51, 50, 50, 49, 49, 49, 49, 49,
+ 49, 59, 61, 63, 65, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55,
+ 55, 54, 53, 52, 52, 51, 51, 50, 50, 49, 49, 49, 49, 49, 49, 59, 61, 63,
+ 65, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 55, 54, 53, 52,
+ 52, 51, 51, 50, 50, 49, 49, 49, 49, 49, 49, 59, 61, 63, 65, 67, 66, 65,
+ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 55, 54, 53, 52, 52, 51, 51, 50,
+ 50, 49, 49, 49, 49, 49, 49 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 60, 45, 36, 60, 47, 40, 35, 45, 40, 34, 32, 36, 35, 32, 31,
+ /* Size 8 */
+ 64, 77, 73, 61, 52, 45, 41, 38, 77, 70, 73, 65, 56, 49, 44, 40, 73, 73,
+ 58, 53, 49, 45, 42, 39, 61, 65, 53, 47, 44, 41, 39, 38, 52, 56, 49, 44,
+ 41, 39, 37, 36, 45, 49, 45, 41, 39, 37, 36, 35, 41, 44, 42, 39, 37, 36,
+ 35, 34, 38, 40, 39, 38, 36, 35, 34, 34,
+ /* Size 16 */
+ 64, 71, 77, 75, 73, 67, 61, 56, 52, 48, 45, 43, 41, 40, 38, 38, 71, 72,
+ 74, 73, 73, 68, 63, 59, 54, 51, 47, 45, 42, 41, 39, 39, 77, 74, 70, 71,
+ 73, 69, 65, 61, 56, 53, 49, 46, 44, 42, 40, 40, 75, 73, 71, 69, 66, 62,
+ 59, 56, 53, 50, 47, 45, 43, 41, 40, 40, 73, 73, 73, 66, 58, 56, 53, 51,
+ 49, 47, 45, 43, 42, 40, 39, 39, 67, 68, 69, 62, 56, 53, 50, 48, 46, 45,
+ 43, 42, 41, 39, 38, 38, 61, 63, 65, 59, 53, 50, 47, 45, 44, 43, 41, 40,
+ 39, 38, 38, 38, 56, 59, 61, 56, 51, 48, 45, 44, 42, 41, 40, 39, 38, 38,
+ 37, 37, 52, 54, 56, 53, 49, 46, 44, 42, 41, 40, 39, 38, 37, 37, 36, 36,
+ 48, 51, 53, 50, 47, 45, 43, 41, 40, 39, 38, 37, 37, 36, 36, 36, 45, 47,
+ 49, 47, 45, 43, 41, 40, 39, 38, 37, 37, 36, 36, 35, 35, 43, 45, 46, 45,
+ 43, 42, 40, 39, 38, 37, 37, 36, 36, 35, 35, 35, 41, 42, 44, 43, 42, 41,
+ 39, 38, 37, 37, 36, 36, 35, 35, 34, 34, 40, 41, 42, 41, 40, 39, 38, 38,
+ 37, 36, 36, 35, 35, 34, 34, 34, 38, 39, 40, 40, 39, 38, 38, 37, 36, 36,
+ 35, 35, 34, 34, 34, 34, 38, 39, 40, 40, 39, 38, 38, 37, 36, 36, 35, 35,
+ 34, 34, 34, 34,
+ /* Size 32 */
+ 64, 67, 71, 74, 77, 76, 75, 74, 73, 70, 67, 64, 61, 59, 56, 54, 52, 50,
+ 48, 47, 45, 44, 43, 42, 41, 40, 40, 39, 38, 38, 38, 38, 67, 69, 71, 74,
+ 76, 75, 74, 74, 73, 70, 68, 65, 62, 60, 58, 55, 53, 51, 50, 48, 46, 45,
+ 44, 43, 42, 41, 40, 40, 39, 39, 39, 39, 71, 71, 72, 73, 74, 74, 73, 73,
+ 73, 71, 68, 66, 63, 61, 59, 56, 54, 52, 51, 49, 47, 46, 45, 44, 42, 42,
+ 41, 40, 39, 39, 39, 39, 74, 74, 73, 73, 72, 72, 72, 73, 73, 71, 69, 66,
+ 64, 62, 60, 57, 55, 53, 52, 50, 48, 47, 46, 44, 43, 42, 42, 41, 40, 40,
+ 40, 40, 77, 76, 74, 72, 70, 71, 71, 72, 73, 71, 69, 67, 65, 63, 61, 59,
+ 56, 54, 53, 51, 49, 48, 46, 45, 44, 43, 42, 41, 40, 40, 40, 40, 76, 75,
+ 74, 72, 71, 70, 70, 70, 69, 67, 66, 64, 62, 60, 58, 56, 54, 53, 51, 50,
+ 48, 47, 46, 45, 43, 43, 42, 41, 40, 40, 40, 40, 75, 74, 73, 72, 71, 70,
+ 69, 67, 66, 64, 62, 61, 59, 58, 56, 54, 53, 51, 50, 48, 47, 46, 45, 44,
+ 43, 42, 41, 41, 40, 40, 40, 40, 74, 74, 73, 73, 72, 70, 67, 65, 62, 61,
+ 59, 58, 56, 55, 54, 52, 51, 50, 49, 47, 46, 45, 44, 43, 42, 42, 41, 40,
+ 39, 39, 39, 39, 73, 73, 73, 73, 73, 69, 66, 62, 58, 57, 56, 55, 53, 52,
+ 51, 50, 49, 48, 47, 46, 45, 44, 43, 43, 42, 41, 40, 40, 39, 39, 39, 39,
+ 70, 70, 71, 71, 71, 67, 64, 61, 57, 56, 55, 53, 52, 51, 50, 49, 48, 47,
+ 46, 45, 44, 43, 43, 42, 41, 41, 40, 39, 39, 39, 39, 39, 67, 68, 68, 69,
+ 69, 66, 62, 59, 56, 55, 53, 52, 50, 49, 48, 47, 46, 46, 45, 44, 43, 43,
+ 42, 41, 41, 40, 39, 39, 38, 38, 38, 38, 64, 65, 66, 66, 67, 64, 61, 58,
+ 55, 53, 52, 50, 49, 48, 47, 46, 45, 44, 44, 43, 42, 42, 41, 41, 40, 39,
+ 39, 38, 38, 38, 38, 38, 61, 62, 63, 64, 65, 62, 59, 56, 53, 52, 50, 49,
+ 47, 46, 45, 45, 44, 43, 43, 42, 41, 41, 40, 40, 39, 39, 38, 38, 38, 38,
+ 38, 38, 59, 60, 61, 62, 63, 60, 58, 55, 52, 51, 49, 48, 46, 45, 45, 44,
+ 43, 42, 42, 41, 41, 40, 40, 39, 39, 38, 38, 38, 37, 37, 37, 37, 56, 58,
+ 59, 60, 61, 58, 56, 54, 51, 50, 48, 47, 45, 45, 44, 43, 42, 42, 41, 41,
+ 40, 40, 39, 39, 38, 38, 38, 37, 37, 37, 37, 37, 54, 55, 56, 57, 59, 56,
+ 54, 52, 50, 49, 47, 46, 45, 44, 43, 42, 41, 41, 40, 40, 39, 39, 39, 38,
+ 38, 38, 37, 37, 37, 37, 37, 37, 52, 53, 54, 55, 56, 54, 53, 51, 49, 48,
+ 46, 45, 44, 43, 42, 41, 41, 40, 40, 39, 39, 38, 38, 38, 37, 37, 37, 37,
+ 36, 36, 36, 36, 50, 51, 52, 53, 54, 53, 51, 50, 48, 47, 46, 44, 43, 42,
+ 42, 41, 40, 40, 39, 39, 38, 38, 38, 37, 37, 37, 37, 36, 36, 36, 36, 36,
+ 48, 50, 51, 52, 53, 51, 50, 49, 47, 46, 45, 44, 43, 42, 41, 40, 40, 39,
+ 39, 38, 38, 38, 37, 37, 37, 36, 36, 36, 36, 36, 36, 36, 47, 48, 49, 50,
+ 51, 50, 48, 47, 46, 45, 44, 43, 42, 41, 41, 40, 39, 39, 38, 38, 38, 37,
+ 37, 37, 36, 36, 36, 36, 35, 35, 35, 35, 45, 46, 47, 48, 49, 48, 47, 46,
+ 45, 44, 43, 42, 41, 41, 40, 39, 39, 38, 38, 38, 37, 37, 37, 36, 36, 36,
+ 36, 35, 35, 35, 35, 35, 44, 45, 46, 47, 48, 47, 46, 45, 44, 43, 43, 42,
+ 41, 40, 40, 39, 38, 38, 38, 37, 37, 37, 36, 36, 36, 36, 35, 35, 35, 35,
+ 35, 35, 43, 44, 45, 46, 46, 46, 45, 44, 43, 43, 42, 41, 40, 40, 39, 39,
+ 38, 38, 37, 37, 37, 36, 36, 36, 36, 35, 35, 35, 35, 35, 35, 35, 42, 43,
+ 44, 44, 45, 45, 44, 43, 43, 42, 41, 41, 40, 39, 39, 38, 38, 37, 37, 37,
+ 36, 36, 36, 36, 35, 35, 35, 35, 35, 35, 35, 35, 41, 42, 42, 43, 44, 43,
+ 43, 42, 42, 41, 41, 40, 39, 39, 38, 38, 37, 37, 37, 36, 36, 36, 36, 35,
+ 35, 35, 35, 35, 34, 34, 34, 34, 40, 41, 42, 42, 43, 43, 42, 42, 41, 41,
+ 40, 39, 39, 38, 38, 38, 37, 37, 36, 36, 36, 36, 35, 35, 35, 35, 35, 34,
+ 34, 34, 34, 34, 40, 40, 41, 42, 42, 42, 41, 41, 40, 40, 39, 39, 38, 38,
+ 38, 37, 37, 37, 36, 36, 36, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34,
+ 39, 40, 40, 41, 41, 41, 41, 40, 40, 39, 39, 38, 38, 38, 37, 37, 37, 36,
+ 36, 36, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34, 34, 38, 39, 39, 40,
+ 40, 40, 40, 39, 39, 39, 38, 38, 38, 37, 37, 37, 36, 36, 36, 35, 35, 35,
+ 35, 35, 34, 34, 34, 34, 34, 34, 34, 34, 38, 39, 39, 40, 40, 40, 40, 39,
+ 39, 39, 38, 38, 38, 37, 37, 37, 36, 36, 36, 35, 35, 35, 35, 35, 34, 34,
+ 34, 34, 34, 34, 34, 34, 38, 39, 39, 40, 40, 40, 40, 39, 39, 39, 38, 38,
+ 38, 37, 37, 37, 36, 36, 36, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34,
+ 34, 34, 38, 39, 39, 40, 40, 40, 40, 39, 39, 39, 38, 38, 38, 37, 37, 37,
+ 36, 36, 36, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34, 34, 34 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 109, 102, 75, 59, 102, 79, 66, 57, 75, 66, 56, 52, 59, 57, 52, 49,
+ /* Size 8 */
+ 96, 117, 111, 92, 77, 67, 60, 55, 117, 106, 110, 98, 84, 73, 64, 59,
+ 111, 110, 87, 80, 73, 66, 61, 57, 92, 98, 80, 69, 64, 60, 57, 55, 77,
+ 84, 73, 64, 59, 56, 54, 52, 67, 73, 66, 60, 56, 54, 52, 51, 60, 64, 61,
+ 57, 54, 52, 51, 49, 55, 59, 57, 55, 52, 51, 49, 49,
+ /* Size 16 */
+ 98, 109, 120, 116, 113, 103, 94, 86, 78, 73, 68, 65, 61, 59, 57, 57,
+ 109, 111, 114, 113, 112, 105, 97, 89, 82, 76, 71, 67, 63, 61, 58, 58,
+ 120, 114, 108, 110, 112, 106, 100, 93, 86, 80, 74, 70, 66, 63, 60, 60,
+ 116, 113, 110, 105, 101, 96, 91, 85, 80, 75, 71, 67, 64, 62, 59, 59,
+ 113, 112, 112, 101, 89, 85, 81, 78, 74, 71, 68, 65, 62, 60, 58, 58, 103,
+ 105, 106, 96, 85, 81, 76, 73, 70, 67, 65, 62, 60, 59, 57, 57, 94, 97,
+ 100, 91, 81, 76, 71, 68, 66, 64, 62, 60, 58, 57, 56, 56, 86, 89, 93, 85,
+ 78, 73, 68, 66, 63, 61, 60, 58, 57, 56, 55, 55, 78, 82, 86, 80, 74, 70,
+ 66, 63, 61, 59, 57, 56, 55, 54, 53, 53, 73, 76, 80, 75, 71, 67, 64, 61,
+ 59, 58, 56, 55, 54, 53, 53, 53, 68, 71, 74, 71, 68, 65, 62, 60, 57, 56,
+ 55, 54, 53, 52, 52, 52, 65, 67, 70, 67, 65, 62, 60, 58, 56, 55, 54, 53,
+ 52, 52, 51, 51, 61, 63, 66, 64, 62, 60, 58, 57, 55, 54, 53, 52, 52, 51,
+ 51, 51, 59, 61, 63, 62, 60, 59, 57, 56, 54, 53, 52, 52, 51, 51, 50, 50,
+ 57, 58, 60, 59, 58, 57, 56, 55, 53, 53, 52, 51, 51, 50, 50, 50, 57, 58,
+ 60, 59, 58, 57, 56, 55, 53, 53, 52, 51, 51, 50, 50, 50,
+ /* Size 32 */
+ 99, 105, 110, 115, 121, 119, 118, 116, 114, 109, 104, 100, 95, 91, 87,
+ 83, 79, 76, 74, 71, 69, 67, 65, 63, 62, 61, 59, 58, 57, 57, 57, 57, 105,
+ 108, 111, 115, 118, 117, 116, 115, 114, 110, 105, 101, 96, 93, 89, 85,
+ 81, 78, 76, 73, 70, 68, 67, 65, 63, 62, 61, 59, 58, 58, 58, 58, 110,
+ 111, 113, 114, 115, 115, 114, 114, 114, 110, 106, 102, 98, 94, 90, 87,
+ 83, 80, 77, 74, 72, 70, 68, 66, 64, 63, 62, 60, 59, 59, 59, 59, 115,
+ 115, 114, 113, 112, 113, 113, 113, 113, 110, 106, 103, 100, 96, 92, 88,
+ 85, 82, 79, 76, 73, 71, 69, 67, 65, 64, 63, 61, 60, 60, 60, 60, 121,
+ 118, 115, 112, 110, 110, 111, 112, 113, 110, 107, 104, 101, 98, 94, 90,
+ 87, 84, 81, 78, 75, 73, 71, 69, 67, 65, 64, 62, 61, 61, 61, 61, 119,
+ 117, 115, 113, 110, 110, 109, 108, 107, 105, 102, 99, 96, 93, 90, 87,
+ 84, 81, 78, 76, 73, 71, 69, 68, 66, 64, 63, 62, 60, 60, 60, 60, 118,
+ 116, 114, 113, 111, 109, 106, 104, 102, 99, 97, 94, 92, 89, 86, 83, 81,
+ 78, 76, 74, 72, 70, 68, 66, 65, 63, 62, 61, 60, 60, 60, 60, 116, 115,
+ 114, 113, 112, 108, 104, 100, 96, 94, 91, 89, 87, 85, 82, 80, 78, 76,
+ 74, 72, 70, 68, 67, 65, 64, 63, 62, 60, 59, 59, 59, 59, 114, 114, 114,
+ 113, 113, 107, 102, 96, 90, 88, 86, 84, 82, 80, 79, 77, 75, 73, 72, 70,
+ 68, 67, 66, 64, 63, 62, 61, 60, 59, 59, 59, 59, 109, 110, 110, 110, 110,
+ 105, 99, 94, 88, 86, 84, 82, 79, 78, 76, 74, 73, 71, 70, 68, 67, 66, 64,
+ 63, 62, 61, 60, 59, 58, 58, 58, 58, 104, 105, 106, 106, 107, 102, 97,
+ 91, 86, 84, 81, 79, 77, 75, 74, 72, 71, 69, 68, 67, 65, 64, 63, 62, 61,
+ 60, 59, 58, 57, 57, 57, 57, 100, 101, 102, 103, 104, 99, 94, 89, 84, 82,
+ 79, 77, 74, 73, 71, 70, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 58,
+ 57, 57, 57, 57, 95, 96, 98, 100, 101, 96, 92, 87, 82, 79, 77, 74, 72,
+ 70, 69, 68, 66, 65, 64, 63, 62, 61, 61, 60, 59, 58, 58, 57, 56, 56, 56,
+ 56, 91, 93, 94, 96, 98, 93, 89, 85, 80, 78, 75, 73, 70, 69, 68, 66, 65,
+ 64, 63, 62, 61, 60, 60, 59, 58, 58, 57, 56, 56, 56, 56, 56, 87, 89, 90,
+ 92, 94, 90, 86, 82, 79, 76, 74, 71, 69, 68, 66, 65, 64, 63, 62, 61, 60,
+ 59, 59, 58, 57, 57, 56, 56, 55, 55, 55, 55, 83, 85, 87, 88, 90, 87, 83,
+ 80, 77, 74, 72, 70, 68, 66, 65, 64, 62, 62, 61, 60, 59, 59, 58, 57, 57,
+ 56, 56, 55, 55, 55, 55, 55, 79, 81, 83, 85, 87, 84, 81, 78, 75, 73, 71,
+ 68, 66, 65, 64, 62, 61, 60, 60, 59, 58, 58, 57, 56, 56, 55, 55, 54, 54,
+ 54, 54, 54, 76, 78, 80, 82, 84, 81, 78, 76, 73, 71, 69, 67, 65, 64, 63,
+ 62, 60, 60, 59, 58, 57, 57, 56, 56, 55, 55, 54, 54, 54, 54, 54, 54, 74,
+ 76, 77, 79, 81, 78, 76, 74, 72, 70, 68, 66, 64, 63, 62, 61, 60, 59, 58,
+ 57, 57, 56, 56, 55, 55, 54, 54, 54, 53, 53, 53, 53, 71, 73, 74, 76, 78,
+ 76, 74, 72, 70, 68, 67, 65, 63, 62, 61, 60, 59, 58, 57, 57, 56, 56, 55,
+ 55, 54, 54, 53, 53, 53, 53, 53, 53, 69, 70, 72, 73, 75, 73, 72, 70, 68,
+ 67, 65, 64, 62, 61, 60, 59, 58, 57, 57, 56, 55, 55, 55, 54, 54, 53, 53,
+ 53, 52, 52, 52, 52, 67, 68, 70, 71, 73, 71, 70, 68, 67, 66, 64, 63, 61,
+ 60, 59, 59, 58, 57, 56, 56, 55, 55, 54, 54, 53, 53, 53, 52, 52, 52, 52,
+ 52, 65, 67, 68, 69, 71, 69, 68, 67, 66, 64, 63, 62, 61, 60, 59, 58, 57,
+ 56, 56, 55, 55, 54, 54, 53, 53, 53, 52, 52, 52, 52, 52, 52, 63, 65, 66,
+ 67, 69, 68, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 56, 55, 55, 54,
+ 54, 53, 53, 52, 52, 52, 52, 51, 51, 51, 51, 62, 63, 64, 65, 67, 66, 65,
+ 64, 63, 62, 61, 60, 59, 58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 52, 52,
+ 52, 52, 51, 51, 51, 51, 51, 61, 62, 63, 64, 65, 64, 63, 63, 62, 61, 60,
+ 59, 58, 58, 57, 56, 55, 55, 54, 54, 53, 53, 53, 52, 52, 52, 51, 51, 51,
+ 51, 51, 51, 59, 61, 62, 63, 64, 63, 62, 62, 61, 60, 59, 58, 58, 57, 56,
+ 56, 55, 54, 54, 53, 53, 53, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 58,
+ 59, 60, 61, 62, 62, 61, 60, 60, 59, 58, 58, 57, 56, 56, 55, 54, 54, 54,
+ 53, 53, 52, 52, 52, 51, 51, 51, 51, 50, 50, 50, 50, 57, 58, 59, 60, 61,
+ 60, 60, 59, 59, 58, 57, 57, 56, 56, 55, 55, 54, 54, 53, 53, 52, 52, 52,
+ 51, 51, 51, 51, 50, 50, 50, 50, 50, 57, 58, 59, 60, 61, 60, 60, 59, 59,
+ 58, 57, 57, 56, 56, 55, 55, 54, 54, 53, 53, 52, 52, 52, 51, 51, 51, 51,
+ 50, 50, 50, 50, 50, 57, 58, 59, 60, 61, 60, 60, 59, 59, 58, 57, 57, 56,
+ 56, 55, 55, 54, 54, 53, 53, 52, 52, 52, 51, 51, 51, 51, 50, 50, 50, 50,
+ 50, 57, 58, 59, 60, 61, 60, 60, 59, 59, 58, 57, 57, 56, 56, 55, 55, 54,
+ 54, 53, 53, 52, 52, 52, 51, 51, 51, 51, 50, 50, 50, 50, 50 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 48, 46, 40, 48, 43, 42, 39, 46, 42, 37, 35, 40, 39, 35, 33,
+ /* Size 8 */
+ 64, 74, 50, 49, 47, 44, 42, 39, 74, 58, 50, 53, 52, 49, 46, 43, 50, 50,
+ 46, 47, 47, 45, 44, 41, 49, 53, 47, 44, 43, 42, 41, 39, 47, 52, 47, 43,
+ 40, 39, 38, 37, 44, 49, 45, 42, 39, 38, 37, 36, 42, 46, 44, 41, 38, 37,
+ 35, 35, 39, 43, 41, 39, 37, 36, 35, 34,
+ /* Size 16 */
+ 64, 69, 74, 62, 50, 50, 49, 48, 47, 46, 44, 43, 42, 41, 39, 39, 69, 68,
+ 66, 58, 50, 51, 51, 50, 50, 48, 47, 45, 44, 43, 41, 41, 74, 66, 58, 54,
+ 50, 51, 53, 52, 52, 51, 49, 48, 46, 45, 43, 43, 62, 58, 54, 51, 48, 49,
+ 50, 50, 49, 48, 47, 46, 45, 44, 42, 42, 50, 50, 50, 48, 46, 46, 47, 47,
+ 47, 46, 45, 45, 44, 43, 41, 41, 50, 51, 51, 49, 46, 46, 45, 45, 45, 44,
+ 44, 43, 42, 41, 40, 40, 49, 51, 53, 50, 47, 45, 44, 43, 43, 42, 42, 41,
+ 41, 40, 39, 39, 48, 50, 52, 50, 47, 45, 43, 42, 42, 41, 41, 40, 39, 39,
+ 38, 38, 47, 50, 52, 49, 47, 45, 43, 42, 40, 40, 39, 39, 38, 38, 37, 37,
+ 46, 48, 51, 48, 46, 44, 42, 41, 40, 39, 38, 38, 37, 37, 37, 37, 44, 47,
+ 49, 47, 45, 44, 42, 41, 39, 38, 38, 37, 37, 36, 36, 36, 43, 45, 48, 46,
+ 45, 43, 41, 40, 39, 38, 37, 37, 36, 36, 35, 35, 42, 44, 46, 45, 44, 42,
+ 41, 39, 38, 37, 37, 36, 35, 35, 35, 35, 41, 43, 45, 44, 43, 41, 40, 39,
+ 38, 37, 36, 36, 35, 35, 34, 34, 39, 41, 43, 42, 41, 40, 39, 38, 37, 37,
+ 36, 35, 35, 34, 34, 34, 39, 41, 43, 42, 41, 40, 39, 38, 37, 37, 36, 35,
+ 35, 34, 34, 34,
+ /* Size 32 */
+ 64, 67, 69, 72, 74, 68, 62, 56, 50, 50, 50, 49, 49, 48, 48, 48, 47, 46,
+ 46, 45, 44, 44, 43, 42, 42, 41, 41, 40, 39, 39, 39, 39, 67, 67, 68, 69,
+ 70, 65, 60, 55, 50, 50, 50, 50, 50, 49, 49, 49, 48, 48, 47, 46, 46, 45,
+ 44, 44, 43, 42, 42, 41, 40, 40, 40, 40, 69, 68, 68, 67, 66, 62, 58, 54,
+ 50, 50, 51, 51, 51, 51, 50, 50, 50, 49, 48, 48, 47, 46, 45, 45, 44, 43,
+ 43, 42, 41, 41, 41, 41, 72, 69, 67, 64, 62, 59, 56, 53, 50, 51, 51, 51,
+ 52, 52, 51, 51, 51, 50, 49, 49, 48, 47, 47, 46, 45, 44, 44, 43, 42, 42,
+ 42, 42, 74, 70, 66, 62, 58, 56, 54, 52, 50, 51, 51, 52, 53, 53, 52, 52,
+ 52, 51, 51, 50, 49, 48, 48, 47, 46, 45, 45, 44, 43, 43, 43, 43, 68, 65,
+ 62, 59, 56, 54, 52, 51, 49, 50, 50, 51, 51, 51, 51, 51, 51, 50, 50, 49,
+ 48, 48, 47, 46, 45, 45, 44, 43, 43, 43, 43, 43, 62, 60, 58, 56, 54, 52,
+ 51, 49, 48, 48, 49, 49, 50, 50, 50, 49, 49, 49, 48, 48, 47, 47, 46, 45,
+ 45, 44, 44, 43, 42, 42, 42, 42, 56, 55, 54, 53, 52, 51, 49, 48, 47, 47,
+ 48, 48, 48, 48, 48, 48, 48, 48, 47, 47, 46, 46, 45, 45, 44, 44, 43, 42,
+ 42, 42, 42, 42, 50, 50, 50, 50, 50, 49, 48, 47, 46, 46, 46, 46, 47, 47,
+ 47, 47, 47, 46, 46, 46, 45, 45, 45, 44, 44, 43, 43, 42, 41, 41, 41, 41,
+ 50, 50, 50, 51, 51, 50, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 45,
+ 45, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 41, 41, 41, 50, 50, 51, 51,
+ 51, 50, 49, 48, 46, 46, 46, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 43,
+ 43, 42, 42, 42, 41, 41, 40, 40, 40, 40, 49, 50, 51, 51, 52, 51, 49, 48,
+ 46, 46, 45, 45, 44, 44, 44, 44, 44, 43, 43, 43, 43, 42, 42, 42, 41, 41,
+ 41, 40, 40, 40, 40, 40, 49, 50, 51, 52, 53, 51, 50, 48, 47, 46, 45, 44,
+ 44, 43, 43, 43, 43, 42, 42, 42, 42, 42, 41, 41, 41, 40, 40, 40, 39, 39,
+ 39, 39, 48, 49, 51, 52, 53, 51, 50, 48, 47, 46, 45, 44, 43, 43, 43, 42,
+ 42, 42, 42, 41, 41, 41, 41, 40, 40, 40, 39, 39, 39, 39, 39, 39, 48, 49,
+ 50, 51, 52, 51, 50, 48, 47, 46, 45, 44, 43, 43, 42, 42, 42, 41, 41, 41,
+ 41, 40, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 48, 49, 50, 51, 52, 51,
+ 49, 48, 47, 46, 45, 44, 43, 42, 42, 42, 41, 41, 40, 40, 40, 40, 39, 39,
+ 39, 39, 38, 38, 38, 38, 38, 38, 47, 48, 50, 51, 52, 51, 49, 48, 47, 46,
+ 45, 44, 43, 42, 42, 41, 40, 40, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38,
+ 37, 37, 37, 37, 46, 48, 49, 50, 51, 50, 49, 48, 46, 45, 44, 43, 42, 42,
+ 41, 41, 40, 40, 40, 39, 39, 39, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37,
+ 46, 47, 48, 49, 51, 50, 48, 47, 46, 45, 44, 43, 42, 42, 41, 40, 40, 40,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 37, 45, 46, 48, 49,
+ 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 41, 40, 40, 39, 39, 38, 38, 38,
+ 38, 37, 37, 37, 37, 36, 36, 36, 36, 36, 44, 46, 47, 48, 49, 48, 47, 46,
+ 45, 45, 44, 43, 42, 41, 41, 40, 39, 39, 38, 38, 38, 37, 37, 37, 37, 36,
+ 36, 36, 36, 36, 36, 36, 44, 45, 46, 47, 48, 48, 47, 46, 45, 44, 43, 42,
+ 42, 41, 40, 40, 39, 39, 38, 38, 37, 37, 37, 37, 36, 36, 36, 36, 35, 35,
+ 35, 35, 43, 44, 45, 47, 48, 47, 46, 45, 45, 44, 43, 42, 41, 41, 40, 39,
+ 39, 38, 38, 38, 37, 37, 37, 36, 36, 36, 36, 35, 35, 35, 35, 35, 42, 44,
+ 45, 46, 47, 46, 45, 45, 44, 43, 42, 42, 41, 40, 40, 39, 39, 38, 38, 37,
+ 37, 37, 36, 36, 36, 36, 35, 35, 35, 35, 35, 35, 42, 43, 44, 45, 46, 45,
+ 45, 44, 44, 43, 42, 41, 41, 40, 39, 39, 38, 38, 37, 37, 37, 36, 36, 36,
+ 35, 35, 35, 35, 35, 35, 35, 35, 41, 42, 43, 44, 45, 45, 44, 44, 43, 42,
+ 42, 41, 40, 40, 39, 39, 38, 38, 37, 37, 36, 36, 36, 36, 35, 35, 35, 35,
+ 34, 34, 34, 34, 41, 42, 43, 44, 45, 44, 44, 43, 43, 42, 41, 41, 40, 39,
+ 39, 38, 38, 37, 37, 37, 36, 36, 36, 35, 35, 35, 35, 34, 34, 34, 34, 34,
+ 40, 41, 42, 43, 44, 43, 43, 42, 42, 41, 41, 40, 40, 39, 39, 38, 38, 37,
+ 37, 36, 36, 36, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34, 39, 40, 41, 42,
+ 43, 43, 42, 42, 41, 41, 40, 40, 39, 39, 38, 38, 37, 37, 37, 36, 36, 35,
+ 35, 35, 35, 34, 34, 34, 34, 34, 34, 34, 39, 40, 41, 42, 43, 43, 42, 42,
+ 41, 41, 40, 40, 39, 39, 38, 38, 37, 37, 37, 36, 36, 35, 35, 35, 35, 34,
+ 34, 34, 34, 34, 34, 34, 39, 40, 41, 42, 43, 43, 42, 42, 41, 41, 40, 40,
+ 39, 39, 38, 38, 37, 37, 37, 36, 36, 35, 35, 35, 35, 34, 34, 34, 34, 34,
+ 34, 34, 39, 40, 41, 42, 43, 43, 42, 42, 41, 41, 40, 40, 39, 39, 38, 38,
+ 37, 37, 37, 36, 36, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34, 34 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 103, 76, 72, 63, 76, 68, 66, 61, 72, 66, 58, 54, 63, 61, 54, 51,
+ /* Size 8 */
+ 98, 114, 76, 73, 71, 66, 62, 58, 114, 88, 75, 80, 78, 74, 69, 64, 76,
+ 75, 69, 70, 70, 68, 65, 62, 73, 80, 70, 65, 64, 62, 60, 58, 71, 78, 70,
+ 64, 60, 58, 57, 55, 66, 74, 68, 62, 58, 56, 54, 53, 62, 69, 65, 60, 57,
+ 54, 52, 51, 58, 64, 62, 58, 55, 53, 51, 49,
+ /* Size 16 */
+ 99, 107, 115, 96, 77, 76, 74, 73, 72, 69, 67, 65, 63, 61, 59, 59, 107,
+ 105, 102, 90, 77, 77, 78, 77, 76, 73, 71, 69, 67, 64, 62, 62, 115, 102,
+ 89, 83, 76, 79, 81, 80, 80, 77, 75, 73, 70, 68, 65, 65, 96, 90, 83, 78,
+ 73, 74, 76, 76, 75, 74, 72, 70, 68, 66, 64, 64, 77, 77, 76, 73, 70, 70,
+ 71, 71, 71, 70, 69, 68, 66, 64, 63, 63, 76, 77, 79, 74, 70, 69, 69, 68,
+ 68, 67, 66, 65, 64, 62, 61, 61, 74, 78, 81, 76, 71, 69, 66, 65, 65, 64,
+ 63, 62, 61, 60, 59, 59, 73, 77, 80, 76, 71, 68, 65, 64, 63, 62, 61, 60,
+ 59, 58, 57, 57, 72, 76, 80, 75, 71, 68, 65, 63, 61, 60, 59, 58, 57, 57,
+ 56, 56, 69, 73, 77, 74, 70, 67, 64, 62, 60, 59, 58, 57, 56, 55, 55, 55,
+ 67, 71, 75, 72, 69, 66, 63, 61, 59, 58, 56, 56, 55, 54, 53, 53, 65, 69,
+ 73, 70, 68, 65, 62, 60, 58, 57, 56, 55, 54, 53, 52, 52, 63, 67, 70, 68,
+ 66, 64, 61, 59, 57, 56, 55, 54, 53, 52, 51, 51, 61, 64, 68, 66, 64, 62,
+ 60, 58, 57, 55, 54, 53, 52, 51, 51, 51, 59, 62, 65, 64, 63, 61, 59, 57,
+ 56, 55, 53, 52, 51, 51, 50, 50, 59, 62, 65, 64, 63, 61, 59, 57, 56, 55,
+ 53, 52, 51, 51, 50, 50,
+ /* Size 32 */
+ 100, 104, 108, 112, 116, 107, 97, 87, 78, 77, 76, 76, 75, 74, 74, 73,
+ 72, 71, 70, 69, 68, 67, 66, 65, 63, 63, 62, 61, 60, 60, 60, 60, 104,
+ 105, 107, 108, 110, 102, 94, 86, 78, 77, 77, 77, 77, 76, 75, 75, 74, 73,
+ 72, 71, 70, 69, 68, 66, 65, 64, 63, 62, 61, 61, 61, 61, 108, 107, 106,
+ 104, 103, 97, 90, 84, 77, 78, 78, 78, 78, 78, 77, 77, 76, 75, 74, 73,
+ 72, 71, 69, 68, 67, 66, 65, 64, 63, 63, 63, 63, 112, 108, 104, 100, 96,
+ 92, 87, 82, 77, 78, 79, 79, 80, 80, 79, 79, 78, 77, 76, 75, 74, 73, 71,
+ 70, 69, 68, 67, 65, 64, 64, 64, 64, 116, 110, 103, 96, 90, 87, 83, 80,
+ 77, 78, 79, 81, 82, 81, 81, 81, 80, 79, 78, 77, 76, 74, 73, 72, 71, 69,
+ 68, 67, 66, 66, 66, 66, 107, 102, 97, 92, 87, 84, 81, 78, 75, 76, 77,
+ 78, 79, 79, 79, 78, 78, 77, 76, 75, 74, 73, 72, 71, 70, 68, 67, 66, 65,
+ 65, 65, 65, 97, 94, 90, 87, 83, 81, 78, 76, 74, 74, 75, 76, 77, 76, 76,
+ 76, 76, 75, 74, 73, 73, 72, 71, 70, 69, 68, 67, 65, 64, 64, 64, 64, 87,
+ 86, 84, 82, 80, 78, 76, 74, 72, 72, 73, 73, 74, 74, 74, 74, 74, 73, 72,
+ 72, 71, 70, 69, 68, 67, 67, 66, 65, 64, 64, 64, 64, 78, 78, 77, 77, 77,
+ 75, 74, 72, 70, 70, 71, 71, 72, 72, 72, 72, 72, 71, 71, 70, 70, 69, 68,
+ 67, 66, 66, 65, 64, 63, 63, 63, 63, 77, 77, 78, 78, 78, 76, 74, 72, 70,
+ 70, 70, 70, 70, 70, 70, 70, 70, 70, 69, 69, 68, 67, 67, 66, 65, 64, 64,
+ 63, 62, 62, 62, 62, 76, 77, 78, 79, 79, 77, 75, 73, 71, 70, 70, 70, 69,
+ 69, 69, 69, 68, 68, 67, 67, 67, 66, 65, 65, 64, 63, 63, 62, 61, 61, 61,
+ 61, 76, 77, 78, 79, 81, 78, 76, 73, 71, 70, 70, 69, 68, 68, 67, 67, 67,
+ 66, 66, 66, 65, 65, 64, 63, 63, 62, 62, 61, 60, 60, 60, 60, 75, 77, 78,
+ 80, 82, 79, 77, 74, 72, 70, 69, 68, 67, 66, 66, 65, 65, 65, 64, 64, 64,
+ 63, 63, 62, 62, 61, 61, 60, 60, 60, 60, 60, 74, 76, 78, 80, 81, 79, 76,
+ 74, 72, 70, 69, 68, 66, 66, 65, 65, 64, 64, 63, 63, 63, 62, 62, 61, 61,
+ 60, 60, 59, 59, 59, 59, 59, 74, 75, 77, 79, 81, 79, 76, 74, 72, 70, 69,
+ 67, 66, 65, 65, 64, 63, 63, 62, 62, 62, 61, 61, 60, 60, 59, 59, 58, 58,
+ 58, 58, 58, 73, 75, 77, 79, 81, 78, 76, 74, 72, 70, 69, 67, 65, 65, 64,
+ 63, 62, 62, 61, 61, 60, 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 57, 72,
+ 74, 76, 78, 80, 78, 76, 74, 72, 70, 68, 67, 65, 64, 63, 62, 61, 61, 60,
+ 60, 59, 59, 59, 58, 58, 57, 57, 57, 56, 56, 56, 56, 71, 73, 75, 77, 79,
+ 77, 75, 73, 71, 70, 68, 66, 65, 64, 63, 62, 61, 60, 60, 59, 59, 58, 58,
+ 58, 57, 57, 56, 56, 56, 56, 56, 56, 70, 72, 74, 76, 78, 76, 74, 72, 71,
+ 69, 67, 66, 64, 63, 62, 61, 60, 60, 59, 59, 58, 58, 57, 57, 56, 56, 56,
+ 55, 55, 55, 55, 55, 69, 71, 73, 75, 77, 75, 73, 72, 70, 69, 67, 66, 64,
+ 63, 62, 61, 60, 59, 59, 58, 57, 57, 57, 56, 56, 55, 55, 55, 54, 54, 54,
+ 54, 68, 70, 72, 74, 76, 74, 73, 71, 70, 68, 67, 65, 64, 63, 62, 60, 59,
+ 59, 58, 57, 57, 56, 56, 56, 55, 55, 54, 54, 54, 54, 54, 54, 67, 69, 71,
+ 73, 74, 73, 72, 70, 69, 67, 66, 65, 63, 62, 61, 60, 59, 58, 58, 57, 56,
+ 56, 56, 55, 55, 54, 54, 54, 53, 53, 53, 53, 66, 68, 69, 71, 73, 72, 71,
+ 69, 68, 67, 65, 64, 63, 62, 61, 60, 59, 58, 57, 57, 56, 56, 55, 55, 54,
+ 54, 53, 53, 53, 53, 53, 53, 65, 66, 68, 70, 72, 71, 70, 68, 67, 66, 65,
+ 63, 62, 61, 60, 59, 58, 58, 57, 56, 56, 55, 55, 54, 54, 53, 53, 53, 52,
+ 52, 52, 52, 63, 65, 67, 69, 71, 70, 69, 67, 66, 65, 64, 63, 62, 61, 60,
+ 59, 58, 57, 56, 56, 55, 55, 54, 54, 53, 53, 53, 52, 52, 52, 52, 52, 63,
+ 64, 66, 68, 69, 68, 68, 67, 66, 64, 63, 62, 61, 60, 59, 58, 57, 57, 56,
+ 55, 55, 54, 54, 53, 53, 53, 52, 52, 52, 52, 52, 52, 62, 63, 65, 67, 68,
+ 67, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 56, 55, 54, 54, 53,
+ 53, 53, 52, 52, 52, 51, 51, 51, 51, 61, 62, 64, 65, 67, 66, 65, 65, 64,
+ 63, 62, 61, 60, 59, 58, 58, 57, 56, 55, 55, 54, 54, 53, 53, 52, 52, 52,
+ 51, 51, 51, 51, 51, 60, 61, 63, 64, 66, 65, 64, 64, 63, 62, 61, 60, 60,
+ 59, 58, 57, 56, 56, 55, 54, 54, 53, 53, 52, 52, 52, 51, 51, 51, 51, 51,
+ 51, 60, 61, 63, 64, 66, 65, 64, 64, 63, 62, 61, 60, 60, 59, 58, 57, 56,
+ 56, 55, 54, 54, 53, 53, 52, 52, 52, 51, 51, 51, 51, 51, 51, 60, 61, 63,
+ 64, 66, 65, 64, 64, 63, 62, 61, 60, 60, 59, 58, 57, 56, 56, 55, 54, 54,
+ 53, 53, 52, 52, 52, 51, 51, 51, 51, 51, 51, 60, 61, 63, 64, 66, 65, 64,
+ 64, 63, 62, 61, 60, 60, 59, 58, 57, 56, 56, 55, 54, 54, 53, 53, 52, 52,
+ 52, 51, 51, 51, 51, 51, 51 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 61, 47, 39, 61, 49, 42, 38, 47, 42, 38, 35, 39, 38, 35, 34,
+ /* Size 8 */
+ 64, 76, 72, 62, 53, 47, 44, 41, 76, 70, 72, 65, 57, 51, 46, 43, 72, 72,
+ 59, 55, 51, 47, 44, 42, 62, 65, 55, 49, 46, 44, 42, 41, 53, 57, 51, 46,
+ 43, 42, 41, 40, 47, 51, 47, 44, 42, 40, 39, 39, 44, 46, 44, 42, 41, 39,
+ 39, 38, 41, 43, 42, 41, 40, 39, 38, 37,
+ /* Size 16 */
+ 64, 70, 76, 74, 72, 67, 62, 57, 53, 50, 47, 46, 44, 43, 41, 41, 70, 71,
+ 73, 72, 72, 68, 63, 59, 55, 52, 49, 47, 45, 44, 42, 42, 76, 73, 70, 71,
+ 72, 68, 65, 61, 57, 54, 51, 49, 46, 45, 43, 43, 74, 72, 71, 68, 65, 63,
+ 60, 57, 54, 52, 49, 47, 45, 44, 43, 43, 72, 72, 72, 65, 59, 57, 55, 53,
+ 51, 49, 47, 46, 44, 43, 42, 42, 67, 68, 68, 63, 57, 54, 52, 50, 49, 47,
+ 46, 44, 43, 42, 41, 41, 62, 63, 65, 60, 55, 52, 49, 48, 46, 45, 44, 43,
+ 42, 41, 41, 41, 57, 59, 61, 57, 53, 50, 48, 46, 45, 44, 43, 42, 41, 41,
+ 40, 40, 53, 55, 57, 54, 51, 49, 46, 45, 43, 43, 42, 41, 41, 40, 40, 40,
+ 50, 52, 54, 52, 49, 47, 45, 44, 43, 42, 41, 40, 40, 40, 39, 39, 47, 49,
+ 51, 49, 47, 46, 44, 43, 42, 41, 40, 40, 39, 39, 39, 39, 46, 47, 49, 47,
+ 46, 44, 43, 42, 41, 40, 40, 39, 39, 39, 38, 38, 44, 45, 46, 45, 44, 43,
+ 42, 41, 41, 40, 39, 39, 39, 38, 38, 38, 43, 44, 45, 44, 43, 42, 41, 41,
+ 40, 40, 39, 39, 38, 38, 38, 38, 41, 42, 43, 43, 42, 41, 41, 40, 40, 39,
+ 39, 38, 38, 38, 37, 37, 41, 42, 43, 43, 42, 41, 41, 40, 40, 39, 39, 38,
+ 38, 38, 37, 37,
+ /* Size 32 */
+ 64, 67, 70, 73, 76, 75, 74, 73, 72, 70, 67, 64, 62, 60, 57, 55, 53, 52,
+ 50, 49, 47, 47, 46, 45, 44, 43, 43, 42, 41, 41, 41, 41, 67, 69, 71, 72,
+ 74, 74, 73, 73, 72, 70, 67, 65, 63, 60, 58, 56, 54, 53, 51, 50, 48, 47,
+ 46, 45, 44, 44, 43, 42, 42, 42, 42, 42, 70, 71, 71, 72, 73, 73, 72, 72,
+ 72, 70, 68, 66, 63, 61, 59, 57, 55, 54, 52, 51, 49, 48, 47, 46, 45, 44,
+ 44, 43, 42, 42, 42, 42, 73, 72, 72, 72, 71, 71, 71, 72, 72, 70, 68, 66,
+ 64, 62, 60, 58, 56, 55, 53, 52, 50, 49, 48, 47, 46, 45, 44, 43, 43, 43,
+ 43, 43, 76, 74, 73, 71, 70, 70, 71, 71, 72, 70, 68, 67, 65, 63, 61, 59,
+ 57, 56, 54, 52, 51, 50, 49, 47, 46, 46, 45, 44, 43, 43, 43, 43, 75, 74,
+ 73, 71, 70, 70, 69, 69, 68, 67, 66, 64, 63, 61, 59, 57, 56, 54, 53, 51,
+ 50, 49, 48, 47, 46, 45, 44, 44, 43, 43, 43, 43, 74, 73, 72, 71, 71, 69,
+ 68, 67, 65, 64, 63, 61, 60, 58, 57, 56, 54, 53, 52, 50, 49, 48, 47, 46,
+ 45, 45, 44, 43, 43, 43, 43, 43, 73, 73, 72, 72, 71, 69, 67, 64, 62, 61,
+ 60, 59, 57, 56, 55, 54, 52, 51, 50, 49, 48, 47, 47, 46, 45, 44, 44, 43,
+ 42, 42, 42, 42, 72, 72, 72, 72, 72, 68, 65, 62, 59, 58, 57, 56, 55, 54,
+ 53, 52, 51, 50, 49, 48, 47, 47, 46, 45, 44, 44, 43, 43, 42, 42, 42, 42,
+ 70, 70, 70, 70, 70, 67, 64, 61, 58, 57, 56, 55, 53, 52, 52, 51, 50, 49,
+ 48, 47, 47, 46, 45, 45, 44, 43, 43, 42, 42, 42, 42, 42, 67, 67, 68, 68,
+ 68, 66, 63, 60, 57, 56, 54, 53, 52, 51, 50, 49, 49, 48, 47, 46, 46, 45,
+ 44, 44, 43, 43, 42, 42, 41, 41, 41, 41, 64, 65, 66, 66, 67, 64, 61, 59,
+ 56, 55, 53, 52, 51, 50, 49, 48, 47, 47, 46, 45, 45, 44, 44, 43, 43, 42,
+ 42, 42, 41, 41, 41, 41, 62, 63, 63, 64, 65, 63, 60, 57, 55, 53, 52, 51,
+ 49, 48, 48, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 41, 41,
+ 41, 41, 60, 60, 61, 62, 63, 61, 58, 56, 54, 52, 51, 50, 48, 48, 47, 46,
+ 46, 45, 44, 44, 43, 43, 43, 42, 42, 41, 41, 41, 40, 40, 40, 40, 57, 58,
+ 59, 60, 61, 59, 57, 55, 53, 52, 50, 49, 48, 47, 46, 46, 45, 44, 44, 43,
+ 43, 43, 42, 42, 41, 41, 41, 40, 40, 40, 40, 40, 55, 56, 57, 58, 59, 57,
+ 56, 54, 52, 51, 49, 48, 47, 46, 46, 45, 44, 44, 43, 43, 42, 42, 42, 41,
+ 41, 41, 40, 40, 40, 40, 40, 40, 53, 54, 55, 56, 57, 56, 54, 52, 51, 50,
+ 49, 47, 46, 46, 45, 44, 43, 43, 43, 42, 42, 41, 41, 41, 41, 40, 40, 40,
+ 40, 40, 40, 40, 52, 53, 54, 55, 56, 54, 53, 51, 50, 49, 48, 47, 46, 45,
+ 44, 44, 43, 43, 42, 42, 41, 41, 41, 41, 40, 40, 40, 40, 39, 39, 39, 39,
+ 50, 51, 52, 53, 54, 53, 52, 50, 49, 48, 47, 46, 45, 44, 44, 43, 43, 42,
+ 42, 41, 41, 41, 40, 40, 40, 40, 40, 39, 39, 39, 39, 39, 49, 50, 51, 52,
+ 52, 51, 50, 49, 48, 47, 46, 45, 45, 44, 43, 43, 42, 42, 41, 41, 41, 40,
+ 40, 40, 40, 39, 39, 39, 39, 39, 39, 39, 47, 48, 49, 50, 51, 50, 49, 48,
+ 47, 47, 46, 45, 44, 43, 43, 42, 42, 41, 41, 41, 40, 40, 40, 40, 39, 39,
+ 39, 39, 39, 39, 39, 39, 47, 47, 48, 49, 50, 49, 48, 47, 47, 46, 45, 44,
+ 44, 43, 43, 42, 41, 41, 41, 40, 40, 40, 40, 39, 39, 39, 39, 39, 38, 38,
+ 38, 38, 46, 46, 47, 48, 49, 48, 47, 47, 46, 45, 44, 44, 43, 43, 42, 42,
+ 41, 41, 40, 40, 40, 40, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 45, 45,
+ 46, 47, 47, 47, 46, 46, 45, 45, 44, 43, 43, 42, 42, 41, 41, 41, 40, 40,
+ 40, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 44, 44, 45, 46, 46, 46,
+ 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 41, 40, 40, 40, 39, 39, 39, 39,
+ 39, 38, 38, 38, 38, 38, 38, 38, 43, 44, 44, 45, 46, 45, 45, 44, 44, 43,
+ 43, 42, 42, 41, 41, 41, 40, 40, 40, 39, 39, 39, 39, 39, 38, 38, 38, 38,
+ 38, 38, 38, 38, 43, 43, 44, 44, 45, 44, 44, 44, 43, 43, 42, 42, 41, 41,
+ 41, 40, 40, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 42, 42, 43, 43, 44, 44, 43, 43, 43, 42, 42, 42, 41, 41, 40, 40, 40, 40,
+ 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 41, 42, 42, 43,
+ 43, 43, 43, 42, 42, 42, 41, 41, 41, 40, 40, 40, 40, 39, 39, 39, 39, 38,
+ 38, 38, 38, 38, 38, 38, 37, 37, 37, 37, 41, 42, 42, 43, 43, 43, 43, 42,
+ 42, 42, 41, 41, 41, 40, 40, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 38,
+ 38, 38, 37, 37, 37, 37, 41, 42, 42, 43, 43, 43, 43, 42, 42, 42, 41, 41,
+ 41, 40, 40, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 37, 37,
+ 37, 37, 41, 42, 42, 43, 43, 43, 43, 42, 42, 42, 41, 41, 41, 40, 40, 40,
+ 40, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 37, 37, 37, 37 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 100, 95, 72, 60, 95, 76, 65, 58, 72, 65, 57, 54, 60, 58, 54, 51,
+ /* Size 8 */
+ 90, 107, 102, 86, 74, 65, 60, 56, 107, 98, 101, 92, 80, 70, 64, 59, 102,
+ 101, 83, 76, 71, 65, 61, 58, 86, 92, 76, 68, 64, 60, 58, 56, 74, 80, 71,
+ 64, 60, 57, 55, 54, 65, 70, 65, 60, 57, 55, 53, 52, 60, 64, 61, 58, 55,
+ 53, 52, 51, 56, 59, 58, 56, 54, 52, 51, 51,
+ /* Size 16 */
+ 92, 100, 109, 107, 104, 96, 88, 82, 75, 71, 67, 64, 61, 59, 57, 57, 100,
+ 102, 105, 104, 103, 97, 91, 84, 78, 74, 69, 66, 63, 61, 59, 59, 109,
+ 105, 100, 101, 103, 98, 93, 87, 81, 76, 72, 68, 65, 63, 60, 60, 107,
+ 104, 101, 97, 94, 89, 85, 81, 77, 73, 69, 66, 63, 61, 59, 59, 104, 103,
+ 103, 94, 84, 81, 78, 75, 72, 69, 66, 64, 62, 60, 59, 59, 96, 97, 98, 89,
+ 81, 77, 73, 71, 68, 66, 64, 62, 60, 59, 58, 58, 88, 91, 93, 85, 78, 73,
+ 69, 67, 65, 63, 62, 60, 59, 58, 57, 57, 82, 84, 87, 81, 75, 71, 67, 65,
+ 63, 61, 60, 59, 58, 57, 56, 56, 75, 78, 81, 77, 72, 68, 65, 63, 61, 59,
+ 58, 57, 56, 55, 55, 55, 71, 74, 76, 73, 69, 66, 63, 61, 59, 58, 57, 56,
+ 55, 55, 54, 54, 67, 69, 72, 69, 66, 64, 62, 60, 58, 57, 56, 55, 54, 54,
+ 53, 53, 64, 66, 68, 66, 64, 62, 60, 59, 57, 56, 55, 54, 54, 53, 53, 53,
+ 61, 63, 65, 63, 62, 60, 59, 58, 56, 55, 54, 54, 53, 53, 52, 52, 59, 61,
+ 63, 61, 60, 59, 58, 57, 55, 55, 54, 53, 53, 52, 52, 52, 57, 59, 60, 59,
+ 59, 58, 57, 56, 55, 54, 53, 53, 52, 52, 52, 52, 57, 59, 60, 59, 59, 58,
+ 57, 56, 55, 54, 53, 53, 52, 52, 52, 52,
+ /* Size 32 */
+ 92, 97, 101, 106, 110, 109, 107, 106, 105, 101, 97, 93, 89, 86, 82, 79,
+ 76, 74, 72, 69, 67, 66, 64, 63, 62, 61, 60, 59, 58, 58, 58, 58, 97, 100,
+ 102, 105, 108, 107, 106, 105, 104, 101, 97, 94, 90, 87, 84, 81, 77, 75,
+ 73, 71, 69, 67, 66, 64, 63, 62, 61, 60, 59, 59, 59, 59, 101, 102, 103,
+ 105, 106, 105, 105, 105, 104, 101, 98, 95, 91, 88, 85, 82, 79, 77, 74,
+ 72, 70, 68, 67, 65, 64, 63, 61, 60, 59, 59, 59, 59, 106, 105, 105, 104,
+ 103, 103, 104, 104, 104, 101, 98, 96, 93, 90, 87, 84, 80, 78, 76, 73,
+ 71, 69, 68, 66, 65, 63, 62, 61, 60, 60, 60, 60, 110, 108, 106, 103, 101,
+ 102, 102, 103, 104, 101, 99, 97, 94, 91, 88, 85, 82, 80, 77, 75, 72, 71,
+ 69, 67, 66, 64, 63, 62, 61, 61, 61, 61, 109, 107, 105, 103, 102, 101,
+ 100, 100, 99, 97, 95, 92, 90, 88, 85, 82, 80, 77, 75, 73, 71, 69, 68,
+ 66, 65, 64, 63, 61, 60, 60, 60, 60, 107, 106, 105, 104, 102, 100, 98,
+ 96, 94, 92, 90, 88, 86, 84, 82, 79, 77, 75, 73, 72, 70, 68, 67, 65, 64,
+ 63, 62, 61, 60, 60, 60, 60, 106, 105, 105, 104, 103, 100, 96, 93, 90,
+ 88, 86, 84, 82, 80, 79, 77, 75, 73, 72, 70, 68, 67, 66, 65, 63, 62, 61,
+ 60, 60, 60, 60, 60, 105, 104, 104, 104, 104, 99, 94, 90, 85, 83, 82, 80,
+ 78, 77, 75, 74, 73, 71, 70, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 59,
+ 59, 59, 101, 101, 101, 101, 101, 97, 92, 88, 83, 82, 80, 78, 76, 75, 73,
+ 72, 71, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 59, 59, 59, 59, 97,
+ 97, 98, 98, 99, 95, 90, 86, 82, 80, 78, 76, 74, 73, 71, 70, 69, 68, 67,
+ 66, 65, 64, 63, 62, 61, 60, 60, 59, 58, 58, 58, 58, 93, 94, 95, 96, 97,
+ 92, 88, 84, 80, 78, 76, 74, 72, 71, 70, 68, 67, 66, 65, 64, 63, 63, 62,
+ 61, 60, 59, 59, 58, 58, 58, 58, 58, 89, 90, 91, 93, 94, 90, 86, 82, 78,
+ 76, 74, 72, 70, 69, 68, 66, 65, 65, 64, 63, 62, 61, 61, 60, 59, 59, 58,
+ 58, 57, 57, 57, 57, 86, 87, 88, 90, 91, 88, 84, 80, 77, 75, 73, 71, 69,
+ 68, 66, 65, 64, 64, 63, 62, 61, 61, 60, 59, 59, 58, 58, 57, 57, 57, 57,
+ 57, 82, 84, 85, 87, 88, 85, 82, 79, 75, 73, 71, 70, 68, 66, 65, 64, 63,
+ 63, 62, 61, 60, 60, 59, 59, 58, 58, 57, 57, 56, 56, 56, 56, 79, 81, 82,
+ 84, 85, 82, 79, 77, 74, 72, 70, 68, 66, 65, 64, 63, 62, 62, 61, 60, 59,
+ 59, 58, 58, 57, 57, 57, 56, 56, 56, 56, 56, 76, 77, 79, 80, 82, 80, 77,
+ 75, 73, 71, 69, 67, 65, 64, 63, 62, 61, 61, 60, 59, 59, 58, 58, 57, 57,
+ 56, 56, 56, 55, 55, 55, 55, 74, 75, 77, 78, 80, 77, 75, 73, 71, 69, 68,
+ 66, 65, 64, 63, 62, 61, 60, 59, 59, 58, 58, 57, 57, 56, 56, 56, 55, 55,
+ 55, 55, 55, 72, 73, 74, 76, 77, 75, 73, 72, 70, 68, 67, 65, 64, 63, 62,
+ 61, 60, 59, 59, 58, 57, 57, 57, 56, 56, 55, 55, 55, 55, 55, 55, 55, 69,
+ 71, 72, 73, 75, 73, 72, 70, 68, 67, 66, 64, 63, 62, 61, 60, 59, 59, 58,
+ 58, 57, 57, 56, 56, 55, 55, 55, 54, 54, 54, 54, 54, 67, 69, 70, 71, 72,
+ 71, 70, 68, 67, 66, 65, 63, 62, 61, 60, 59, 59, 58, 57, 57, 56, 56, 56,
+ 55, 55, 55, 54, 54, 54, 54, 54, 54, 66, 67, 68, 69, 71, 69, 68, 67, 66,
+ 65, 64, 63, 61, 61, 60, 59, 58, 58, 57, 57, 56, 56, 55, 55, 55, 54, 54,
+ 54, 54, 54, 54, 54, 64, 66, 67, 68, 69, 68, 67, 66, 65, 64, 63, 62, 61,
+ 60, 59, 58, 58, 57, 57, 56, 56, 55, 55, 55, 54, 54, 54, 54, 53, 53, 53,
+ 53, 63, 64, 65, 66, 67, 66, 65, 65, 64, 63, 62, 61, 60, 59, 59, 58, 57,
+ 57, 56, 56, 55, 55, 55, 54, 54, 54, 54, 53, 53, 53, 53, 53, 62, 63, 64,
+ 65, 66, 65, 64, 63, 63, 62, 61, 60, 59, 59, 58, 57, 57, 56, 56, 55, 55,
+ 55, 54, 54, 54, 53, 53, 53, 53, 53, 53, 53, 61, 62, 63, 63, 64, 64, 63,
+ 62, 62, 61, 60, 59, 59, 58, 58, 57, 56, 56, 55, 55, 55, 54, 54, 54, 53,
+ 53, 53, 53, 53, 53, 53, 53, 60, 61, 61, 62, 63, 63, 62, 61, 61, 60, 60,
+ 59, 58, 58, 57, 57, 56, 56, 55, 55, 54, 54, 54, 54, 53, 53, 53, 53, 52,
+ 52, 52, 52, 59, 60, 60, 61, 62, 61, 61, 60, 60, 59, 59, 58, 58, 57, 57,
+ 56, 56, 55, 55, 54, 54, 54, 54, 53, 53, 53, 53, 52, 52, 52, 52, 52, 58,
+ 59, 59, 60, 61, 60, 60, 60, 59, 59, 58, 58, 57, 57, 56, 56, 55, 55, 55,
+ 54, 54, 54, 53, 53, 53, 53, 52, 52, 52, 52, 52, 52, 58, 59, 59, 60, 61,
+ 60, 60, 60, 59, 59, 58, 58, 57, 57, 56, 56, 55, 55, 55, 54, 54, 54, 53,
+ 53, 53, 53, 52, 52, 52, 52, 52, 52, 58, 59, 59, 60, 61, 60, 60, 60, 59,
+ 59, 58, 58, 57, 57, 56, 56, 55, 55, 55, 54, 54, 54, 53, 53, 53, 53, 52,
+ 52, 52, 52, 52, 52, 58, 59, 59, 60, 61, 60, 60, 60, 59, 59, 58, 58, 57,
+ 57, 56, 56, 55, 55, 55, 54, 54, 54, 53, 53, 53, 53, 52, 52, 52, 52, 52,
+ 52 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 50, 48, 43, 50, 45, 44, 42, 48, 44, 40, 38, 43, 42, 38, 36,
+ /* Size 8 */
+ 64, 73, 52, 50, 49, 47, 44, 42, 73, 59, 51, 54, 53, 51, 48, 45, 52, 51,
+ 48, 49, 49, 47, 46, 44, 50, 54, 49, 46, 45, 44, 43, 42, 49, 53, 49, 45,
+ 43, 42, 41, 40, 47, 51, 47, 44, 42, 40, 40, 39, 44, 48, 46, 43, 41, 40,
+ 39, 38, 42, 45, 44, 42, 40, 39, 38, 37,
+ /* Size 16 */
+ 64, 69, 73, 63, 52, 51, 50, 50, 49, 48, 47, 45, 44, 43, 42, 42, 69, 67,
+ 66, 59, 52, 52, 52, 52, 51, 50, 49, 47, 46, 45, 44, 44, 73, 66, 59, 55,
+ 51, 53, 54, 54, 53, 52, 51, 49, 48, 47, 45, 45, 63, 59, 55, 52, 50, 50,
+ 51, 51, 51, 50, 49, 48, 47, 46, 45, 45, 52, 52, 51, 50, 48, 48, 49, 49,
+ 49, 48, 47, 47, 46, 45, 44, 44, 51, 52, 53, 50, 48, 48, 47, 47, 47, 46,
+ 46, 45, 44, 44, 43, 43, 50, 52, 54, 51, 49, 47, 46, 45, 45, 45, 44, 44,
+ 43, 43, 42, 42, 50, 52, 54, 51, 49, 47, 45, 45, 44, 44, 43, 43, 42, 42,
+ 41, 41, 49, 51, 53, 51, 49, 47, 45, 44, 43, 42, 42, 41, 41, 41, 40, 40,
+ 48, 50, 52, 50, 48, 46, 45, 44, 42, 42, 41, 41, 40, 40, 39, 39, 47, 49,
+ 51, 49, 47, 46, 44, 43, 42, 41, 40, 40, 40, 39, 39, 39, 45, 47, 49, 48,
+ 47, 45, 44, 43, 41, 41, 40, 40, 39, 39, 38, 38, 44, 46, 48, 47, 46, 44,
+ 43, 42, 41, 40, 40, 39, 39, 38, 38, 38, 43, 45, 47, 46, 45, 44, 43, 42,
+ 41, 40, 39, 39, 38, 38, 37, 37, 42, 44, 45, 45, 44, 43, 42, 41, 40, 39,
+ 39, 38, 38, 37, 37, 37, 42, 44, 45, 45, 44, 43, 42, 41, 40, 39, 39, 38,
+ 38, 37, 37, 37,
+ /* Size 32 */
+ 64, 66, 69, 71, 73, 68, 63, 57, 52, 52, 51, 51, 50, 50, 50, 49, 49, 48,
+ 48, 47, 47, 46, 45, 45, 44, 44, 43, 43, 42, 42, 42, 42, 66, 67, 68, 69,
+ 69, 65, 61, 56, 52, 52, 52, 52, 51, 51, 51, 50, 50, 49, 49, 48, 48, 47,
+ 46, 46, 45, 45, 44, 43, 43, 43, 43, 43, 69, 68, 67, 67, 66, 62, 59, 55,
+ 52, 52, 52, 52, 52, 52, 52, 51, 51, 51, 50, 49, 49, 48, 47, 47, 46, 46,
+ 45, 44, 44, 44, 44, 44, 71, 69, 67, 64, 62, 60, 57, 54, 52, 52, 52, 53,
+ 53, 53, 53, 52, 52, 52, 51, 50, 50, 49, 48, 48, 47, 46, 46, 45, 45, 45,
+ 45, 45, 73, 69, 66, 62, 59, 57, 55, 53, 51, 52, 53, 53, 54, 54, 54, 54,
+ 53, 53, 52, 51, 51, 50, 49, 49, 48, 47, 47, 46, 45, 45, 45, 45, 68, 65,
+ 62, 60, 57, 55, 54, 52, 51, 51, 52, 52, 53, 53, 52, 52, 52, 52, 51, 51,
+ 50, 49, 49, 48, 47, 47, 46, 46, 45, 45, 45, 45, 63, 61, 59, 57, 55, 54,
+ 52, 51, 50, 50, 50, 51, 51, 51, 51, 51, 51, 51, 50, 50, 49, 49, 48, 47,
+ 47, 46, 46, 45, 45, 45, 45, 45, 57, 56, 55, 54, 53, 52, 51, 50, 49, 49,
+ 49, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 48, 47, 47, 46, 46, 45, 45,
+ 44, 44, 44, 44, 52, 52, 52, 52, 51, 51, 50, 49, 48, 48, 48, 48, 49, 49,
+ 49, 49, 49, 48, 48, 48, 47, 47, 47, 46, 46, 45, 45, 44, 44, 44, 44, 44,
+ 52, 52, 52, 52, 52, 51, 50, 49, 48, 48, 48, 48, 48, 48, 48, 48, 48, 47,
+ 47, 47, 47, 46, 46, 46, 45, 45, 44, 44, 43, 43, 43, 43, 51, 52, 52, 52,
+ 53, 52, 50, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 46, 46,
+ 45, 45, 44, 44, 44, 43, 43, 43, 43, 43, 51, 52, 52, 53, 53, 52, 51, 50,
+ 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 45, 45, 45, 45, 44, 44, 44, 43,
+ 43, 43, 42, 42, 42, 42, 50, 51, 52, 53, 54, 53, 51, 50, 49, 48, 47, 47,
+ 46, 46, 45, 45, 45, 45, 45, 44, 44, 44, 44, 43, 43, 43, 43, 42, 42, 42,
+ 42, 42, 50, 51, 52, 53, 54, 53, 51, 50, 49, 48, 47, 46, 46, 45, 45, 45,
+ 45, 44, 44, 44, 44, 43, 43, 43, 43, 42, 42, 42, 42, 42, 42, 42, 50, 51,
+ 52, 53, 54, 52, 51, 50, 49, 48, 47, 46, 45, 45, 45, 44, 44, 44, 44, 43,
+ 43, 43, 43, 42, 42, 42, 42, 41, 41, 41, 41, 41, 49, 50, 51, 52, 54, 52,
+ 51, 50, 49, 48, 47, 46, 45, 45, 44, 44, 44, 43, 43, 43, 42, 42, 42, 42,
+ 42, 41, 41, 41, 41, 41, 41, 41, 49, 50, 51, 52, 53, 52, 51, 50, 49, 48,
+ 47, 46, 45, 45, 44, 44, 43, 43, 42, 42, 42, 42, 41, 41, 41, 41, 41, 40,
+ 40, 40, 40, 40, 48, 49, 51, 52, 53, 52, 51, 49, 48, 47, 47, 46, 45, 44,
+ 44, 43, 43, 42, 42, 42, 42, 41, 41, 41, 41, 40, 40, 40, 40, 40, 40, 40,
+ 48, 49, 50, 51, 52, 51, 50, 49, 48, 47, 46, 45, 45, 44, 44, 43, 42, 42,
+ 42, 42, 41, 41, 41, 41, 40, 40, 40, 40, 39, 39, 39, 39, 47, 48, 49, 50,
+ 51, 51, 50, 49, 48, 47, 46, 45, 44, 44, 43, 43, 42, 42, 42, 41, 41, 41,
+ 40, 40, 40, 40, 40, 39, 39, 39, 39, 39, 47, 48, 49, 50, 51, 50, 49, 48,
+ 47, 47, 46, 45, 44, 44, 43, 42, 42, 42, 41, 41, 40, 40, 40, 40, 40, 39,
+ 39, 39, 39, 39, 39, 39, 46, 47, 48, 49, 50, 49, 49, 48, 47, 46, 46, 45,
+ 44, 43, 43, 42, 42, 41, 41, 41, 40, 40, 40, 40, 39, 39, 39, 39, 39, 39,
+ 39, 39, 45, 46, 47, 48, 49, 49, 48, 47, 47, 46, 45, 44, 44, 43, 43, 42,
+ 41, 41, 41, 40, 40, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 38, 45, 46,
+ 47, 48, 49, 48, 47, 47, 46, 46, 45, 44, 43, 43, 42, 42, 41, 41, 41, 40,
+ 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 44, 45, 46, 47, 48, 47,
+ 47, 46, 46, 45, 44, 44, 43, 43, 42, 42, 41, 41, 40, 40, 40, 39, 39, 39,
+ 39, 38, 38, 38, 38, 38, 38, 38, 44, 45, 46, 46, 47, 47, 46, 46, 45, 45,
+ 44, 43, 43, 42, 42, 41, 41, 40, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38,
+ 38, 38, 38, 38, 43, 44, 45, 46, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42,
+ 42, 41, 41, 40, 40, 40, 39, 39, 39, 38, 38, 38, 38, 38, 37, 37, 37, 37,
+ 43, 43, 44, 45, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 40, 40,
+ 40, 39, 39, 39, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 42, 43, 44, 45,
+ 45, 45, 45, 44, 44, 43, 43, 42, 42, 42, 41, 41, 40, 40, 39, 39, 39, 39,
+ 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 42, 43, 44, 45, 45, 45, 45, 44,
+ 44, 43, 43, 42, 42, 42, 41, 41, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38,
+ 37, 37, 37, 37, 37, 37, 42, 43, 44, 45, 45, 45, 45, 44, 44, 43, 43, 42,
+ 42, 42, 41, 41, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 37, 37, 37, 37,
+ 37, 37, 42, 43, 44, 45, 45, 45, 45, 44, 44, 43, 43, 42, 42, 42, 41, 41,
+ 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 97, 74, 71, 63, 74, 67, 65, 61, 71, 65, 58, 56, 63, 61, 56, 52,
+ /* Size 8 */
+ 92, 106, 74, 72, 69, 66, 62, 59, 106, 84, 73, 77, 76, 72, 68, 64, 74,
+ 73, 68, 69, 69, 67, 65, 62, 72, 77, 69, 65, 63, 62, 61, 59, 69, 76, 69,
+ 63, 60, 59, 57, 56, 66, 72, 67, 62, 59, 57, 55, 54, 62, 68, 65, 61, 57,
+ 55, 54, 53, 59, 64, 62, 59, 56, 54, 53, 51,
+ /* Size 16 */
+ 93, 100, 107, 91, 75, 74, 73, 71, 70, 68, 67, 65, 63, 61, 60, 60, 100,
+ 98, 96, 85, 74, 75, 75, 74, 74, 72, 70, 68, 66, 64, 62, 62, 107, 96, 85,
+ 79, 74, 76, 78, 77, 77, 75, 73, 71, 69, 67, 65, 65, 91, 85, 79, 75, 71,
+ 73, 74, 74, 73, 72, 71, 69, 67, 65, 64, 64, 75, 74, 74, 71, 68, 69, 70,
+ 70, 70, 69, 68, 67, 65, 64, 63, 63, 74, 75, 76, 73, 69, 68, 68, 67, 67,
+ 66, 65, 64, 63, 62, 61, 61, 73, 75, 78, 74, 70, 68, 66, 65, 64, 64, 63,
+ 62, 61, 60, 60, 60, 71, 74, 77, 74, 70, 67, 65, 64, 63, 62, 61, 61, 60,
+ 59, 58, 58, 70, 74, 77, 73, 70, 67, 64, 63, 61, 60, 59, 59, 58, 58, 57,
+ 57, 68, 72, 75, 72, 69, 66, 64, 62, 60, 59, 58, 58, 57, 56, 56, 56, 67,
+ 70, 73, 71, 68, 65, 63, 61, 59, 58, 57, 57, 56, 55, 55, 55, 65, 68, 71,
+ 69, 67, 64, 62, 61, 59, 58, 57, 56, 55, 55, 54, 54, 63, 66, 69, 67, 65,
+ 63, 61, 60, 58, 57, 56, 55, 54, 54, 53, 53, 61, 64, 67, 65, 64, 62, 60,
+ 59, 58, 56, 55, 55, 54, 53, 53, 53, 60, 62, 65, 64, 63, 61, 60, 58, 57,
+ 56, 55, 54, 53, 53, 52, 52, 60, 62, 65, 64, 63, 61, 60, 58, 57, 56, 55,
+ 54, 53, 53, 52, 52,
+ /* Size 32 */
+ 94, 97, 101, 104, 108, 100, 91, 83, 75, 75, 74, 74, 73, 72, 72, 71, 71,
+ 70, 69, 68, 67, 66, 65, 64, 63, 63, 62, 61, 60, 60, 60, 60, 97, 98, 100,
+ 101, 102, 95, 89, 82, 75, 75, 75, 75, 74, 74, 73, 73, 72, 71, 70, 70,
+ 69, 68, 67, 66, 65, 64, 63, 62, 61, 61, 61, 61, 101, 100, 99, 98, 97,
+ 91, 86, 80, 75, 75, 75, 76, 76, 75, 75, 74, 74, 73, 72, 71, 70, 69, 68,
+ 67, 66, 65, 65, 64, 63, 63, 63, 63, 104, 101, 98, 94, 91, 87, 83, 79,
+ 75, 75, 76, 77, 77, 77, 76, 76, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67,
+ 66, 65, 64, 64, 64, 64, 108, 102, 97, 91, 85, 83, 80, 77, 75, 76, 77,
+ 78, 79, 78, 78, 78, 77, 76, 75, 75, 74, 73, 71, 70, 69, 68, 67, 66, 65,
+ 65, 65, 65, 100, 95, 91, 87, 83, 80, 78, 76, 73, 74, 75, 76, 76, 76, 76,
+ 76, 76, 75, 74, 73, 72, 71, 70, 69, 68, 68, 67, 66, 65, 65, 65, 65, 91,
+ 89, 86, 83, 80, 78, 76, 74, 72, 72, 73, 74, 74, 74, 74, 74, 74, 73, 72,
+ 72, 71, 70, 69, 68, 68, 67, 66, 65, 64, 64, 64, 64, 83, 82, 80, 79, 77,
+ 76, 74, 72, 70, 71, 71, 72, 72, 72, 72, 72, 72, 71, 71, 70, 70, 69, 68,
+ 67, 67, 66, 65, 64, 64, 64, 64, 64, 75, 75, 75, 75, 75, 73, 72, 70, 69,
+ 69, 69, 70, 70, 70, 70, 70, 70, 70, 69, 69, 68, 68, 67, 66, 66, 65, 64,
+ 64, 63, 63, 63, 63, 75, 75, 75, 75, 76, 74, 72, 71, 69, 69, 69, 69, 69,
+ 69, 69, 69, 69, 68, 68, 68, 67, 67, 66, 65, 65, 64, 63, 63, 62, 62, 62,
+ 62, 74, 75, 75, 76, 77, 75, 73, 71, 69, 69, 69, 68, 68, 68, 68, 68, 67,
+ 67, 67, 66, 66, 65, 65, 64, 64, 63, 63, 62, 61, 61, 61, 61, 74, 75, 76,
+ 77, 78, 76, 74, 72, 70, 69, 68, 68, 67, 67, 67, 66, 66, 66, 65, 65, 65,
+ 64, 64, 63, 63, 62, 62, 61, 61, 61, 61, 61, 73, 74, 76, 77, 79, 76, 74,
+ 72, 70, 69, 68, 67, 66, 66, 65, 65, 65, 64, 64, 64, 63, 63, 63, 62, 62,
+ 61, 61, 60, 60, 60, 60, 60, 72, 74, 75, 77, 78, 76, 74, 72, 70, 69, 68,
+ 67, 66, 65, 65, 64, 64, 64, 63, 63, 63, 62, 62, 61, 61, 61, 60, 60, 59,
+ 59, 59, 59, 72, 73, 75, 76, 78, 76, 74, 72, 70, 69, 68, 67, 65, 65, 64,
+ 64, 63, 63, 62, 62, 62, 61, 61, 61, 60, 60, 59, 59, 59, 59, 59, 59, 71,
+ 73, 74, 76, 78, 76, 74, 72, 70, 69, 68, 66, 65, 64, 64, 63, 62, 62, 62,
+ 61, 61, 60, 60, 60, 59, 59, 59, 58, 58, 58, 58, 58, 71, 72, 74, 76, 77,
+ 76, 74, 72, 70, 69, 67, 66, 65, 64, 63, 62, 62, 61, 61, 60, 60, 60, 59,
+ 59, 59, 58, 58, 58, 57, 57, 57, 57, 70, 71, 73, 75, 76, 75, 73, 71, 70,
+ 68, 67, 66, 64, 64, 63, 62, 61, 61, 60, 60, 59, 59, 59, 58, 58, 58, 57,
+ 57, 57, 57, 57, 57, 69, 70, 72, 74, 75, 74, 72, 71, 69, 68, 67, 65, 64,
+ 63, 62, 62, 61, 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56,
+ 56, 68, 70, 71, 73, 75, 73, 72, 70, 69, 68, 66, 65, 64, 63, 62, 61, 60,
+ 60, 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56, 56, 56, 67, 69, 70,
+ 72, 74, 72, 71, 70, 68, 67, 66, 65, 63, 63, 62, 61, 60, 59, 59, 58, 58,
+ 57, 57, 57, 56, 56, 56, 55, 55, 55, 55, 55, 66, 68, 69, 71, 73, 71, 70,
+ 69, 68, 67, 65, 64, 63, 62, 61, 60, 60, 59, 58, 58, 57, 57, 57, 56, 56,
+ 56, 55, 55, 55, 55, 55, 55, 65, 67, 68, 70, 71, 70, 69, 68, 67, 66, 65,
+ 64, 63, 62, 61, 60, 59, 59, 58, 58, 57, 57, 56, 56, 55, 55, 55, 55, 54,
+ 54, 54, 54, 64, 66, 67, 69, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 61,
+ 60, 59, 58, 58, 57, 57, 56, 56, 55, 55, 55, 55, 54, 54, 54, 54, 54, 63,
+ 65, 66, 68, 69, 68, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 59, 58, 57,
+ 57, 56, 56, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 63, 64, 65, 67, 68,
+ 68, 67, 66, 65, 64, 63, 62, 61, 61, 60, 59, 58, 58, 57, 57, 56, 56, 55,
+ 55, 54, 54, 54, 54, 53, 53, 53, 53, 62, 63, 65, 66, 67, 67, 66, 65, 64,
+ 63, 63, 62, 61, 60, 59, 59, 58, 57, 57, 56, 56, 55, 55, 55, 54, 54, 54,
+ 53, 53, 53, 53, 53, 61, 62, 64, 65, 66, 66, 65, 64, 64, 63, 62, 61, 60,
+ 60, 59, 58, 58, 57, 56, 56, 55, 55, 55, 54, 54, 54, 53, 53, 53, 53, 53,
+ 53, 60, 61, 63, 64, 65, 65, 64, 64, 63, 62, 61, 61, 60, 59, 59, 58, 57,
+ 57, 56, 56, 55, 55, 54, 54, 54, 53, 53, 53, 52, 52, 52, 52, 60, 61, 63,
+ 64, 65, 65, 64, 64, 63, 62, 61, 61, 60, 59, 59, 58, 57, 57, 56, 56, 55,
+ 55, 54, 54, 54, 53, 53, 53, 52, 52, 52, 52, 60, 61, 63, 64, 65, 65, 64,
+ 64, 63, 62, 61, 61, 60, 59, 59, 58, 57, 57, 56, 56, 55, 55, 54, 54, 54,
+ 53, 53, 53, 52, 52, 52, 52, 60, 61, 63, 64, 65, 65, 64, 64, 63, 62, 61,
+ 61, 60, 59, 59, 58, 57, 57, 56, 56, 55, 55, 54, 54, 54, 53, 53, 53, 52,
+ 52, 52, 52 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 61, 49, 42, 61, 51, 45, 41, 49, 45, 41, 39, 42, 41, 39, 38,
+ /* Size 8 */
+ 64, 74, 71, 62, 55, 50, 47, 45, 74, 69, 70, 65, 58, 53, 49, 46, 71, 70,
+ 60, 56, 53, 50, 47, 45, 62, 65, 56, 51, 49, 47, 45, 44, 55, 58, 53, 49,
+ 46, 45, 44, 43, 50, 53, 50, 47, 45, 44, 43, 42, 47, 49, 47, 45, 44, 43,
+ 42, 42, 45, 46, 45, 44, 43, 42, 42, 41,
+ /* Size 16 */
+ 64, 69, 74, 73, 71, 66, 62, 58, 55, 52, 50, 48, 47, 46, 45, 45, 69, 70,
+ 71, 71, 71, 67, 63, 60, 56, 54, 51, 49, 48, 47, 45, 45, 74, 71, 69, 70,
+ 70, 68, 65, 62, 58, 55, 53, 51, 49, 48, 46, 46, 73, 71, 70, 67, 65, 63,
+ 61, 58, 55, 53, 51, 50, 48, 47, 46, 46, 71, 71, 70, 65, 60, 58, 56, 54,
+ 53, 51, 50, 48, 47, 46, 45, 45, 66, 67, 68, 63, 58, 56, 54, 52, 51, 50,
+ 48, 47, 46, 45, 45, 45, 62, 63, 65, 61, 56, 54, 51, 50, 49, 48, 47, 46,
+ 45, 45, 44, 44, 58, 60, 62, 58, 54, 52, 50, 49, 48, 47, 46, 45, 45, 44,
+ 44, 44, 55, 56, 58, 55, 53, 51, 49, 48, 46, 46, 45, 44, 44, 43, 43, 43,
+ 52, 54, 55, 53, 51, 50, 48, 47, 46, 45, 44, 44, 43, 43, 43, 43, 50, 51,
+ 53, 51, 50, 48, 47, 46, 45, 44, 44, 43, 43, 43, 42, 42, 48, 49, 51, 50,
+ 48, 47, 46, 45, 44, 44, 43, 43, 42, 42, 42, 42, 47, 48, 49, 48, 47, 46,
+ 45, 45, 44, 43, 43, 42, 42, 42, 42, 42, 46, 47, 48, 47, 46, 45, 45, 44,
+ 43, 43, 43, 42, 42, 42, 41, 41, 45, 45, 46, 46, 45, 45, 44, 44, 43, 43,
+ 42, 42, 42, 41, 41, 41, 45, 45, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42,
+ 42, 41, 41, 41,
+ /* Size 32 */
+ 64, 67, 69, 72, 74, 73, 73, 72, 71, 69, 66, 64, 62, 60, 58, 57, 55, 53,
+ 52, 51, 50, 49, 48, 47, 47, 46, 46, 45, 45, 45, 45, 45, 67, 68, 70, 71,
+ 73, 72, 72, 71, 71, 69, 67, 65, 63, 61, 59, 57, 56, 54, 53, 52, 51, 50,
+ 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 69, 70, 70, 71, 71, 71, 71, 71,
+ 71, 69, 67, 65, 63, 62, 60, 58, 56, 55, 54, 53, 51, 50, 49, 49, 48, 47,
+ 47, 46, 45, 45, 45, 45, 72, 71, 71, 71, 70, 70, 70, 70, 71, 69, 67, 66,
+ 64, 63, 61, 59, 57, 56, 55, 53, 52, 51, 50, 49, 48, 48, 47, 46, 46, 46,
+ 46, 46, 74, 73, 71, 70, 69, 69, 70, 70, 70, 69, 68, 66, 65, 63, 62, 60,
+ 58, 57, 55, 54, 53, 52, 51, 50, 49, 48, 48, 47, 46, 46, 46, 46, 73, 72,
+ 71, 70, 69, 69, 69, 68, 68, 67, 65, 64, 63, 61, 60, 58, 57, 56, 54, 53,
+ 52, 51, 50, 49, 48, 48, 47, 47, 46, 46, 46, 46, 73, 72, 71, 70, 70, 69,
+ 67, 66, 65, 64, 63, 62, 61, 59, 58, 57, 55, 54, 53, 52, 51, 50, 50, 49,
+ 48, 47, 47, 46, 46, 46, 46, 46, 72, 71, 71, 70, 70, 68, 66, 64, 62, 61,
+ 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 50, 49, 48, 48, 47, 47, 46,
+ 45, 45, 45, 45, 71, 71, 71, 71, 70, 68, 65, 62, 60, 59, 58, 57, 56, 55,
+ 54, 54, 53, 52, 51, 50, 50, 49, 48, 48, 47, 47, 46, 46, 45, 45, 45, 45,
+ 69, 69, 69, 69, 69, 67, 64, 61, 59, 58, 57, 56, 55, 54, 53, 53, 52, 51,
+ 50, 50, 49, 48, 48, 47, 47, 46, 46, 45, 45, 45, 45, 45, 66, 67, 67, 67,
+ 68, 65, 63, 60, 58, 57, 56, 55, 54, 53, 52, 51, 51, 50, 50, 49, 48, 48,
+ 47, 47, 46, 46, 45, 45, 45, 45, 45, 45, 64, 65, 65, 66, 66, 64, 62, 59,
+ 57, 56, 55, 54, 52, 52, 51, 50, 50, 49, 49, 48, 48, 47, 47, 46, 46, 45,
+ 45, 45, 44, 44, 44, 44, 62, 63, 63, 64, 65, 63, 61, 58, 56, 55, 54, 52,
+ 51, 51, 50, 49, 49, 48, 48, 47, 47, 46, 46, 46, 45, 45, 45, 44, 44, 44,
+ 44, 44, 60, 61, 62, 63, 63, 61, 59, 57, 55, 54, 53, 52, 51, 50, 49, 49,
+ 48, 48, 47, 47, 46, 46, 46, 45, 45, 45, 44, 44, 44, 44, 44, 44, 58, 59,
+ 60, 61, 62, 60, 58, 56, 54, 53, 52, 51, 50, 49, 49, 48, 48, 47, 47, 46,
+ 46, 46, 45, 45, 45, 44, 44, 44, 44, 44, 44, 44, 57, 57, 58, 59, 60, 58,
+ 57, 55, 54, 53, 51, 50, 49, 49, 48, 48, 47, 47, 46, 46, 45, 45, 45, 45,
+ 44, 44, 44, 44, 43, 43, 43, 43, 55, 56, 56, 57, 58, 57, 55, 54, 53, 52,
+ 51, 50, 49, 48, 48, 47, 46, 46, 46, 45, 45, 45, 44, 44, 44, 44, 43, 43,
+ 43, 43, 43, 43, 53, 54, 55, 56, 57, 56, 54, 53, 52, 51, 50, 49, 48, 48,
+ 47, 47, 46, 46, 45, 45, 45, 44, 44, 44, 44, 43, 43, 43, 43, 43, 43, 43,
+ 52, 53, 54, 55, 55, 54, 53, 52, 51, 50, 50, 49, 48, 47, 47, 46, 46, 45,
+ 45, 45, 44, 44, 44, 44, 43, 43, 43, 43, 43, 43, 43, 43, 51, 52, 53, 53,
+ 54, 53, 52, 51, 50, 50, 49, 48, 47, 47, 46, 46, 45, 45, 45, 44, 44, 44,
+ 44, 43, 43, 43, 43, 43, 42, 42, 42, 42, 50, 51, 51, 52, 53, 52, 51, 50,
+ 50, 49, 48, 48, 47, 46, 46, 45, 45, 45, 44, 44, 44, 43, 43, 43, 43, 43,
+ 43, 42, 42, 42, 42, 42, 49, 50, 50, 51, 52, 51, 50, 50, 49, 48, 48, 47,
+ 46, 46, 46, 45, 45, 44, 44, 44, 43, 43, 43, 43, 43, 43, 42, 42, 42, 42,
+ 42, 42, 48, 49, 49, 50, 51, 50, 50, 49, 48, 48, 47, 47, 46, 46, 45, 45,
+ 44, 44, 44, 44, 43, 43, 43, 43, 42, 42, 42, 42, 42, 42, 42, 42, 47, 48,
+ 49, 49, 50, 49, 49, 48, 48, 47, 47, 46, 46, 45, 45, 45, 44, 44, 44, 43,
+ 43, 43, 43, 43, 42, 42, 42, 42, 42, 42, 42, 42, 47, 47, 48, 48, 49, 48,
+ 48, 48, 47, 47, 46, 46, 45, 45, 45, 44, 44, 44, 43, 43, 43, 43, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42, 46, 47, 47, 48, 48, 48, 47, 47, 47, 46,
+ 46, 45, 45, 45, 44, 44, 44, 43, 43, 43, 43, 43, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 46, 46, 47, 47, 48, 47, 47, 47, 46, 46, 45, 45, 45, 44,
+ 44, 44, 43, 43, 43, 43, 43, 42, 42, 42, 42, 42, 42, 42, 41, 41, 41, 41,
+ 45, 45, 46, 46, 47, 47, 46, 46, 46, 45, 45, 45, 44, 44, 44, 44, 43, 43,
+ 43, 43, 42, 42, 42, 42, 42, 42, 42, 41, 41, 41, 41, 41, 45, 45, 45, 46,
+ 46, 46, 46, 45, 45, 45, 45, 44, 44, 44, 44, 43, 43, 43, 43, 42, 42, 42,
+ 42, 42, 42, 42, 41, 41, 41, 41, 41, 41, 45, 45, 45, 46, 46, 46, 46, 45,
+ 45, 45, 45, 44, 44, 44, 44, 43, 43, 43, 43, 42, 42, 42, 42, 42, 42, 42,
+ 41, 41, 41, 41, 41, 41, 45, 45, 45, 46, 46, 46, 46, 45, 45, 45, 45, 44,
+ 44, 44, 44, 43, 43, 43, 43, 42, 42, 42, 42, 42, 42, 42, 41, 41, 41, 41,
+ 41, 41, 45, 45, 45, 46, 46, 46, 46, 45, 45, 45, 45, 44, 44, 44, 44, 43,
+ 43, 43, 43, 42, 42, 42, 42, 42, 42, 42, 41, 41, 41, 41, 41, 41 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 93, 88, 70, 60, 88, 73, 64, 58, 70, 64, 58, 55, 60, 58, 55, 53,
+ /* Size 8 */
+ 84, 98, 94, 82, 71, 65, 60, 57, 98, 91, 93, 86, 76, 69, 63, 60, 94, 93,
+ 79, 73, 69, 65, 61, 58, 82, 86, 73, 67, 63, 61, 58, 57, 71, 76, 69, 63,
+ 60, 58, 56, 55, 65, 69, 65, 61, 58, 56, 55, 54, 60, 63, 61, 58, 56, 55,
+ 54, 53, 57, 60, 58, 57, 55, 54, 53, 53,
+ /* Size 16 */
+ 86, 93, 100, 98, 96, 89, 83, 78, 73, 69, 66, 63, 61, 60, 58, 58, 93, 94,
+ 96, 96, 95, 90, 85, 80, 75, 71, 68, 65, 63, 61, 59, 59, 100, 96, 93, 94,
+ 95, 91, 87, 82, 77, 74, 70, 67, 64, 62, 61, 61, 98, 96, 94, 90, 87, 84,
+ 81, 77, 74, 71, 68, 65, 63, 61, 60, 60, 96, 95, 95, 87, 80, 77, 74, 72,
+ 70, 68, 66, 64, 62, 61, 59, 59, 89, 90, 91, 84, 77, 74, 71, 69, 67, 65,
+ 64, 62, 61, 59, 58, 58, 83, 85, 87, 81, 74, 71, 68, 66, 64, 63, 62, 60,
+ 59, 58, 58, 58, 78, 80, 82, 77, 72, 69, 66, 64, 62, 61, 60, 59, 58, 58,
+ 57, 57, 73, 75, 77, 74, 70, 67, 64, 62, 61, 60, 59, 58, 57, 57, 56, 56,
+ 69, 71, 74, 71, 68, 65, 63, 61, 60, 59, 58, 57, 57, 56, 56, 56, 66, 68,
+ 70, 68, 66, 64, 62, 60, 59, 58, 57, 56, 56, 55, 55, 55, 63, 65, 67, 65,
+ 64, 62, 60, 59, 58, 57, 56, 56, 55, 55, 55, 55, 61, 63, 64, 63, 62, 61,
+ 59, 58, 57, 57, 56, 55, 55, 55, 54, 54, 60, 61, 62, 61, 61, 59, 58, 58,
+ 57, 56, 55, 55, 55, 54, 54, 54, 58, 59, 61, 60, 59, 58, 58, 57, 56, 56,
+ 55, 55, 54, 54, 54, 54, 58, 59, 61, 60, 59, 58, 58, 57, 56, 56, 55, 55,
+ 54, 54, 54, 54,
+ /* Size 32 */
+ 86, 90, 93, 97, 101, 100, 98, 97, 96, 93, 90, 87, 83, 81, 78, 76, 73,
+ 71, 70, 68, 66, 65, 64, 63, 62, 61, 60, 59, 59, 59, 59, 59, 90, 92, 94,
+ 97, 99, 98, 97, 97, 96, 93, 90, 87, 84, 82, 79, 77, 74, 73, 71, 69, 67,
+ 66, 65, 64, 62, 62, 61, 60, 59, 59, 59, 59, 93, 94, 95, 96, 97, 97, 96,
+ 96, 96, 93, 91, 88, 86, 83, 81, 78, 76, 74, 72, 70, 68, 67, 66, 64, 63,
+ 62, 62, 61, 60, 60, 60, 60, 97, 97, 96, 96, 95, 95, 95, 96, 96, 93, 91,
+ 89, 87, 84, 82, 79, 77, 75, 73, 71, 69, 68, 67, 65, 64, 63, 62, 61, 60,
+ 60, 60, 60, 101, 99, 97, 95, 93, 94, 94, 95, 95, 94, 92, 90, 88, 85, 83,
+ 80, 78, 76, 74, 72, 70, 69, 68, 66, 65, 64, 63, 62, 61, 61, 61, 61, 100,
+ 98, 97, 95, 94, 93, 93, 92, 92, 90, 88, 86, 85, 82, 80, 78, 76, 74, 73,
+ 71, 69, 68, 67, 65, 64, 63, 62, 62, 61, 61, 61, 61, 98, 97, 96, 95, 94,
+ 93, 91, 90, 88, 86, 85, 83, 81, 80, 78, 76, 74, 73, 71, 70, 68, 67, 66,
+ 65, 64, 63, 62, 61, 60, 60, 60, 60, 97, 97, 96, 96, 95, 92, 90, 87, 84,
+ 83, 81, 80, 78, 77, 75, 74, 72, 71, 70, 68, 67, 66, 65, 64, 63, 62, 61,
+ 61, 60, 60, 60, 60, 96, 96, 96, 96, 95, 92, 88, 84, 80, 79, 78, 76, 75,
+ 74, 73, 72, 70, 69, 68, 67, 66, 65, 64, 63, 62, 62, 61, 60, 60, 60, 60,
+ 60, 93, 93, 93, 93, 94, 90, 86, 83, 79, 78, 76, 75, 73, 72, 71, 70, 69,
+ 68, 67, 66, 65, 64, 63, 63, 62, 61, 60, 60, 59, 59, 59, 59, 90, 90, 91,
+ 91, 92, 88, 85, 81, 78, 76, 75, 73, 72, 71, 70, 69, 67, 67, 66, 65, 64,
+ 63, 63, 62, 61, 61, 60, 59, 59, 59, 59, 59, 87, 87, 88, 89, 90, 86, 83,
+ 80, 76, 75, 73, 72, 70, 69, 68, 67, 66, 65, 65, 64, 63, 62, 62, 61, 60,
+ 60, 59, 59, 58, 58, 58, 58, 83, 84, 86, 87, 88, 85, 81, 78, 75, 73, 72,
+ 70, 68, 67, 66, 66, 65, 64, 63, 63, 62, 61, 61, 60, 60, 59, 59, 58, 58,
+ 58, 58, 58, 81, 82, 83, 84, 85, 82, 80, 77, 74, 72, 71, 69, 67, 66, 66,
+ 65, 64, 63, 63, 62, 61, 61, 60, 60, 59, 59, 58, 58, 58, 58, 58, 58, 78,
+ 79, 81, 82, 83, 80, 78, 75, 73, 71, 70, 68, 66, 66, 65, 64, 63, 62, 62,
+ 61, 61, 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 57, 76, 77, 78, 79, 80,
+ 78, 76, 74, 72, 70, 69, 67, 66, 65, 64, 63, 62, 62, 61, 60, 60, 59, 59,
+ 59, 58, 58, 58, 57, 57, 57, 57, 57, 73, 74, 76, 77, 78, 76, 74, 72, 70,
+ 69, 67, 66, 65, 64, 63, 62, 61, 61, 60, 60, 59, 59, 58, 58, 58, 57, 57,
+ 57, 57, 57, 57, 57, 71, 73, 74, 75, 76, 74, 73, 71, 69, 68, 67, 65, 64,
+ 63, 62, 62, 61, 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 57, 56, 56, 56,
+ 56, 70, 71, 72, 73, 74, 73, 71, 70, 68, 67, 66, 65, 63, 63, 62, 61, 60,
+ 60, 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56, 56, 56, 68, 69, 70,
+ 71, 72, 71, 70, 68, 67, 66, 65, 64, 63, 62, 61, 60, 60, 59, 59, 58, 58,
+ 58, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 66, 67, 68, 69, 70, 69, 68,
+ 67, 66, 65, 64, 63, 62, 61, 61, 60, 59, 59, 58, 58, 57, 57, 57, 57, 56,
+ 56, 56, 56, 55, 55, 55, 55, 65, 66, 67, 68, 69, 68, 67, 66, 65, 64, 63,
+ 62, 61, 61, 60, 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56, 55, 55,
+ 55, 55, 55, 64, 65, 66, 67, 68, 67, 66, 65, 64, 63, 63, 62, 61, 60, 60,
+ 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 63,
+ 64, 64, 65, 66, 65, 65, 64, 63, 63, 62, 61, 60, 60, 59, 59, 58, 58, 57,
+ 57, 57, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 62, 62, 63, 64, 65,
+ 64, 64, 63, 62, 62, 61, 60, 60, 59, 59, 58, 58, 57, 57, 57, 56, 56, 56,
+ 56, 55, 55, 55, 55, 55, 55, 55, 55, 61, 62, 62, 63, 64, 63, 63, 62, 62,
+ 61, 61, 60, 59, 59, 58, 58, 57, 57, 57, 56, 56, 56, 56, 55, 55, 55, 55,
+ 55, 54, 54, 54, 54, 60, 61, 62, 62, 63, 62, 62, 61, 61, 60, 60, 59, 59,
+ 58, 58, 58, 57, 57, 56, 56, 56, 56, 55, 55, 55, 55, 55, 54, 54, 54, 54,
+ 54, 59, 60, 61, 61, 62, 62, 61, 61, 60, 60, 59, 59, 58, 58, 58, 57, 57,
+ 57, 56, 56, 56, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 59, 59, 60,
+ 60, 61, 61, 60, 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 55,
+ 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 59, 59, 60, 60, 61, 61, 60,
+ 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 55, 55, 55, 55, 55,
+ 54, 54, 54, 54, 54, 54, 54, 59, 59, 60, 60, 61, 61, 60, 60, 60, 59, 59,
+ 58, 58, 58, 57, 57, 57, 56, 56, 56, 55, 55, 55, 55, 55, 54, 54, 54, 54,
+ 54, 54, 54, 59, 59, 60, 60, 61, 61, 60, 60, 60, 59, 59, 58, 58, 58, 57,
+ 57, 57, 56, 56, 56, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 51, 50, 45, 51, 48, 47, 44, 50, 47, 43, 41, 45, 44, 41, 40,
+ /* Size 8 */
+ 64, 72, 54, 52, 51, 49, 47, 45, 72, 59, 53, 55, 55, 53, 50, 48, 54, 53,
+ 50, 51, 51, 50, 48, 47, 52, 55, 51, 48, 48, 47, 46, 45, 51, 55, 51, 48,
+ 46, 45, 44, 43, 49, 53, 50, 47, 45, 44, 43, 42, 47, 50, 48, 46, 44, 43,
+ 42, 41, 45, 48, 47, 45, 43, 42, 41, 41,
+ /* Size 16 */
+ 64, 68, 72, 63, 54, 53, 52, 52, 51, 50, 49, 48, 47, 46, 45, 45, 68, 67,
+ 66, 59, 53, 54, 54, 53, 53, 52, 51, 50, 48, 47, 46, 46, 72, 66, 59, 56,
+ 53, 54, 55, 55, 55, 54, 53, 51, 50, 49, 48, 48, 63, 59, 56, 54, 52, 52,
+ 53, 53, 53, 52, 51, 50, 49, 48, 47, 47, 54, 53, 53, 52, 50, 50, 51, 51,
+ 51, 50, 50, 49, 48, 47, 47, 47, 53, 54, 54, 52, 50, 50, 49, 49, 49, 49,
+ 48, 48, 47, 46, 46, 46, 52, 54, 55, 53, 51, 49, 48, 48, 48, 47, 47, 46,
+ 46, 45, 45, 45, 52, 53, 55, 53, 51, 49, 48, 47, 47, 46, 46, 45, 45, 45,
+ 44, 44, 51, 53, 55, 53, 51, 49, 48, 47, 46, 45, 45, 44, 44, 44, 43, 43,
+ 50, 52, 54, 52, 50, 49, 47, 46, 45, 45, 44, 44, 43, 43, 43, 43, 49, 51,
+ 53, 51, 50, 48, 47, 46, 45, 44, 44, 43, 43, 42, 42, 42, 48, 50, 51, 50,
+ 49, 48, 46, 45, 44, 44, 43, 43, 42, 42, 42, 42, 47, 48, 50, 49, 48, 47,
+ 46, 45, 44, 43, 43, 42, 42, 42, 41, 41, 46, 47, 49, 48, 47, 46, 45, 45,
+ 44, 43, 42, 42, 42, 41, 41, 41, 45, 46, 48, 47, 47, 46, 45, 44, 43, 43,
+ 42, 42, 41, 41, 41, 41, 45, 46, 48, 47, 47, 46, 45, 44, 43, 43, 42, 42,
+ 41, 41, 41, 41,
+ /* Size 32 */
+ 64, 66, 68, 70, 72, 67, 63, 58, 54, 53, 53, 53, 52, 52, 52, 51, 51, 50,
+ 50, 49, 49, 48, 48, 47, 47, 46, 46, 45, 45, 45, 45, 45, 66, 67, 67, 68,
+ 69, 65, 61, 57, 53, 53, 53, 53, 53, 53, 52, 52, 52, 51, 51, 50, 50, 49,
+ 49, 48, 48, 47, 47, 46, 46, 46, 46, 46, 68, 67, 67, 66, 66, 63, 59, 56,
+ 53, 53, 54, 54, 54, 54, 53, 53, 53, 52, 52, 51, 51, 50, 50, 49, 48, 48,
+ 47, 47, 46, 46, 46, 46, 70, 68, 66, 64, 62, 60, 58, 56, 53, 54, 54, 54,
+ 55, 54, 54, 54, 54, 53, 53, 52, 52, 51, 50, 50, 49, 49, 48, 48, 47, 47,
+ 47, 47, 72, 69, 66, 62, 59, 58, 56, 55, 53, 54, 54, 55, 55, 55, 55, 55,
+ 55, 54, 54, 53, 53, 52, 51, 51, 50, 50, 49, 48, 48, 48, 48, 48, 67, 65,
+ 63, 60, 58, 56, 55, 54, 52, 53, 53, 54, 54, 54, 54, 54, 54, 53, 53, 52,
+ 52, 51, 51, 50, 50, 49, 49, 48, 48, 48, 48, 48, 63, 61, 59, 58, 56, 55,
+ 54, 53, 52, 52, 52, 53, 53, 53, 53, 53, 53, 52, 52, 52, 51, 51, 50, 50,
+ 49, 49, 48, 48, 47, 47, 47, 47, 58, 57, 56, 56, 55, 54, 53, 52, 51, 51,
+ 51, 52, 52, 52, 52, 52, 52, 51, 51, 51, 50, 50, 50, 49, 49, 48, 48, 47,
+ 47, 47, 47, 47, 54, 53, 53, 53, 53, 52, 52, 51, 50, 50, 50, 50, 51, 51,
+ 51, 51, 51, 50, 50, 50, 50, 49, 49, 49, 48, 48, 47, 47, 47, 47, 47, 47,
+ 53, 53, 53, 54, 54, 53, 52, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 49, 49, 49, 49, 48, 48, 48, 47, 47, 46, 46, 46, 46, 46, 53, 53, 54, 54,
+ 54, 53, 52, 51, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 49, 48, 48, 48,
+ 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 53, 53, 54, 54, 55, 54, 53, 52,
+ 50, 50, 50, 49, 49, 49, 49, 48, 48, 48, 48, 48, 48, 47, 47, 47, 46, 46,
+ 46, 46, 45, 45, 45, 45, 52, 53, 54, 55, 55, 54, 53, 52, 51, 50, 49, 49,
+ 48, 48, 48, 48, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 45,
+ 45, 45, 52, 53, 54, 54, 55, 54, 53, 52, 51, 50, 49, 49, 48, 48, 48, 47,
+ 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 45, 44, 44, 44, 44, 52, 52,
+ 53, 54, 55, 54, 53, 52, 51, 50, 49, 49, 48, 48, 47, 47, 47, 46, 46, 46,
+ 46, 46, 45, 45, 45, 45, 45, 44, 44, 44, 44, 44, 51, 52, 53, 54, 55, 54,
+ 53, 52, 51, 50, 49, 48, 48, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 45,
+ 45, 44, 44, 44, 44, 44, 44, 44, 51, 52, 53, 54, 55, 54, 53, 52, 51, 50,
+ 49, 48, 48, 47, 47, 46, 46, 46, 45, 45, 45, 45, 44, 44, 44, 44, 44, 44,
+ 43, 43, 43, 43, 50, 51, 52, 53, 54, 53, 52, 51, 50, 50, 49, 48, 47, 47,
+ 46, 46, 46, 45, 45, 45, 45, 44, 44, 44, 44, 44, 43, 43, 43, 43, 43, 43,
+ 50, 51, 52, 53, 54, 53, 52, 51, 50, 49, 49, 48, 47, 47, 46, 46, 45, 45,
+ 45, 44, 44, 44, 44, 44, 43, 43, 43, 43, 43, 43, 43, 43, 49, 50, 51, 52,
+ 53, 52, 52, 51, 50, 49, 48, 48, 47, 47, 46, 46, 45, 45, 44, 44, 44, 44,
+ 43, 43, 43, 43, 43, 43, 42, 42, 42, 42, 49, 50, 51, 52, 53, 52, 51, 50,
+ 50, 49, 48, 48, 47, 46, 46, 45, 45, 45, 44, 44, 44, 43, 43, 43, 43, 43,
+ 42, 42, 42, 42, 42, 42, 48, 49, 50, 51, 52, 51, 51, 50, 49, 49, 48, 47,
+ 47, 46, 46, 45, 45, 44, 44, 44, 43, 43, 43, 43, 43, 42, 42, 42, 42, 42,
+ 42, 42, 48, 49, 50, 50, 51, 51, 50, 50, 49, 48, 48, 47, 46, 46, 45, 45,
+ 44, 44, 44, 43, 43, 43, 43, 43, 42, 42, 42, 42, 42, 42, 42, 42, 47, 48,
+ 49, 50, 51, 50, 50, 49, 49, 48, 47, 47, 46, 46, 45, 45, 44, 44, 44, 43,
+ 43, 43, 43, 42, 42, 42, 42, 42, 41, 41, 41, 41, 47, 48, 48, 49, 50, 50,
+ 49, 49, 48, 48, 47, 46, 46, 45, 45, 45, 44, 44, 43, 43, 43, 43, 42, 42,
+ 42, 42, 42, 41, 41, 41, 41, 41, 46, 47, 48, 49, 50, 49, 49, 48, 48, 47,
+ 47, 46, 46, 45, 45, 44, 44, 44, 43, 43, 43, 42, 42, 42, 42, 42, 41, 41,
+ 41, 41, 41, 41, 46, 47, 47, 48, 49, 49, 48, 48, 47, 47, 46, 46, 45, 45,
+ 45, 44, 44, 43, 43, 43, 42, 42, 42, 42, 42, 41, 41, 41, 41, 41, 41, 41,
+ 45, 46, 47, 48, 48, 48, 48, 47, 47, 46, 46, 46, 45, 45, 44, 44, 44, 43,
+ 43, 43, 42, 42, 42, 42, 41, 41, 41, 41, 41, 41, 41, 41, 45, 46, 46, 47,
+ 48, 48, 47, 47, 47, 46, 46, 45, 45, 44, 44, 44, 43, 43, 43, 42, 42, 42,
+ 42, 41, 41, 41, 41, 41, 41, 41, 41, 41, 45, 46, 46, 47, 48, 48, 47, 47,
+ 47, 46, 46, 45, 45, 44, 44, 44, 43, 43, 43, 42, 42, 42, 42, 41, 41, 41,
+ 41, 41, 41, 41, 41, 41, 45, 46, 46, 47, 48, 48, 47, 47, 47, 46, 46, 45,
+ 45, 44, 44, 44, 43, 43, 43, 42, 42, 42, 42, 41, 41, 41, 41, 41, 41, 41,
+ 41, 41, 45, 46, 46, 47, 48, 48, 47, 47, 47, 46, 46, 45, 45, 44, 44, 44,
+ 43, 43, 43, 42, 42, 42, 42, 41, 41, 41, 41, 41, 41, 41, 41, 41 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 90, 72, 69, 63, 72, 66, 65, 62, 69, 65, 59, 57, 63, 62, 57, 54,
+ /* Size 8 */
+ 87, 98, 72, 70, 68, 65, 62, 60, 98, 80, 71, 74, 74, 70, 67, 64, 72, 71,
+ 67, 68, 68, 66, 64, 62, 70, 74, 68, 64, 63, 62, 61, 59, 68, 74, 68, 63,
+ 61, 59, 58, 57, 65, 70, 66, 62, 59, 58, 57, 56, 62, 67, 64, 61, 58, 57,
+ 55, 54, 60, 64, 62, 59, 57, 56, 54, 53,
+ /* Size 16 */
+ 88, 93, 99, 86, 73, 72, 71, 70, 69, 67, 66, 64, 63, 62, 60, 60, 93, 92,
+ 90, 81, 72, 73, 73, 72, 72, 70, 69, 67, 65, 64, 62, 62, 99, 90, 81, 76,
+ 72, 74, 75, 75, 74, 73, 71, 69, 68, 66, 65, 65, 86, 81, 76, 73, 70, 71,
+ 72, 72, 71, 70, 69, 68, 66, 65, 64, 64, 73, 72, 72, 70, 67, 68, 68, 68,
+ 68, 68, 67, 66, 65, 64, 63, 63, 72, 73, 74, 71, 68, 67, 67, 66, 66, 66,
+ 65, 64, 63, 62, 61, 61, 71, 73, 75, 72, 68, 67, 65, 65, 64, 63, 63, 62,
+ 62, 61, 60, 60, 70, 72, 75, 72, 68, 66, 65, 64, 63, 62, 62, 61, 60, 60,
+ 59, 59, 69, 72, 74, 71, 68, 66, 64, 63, 61, 61, 60, 60, 59, 58, 58, 58,
+ 67, 70, 73, 70, 68, 66, 63, 62, 61, 60, 59, 59, 58, 58, 57, 57, 66, 69,
+ 71, 69, 67, 65, 63, 62, 60, 59, 58, 58, 57, 57, 56, 56, 64, 67, 69, 68,
+ 66, 64, 62, 61, 60, 59, 58, 57, 56, 56, 56, 56, 63, 65, 68, 66, 65, 63,
+ 62, 60, 59, 58, 57, 56, 56, 55, 55, 55, 62, 64, 66, 65, 64, 62, 61, 60,
+ 58, 58, 57, 56, 55, 55, 54, 54, 60, 62, 65, 64, 63, 61, 60, 59, 58, 57,
+ 56, 56, 55, 54, 54, 54, 60, 62, 65, 64, 63, 61, 60, 59, 58, 57, 56, 56,
+ 55, 54, 54, 54,
+ /* Size 32 */
+ 88, 91, 94, 97, 100, 93, 86, 80, 73, 73, 72, 72, 71, 71, 70, 70, 69, 68,
+ 68, 67, 66, 65, 65, 64, 63, 63, 62, 61, 61, 61, 61, 61, 91, 92, 93, 94,
+ 95, 89, 84, 78, 73, 73, 73, 72, 72, 72, 71, 71, 71, 70, 69, 68, 68, 67,
+ 66, 65, 64, 64, 63, 62, 62, 62, 62, 62, 94, 93, 92, 91, 90, 86, 82, 77,
+ 73, 73, 73, 73, 73, 73, 73, 72, 72, 71, 70, 70, 69, 68, 67, 66, 66, 65,
+ 64, 63, 63, 63, 63, 63, 97, 94, 91, 89, 86, 83, 79, 76, 73, 73, 74, 74,
+ 75, 74, 74, 74, 73, 73, 72, 71, 70, 69, 69, 68, 67, 66, 65, 65, 64, 64,
+ 64, 64, 100, 95, 90, 86, 81, 79, 77, 75, 72, 73, 74, 75, 76, 75, 75, 75,
+ 75, 74, 73, 72, 72, 71, 70, 69, 68, 67, 66, 66, 65, 65, 65, 65, 93, 89,
+ 86, 83, 79, 77, 75, 73, 71, 72, 73, 73, 74, 74, 74, 73, 73, 73, 72, 71,
+ 71, 70, 69, 68, 67, 67, 66, 65, 64, 64, 64, 64, 86, 84, 82, 79, 77, 75,
+ 73, 72, 70, 71, 71, 72, 72, 72, 72, 72, 72, 71, 71, 70, 70, 69, 68, 67,
+ 67, 66, 65, 65, 64, 64, 64, 64, 80, 78, 77, 76, 75, 73, 72, 70, 69, 69,
+ 70, 70, 70, 70, 70, 70, 70, 70, 69, 69, 68, 68, 67, 67, 66, 65, 65, 64,
+ 63, 63, 63, 63, 73, 73, 73, 73, 72, 71, 70, 69, 68, 68, 68, 68, 69, 69,
+ 69, 69, 69, 68, 68, 68, 67, 67, 66, 66, 65, 65, 64, 63, 63, 63, 63, 63,
+ 73, 73, 73, 73, 73, 72, 71, 69, 68, 68, 68, 68, 68, 68, 68, 68, 68, 67,
+ 67, 67, 66, 66, 65, 65, 64, 64, 63, 63, 62, 62, 62, 62, 72, 73, 73, 74,
+ 74, 73, 71, 70, 68, 68, 68, 67, 67, 67, 67, 67, 67, 66, 66, 66, 65, 65,
+ 64, 64, 64, 63, 63, 62, 62, 62, 62, 62, 72, 72, 73, 74, 75, 73, 72, 70,
+ 68, 68, 67, 67, 66, 66, 66, 66, 65, 65, 65, 65, 64, 64, 64, 63, 63, 62,
+ 62, 62, 61, 61, 61, 61, 71, 72, 73, 75, 76, 74, 72, 70, 69, 68, 67, 66,
+ 65, 65, 65, 65, 64, 64, 64, 64, 63, 63, 63, 62, 62, 62, 61, 61, 60, 60,
+ 60, 60, 71, 72, 73, 74, 75, 74, 72, 70, 69, 68, 67, 66, 65, 65, 64, 64,
+ 64, 63, 63, 63, 63, 62, 62, 62, 61, 61, 61, 60, 60, 60, 60, 60, 70, 71,
+ 73, 74, 75, 74, 72, 70, 69, 68, 67, 66, 65, 64, 64, 64, 63, 63, 62, 62,
+ 62, 62, 61, 61, 61, 60, 60, 60, 59, 59, 59, 59, 70, 71, 72, 74, 75, 73,
+ 72, 70, 69, 68, 67, 66, 65, 64, 64, 63, 62, 62, 62, 61, 61, 61, 61, 60,
+ 60, 60, 59, 59, 59, 59, 59, 59, 69, 71, 72, 73, 75, 73, 72, 70, 69, 68,
+ 67, 65, 64, 64, 63, 62, 62, 61, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59,
+ 58, 58, 58, 58, 68, 70, 71, 73, 74, 73, 71, 70, 68, 67, 66, 65, 64, 63,
+ 63, 62, 61, 61, 61, 60, 60, 60, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58,
+ 68, 69, 70, 72, 73, 72, 71, 69, 68, 67, 66, 65, 64, 63, 62, 62, 61, 61,
+ 60, 60, 60, 59, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57, 67, 68, 70, 71,
+ 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 61, 60, 60, 60, 59, 59,
+ 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 66, 68, 69, 70, 72, 71, 70, 68,
+ 67, 66, 65, 64, 63, 63, 62, 61, 60, 60, 60, 59, 59, 58, 58, 58, 57, 57,
+ 57, 57, 57, 57, 57, 57, 65, 67, 68, 69, 71, 70, 69, 68, 67, 66, 65, 64,
+ 63, 62, 62, 61, 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 57, 56, 56, 56,
+ 56, 56, 65, 66, 67, 69, 70, 69, 68, 67, 66, 65, 64, 64, 63, 62, 61, 61,
+ 60, 59, 59, 58, 58, 58, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 64, 65,
+ 66, 68, 69, 68, 67, 67, 66, 65, 64, 63, 62, 62, 61, 60, 60, 59, 59, 58,
+ 58, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 56, 63, 64, 66, 67, 68, 67,
+ 67, 66, 65, 64, 64, 63, 62, 61, 61, 60, 59, 59, 58, 58, 57, 57, 57, 56,
+ 56, 56, 56, 55, 55, 55, 55, 55, 63, 64, 65, 66, 67, 67, 66, 65, 65, 64,
+ 63, 62, 62, 61, 60, 60, 59, 59, 58, 58, 57, 57, 57, 56, 56, 56, 55, 55,
+ 55, 55, 55, 55, 62, 63, 64, 65, 66, 66, 65, 65, 64, 63, 63, 62, 61, 61,
+ 60, 59, 59, 58, 58, 57, 57, 57, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55,
+ 61, 62, 63, 65, 66, 65, 65, 64, 63, 63, 62, 62, 61, 60, 60, 59, 59, 58,
+ 58, 57, 57, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 61, 62, 63, 64,
+ 65, 64, 64, 63, 63, 62, 62, 61, 60, 60, 59, 59, 58, 58, 57, 57, 57, 56,
+ 56, 56, 55, 55, 55, 55, 54, 54, 54, 54, 61, 62, 63, 64, 65, 64, 64, 63,
+ 63, 62, 62, 61, 60, 60, 59, 59, 58, 58, 57, 57, 57, 56, 56, 56, 55, 55,
+ 55, 55, 54, 54, 54, 54, 61, 62, 63, 64, 65, 64, 64, 63, 63, 62, 62, 61,
+ 60, 60, 59, 59, 58, 58, 57, 57, 57, 56, 56, 56, 55, 55, 55, 55, 54, 54,
+ 54, 54, 61, 62, 63, 64, 65, 64, 64, 63, 63, 62, 62, 61, 60, 60, 59, 59,
+ 58, 58, 57, 57, 57, 56, 56, 56, 55, 55, 55, 55, 54, 54, 54, 54 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 62, 51, 46, 62, 53, 48, 45, 51, 48, 45, 43, 46, 45, 43, 42,
+ /* Size 8 */
+ 64, 72, 70, 62, 56, 52, 50, 48, 72, 68, 69, 65, 59, 55, 52, 49, 70, 69,
+ 61, 57, 55, 52, 50, 49, 62, 65, 57, 53, 51, 50, 49, 48, 56, 59, 55, 51,
+ 49, 48, 47, 47, 52, 55, 52, 50, 48, 47, 47, 46, 50, 52, 50, 49, 47, 47,
+ 46, 46, 48, 49, 49, 48, 47, 46, 46, 45,
+ /* Size 16 */
+ 64, 68, 72, 71, 70, 66, 62, 59, 56, 54, 52, 51, 50, 49, 48, 48, 68, 69,
+ 70, 70, 70, 67, 64, 61, 58, 56, 54, 52, 51, 50, 49, 49, 72, 70, 68, 69,
+ 69, 67, 65, 62, 59, 57, 55, 53, 52, 50, 49, 49, 71, 70, 69, 67, 65, 63,
+ 61, 59, 57, 55, 53, 52, 51, 50, 49, 49, 70, 70, 69, 65, 61, 59, 57, 56,
+ 55, 53, 52, 51, 50, 49, 49, 49, 66, 67, 67, 63, 59, 57, 55, 54, 53, 52,
+ 51, 50, 49, 49, 48, 48, 62, 64, 65, 61, 57, 55, 53, 52, 51, 51, 50, 49,
+ 49, 48, 48, 48, 59, 61, 62, 59, 56, 54, 52, 51, 50, 50, 49, 49, 48, 48,
+ 47, 47, 56, 58, 59, 57, 55, 53, 51, 50, 49, 49, 48, 48, 47, 47, 47, 47,
+ 54, 56, 57, 55, 53, 52, 51, 50, 49, 48, 48, 47, 47, 47, 46, 46, 52, 54,
+ 55, 53, 52, 51, 50, 49, 48, 48, 47, 47, 47, 46, 46, 46, 51, 52, 53, 52,
+ 51, 50, 49, 49, 48, 47, 47, 47, 46, 46, 46, 46, 50, 51, 52, 51, 50, 49,
+ 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 49, 50, 50, 50, 49, 49, 48, 48,
+ 47, 47, 46, 46, 46, 46, 45, 45, 48, 49, 49, 49, 49, 48, 48, 47, 47, 46,
+ 46, 46, 46, 45, 45, 45, 48, 49, 49, 49, 49, 48, 48, 47, 47, 46, 46, 46,
+ 46, 45, 45, 45,
+ /* Size 32 */
+ 64, 66, 68, 70, 72, 72, 71, 70, 70, 68, 66, 64, 62, 61, 59, 58, 56, 55,
+ 54, 53, 52, 52, 51, 50, 50, 49, 49, 48, 48, 48, 48, 48, 66, 67, 69, 70,
+ 71, 71, 70, 70, 70, 68, 66, 65, 63, 61, 60, 59, 57, 56, 55, 54, 53, 52,
+ 52, 51, 50, 50, 49, 49, 48, 48, 48, 48, 68, 69, 69, 70, 70, 70, 70, 70,
+ 70, 68, 67, 65, 64, 62, 61, 59, 58, 57, 56, 55, 54, 53, 52, 51, 51, 50,
+ 50, 49, 49, 49, 49, 49, 70, 70, 70, 69, 69, 69, 69, 69, 69, 68, 67, 66,
+ 64, 63, 61, 60, 58, 57, 56, 55, 54, 53, 53, 52, 51, 51, 50, 49, 49, 49,
+ 49, 49, 72, 71, 70, 69, 68, 68, 69, 69, 69, 68, 67, 66, 65, 63, 62, 61,
+ 59, 58, 57, 56, 55, 54, 53, 52, 52, 51, 50, 50, 49, 49, 49, 49, 72, 71,
+ 70, 69, 68, 68, 68, 67, 67, 66, 65, 64, 63, 62, 61, 59, 58, 57, 56, 55,
+ 54, 53, 53, 52, 51, 51, 50, 50, 49, 49, 49, 49, 71, 70, 70, 69, 69, 68,
+ 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 53, 52, 51,
+ 51, 50, 50, 49, 49, 49, 49, 49, 70, 70, 70, 69, 69, 67, 66, 64, 63, 62,
+ 61, 60, 59, 58, 58, 57, 56, 55, 54, 54, 53, 52, 52, 51, 50, 50, 50, 49,
+ 49, 49, 49, 49, 70, 70, 70, 69, 69, 67, 65, 63, 61, 60, 59, 58, 57, 57,
+ 56, 55, 55, 54, 53, 53, 52, 52, 51, 51, 50, 50, 49, 49, 49, 49, 49, 49,
+ 68, 68, 68, 68, 68, 66, 64, 62, 60, 59, 58, 57, 56, 56, 55, 55, 54, 53,
+ 53, 52, 52, 51, 51, 50, 50, 49, 49, 49, 48, 48, 48, 48, 66, 66, 67, 67,
+ 67, 65, 63, 61, 59, 58, 57, 56, 55, 55, 54, 54, 53, 53, 52, 52, 51, 51,
+ 50, 50, 49, 49, 49, 48, 48, 48, 48, 48, 64, 65, 65, 66, 66, 64, 62, 60,
+ 58, 57, 56, 55, 54, 54, 53, 53, 52, 52, 51, 51, 50, 50, 50, 49, 49, 49,
+ 48, 48, 48, 48, 48, 48, 62, 63, 64, 64, 65, 63, 61, 59, 57, 56, 55, 54,
+ 53, 53, 52, 52, 51, 51, 51, 50, 50, 50, 49, 49, 49, 48, 48, 48, 48, 48,
+ 48, 48, 61, 61, 62, 63, 63, 62, 60, 58, 57, 56, 55, 54, 53, 52, 52, 51,
+ 51, 51, 50, 50, 49, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, 47, 59, 60,
+ 61, 61, 62, 61, 59, 58, 56, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 49,
+ 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, 47, 47, 58, 59, 59, 60, 61, 59,
+ 58, 57, 55, 55, 54, 53, 52, 51, 51, 50, 50, 50, 49, 49, 49, 48, 48, 48,
+ 48, 48, 47, 47, 47, 47, 47, 47, 56, 57, 58, 58, 59, 58, 57, 56, 55, 54,
+ 53, 52, 51, 51, 50, 50, 49, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, 47,
+ 47, 47, 47, 47, 55, 56, 57, 57, 58, 57, 56, 55, 54, 53, 53, 52, 51, 51,
+ 50, 50, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 54, 55, 56, 56, 57, 56, 55, 54, 53, 53, 52, 51, 51, 50, 50, 49, 49, 49,
+ 48, 48, 48, 48, 47, 47, 47, 47, 47, 47, 46, 46, 46, 46, 53, 54, 55, 55,
+ 56, 55, 54, 54, 53, 52, 52, 51, 50, 50, 49, 49, 49, 48, 48, 48, 48, 47,
+ 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 52, 53, 54, 54, 55, 54, 53, 53,
+ 52, 52, 51, 50, 50, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, 47, 47, 46,
+ 46, 46, 46, 46, 46, 46, 52, 52, 53, 53, 54, 53, 53, 52, 52, 51, 51, 50,
+ 50, 49, 49, 48, 48, 48, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46,
+ 46, 46, 51, 52, 52, 53, 53, 53, 52, 52, 51, 51, 50, 50, 49, 49, 49, 48,
+ 48, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 50, 51,
+ 51, 52, 52, 52, 51, 51, 51, 50, 50, 49, 49, 49, 48, 48, 48, 47, 47, 47,
+ 47, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 50, 50, 51, 51, 52, 51,
+ 51, 50, 50, 50, 49, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 46, 49, 50, 50, 51, 51, 51, 50, 50, 50, 49,
+ 49, 49, 48, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 49, 49, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 48, 48,
+ 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45,
+ 48, 49, 49, 49, 50, 50, 49, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, 47,
+ 47, 46, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 45, 48, 48, 49, 49,
+ 49, 49, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46,
+ 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 48, 48, 49, 49, 49, 49, 49, 49,
+ 49, 48, 48, 48, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 46,
+ 45, 45, 45, 45, 45, 45, 48, 48, 49, 49, 49, 49, 49, 49, 49, 48, 48, 48,
+ 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45,
+ 45, 45, 48, 48, 49, 49, 49, 49, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47,
+ 47, 47, 46, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 86, 83, 68, 60, 83, 70, 64, 59, 68, 64, 59, 57, 60, 59, 57, 55,
+ /* Size 8 */
+ 79, 90, 87, 77, 69, 64, 61, 58, 90, 85, 86, 80, 73, 67, 63, 60, 87, 86,
+ 75, 71, 67, 64, 61, 59, 77, 80, 71, 66, 63, 61, 59, 58, 69, 73, 67, 63,
+ 60, 59, 58, 57, 64, 67, 64, 61, 59, 57, 57, 56, 61, 63, 61, 59, 58, 57,
+ 56, 55, 58, 60, 59, 58, 57, 56, 55, 55,
+ /* Size 16 */
+ 80, 86, 91, 90, 88, 83, 78, 74, 70, 68, 65, 63, 61, 60, 59, 59, 86, 87,
+ 89, 88, 88, 84, 80, 76, 72, 69, 66, 65, 63, 61, 60, 60, 91, 89, 86, 87,
+ 87, 84, 82, 78, 74, 71, 68, 66, 64, 62, 61, 61, 90, 88, 87, 84, 82, 79,
+ 77, 74, 71, 69, 66, 65, 63, 62, 60, 60, 88, 88, 87, 82, 76, 74, 72, 70,
+ 68, 66, 65, 63, 62, 61, 60, 60, 83, 84, 84, 79, 74, 71, 69, 68, 66, 65,
+ 63, 62, 61, 60, 59, 59, 78, 80, 82, 77, 72, 69, 66, 65, 64, 63, 62, 61,
+ 60, 59, 59, 59, 74, 76, 78, 74, 70, 68, 65, 64, 62, 62, 61, 60, 59, 59,
+ 58, 58, 70, 72, 74, 71, 68, 66, 64, 62, 61, 60, 60, 59, 58, 58, 57, 57,
+ 68, 69, 71, 69, 66, 65, 63, 62, 60, 60, 59, 58, 58, 57, 57, 57, 65, 66,
+ 68, 66, 65, 63, 62, 61, 60, 59, 58, 58, 57, 57, 57, 57, 63, 65, 66, 65,
+ 63, 62, 61, 60, 59, 58, 58, 57, 57, 57, 56, 56, 61, 63, 64, 63, 62, 61,
+ 60, 59, 58, 58, 57, 57, 57, 56, 56, 56, 60, 61, 62, 62, 61, 60, 59, 59,
+ 58, 57, 57, 57, 56, 56, 56, 56, 59, 60, 61, 60, 60, 59, 59, 58, 57, 57,
+ 57, 56, 56, 56, 56, 56, 59, 60, 61, 60, 60, 59, 59, 58, 57, 57, 57, 56,
+ 56, 56, 56, 56,
+ /* Size 32 */
+ 81, 84, 86, 89, 92, 91, 90, 89, 89, 86, 84, 81, 79, 77, 75, 73, 71, 69,
+ 68, 67, 65, 64, 64, 63, 62, 61, 61, 60, 59, 59, 59, 59, 84, 85, 87, 89,
+ 91, 90, 89, 89, 88, 86, 84, 82, 79, 78, 76, 74, 72, 70, 69, 67, 66, 65,
+ 64, 63, 62, 62, 61, 61, 60, 60, 60, 60, 86, 87, 88, 88, 89, 89, 89, 88,
+ 88, 86, 84, 82, 80, 78, 76, 75, 73, 71, 70, 68, 67, 66, 65, 64, 63, 62,
+ 62, 61, 60, 60, 60, 60, 89, 89, 88, 88, 88, 88, 88, 88, 88, 86, 85, 83,
+ 81, 79, 77, 75, 74, 72, 71, 69, 68, 67, 66, 65, 64, 63, 62, 62, 61, 61,
+ 61, 61, 92, 91, 89, 88, 86, 87, 87, 88, 88, 86, 85, 83, 82, 80, 78, 76,
+ 74, 73, 71, 70, 68, 67, 66, 65, 64, 64, 63, 62, 61, 61, 61, 61, 91, 90,
+ 89, 88, 87, 86, 86, 85, 85, 84, 82, 81, 80, 78, 76, 75, 73, 72, 70, 69,
+ 68, 67, 66, 65, 64, 63, 62, 62, 61, 61, 61, 61, 90, 89, 89, 88, 87, 86,
+ 85, 83, 82, 81, 80, 78, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64,
+ 63, 63, 62, 61, 61, 61, 61, 61, 89, 89, 88, 88, 88, 85, 83, 81, 79, 78,
+ 77, 76, 75, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 64, 63, 62, 62, 61,
+ 61, 61, 61, 61, 89, 88, 88, 88, 88, 85, 82, 79, 76, 75, 74, 73, 72, 71,
+ 70, 69, 69, 68, 67, 66, 65, 64, 64, 63, 62, 62, 61, 61, 60, 60, 60, 60,
+ 86, 86, 86, 86, 86, 84, 81, 78, 75, 74, 73, 72, 71, 70, 69, 68, 67, 67,
+ 66, 65, 64, 64, 63, 63, 62, 61, 61, 60, 60, 60, 60, 60, 84, 84, 84, 85,
+ 85, 82, 80, 77, 74, 73, 72, 71, 70, 69, 68, 67, 66, 66, 65, 64, 64, 63,
+ 63, 62, 61, 61, 61, 60, 60, 60, 60, 60, 81, 82, 82, 83, 83, 81, 78, 76,
+ 73, 72, 71, 69, 68, 67, 67, 66, 65, 65, 64, 63, 63, 62, 62, 61, 61, 60,
+ 60, 60, 59, 59, 59, 59, 79, 79, 80, 81, 82, 80, 77, 75, 72, 71, 70, 68,
+ 67, 66, 65, 65, 64, 64, 63, 63, 62, 62, 61, 61, 60, 60, 60, 59, 59, 59,
+ 59, 59, 77, 78, 78, 79, 80, 78, 76, 73, 71, 70, 69, 67, 66, 66, 65, 64,
+ 63, 63, 63, 62, 62, 61, 61, 60, 60, 60, 59, 59, 59, 59, 59, 59, 75, 76,
+ 76, 77, 78, 76, 74, 72, 70, 69, 68, 67, 65, 65, 64, 63, 63, 62, 62, 61,
+ 61, 61, 60, 60, 60, 59, 59, 59, 58, 58, 58, 58, 73, 74, 75, 75, 76, 75,
+ 73, 71, 69, 68, 67, 66, 65, 64, 63, 63, 62, 62, 61, 61, 60, 60, 60, 60,
+ 59, 59, 59, 58, 58, 58, 58, 58, 71, 72, 73, 74, 74, 73, 72, 70, 69, 67,
+ 66, 65, 64, 63, 63, 62, 62, 61, 61, 60, 60, 60, 59, 59, 59, 59, 58, 58,
+ 58, 58, 58, 58, 69, 70, 71, 72, 73, 72, 70, 69, 68, 67, 66, 65, 64, 63,
+ 62, 62, 61, 61, 60, 60, 60, 59, 59, 59, 58, 58, 58, 58, 58, 58, 58, 58,
+ 68, 69, 70, 71, 71, 70, 69, 68, 67, 66, 65, 64, 63, 63, 62, 61, 61, 60,
+ 60, 60, 59, 59, 59, 58, 58, 58, 58, 58, 57, 57, 57, 57, 67, 67, 68, 69,
+ 70, 69, 68, 67, 66, 65, 64, 63, 63, 62, 61, 61, 60, 60, 60, 59, 59, 59,
+ 58, 58, 58, 58, 58, 57, 57, 57, 57, 57, 65, 66, 67, 68, 68, 68, 67, 66,
+ 65, 64, 64, 63, 62, 62, 61, 60, 60, 60, 59, 59, 59, 58, 58, 58, 58, 57,
+ 57, 57, 57, 57, 57, 57, 64, 65, 66, 67, 67, 67, 66, 65, 64, 64, 63, 62,
+ 62, 61, 61, 60, 60, 59, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57,
+ 57, 57, 64, 64, 65, 66, 66, 66, 65, 64, 64, 63, 63, 62, 61, 61, 60, 60,
+ 59, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 57, 57, 57, 63, 63,
+ 64, 65, 65, 65, 64, 64, 63, 63, 62, 61, 61, 60, 60, 60, 59, 59, 58, 58,
+ 58, 58, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 62, 62, 63, 64, 64, 64,
+ 63, 63, 62, 62, 61, 61, 60, 60, 60, 59, 59, 58, 58, 58, 58, 57, 57, 57,
+ 57, 57, 57, 56, 56, 56, 56, 56, 61, 62, 62, 63, 64, 63, 63, 62, 62, 61,
+ 61, 60, 60, 60, 59, 59, 59, 58, 58, 58, 57, 57, 57, 57, 57, 57, 56, 56,
+ 56, 56, 56, 56, 61, 61, 62, 62, 63, 62, 62, 62, 61, 61, 61, 60, 60, 59,
+ 59, 59, 58, 58, 58, 58, 57, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56,
+ 60, 61, 61, 62, 62, 62, 61, 61, 61, 60, 60, 60, 59, 59, 59, 58, 58, 58,
+ 58, 57, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 56, 59, 60, 60, 61,
+ 61, 61, 61, 61, 60, 60, 60, 59, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57,
+ 57, 57, 56, 56, 56, 56, 56, 56, 56, 56, 59, 60, 60, 61, 61, 61, 61, 61,
+ 60, 60, 60, 59, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 56, 56,
+ 56, 56, 56, 56, 56, 56, 59, 60, 60, 61, 61, 61, 61, 61, 60, 60, 60, 59,
+ 59, 59, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56,
+ 56, 56, 59, 60, 60, 61, 61, 61, 61, 61, 60, 60, 60, 59, 59, 59, 58, 58,
+ 58, 58, 57, 57, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 56 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 53, 52, 48, 53, 50, 49, 48, 52, 49, 46, 45, 48, 48, 45, 43,
+ /* Size 8 */
+ 64, 71, 55, 54, 53, 51, 50, 48, 71, 60, 55, 57, 56, 54, 52, 51, 55, 55,
+ 52, 53, 53, 52, 51, 49, 54, 57, 53, 51, 50, 50, 49, 48, 53, 56, 53, 50,
+ 49, 48, 47, 47, 51, 54, 52, 50, 48, 47, 46, 46, 50, 52, 51, 49, 47, 46,
+ 46, 45, 48, 51, 49, 48, 47, 46, 45, 44,
+ /* Size 16 */
+ 64, 67, 71, 63, 55, 55, 54, 54, 53, 52, 51, 50, 50, 49, 48, 48, 67, 66,
+ 65, 60, 55, 55, 56, 55, 55, 54, 53, 52, 51, 50, 49, 49, 71, 65, 60, 58,
+ 55, 56, 57, 57, 56, 55, 54, 53, 52, 51, 51, 51, 63, 60, 58, 56, 54, 54,
+ 55, 55, 55, 54, 53, 52, 52, 51, 50, 50, 55, 55, 55, 54, 52, 53, 53, 53,
+ 53, 52, 52, 51, 51, 50, 49, 49, 55, 55, 56, 54, 53, 52, 52, 52, 52, 51,
+ 51, 50, 50, 49, 49, 49, 54, 56, 57, 55, 53, 52, 51, 51, 50, 50, 50, 49,
+ 49, 48, 48, 48, 54, 55, 57, 55, 53, 52, 51, 50, 50, 49, 49, 48, 48, 48,
+ 47, 47, 53, 55, 56, 55, 53, 52, 50, 50, 49, 48, 48, 48, 47, 47, 47, 47,
+ 52, 54, 55, 54, 52, 51, 50, 49, 48, 48, 47, 47, 47, 47, 46, 46, 51, 53,
+ 54, 53, 52, 51, 50, 49, 48, 47, 47, 47, 46, 46, 46, 46, 50, 52, 53, 52,
+ 51, 50, 49, 48, 48, 47, 47, 46, 46, 46, 45, 45, 50, 51, 52, 52, 51, 50,
+ 49, 48, 47, 47, 46, 46, 46, 45, 45, 45, 49, 50, 51, 51, 50, 49, 48, 48,
+ 47, 47, 46, 46, 45, 45, 45, 45, 48, 49, 51, 50, 49, 49, 48, 47, 47, 46,
+ 46, 45, 45, 45, 44, 44, 48, 49, 51, 50, 49, 49, 48, 47, 47, 46, 46, 45,
+ 45, 45, 44, 44,
+ /* Size 32 */
+ 64, 66, 67, 69, 71, 67, 63, 59, 55, 55, 55, 54, 54, 54, 54, 53, 53, 53,
+ 52, 52, 51, 51, 50, 50, 50, 49, 49, 48, 48, 48, 48, 48, 66, 66, 67, 67,
+ 68, 65, 62, 58, 55, 55, 55, 55, 55, 55, 54, 54, 54, 53, 53, 53, 52, 52,
+ 51, 51, 50, 50, 50, 49, 49, 49, 49, 49, 67, 67, 66, 66, 65, 63, 60, 58,
+ 55, 55, 55, 55, 56, 55, 55, 55, 55, 54, 54, 53, 53, 52, 52, 51, 51, 51,
+ 50, 50, 49, 49, 49, 49, 69, 67, 66, 64, 63, 61, 59, 57, 55, 55, 56, 56,
+ 56, 56, 56, 56, 55, 55, 55, 54, 54, 53, 53, 52, 52, 51, 51, 50, 50, 50,
+ 50, 50, 71, 68, 65, 63, 60, 59, 58, 56, 55, 55, 56, 56, 57, 57, 57, 56,
+ 56, 56, 55, 55, 54, 54, 53, 53, 52, 52, 51, 51, 51, 51, 51, 51, 67, 65,
+ 63, 61, 59, 58, 57, 55, 54, 55, 55, 55, 56, 56, 56, 56, 55, 55, 55, 54,
+ 54, 53, 53, 52, 52, 52, 51, 51, 50, 50, 50, 50, 63, 62, 60, 59, 58, 57,
+ 56, 55, 54, 54, 54, 54, 55, 55, 55, 55, 55, 54, 54, 54, 53, 53, 52, 52,
+ 52, 51, 51, 50, 50, 50, 50, 50, 59, 58, 58, 57, 56, 55, 55, 54, 53, 53,
+ 53, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 52, 52, 52, 51, 51, 50, 50,
+ 50, 50, 50, 50, 55, 55, 55, 55, 55, 54, 54, 53, 52, 52, 53, 53, 53, 53,
+ 53, 53, 53, 53, 52, 52, 52, 52, 51, 51, 51, 50, 50, 50, 49, 49, 49, 49,
+ 55, 55, 55, 55, 55, 55, 54, 53, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 51, 51, 51, 51, 50, 50, 50, 49, 49, 49, 49, 49, 55, 55, 55, 56,
+ 56, 55, 54, 53, 53, 52, 52, 52, 52, 52, 52, 52, 52, 51, 51, 51, 51, 51,
+ 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 54, 55, 55, 56, 56, 55, 54, 54,
+ 53, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 49, 49,
+ 49, 49, 48, 48, 48, 48, 54, 55, 56, 56, 57, 56, 55, 54, 53, 52, 52, 51,
+ 51, 51, 51, 50, 50, 50, 50, 50, 50, 49, 49, 49, 49, 49, 48, 48, 48, 48,
+ 48, 48, 54, 55, 55, 56, 57, 56, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50,
+ 50, 50, 50, 49, 49, 49, 49, 49, 49, 48, 48, 48, 48, 48, 48, 48, 54, 54,
+ 55, 56, 57, 56, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 50, 49, 49, 49,
+ 49, 49, 48, 48, 48, 48, 48, 48, 47, 47, 47, 47, 53, 54, 55, 56, 56, 56,
+ 55, 54, 53, 52, 52, 51, 50, 50, 50, 49, 49, 49, 49, 49, 48, 48, 48, 48,
+ 48, 48, 47, 47, 47, 47, 47, 47, 53, 54, 55, 55, 56, 55, 55, 54, 53, 52,
+ 52, 51, 50, 50, 50, 49, 49, 49, 48, 48, 48, 48, 48, 48, 47, 47, 47, 47,
+ 47, 47, 47, 47, 53, 53, 54, 55, 56, 55, 54, 53, 53, 52, 51, 51, 50, 50,
+ 49, 49, 49, 48, 48, 48, 48, 48, 47, 47, 47, 47, 47, 47, 46, 46, 46, 46,
+ 52, 53, 54, 55, 55, 55, 54, 53, 52, 52, 51, 51, 50, 50, 49, 49, 48, 48,
+ 48, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 52, 53, 53, 54,
+ 55, 54, 54, 53, 52, 52, 51, 50, 50, 49, 49, 49, 48, 48, 48, 47, 47, 47,
+ 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 51, 52, 53, 54, 54, 54, 53, 53,
+ 52, 51, 51, 50, 50, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 51, 52, 52, 53, 54, 53, 53, 52, 52, 51, 51, 50,
+ 49, 49, 49, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 50, 51, 52, 53, 53, 53, 52, 52, 51, 51, 50, 50, 49, 49, 48, 48,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 45, 50, 51,
+ 51, 52, 53, 52, 52, 52, 51, 51, 50, 50, 49, 49, 48, 48, 48, 47, 47, 47,
+ 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 50, 50, 51, 52, 52, 52,
+ 52, 51, 51, 50, 50, 49, 49, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46,
+ 46, 45, 45, 45, 45, 45, 45, 45, 49, 50, 51, 51, 52, 52, 51, 51, 50, 50,
+ 50, 49, 49, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 45, 45, 45, 45,
+ 45, 45, 45, 45, 49, 50, 50, 51, 51, 51, 51, 50, 50, 50, 49, 49, 48, 48,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 48, 49, 50, 50, 51, 51, 50, 50, 50, 49, 49, 49, 48, 48, 48, 47, 47, 47,
+ 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 48, 49, 49, 50,
+ 51, 50, 50, 50, 49, 49, 49, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46,
+ 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 48, 49, 49, 50, 51, 50, 50, 50,
+ 49, 49, 49, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 45, 45, 45, 45,
+ 45, 45, 44, 44, 44, 44, 48, 49, 49, 50, 51, 50, 50, 50, 49, 49, 49, 48,
+ 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 44, 44,
+ 44, 44, 48, 49, 49, 50, 51, 50, 50, 50, 49, 49, 49, 48, 48, 48, 47, 47,
+ 47, 46, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 85, 70, 68, 63, 70, 66, 64, 62, 68, 64, 60, 58, 63, 62, 58, 56,
+ /* Size 8 */
+ 82, 91, 70, 69, 67, 65, 62, 60, 91, 76, 70, 72, 71, 69, 66, 64, 70, 70,
+ 66, 67, 67, 66, 64, 62, 69, 72, 67, 64, 63, 62, 61, 60, 67, 71, 67, 63,
+ 61, 60, 59, 59, 65, 69, 66, 62, 60, 59, 58, 57, 62, 66, 64, 61, 59, 58,
+ 57, 56, 60, 64, 62, 60, 59, 57, 56, 55,
+ /* Size 16 */
+ 82, 87, 91, 81, 71, 70, 69, 68, 68, 66, 65, 64, 63, 62, 61, 61, 87, 86,
+ 84, 77, 70, 71, 71, 70, 70, 69, 67, 66, 65, 64, 63, 63, 91, 84, 77, 74,
+ 70, 71, 73, 72, 72, 71, 70, 68, 67, 65, 64, 64, 81, 77, 74, 71, 68, 69,
+ 70, 70, 70, 69, 68, 67, 66, 65, 63, 63, 71, 70, 70, 68, 67, 67, 67, 67,
+ 67, 67, 66, 65, 65, 64, 63, 63, 70, 71, 71, 69, 67, 66, 66, 66, 66, 65,
+ 65, 64, 63, 62, 62, 62, 69, 71, 73, 70, 67, 66, 65, 64, 64, 63, 63, 62,
+ 62, 61, 61, 61, 68, 70, 72, 70, 67, 66, 64, 64, 63, 62, 62, 61, 61, 60,
+ 60, 60, 68, 70, 72, 70, 67, 66, 64, 63, 62, 61, 61, 60, 60, 59, 59, 59,
+ 66, 69, 71, 69, 67, 65, 63, 62, 61, 61, 60, 60, 59, 59, 58, 58, 65, 67,
+ 70, 68, 66, 65, 63, 62, 61, 60, 59, 59, 58, 58, 58, 58, 64, 66, 68, 67,
+ 65, 64, 62, 61, 60, 60, 59, 58, 58, 58, 57, 57, 63, 65, 67, 66, 65, 63,
+ 62, 61, 60, 59, 58, 58, 57, 57, 57, 57, 62, 64, 65, 65, 64, 62, 61, 60,
+ 59, 59, 58, 58, 57, 57, 56, 56, 61, 63, 64, 63, 63, 62, 61, 60, 59, 58,
+ 58, 57, 57, 56, 56, 56, 61, 63, 64, 63, 63, 62, 61, 60, 59, 58, 58, 57,
+ 57, 56, 56, 56,
+ /* Size 32 */
+ 83, 85, 87, 90, 92, 87, 81, 76, 71, 71, 70, 70, 69, 69, 69, 68, 68, 67,
+ 67, 66, 66, 65, 64, 64, 63, 63, 62, 62, 61, 61, 61, 61, 85, 86, 87, 87,
+ 88, 84, 79, 75, 71, 71, 71, 70, 70, 70, 70, 69, 69, 68, 68, 67, 67, 66,
+ 65, 65, 64, 64, 63, 63, 62, 62, 62, 62, 87, 87, 86, 85, 85, 81, 78, 74,
+ 71, 71, 71, 71, 71, 71, 71, 70, 70, 69, 69, 68, 68, 67, 66, 66, 65, 65,
+ 64, 63, 63, 63, 63, 63, 90, 87, 85, 83, 81, 78, 76, 73, 71, 71, 71, 72,
+ 72, 72, 72, 71, 71, 71, 70, 69, 69, 68, 67, 67, 66, 66, 65, 64, 64, 64,
+ 64, 64, 92, 88, 85, 81, 77, 76, 74, 72, 70, 71, 72, 72, 73, 73, 73, 72,
+ 72, 72, 71, 70, 70, 69, 68, 68, 67, 66, 66, 65, 65, 65, 65, 65, 87, 84,
+ 81, 78, 76, 74, 73, 71, 70, 70, 71, 71, 72, 72, 71, 71, 71, 71, 70, 70,
+ 69, 68, 68, 67, 67, 66, 65, 65, 64, 64, 64, 64, 81, 79, 78, 76, 74, 73,
+ 71, 70, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 69, 69, 68, 68, 67, 66,
+ 66, 65, 65, 64, 64, 64, 64, 64, 76, 75, 74, 73, 72, 71, 70, 69, 68, 68,
+ 68, 69, 69, 69, 69, 69, 69, 68, 68, 68, 67, 67, 66, 66, 65, 65, 64, 64,
+ 63, 63, 63, 63, 71, 71, 71, 71, 70, 70, 69, 68, 67, 67, 67, 67, 68, 68,
+ 68, 68, 68, 67, 67, 67, 67, 66, 66, 65, 65, 64, 64, 63, 63, 63, 63, 63,
+ 71, 71, 71, 71, 71, 70, 69, 68, 67, 67, 67, 67, 67, 67, 67, 67, 67, 66,
+ 66, 66, 66, 65, 65, 65, 64, 64, 63, 63, 62, 62, 62, 62, 70, 71, 71, 71,
+ 72, 71, 69, 68, 67, 67, 67, 66, 66, 66, 66, 66, 66, 66, 65, 65, 65, 65,
+ 64, 64, 64, 63, 63, 62, 62, 62, 62, 62, 70, 70, 71, 72, 72, 71, 70, 69,
+ 67, 67, 66, 66, 66, 65, 65, 65, 65, 65, 65, 64, 64, 64, 63, 63, 63, 63,
+ 62, 62, 62, 62, 62, 62, 69, 70, 71, 72, 73, 72, 70, 69, 68, 67, 66, 66,
+ 65, 65, 65, 64, 64, 64, 64, 63, 63, 63, 63, 63, 62, 62, 62, 61, 61, 61,
+ 61, 61, 69, 70, 71, 72, 73, 72, 70, 69, 68, 67, 66, 65, 65, 64, 64, 64,
+ 64, 63, 63, 63, 63, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 69, 70,
+ 71, 72, 73, 71, 70, 69, 68, 67, 66, 65, 65, 64, 64, 63, 63, 63, 63, 62,
+ 62, 62, 62, 61, 61, 61, 61, 60, 60, 60, 60, 60, 68, 69, 70, 71, 72, 71,
+ 70, 69, 68, 67, 66, 65, 64, 64, 63, 63, 63, 62, 62, 62, 62, 61, 61, 61,
+ 61, 60, 60, 60, 60, 60, 60, 60, 68, 69, 70, 71, 72, 71, 70, 69, 68, 67,
+ 66, 65, 64, 64, 63, 63, 62, 62, 62, 61, 61, 61, 61, 60, 60, 60, 60, 60,
+ 59, 59, 59, 59, 67, 68, 69, 71, 72, 71, 70, 68, 67, 66, 66, 65, 64, 63,
+ 63, 62, 62, 62, 61, 61, 61, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59,
+ 67, 68, 69, 70, 71, 70, 69, 68, 67, 66, 65, 65, 64, 63, 63, 62, 62, 61,
+ 61, 61, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 66, 67, 68, 69,
+ 70, 70, 69, 68, 67, 66, 65, 64, 63, 63, 62, 62, 61, 61, 61, 60, 60, 60,
+ 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 66, 67, 68, 69, 70, 69, 68, 67,
+ 67, 66, 65, 64, 63, 63, 62, 62, 61, 61, 60, 60, 60, 59, 59, 59, 59, 58,
+ 58, 58, 58, 58, 58, 58, 65, 66, 67, 68, 69, 68, 68, 67, 66, 65, 65, 64,
+ 63, 62, 62, 61, 61, 60, 60, 60, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58,
+ 58, 58, 64, 65, 66, 67, 68, 68, 67, 66, 66, 65, 64, 63, 63, 62, 62, 61,
+ 61, 60, 60, 59, 59, 59, 59, 58, 58, 58, 58, 58, 57, 57, 57, 57, 64, 65,
+ 66, 67, 68, 67, 66, 66, 65, 65, 64, 63, 63, 62, 61, 61, 60, 60, 60, 59,
+ 59, 59, 58, 58, 58, 58, 58, 57, 57, 57, 57, 57, 63, 64, 65, 66, 67, 67,
+ 66, 65, 65, 64, 64, 63, 62, 62, 61, 61, 60, 60, 59, 59, 59, 58, 58, 58,
+ 58, 57, 57, 57, 57, 57, 57, 57, 63, 64, 65, 66, 66, 66, 65, 65, 64, 64,
+ 63, 63, 62, 61, 61, 60, 60, 60, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57,
+ 57, 57, 57, 57, 62, 63, 64, 65, 66, 65, 65, 64, 64, 63, 63, 62, 62, 61,
+ 61, 60, 60, 59, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 57, 57,
+ 62, 63, 63, 64, 65, 65, 64, 64, 63, 63, 62, 62, 61, 61, 60, 60, 60, 59,
+ 59, 58, 58, 58, 58, 57, 57, 57, 57, 57, 56, 56, 56, 56, 61, 62, 63, 64,
+ 65, 64, 64, 63, 63, 62, 62, 62, 61, 61, 60, 60, 59, 59, 59, 58, 58, 58,
+ 57, 57, 57, 57, 57, 56, 56, 56, 56, 56, 61, 62, 63, 64, 65, 64, 64, 63,
+ 63, 62, 62, 62, 61, 61, 60, 60, 59, 59, 59, 58, 58, 58, 57, 57, 57, 57,
+ 57, 56, 56, 56, 56, 56, 61, 62, 63, 64, 65, 64, 64, 63, 63, 62, 62, 62,
+ 61, 61, 60, 60, 59, 59, 59, 58, 58, 58, 57, 57, 57, 57, 57, 56, 56, 56,
+ 56, 56, 61, 62, 63, 64, 65, 64, 64, 63, 63, 62, 62, 62, 61, 61, 60, 60,
+ 59, 59, 59, 58, 58, 58, 57, 57, 57, 57, 57, 56, 56, 56, 56, 56 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 62, 54, 50, 62, 55, 52, 49, 54, 52, 49, 48, 50, 49, 48, 47,
+ /* Size 8 */
+ 64, 70, 68, 63, 58, 55, 53, 52, 70, 67, 68, 65, 60, 57, 54, 53, 68, 68,
+ 61, 59, 57, 55, 53, 52, 63, 65, 59, 56, 54, 53, 52, 51, 58, 60, 57, 54,
+ 53, 52, 51, 51, 55, 57, 55, 53, 52, 51, 51, 50, 53, 54, 53, 52, 51, 51,
+ 50, 50, 52, 53, 52, 51, 51, 50, 50, 50,
+ /* Size 16 */
+ 64, 67, 70, 69, 68, 66, 63, 60, 58, 57, 55, 54, 53, 52, 52, 52, 67, 68,
+ 69, 69, 68, 66, 64, 61, 59, 58, 56, 55, 54, 53, 52, 52, 70, 69, 67, 68,
+ 68, 66, 65, 62, 60, 59, 57, 56, 54, 53, 53, 53, 69, 69, 68, 66, 65, 63,
+ 62, 60, 59, 57, 56, 55, 54, 53, 52, 52, 68, 68, 68, 65, 61, 60, 59, 58,
+ 57, 56, 55, 54, 53, 53, 52, 52, 66, 66, 66, 63, 60, 59, 57, 56, 56, 55,
+ 54, 53, 53, 52, 52, 52, 63, 64, 65, 62, 59, 57, 56, 55, 54, 54, 53, 53,
+ 52, 52, 51, 51, 60, 61, 62, 60, 58, 56, 55, 54, 54, 53, 52, 52, 52, 51,
+ 51, 51, 58, 59, 60, 59, 57, 56, 54, 54, 53, 52, 52, 52, 51, 51, 51, 51,
+ 57, 58, 59, 57, 56, 55, 54, 53, 52, 52, 51, 51, 51, 51, 50, 50, 55, 56,
+ 57, 56, 55, 54, 53, 52, 52, 51, 51, 51, 51, 50, 50, 50, 54, 55, 56, 55,
+ 54, 53, 53, 52, 52, 51, 51, 51, 50, 50, 50, 50, 53, 54, 54, 54, 53, 53,
+ 52, 52, 51, 51, 51, 50, 50, 50, 50, 50, 52, 53, 53, 53, 53, 52, 52, 51,
+ 51, 51, 50, 50, 50, 50, 50, 50, 52, 52, 53, 52, 52, 52, 51, 51, 51, 50,
+ 50, 50, 50, 50, 50, 50, 52, 52, 53, 52, 52, 52, 51, 51, 51, 50, 50, 50,
+ 50, 50, 50, 50,
+ /* Size 32 */
+ 64, 66, 67, 69, 70, 70, 69, 69, 68, 67, 66, 64, 63, 62, 60, 59, 58, 57,
+ 57, 56, 55, 54, 54, 53, 53, 53, 52, 52, 52, 52, 52, 52, 66, 67, 68, 69,
+ 70, 69, 69, 69, 68, 67, 66, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 55,
+ 54, 54, 53, 53, 53, 52, 52, 52, 52, 52, 67, 68, 68, 68, 69, 69, 69, 68,
+ 68, 67, 66, 65, 64, 63, 61, 60, 59, 58, 58, 57, 56, 55, 55, 54, 54, 53,
+ 53, 52, 52, 52, 52, 52, 69, 69, 68, 68, 68, 68, 68, 68, 68, 67, 66, 65,
+ 64, 63, 62, 61, 60, 59, 58, 57, 56, 56, 55, 55, 54, 54, 53, 53, 52, 52,
+ 52, 52, 70, 70, 69, 68, 67, 67, 68, 68, 68, 67, 66, 66, 65, 64, 62, 61,
+ 60, 59, 59, 58, 57, 56, 56, 55, 54, 54, 53, 53, 53, 53, 53, 53, 70, 69,
+ 69, 68, 67, 67, 67, 67, 66, 66, 65, 64, 63, 62, 61, 60, 59, 59, 58, 57,
+ 56, 56, 55, 55, 54, 54, 53, 53, 52, 52, 52, 52, 69, 69, 69, 68, 68, 67,
+ 66, 65, 65, 64, 63, 63, 62, 61, 60, 59, 59, 58, 57, 57, 56, 55, 55, 54,
+ 54, 53, 53, 53, 52, 52, 52, 52, 69, 69, 68, 68, 68, 67, 65, 64, 63, 62,
+ 62, 61, 60, 60, 59, 58, 58, 57, 57, 56, 55, 55, 54, 54, 54, 53, 53, 53,
+ 52, 52, 52, 52, 68, 68, 68, 68, 68, 66, 65, 63, 61, 61, 60, 60, 59, 58,
+ 58, 57, 57, 56, 56, 55, 55, 54, 54, 54, 53, 53, 53, 52, 52, 52, 52, 52,
+ 67, 67, 67, 67, 67, 66, 64, 62, 61, 60, 59, 59, 58, 58, 57, 57, 56, 56,
+ 55, 55, 54, 54, 54, 53, 53, 53, 52, 52, 52, 52, 52, 52, 66, 66, 66, 66,
+ 66, 65, 63, 62, 60, 59, 59, 58, 57, 57, 56, 56, 56, 55, 55, 54, 54, 54,
+ 53, 53, 53, 52, 52, 52, 52, 52, 52, 52, 64, 64, 65, 65, 66, 64, 63, 61,
+ 60, 59, 58, 57, 57, 56, 56, 55, 55, 55, 54, 54, 54, 53, 53, 53, 52, 52,
+ 52, 52, 51, 51, 51, 51, 63, 63, 64, 64, 65, 63, 62, 60, 59, 58, 57, 57,
+ 56, 55, 55, 55, 54, 54, 54, 53, 53, 53, 53, 52, 52, 52, 52, 51, 51, 51,
+ 51, 51, 62, 62, 63, 63, 64, 62, 61, 60, 58, 58, 57, 56, 55, 55, 55, 54,
+ 54, 54, 53, 53, 53, 53, 52, 52, 52, 52, 52, 51, 51, 51, 51, 51, 60, 61,
+ 61, 62, 62, 61, 60, 59, 58, 57, 56, 56, 55, 55, 54, 54, 54, 53, 53, 53,
+ 52, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 59, 60, 60, 61, 61, 60,
+ 59, 58, 57, 57, 56, 55, 55, 54, 54, 54, 53, 53, 53, 52, 52, 52, 52, 52,
+ 51, 51, 51, 51, 51, 51, 51, 51, 58, 59, 59, 60, 60, 59, 59, 58, 57, 56,
+ 56, 55, 54, 54, 54, 53, 53, 53, 52, 52, 52, 52, 52, 51, 51, 51, 51, 51,
+ 51, 51, 51, 51, 57, 58, 58, 59, 59, 59, 58, 57, 56, 56, 55, 55, 54, 54,
+ 53, 53, 53, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
+ 57, 57, 58, 58, 59, 58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 53, 52, 52,
+ 52, 52, 51, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 56, 56, 57, 57,
+ 58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 53, 52, 52, 52, 52, 51, 51, 51,
+ 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 55, 55, 56, 56, 57, 56, 56, 55,
+ 55, 54, 54, 54, 53, 53, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 50,
+ 50, 50, 50, 50, 50, 50, 54, 55, 55, 56, 56, 56, 55, 55, 54, 54, 54, 53,
+ 53, 53, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50,
+ 50, 50, 54, 54, 55, 55, 56, 55, 55, 54, 54, 54, 53, 53, 53, 52, 52, 52,
+ 52, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 53, 54,
+ 54, 55, 55, 55, 54, 54, 54, 53, 53, 53, 52, 52, 52, 52, 51, 51, 51, 51,
+ 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 53, 53, 54, 54, 54, 54,
+ 54, 54, 53, 53, 53, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 53, 53, 53, 54, 54, 54, 53, 53, 53, 53,
+ 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 52, 52, 52, 52, 52,
+ 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 52, 52, 52, 53, 53, 53, 53, 53, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52, 52,
+ 53, 52, 52, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52, 52, 53, 52, 52, 52,
+ 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 52, 52, 52, 52, 53, 52, 52, 52, 52, 52, 52, 51,
+ 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 52, 52, 52, 52, 53, 52, 52, 52, 52, 52, 52, 51, 51, 51, 51, 51,
+ 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 80, 77, 67, 61, 77, 68, 63, 60, 67, 63, 60, 58, 61, 60, 58, 57,
+ /* Size 8 */
+ 75, 83, 80, 73, 68, 64, 61, 60, 83, 79, 80, 76, 70, 66, 63, 61, 80, 80,
+ 72, 69, 66, 64, 62, 60, 73, 76, 69, 65, 63, 61, 60, 59, 68, 70, 66, 63,
+ 61, 60, 59, 58, 64, 66, 64, 61, 60, 59, 58, 58, 61, 63, 62, 60, 59, 58,
+ 58, 57, 60, 61, 60, 59, 58, 58, 57, 57,
+ /* Size 16 */
+ 76, 80, 84, 82, 81, 78, 74, 71, 68, 66, 64, 63, 62, 61, 60, 60, 80, 81,
+ 82, 81, 81, 78, 75, 72, 70, 68, 66, 64, 63, 62, 61, 61, 84, 82, 80, 80,
+ 81, 79, 76, 74, 71, 69, 67, 65, 64, 63, 61, 61, 82, 81, 80, 78, 77, 75,
+ 73, 71, 69, 67, 65, 64, 63, 62, 61, 61, 81, 81, 81, 77, 72, 71, 69, 68,
+ 67, 66, 64, 63, 62, 61, 61, 61, 78, 78, 79, 75, 71, 69, 67, 66, 65, 64,
+ 63, 62, 62, 61, 60, 60, 74, 75, 76, 73, 69, 67, 66, 65, 64, 63, 62, 61,
+ 61, 60, 60, 60, 71, 72, 74, 71, 68, 66, 65, 64, 63, 62, 61, 61, 60, 60,
+ 59, 59, 68, 70, 71, 69, 67, 65, 64, 63, 62, 61, 60, 60, 60, 59, 59, 59,
+ 66, 68, 69, 67, 66, 64, 63, 62, 61, 61, 60, 60, 59, 59, 59, 59, 64, 66,
+ 67, 65, 64, 63, 62, 61, 60, 60, 59, 59, 59, 59, 58, 58, 63, 64, 65, 64,
+ 63, 62, 61, 61, 60, 60, 59, 59, 59, 58, 58, 58, 62, 63, 64, 63, 62, 62,
+ 61, 60, 60, 59, 59, 59, 58, 58, 58, 58, 61, 62, 63, 62, 61, 61, 60, 60,
+ 59, 59, 59, 58, 58, 58, 58, 58, 60, 61, 61, 61, 61, 60, 60, 59, 59, 59,
+ 58, 58, 58, 58, 58, 58, 60, 61, 61, 61, 61, 60, 60, 59, 59, 59, 58, 58,
+ 58, 58, 58, 58,
+ /* Size 32 */
+ 76, 78, 80, 82, 84, 83, 83, 82, 82, 80, 78, 76, 74, 73, 71, 70, 69, 68,
+ 67, 66, 65, 64, 63, 63, 62, 62, 61, 61, 60, 60, 60, 60, 78, 79, 81, 82,
+ 83, 83, 82, 82, 82, 80, 78, 77, 75, 74, 72, 71, 69, 68, 67, 66, 65, 65,
+ 64, 63, 63, 62, 62, 61, 61, 61, 61, 61, 80, 81, 81, 82, 82, 82, 82, 82,
+ 81, 80, 79, 77, 76, 74, 73, 71, 70, 69, 68, 67, 66, 65, 64, 64, 63, 63,
+ 62, 62, 61, 61, 61, 61, 82, 82, 82, 81, 81, 81, 81, 81, 81, 80, 79, 77,
+ 76, 75, 73, 72, 71, 70, 69, 67, 66, 66, 65, 64, 63, 63, 62, 62, 61, 61,
+ 61, 61, 84, 83, 82, 81, 80, 80, 81, 81, 81, 80, 79, 78, 77, 75, 74, 73,
+ 71, 70, 69, 68, 67, 66, 65, 65, 64, 63, 63, 62, 62, 62, 62, 62, 83, 83,
+ 82, 81, 80, 80, 80, 79, 79, 78, 77, 76, 75, 74, 73, 71, 70, 69, 68, 67,
+ 66, 66, 65, 64, 64, 63, 63, 62, 62, 62, 62, 62, 83, 82, 82, 81, 81, 80,
+ 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 67, 66, 65, 65, 64,
+ 63, 63, 62, 62, 61, 61, 61, 61, 82, 82, 82, 81, 81, 79, 78, 76, 75, 74,
+ 73, 72, 71, 71, 70, 69, 68, 67, 67, 66, 65, 65, 64, 63, 63, 62, 62, 62,
+ 61, 61, 61, 61, 82, 82, 81, 81, 81, 79, 77, 75, 73, 72, 71, 70, 70, 69,
+ 68, 68, 67, 66, 66, 65, 65, 64, 64, 63, 63, 62, 62, 61, 61, 61, 61, 61,
+ 80, 80, 80, 80, 80, 78, 76, 74, 72, 71, 70, 70, 69, 68, 67, 67, 66, 66,
+ 65, 65, 64, 64, 63, 63, 62, 62, 61, 61, 61, 61, 61, 61, 78, 78, 79, 79,
+ 79, 77, 75, 73, 71, 70, 69, 69, 68, 67, 67, 66, 65, 65, 64, 64, 63, 63,
+ 63, 62, 62, 62, 61, 61, 61, 61, 61, 61, 76, 77, 77, 77, 78, 76, 74, 72,
+ 70, 70, 69, 68, 67, 66, 66, 65, 65, 64, 64, 63, 63, 63, 62, 62, 61, 61,
+ 61, 61, 60, 60, 60, 60, 74, 75, 76, 76, 77, 75, 73, 71, 70, 69, 68, 67,
+ 66, 65, 65, 64, 64, 63, 63, 63, 62, 62, 62, 61, 61, 61, 61, 60, 60, 60,
+ 60, 60, 73, 74, 74, 75, 75, 74, 72, 71, 69, 68, 67, 66, 65, 65, 64, 64,
+ 63, 63, 63, 62, 62, 62, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 71, 72,
+ 73, 73, 74, 73, 71, 70, 68, 67, 67, 66, 65, 64, 64, 63, 63, 63, 62, 62,
+ 62, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 70, 71, 71, 72, 73, 71,
+ 70, 69, 68, 67, 66, 65, 64, 64, 63, 63, 62, 62, 62, 61, 61, 61, 61, 60,
+ 60, 60, 60, 60, 59, 59, 59, 59, 69, 69, 70, 71, 71, 70, 69, 68, 67, 66,
+ 65, 65, 64, 63, 63, 62, 62, 62, 61, 61, 61, 61, 60, 60, 60, 60, 60, 59,
+ 59, 59, 59, 59, 68, 68, 69, 70, 70, 69, 68, 67, 66, 66, 65, 64, 63, 63,
+ 63, 62, 62, 61, 61, 61, 61, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59,
+ 67, 67, 68, 69, 69, 68, 67, 67, 66, 65, 64, 64, 63, 63, 62, 62, 61, 61,
+ 61, 61, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 66, 66, 67, 67,
+ 68, 67, 67, 66, 65, 65, 64, 63, 63, 62, 62, 61, 61, 61, 61, 60, 60, 60,
+ 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 65, 65, 66, 66, 67, 66, 66, 65,
+ 65, 64, 63, 63, 62, 62, 62, 61, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 64, 65, 65, 66, 66, 66, 65, 65, 64, 64, 63, 63,
+ 62, 62, 61, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 58, 58,
+ 58, 58, 63, 64, 64, 65, 65, 65, 65, 64, 64, 63, 63, 62, 62, 61, 61, 61,
+ 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 63, 63,
+ 64, 64, 65, 64, 64, 63, 63, 63, 62, 62, 61, 61, 61, 60, 60, 60, 60, 59,
+ 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58, 62, 63, 63, 63, 64, 64,
+ 63, 63, 63, 62, 62, 61, 61, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59, 59,
+ 59, 58, 58, 58, 58, 58, 58, 58, 62, 62, 63, 63, 63, 63, 63, 62, 62, 62,
+ 62, 61, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58,
+ 58, 58, 58, 58, 61, 62, 62, 62, 63, 63, 62, 62, 62, 61, 61, 61, 61, 60,
+ 60, 60, 60, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58, 58, 58, 58,
+ 61, 61, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 60, 60, 60, 60, 59, 59,
+ 59, 59, 59, 59, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 60, 61, 61, 61,
+ 62, 62, 61, 61, 61, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 58,
+ 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 60, 61, 61, 61, 62, 62, 61, 61,
+ 61, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58,
+ 58, 58, 58, 58, 58, 58, 60, 61, 61, 61, 62, 62, 61, 61, 61, 61, 61, 60,
+ 60, 60, 60, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58, 58, 58, 58,
+ 58, 58, 60, 61, 61, 61, 62, 62, 61, 61, 61, 61, 61, 60, 60, 60, 60, 59,
+ 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 56, 54, 52, 56, 53, 52, 51, 54, 52, 50, 49, 52, 51, 49, 48,
+ /* Size 8 */
+ 64, 69, 57, 56, 55, 54, 53, 52, 69, 61, 57, 58, 58, 57, 55, 53, 57, 57,
+ 55, 55, 55, 55, 54, 53, 56, 58, 55, 54, 53, 53, 52, 51, 55, 58, 55, 53,
+ 52, 51, 51, 50, 54, 57, 55, 53, 51, 51, 50, 50, 53, 55, 54, 52, 51, 50,
+ 49, 49, 52, 53, 53, 51, 50, 50, 49, 49,
+ /* Size 16 */
+ 64, 67, 69, 63, 57, 57, 56, 56, 55, 55, 54, 53, 53, 52, 52, 52, 67, 66,
+ 65, 61, 57, 57, 57, 57, 57, 56, 55, 55, 54, 53, 52, 52, 69, 65, 61, 59,
+ 57, 58, 58, 58, 58, 57, 57, 56, 55, 54, 53, 53, 63, 61, 59, 57, 56, 56,
+ 57, 57, 57, 56, 56, 55, 54, 54, 53, 53, 57, 57, 57, 56, 55, 55, 55, 55,
+ 55, 55, 55, 54, 54, 53, 53, 53, 57, 57, 58, 56, 55, 55, 54, 54, 54, 54,
+ 54, 53, 53, 52, 52, 52, 56, 57, 58, 57, 55, 54, 54, 53, 53, 53, 53, 52,
+ 52, 52, 51, 51, 56, 57, 58, 57, 55, 54, 53, 53, 53, 52, 52, 52, 52, 51,
+ 51, 51, 55, 57, 58, 57, 55, 54, 53, 53, 52, 52, 51, 51, 51, 51, 50, 50,
+ 55, 56, 57, 56, 55, 54, 53, 52, 52, 51, 51, 51, 51, 50, 50, 50, 54, 55,
+ 57, 56, 55, 54, 53, 52, 51, 51, 51, 50, 50, 50, 50, 50, 53, 55, 56, 55,
+ 54, 53, 52, 52, 51, 51, 50, 50, 50, 50, 49, 49, 53, 54, 55, 54, 54, 53,
+ 52, 52, 51, 51, 50, 50, 49, 49, 49, 49, 52, 53, 54, 54, 53, 52, 52, 51,
+ 51, 50, 50, 50, 49, 49, 49, 49, 52, 52, 53, 53, 53, 52, 51, 51, 50, 50,
+ 50, 49, 49, 49, 49, 49, 52, 52, 53, 53, 53, 52, 51, 51, 50, 50, 50, 49,
+ 49, 49, 49, 49,
+ /* Size 32 */
+ 64, 65, 67, 68, 69, 66, 63, 60, 57, 57, 57, 57, 56, 56, 56, 56, 55, 55,
+ 55, 54, 54, 54, 53, 53, 53, 52, 52, 52, 52, 52, 52, 52, 65, 66, 66, 67,
+ 67, 65, 62, 60, 57, 57, 57, 57, 57, 57, 56, 56, 56, 56, 55, 55, 55, 54,
+ 54, 54, 53, 53, 53, 52, 52, 52, 52, 52, 67, 66, 66, 65, 65, 63, 61, 59,
+ 57, 57, 57, 57, 57, 57, 57, 57, 57, 56, 56, 56, 55, 55, 55, 54, 54, 53,
+ 53, 53, 52, 52, 52, 52, 68, 67, 65, 64, 63, 61, 60, 58, 57, 57, 57, 58,
+ 58, 58, 58, 57, 57, 57, 57, 56, 56, 56, 55, 55, 54, 54, 54, 53, 53, 53,
+ 53, 53, 69, 67, 65, 63, 61, 60, 59, 58, 57, 57, 58, 58, 58, 58, 58, 58,
+ 58, 58, 57, 57, 57, 56, 56, 55, 55, 55, 54, 54, 53, 53, 53, 53, 66, 65,
+ 63, 61, 60, 59, 58, 57, 56, 57, 57, 57, 58, 58, 57, 57, 57, 57, 57, 56,
+ 56, 56, 55, 55, 55, 54, 54, 54, 53, 53, 53, 53, 63, 62, 61, 60, 59, 58,
+ 57, 57, 56, 56, 56, 57, 57, 57, 57, 57, 57, 56, 56, 56, 56, 55, 55, 55,
+ 54, 54, 54, 53, 53, 53, 53, 53, 60, 60, 59, 58, 58, 57, 57, 56, 55, 55,
+ 56, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 54, 54, 54, 53, 53,
+ 53, 53, 53, 53, 57, 57, 57, 57, 57, 56, 56, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 53, 53, 53, 53, 53, 53, 53,
+ 57, 57, 57, 57, 57, 57, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55,
+ 54, 54, 54, 54, 54, 53, 53, 53, 53, 53, 52, 52, 52, 52, 57, 57, 57, 57,
+ 58, 57, 56, 56, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54, 53,
+ 53, 53, 53, 53, 52, 52, 52, 52, 52, 52, 57, 57, 57, 58, 58, 57, 57, 56,
+ 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53, 53, 53, 52,
+ 52, 52, 52, 52, 52, 52, 56, 57, 57, 58, 58, 58, 57, 56, 55, 55, 54, 54,
+ 54, 54, 53, 53, 53, 53, 53, 53, 53, 53, 52, 52, 52, 52, 52, 52, 51, 51,
+ 51, 51, 56, 57, 57, 58, 58, 58, 57, 56, 55, 55, 54, 54, 54, 53, 53, 53,
+ 53, 53, 53, 53, 52, 52, 52, 52, 52, 52, 52, 51, 51, 51, 51, 51, 56, 56,
+ 57, 58, 58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 53, 53, 53, 52, 52, 52,
+ 52, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 56, 56, 57, 57, 58, 57,
+ 57, 56, 55, 55, 54, 54, 53, 53, 53, 53, 52, 52, 52, 52, 52, 52, 51, 51,
+ 51, 51, 51, 51, 51, 51, 51, 51, 55, 56, 57, 57, 58, 57, 57, 56, 55, 55,
+ 54, 54, 53, 53, 53, 52, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 51,
+ 50, 50, 50, 50, 55, 56, 56, 57, 58, 57, 56, 56, 55, 55, 54, 54, 53, 53,
+ 52, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50,
+ 55, 55, 56, 57, 57, 57, 56, 55, 55, 54, 54, 53, 53, 53, 52, 52, 52, 52,
+ 51, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 54, 55, 56, 56,
+ 57, 56, 56, 55, 55, 54, 54, 53, 53, 53, 52, 52, 52, 51, 51, 51, 51, 51,
+ 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 54, 55, 55, 56, 57, 56, 56, 55,
+ 55, 54, 54, 53, 53, 52, 52, 52, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 54, 54, 55, 56, 56, 56, 55, 55, 54, 54, 53, 53,
+ 53, 52, 52, 52, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 53, 54, 55, 55, 56, 55, 55, 55, 54, 54, 53, 53, 52, 52, 52, 51,
+ 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 49, 49, 53, 54,
+ 54, 55, 55, 55, 55, 54, 54, 53, 53, 53, 52, 52, 52, 51, 51, 51, 51, 50,
+ 50, 50, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 53, 53, 54, 54, 55, 55,
+ 54, 54, 54, 53, 53, 53, 52, 52, 52, 51, 51, 51, 51, 50, 50, 50, 50, 50,
+ 49, 49, 49, 49, 49, 49, 49, 49, 52, 53, 53, 54, 55, 54, 54, 54, 53, 53,
+ 53, 52, 52, 52, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 49, 49, 49, 49,
+ 49, 49, 49, 49, 52, 53, 53, 54, 54, 54, 54, 53, 53, 53, 52, 52, 52, 52,
+ 51, 51, 51, 50, 50, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 49, 49, 49,
+ 52, 52, 53, 53, 54, 54, 53, 53, 53, 53, 52, 52, 52, 51, 51, 51, 51, 50,
+ 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 52, 52, 52, 53,
+ 53, 53, 53, 53, 53, 52, 52, 52, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50,
+ 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 52, 52, 52, 53, 53, 53, 53, 53,
+ 53, 52, 52, 52, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 49, 49, 49, 49,
+ 49, 49, 49, 49, 49, 49, 52, 52, 52, 53, 53, 53, 53, 53, 53, 52, 52, 52,
+ 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 49, 49,
+ 49, 49, 52, 52, 52, 53, 53, 53, 53, 53, 53, 52, 52, 52, 51, 51, 51, 51,
+ 50, 50, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 79, 68, 67, 63, 68, 65, 64, 62, 67, 64, 61, 60, 63, 62, 60, 58,
+ /* Size 8 */
+ 77, 84, 68, 67, 66, 64, 63, 61, 84, 73, 68, 70, 69, 67, 65, 64, 68, 68,
+ 65, 66, 66, 65, 64, 62, 67, 70, 66, 64, 63, 63, 62, 61, 66, 69, 66, 63,
+ 62, 61, 60, 60, 64, 67, 65, 63, 61, 60, 59, 59, 63, 65, 64, 62, 60, 59,
+ 59, 58, 61, 64, 62, 61, 60, 59, 58, 58,
+ /* Size 16 */
+ 77, 81, 84, 76, 69, 68, 68, 67, 67, 66, 65, 64, 63, 62, 62, 62, 81, 80,
+ 79, 74, 69, 69, 69, 69, 68, 67, 66, 65, 64, 64, 63, 63, 84, 79, 74, 71,
+ 68, 69, 70, 70, 70, 69, 68, 67, 66, 65, 64, 64, 76, 74, 71, 69, 67, 68,
+ 68, 68, 68, 67, 67, 66, 65, 64, 63, 63, 69, 69, 68, 67, 66, 66, 66, 66,
+ 66, 66, 66, 65, 64, 64, 63, 63, 68, 69, 69, 68, 66, 66, 65, 65, 65, 65,
+ 64, 64, 63, 63, 62, 62, 68, 69, 70, 68, 66, 65, 64, 64, 64, 63, 63, 63,
+ 62, 62, 61, 61, 67, 69, 70, 68, 66, 65, 64, 64, 63, 63, 62, 62, 62, 61,
+ 61, 61, 67, 68, 70, 68, 66, 65, 64, 63, 62, 62, 61, 61, 61, 61, 60, 60,
+ 66, 67, 69, 67, 66, 65, 63, 63, 62, 61, 61, 61, 60, 60, 60, 60, 65, 66,
+ 68, 67, 66, 64, 63, 62, 61, 61, 60, 60, 60, 59, 59, 59, 64, 65, 67, 66,
+ 65, 64, 63, 62, 61, 61, 60, 60, 59, 59, 59, 59, 63, 64, 66, 65, 64, 63,
+ 62, 62, 61, 60, 60, 59, 59, 59, 58, 58, 62, 64, 65, 64, 64, 63, 62, 61,
+ 61, 60, 59, 59, 59, 58, 58, 58, 62, 63, 64, 63, 63, 62, 61, 61, 60, 60,
+ 59, 59, 58, 58, 58, 58, 62, 63, 64, 63, 63, 62, 61, 61, 60, 60, 59, 59,
+ 58, 58, 58, 58,
+ /* Size 32 */
+ 78, 79, 81, 83, 84, 81, 77, 73, 69, 69, 68, 68, 68, 68, 67, 67, 67, 66,
+ 66, 65, 65, 65, 64, 64, 63, 63, 63, 62, 62, 62, 62, 62, 79, 80, 81, 81,
+ 82, 79, 75, 72, 69, 69, 69, 69, 69, 68, 68, 68, 68, 67, 67, 66, 66, 65,
+ 65, 64, 64, 64, 63, 63, 62, 62, 62, 62, 81, 81, 80, 80, 79, 77, 74, 71,
+ 69, 69, 69, 69, 69, 69, 69, 69, 68, 68, 67, 67, 67, 66, 66, 65, 65, 64,
+ 64, 63, 63, 63, 63, 63, 83, 81, 80, 78, 76, 75, 73, 71, 69, 69, 69, 70,
+ 70, 70, 70, 69, 69, 69, 68, 68, 67, 67, 66, 66, 65, 65, 65, 64, 64, 64,
+ 64, 64, 84, 82, 79, 76, 74, 73, 71, 70, 69, 69, 70, 70, 71, 70, 70, 70,
+ 70, 70, 69, 69, 68, 68, 67, 67, 66, 66, 65, 65, 64, 64, 64, 64, 81, 79,
+ 77, 75, 73, 71, 70, 69, 68, 68, 69, 69, 70, 69, 69, 69, 69, 69, 68, 68,
+ 68, 67, 67, 66, 66, 65, 65, 64, 64, 64, 64, 64, 77, 75, 74, 73, 71, 70,
+ 69, 68, 67, 68, 68, 68, 69, 68, 68, 68, 68, 68, 68, 67, 67, 67, 66, 66,
+ 65, 65, 64, 64, 64, 64, 64, 64, 73, 72, 71, 71, 70, 69, 68, 67, 67, 67,
+ 67, 67, 68, 67, 67, 67, 67, 67, 67, 67, 66, 66, 66, 65, 65, 65, 64, 64,
+ 63, 63, 63, 63, 69, 69, 69, 69, 69, 68, 67, 67, 66, 66, 66, 66, 66, 67,
+ 67, 67, 67, 66, 66, 66, 66, 65, 65, 65, 64, 64, 64, 63, 63, 63, 63, 63,
+ 69, 69, 69, 69, 69, 68, 68, 67, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+ 66, 65, 65, 65, 65, 64, 64, 64, 63, 63, 63, 63, 63, 63, 68, 69, 69, 69,
+ 70, 69, 68, 67, 66, 66, 66, 66, 66, 65, 65, 65, 65, 65, 65, 65, 65, 64,
+ 64, 64, 64, 63, 63, 63, 62, 62, 62, 62, 68, 69, 69, 70, 70, 69, 68, 67,
+ 66, 66, 66, 65, 65, 65, 65, 65, 65, 64, 64, 64, 64, 64, 63, 63, 63, 63,
+ 63, 62, 62, 62, 62, 62, 68, 69, 69, 70, 71, 70, 69, 68, 66, 66, 66, 65,
+ 65, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62,
+ 62, 62, 68, 68, 69, 70, 70, 69, 68, 67, 67, 66, 65, 65, 64, 64, 64, 64,
+ 64, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61, 67, 68,
+ 69, 70, 70, 69, 68, 67, 67, 66, 65, 65, 64, 64, 64, 63, 63, 63, 63, 63,
+ 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 67, 68, 69, 69, 70, 69,
+ 68, 67, 67, 66, 65, 65, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62,
+ 61, 61, 61, 61, 61, 61, 61, 61, 67, 68, 68, 69, 70, 69, 68, 67, 67, 66,
+ 65, 65, 64, 64, 63, 63, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61,
+ 60, 60, 60, 60, 66, 67, 68, 69, 70, 69, 68, 67, 66, 66, 65, 64, 64, 63,
+ 63, 63, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60,
+ 66, 67, 67, 68, 69, 68, 68, 67, 66, 66, 65, 64, 64, 63, 63, 62, 62, 62,
+ 62, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 65, 66, 67, 68,
+ 69, 68, 67, 67, 66, 65, 65, 64, 63, 63, 63, 62, 62, 62, 61, 61, 61, 61,
+ 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 65, 66, 67, 67, 68, 68, 67, 66,
+ 66, 65, 65, 64, 63, 63, 62, 62, 62, 61, 61, 61, 61, 60, 60, 60, 60, 60,
+ 60, 60, 59, 59, 59, 59, 65, 65, 66, 67, 68, 67, 67, 66, 65, 65, 64, 64,
+ 63, 63, 62, 62, 62, 61, 61, 61, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59,
+ 59, 59, 64, 65, 66, 66, 67, 67, 66, 66, 65, 65, 64, 63, 63, 63, 62, 62,
+ 61, 61, 61, 61, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 64, 64,
+ 65, 66, 67, 66, 66, 65, 65, 64, 64, 63, 63, 62, 62, 62, 61, 61, 61, 60,
+ 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 63, 64, 65, 65, 66, 66,
+ 65, 65, 64, 64, 64, 63, 63, 62, 62, 61, 61, 61, 60, 60, 60, 60, 60, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 63, 64, 64, 65, 66, 65, 65, 65, 64, 64,
+ 63, 63, 62, 62, 62, 61, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 63, 63, 64, 65, 65, 65, 64, 64, 64, 63, 63, 63, 62, 62,
+ 61, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58,
+ 62, 63, 63, 64, 65, 64, 64, 64, 63, 63, 63, 62, 62, 62, 61, 61, 61, 60,
+ 60, 60, 60, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 62, 62, 63, 64,
+ 64, 64, 64, 63, 63, 63, 62, 62, 62, 61, 61, 61, 60, 60, 60, 60, 59, 59,
+ 59, 59, 59, 59, 58, 58, 58, 58, 58, 58, 62, 62, 63, 64, 64, 64, 64, 63,
+ 63, 63, 62, 62, 62, 61, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59,
+ 58, 58, 58, 58, 58, 58, 62, 62, 63, 64, 64, 64, 64, 63, 63, 63, 62, 62,
+ 62, 61, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58,
+ 58, 58, 62, 62, 63, 64, 64, 64, 64, 63, 63, 63, 62, 62, 62, 61, 61, 61,
+ 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 63, 57, 54, 63, 58, 55, 54, 57, 55, 53, 53, 54, 54, 53, 52,
+ /* Size 8 */
+ 64, 68, 67, 63, 60, 58, 56, 55, 68, 66, 67, 64, 61, 59, 57, 56, 67, 67,
+ 62, 61, 59, 58, 57, 56, 63, 64, 61, 58, 57, 56, 56, 55, 60, 61, 59, 57,
+ 56, 56, 55, 55, 58, 59, 58, 56, 56, 55, 55, 54, 56, 57, 57, 56, 55, 55,
+ 54, 54, 55, 56, 56, 55, 55, 54, 54, 54,
+ /* Size 16 */
+ 64, 66, 68, 68, 67, 65, 63, 62, 60, 59, 58, 57, 56, 56, 55, 55, 66, 67,
+ 67, 67, 67, 65, 64, 62, 61, 60, 58, 58, 57, 56, 56, 56, 68, 67, 66, 66,
+ 67, 66, 64, 63, 61, 60, 59, 58, 57, 57, 56, 56, 68, 67, 66, 65, 65, 63,
+ 62, 61, 60, 59, 58, 58, 57, 56, 56, 56, 67, 67, 67, 65, 62, 61, 61, 60,
+ 59, 58, 58, 57, 57, 56, 56, 56, 65, 65, 66, 63, 61, 60, 59, 59, 58, 58,
+ 57, 57, 56, 56, 56, 56, 63, 64, 64, 62, 61, 59, 58, 58, 57, 57, 56, 56,
+ 56, 56, 55, 55, 62, 62, 63, 61, 60, 59, 58, 57, 57, 56, 56, 56, 55, 55,
+ 55, 55, 60, 61, 61, 60, 59, 58, 57, 57, 56, 56, 56, 55, 55, 55, 55, 55,
+ 59, 60, 60, 59, 58, 58, 57, 56, 56, 56, 55, 55, 55, 55, 55, 55, 58, 58,
+ 59, 58, 58, 57, 56, 56, 56, 55, 55, 55, 55, 55, 54, 54, 57, 58, 58, 58,
+ 57, 57, 56, 56, 55, 55, 55, 55, 55, 54, 54, 54, 56, 57, 57, 57, 57, 56,
+ 56, 55, 55, 55, 55, 55, 54, 54, 54, 54, 56, 56, 57, 56, 56, 56, 56, 55,
+ 55, 55, 55, 54, 54, 54, 54, 54, 55, 56, 56, 56, 56, 56, 55, 55, 55, 55,
+ 54, 54, 54, 54, 54, 54, 55, 56, 56, 56, 56, 56, 55, 55, 55, 55, 54, 54,
+ 54, 54, 54, 54,
+ /* Size 32 */
+ 64, 65, 66, 67, 68, 68, 68, 67, 67, 66, 65, 64, 63, 62, 62, 61, 60, 59,
+ 59, 58, 58, 57, 57, 57, 56, 56, 56, 56, 55, 55, 55, 55, 65, 66, 66, 67,
+ 68, 68, 67, 67, 67, 66, 65, 64, 63, 63, 62, 61, 60, 60, 59, 59, 58, 58,
+ 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 66, 66, 67, 67, 67, 67, 67, 67,
+ 67, 66, 65, 65, 64, 63, 62, 61, 61, 60, 60, 59, 58, 58, 58, 57, 57, 57,
+ 56, 56, 56, 56, 56, 56, 67, 67, 67, 67, 67, 67, 67, 67, 67, 66, 66, 65,
+ 64, 63, 63, 62, 61, 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56,
+ 56, 56, 68, 68, 67, 67, 66, 66, 66, 67, 67, 66, 66, 65, 64, 64, 63, 62,
+ 61, 61, 60, 60, 59, 59, 58, 58, 57, 57, 57, 56, 56, 56, 56, 56, 68, 68,
+ 67, 67, 66, 66, 66, 66, 66, 65, 65, 64, 63, 63, 62, 62, 61, 60, 60, 59,
+ 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56, 56, 68, 67, 67, 67, 66, 66,
+ 65, 65, 65, 64, 63, 63, 62, 62, 61, 61, 60, 60, 59, 59, 58, 58, 58, 57,
+ 57, 57, 56, 56, 56, 56, 56, 56, 67, 67, 67, 67, 67, 66, 65, 64, 63, 63,
+ 62, 62, 62, 61, 61, 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 57, 56, 56,
+ 56, 56, 56, 56, 67, 67, 67, 67, 67, 66, 65, 63, 62, 62, 61, 61, 61, 60,
+ 60, 59, 59, 59, 58, 58, 58, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56,
+ 66, 66, 66, 66, 66, 65, 64, 63, 62, 61, 61, 60, 60, 60, 59, 59, 59, 58,
+ 58, 58, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 56, 65, 65, 65, 66,
+ 66, 65, 63, 62, 61, 61, 60, 60, 59, 59, 59, 59, 58, 58, 58, 57, 57, 57,
+ 57, 56, 56, 56, 56, 56, 56, 56, 56, 56, 64, 64, 65, 65, 65, 64, 63, 62,
+ 61, 60, 60, 59, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57, 56, 56, 56, 56,
+ 56, 56, 55, 55, 55, 55, 63, 63, 64, 64, 64, 63, 62, 62, 61, 60, 59, 59,
+ 58, 58, 58, 58, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55,
+ 55, 55, 62, 63, 63, 63, 64, 63, 62, 61, 60, 60, 59, 59, 58, 58, 58, 57,
+ 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 62, 62,
+ 62, 63, 63, 62, 61, 61, 60, 59, 59, 58, 58, 58, 57, 57, 57, 57, 56, 56,
+ 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 61, 61, 61, 62, 62, 62,
+ 61, 60, 59, 59, 59, 58, 58, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55, 60, 60, 61, 61, 61, 61, 60, 60, 59, 59,
+ 58, 58, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 59, 60, 60, 60, 61, 60, 60, 59, 59, 58, 58, 58, 57, 57,
+ 57, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55,
+ 59, 59, 60, 60, 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56,
+ 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 58, 59, 59, 59,
+ 60, 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56, 56, 56, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 58, 58, 58, 59, 59, 59, 58, 58,
+ 58, 57, 57, 57, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 54, 54, 54, 54, 57, 58, 58, 58, 59, 58, 58, 58, 57, 57, 57, 57,
+ 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54,
+ 54, 54, 57, 57, 58, 58, 58, 58, 58, 57, 57, 57, 57, 56, 56, 56, 56, 56,
+ 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 57, 57,
+ 57, 58, 58, 58, 57, 57, 57, 57, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 56, 57, 57, 57, 57, 57,
+ 57, 57, 57, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54, 56, 56, 57, 57, 57, 57, 57, 57, 56, 56,
+ 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 56, 56, 56, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54,
+ 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 55, 56, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 55, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 74, 72, 66, 62, 72, 67, 63, 61, 66, 63, 61, 60, 62, 61, 60, 59,
+ /* Size 8 */
+ 71, 76, 75, 70, 66, 64, 62, 61, 76, 73, 74, 71, 68, 65, 63, 62, 75, 74,
+ 69, 67, 65, 64, 62, 61, 70, 71, 67, 64, 63, 62, 61, 61, 66, 68, 65, 63,
+ 62, 61, 61, 60, 64, 65, 64, 62, 61, 60, 60, 60, 62, 63, 62, 61, 61, 60,
+ 60, 59, 61, 62, 61, 61, 60, 60, 59, 59,
+ /* Size 16 */
+ 71, 74, 77, 76, 75, 73, 70, 68, 67, 65, 64, 63, 62, 62, 61, 61, 74, 75,
+ 75, 75, 75, 73, 71, 69, 67, 66, 65, 64, 63, 62, 62, 62, 77, 75, 74, 74,
+ 75, 73, 72, 70, 68, 67, 66, 65, 64, 63, 62, 62, 76, 75, 74, 73, 72, 71,
+ 70, 68, 67, 66, 65, 64, 63, 63, 62, 62, 75, 75, 75, 72, 69, 68, 67, 66,
+ 66, 65, 64, 63, 63, 62, 62, 62, 73, 73, 73, 71, 68, 67, 66, 65, 65, 64,
+ 63, 63, 62, 62, 61, 61, 70, 71, 72, 70, 67, 66, 65, 64, 63, 63, 63, 62,
+ 62, 61, 61, 61, 68, 69, 70, 68, 66, 65, 64, 64, 63, 62, 62, 62, 61, 61,
+ 61, 61, 67, 67, 68, 67, 66, 65, 63, 63, 62, 62, 62, 61, 61, 61, 61, 61,
+ 65, 66, 67, 66, 65, 64, 63, 62, 62, 62, 61, 61, 61, 61, 60, 60, 64, 65,
+ 66, 65, 64, 63, 63, 62, 62, 61, 61, 61, 60, 60, 60, 60, 63, 64, 65, 64,
+ 63, 63, 62, 62, 61, 61, 61, 60, 60, 60, 60, 60, 62, 63, 64, 63, 63, 62,
+ 62, 61, 61, 61, 60, 60, 60, 60, 60, 60, 62, 62, 63, 63, 62, 62, 61, 61,
+ 61, 61, 60, 60, 60, 60, 60, 60, 61, 62, 62, 62, 62, 61, 61, 61, 61, 60,
+ 60, 60, 60, 60, 60, 60, 61, 62, 62, 62, 62, 61, 61, 61, 61, 60, 60, 60,
+ 60, 60, 60, 60,
+ /* Size 32 */
+ 72, 73, 74, 76, 77, 76, 76, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66,
+ 66, 65, 64, 64, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 73, 74, 75, 75,
+ 76, 76, 76, 75, 75, 74, 73, 72, 71, 70, 69, 68, 67, 67, 66, 65, 65, 64,
+ 64, 63, 63, 63, 62, 62, 62, 62, 62, 62, 74, 75, 75, 75, 76, 75, 75, 75,
+ 75, 74, 73, 72, 71, 70, 70, 69, 68, 67, 66, 66, 65, 65, 64, 64, 63, 63,
+ 63, 62, 62, 62, 62, 62, 76, 75, 75, 75, 75, 75, 75, 75, 75, 74, 73, 73,
+ 72, 71, 70, 69, 68, 67, 67, 66, 65, 65, 64, 64, 63, 63, 63, 62, 62, 62,
+ 62, 62, 77, 76, 76, 75, 74, 74, 75, 75, 75, 74, 74, 73, 72, 71, 70, 69,
+ 69, 68, 67, 66, 66, 65, 65, 64, 64, 63, 63, 63, 62, 62, 62, 62, 76, 76,
+ 75, 75, 74, 74, 74, 74, 74, 73, 72, 72, 71, 70, 69, 69, 68, 67, 67, 66,
+ 65, 65, 64, 64, 64, 63, 63, 63, 62, 62, 62, 62, 76, 76, 75, 75, 75, 74,
+ 73, 73, 72, 72, 71, 70, 70, 69, 69, 68, 67, 67, 66, 66, 65, 65, 64, 64,
+ 63, 63, 63, 62, 62, 62, 62, 62, 76, 75, 75, 75, 75, 74, 73, 72, 71, 70,
+ 70, 69, 69, 68, 68, 67, 66, 66, 66, 65, 65, 64, 64, 63, 63, 63, 63, 62,
+ 62, 62, 62, 62, 75, 75, 75, 75, 75, 74, 72, 71, 69, 69, 68, 68, 67, 67,
+ 67, 66, 66, 65, 65, 65, 64, 64, 64, 63, 63, 63, 62, 62, 62, 62, 62, 62,
+ 74, 74, 74, 74, 74, 73, 72, 70, 69, 68, 68, 67, 67, 66, 66, 66, 65, 65,
+ 65, 64, 64, 64, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 73, 73, 73, 73,
+ 74, 72, 71, 70, 68, 68, 67, 67, 66, 66, 65, 65, 65, 64, 64, 64, 63, 63,
+ 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 72, 72, 72, 73, 73, 72, 70, 69,
+ 68, 67, 67, 66, 66, 65, 65, 65, 64, 64, 64, 63, 63, 63, 63, 62, 62, 62,
+ 62, 62, 61, 61, 61, 61, 71, 71, 71, 72, 72, 71, 70, 69, 67, 67, 66, 66,
+ 65, 65, 64, 64, 64, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61,
+ 61, 61, 70, 70, 70, 71, 71, 70, 69, 68, 67, 66, 66, 65, 65, 64, 64, 64,
+ 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 69, 69,
+ 70, 70, 70, 69, 69, 68, 67, 66, 65, 65, 64, 64, 64, 63, 63, 63, 63, 62,
+ 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 68, 68, 69, 69, 69, 69,
+ 68, 67, 66, 66, 65, 65, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62,
+ 61, 61, 61, 61, 61, 61, 61, 61, 67, 67, 68, 68, 69, 68, 67, 66, 66, 65,
+ 65, 64, 64, 63, 63, 63, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 66, 67, 67, 67, 68, 67, 67, 66, 65, 65, 64, 64, 63, 63,
+ 63, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61,
+ 66, 66, 66, 67, 67, 67, 66, 66, 65, 65, 64, 64, 63, 63, 63, 62, 62, 62,
+ 62, 62, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 65, 65, 66, 66,
+ 66, 66, 66, 65, 65, 64, 64, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 64, 65, 65, 65, 66, 65, 65, 65,
+ 64, 64, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 61,
+ 60, 60, 60, 60, 60, 60, 64, 64, 65, 65, 65, 65, 65, 64, 64, 64, 63, 63,
+ 63, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60,
+ 60, 60, 63, 64, 64, 64, 65, 64, 64, 64, 64, 63, 63, 63, 62, 62, 62, 62,
+ 61, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 63, 63,
+ 64, 64, 64, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61,
+ 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 63, 63, 63, 63, 64, 64,
+ 63, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 62, 63, 63, 63, 63, 63, 63, 63, 63, 62,
+ 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 62, 62, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 61,
+ 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 62, 62, 62, 62, 63, 63, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61,
+ 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 61,
+ 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61,
+ 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 58, 57, 55, 58, 56, 56, 55, 57, 56, 54, 53, 55, 55, 53, 53,
+ /* Size 8 */
+ 64, 68, 59, 59, 58, 57, 56, 55, 68, 62, 59, 60, 60, 59, 58, 57, 59, 59,
+ 58, 58, 58, 57, 57, 56, 59, 60, 58, 57, 56, 56, 56, 55, 58, 60, 58, 56,
+ 56, 55, 55, 55, 57, 59, 57, 56, 55, 55, 54, 54, 56, 58, 57, 56, 55, 54,
+ 54, 54, 55, 57, 56, 55, 55, 54, 54, 53,
+ /* Size 16 */
+ 64, 66, 68, 63, 59, 59, 59, 58, 58, 58, 57, 57, 56, 56, 55, 55, 66, 65,
+ 65, 62, 59, 59, 59, 59, 59, 58, 58, 57, 57, 56, 56, 56, 68, 65, 62, 60,
+ 59, 60, 60, 60, 60, 59, 59, 58, 58, 57, 57, 57, 63, 62, 60, 59, 58, 59,
+ 59, 59, 59, 58, 58, 58, 57, 57, 56, 56, 59, 59, 59, 58, 58, 58, 58, 58,
+ 58, 58, 57, 57, 57, 56, 56, 56, 59, 59, 60, 59, 58, 58, 57, 57, 57, 57,
+ 57, 57, 56, 56, 56, 56, 59, 59, 60, 59, 58, 57, 57, 57, 56, 56, 56, 56,
+ 56, 55, 55, 55, 58, 59, 60, 59, 58, 57, 57, 56, 56, 56, 56, 55, 55, 55,
+ 55, 55, 58, 59, 60, 59, 58, 57, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55,
+ 58, 58, 59, 58, 58, 57, 56, 56, 55, 55, 55, 55, 55, 54, 54, 54, 57, 58,
+ 59, 58, 57, 57, 56, 56, 55, 55, 55, 54, 54, 54, 54, 54, 57, 57, 58, 58,
+ 57, 57, 56, 55, 55, 55, 54, 54, 54, 54, 54, 54, 56, 57, 58, 57, 57, 56,
+ 56, 55, 55, 55, 54, 54, 54, 54, 54, 54, 56, 56, 57, 57, 56, 56, 55, 55,
+ 55, 54, 54, 54, 54, 54, 53, 53, 55, 56, 57, 56, 56, 56, 55, 55, 55, 54,
+ 54, 54, 54, 53, 53, 53, 55, 56, 57, 56, 56, 56, 55, 55, 55, 54, 54, 54,
+ 54, 53, 53, 53,
+ /* Size 32 */
+ 64, 65, 66, 67, 68, 66, 63, 61, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58,
+ 58, 57, 57, 57, 57, 56, 56, 56, 56, 55, 55, 55, 55, 55, 65, 65, 66, 66,
+ 66, 64, 63, 61, 59, 59, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 57, 57,
+ 57, 57, 56, 56, 56, 56, 56, 56, 56, 56, 66, 66, 65, 65, 65, 63, 62, 61,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57,
+ 56, 56, 56, 56, 56, 56, 67, 66, 65, 64, 63, 62, 61, 60, 59, 59, 59, 60,
+ 60, 60, 60, 59, 59, 59, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57, 56, 56,
+ 56, 56, 68, 66, 65, 63, 62, 61, 60, 60, 59, 59, 60, 60, 60, 60, 60, 60,
+ 60, 60, 59, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 57, 66, 64,
+ 63, 62, 61, 61, 60, 59, 59, 59, 59, 59, 60, 59, 59, 59, 59, 59, 59, 59,
+ 58, 58, 58, 58, 57, 57, 57, 57, 56, 56, 56, 56, 63, 63, 62, 61, 60, 60,
+ 59, 59, 58, 58, 59, 59, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 57,
+ 57, 57, 57, 57, 56, 56, 56, 56, 61, 61, 61, 60, 60, 59, 59, 58, 58, 58,
+ 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 57, 57, 57, 57, 57, 56,
+ 56, 56, 56, 56, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58, 58, 58,
+ 58, 58, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56,
+ 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 57,
+ 57, 57, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 56, 59, 59, 59, 59,
+ 60, 59, 59, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57,
+ 57, 56, 56, 56, 56, 56, 56, 56, 56, 56, 59, 59, 59, 60, 60, 59, 59, 58,
+ 58, 58, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56,
+ 56, 56, 55, 55, 55, 55, 59, 59, 59, 60, 60, 60, 59, 58, 58, 58, 57, 57,
+ 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55,
+ 55, 55, 58, 59, 59, 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 57, 56, 56,
+ 56, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 58, 59,
+ 59, 60, 60, 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56,
+ 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 58, 59, 59, 59, 60, 59,
+ 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 55, 58, 58, 59, 59, 60, 59, 59, 58, 58, 58,
+ 57, 57, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55,
+ 55, 55, 55, 55, 58, 58, 59, 59, 60, 59, 59, 58, 58, 57, 57, 57, 56, 56,
+ 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54,
+ 58, 58, 58, 59, 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 57, 58, 58, 59,
+ 59, 59, 58, 58, 58, 57, 57, 57, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55,
+ 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 57, 57, 58, 58, 59, 58, 58, 58,
+ 57, 57, 57, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 57, 57, 58, 58, 58, 58, 58, 58, 57, 57, 57, 56,
+ 56, 56, 56, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54,
+ 54, 54, 57, 57, 57, 58, 58, 58, 58, 57, 57, 57, 57, 56, 56, 56, 55, 55,
+ 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 56, 57,
+ 57, 58, 58, 58, 57, 57, 57, 57, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55,
+ 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 56, 56, 57, 57, 58, 57,
+ 57, 57, 57, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54, 56, 56, 57, 57, 57, 57, 57, 57, 57, 56,
+ 56, 56, 56, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54,
+ 53, 53, 53, 53, 56, 56, 56, 57, 57, 57, 57, 57, 56, 56, 56, 56, 55, 55,
+ 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53,
+ 55, 56, 56, 57, 57, 57, 57, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 54,
+ 54, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53, 53, 55, 56, 56, 56,
+ 57, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54,
+ 54, 54, 54, 53, 53, 53, 53, 53, 53, 53, 55, 56, 56, 56, 57, 56, 56, 56,
+ 56, 56, 56, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 53,
+ 53, 53, 53, 53, 53, 53, 55, 56, 56, 56, 57, 56, 56, 56, 56, 56, 56, 55,
+ 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53,
+ 53, 53, 55, 56, 56, 56, 57, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55,
+ 55, 54, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53, 53, 53 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 74, 67, 66, 63, 67, 65, 64, 63, 66, 64, 62, 61, 63, 63, 61, 60,
+ /* Size 8 */
+ 72, 77, 67, 66, 65, 64, 63, 62, 77, 70, 66, 68, 67, 66, 65, 64, 67, 66,
+ 65, 65, 65, 65, 64, 63, 66, 68, 65, 64, 63, 63, 63, 62, 65, 67, 65, 63,
+ 62, 62, 62, 61, 64, 66, 65, 63, 62, 61, 61, 60, 63, 65, 64, 63, 62, 61,
+ 60, 60, 62, 64, 63, 62, 61, 60, 60, 60,
+ /* Size 16 */
+ 73, 75, 77, 72, 67, 67, 66, 66, 66, 65, 64, 64, 63, 63, 62, 62, 75, 74,
+ 74, 70, 67, 67, 67, 67, 67, 66, 65, 65, 64, 64, 63, 63, 77, 74, 70, 69,
+ 67, 67, 68, 68, 68, 67, 67, 66, 65, 65, 64, 64, 72, 70, 69, 67, 66, 66,
+ 67, 67, 67, 66, 66, 65, 65, 64, 64, 64, 67, 67, 67, 66, 65, 65, 65, 65,
+ 65, 65, 65, 64, 64, 64, 63, 63, 67, 67, 67, 66, 65, 65, 65, 65, 65, 64,
+ 64, 64, 63, 63, 63, 63, 66, 67, 68, 67, 65, 65, 64, 64, 64, 64, 63, 63,
+ 63, 63, 62, 62, 66, 67, 68, 67, 65, 65, 64, 64, 63, 63, 63, 63, 62, 62,
+ 62, 62, 66, 67, 68, 67, 65, 65, 64, 63, 63, 62, 62, 62, 62, 62, 61, 61,
+ 65, 66, 67, 66, 65, 64, 64, 63, 62, 62, 62, 62, 61, 61, 61, 61, 64, 65,
+ 67, 66, 65, 64, 63, 63, 62, 62, 62, 61, 61, 61, 61, 61, 64, 65, 66, 65,
+ 64, 64, 63, 63, 62, 62, 61, 61, 61, 61, 60, 60, 63, 64, 65, 65, 64, 63,
+ 63, 62, 62, 61, 61, 61, 61, 60, 60, 60, 63, 64, 65, 64, 64, 63, 63, 62,
+ 62, 61, 61, 61, 60, 60, 60, 60, 62, 63, 64, 64, 63, 63, 62, 62, 61, 61,
+ 61, 60, 60, 60, 60, 60, 62, 63, 64, 64, 63, 63, 62, 62, 61, 61, 61, 60,
+ 60, 60, 60, 60,
+ /* Size 32 */
+ 73, 74, 75, 76, 77, 75, 72, 70, 67, 67, 67, 67, 66, 66, 66, 66, 66, 65,
+ 65, 65, 65, 64, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62, 74, 74, 75, 75,
+ 76, 73, 71, 69, 67, 67, 67, 67, 67, 67, 67, 66, 66, 66, 66, 65, 65, 65,
+ 65, 64, 64, 64, 63, 63, 63, 63, 63, 63, 75, 75, 74, 74, 74, 72, 70, 69,
+ 67, 67, 67, 67, 67, 67, 67, 67, 67, 66, 66, 66, 66, 65, 65, 65, 64, 64,
+ 64, 64, 63, 63, 63, 63, 76, 75, 74, 73, 72, 71, 70, 68, 67, 67, 67, 68,
+ 68, 68, 68, 67, 67, 67, 67, 66, 66, 66, 65, 65, 65, 65, 64, 64, 64, 64,
+ 64, 64, 77, 76, 74, 72, 70, 70, 69, 68, 67, 67, 68, 68, 68, 68, 68, 68,
+ 68, 68, 67, 67, 67, 66, 66, 66, 65, 65, 65, 64, 64, 64, 64, 64, 75, 73,
+ 72, 71, 70, 69, 68, 67, 67, 67, 67, 67, 68, 67, 67, 67, 67, 67, 67, 67,
+ 66, 66, 66, 65, 65, 65, 64, 64, 64, 64, 64, 64, 72, 71, 70, 70, 69, 68,
+ 67, 67, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 66, 66, 66, 66, 65, 65,
+ 65, 65, 64, 64, 64, 64, 64, 64, 70, 69, 69, 68, 68, 67, 67, 66, 66, 66,
+ 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 65, 65, 65, 65, 64, 64, 64, 64,
+ 64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 66, 66, 65, 65, 65, 65, 66, 66,
+ 66, 66, 66, 65, 65, 65, 65, 65, 65, 64, 64, 64, 64, 64, 63, 63, 63, 63,
+ 67, 67, 67, 67, 67, 67, 66, 66, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 67, 67, 67, 67,
+ 68, 67, 66, 66, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 64, 64, 64, 64,
+ 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 67, 67, 67, 68, 68, 67, 67, 66,
+ 65, 65, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 66, 67, 67, 68, 68, 68, 67, 66, 66, 65, 65, 65,
+ 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62,
+ 62, 62, 66, 67, 67, 68, 68, 67, 67, 66, 66, 65, 65, 65, 64, 64, 64, 64,
+ 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 66, 67,
+ 67, 68, 68, 67, 67, 66, 66, 65, 65, 64, 64, 64, 64, 64, 63, 63, 63, 63,
+ 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 66, 66, 67, 67, 68, 67,
+ 67, 66, 66, 65, 65, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 66, 66, 67, 67, 68, 67, 67, 66, 66, 65,
+ 65, 64, 64, 64, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 65, 66, 66, 67, 68, 67, 67, 66, 65, 65, 65, 64, 64, 64,
+ 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61,
+ 65, 66, 66, 67, 67, 67, 66, 66, 65, 65, 64, 64, 64, 63, 63, 63, 63, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 65, 65, 66, 66,
+ 67, 67, 66, 66, 65, 65, 64, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62, 62,
+ 62, 62, 61, 61, 61, 61, 61, 61, 61, 61, 65, 65, 66, 66, 67, 66, 66, 65,
+ 65, 65, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 64, 65, 65, 66, 66, 66, 66, 65, 65, 64, 64, 64,
+ 63, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 64, 65, 65, 65, 66, 66, 65, 65, 65, 64, 64, 64, 63, 63, 63, 62,
+ 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 64, 64,
+ 65, 65, 66, 65, 65, 65, 64, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62, 62,
+ 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 63, 64, 64, 65, 65, 65,
+ 65, 64, 64, 64, 64, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61,
+ 61, 61, 61, 60, 60, 60, 60, 60, 63, 64, 64, 65, 65, 65, 65, 64, 64, 64,
+ 63, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 60, 60,
+ 60, 60, 60, 60, 63, 63, 64, 64, 65, 64, 64, 64, 64, 64, 63, 63, 63, 62,
+ 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60,
+ 63, 63, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62, 61,
+ 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 62, 63, 63, 64,
+ 64, 64, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61,
+ 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 62, 63, 63, 64, 64, 64, 64, 64,
+ 63, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 60, 60,
+ 60, 60, 60, 60, 60, 60, 62, 63, 63, 64, 64, 64, 64, 64, 63, 63, 63, 63,
+ 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60,
+ 60, 60, 62, 63, 63, 64, 64, 64, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62,
+ 62, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 63, 60, 59, 63, 61, 59, 59, 60, 59, 58, 58, 59, 59, 58, 58,
+ /* Size 8 */
+ 64, 66, 66, 64, 62, 61, 60, 60, 66, 65, 65, 64, 63, 61, 61, 60, 66, 65,
+ 63, 62, 61, 61, 60, 60, 64, 64, 62, 61, 61, 60, 60, 59, 62, 63, 61, 61,
+ 60, 60, 59, 59, 61, 61, 61, 60, 60, 59, 59, 59, 60, 61, 60, 60, 59, 59,
+ 59, 59, 60, 60, 60, 59, 59, 59, 59, 59,
+ /* Size 16 */
+ 64, 65, 66, 66, 66, 65, 64, 63, 62, 61, 61, 60, 60, 60, 60, 60, 65, 65,
+ 66, 66, 66, 65, 64, 63, 62, 62, 61, 61, 60, 60, 60, 60, 66, 66, 65, 65,
+ 65, 65, 64, 63, 63, 62, 61, 61, 61, 60, 60, 60, 66, 66, 65, 65, 64, 64,
+ 63, 63, 62, 62, 61, 61, 60, 60, 60, 60, 66, 66, 65, 64, 63, 63, 62, 62,
+ 61, 61, 61, 60, 60, 60, 60, 60, 65, 65, 65, 64, 63, 62, 62, 61, 61, 61,
+ 60, 60, 60, 60, 60, 60, 64, 64, 64, 63, 62, 62, 61, 61, 61, 60, 60, 60,
+ 60, 60, 59, 59, 63, 63, 63, 63, 62, 61, 61, 61, 60, 60, 60, 60, 60, 59,
+ 59, 59, 62, 62, 63, 62, 61, 61, 61, 60, 60, 60, 60, 60, 59, 59, 59, 59,
+ 61, 62, 62, 62, 61, 61, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 61, 61,
+ 61, 61, 61, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 60, 61, 61, 61,
+ 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 60, 60, 61, 60, 60, 60,
+ 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59,
+ /* Size 32 */
+ 64, 65, 65, 66, 66, 66, 66, 66, 66, 65, 65, 64, 64, 63, 63, 62, 62, 62,
+ 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 65, 65, 65, 66,
+ 66, 66, 66, 66, 66, 65, 65, 64, 64, 63, 63, 62, 62, 62, 62, 61, 61, 61,
+ 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 65, 65, 65, 66, 66, 66, 66, 66,
+ 66, 65, 65, 64, 64, 63, 63, 63, 62, 62, 62, 61, 61, 61, 61, 61, 60, 60,
+ 60, 60, 60, 60, 60, 60, 66, 66, 66, 65, 65, 65, 65, 65, 65, 65, 65, 64,
+ 64, 64, 63, 63, 62, 62, 62, 62, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60,
+ 60, 60, 66, 66, 66, 65, 65, 65, 65, 65, 65, 65, 65, 65, 64, 64, 63, 63,
+ 63, 62, 62, 62, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 66, 66,
+ 66, 65, 65, 65, 65, 65, 65, 65, 64, 64, 64, 63, 63, 63, 62, 62, 62, 62,
+ 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 66, 66, 66, 65, 65, 65,
+ 65, 65, 64, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62, 61, 61, 61, 61, 61,
+ 60, 60, 60, 60, 60, 60, 60, 60, 66, 66, 66, 65, 65, 65, 65, 64, 64, 63,
+ 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 66, 66, 66, 65, 65, 65, 64, 64, 63, 63, 63, 62, 62, 62,
+ 62, 62, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 65, 65, 65, 65, 65, 65, 64, 63, 63, 63, 62, 62, 62, 62, 62, 61, 61, 61,
+ 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 65, 65, 65, 65,
+ 65, 64, 64, 63, 63, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 65, 64, 63, 63,
+ 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 64, 64, 63, 63, 62, 62, 62, 61,
+ 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59,
+ 59, 59, 63, 63, 63, 64, 64, 63, 63, 62, 62, 62, 61, 61, 61, 61, 61, 61,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 63, 63,
+ 63, 63, 63, 63, 63, 62, 62, 62, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 62, 62, 63, 63, 63, 63,
+ 62, 62, 62, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 59, 59, 59, 59, 59, 59, 59, 62, 62, 62, 62, 63, 62, 62, 62, 61, 61,
+ 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59,
+ 59, 59, 59, 59, 62, 62, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 61, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 61, 61, 61, 62,
+ 62, 62, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 61, 61, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 60, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60,
+ 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 61, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 69, 68, 65, 63, 68, 65, 64, 63, 65, 64, 62, 62, 63, 63, 62, 62,
+ /* Size 8 */
+ 67, 70, 69, 67, 65, 64, 63, 62, 70, 69, 69, 68, 66, 64, 63, 63, 69, 69,
+ 66, 65, 64, 64, 63, 63, 67, 68, 65, 64, 63, 63, 63, 62, 65, 66, 64, 63,
+ 63, 62, 62, 62, 64, 64, 64, 63, 62, 62, 62, 62, 63, 63, 63, 63, 62, 62,
+ 62, 62, 62, 63, 63, 62, 62, 62, 62, 62,
+ /* Size 16 */
+ 68, 69, 70, 70, 69, 68, 67, 66, 65, 65, 64, 64, 63, 63, 63, 63, 69, 69,
+ 69, 69, 69, 68, 67, 66, 66, 65, 64, 64, 63, 63, 63, 63, 70, 69, 69, 69,
+ 69, 68, 68, 67, 66, 65, 65, 64, 64, 63, 63, 63, 70, 69, 69, 68, 68, 67,
+ 67, 66, 65, 65, 64, 64, 63, 63, 63, 63, 69, 69, 69, 68, 66, 66, 65, 65,
+ 65, 64, 64, 64, 63, 63, 63, 63, 68, 68, 68, 67, 66, 65, 65, 65, 64, 64,
+ 64, 63, 63, 63, 63, 63, 67, 67, 68, 67, 65, 65, 64, 64, 64, 63, 63, 63,
+ 63, 63, 62, 62, 66, 66, 67, 66, 65, 65, 64, 64, 63, 63, 63, 63, 63, 62,
+ 62, 62, 65, 66, 66, 65, 65, 64, 64, 63, 63, 63, 63, 63, 62, 62, 62, 62,
+ 65, 65, 65, 65, 64, 64, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 64, 64,
+ 65, 64, 64, 64, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 64, 64, 64, 64,
+ 64, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 63, 63, 64, 63, 63, 63,
+ 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62,
+ /* Size 32 */
+ 68, 68, 69, 70, 70, 70, 70, 70, 69, 69, 68, 68, 67, 67, 66, 66, 65, 65,
+ 65, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 68, 69, 69, 69,
+ 70, 70, 70, 69, 69, 69, 68, 68, 67, 67, 66, 66, 65, 65, 65, 65, 64, 64,
+ 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 69, 69, 69, 69, 70, 69, 69, 69,
+ 69, 69, 68, 68, 67, 67, 67, 66, 66, 65, 65, 65, 64, 64, 64, 64, 63, 63,
+ 63, 63, 63, 63, 63, 63, 70, 69, 69, 69, 69, 69, 69, 69, 69, 69, 68, 68,
+ 68, 67, 67, 66, 66, 66, 65, 65, 65, 64, 64, 64, 64, 63, 63, 63, 63, 63,
+ 63, 63, 70, 70, 70, 69, 69, 69, 69, 69, 69, 69, 69, 68, 68, 67, 67, 67,
+ 66, 66, 65, 65, 65, 65, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 70, 70,
+ 69, 69, 69, 69, 69, 69, 69, 68, 68, 68, 67, 67, 67, 66, 66, 65, 65, 65,
+ 65, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 70, 70, 69, 69, 69, 69,
+ 68, 68, 68, 68, 67, 67, 67, 66, 66, 66, 65, 65, 65, 65, 64, 64, 64, 64,
+ 64, 63, 63, 63, 63, 63, 63, 63, 70, 69, 69, 69, 69, 69, 68, 68, 67, 67,
+ 67, 66, 66, 66, 66, 65, 65, 65, 65, 64, 64, 64, 64, 64, 63, 63, 63, 63,
+ 63, 63, 63, 63, 69, 69, 69, 69, 69, 69, 68, 67, 67, 66, 66, 66, 66, 65,
+ 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63,
+ 69, 69, 69, 69, 69, 68, 68, 67, 66, 66, 66, 66, 65, 65, 65, 65, 65, 64,
+ 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 68, 68, 68, 68,
+ 69, 68, 67, 67, 66, 66, 66, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 64,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 68, 68, 68, 68, 68, 68, 67, 66,
+ 66, 66, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 67, 67, 67, 68, 68, 67, 67, 66, 66, 65, 65, 65,
+ 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 67, 67, 67, 67, 67, 67, 66, 66, 65, 65, 65, 65, 64, 64, 64, 64,
+ 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 66, 66,
+ 67, 67, 67, 67, 66, 66, 65, 65, 65, 64, 64, 64, 64, 64, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 66, 66, 66, 66, 67, 66,
+ 66, 65, 65, 65, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 62, 62, 62, 62, 62, 62, 65, 65, 66, 66, 66, 66, 65, 65, 65, 65,
+ 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62,
+ 62, 62, 62, 62, 65, 65, 65, 66, 66, 65, 65, 65, 65, 64, 64, 64, 64, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62,
+ 65, 65, 65, 65, 65, 65, 65, 65, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 64, 65, 65, 65,
+ 65, 65, 65, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 64, 64, 64, 65, 65, 65, 64, 64,
+ 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 64, 64, 64, 64, 65, 64, 64, 64, 64, 64, 64, 63,
+ 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 64,
+ 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 64, 64, 64,
+ 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 61, 60, 59, 61, 60, 60, 59, 60, 60, 59, 58, 59, 59, 58, 58,
+ /* Size 8 */
+ 64, 66, 61, 61, 61, 60, 60, 59, 66, 63, 61, 62, 62, 61, 61, 60, 61, 61,
+ 61, 61, 61, 61, 60, 60, 61, 62, 61, 60, 60, 60, 60, 59, 61, 62, 61, 60,
+ 60, 59, 59, 59, 60, 61, 61, 60, 59, 59, 59, 59, 60, 61, 60, 60, 59, 59,
+ 59, 59, 59, 60, 60, 59, 59, 59, 59, 58,
+ /* Size 16 */
+ 64, 65, 66, 64, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 59, 59, 65, 65,
+ 64, 63, 61, 61, 62, 61, 61, 61, 61, 61, 60, 60, 60, 60, 66, 64, 63, 62,
+ 61, 62, 62, 62, 62, 62, 61, 61, 61, 60, 60, 60, 64, 63, 62, 62, 61, 61,
+ 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 60, 60, 60, 60, 60, 61, 61, 62, 61, 61, 61, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 61, 62, 62, 61, 61, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 59, 59, 61, 61, 62, 61, 61, 60, 60, 60, 60, 60, 60, 60, 59, 59,
+ 59, 59, 61, 61, 62, 61, 61, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59,
+ 61, 61, 62, 61, 61, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 60, 61,
+ 61, 61, 61, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 60, 61, 61, 61,
+ 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 61, 60, 60, 60,
+ 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 59,
+ 59, 59, 59, 59, 59, 59, 58, 58, 59, 60, 60, 60, 60, 60, 59, 59, 59, 59,
+ 59, 59, 59, 58, 58, 58, 59, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59,
+ 59, 58, 58, 58,
+ /* Size 32 */
+ 64, 64, 65, 65, 66, 65, 64, 63, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 64, 65, 65, 65,
+ 65, 64, 63, 62, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 65, 65, 65, 65, 64, 64, 63, 62,
+ 61, 61, 61, 62, 62, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 65, 65, 65, 64, 64, 63, 63, 62, 61, 61, 62, 62,
+ 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60,
+ 60, 60, 66, 65, 64, 64, 63, 63, 62, 62, 61, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 65, 64,
+ 64, 63, 63, 62, 62, 62, 61, 61, 61, 62, 62, 62, 62, 62, 62, 61, 61, 61,
+ 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 64, 63, 63, 63, 62, 62,
+ 62, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61,
+ 60, 60, 60, 60, 60, 60, 60, 60, 63, 62, 62, 62, 62, 62, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61,
+ 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 61, 61, 61, 61, 62, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 62,
+ 62, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 62, 62, 62, 62, 61, 61,
+ 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 59, 59, 59, 59, 61, 61, 62, 62, 62, 62, 61, 61, 61, 61, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59,
+ 59, 59, 61, 61, 61, 62, 62, 62, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 61, 61,
+ 61, 62, 62, 62, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 61, 61, 61, 62, 62, 62,
+ 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 61, 61, 61, 62, 62, 62, 61, 61, 61, 61,
+ 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 61, 61, 61, 61, 62, 61, 61, 61, 61, 61, 60, 60, 60, 60,
+ 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 61, 61, 61, 61, 62, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 61, 61, 61,
+ 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 61, 61, 61, 61, 61, 61, 61,
+ 61, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 60, 60, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60,
+ 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 60, 60, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60,
+ 60, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 61, 61,
+ 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 61, 60, 60, 60, 60, 60,
+ 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 58, 58, 58, 58, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58, 59, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
+ 59, 59, 59, 58, 58, 58, 58, 58, 58, 58, 59, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 58,
+ 58, 58, 58, 58, 58, 58, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58,
+ 58, 58, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 59, 59, 59, 59, 59,
+ 59, 59, 59, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58, 58 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 69, 65, 65, 64, 65, 64, 64, 63, 65, 64, 63, 62, 64, 63, 62, 62,
+ /* Size 8 */
+ 68, 70, 65, 65, 65, 64, 63, 63, 70, 67, 65, 66, 66, 65, 64, 64, 65, 65,
+ 64, 64, 65, 64, 64, 63, 65, 66, 64, 64, 64, 63, 63, 63, 65, 66, 65, 64,
+ 63, 63, 63, 63, 64, 65, 64, 63, 63, 63, 62, 62, 63, 64, 64, 63, 63, 62,
+ 62, 62, 63, 64, 63, 63, 63, 62, 62, 62,
+ /* Size 16 */
+ 68, 69, 70, 68, 65, 65, 65, 65, 65, 64, 64, 64, 64, 63, 63, 63, 69, 69,
+ 69, 67, 65, 65, 66, 65, 65, 65, 65, 64, 64, 64, 64, 64, 70, 69, 67, 66,
+ 65, 66, 66, 66, 66, 65, 65, 65, 65, 64, 64, 64, 68, 67, 66, 66, 65, 65,
+ 65, 65, 65, 65, 65, 65, 64, 64, 64, 64, 65, 65, 65, 65, 64, 65, 65, 65,
+ 65, 65, 64, 64, 64, 64, 64, 64, 65, 65, 66, 65, 65, 64, 64, 64, 64, 64,
+ 64, 64, 64, 63, 63, 63, 65, 66, 66, 65, 65, 64, 64, 64, 64, 64, 64, 63,
+ 63, 63, 63, 63, 65, 65, 66, 65, 65, 64, 64, 64, 64, 63, 63, 63, 63, 63,
+ 63, 63, 65, 65, 66, 65, 65, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63,
+ 64, 65, 65, 65, 65, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 65,
+ 65, 65, 64, 64, 64, 63, 63, 63, 63, 63, 63, 62, 62, 62, 64, 64, 65, 65,
+ 64, 64, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 64, 64, 65, 64, 64, 64,
+ 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 63, 64, 64, 64, 64, 63, 63, 63,
+ 63, 63, 62, 62, 62, 62, 62, 62, 63, 64, 64, 64, 64, 63, 63, 63, 63, 63,
+ 62, 62, 62, 62, 62, 62, 63, 64, 64, 64, 64, 63, 63, 63, 63, 63, 62, 62,
+ 62, 62, 62, 62,
+ /* Size 32 */
+ 68, 69, 69, 70, 71, 69, 68, 67, 66, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 69, 69, 69, 69,
+ 70, 69, 68, 67, 66, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 64, 64,
+ 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 69, 69, 69, 69, 69, 68, 67, 66,
+ 65, 66, 66, 66, 66, 66, 65, 65, 65, 65, 65, 65, 65, 65, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 70, 69, 69, 68, 68, 67, 67, 66, 65, 66, 66, 66,
+ 66, 66, 66, 66, 66, 65, 65, 65, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64,
+ 64, 64, 71, 70, 69, 68, 67, 67, 66, 66, 65, 66, 66, 66, 66, 66, 66, 66,
+ 66, 66, 66, 65, 65, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 64, 69, 69,
+ 68, 67, 67, 66, 66, 66, 65, 65, 65, 66, 66, 66, 66, 66, 66, 65, 65, 65,
+ 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 64, 64, 68, 68, 67, 67, 66, 66,
+ 66, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 67, 67, 66, 66, 66, 66, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 66, 66, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
+ 65, 65, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 65, 65, 66, 66, 66, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 66, 66,
+ 66, 65, 65, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 65, 65, 66, 66, 66, 66, 65, 65,
+ 65, 65, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 63, 63, 63, 63, 63, 63, 65, 65, 66, 66, 66, 66, 65, 65, 65, 65, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63,
+ 63, 63, 65, 65, 66, 66, 66, 66, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 65, 65,
+ 65, 66, 66, 66, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 65, 65, 65, 66, 66, 66,
+ 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 65, 65, 65, 66, 66, 66, 65, 65, 65, 65,
+ 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 65, 65, 65, 65, 66, 65, 65, 65, 65, 64, 64, 64, 64, 64,
+ 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63,
+ 65, 65, 65, 65, 66, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 65, 65, 65,
+ 65, 65, 65, 65, 65, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 64, 64, 65, 65, 65, 65, 65, 65,
+ 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 62, 62, 62, 62, 62, 64, 64, 65, 65, 65, 65, 65, 65, 64, 64, 64, 64,
+ 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62,
+ 62, 62, 64, 64, 64, 65, 65, 65, 65, 64, 64, 64, 64, 64, 64, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 64, 64,
+ 64, 65, 65, 65, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 64, 64, 64, 64, 65, 64,
+ 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62,
+ 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 64, 64,
+ 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 64, 64, 64, 64, 64, 64,
+ 64, 64, 63, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 62, 62, 62, 62, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62,
+ 62, 62, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62 } } },
+ { { /* Luma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 8 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 16 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64,
+ /* Size 32 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 8 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 16 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64,
+ /* Size 32 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+ { /* Chroma matrices */
+ { /* Inter matrices */
+ /* Size 4 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 8 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 16 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64,
+ /* Size 32 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { /* Intra matrices */
+ /* Size 4 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 8 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ /* Size 16 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64,
+ /* Size 32 */
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } } }
+};
+
+#endif
+
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+/* Quantization matrices for 8x8. For other block sizes, we currently just do
+ resampling. */
+/* Flat quantization, i.e. optimize for PSNR. */
+const int OD_QM8_Q4_FLAT[] = { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16 };
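
As the Q4 suffix suggests, these entries are 4-bit fixed point, so 16 represents a weight of exactly 1.0 and the flat matrix leaves every coefficient untouched. A minimal sketch of applying such a weight follows; the helper name and the round-to-nearest convention are assumptions for illustration, not taken from this file:

  /* Hypothetical helper: scale a coefficient by a Q4 QM weight,
     rounding to nearest. With the flat matrix (all 16s) this is
     the identity. */
  static int apply_qm_q4(int coeff, int qm_q4) {
    return (coeff * qm_q4 + 8) >> 4;
  }
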
+#if 0
+/* M1: MPEG2 matrix for inter (which has a dead zone). */
+const int OD_QM8_Q4[] = {
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 18, 19, 20, 21, 22, 23, 24, 25,
+ 19, 20, 21, 22, 23, 24, 26, 27,
+ 20, 21, 22, 23, 25, 26, 27, 28,
+ 21, 22, 23, 24, 26, 27, 28, 30,
+ 22, 23, 24, 26, 27, 28, 30, 31,
+ 23, 24, 25, 27, 28, 30, 31, 33};
+#endif
+#if 0
+/* M2: MPEG2 matrix for intra (no dead zone). */
+const int OD_QM8_Q4[] = {
+ 16, 16, 19, 22, 22, 26, 26, 27,
+ 16, 16, 22, 22, 26, 27, 27, 29,
+ 19, 22, 26, 26, 27, 29, 29, 35,
+ 22, 24, 27, 27, 29, 32, 34, 38,
+ 26, 27, 29, 29, 32, 35, 38, 46,
+ 27, 29, 34, 34, 35, 40, 46, 56,
+ 29, 34, 34, 37, 40, 48, 56, 69,
+ 34, 37, 38, 40, 48, 58, 69, 83
+};
+#endif
+#if 0
+/* M3: Taken from dump_psnrhvs. */
+const int OD_QM8_Q4[] = {
+ 16, 16, 17, 20, 24, 29, 36, 42,
+ 16, 17, 17, 19, 22, 26, 31, 37,
+ 17, 17, 21, 23, 26, 30, 34, 40,
+ 20, 19, 23, 28, 31, 35, 39, 45,
+ 24, 22, 26, 31, 36, 41, 46, 51,
+ 29, 26, 30, 35, 41, 47, 52, 58,
+ 36, 31, 34, 39, 46, 52, 59, 66,
+ 42, 37, 40, 45, 51, 58, 66, 73
+};
+#endif
+#if 1
+/* M4: a compromise equal to 0.5*(M3 + 0.5*(M2 + transpose(M2))). */
+const int OD_QM8_Q4_HVS[] = { 16, 16, 18, 21, 24, 28, 32, 36, 16, 17, 20,
+ 21, 24, 27, 31, 35, 18, 20, 24, 25, 27, 31,
+ 33, 38, 21, 21, 25, 28, 30, 34, 37, 42, 24,
+ 24, 27, 30, 34, 38, 43, 49, 28, 27, 31, 34,
+ 38, 44, 50, 58, 32, 31, 33, 37, 43, 50, 58,
+ 68, 36, 35, 38, 42, 49, 58, 68, 78 };
+#endif
+#endif
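
The M4 comment can be checked numerically against M2 and M3 above; for instance row 0, column 2 gives 0.5*(17 + 0.5*(19 + 19)) = 18, which matches the third entry of OD_QM8_Q4_HVS. A minimal sketch of the recomputation (the round-to-nearest choice is an assumption, so isolated entries may differ by 1):

  #include <math.h>

  /* Recompute M4 = 0.5*(M3 + 0.5*(M2 + transpose(M2))) from 8x8
     inputs, rounding to the nearest integer. */
  static void compute_m4(const int m2[8][8], const int m3[8][8],
                         int m4[8][8]) {
    for (int i = 0; i < 8; i++)
      for (int j = 0; j < 8; j++)
        m4[i][j] =
            (int)lrint(0.5 * (m3[i][j] + 0.5 * (m2[i][j] + m2[j][i])));
  }
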
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
new file mode 100644
index 000000000..3f442427d
--- /dev/null
+++ b/third_party/aom/av1/common/quant_common.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_QUANT_COMMON_H_
+#define AV1_COMMON_QUANT_COMMON_H_
+
+#include "aom/aom_codec.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/entropy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 255
+#define QINDEX_RANGE (MAXQ - MINQ + 1)
+#define QINDEX_BITS 8
+#if CONFIG_AOM_QM
+// Total number of QM sets stored
+#define QM_LEVEL_BITS 4
+#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
+/* Offset into the list of QMs. The actual number of levels used is
+   (NUM_QM_LEVELS - DEFAULT_QM_FIRST); a lower value of DEFAULT_QM_FIRST
+   selects more heavily weighted matrices. */
+#define DEFAULT_QM_FIRST (NUM_QM_LEVELS / 2)
+#define DEFAULT_QM_LAST (NUM_QM_LEVELS - 1)
+#endif
+
+struct AV1Common;
+
+int16_t av1_dc_quant(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_qindex_from_ac(int ac, aom_bit_depth_t bit_depth);
+
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+ int base_qindex);
+#if CONFIG_AOM_QM
+// Reduce the large number of quantizers to a smaller number of levels,
+// for which different matrices may be defined.
+static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
+ int qmlevel = (qindex * (last + 1 - first) + QINDEX_RANGE / 2) / QINDEX_RANGE;
+ qmlevel = AOMMIN(qmlevel + first, NUM_QM_LEVELS - 1);
+ return qmlevel;
+}
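
As a usage sketch (not code from this header): with the defaults above, NUM_QM_LEVELS is 16, so DEFAULT_QM_FIRST is 8 and DEFAULT_QM_LAST is 15, and a mid-range qindex maps as follows.

  int level = aom_get_qmlevel(128, DEFAULT_QM_FIRST, DEFAULT_QM_LAST);
  /* (128 * (15 + 1 - 8) + 256 / 2) / 256 == 4, then 4 + 8 == 12,
     well under the NUM_QM_LEVELS - 1 clamp, so level == 12. */
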
+void aom_qm_init(struct AV1Common *cm);
+qm_val_t *aom_iqmatrix(struct AV1Common *cm, int qindex, int comp,
+ int log2sizem2, int is_intra);
+qm_val_t *aom_qmatrix(struct AV1Common *cm, int qindex, int comp,
+ int log2sizem2, int is_intra);
+#endif
+
+#if CONFIG_NEW_QUANT
+
+#define QUANT_PROFILES 4
+#define QUANT_RANGES 2
+#define NUQ_KNOTS 3
+
+typedef tran_low_t dequant_val_type_nuq[NUQ_KNOTS + 1];
+typedef tran_low_t cuml_bins_type_nuq[NUQ_KNOTS];
+void av1_get_dequant_val_nuq(int q, int band, tran_low_t *dq,
+ tran_low_t *cuml_bins, int dq_off_index);
+tran_low_t av1_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq);
+tran_low_t av1_dequant_coeff_nuq(int v, int q, const tran_low_t *dq);
+
+static INLINE int qindex_to_qrange(int qindex) {
+ return (qindex < 140 ? 1 : 0);
+}
+
+static INLINE int get_dq_profile_from_ctx(int qindex, int q_ctx, int is_inter,
+ PLANE_TYPE plane_type) {
+ // intra/inter, Y/UV, ctx, qrange
+ static const int
+ def_dq_profile_lookup[REF_TYPES][PLANE_TYPES][COEFF_CONTEXTS0]
+ [QUANT_RANGES] = {
+ {
+ // intra
+ { { 2, 1 }, { 2, 1 }, { 2, 1 } }, // Y
+ { { 3, 1 }, { 3, 1 }, { 3, 1 } }, // UV
+ },
+ {
+ // inter
+ { { 3, 1 }, { 2, 1 }, { 2, 1 } }, // Y
+ { { 3, 1 }, { 3, 1 }, { 3, 1 } }, // UV
+ },
+ };
+ if (!qindex) return 0; // lossless
+ return def_dq_profile_lookup[is_inter][plane_type][q_ctx]
+ [qindex_to_qrange(qindex)];
+}
+#endif // CONFIG_NEW_QUANT
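
Reading the lookup table above: for a luma intra block (is_inter == 0, PLANE_TYPE_Y) with q_ctx == 0, the profile is 2 in the high-quantizer range and 1 in the low one, and qindex == 0 short-circuits to the lossless profile 0. A small sketch with illustrative call values, not code from this header:

  int p_hi = get_dq_profile_from_ctx(200, 0, 0, PLANE_TYPE_Y);
  /* qindex 200 -> qrange 0 -> profile 2 */
  int p_lo = get_dq_profile_from_ctx(100, 0, 0, PLANE_TYPE_Y);
  /* qindex 100 -> qrange 1 -> profile 1 */
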
+
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+extern const int OD_QM8_Q4_FLAT[];
+extern const int OD_QM8_Q4_HVS[];
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_QUANT_COMMON_H_
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
new file mode 100644
index 000000000..ed7065757
--- /dev/null
+++ b/third_party/aom/av1/common/reconinter.c
@@ -0,0 +1,3083 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "./aom_scale_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#if CONFIG_MOTION_VAR
+#include "av1/common/onyxc_int.h"
+#endif // CONFIG_MOTION_VAR
+
+#if CONFIG_EXT_INTER
+
+#define NSMOOTHERS 1
+
+// [smoother][negative][direction]
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_mask_obl[NSMOOTHERS][2][WEDGE_DIRECTIONS]
+ [MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
+
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_signflip_lookup[BLOCK_SIZES][MAX_WEDGE_TYPES]);
+
+// 3 * MAX_WEDGE_SQUARE is an easy-to-compute and fairly tight upper bound
+// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_mask_buf[2 * MAX_WEDGE_TYPES * 3 * MAX_WEDGE_SQUARE]);
+
+static wedge_masks_type wedge_masks[BLOCK_SIZES][2];
+
+// Some unused wedge codebooks are left here temporarily to facilitate
+// experiments. To be removed once the design settles.
+/*
+static wedge_code_type wedge_codebook_8_hgtw[8] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+};
+
+static wedge_code_type wedge_codebook_8_hltw[8] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static wedge_code_type wedge_codebook_8_heqw[8] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_32_hgtw[32] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 1 }, { WEDGE_OBLIQUE27, 4, 2 },
+ { WEDGE_OBLIQUE27, 4, 3 }, { WEDGE_OBLIQUE27, 4, 5 },
+ { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE27, 4, 7 },
+ { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
+ { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
+ { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
+ { WEDGE_OBLIQUE63, 1, 4 }, { WEDGE_OBLIQUE63, 2, 4 },
+ { WEDGE_OBLIQUE63, 3, 4 }, { WEDGE_OBLIQUE63, 5, 4 },
+ { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE63, 7, 4 },
+ { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
+ { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
+ { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
+};
+
+static const wedge_code_type wedge_codebook_32_hltw[32] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 1 }, { WEDGE_OBLIQUE27, 4, 2 },
+ { WEDGE_OBLIQUE27, 4, 3 }, { WEDGE_OBLIQUE27, 4, 5 },
+ { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE27, 4, 7 },
+ { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
+ { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
+ { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
+ { WEDGE_OBLIQUE63, 1, 4 }, { WEDGE_OBLIQUE63, 2, 4 },
+ { WEDGE_OBLIQUE63, 3, 4 }, { WEDGE_OBLIQUE63, 5, 4 },
+ { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE63, 7, 4 },
+ { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
+ { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
+ { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
+};
+
+static const wedge_code_type wedge_codebook_32_heqw[32] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
+ { WEDGE_OBLIQUE27, 4, 1 }, { WEDGE_OBLIQUE27, 4, 2 },
+ { WEDGE_OBLIQUE27, 4, 3 }, { WEDGE_OBLIQUE27, 4, 5 },
+ { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE27, 4, 7 },
+ { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
+ { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
+ { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
+ { WEDGE_OBLIQUE63, 1, 4 }, { WEDGE_OBLIQUE63, 2, 4 },
+ { WEDGE_OBLIQUE63, 3, 4 }, { WEDGE_OBLIQUE63, 5, 4 },
+ { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE63, 7, 4 },
+ { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
+ { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
+ { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
+};
+*/
+
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+const wedge_params_type wedge_params_lookup[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+#endif // CONFIG_CB4X4
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+#if CONFIG_WEDGE
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], 0,
+ wedge_masks[BLOCK_8X8] },
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], 0,
+ wedge_masks[BLOCK_8X16] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], 0,
+ wedge_masks[BLOCK_16X8] },
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], 0,
+ wedge_masks[BLOCK_16X16] },
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], 0,
+ wedge_masks[BLOCK_16X32] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], 0,
+ wedge_masks[BLOCK_32X16] },
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], 0,
+ wedge_masks[BLOCK_32X32] },
+ { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_32X64], 0,
+ wedge_masks[BLOCK_32X64] },
+ { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_64X32], 0,
+ wedge_masks[BLOCK_64X32] },
+ { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_64X64], 0,
+ wedge_masks[BLOCK_64X64] },
+#else
+ { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], 0,
+ wedge_masks[BLOCK_8X8] },
+ { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], 0,
+ wedge_masks[BLOCK_8X16] },
+ { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], 0,
+ wedge_masks[BLOCK_16X8] },
+ { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], 0,
+ wedge_masks[BLOCK_16X16] },
+ { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], 0,
+ wedge_masks[BLOCK_16X32] },
+ { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], 0,
+ wedge_masks[BLOCK_32X16] },
+ { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], 0,
+ wedge_masks[BLOCK_32X32] },
+ { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_32X64], 0,
+ wedge_masks[BLOCK_32X64] },
+ { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_64X32], 0,
+ wedge_masks[BLOCK_64X32] },
+ { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_64X64], 0,
+ wedge_masks[BLOCK_64X64] },
+#endif // CONFIG_WEDGE
+#if CONFIG_EXT_PARTITION
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+#endif // CONFIG_EXT_PARTITION
+};
+
+static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
+ BLOCK_SIZE sb_type) {
+ const uint8_t *master;
+ const int bh = block_size_high[sb_type];
+ const int bw = block_size_wide[sb_type];
+ const wedge_code_type *a =
+ wedge_params_lookup[sb_type].codebook + wedge_index;
+ const int smoother = wedge_params_lookup[sb_type].smoother;
+ int woff, hoff;
+ const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index];
+
+ assert(wedge_index >= 0 &&
+ wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
+ woff = (a->x_offset * bw) >> 3;
+ hoff = (a->y_offset * bh) >> 3;
+ master = wedge_mask_obl[smoother][neg ^ wsignflip][a->direction] +
+ MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
+ MASK_MASTER_SIZE / 2 - woff;
+ return master;
+}
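
The codebook x/y offsets are in eighths of the block dimensions. A worked instance of the arithmetic above, with MASK_MASTER_SIZE == 64: for BLOCK_32X32 and a codebook entry { WEDGE_OBLIQUE27, 4, 6 },

  woff = (4 * 32) >> 3;  /* == 16: horizontally centered */
  hoff = (6 * 32) >> 3;  /* == 24: three quarters of the way down */
  /* The returned pointer then reads the 64x64 master mask starting at
     row 32 - 24 == 8 and column 32 - 16 == 16. */
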
+
+const uint8_t *av1_get_soft_mask(int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type, int offset_x,
+ int offset_y) {
+ const uint8_t *mask =
+ get_wedge_mask_inplace(wedge_index, wedge_sign, sb_type);
+ if (mask) mask -= (offset_x + offset_y * MASK_MASTER_STRIDE);
+ return mask;
+}
+
+#if CONFIG_COMPOUND_SEGMENT
+static uint8_t *invert_mask(uint8_t *mask_inv_buffer, const uint8_t *const mask,
+ int h, int w, int stride) {
+ int i, j;
+
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ mask_inv_buffer[i * stride + j] =
+ AOM_BLEND_A64_MAX_ALPHA - mask[i * stride + j];
+ }
+ return mask_inv_buffer;
+}
+#endif // CONFIG_COMPOUND_SEGMENT
+
+const uint8_t *av1_get_compound_type_mask_inverse(
+ const INTERINTER_COMPOUND_DATA *const comp_data,
+#if CONFIG_COMPOUND_SEGMENT
+ uint8_t *mask_buffer, int h, int w, int stride,
+#endif
+ BLOCK_SIZE sb_type) {
+ assert(is_masked_compound_type(comp_data->interinter_compound_type));
+ (void)sb_type;
+ switch (comp_data->interinter_compound_type) {
+#if CONFIG_WEDGE
+ case COMPOUND_WEDGE:
+ return av1_get_contiguous_soft_mask(comp_data->wedge_index,
+ !comp_data->wedge_sign, sb_type);
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ case COMPOUND_SEG:
+ return invert_mask(mask_buffer, comp_data->seg_mask, h, w, stride);
+#endif // CONFIG_COMPOUND_SEGMENT
+ default: assert(0); return NULL;
+ }
+}
+
+const uint8_t *av1_get_compound_type_mask(
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) {
+ assert(is_masked_compound_type(comp_data->interinter_compound_type));
+ (void)sb_type;
+ switch (comp_data->interinter_compound_type) {
+#if CONFIG_WEDGE
+ case COMPOUND_WEDGE:
+ return av1_get_contiguous_soft_mask(comp_data->wedge_index,
+ comp_data->wedge_sign, sb_type);
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ case COMPOUND_SEG: return comp_data->seg_mask;
+#endif // CONFIG_COMPOUND_SEGMENT
+ default: assert(0); return NULL;
+ }
+}
+
+#if CONFIG_COMPOUND_SEGMENT
+#if COMPOUND_SEGMENT_TYPE == 0
+static void uniform_mask(uint8_t *mask, int which_inverse, BLOCK_SIZE sb_type,
+ int h, int w, int mask_val) {
+ int i, j;
+ int block_stride = block_size_wide[sb_type];
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ mask[i * block_stride + j] =
+ which_inverse ? AOM_BLEND_A64_MAX_ALPHA - mask_val : mask_val;
+ }
+}
+
+void build_compound_seg_mask(uint8_t *mask, SEG_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ BLOCK_SIZE sb_type, int h, int w) {
+ (void)src0;
+ (void)src1;
+ (void)src0_stride;
+ (void)src1_stride;
+ switch (mask_type) {
+ case UNIFORM_45: uniform_mask(mask, 0, sb_type, h, w, 45); break;
+ case UNIFORM_45_INV: uniform_mask(mask, 1, sb_type, h, w, 45); break;
+ default: assert(0);
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ BLOCK_SIZE sb_type, int h, int w, int bd) {
+ (void)src0;
+ (void)src1;
+ (void)src0_stride;
+ (void)src1_stride;
+ (void)bd;
+ switch (mask_type) {
+ case UNIFORM_45: uniform_mask(mask, 0, sb_type, h, w, 45); break;
+ case UNIFORM_45_INV: uniform_mask(mask, 1, sb_type, h, w, 45); break;
+ default: assert(0);
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+#elif COMPOUND_SEGMENT_TYPE == 1
+#define DIFF_FACTOR 16
+static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ BLOCK_SIZE sb_type, int h, int w) {
+ int i, j, m, diff;
+ int block_stride = block_size_wide[sb_type];
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ diff =
+ abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
+ m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
+ mask[i * block_stride + j] =
+ which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
+ }
+ }
+}
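
A worked instance of the mask computation above, with mask_base == 42 and DIFF_FACTOR == 16 as used below (AOM_BLEND_A64_MAX_ALPHA is 64):

  /* |src0 - src1| == 60  ->  m = clamp(42 + 60 / 16, 0, 64) == 45
     |src0 - src1| ==  0  ->  m = 42
     Larger disagreement between the two predictors therefore shifts
     the blend weight toward src0 (or src1 when which_inverse is set). */
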
+
+void build_compound_seg_mask(uint8_t *mask, SEG_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ BLOCK_SIZE sb_type, int h, int w) {
+ switch (mask_type) {
+ case DIFFWTD_42:
+ diffwtd_mask(mask, 0, 42, src0, src0_stride, src1, src1_stride, sb_type,
+ h, w);
+ break;
+ case DIFFWTD_42_INV:
+ diffwtd_mask(mask, 1, 42, src0, src0_stride, src1, src1_stride, sb_type,
+ h, w);
+ break;
+ default: assert(0);
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+static void diffwtd_mask_highbd(uint8_t *mask, int which_inverse, int mask_base,
+ const uint16_t *src0, int src0_stride,
+ const uint16_t *src1, int src1_stride,
+ BLOCK_SIZE sb_type, int h, int w, int bd) {
+ int i, j, m, diff;
+ int block_stride = block_size_wide[sb_type];
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ diff = abs((int)src0[i * src0_stride + j] -
+ (int)src1[i * src1_stride + j]) >>
+ (bd - 8);
+ m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
+ mask[i * block_stride + j] =
+ which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
+ }
+ }
+}
+
+void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ BLOCK_SIZE sb_type, int h, int w, int bd) {
+ switch (mask_type) {
+ case DIFFWTD_42:
+ diffwtd_mask_highbd(mask, 0, 42, CONVERT_TO_SHORTPTR(src0), src0_stride,
+ CONVERT_TO_SHORTPTR(src1), src1_stride, sb_type, h, w,
+ bd);
+ break;
+ case DIFFWTD_42_INV:
+ diffwtd_mask_highbd(mask, 1, 42, CONVERT_TO_SHORTPTR(src0), src0_stride,
+ CONVERT_TO_SHORTPTR(src1), src1_stride, sb_type, h, w,
+ bd);
+ break;
+ default: assert(0);
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+#endif // COMPOUND_SEGMENT_TYPE
+#endif // CONFIG_COMPOUND_SEGMENT
+
+#if MASK_MASTER_SIZE == 64
+static const uint8_t wedge_master_oblique_odd[NSMOOTHERS][MASK_MASTER_SIZE] = {
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18,
+ 37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ }
+};
+static const uint8_t wedge_master_oblique_even[NSMOOTHERS][MASK_MASTER_SIZE] = {
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27,
+ 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ }
+};
+static const uint8_t wedge_master_vertical[NSMOOTHERS][MASK_MASTER_SIZE] = { {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21,
+ 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+} };
+
+static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) {
+ if (shift >= 0) {
+ memcpy(dst + shift, src, width - shift);
+ memset(dst, src[0], shift);
+ } else {
+ shift = -shift;
+ memcpy(dst, src + shift, width - shift);
+ memset(dst + width - shift, src[width - 1], shift);
+ }
+}
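
A worked instance of shift_copy, which shifts a row while replicating the edge samples: with src = {0, 1, 2, 3} and width == 4,

  /* shift == +1  ->  dst = {0, 0, 1, 2}  (src[0] replicated at the front)
     shift == -1  ->  dst = {1, 2, 3, 3}  (src[3] replicated at the back)
     This is how the oblique master rows are staggered line by line. */
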
+#else
+static const double smoother_param[NSMOOTHERS] = { 2.83 };
+#endif // MASK_MASTER_SIZE == 64
+
+static void init_wedge_master_masks() {
+ int i, j, s;
+ const int w = MASK_MASTER_SIZE;
+ const int h = MASK_MASTER_SIZE;
+ const int stride = MASK_MASTER_STRIDE;
+ for (s = 0; s < NSMOOTHERS; s++) {
+#if MASK_MASTER_SIZE == 64
+ // Generate prototype by shifting the masters
+ int shift = h / 4;
+ for (i = 0; i < h; i += 2) {
+ shift_copy(wedge_master_oblique_even[s],
+ &wedge_mask_obl[s][1][WEDGE_OBLIQUE63][i * stride], shift,
+ MASK_MASTER_SIZE);
+ shift--;
+ shift_copy(wedge_master_oblique_odd[s],
+ &wedge_mask_obl[s][1][WEDGE_OBLIQUE63][(i + 1) * stride],
+ shift, MASK_MASTER_SIZE);
+ memcpy(&wedge_mask_obl[s][1][WEDGE_VERTICAL][i * stride],
+ wedge_master_vertical[s],
+ MASK_MASTER_SIZE * sizeof(wedge_master_vertical[s][0]));
+ memcpy(&wedge_mask_obl[s][1][WEDGE_VERTICAL][(i + 1) * stride],
+ wedge_master_vertical[s],
+ MASK_MASTER_SIZE * sizeof(wedge_master_vertical[s][0]));
+ }
+#else
+ const int a[2] = { 2, 1 };
+ const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; ++j) {
+ int x = (2 * j + 1 - w);
+ int y = (2 * i + 1 - h);
+ double d = (a[0] * x + a[1] * y) / asqrt;
+ const int msk = (int)rint((1.0 + tanh(d / smoother_param[s])) * 32);
+ wedge_mask_obl[s][1][WEDGE_OBLIQUE63][i * stride + j] = msk;
+ const int mskx = (int)rint((1.0 + tanh(x / smoother_param[s])) * 32);
+ wedge_mask_obl[s][1][WEDGE_VERTICAL][i * stride + j] = mskx;
+ }
+ }
+#endif // MASK_MASTER_SIZE == 64
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int msk = wedge_mask_obl[s][1][WEDGE_OBLIQUE63][i * stride + j];
+ wedge_mask_obl[s][1][WEDGE_OBLIQUE27][j * stride + i] = msk;
+ wedge_mask_obl[s][1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+ wedge_mask_obl[s][1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - msk;
+ wedge_mask_obl[s][0][WEDGE_OBLIQUE63][i * stride + j] =
+ wedge_mask_obl[s][0][WEDGE_OBLIQUE27][j * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - msk;
+ wedge_mask_obl[s][0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+ wedge_mask_obl[s][0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+ msk;
+ const int mskx = wedge_mask_obl[s][1][WEDGE_VERTICAL][i * stride + j];
+ wedge_mask_obl[s][1][WEDGE_HORIZONTAL][j * stride + i] = mskx;
+ wedge_mask_obl[s][0][WEDGE_VERTICAL][i * stride + j] =
+ wedge_mask_obl[s][0][WEDGE_HORIZONTAL][j * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - mskx;
+ }
+ }
+ }
+}
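
The second loop derives every orientation from the single WEDGE_OBLIQUE63 prototype. With m the prototype and W == (1 << WEDGE_WEIGHT_BITS) (64, assuming WEDGE_WEIGHT_BITS is 6 as elsewhere in this import), the relations it implements are:

  /* OBLIQUE27[j][i]           == m[i][j]       (transpose)
     OBLIQUE117[i][w - 1 - j]  == W - m[i][j]   (mirror + complement)
     OBLIQUE153[w - 1 - j][i]  == W - m[i][j]
     and each neg == 0 mask is the W-complement of its neg == 1 mask. */
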
+
+// If the signs of the wedges for the various block sizes are
+// inconsistent, flip the sign flag. Do this only once for every
+// wedge codebook.
+static void init_wedge_signs() {
+ BLOCK_SIZE sb_type;
+ memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
+ for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES; ++sb_type) {
+ const int bw = block_size_wide[sb_type];
+ const int bh = block_size_high[sb_type];
+ const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
+ const int wbits = wedge_params.bits;
+ const int wtypes = 1 << wbits;
+ int i, w;
+ if (wbits == 0) continue;
+ for (w = 0; w < wtypes; ++w) {
+ const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
+ int sum = 0;
+ for (i = 0; i < bw; ++i) sum += mask[i];
+ for (i = 0; i < bh; ++i) sum += mask[i * MASK_MASTER_STRIDE];
+ sum = (sum + (bw + bh) / 2) / (bw + bh);
+ wedge_params.signflip[w] = (sum < 32);
+ }
+ }
+}
+
+static void init_wedge_masks() {
+ uint8_t *dst = wedge_mask_buf;
+ BLOCK_SIZE bsize;
+ memset(wedge_masks, 0, sizeof(wedge_masks));
+ for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES; ++bsize) {
+ const uint8_t *mask;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const wedge_params_type *wedge_params = &wedge_params_lookup[bsize];
+ const int wbits = wedge_params->bits;
+ const int wtypes = 1 << wbits;
+ int w;
+ if (wbits == 0) continue;
+ for (w = 0; w < wtypes; ++w) {
+ mask = get_wedge_mask_inplace(w, 0, bsize);
+ aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
+ bh);
+ wedge_params->masks[0][w] = dst;
+ dst += bw * bh;
+
+ mask = get_wedge_mask_inplace(w, 1, bsize);
+ aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
+ bh);
+ wedge_params->masks[1][w] = dst;
+ dst += bw * bh;
+ }
+ assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf));
+ }
+}
+
+// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
+void av1_init_wedge_masks() {
+ init_wedge_master_masks();
+ init_wedge_signs();
+ init_wedge_masks();
+}
+
+#if CONFIG_SUPERTX
+static void build_masked_compound_wedge_extend(
+ uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type,
+ int wedge_offset_x, int wedge_offset_y, int h, int w) {
+ const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+ const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+ const uint8_t *mask;
+ size_t mask_stride;
+ switch (comp_data->interinter_compound_type) {
+ case COMPOUND_WEDGE:
+ mask = av1_get_soft_mask(comp_data->wedge_index, comp_data->wedge_sign,
+ sb_type, wedge_offset_x, wedge_offset_y);
+ mask_stride = MASK_MASTER_STRIDE;
+ break;
+#if CONFIG_COMPOUND_SEGMENT
+ case COMPOUND_SEG:
+ mask = comp_data->seg_mask;
+ mask_stride = block_size_wide[sb_type];
+ break;
+#endif
+ default: assert(0); return;
+ }
+ aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, (int)mask_stride, h, w, subh, subw);
+}
+
+#if CONFIG_HIGHBITDEPTH
+static void build_masked_compound_wedge_extend_highbd(
+ uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type,
+ int wedge_offset_x, int wedge_offset_y, int h, int w, int bd) {
+ const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+ const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+ const uint8_t *mask;
+ size_t mask_stride;
+ switch (comp_data->interinter_compound_type) {
+ case COMPOUND_WEDGE:
+ mask = av1_get_soft_mask(comp_data->wedge_index, comp_data->wedge_sign,
+ sb_type, wedge_offset_x, wedge_offset_y);
+ mask_stride = MASK_MASTER_STRIDE;
+ break;
+#if CONFIG_COMPOUND_SEGMENT
+ case COMPOUND_SEG:
+ mask = comp_data->seg_mask;
+ mask_stride = block_size_wide[sb_type];
+ break;
+#endif
+ default: assert(0); return;
+ }
+ aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, (int)mask_stride, h, w, subh,
+ subw, bd);
+}
+#endif // CONFIG_HIGHBITDEPTH
+#else
+static void build_masked_compound(
+ uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+ const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, block_size_wide[sb_type], h, w, subh, subw);
+}
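
aom_blend_a64_mask combines the two predictors per pixel; its exact rounding lives in aom_dsp rather than in this file, but the intended per-pixel operation is the one sketched below (the +32 rounding term is an assumption):

  /* dst = (mask * src0 + (64 - mask) * src1 + 32) >> 6
     so mask == 64 selects src0 outright and mask == 0 selects src1. */
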
+
+#if CONFIG_HIGHBITDEPTH
+static void build_masked_compound_highbd(
+ uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w, int bd) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+ const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ // const uint8_t *mask =
+ // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
+ aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, block_size_wide[sb_type], h, w,
+ subh, subw, bd);
+}
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_SUPERTX
+
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+ uint8_t *dst, int dst_stride,
+ const int subpel_x, const int subpel_y,
+ const struct scale_factors *sf, int w,
+ int h,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ int xs, int ys,
+#if CONFIG_SUPERTX
+ int wedge_offset_x, int wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ int plane,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ const WarpTypesAllowed *warp_types,
+ int p_col, int p_row, int ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ MACROBLOCKD *xd) {
+ MODE_INFO *mi = xd->mi[0];
+ const INTERINTER_COMPOUND_DATA comp_data = {
+#if CONFIG_WEDGE
+ mi->mbmi.wedge_index,
+ mi->mbmi.wedge_sign,
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ mi->mbmi.mask_type,
+ xd->seg_mask,
+#endif // CONFIG_COMPOUND_SEGMENT
+ mi->mbmi.interinter_compound_type
+ };
+// The prediction filter types used here should be those for
+// the second reference block.
+#if CONFIG_DUAL_FILTER
+ InterpFilter tmp_ipf[4] = {
+ interp_filter[2], interp_filter[3], interp_filter[2], interp_filter[3],
+ };
+#else
+ InterpFilter tmp_ipf = interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ ConvolveParams conv_params = get_conv_params(0, plane);
+
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint8_t, tmp_dst_[2 * MAX_SB_SQUARE]);
+ uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? CONVERT_TO_BYTEPTR(tmp_dst_)
+ : tmp_dst_;
+ av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x,
+ subpel_y, sf, w, h, &conv_params, tmp_ipf,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ warp_types, p_col, p_row, plane, ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif
+ xs, ys, xd);
+#if CONFIG_COMPOUND_SEGMENT
+ if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_compound_seg_mask_highbd(comp_data.seg_mask, comp_data.mask_type,
+ dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+ mi->mbmi.sb_type, h, w, xd->bd);
+ else
+ build_compound_seg_mask(comp_data.seg_mask, comp_data.mask_type, dst,
+ dst_stride, tmp_dst, MAX_SB_SIZE,
+ mi->mbmi.sb_type, h, w);
+ }
+#endif // CONFIG_COMPOUND_SEGMENT
+
+#if CONFIG_SUPERTX
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_masked_compound_wedge_extend_highbd(
+ dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE, &comp_data,
+ mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w, xd->bd);
+ else
+ build_masked_compound_wedge_extend(
+ dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE, &comp_data,
+ mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w);
+#else
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_masked_compound_highbd(dst, dst_stride, dst, dst_stride, tmp_dst,
+ MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h,
+ w, xd->bd);
+ else
+ build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst,
+ MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h, w);
+#endif // CONFIG_SUPERTX
+
+#else // CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint8_t, tmp_dst[MAX_SB_SQUARE]);
+ av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x,
+ subpel_y, sf, w, h, &conv_params, tmp_ipf,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ warp_types, p_col, p_row, plane, ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif
+ xs, ys, xd);
+#if CONFIG_COMPOUND_SEGMENT
+ if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG)
+ build_compound_seg_mask(comp_data.seg_mask, comp_data.mask_type, dst,
+ dst_stride, tmp_dst, MAX_SB_SIZE, mi->mbmi.sb_type,
+ h, w);
+#endif // CONFIG_COMPOUND_SEGMENT
+#if CONFIG_SUPERTX
+ build_masked_compound_wedge_extend(dst, dst_stride, dst, dst_stride, tmp_dst,
+ MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type,
+ wedge_offset_x, wedge_offset_y, h, w);
+#else
+ build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+ &comp_data, mi->mbmi.sb_type, h, w);
+#endif // CONFIG_SUPERTX
+#endif // CONFIG_HIGHBITDEPTH
+#if CONFIG_COMPOUND_SEGMENT
+ (void)plane;
+#endif // CONFIG_COMPOUND_SEGMENT
+}
+#endif // CONFIG_EXT_INTER
+
+// TODO(sarahparker) av1_highbd_build_inter_predictor and
+// av1_build_inter_predictor should be combined with
+// av1_make_inter_predictor
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_build_inter_predictor(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+ const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ const WarpTypesAllowed *warp_types, int p_col, int p_row,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ int plane, enum mv_precision precision, int x, int y,
+ const MACROBLOCKD *xd) {
+ const int is_q4 = precision == MV_PRECISION_Q4;
+ const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+ is_q4 ? src_mv->col : src_mv->col * 2 };
+ MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
+ const int subpel_x = mv.col & SUBPEL_MASK;
+ const int subpel_y = mv.row & SUBPEL_MASK;
+ ConvolveParams conv_params = get_conv_params(ref, plane);
+
+ src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+ sf, w, h, &conv_params, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ warp_types, p_col, p_row, plane, ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif
+ sf->x_step_q4, sf->y_step_q4, xd);
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const MV *src_mv,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ enum mv_precision precision, int x, int y,
+ const MACROBLOCKD *xd) {
+ const int is_q4 = precision == MV_PRECISION_Q4;
+ const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+ is_q4 ? src_mv->col : src_mv->col * 2 };
+ MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
+ const int subpel_x = mv.col & SUBPEL_MASK;
+ const int subpel_y = mv.row & SUBPEL_MASK;
+
+ src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+ sf, w, h, conv_params, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ warp_types, p_col, p_row, plane, ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif
+ sf->x_step_q4, sf->y_step_q4, xd);
+}
+
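+// Per-reference subpel state collected while setting up prediction: xs/ys are
+// the horizontal/vertical step sizes in 1/16-pel units (16 when the reference
+// frame is unscaled) and subpel_x/subpel_y hold the 1/16-pel phase of the
+// (possibly scaled) motion vector.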
+typedef struct SubpelParams {
+ int xs;
+ int ys;
+ int subpel_x;
+ int subpel_y;
+} SubpelParams;
+
+void build_inter_predictors(MACROBLOCKD *xd, int plane,
+#if CONFIG_MOTION_VAR
+ int mi_col_offset, int mi_row_offset,
+#endif // CONFIG_MOTION_VAR
+ int block, int bw, int bh, int x, int y, int w,
+ int h,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ int wedge_offset_x, int wedge_offset_y,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ int mi_x, int mi_y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_MOTION_VAR
+ const MODE_INFO *mi = xd->mi[mi_col_offset + xd->mi_stride * mi_row_offset];
+#if !CONFIG_CB4X4 || CONFIG_SUB8X8_MC
+ const int build_for_obmc = !(mi_col_offset == 0 && mi_row_offset == 0);
+#endif // !CONFIG_CB4X4 || CONFIG_SUB8X8_MC
+#else
+ const MODE_INFO *mi = xd->mi[0];
+#endif // CONFIG_MOTION_VAR
+ const int is_compound = has_second_ref(&mi->mbmi);
+ int ref;
+#if CONFIG_INTRABC
+ const int is_intrabc = is_intrabc_block(&mi->mbmi);
+ struct scale_factors sf_identity;
+#if CONFIG_HIGHBITDEPTH
+ av1_setup_scale_factors_for_frame(
+ &sf_identity, 64, 64, 64, 64,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+#else
+ av1_setup_scale_factors_for_frame(&sf_identity, 64, 64, 64, 64);
+#endif // CONFIG_HIGHBITDEPTH
+ assert(IMPLIES(is_intrabc, !is_compound));
+#endif // CONFIG_INTRABC
+#if CONFIG_GLOBAL_MOTION
+ int is_global[2];
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(mi, block, wm->wmtype);
+ }
+#endif // CONFIG_GLOBAL_MOTION
+
+#if CONFIG_CB4X4
+ (void)block;
+#endif
+
+#if CONFIG_SUB8X8_MC
+#if CONFIG_MOTION_VAR
+ if (mi->mbmi.sb_type < BLOCK_8X8 && plane > 0 && !build_for_obmc) {
+#else
+ if (mi->mbmi.sb_type < BLOCK_8X8 && plane > 0) {
+#endif // CONFIG_MOTION_VAR
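+    // Sub8x8 luma blocks carry per-4x4 MVs, but the chroma planes are not
+    // split below 8x8: build the chroma prediction by iterating over the 4x4
+    // luma sub-blocks and applying each sub-block's MV to the corresponding
+    // region of the chroma plane.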
+ // block size in log2
+ const int b4_wl = b_width_log2_lookup[mi->mbmi.sb_type];
+ const int b4_hl = b_height_log2_lookup[mi->mbmi.sb_type];
+ const int b8_sl = b_width_log2_lookup[BLOCK_8X8];
+
+ // block size
+ const int b4_w = 1 << b4_wl;
+ const int b4_h = 1 << b4_hl;
+ const int b8_s = 1 << b8_sl;
+ int idx, idy;
+
+ const int x_base = x;
+ const int y_base = y;
+
+ // processing unit size
+ const int x_step = w >> (b8_sl - b4_wl);
+ const int y_step = h >> (b8_sl - b4_hl);
+
+ for (idy = 0; idy < b8_s; idy += b4_h) {
+ for (idx = 0; idx < b8_s; idx += b4_w) {
+ const int chr_idx = (idy * 2) + idx;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ struct buf_2d *const dst_buf = &pd->dst;
+#if CONFIG_INTRABC
+ const struct scale_factors *const sf =
+ is_intrabc ? &sf_identity : &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+#else
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+#endif // CONFIG_INTRABC
+ uint8_t *dst = dst_buf->buf;
+ const MV mv = mi->bmi[chr_idx].as_mv[ref].as_mv;
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ uint8_t *pre;
+ MV32 scaled_mv;
+ int xs, ys, subpel_x, subpel_y;
+ const int is_scaled = av1_is_scaled(sf);
+ ConvolveParams conv_params = get_conv_params(ref, plane);
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ WarpTypesAllowed warp_types;
+#if CONFIG_GLOBAL_MOTION
+ warp_types.global_warp_allowed = is_global[ref];
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_WARPED_MOTION
+ warp_types.local_warp_allowed = mi->mbmi.motion_mode == WARPED_CAUSAL;
+#endif // CONFIG_WARPED_MOTION
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+ x = x_base + idx * x_step;
+ y = y_base + idy * y_step;
+
+ dst += dst_buf->stride * y + x;
+
+ if (is_scaled) {
+ pre =
+ pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+ scaled_mv = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ xs = sf->x_step_q4;
+ ys = sf->y_step_q4;
+ } else {
+ pre = pre_buf->buf + y * pre_buf->stride + x;
+ scaled_mv.row = mv_q4.row;
+ scaled_mv.col = mv_q4.col;
+ xs = ys = 16;
+ }
+
+ subpel_x = scaled_mv.col & SUBPEL_MASK;
+ subpel_y = scaled_mv.row & SUBPEL_MASK;
+ pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
+ (scaled_mv.col >> SUBPEL_BITS);
+
+#if CONFIG_EXT_INTER
+ if (ref && is_masked_compound_type(mi->mbmi.interinter_compound_type))
+ av1_make_masked_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
+ sf, w, h, mi->mbmi.interp_filter, xs, ys,
+#if CONFIG_SUPERTX
+ wedge_offset_x, wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ plane,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, (mi_x >> pd->subsampling_x) + x,
+ (mi_y >> pd->subsampling_y) + y, ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ xd);
+ else
+#endif // CONFIG_EXT_INTER
+ av1_make_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
+ sf, x_step, y_step, &conv_params, mi->mbmi.interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, (mi_x >> pd->subsampling_x) + x,
+ (mi_y >> pd->subsampling_y) + y, plane, ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR
+ mi_col_offset, mi_row_offset,
+#endif
+ xs, ys, xd);
+ }
+ }
+ }
+ return;
+ }
+#endif
+
+ {
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ uint8_t *pre[2];
+ MV32 scaled_mv[2];
+ SubpelParams subpel_params[2];
+#if CONFIG_CONVOLVE_ROUND
+ DECLARE_ALIGNED(16, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
+ av1_zero(tmp_dst);
+#endif // CONFIG_CONVOLVE_ROUND
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+#if CONFIG_INTRABC
+ const struct scale_factors *const sf =
+ is_intrabc ? &sf_identity : &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+#else
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+#endif // CONFIG_INTRABC
+#if CONFIG_CB4X4
+ const MV mv = mi->mbmi.mv[ref].as_mv;
+#else
+ const MV mv =
+#if CONFIG_MOTION_VAR
+ (mi->mbmi.sb_type < BLOCK_8X8 && !build_for_obmc)
+ ?
+#else
+ mi->mbmi.sb_type < BLOCK_8X8 ?
+#endif
+ average_split_mvs(pd, mi, ref, block)
+ : mi->mbmi.mv[ref].as_mv;
+#endif
+
+      // TODO(jkoleszar): This clamping is done in the incorrect place for the
+      // scaling case. It needs to be done on the scaled MV, not the
+      // pre-scaling MV. Note, however, that it performs subsampling-aware
+      // scaling, so the result is always in Q4 (MV_PRECISION_Q4) precision.
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+
+ const int is_scaled = av1_is_scaled(sf);
+ if (is_scaled) {
+ pre[ref] =
+ pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+ scaled_mv[ref] = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ subpel_params[ref].xs = sf->x_step_q4;
+ subpel_params[ref].ys = sf->y_step_q4;
+ } else {
+ pre[ref] = pre_buf->buf + (y * pre_buf->stride + x);
+ scaled_mv[ref].row = mv_q4.row;
+ scaled_mv[ref].col = mv_q4.col;
+ subpel_params[ref].xs = 16;
+ subpel_params[ref].ys = 16;
+ }
+
+ subpel_params[ref].subpel_x = scaled_mv[ref].col & SUBPEL_MASK;
+ subpel_params[ref].subpel_y = scaled_mv[ref].row & SUBPEL_MASK;
+ pre[ref] += (scaled_mv[ref].row >> SUBPEL_BITS) * pre_buf->stride +
+ (scaled_mv[ref].col >> SUBPEL_BITS);
+ }
+
+#if CONFIG_CONVOLVE_ROUND
+ ConvolveParams conv_params =
+ get_conv_params_no_round(ref, plane, tmp_dst, MAX_SB_SIZE);
+#else
+ ConvolveParams conv_params = get_conv_params(ref, plane);
+#endif // CONFIG_CONVOLVE_ROUND
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+#if CONFIG_INTRABC
+ const struct scale_factors *const sf =
+ is_intrabc ? &sf_identity : &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+#else
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+#endif // CONFIG_INTRABC
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ WarpTypesAllowed warp_types;
+#if CONFIG_GLOBAL_MOTION
+ warp_types.global_warp_allowed = is_global[ref];
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_WARPED_MOTION
+ warp_types.local_warp_allowed = mi->mbmi.motion_mode == WARPED_CAUSAL;
+#endif // CONFIG_WARPED_MOTION
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ conv_params.ref = ref;
+#if CONFIG_EXT_INTER
+ if (ref && is_masked_compound_type(mi->mbmi.interinter_compound_type))
+ av1_make_masked_inter_predictor(
+ pre[ref], pre_buf->stride, dst, dst_buf->stride,
+ subpel_params[ref].subpel_x, subpel_params[ref].subpel_y, sf, w, h,
+ mi->mbmi.interp_filter, subpel_params[ref].xs,
+ subpel_params[ref].ys,
+#if CONFIG_SUPERTX
+ wedge_offset_x, wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ plane,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, (mi_x >> pd->subsampling_x) + x,
+ (mi_y >> pd->subsampling_y) + y, ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ xd);
+ else
+#endif // CONFIG_EXT_INTER
+ av1_make_inter_predictor(
+ pre[ref], pre_buf->stride, dst, dst_buf->stride,
+ subpel_params[ref].subpel_x, subpel_params[ref].subpel_y, sf, w, h,
+ &conv_params, mi->mbmi.interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, (mi_x >> pd->subsampling_x) + x,
+ (mi_y >> pd->subsampling_y) + y, plane, ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR
+ mi_col_offset, mi_row_offset,
+#endif
+ subpel_params[ref].xs, subpel_params[ref].ys, xd);
+ }
+
+#if CONFIG_CONVOLVE_ROUND
+// TODO(angiebird): This part needs optimization
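+// The two no-round filter stages leave 2 * FILTER_BITS - round_0 - round_1
+// extra bits of precision in tmp_dst, and summing two references for a
+// compound block adds one more bit; the rounding below shifts exactly that
+// surplus back out.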
+#if CONFIG_HIGHBITDEPTH
+ if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH))
+#endif // CONFIG_HIGHBITDEPTH
+ av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h,
+ FILTER_BITS * 2 + is_compound -
+ conv_params.round_0 - conv_params.round_1);
+#endif // CONFIG_CONVOLVE_ROUND
+ }
+}
+
+void av1_build_inter_predictor_sub8x8(MACROBLOCKD *xd, int plane, int i, int ir,
+ int ic, int mi_row, int mi_col) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ MODE_INFO *const mi = xd->mi[0];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
+ const int width = block_size_wide[plane_bsize];
+ const int height = block_size_high[plane_bsize];
+ uint8_t *const dst = &pd->dst.buf[(ir * pd->dst.stride + ic) << 2];
+ int ref;
+ const int is_compound = has_second_ref(&mi->mbmi);
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ WarpTypesAllowed warp_types;
+ const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
+ const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
+#if CONFIG_GLOBAL_MOTION
+ int is_global[2];
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(mi, i, wm->wmtype);
+ }
+#endif // CONFIG_GLOBAL_MOTION
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ ConvolveParams conv_params = get_conv_params(ref, plane);
+ const uint8_t *pre =
+ &pd->pre[ref].buf[(ir * pd->pre[ref].stride + ic) << 2];
+#if CONFIG_GLOBAL_MOTION
+ warp_types.global_warp_allowed = is_global[ref];
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_WARPED_MOTION
+ warp_types.local_warp_allowed = mi->mbmi.motion_mode == WARPED_CAUSAL;
+#endif // CONFIG_WARPED_MOTION
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ av1_highbd_build_inter_predictor(
+ pre, pd->pre[ref].stride, dst, pd->dst.stride,
+ &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height,
+ ref, mi->mbmi.interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, p_col, p_row,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ plane, MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * ic,
+ mi_row * MI_SIZE + 4 * ir, xd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ av1_build_inter_predictor(pre, pd->pre[ref].stride, dst, pd->dst.stride,
+ &mi->bmi[i].as_mv[ref].as_mv,
+ &xd->block_refs[ref]->sf, width, height,
+ &conv_params, mi->mbmi.interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, p_col, p_row, plane, ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * ic,
+ mi_row * MI_SIZE + 4 * ir, xd);
+ }
+}
+
+static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int mi_row, int mi_col,
+ int plane_from, int plane_to) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = pd->width;
+ const int bh = pd->height;
+
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+#endif
+
+ if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !unify_bsize) {
+ const PARTITION_TYPE bp = bsize - xd->mi[0]->mbmi.sb_type;
+ const int have_vsplit = bp != PARTITION_HORZ;
+ const int have_hsplit = bp != PARTITION_VERT;
+ const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
+ const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
+ const int pw = 8 >> (have_vsplit | pd->subsampling_x);
+ const int ph = 8 >> (have_hsplit | pd->subsampling_y);
+ int x, y;
+ assert(bp != PARTITION_NONE && bp < PARTITION_TYPES);
+ assert(bsize == BLOCK_8X8);
+ assert(pw * num_4x4_w == bw && ph * num_4x4_h == bh);
+ for (y = 0; y < num_4x4_h; ++y)
+ for (x = 0; x < num_4x4_w; ++x)
+ build_inter_predictors(xd, plane,
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif // CONFIG_MOTION_VAR
+ y * 2 + x, bw, bh, 4 * x, 4 * y, pw, ph,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
+ } else {
+ build_inter_predictors(xd, plane,
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif // CONFIG_MOTION_VAR
+ 0, bw, bh, 0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
+ }
+ }
+}
+
+void av1_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+ build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
+#if CONFIG_EXT_INTER
+ if (is_interintra_pred(&xd->mi[0]->mbmi)) {
+ BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
+ { xd->plane[0].dst.stride, 0, 0 } };
+ if (!ctx) ctx = &default_ctx;
+ av1_build_interintra_predictors_sby(xd, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, ctx, bsize);
+ }
+#else
+ (void)ctx;
+#endif // CONFIG_EXT_INTER
+}
+
+void av1_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+ build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
+ MAX_MB_PLANE - 1);
+#if CONFIG_EXT_INTER
+ if (is_interintra_pred(&xd->mi[0]->mbmi)) {
+ BUFFER_SET default_ctx = {
+ { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
+ { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride }
+ };
+ if (!ctx) ctx = &default_ctx;
+ av1_build_interintra_predictors_sbuv(
+ xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf, xd->plane[1].dst.stride,
+ xd->plane[2].dst.stride, ctx, bsize);
+ }
+#else
+ (void)ctx;
+#endif // CONFIG_EXT_INTER
+}
+
+// TODO(afergs): Check if ctx can be made constant
+void av1_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+ build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
+ MAX_MB_PLANE - 1);
+#if CONFIG_EXT_INTER
+ if (is_interintra_pred(&xd->mi[0]->mbmi)) {
+ BUFFER_SET default_ctx = {
+ { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
+ { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+ xd->plane[2].dst.stride }
+ };
+ if (!ctx) ctx = &default_ctx;
+ av1_build_interintra_predictors(
+ xd, xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
+ xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+ xd->plane[2].dst.stride, ctx, bsize);
+ }
+#else
+ (void)ctx;
+#endif // CONFIG_EXT_INTER
+}
+
+void av1_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
+ BLOCK_SIZE bsize, const YV12_BUFFER_CONFIG *src,
+ int mi_row, int mi_col) {
+ uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+ src->v_buffer };
+ const int widths[MAX_MB_PLANE] = { src->y_crop_width, src->uv_crop_width,
+ src->uv_crop_width };
+ const int heights[MAX_MB_PLANE] = { src->y_crop_height, src->uv_crop_height,
+ src->uv_crop_height };
+ const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+ src->uv_stride };
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ struct macroblockd_plane *const pd = &planes[i];
+ setup_pred_plane(&pd->dst, bsize, buffers[i], widths[i], heights[i],
+ strides[i], mi_row, mi_col, NULL, pd->subsampling_x,
+ pd->subsampling_y);
+ }
+}
+
+void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *sf) {
+ if (src != NULL) {
+ int i;
+ uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+ src->v_buffer };
+ const int widths[MAX_MB_PLANE] = { src->y_crop_width, src->uv_crop_width,
+ src->uv_crop_width };
+ const int heights[MAX_MB_PLANE] = { src->y_crop_height, src->uv_crop_height,
+ src->uv_crop_height };
+ const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+ src->uv_stride };
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ setup_pred_plane(&pd->pre[idx], xd->mi[0]->mbmi.sb_type, buffers[i],
+ widths[i], heights[i], strides[i], mi_row, mi_col, sf,
+ pd->subsampling_x, pd->subsampling_y);
+ }
+ }
+}
+
+#if CONFIG_SUPERTX
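+// The tables below hold 64ths blending weights for the supertx boundary and
+// are consumed by aom_blend_a64_{v,h}mask():
+//   dst = (mask * dst + (64 - mask) * pre + 32) >> 6,
+// so a weight of 64 keeps the existing prediction and 0 takes the neighbor's.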
+#if CONFIG_CB4X4
+static const uint8_t mask_4[4] = { 64, 52, 12, 0 };
+static const uint8_t mask_4_uv[4] = { 64, 52, 12, 0 };
+#endif // CONFIG_CB4X4
+static const uint8_t mask_8[8] = { 64, 64, 62, 52, 12, 2, 0, 0 };
+
+static const uint8_t mask_16[16] = { 63, 62, 60, 58, 55, 50, 43, 36,
+ 28, 21, 14, 9, 6, 4, 2, 1 };
+
+static const uint8_t mask_32[32] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63,
+ 61, 57, 52, 45, 36, 28, 19, 12, 7, 3, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+static const uint8_t mask_8_uv[8] = { 64, 64, 62, 52, 12, 2, 0, 0 };
+
+static const uint8_t mask_16_uv[16] = { 64, 64, 64, 64, 61, 53, 45, 36,
+ 28, 19, 11, 3, 0, 0, 0, 0 };
+
+static const uint8_t mask_32_uv[32] = { 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 60, 54, 46, 36,
+ 28, 18, 10, 4, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+
+static const uint8_t *get_supertx_mask(int length, int plane) {
+ switch (length) {
+#if CONFIG_CB4X4
+ case 4: return plane ? mask_4_uv : mask_4;
+#endif // CONFIG_CB4X4
+ case 8: return plane ? mask_8_uv : mask_8;
+ case 16: return plane ? mask_16_uv : mask_16;
+ case 32: return plane ? mask_32_uv : mask_32;
+ default: assert(0);
+ }
+ return NULL;
+}
+
+void av1_build_masked_inter_predictor_complex(
+ MACROBLOCKD *xd, uint8_t *dst, int dst_stride, const uint8_t *pre,
+ int pre_stride, int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
+ BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, PARTITION_TYPE partition,
+ int plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int ssx = pd->subsampling_x;
+ const int ssy = pd->subsampling_y;
+ const int top_w = block_size_wide[top_bsize] >> ssx;
+ const int top_h = block_size_high[top_bsize] >> ssy;
+ const int w = block_size_wide[bsize] >> ssx;
+ const int h = block_size_high[bsize] >> ssy;
+ const int w_offset = ((mi_col - mi_col_ori) * MI_SIZE) >> ssx;
+ const int h_offset = ((mi_row - mi_row_ori) * MI_SIZE) >> ssy;
+
+ int w_remain, h_remain;
+
+#if CONFIG_HIGHBITDEPTH
+ const int is_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif // CONFIG_HIGHBITDEPTH
+
+ assert(bsize <= BLOCK_32X32);
+ assert(IMPLIES(plane == 0, ssx == 0));
+ assert(IMPLIES(plane == 0, ssy == 0));
+
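+  // Blend the first h rows (PARTITION_HORZ) or w columns (PARTITION_VERT) of
+  // pre into dst across the partition boundary using the supertx mask, then
+  // copy whatever remains of pre verbatim.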
+ switch (partition) {
+ case PARTITION_HORZ: {
+ const uint8_t *const mask = get_supertx_mask(h, ssy);
+
+ w_remain = top_w;
+ h_remain = top_h - h_offset - h;
+ dst += h_offset * dst_stride;
+ pre += h_offset * pre_stride;
+
+#if CONFIG_HIGHBITDEPTH
+ if (is_hdb)
+ aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, pre,
+ pre_stride, mask, h, top_w, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, pre, pre_stride,
+ mask, h, top_w);
+
+ dst += h * dst_stride;
+ pre += h * pre_stride;
+ break;
+ }
+ case PARTITION_VERT: {
+ const uint8_t *const mask = get_supertx_mask(w, ssx);
+
+ w_remain = top_w - w_offset - w;
+ h_remain = top_h;
+ dst += w_offset;
+ pre += w_offset;
+
+#if CONFIG_HIGHBITDEPTH
+ if (is_hdb)
+ aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, pre,
+ pre_stride, mask, top_h, w, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, pre, pre_stride,
+ mask, top_h, w);
+
+ dst += w;
+ pre += w;
+ break;
+ }
+ default: {
+ assert(0);
+ return;
+ }
+ }
+
+ if (w_remain == 0 || h_remain == 0) {
+ return;
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (is_hdb) {
+ dst = (uint8_t *)CONVERT_TO_SHORTPTR(dst);
+ pre = (const uint8_t *)CONVERT_TO_SHORTPTR(pre);
+ dst_stride *= 2;
+ pre_stride *= 2;
+ w_remain *= 2;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ do {
+ memcpy(dst, pre, w_remain * sizeof(uint8_t));
+ dst += dst_stride;
+ pre += pre_stride;
+ } while (--h_remain);
+}
+
+void av1_build_inter_predictors_sb_sub8x8_extend(MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int block) {
+  // Prediction function used in supertx:
+  // Use the mv of the current block (which is smaller than 8x8) to build the
+  // prediction for a block located at (mi_row, mi_col) with size bsize, where
+  // bsize can be larger than 8x8.
+  // block (0-3): the sub8x8 location of the current block.
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+#if CONFIG_EXT_INTER
+ const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
+ const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
+#endif // CONFIG_EXT_INTER
+
+  // For sub8x8 uv: skip the uv prediction in supertx except for the first
+  // block (block = 0).
+ int max_plane = block ? 1 : MAX_MB_PLANE;
+
+ for (plane = 0; plane < max_plane; plane++) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, &xd->plane[plane]);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+
+ build_inter_predictors(xd, plane,
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif // CONFIG_MOTION_VAR
+ block, bw, bh, 0, 0, bw, bh,
+#if CONFIG_EXT_INTER
+ wedge_offset_x, wedge_offset_y,
+#endif // CONFIG_EXT_INTER
+ mi_x, mi_y);
+ }
+#if CONFIG_EXT_INTER
+ if (is_interintra_pred(&xd->mi[0]->mbmi)) {
+ BUFFER_SET ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf,
+ xd->plane[2].dst.buf },
+ { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+ xd->plane[2].dst.stride } };
+ av1_build_interintra_predictors(
+ xd, xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
+ xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+ xd->plane[2].dst.stride, &ctx, bsize);
+ }
+#endif // CONFIG_EXT_INTER
+}
+
+void av1_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+#if CONFIG_EXT_INTER
+ const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
+ const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
+#endif // CONFIG_EXT_INTER
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, &xd->plane[plane]);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+
+ build_inter_predictors(xd, plane,
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif // CONFIG_MOTION_VAR
+ 0, bw, bh, 0, 0, bw, bh,
+#if CONFIG_EXT_INTER
+ wedge_offset_x, wedge_offset_y,
+#endif // CONFIG_EXT_INTER
+ mi_x, mi_y);
+ }
+}
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_MOTION_VAR
+// obmc_mask_N[overlap_position]
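+// Each entry is the 64ths weight retained by the current block's prediction
+// when a neighbor's prediction is blended in via aom_blend_a64_{v,h}mask();
+// the weight grows with distance from the shared edge until the neighbor's
+// contribution fades out entirely at 64.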
+static const uint8_t obmc_mask_1[1] = { 64 };
+
+static const uint8_t obmc_mask_2[2] = { 45, 64 };
+
+static const uint8_t obmc_mask_4[4] = { 39, 50, 59, 64 };
+
+static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 };
+
+static const uint8_t obmc_mask_16[16] = { 34, 37, 40, 43, 46, 49, 52, 54,
+ 56, 58, 60, 61, 64, 64, 64, 64 };
+
+static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44,
+ 45, 47, 48, 50, 51, 52, 53, 55,
+ 56, 57, 58, 59, 60, 60, 61, 62,
+ 64, 64, 64, 64, 64, 64, 64, 64 };
+
+#if CONFIG_EXT_PARTITION
+static const uint8_t obmc_mask_64[64] = {
+ 33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
+ 45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
+ 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
+ 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+#endif // CONFIG_EXT_PARTITION
+
+const uint8_t *av1_get_obmc_mask(int length) {
+ switch (length) {
+ case 1: return obmc_mask_1;
+ case 2: return obmc_mask_2;
+ case 4: return obmc_mask_4;
+ case 8: return obmc_mask_8;
+ case 16: return obmc_mask_16;
+ case 32: return obmc_mask_32;
+#if CONFIG_EXT_PARTITION
+ case 64: return obmc_mask_64;
+#endif // CONFIG_EXT_PARTITION
+ default: assert(0); return NULL;
+ }
+}
+
+#if CONFIG_NCOBMC
+// obmc_mask_flipN[overlap_position]
+static const uint8_t obmc_mask_flip1[1] = { 55 };
+
+static const uint8_t obmc_mask_flip2[2] = { 62, 45 };
+
+static const uint8_t obmc_mask_flip4[4] = { 64, 59, 50, 39 };
+
+static const uint8_t obmc_mask_flip8[8] = { 64, 63, 61, 57, 53, 48, 42, 36 };
+
+static const uint8_t obmc_mask_flip16[16] = { 64, 64, 64, 63, 61, 60, 58, 56,
+ 54, 52, 49, 46, 43, 40, 37, 34 };
+
+static const uint8_t obmc_mask_flip32[32] = { 64, 64, 64, 64, 64, 63, 63, 62,
+ 62, 61, 60, 60, 59, 58, 57, 56,
+ 55, 53, 52, 51, 50, 48, 47, 45,
+ 44, 43, 41, 40, 38, 36, 35, 33 };
+
+#if CONFIG_EXT_PARTITION
+static const uint8_t obmc_mask_flip64[64] = {
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 62, 62,
+ 62, 62, 62, 61, 60, 60, 60, 60, 60, 59, 58, 58, 57, 57, 56, 56,
+ 56, 55, 54, 53, 52, 52, 51, 51, 51, 50, 49, 48, 47, 47, 46, 45,
+ 44, 44, 44, 43, 42, 41, 40, 40, 39, 38, 37, 36, 35, 35, 34, 33,
+};
+#endif // CONFIG_EXT_PARTITION
+
+const uint8_t *av1_get_obmc_mask_flipped(int length) {
+ switch (length) {
+ case 1: return obmc_mask_flip1;
+ case 2: return obmc_mask_flip2;
+ case 4: return obmc_mask_flip4;
+ case 8: return obmc_mask_flip8;
+ case 16: return obmc_mask_flip16;
+ case 32: return obmc_mask_flip32;
+#if CONFIG_EXT_PARTITION
+ case 64: return obmc_mask_flip64;
+#endif // CONFIG_EXT_PARTITION
+ default: assert(0); return NULL;
+ }
+}
+#endif // CONFIG_NCOBMC
+
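+// Count the OBMC-eligible neighbors along the above row
+// (overlappable_neighbors[0]) and the left column (overlappable_neighbors[1])
+// of the current block, stepping one neighboring block at a time.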
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col) {
+ int i, mi_step;
+
+ xd->mi[0]->mbmi.overlappable_neighbors[0] = 0;
+ xd->mi[0]->mbmi.overlappable_neighbors[1] = 0;
+
+ if (xd->up_available) {
+ const int ilimit = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+ for (i = 0; i < ilimit; i += mi_step) {
+ int mi_row_offset = -1;
+ int mi_col_offset = i;
+ MODE_INFO *above_mi =
+ xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
+
+ mi_step = AOMMIN(xd->n8_w, mi_size_wide[above_mbmi->sb_type]);
+
+ if (is_neighbor_overlappable(above_mbmi))
+ xd->mi[0]->mbmi.overlappable_neighbors[0]++;
+ }
+ }
+
+ if (xd->left_available) {
+ const int ilimit = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+ for (i = 0; i < ilimit; i += mi_step) {
+ int mi_row_offset = i;
+ int mi_col_offset = -1;
+ MODE_INFO *left_mi =
+ xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
+
+ mi_step = AOMMIN(xd->n8_h, mi_size_high[left_mbmi->sb_type]);
+
+ if (is_neighbor_overlappable(left_mbmi))
+ xd->mi[0]->mbmi.overlappable_neighbors[1]++;
+ }
+ }
+}
+
+// HW does not support < 4x4 prediction. To limit the bandwidth requirement,
+// small blocks blend with neighbors from one side only: if the block size of
+// the current plane is 4x4 or 8x4, the above neighbor (dir = 0) is skipped;
+// if it is 4x8, the left neighbor (dir = 1) is skipped.
+#define DISABLE_CHROMA_U8X8_OBMC 0 // 0: one-sided obmc; 1: disable
+
+int skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, const struct macroblockd_plane *pd,
+ int dir) {
+ assert(is_motion_variation_allowed_bsize(bsize));
+
+ BLOCK_SIZE bsize_plane =
+ ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
+#if CONFIG_CB4X4
+ if (bsize_plane < BLOCK_4X4) return 1;
+#endif
+ switch (bsize_plane) {
+#if DISABLE_CHROMA_U8X8_OBMC
+ case BLOCK_4X4:
+ case BLOCK_8X4:
+ case BLOCK_4X8: return 1; break;
+#else
+ case BLOCK_4X4:
+ case BLOCK_8X4:
+ case BLOCK_4X8: return dir == 1; break;
+#endif
+ default: return 0;
+ }
+}
+
+// This function combines the motion-compensated predictions generated by the
+// top/left neighboring blocks' inter predictors with the regular inter
+// prediction. The original prediction (bmc) is assumed to be stored in
+// xd->plane[].dst.buf.
+void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *above[MAX_MB_PLANE],
+ int above_stride[MAX_MB_PLANE],
+ uint8_t *left[MAX_MB_PLANE],
+ int left_stride[MAX_MB_PLANE]) {
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ int plane, i;
+#if CONFIG_HIGHBITDEPTH
+ const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif // CONFIG_HIGHBITDEPTH
+
+ // handle above row
+ if (xd->up_available) {
+ const int overlap = num_4x4_blocks_high_lookup[bsize] * 2;
+ const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+ const int mi_row_offset = -1;
+ const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
+ int neighbor_count = 0;
+
+ assert(miw > 0);
+
+ i = 0;
+ do { // for each mi in the above row
+ const int mi_col_offset = i;
+ const MB_MODE_INFO *const above_mbmi =
+ &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+ const BLOCK_SIZE a_bsize = above_mbmi->sb_type;
+ const int mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]);
+
+ if (is_neighbor_overlappable(above_mbmi)) {
+ neighbor_count++;
+ if (neighbor_count > neighbor_limit) break;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
+ const int bh = overlap >> pd->subsampling_y;
+
+ if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+
+ const int dst_stride = pd->dst.stride;
+ uint8_t *const dst = &pd->dst.buf[(i * MI_SIZE) >> pd->subsampling_x];
+ const int tmp_stride = above_stride[plane];
+ const uint8_t *const tmp =
+ &above[plane][(i * MI_SIZE) >> pd->subsampling_x];
+ const uint8_t *const mask = av1_get_obmc_mask(bh);
+
+#if CONFIG_HIGHBITDEPTH
+ if (is_hbd)
+ aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bh, bw, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bh, bw);
+ }
+ }
+ i += mi_step;
+ } while (i < miw);
+ }
+
+ // handle left column
+ if (xd->left_available) {
+ const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2;
+ const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+ const int mi_col_offset = -1;
+ const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
+ int neighbor_count = 0;
+
+ assert(mih > 0);
+
+ i = 0;
+ do { // for each mi in the left column
+ const int mi_row_offset = i;
+ const MB_MODE_INFO *const left_mbmi =
+ &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+ const BLOCK_SIZE l_bsize = left_mbmi->sb_type;
+ const int mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]);
+
+ if (is_neighbor_overlappable(left_mbmi)) {
+ neighbor_count++;
+ if (neighbor_count > neighbor_limit) break;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = overlap >> pd->subsampling_x;
+ const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
+
+ if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+
+ const int dst_stride = pd->dst.stride;
+ uint8_t *const dst =
+ &pd->dst.buf[(i * MI_SIZE * dst_stride) >> pd->subsampling_y];
+ const int tmp_stride = left_stride[plane];
+ const uint8_t *const tmp =
+ &left[plane][(i * MI_SIZE * tmp_stride) >> pd->subsampling_y];
+ const uint8_t *const mask = av1_get_obmc_mask(bw);
+
+#if CONFIG_HIGHBITDEPTH
+ if (is_hbd)
+ aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bh, bw, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bh, bw);
+ }
+ }
+ i += mi_step;
+ } while (i < mih);
+ }
+}
+
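+// Strip interintra and masked-compound state from a neighbor's mode info (and
+// drop its second reference) so that its prediction can be rebuilt as a plain
+// single-reference inter block for OBMC.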
+void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
+#if CONFIG_EXT_INTER
+ if (is_interintra_pred(mbmi)) {
+ mbmi->ref_frame[1] = NONE_FRAME;
+ } else if (has_second_ref(mbmi) &&
+ is_masked_compound_type(mbmi->interinter_compound_type)) {
+ mbmi->interinter_compound_type = COMPOUND_AVERAGE;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ }
+#endif // CONFIG_EXT_INTER
+ if (has_second_ref(mbmi)) mbmi->ref_frame[1] = NONE_FRAME;
+ return;
+}
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ const TileInfo *const tile = &xd->tile;
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ int i, j, mi_step, ref;
+ const int ilimit = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+ int mb_to_right_edge_base = xd->mb_to_right_edge;
+ const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
+ int neighbor_count = 0;
+
+ if (mi_row <= tile->mi_row_start) return;
+
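+  // The mb_to_*_edge fields are in 1/8-pel units; widening the bottom edge by
+  // half the block height appears intended to let the above neighbors'
+  // predictions extend into the overlap region in the top half of the current
+  // block.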
+ xd->mb_to_bottom_edge += xd->n8_h * 32;
+ for (i = 0; i < ilimit; i += mi_step) {
+ int mi_row_offset = -1;
+ int mi_col_offset = i;
+ int mi_x, mi_y, bw, bh;
+ MODE_INFO *above_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
+ const BLOCK_SIZE a_bsize = above_mbmi->sb_type;
+ MB_MODE_INFO backup_mbmi;
+
+ mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]);
+
+ if (!is_neighbor_overlappable(above_mbmi)) continue;
+
+ neighbor_count++;
+ if (neighbor_count > neighbor_limit) break;
+
+ backup_mbmi = *above_mbmi;
+ modify_neighbor_predictor_for_obmc(above_mbmi);
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ struct macroblockd_plane *const pd = &xd->plane[j];
+ setup_pred_plane(&pd->dst, AOMMAX(a_bsize, BLOCK_8X8), tmp_buf[j],
+ tmp_width[j], tmp_height[j], tmp_stride[j], 0, i, NULL,
+ pd->subsampling_x, pd->subsampling_y);
+ }
+ for (ref = 0; ref < 1 + has_second_ref(above_mbmi); ++ref) {
+ const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
+ const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+ xd->block_refs[ref] = ref_buf;
+ if ((!av1_is_valid_scale(&ref_buf->sf)))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col + i,
+ &ref_buf->sf);
+ }
+
+ xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8);
+ xd->mb_to_right_edge =
+ mb_to_right_edge_base + (xd->n8_w - i - mi_step) * 64;
+ mi_x = (mi_col + i) << MI_SIZE_LOG2;
+ mi_y = mi_row << MI_SIZE_LOG2;
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
+ bh = AOMMAX((num_4x4_blocks_high_lookup[bsize] * 2) >> pd->subsampling_y,
+ 4);
+
+ if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+ build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0, bw, bh, 0,
+ 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
+ }
+ *above_mbmi = backup_mbmi;
+ }
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = mb_to_right_edge_base;
+ xd->mb_to_bottom_edge -= xd->n8_h * 32;
+}
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ const TileInfo *const tile = &xd->tile;
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ int i, j, mi_step, ref;
+ const int ilimit = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+ int mb_to_bottom_edge_base = xd->mb_to_bottom_edge;
+ const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
+ int neighbor_count = 0;
+
+ if (mi_col == 0 || (mi_col - 1 < tile->mi_col_start)) return;
+
+ xd->mb_to_right_edge += xd->n8_w * 32;
+ for (i = 0; i < ilimit; i += mi_step) {
+ int mi_row_offset = i;
+ int mi_col_offset = -1;
+ int mi_x, mi_y, bw, bh;
+ MODE_INFO *left_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
+ const BLOCK_SIZE l_bsize = left_mbmi->sb_type;
+ MB_MODE_INFO backup_mbmi;
+
+ mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]);
+
+ if (!is_neighbor_overlappable(left_mbmi)) continue;
+
+ neighbor_count++;
+ if (neighbor_count > neighbor_limit) break;
+
+ backup_mbmi = *left_mbmi;
+ modify_neighbor_predictor_for_obmc(left_mbmi);
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ struct macroblockd_plane *const pd = &xd->plane[j];
+ setup_pred_plane(&pd->dst, AOMMAX(l_bsize, BLOCK_8X8), tmp_buf[j],
+ tmp_width[j], tmp_height[j], tmp_stride[j], i, 0, NULL,
+ pd->subsampling_x, pd->subsampling_y);
+ }
+ for (ref = 0; ref < 1 + has_second_ref(left_mbmi); ++ref) {
+ const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
+ const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+ xd->block_refs[ref] = ref_buf;
+ if ((!av1_is_valid_scale(&ref_buf->sf)))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i, mi_col,
+ &ref_buf->sf);
+ }
+
+ xd->mb_to_top_edge = -(((mi_row + i) * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge =
+ mb_to_bottom_edge_base + (xd->n8_h - i - mi_step) * 64;
+ mi_x = mi_col << MI_SIZE_LOG2;
+ mi_y = (mi_row + i) << MI_SIZE_LOG2;
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ bw = AOMMAX((num_4x4_blocks_wide_lookup[bsize] * 2) >> pd->subsampling_x,
+ 4);
+ bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+ if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+ build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0, bw, bh, 0,
+ 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
+ }
+ *left_mbmi = backup_mbmi;
+ }
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge = mb_to_bottom_edge_base;
+ xd->mb_to_right_edge -= xd->n8_w * 32;
+}
+
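+// Causal OBMC entry point: build the above and left neighbors' predictions
+// into two temporary buffers, re-point the dst planes at the frame buffer,
+// and then blend the neighbor predictions into the current block's
+// prediction.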
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col) {
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+#else
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
+ uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+ int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+ dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
+ dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+ dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
+ dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ dst_buf1[0] = tmp_buf1;
+ dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
+ dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
+ dst_buf2[0] = tmp_buf2;
+ dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
+ dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
+ dst_width1, dst_height1, dst_stride1);
+ av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
+ dst_width2, dst_height2, dst_stride2);
+ av1_setup_dst_planes(xd->plane, xd->mi[0]->mbmi.sb_type,
+ get_frame_new_buffer(cm), mi_row, mi_col);
+ av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
+ dst_buf2, dst_stride2);
+}
+
+#if CONFIG_NCOBMC
+void av1_build_prediction_by_bottom_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ const TileInfo *const tile = &xd->tile;
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ int i, j, mi_step, ref;
+ const int ilimit = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+ int mb_to_right_edge_base = xd->mb_to_right_edge;
+
+ if (mi_row + xd->n8_h >= tile->mi_row_end ||
+ (mi_row + xd->n8_h) % MI_SIZE == 0 || (mi_row + xd->n8_h) >= cm->mi_rows)
+ return;
+ assert(bsize >= BLOCK_8X8);
+
+ xd->mb_to_top_edge -= xd->n8_h * 32;
+ for (i = 0; i < ilimit; i += mi_step) {
+ int mi_row_offset = xd->n8_h;
+ int mi_col_offset = i;
+ int mi_x, mi_y, bw, bh;
+ MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+#if CONFIG_EXT_INTER
+ MB_MODE_INFO backup_mbmi;
+#endif // CONFIG_EXT_INTER
+
+ mi_step = AOMMIN(xd->n8_w, mi_size_wide[mbmi->sb_type]);
+
+ if (!is_neighbor_overlappable(mbmi)) continue;
+
+#if CONFIG_EXT_INTER
+ backup_mbmi = *mbmi;
+ modify_neighbor_predictor_for_obmc(mbmi);
+#endif // CONFIG_EXT_INTER
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ struct macroblockd_plane *const pd = &xd->plane[j];
+ setup_pred_plane(&pd->dst, AOMMAX(mbmi->sb_type, BLOCK_8X8), tmp_buf[j],
+ tmp_width[j], tmp_height[j], tmp_stride[j],
+ (xd->n8_h >> 1), i, NULL, pd->subsampling_x,
+ pd->subsampling_y);
+ }
+ for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+ const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+ xd->block_refs[ref] = ref_buf;
+ if ((!av1_is_valid_scale(&ref_buf->sf)))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + (xd->n8_h >> 1),
+ mi_col + i, &ref_buf->sf);
+ }
+
+ xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8);
+ xd->mb_to_right_edge =
+ mb_to_right_edge_base + (xd->n8_w - i - mi_step) * 64;
+ mi_x = (mi_col + i) << MI_SIZE_LOG2;
+ mi_y = (mi_row << MI_SIZE_LOG2) + xd->n8_h * 4;
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ bw = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_x;
+ bh = (num_4x4_blocks_high_lookup[bsize] << 1) >> pd->subsampling_y;
+
+ if (mbmi->sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
+ const PARTITION_TYPE bp = BLOCK_8X8 - mbmi->sb_type;
+ const int have_vsplit = bp != PARTITION_HORZ;
+ const int have_hsplit = bp != PARTITION_VERT;
+ const int num_4x4_w = 2 >> (!have_vsplit);
+ const int num_4x4_h = 2 >> (!have_hsplit);
+ const int pw = 8 >> (have_vsplit + pd->subsampling_x);
+ int x, y;
+
+ for (y = 0; y < num_4x4_h; ++y)
+ for (x = 0; x < num_4x4_w; ++x) {
+ if ((bp == PARTITION_HORZ || bp == PARTITION_SPLIT) && y != 0)
+ continue;
+
+ build_inter_predictors(
+ xd, j, mi_col_offset, mi_row_offset, y * 2 + x, bw, bh,
+ (4 * x) >> pd->subsampling_x,
+ xd->n8_h == 1 ? (4 >> pd->subsampling_y) : 0, pw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
+ }
+ } else {
+ build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0, bw, bh,
+ 0, xd->n8_h == 1 ? (4 >> pd->subsampling_y) : 0,
+ bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
+ }
+ }
+#if CONFIG_EXT_INTER
+ *mbmi = backup_mbmi;
+#endif // CONFIG_EXT_INTER
+ }
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = mb_to_right_edge_base;
+ xd->mb_to_top_edge += xd->n8_h * 32;
+}
+
+void av1_build_prediction_by_right_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ const int tmp_stride[MAX_MB_PLANE]) {
+ const TileInfo *const tile = &xd->tile;
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ int i, j, mi_step, ref;
+ const int ilimit = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+ int mb_to_bottom_edge_base = xd->mb_to_bottom_edge;
+
+ if (mi_col + xd->n8_w >= tile->mi_col_end ||
+ (mi_col + xd->n8_w) % MI_SIZE == 0 || (mi_col + xd->n8_w) >= cm->mi_cols)
+ return;
+
+ xd->mb_to_left_edge -= xd->n8_w * 32;
+ for (i = 0; i < ilimit; i += mi_step) {
+ int mi_row_offset = i;
+ int mi_col_offset = xd->n8_w;
+ int mi_x, mi_y, bw, bh;
+ MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+#if CONFIG_EXT_INTER
+ MB_MODE_INFO backup_mbmi;
+#endif // CONFIG_EXT_INTER
+
+ mi_step = AOMMIN(xd->n8_h, mi_size_high[mbmi->sb_type]);
+
+ if (!is_neighbor_overlappable(mbmi)) continue;
+
+#if CONFIG_EXT_INTER
+ backup_mbmi = *mbmi;
+ modify_neighbor_predictor_for_obmc(mbmi);
+#endif // CONFIG_EXT_INTER
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ struct macroblockd_plane *const pd = &xd->plane[j];
+ setup_pred_plane(&pd->dst, AOMMAX(mbmi->sb_type, BLOCK_8X8), tmp_buf[j],
+ tmp_width[j], tmp_height[j], tmp_stride[j], i,
+ xd->n8_w >> 1, NULL, pd->subsampling_x,
+ pd->subsampling_y);
+ }
+ for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+ const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+ xd->block_refs[ref] = ref_buf;
+ if ((!av1_is_valid_scale(&ref_buf->sf)))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i,
+ mi_col + (xd->n8_w >> 1), &ref_buf->sf);
+ }
+
+ xd->mb_to_top_edge = -(((mi_row + i) * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge =
+ mb_to_bottom_edge_base + (xd->n8_h - i - mi_step) * 64;
+ mi_x = (mi_col << MI_SIZE_LOG2) + xd->n8_w * 4;
+ mi_y = (mi_row + i) << MI_SIZE_LOG2;
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ bw = (num_4x4_blocks_wide_lookup[bsize] << 1) >> pd->subsampling_x;
+ bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+ if (mbmi->sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
+ const PARTITION_TYPE bp = BLOCK_8X8 - mbmi->sb_type;
+ const int have_vsplit = bp != PARTITION_HORZ;
+ const int have_hsplit = bp != PARTITION_VERT;
+ const int num_4x4_w = 2 >> (!have_vsplit);
+ const int num_4x4_h = 2 >> (!have_hsplit);
+ const int ph = 8 >> (have_hsplit + pd->subsampling_y);
+ int x, y;
+
+ for (y = 0; y < num_4x4_h; ++y)
+ for (x = 0; x < num_4x4_w; ++x) {
+ if ((bp == PARTITION_VERT || bp == PARTITION_SPLIT) && x != 0)
+ continue;
+
+ build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
+ y * 2 + x, bw, bh,
+ xd->n8_w == 1 ? 4 >> pd->subsampling_x : 0,
+ (4 * y) >> pd->subsampling_y, bw, ph,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
+ }
+ } else {
+ build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0, bw, bh,
+ xd->n8_w == 1 ? 4 >> pd->subsampling_x : 0, 0,
+ bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
+ }
+ }
+#if CONFIG_EXT_INTER
+ *mbmi = backup_mbmi;
+#endif // CONFIG_EXT_INTER
+ }
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge = mb_to_bottom_edge_base;
+ xd->mb_to_left_edge += xd->n8_w * 32;
+}
+
+// This function combines the motion-compensated predictions generated by the
+// bottom/right neighboring blocks' inter predictors with the prediction
+// already in the dst buffer.
+void av1_merge_dst_bottom_right_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *bottom[MAX_MB_PLANE],
+ const int bottom_stride[MAX_MB_PLANE],
+ uint8_t *right[MAX_MB_PLANE],
+ const int right_stride[MAX_MB_PLANE]) {
+ const TileInfo *const tile = &xd->tile;
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ int plane, i, mi_step;
+ const int bottom_available = mi_row + xd->n8_h < tile->mi_row_end &&
+ (mi_row + xd->n8_h) % MI_SIZE != 0 &&
+ (mi_row + xd->n8_h) < cm->mi_rows;
+#if CONFIG_HIGHBITDEPTH
+ int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif // CONFIG_HIGHBITDEPTH
+
+ // handle bottom row
+ for (i = 0; bottom_available && i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+ i += mi_step) {
+ int mi_row_offset = xd->n8_h;
+ int mi_col_offset = i;
+ MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ int overlap;
+
+ mi_step = AOMMIN(xd->n8_w, mi_size_wide[mbmi->sb_type]);
+
+ if (!is_neighbor_overlappable(mbmi)) continue;
+
+ overlap = num_4x4_blocks_high_lookup[bsize] << 1;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
+ const int bh = overlap >> pd->subsampling_y;
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[((i * MI_SIZE) >> pd->subsampling_x) +
+ (((xd->n8_h * MI_SIZE - overlap) * dst_stride) >>
+ pd->subsampling_y)];
+ const int tmp_stride = bottom_stride[plane];
+ const uint8_t *const tmp =
+ &bottom[plane][((i * MI_SIZE) >> pd->subsampling_x) +
+ (((xd->n8_h * MI_SIZE - overlap) * tmp_stride) >>
+ pd->subsampling_y)];
+ const uint8_t *const mask = av1_get_obmc_mask_flipped(bh);
+
+#if CONFIG_HIGHBITDEPTH
+ if (is_hbd)
+ aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bh, bw, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
+ mask, bh, bw);
+ }
+ } // each mi in the bottom row
+
+ // handle right column
+ if (mi_col + xd->n8_w >= tile->mi_col_end ||
+ (mi_col + xd->n8_w) % MI_SIZE == 0 || (mi_col + xd->n8_w) >= cm->mi_cols)
+ return;
+
+ for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
+ int mi_row_offset = i;
+ int mi_col_offset = xd->n8_w;
+ int overlap;
+ MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+
+ mi_step = AOMMIN(xd->n8_h, mi_size_high[mbmi->sb_type]);
+
+ if (!is_neighbor_overlappable(mbmi)) continue;
+
+ overlap = num_4x4_blocks_wide_lookup[bsize] << 1;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = overlap >> pd->subsampling_x;
+ const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[((i * MI_SIZE * dst_stride) >> pd->subsampling_y) +
+ ((xd->n8_w * MI_SIZE - overlap) >> pd->subsampling_x)];
+ const int tmp_stride = right_stride[plane];
+ const uint8_t *const tmp =
+ &right[plane][((i * MI_SIZE * tmp_stride) >> pd->subsampling_y) +
+ ((xd->n8_w * MI_SIZE - overlap) >> pd->subsampling_x)];
+ const uint8_t *const mask = av1_get_obmc_mask_flipped(bw);
+
+#if CONFIG_HIGHBITDEPTH
+ if (is_hbd)
+ aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bh, bw, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
+ mask, bh, bw);
+ }
+ } // each mi in the right column
+}
+
+// This function generates 4-sided obmc: (1) calculate the prediction blocks
+// generated by the bottom and right motion vectors; (2) combine them with the
+// original prediction block, which must be pre-stored in xd->plane[].dst.buf
+// before this function is called, updating the result in xd->plane[].dst.buf;
+// (3) call the causal obmc prediction function, which generates the left and
+// above preds and merges them with xd->plane[].dst.buf.
+void av1_build_ncobmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col) {
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+#else
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
+ uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+ int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+ dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
+ dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+ dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
+ dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ dst_buf1[0] = tmp_buf1;
+ dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
+ dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
+ dst_buf2[0] = tmp_buf2;
+ dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
+ dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ av1_build_prediction_by_bottom_preds(cm, xd, mi_row, mi_col, dst_buf1,
+ dst_width1, dst_height1, dst_stride1);
+ av1_build_prediction_by_right_preds(cm, xd, mi_row, mi_col, dst_buf2,
+ dst_width2, dst_height2, dst_stride2);
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+ av1_merge_dst_bottom_right_preds(cm, xd, mi_row, mi_col, dst_buf1,
+ dst_stride1, dst_buf2, dst_stride2);
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+}
+#endif // CONFIG_NCOBMC
+#endif // CONFIG_MOTION_VAR
+
+#if CONFIG_EXT_INTER
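+// ii_weights1d is a decaying blend-weight curve indexed by distance from the
+// predicted edge; ii_size_scales stretches the indexing (i * size_scale) so
+// that every block size sweeps the full curve.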
+/* clang-format off */
+#if CONFIG_EXT_PARTITION
+static const int ii_weights1d[MAX_SB_SIZE] = {
+ 26, 25, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 12, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+};
+static const int ii_size_scales[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 32, 32, 32,
+#endif
+ 32, 16, 16, 16, 8, 8, 8, 4,
+ 4, 4, 2, 2, 2, 1, 1, 1,
+};
+#else
+static const int ii_weights1d[MAX_SB_SIZE] = {
+ 26, 25, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13,
+ 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9
+};
+static const int ii_size_scales[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 16, 16, 16,
+#endif
+ 16, 8, 8, 8, 4, 4, 4,
+ 2, 2, 2, 1, 1, 1,
+};
+/* clang-format on */
+#endif // CONFIG_EXT_PARTITION
+
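+// AOM_BLEND_A64(a, v0, v1), from aom_dsp/blend.h, computes
+// (a * v0 + (64 - a) * v1 + 32) >> 6, so 'scale' below weights the intra
+// predictor against the inter predictor; AOM_BLEND_AVG(v0, v1) is the
+// rounded average (v0 + v1 + 1) >> 1.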
+static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
+ int wedge_index, int wedge_sign,
+ BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+ uint8_t *comppred, int compstride,
+ const uint8_t *interpred, int interstride,
+ const uint8_t *intrapred, int intrastride) {
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ const int size_scale = ii_size_scales[plane_bsize];
+ int i, j;
+
+ if (use_wedge_interintra) {
+ if (is_interintra_wedge_used(bsize)) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
+ const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
+ aom_blend_a64_mask(comppred, compstride, intrapred, intrastride,
+ interpred, interstride, mask, block_size_wide[bsize],
+ bh, bw, subh, subw);
+ }
+ return;
+ }
+
+ switch (mode) {
+ case II_V_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[i * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_H_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[j * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D63_PRED:
+ case II_D117_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = (ii_weights1d[i * size_scale] * 3 +
+ ii_weights1d[j * size_scale]) >>
+ 2;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D207_PRED:
+ case II_D153_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = (ii_weights1d[j * size_scale] * 3 +
+ ii_weights1d[i * size_scale]) >>
+ 2;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D135_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[(i < j ? i : j) * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D45_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale =
+ (ii_weights1d[i * size_scale] + ii_weights1d[j * size_scale]) >>
+ 1;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_TM_PRED:
+ case II_DC_PRED:
+ default:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ comppred[i * compstride + j] = AOM_BLEND_AVG(
+ intrapred[i * intrastride + j], interpred[i * interstride + j]);
+ }
+ }
+ break;
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+static void combine_interintra_highbd(
+ INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index,
+ int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+ uint8_t *comppred8, int compstride, const uint8_t *interpred8,
+ int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ const int size_scale = ii_size_scales[plane_bsize];
+ int i, j;
+
+ uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
+ const uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
+ const uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
+
+ if (use_wedge_interintra) {
+ if (is_interintra_wedge_used(bsize)) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
+ const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
+ aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
+ interpred8, interstride, mask, bw, bh, bw, subh,
+ subw, bd);
+ }
+ return;
+ }
+
+ switch (mode) {
+ case II_V_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[i * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_H_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[j * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D63_PRED:
+ case II_D117_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = (ii_weights1d[i * size_scale] * 3 +
+ ii_weights1d[j * size_scale]) >>
+ 2;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D207_PRED:
+ case II_D153_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = (ii_weights1d[j * size_scale] * 3 +
+ ii_weights1d[i * size_scale]) >>
+ 2;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D135_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[(i < j ? i : j) * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D45_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale =
+ (ii_weights1d[i * size_scale] + ii_weights1d[j * size_scale]) >>
+ 1;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_TM_PRED:
+ case II_DC_PRED:
+ default:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ comppred[i * compstride + j] = AOM_BLEND_AVG(
+ interpred[i * interstride + j], intrapred[i * intrastride + j]);
+ }
+ }
+ break;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+void av1_build_intra_predictors_for_interintra(MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int plane,
+ BUFFER_SET *ctx, uint8_t *dst,
+ int dst_stride) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
+ PREDICTION_MODE mode =
+ interintra_to_intra_mode[xd->mi[0]->mbmi.interintra_mode];
+
+ av1_predict_intra_block(xd, pd->width, pd->height, plane_bsize, mode,
+ ctx->plane[plane], ctx->stride[plane], dst,
+ dst_stride, 0, 0, plane);
+}
+
+void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ combine_interintra_highbd(
+ xd->mi[0]->mbmi.interintra_mode, xd->mi[0]->mbmi.use_wedge_interintra,
+ xd->mi[0]->mbmi.interintra_wedge_index,
+ xd->mi[0]->mbmi.interintra_wedge_sign, bsize, plane_bsize,
+ xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, inter_pred,
+ inter_stride, intra_pred, intra_stride, xd->bd);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ combine_interintra(xd->mi[0]->mbmi.interintra_mode,
+ xd->mi[0]->mbmi.use_wedge_interintra,
+ xd->mi[0]->mbmi.interintra_wedge_index,
+ xd->mi[0]->mbmi.interintra_wedge_sign, bsize, plane_bsize,
+ xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+ inter_pred, inter_stride, intra_pred, intra_stride);
+}
+
+void av1_build_interintra_predictors_sby(MACROBLOCKD *xd, uint8_t *ypred,
+ int ystride, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(
+ xd, bsize, 0, ctx, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, 0, ypred, ystride,
+ CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ {
+ DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(xd, bsize, 0, ctx, intrapredictor,
+ MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, 0, ypred, ystride, intrapredictor,
+ MAX_SB_SIZE);
+ }
+}
+
+void av1_build_interintra_predictors_sbc(MACROBLOCKD *xd, uint8_t *upred,
+ int ustride, BUFFER_SET *ctx,
+ int plane, BLOCK_SIZE bsize) {
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(
+ xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(uintrapredictor),
+ MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, upred, ustride,
+ CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ {
+ DECLARE_ALIGNED(16, uint8_t, uintrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(xd, bsize, plane, ctx,
+ uintrapredictor, MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, upred, ustride, uintrapredictor,
+ MAX_SB_SIZE);
+ }
+}
+
+void av1_build_interintra_predictors_sbuv(MACROBLOCKD *xd, uint8_t *upred,
+ uint8_t *vpred, int ustride,
+ int vstride, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ av1_build_interintra_predictors_sbc(xd, upred, ustride, ctx, 1, bsize);
+ av1_build_interintra_predictors_sbc(xd, vpred, vstride, ctx, 2, bsize);
+}
+
+void av1_build_interintra_predictors(MACROBLOCKD *xd, uint8_t *ypred,
+ uint8_t *upred, uint8_t *vpred,
+ int ystride, int ustride, int vstride,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+ av1_build_interintra_predictors_sby(xd, ypred, ystride, ctx, bsize);
+ av1_build_interintra_predictors_sbuv(xd, upred, vpred, ustride, vstride, ctx,
+ bsize);
+}
+
+// Builds the inter-predictor for the single ref case
+// for use in the encoder to search the wedges efficiently.
+static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
+ int block, int bw, int bh, int x,
+ int y, int w, int h, int mi_x,
+ int mi_y, int ref,
+ uint8_t *const ext_dst,
+ int ext_dst_stride) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const MODE_INFO *mi = xd->mi[0];
+
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+#if CONFIG_HIGHBITDEPTH
+ uint8_t *const dst =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? CONVERT_TO_BYTEPTR(ext_dst)
+ : ext_dst) +
+ ext_dst_stride * y + x;
+#else
+ uint8_t *const dst = ext_dst + ext_dst_stride * y + x;
+#endif
+ const MV mv = mi->mbmi.sb_type < BLOCK_8X8
+ ? average_split_mvs(pd, mi, ref, block)
+ : mi->mbmi.mv[ref].as_mv;
+
+ // TODO(jkoleszar): This clamping is done in the incorrect place for the
+ // scaling case. It needs to be done on the scaled MV, not the pre-scaling
+ // MV. Note however that it performs the subsampling aware scaling so
+ // that the result is always q4.
+  // The MV precision here is MV_PRECISION_Q4.
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, pd->subsampling_x,
+ pd->subsampling_y);
+
+ uint8_t *pre;
+ MV32 scaled_mv;
+ int xs, ys, subpel_x, subpel_y;
+ const int is_scaled = av1_is_scaled(sf);
+ ConvolveParams conv_params = get_conv_params(0, plane);
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ WarpTypesAllowed warp_types;
+#if CONFIG_GLOBAL_MOTION
+ WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]];
+ warp_types.global_warp_allowed = is_global_mv_block(mi, block, wm->wmtype);
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_WARPED_MOTION
+ warp_types.local_warp_allowed = mi->mbmi.motion_mode == WARPED_CAUSAL;
+#endif // CONFIG_WARPED_MOTION
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+ if (is_scaled) {
+ pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+ scaled_mv = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ xs = sf->x_step_q4;
+ ys = sf->y_step_q4;
+ } else {
+ pre = pre_buf->buf + (y * pre_buf->stride + x);
+ scaled_mv.row = mv_q4.row;
+ scaled_mv.col = mv_q4.col;
+ xs = ys = 16;
+ }
+
+ subpel_x = scaled_mv.col & SUBPEL_MASK;
+ subpel_y = scaled_mv.row & SUBPEL_MASK;
+ pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
+ (scaled_mv.col >> SUBPEL_BITS);
+
+ av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride, subpel_x,
+ subpel_y, sf, w, h, &conv_params,
+ mi->mbmi.interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, (mi_x >> pd->subsampling_x) + x,
+ (mi_y >> pd->subsampling_y) + y, plane, ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif
+ xs, ys, xd);
+}
+
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+ int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3]) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, &xd->plane[plane]);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+
+ if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
+ int x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y)
+ for (x = 0; x < num_4x4_w; ++x)
+ build_inter_predictors_single_buf(
+ xd, plane, y * 2 + x, bw, bh, 4 * x, 4 * y, 4, 4, mi_x, mi_y, ref,
+ ext_dst[plane], ext_dst_stride[plane]);
+ } else {
+ build_inter_predictors_single_buf(xd, plane, 0, bw, bh, 0, 0, bw, bh,
+ mi_x, mi_y, ref, ext_dst[plane],
+ ext_dst_stride[plane]);
+ }
+ }
+}
+
+static void build_wedge_inter_predictor_from_buf(
+ MACROBLOCKD *xd, int plane, int x, int y, int w, int h,
+#if CONFIG_SUPERTX
+ int wedge_offset_x, int wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ uint8_t *ext_dst0, int ext_dst_stride0, uint8_t *ext_dst1,
+ int ext_dst_stride1) {
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int is_compound = has_second_ref(mbmi);
+ MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ const INTERINTER_COMPOUND_DATA comp_data = {
+#if CONFIG_WEDGE
+ mbmi->wedge_index,
+ mbmi->wedge_sign,
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ mbmi->mask_type,
+ xd->seg_mask,
+#endif // CONFIG_COMPOUND_SEGMENT
+ mbmi->interinter_compound_type
+ };
+
+ if (is_compound && is_masked_compound_type(mbmi->interinter_compound_type)) {
+#if CONFIG_COMPOUND_SEGMENT
+ if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) {
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_compound_seg_mask_highbd(
+ comp_data.seg_mask, comp_data.mask_type,
+ CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, mbmi->sb_type, h, w,
+ xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ build_compound_seg_mask(comp_data.seg_mask, comp_data.mask_type,
+ ext_dst0, ext_dst_stride0, ext_dst1,
+ ext_dst_stride1, mbmi->sb_type, h, w);
+ }
+#endif // CONFIG_COMPOUND_SEGMENT
+
+#if CONFIG_SUPERTX
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_masked_compound_wedge_extend_highbd(
+ dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, &comp_data,
+ mbmi->sb_type, wedge_offset_x, wedge_offset_y, h, w, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ build_masked_compound_wedge_extend(
+ dst, dst_buf->stride, ext_dst0, ext_dst_stride0, ext_dst1,
+ ext_dst_stride1, &comp_data, mbmi->sb_type, wedge_offset_x,
+ wedge_offset_y, h, w);
+#else
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_masked_compound_highbd(
+ dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, &comp_data,
+ mbmi->sb_type, h, w, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+ ext_dst1, ext_dst_stride1, &comp_data,
+ mbmi->sb_type, h, w);
+#endif // CONFIG_SUPERTX
+ } else {
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
+ xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
+ 0, NULL, 0, w, h);
+ }
+}
+
+void av1_build_wedge_inter_predictor_from_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to,
+#if CONFIG_SUPERTX
+ int wedge_offset_x, int wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ uint8_t *ext_dst0[3], int ext_dst_stride0[3], uint8_t *ext_dst1[3],
+ int ext_dst_stride1[3]) {
+ int plane;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, &xd->plane[plane]);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+
+ if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
+ int x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y)
+ for (x = 0; x < num_4x4_w; ++x)
+ build_wedge_inter_predictor_from_buf(
+ xd, plane, 4 * x, 4 * y, 4, 4,
+#if CONFIG_SUPERTX
+ wedge_offset_x, wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ ext_dst0[plane], ext_dst_stride0[plane], ext_dst1[plane],
+ ext_dst_stride1[plane]);
+ } else {
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ build_wedge_inter_predictor_from_buf(
+ xd, plane, 0, 0, bw, bh,
+#if CONFIG_SUPERTX
+ wedge_offset_x, wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ ext_dst0[plane], ext_dst_stride0[plane], ext_dst1[plane],
+ ext_dst_stride1[plane]);
+ }
+ }
+}
+#endif // CONFIG_EXT_INTER
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
new file mode 100644
index 000000000..10933a751
--- /dev/null
+++ b/third_party/aom/av1/common/reconinter.h
@@ -0,0 +1,828 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_RECONINTER_H_
+#define AV1_COMMON_RECONINTER_H_
+
+#include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/convolve.h"
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#include "av1/common/warped_motion.h"
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#include "aom/aom_integer.h"
+
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#define WARP_WM_NEIGHBORS_WITH_OBMC 0
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+
+#if CONFIG_MOTION_VAR && CONFIG_GLOBAL_MOTION
+#define WARP_GM_NEIGHBORS_WITH_OBMC 0
+#endif // CONFIG_MOTION_VAR && CONFIG_GLOBAL_MOTION
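+// When set to 1, these allow OBMC neighbor predictions to be built with the
+// neighbor's warped/global motion model; at 0, OBMC neighbors fall back to
+// translational prediction (see allow_warp() below).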
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int subpel_x, const int subpel_y,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ int xs, int ys) {
+#if CONFIG_DUAL_FILTER
+ InterpFilter filter_x = av1_get_plane_interp_filter(
+ interp_filter[1 + 2 * conv_params->ref], conv_params->plane);
+ InterpFilter filter_y = av1_get_plane_interp_filter(
+ interp_filter[0 + 2 * conv_params->ref], conv_params->plane);
+ InterpFilterParams interp_filter_params_x =
+ av1_get_interp_filter_params(filter_x);
+ InterpFilterParams interp_filter_params_y =
+ av1_get_interp_filter_params(filter_y);
+#else
+ InterpFilterParams interp_filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
+
+ assert(sf);
+#if CONFIG_DUAL_FILTER
+ if (interp_filter_params_x.taps == SUBPEL_TAPS &&
+ interp_filter_params_y.taps == SUBPEL_TAPS && w > 2 && h > 2 &&
+ conv_params->round == CONVOLVE_OPT_ROUND && xs == 16 && ys == 16) {
+ const int16_t *kernel_x =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params_x, subpel_x);
+ const int16_t *kernel_y =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
+#else
+ if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2 &&
+ conv_params->round == CONVOLVE_OPT_ROUND && xs == 16 && ys == 16) {
+ const int16_t *kernel_x =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
+ const int16_t *kernel_y =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_y);
+#endif
+ sf->predict[subpel_x != 0][subpel_y != 0][conv_params->ref](
+ src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h);
+ } else {
+// conv_params->ref > 0 means this is the second reference frame; the first
+// reference frame's prediction result is already in dst, so the first and
+// second results need to be averaged.
+#if CONFIG_CONVOLVE_ROUND
+ if (conv_params->round == CONVOLVE_OPT_NO_ROUND && xs == 16 && ys == 16)
+ av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+#if CONFIG_DUAL_FILTER
+ interp_filter,
+#else
+ &interp_filter,
+#endif
+ subpel_x, xs, subpel_y, ys, conv_params);
+ else
+#endif
+ {
+ if (xs == 16 && ys == 16) {
+ av1_convolve(src, src_stride, dst, dst_stride, w, h, interp_filter,
+ subpel_x, xs, subpel_y, ys, conv_params);
+ } else {
+        // If xs != 16 or ys != 16, scaling is happening, and the SSE2
+        // instructions don't support scaling; use the C versions to be safe.
+ av1_convolve_c(src, src_stride, dst, dst_stride, w, h, interp_filter,
+ subpel_x, xs, subpel_y, ys, conv_params);
+ }
+ }
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int subpel_x,
+ const int subpel_y,
+ const struct scale_factors *sf, int w,
+ int h, int ref,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ int xs, int ys, int bd) {
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams interp_filter_params_x =
+ av1_get_interp_filter_params(interp_filter[1 + 2 * ref]);
+ InterpFilterParams interp_filter_params_y =
+ av1_get_interp_filter_params(interp_filter[0 + 2 * ref]);
+#else
+ InterpFilterParams interp_filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
+
+#if CONFIG_DUAL_FILTER
+ if (interp_filter_params_x.taps == SUBPEL_TAPS &&
+ interp_filter_params_y.taps == SUBPEL_TAPS && w > 2 && h > 2) {
+ const int16_t *kernel_x =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params_x, subpel_x);
+ const int16_t *kernel_y =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
+#else
+ if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
+ const int16_t *kernel_x =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
+ const int16_t *kernel_y =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_y);
+#endif // CONFIG_DUAL_FILTER
+ sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
+ src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h, bd);
+ } else {
+    // ref > 0 means this is the second reference frame; the first reference
+    // frame's prediction result is already in dst, so the first and second
+    // results need to be averaged.
+ int avg = ref > 0;
+ av1_highbd_convolve(src, src_stride, dst, dst_stride, w, h, interp_filter,
+ subpel_x, xs, subpel_y, ys, avg, bd);
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_EXT_INTER
+// Set to (1 << 5) if the 32-ary codebooks are used for any block size
+#define MAX_WEDGE_TYPES (1 << 4)
+
+#define MAX_WEDGE_SIZE_LOG2 5 // 32x32
+#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
+#define MAX_WEDGE_SQUARE (MAX_WEDGE_SIZE * MAX_WEDGE_SIZE)
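+// With MAX_WEDGE_SIZE_LOG2 == 5, a wedge mask spans at most 32x32 = 1024
+// samples.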
+
+#define WEDGE_WEIGHT_BITS 6
+
+#define WEDGE_NONE -1
+
+// Angles are measured anti-clockwise from the horizontal.
+typedef enum {
+ WEDGE_HORIZONTAL = 0,
+ WEDGE_VERTICAL = 1,
+ WEDGE_OBLIQUE27 = 2,
+ WEDGE_OBLIQUE63 = 3,
+ WEDGE_OBLIQUE117 = 4,
+ WEDGE_OBLIQUE153 = 5,
+ WEDGE_DIRECTIONS
+} WedgeDirectionType;
+
+// 3-tuple: {direction, x_offset, y_offset}
+typedef struct {
+ WedgeDirectionType direction;
+ int x_offset;
+ int y_offset;
+} wedge_code_type;
+
+typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES];
+
+typedef struct {
+ int bits;
+ const wedge_code_type *codebook;
+ uint8_t *signflip;
+ int smoother;
+ wedge_masks_type *masks;
+} wedge_params_type;
+
+extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES];
+
+static INLINE int is_interinter_compound_used(COMPOUND_TYPE type,
+ BLOCK_SIZE sb_type) {
+ (void)sb_type;
+ switch (type) {
+ case COMPOUND_AVERAGE: return 1;
+#if CONFIG_WEDGE
+ case COMPOUND_WEDGE: return wedge_params_lookup[sb_type].bits > 0;
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ case COMPOUND_SEG: return sb_type >= BLOCK_8X8;
+#endif // CONFIG_COMPOUND_SEGMENT
+ default: assert(0); return 0;
+ }
+}
+
+static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
+ COMPOUND_TYPE comp_type;
+ for (comp_type = 0; comp_type < COMPOUND_TYPES; comp_type++) {
+ if (is_masked_compound_type(comp_type) &&
+ is_interinter_compound_used(comp_type, sb_type))
+ return 1;
+ }
+ return 0;
+}
+
+static INLINE int get_wedge_bits_lookup(BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].bits;
+}
+
+static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) {
+ const int wbits = wedge_params_lookup[sb_type].bits;
+ return (wbits > 0) ? wbits + 1 : 0;
+}
+
+static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) {
+ (void)sb_type;
+ return wedge_params_lookup[sb_type].bits > 0;
+}
+
+static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].bits;
+}
+
+#if CONFIG_COMPOUND_SEGMENT
+void build_compound_seg_mask(uint8_t *mask, SEG_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ BLOCK_SIZE sb_type, int h, int w);
+#if CONFIG_HIGHBITDEPTH
+void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ BLOCK_SIZE sb_type, int h, int w, int bd);
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_COMPOUND_SEGMENT
+#endif // CONFIG_EXT_INTER
+
+void build_inter_predictors(MACROBLOCKD *xd, int plane,
+#if CONFIG_MOTION_VAR
+ int mi_col_offset, int mi_row_offset,
+#endif // CONFIG_MOTION_VAR
+ int block, int bw, int bh, int x, int y, int w,
+ int h,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ int wedge_offset_x, int wedge_offset_y,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ int mi_x, int mi_y);
+
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+// This function determines whether or not to create a warped prediction and,
+// if so, fills final_warp_params with the appropriate motion model. Its
+// behavior changes with different combinations of GLOBAL_MOTION,
+// WARPED_MOTION and MOTION_VAR.
+static INLINE int allow_warp(const MODE_INFO *const mi,
+ const WarpTypesAllowed *const warp_types,
+#if CONFIG_GLOBAL_MOTION
+ const WarpedMotionParams *const gm_params,
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_MOTION_VAR
+ int mi_col_offset, int mi_row_offset,
+#endif // CONFIG_MOTION_VAR
+ WarpedMotionParams *final_warp_params) {
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ set_default_warp_params(final_warp_params);
+
+// Only global motion configured
+#if CONFIG_GLOBAL_MOTION && !CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR
+ (void)mbmi;
+ if (warp_types->global_warp_allowed) {
+ memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
+ return 1;
+ }
+#endif // CONFIG_GLOBAL_MOTION && !CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR
+
+// Only warped motion configured
+#if CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION && !CONFIG_MOTION_VAR
+ if (warp_types->local_warp_allowed) {
+ memcpy(final_warp_params, &mbmi->wm_params[0], sizeof(*final_warp_params));
+ return 1;
+ }
+#endif // CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION && !CONFIG_MOTION_VAR
+
+// Warped and global motion configured
+#if CONFIG_GLOBAL_MOTION && CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR
+ // When both are enabled, warped will take priority. The global parameters
+ // will only be used to compute projection samples to find the warped model.
+ // Note that, if SEPARATE_GLOBAL_MOTION is enabled and a block chooses
+ // global, it will not be possible to select WARPED_CAUSAL.
+ if (warp_types->local_warp_allowed) {
+ memcpy(final_warp_params, &mbmi->wm_params[0], sizeof(*final_warp_params));
+ return 1;
+ } else if (warp_types->global_warp_allowed) {
+ memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
+ return 1;
+ }
+#endif // CONFIG_GLOBAL_MOTION && CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR
+
+// Motion var and global motion configured
+#if CONFIG_GLOBAL_MOTION && CONFIG_MOTION_VAR && !CONFIG_WARPED_MOTION
+ // We warp if either case is true:
+ // 1.) We are predicting a block which uses global motion
+ // 2.) We are predicting a neighboring block of a block using OBMC,
+ // the neighboring block uses global motion, and we have enabled
+ // WARP_GM_NEIGHBORS_WITH_OBMC
+ const int build_for_obmc = !(mi_col_offset == 0 && mi_row_offset == 0);
+ (void)mbmi;
+ if (warp_types->global_warp_allowed &&
+ (WARP_GM_NEIGHBORS_WITH_OBMC || !build_for_obmc)) {
+ memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
+ return 1;
+ }
+#endif // CONFIG_GLOBAL_MOTION && CONFIG_MOTION_VAR && !CONFIG_WARPED_MOTION
+
+// Motion var and warped motion configured
+#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && !CONFIG_GLOBAL_MOTION
+ // We warp if either case is true:
+ // 1.) We are predicting a block with motion mode WARPED_CAUSAL
+ // 2.) We are predicting a neighboring block of a block using OBMC,
+ // the neighboring block has mode WARPED_CAUSAL, and we have enabled
+ // WARP_WM_NEIGHBORS_WITH_OBMC
+ const int build_for_obmc = !(mi_col_offset == 0 && mi_row_offset == 0);
+ if (warp_types->local_warp_allowed) {
+ if ((build_for_obmc && WARP_WM_NEIGHBORS_WITH_OBMC) || (!build_for_obmc)) {
+ memcpy(final_warp_params, &mbmi->wm_params[0],
+ sizeof(*final_warp_params));
+ return 1;
+ }
+ }
+#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && !CONFIG_GLOBAL_MOTION
+
+// Motion var, warped motion and global motion all configured
+#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && CONFIG_GLOBAL_MOTION
+ const int build_for_obmc = !(mi_col_offset == 0 && mi_row_offset == 0);
+ if (warp_types->local_warp_allowed) {
+ if ((build_for_obmc && WARP_WM_NEIGHBORS_WITH_OBMC) || (!build_for_obmc)) {
+ memcpy(final_warp_params, &mbmi->wm_params[0],
+ sizeof(*final_warp_params));
+ return 1;
+ }
+ } else if (warp_types->global_warp_allowed &&
+ (WARP_GM_NEIGHBORS_WITH_OBMC || !build_for_obmc)) {
+ memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
+ return 1;
+ }
+#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && CONFIG_GLOBAL_MOTION
+
+ return 0;
+}
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+static INLINE void av1_make_inter_predictor(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+ const int subpel_x, const int subpel_y, const struct scale_factors *sf,
+ int w, int h, ConvolveParams *conv_params,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ const WarpTypesAllowed *warp_types, int p_col, int p_row, int plane,
+ int ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR
+ int mi_col_offset, int mi_row_offset,
+#endif
+ int xs, int ys, const MACROBLOCKD *xd) {
+ (void)xd;
+
+#if CONFIG_MOTION_VAR
+ const MODE_INFO *mi = xd->mi[mi_col_offset + xd->mi_stride * mi_row_offset];
+#else
+ const MODE_INFO *mi = xd->mi[0];
+ (void)mi;
+#endif // CONFIG_MOTION_VAR
+
+// Make sure the selected motion mode is valid for this configuration
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ assert_motion_mode_valid(mi->mbmi.motion_mode,
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ mi);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
+ WarpedMotionParams final_warp_params;
+ const int do_warp = allow_warp(mi, warp_types,
+#if CONFIG_GLOBAL_MOTION
+ &xd->global_motion[mi->mbmi.ref_frame[ref]],
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_MOTION_VAR
+ mi_col_offset, mi_row_offset,
+#endif // CONFIG_MOTION_VAR
+ &final_warp_params);
+ if (do_warp) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const pre_buf = &pd->pre[ref];
+ av1_warp_plane(&final_warp_params,
+#if CONFIG_HIGHBITDEPTH
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+#endif // CONFIG_HIGHBITDEPTH
+ pre_buf->buf0, pre_buf->width, pre_buf->height,
+ pre_buf->stride, dst, p_col, p_row, w, h, dst_stride,
+ pd->subsampling_x, pd->subsampling_y, xs, ys, ref);
+ return;
+ }
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+ sf, w, h, conv_params->ref, interp_filter, xs, ys,
+ xd->bd);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, sf, w,
+ h, conv_params, interp_filter, xs, ys);
+}
+
+#if CONFIG_EXT_INTER
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+ uint8_t *dst, int dst_stride,
+ const int subpel_x, const int subpel_y,
+ const struct scale_factors *sf, int w,
+ int h,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ int xs, int ys,
+#if CONFIG_SUPERTX
+ int wedge_offset_x, int wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ int plane,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ const WarpTypesAllowed *warp_types,
+ int p_col, int p_row, int ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ MACROBLOCKD *xd);
+#endif // CONFIG_EXT_INTER
+
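+// Rounds value / 4 to the nearest integer, with ties rounded away from zero.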
+static INLINE int round_mv_comp_q4(int value) {
+ return (value < 0 ? value - 2 : value + 2) / 4;
+}
+
+static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
+ MV res = {
+ round_mv_comp_q4(
+ mi->bmi[0].as_mv[idx].as_mv.row + mi->bmi[1].as_mv[idx].as_mv.row +
+ mi->bmi[2].as_mv[idx].as_mv.row + mi->bmi[3].as_mv[idx].as_mv.row),
+ round_mv_comp_q4(
+ mi->bmi[0].as_mv[idx].as_mv.col + mi->bmi[1].as_mv[idx].as_mv.col +
+ mi->bmi[2].as_mv[idx].as_mv.col + mi->bmi[3].as_mv[idx].as_mv.col)
+ };
+ return res;
+}
+
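+// Rounds value / 2 to the nearest integer, with ties rounded away from zero.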
+static INLINE int round_mv_comp_q2(int value) {
+ return (value < 0 ? value - 1 : value + 1) / 2;
+}
+
+static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) {
+ MV res = { round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.row +
+ mi->bmi[block1].as_mv[idx].as_mv.row),
+ round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.col +
+ mi->bmi[block1].as_mv[idx].as_mv.col) };
+ return res;
+}
+
+// TODO(jkoleszar): yet another mv clamping function :-(
+static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
+ const MV *src_mv, int bw, int bh,
+ int ss_x, int ss_y) {
+ // If the MV points so far into the UMV border that no visible pixels
+ // are used for reconstruction, the subpel part of the MV can be
+ // discarded and the MV limited to 16 pixels with equivalent results.
+ const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS;
+ const int spel_right = spel_left - SUBPEL_SHIFTS;
+ const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
+ const int spel_bottom = spel_top - SUBPEL_SHIFTS;
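+  // The (1 << (1 - ss)) factor converts the 1/8-pel luma MV into a 1/16-pel
+  // (q4) offset in this plane's coordinates, accounting for subsampling.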
+ MV clamped_mv = { src_mv->row * (1 << (1 - ss_y)),
+ src_mv->col * (1 << (1 - ss_x)) };
+ assert(ss_x <= 1);
+ assert(ss_y <= 1);
+
+ clamp_mv(&clamped_mv, xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
+ xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
+ xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
+ xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom);
+
+ return clamped_mv;
+}
+
+static INLINE MV average_split_mvs(const struct macroblockd_plane *pd,
+ const MODE_INFO *mi, int ref, int block) {
+ const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0);
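+  // ss_idx: bit 1 = x subsampled, bit 0 = y subsampled. The MVs of the 4x4
+  // sub-blocks covered by one chroma sample are averaged accordingly.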
+ MV res = { 0, 0 };
+ switch (ss_idx) {
+ case 0: res = mi->bmi[block].as_mv[ref].as_mv; break;
+ case 1: res = mi_mv_pred_q2(mi, ref, block, block + 2); break;
+ case 2: res = mi_mv_pred_q2(mi, ref, block, block + 1); break;
+ case 3: res = mi_mv_pred_q4(mi, ref); break;
+ default: assert(ss_idx <= 3 && ss_idx >= 0);
+ }
+ return res;
+}
+
+void av1_build_inter_predictor_sub8x8(MACROBLOCKD *xd, int plane, int i, int ir,
+ int ic, int mi_row, int mi_col);
+
+void av1_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize);
+
+#if CONFIG_SUPERTX
+void av1_build_inter_predictors_sb_sub8x8_extend(MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int block);
+
+void av1_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+struct macroblockd_plane;
+void av1_build_masked_inter_predictor_complex(
+ MACROBLOCKD *xd, uint8_t *dst, int dst_stride, const uint8_t *pre,
+ int pre_stride, int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
+ BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, PARTITION_TYPE partition,
+ int plane);
+#endif // CONFIG_SUPERTX
+
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const MV *src_mv,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ enum mv_precision precision, int x, int y,
+ const MACROBLOCKD *xd);
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_build_inter_predictor(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+ const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ const WarpTypesAllowed *warp_types, int p_col, int p_row,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ int plane, enum mv_precision precision, int x, int y,
+ const MACROBLOCKD *xd);
+#endif
+
+static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
+ const struct scale_factors *sf) {
+ const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset;
+ const int y = sf ? sf->scale_value_y(y_offset, sf) : y_offset;
+ return y * stride + x;
+}
+
+static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize,
+ uint8_t *src, int width, int height,
+ int stride, int mi_row, int mi_col,
+ const struct scale_factors *scale,
+ int subsampling_x, int subsampling_y) {
+#if CONFIG_CHROMA_SUB8X8
+ if (bsize < BLOCK_8X8) {
+    // Offset the buffer pointer: round odd mi positions down to the even
+    // position that owns the chroma samples of this sub-8x8 block.
+ if (subsampling_y && (mi_row & 0x01)) mi_row -= 1;
+ if (subsampling_x && (mi_col & 0x01)) mi_col -= 1;
+ }
+#else
+ (void)bsize;
+#endif
+
+ const int x = (MI_SIZE * mi_col) >> subsampling_x;
+ const int y = (MI_SIZE * mi_row) >> subsampling_y;
+ dst->buf = src + scaled_buffer_offset(x, y, stride, scale);
+ dst->buf0 = src;
+ dst->width = width;
+ dst->height = height;
+ dst->stride = stride;
+}
+
+void av1_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
+ BLOCK_SIZE bsize, const YV12_BUFFER_CONFIG *src,
+ int mi_row, int mi_col);
+
+void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *sf);
+
+// Detect whether the block has sub-pixel-level motion vectors, per
+// component: dir >> 1 selects the reference, dir & 1 selects col vs. row.
+#define CHECK_SUBPEL 0
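+// Note: with CHECK_SUBPEL defined to 0, the function below conservatively
+// returns 1, so callers always behave as if sub-pel MV components are
+// present.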
+static INLINE int has_subpel_mv_component(const MODE_INFO *const mi,
+ const MACROBLOCKD *const xd,
+ int dir) {
+#if CHECK_SUBPEL
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ int plane;
+ int ref = (dir >> 1);
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+ if (dir & 0x01) {
+ if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
+ } else {
+ if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
+ }
+ } else {
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const PARTITION_TYPE bp = BLOCK_8X8 - bsize;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int have_vsplit = bp != PARTITION_HORZ;
+ const int have_hsplit = bp != PARTITION_VERT;
+ const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
+ const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
+
+ int x, y;
+ for (y = 0; y < num_4x4_h; ++y) {
+ for (x = 0; x < num_4x4_w; ++x) {
+ const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
+ if (dir & 0x01) {
+ if (mv.col & SUBPEL_MASK) return 1;
+ } else {
+ if (mv.row & SUBPEL_MASK) return 1;
+ }
+ }
+ }
+ }
+ }
+
+ return 0;
+#else
+ (void)mi;
+ (void)xd;
+ (void)dir;
+ return 1;
+#endif
+}
+
+static INLINE void set_default_interp_filters(
+ MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) {
+#if CONFIG_DUAL_FILTER
+ int dir;
+ for (dir = 0; dir < 4; ++dir)
+ mbmi->interp_filter[dir] = frame_interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : frame_interp_filter;
+#else
+ mbmi->interp_filter = frame_interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
+ : frame_interp_filter;
+#endif // CONFIG_DUAL_FILTER
+}
+
+static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
+ (void)xd;
+#if CONFIG_WARPED_MOTION
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ if (mbmi->motion_mode == WARPED_CAUSAL) return 0;
+#endif // CONFIG_WARPED_MOTION
+#if CONFIG_GLOBAL_MOTION
+ if (is_nontrans_global_motion(xd)) return 0;
+#endif // CONFIG_GLOBAL_MOTION
+ return 1;
+}
+
+static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
+ MODE_INFO *const mi = xd->mi[0];
+ const int is_compound = has_second_ref(&mi->mbmi);
+ int ref;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ int row_col;
+ for (row_col = 0; row_col < 2; ++row_col) {
+ const int dir = (ref << 1) + row_col;
+ if (has_subpel_mv_component(mi, xd, dir)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+#if CONFIG_MOTION_VAR
+const uint8_t *av1_get_obmc_mask(int length);
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col);
+void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *above[MAX_MB_PLANE],
+ int above_stride[MAX_MB_PLANE],
+ uint8_t *left[MAX_MB_PLANE],
+ int left_stride[MAX_MB_PLANE]);
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col);
+#if CONFIG_NCOBMC
+void av1_build_ncobmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col);
+#endif
+#endif // CONFIG_MOTION_VAR
+
+#if CONFIG_EXT_INTER
+#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
+#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
+
+void av1_init_wedge_masks(void);
+
+static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index,
+ int wedge_sign,
+ BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
+}
+
+const uint8_t *av1_get_soft_mask(int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type, int wedge_offset_x,
+ int wedge_offset_y);
+
+const uint8_t *av1_get_compound_type_mask_inverse(
+ const INTERINTER_COMPOUND_DATA *const comp_data,
+#if CONFIG_COMPOUND_SEGMENT
+ uint8_t *mask_buffer, int h, int w, int stride,
+#endif
+ BLOCK_SIZE sb_type);
+
+const uint8_t *av1_get_compound_type_mask(
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
+
+void av1_build_interintra_predictors(MACROBLOCKD *xd, uint8_t *ypred,
+ uint8_t *upred, uint8_t *vpred,
+ int ystride, int ustride, int vstride,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize);
+void av1_build_interintra_predictors_sby(MACROBLOCKD *xd, uint8_t *ypred,
+ int ystride, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+void av1_build_interintra_predictors_sbc(MACROBLOCKD *xd, uint8_t *upred,
+ int ustride, BUFFER_SET *ctx,
+ int plane, BLOCK_SIZE bsize);
+void av1_build_interintra_predictors_sbuv(MACROBLOCKD *xd, uint8_t *upred,
+ uint8_t *vpred, int ustride,
+ int vstride, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_intra_predictors_for_interintra(MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int plane,
+ BUFFER_SET *ctx,
+ uint8_t *intra_pred,
+ int intra_stride);
+void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride);
+
+// Encoder only
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+ int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3]);
+void av1_build_wedge_inter_predictor_from_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to,
+#if CONFIG_SUPERTX
+ int wedge_offset_x, int wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ uint8_t *ext_dst0[3], int ext_dst_stride0[3], uint8_t *ext_dst1[3],
+ int ext_dst_stride1[3]);
+#endif // CONFIG_EXT_INTER
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_RECONINTER_H_
diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c
new file mode 100644
index 000000000..6e0ff52ce
--- /dev/null
+++ b/third_party/aom/av1/common/reconintra.c
@@ -0,0 +1,2467 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/system_state.h"
+
+#if CONFIG_HIGHBITDEPTH
+#include "aom_dsp/aom_dsp_common.h"
+#endif // CONFIG_HIGHBITDEPTH
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_once.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/onyxc_int.h"
+#if CONFIG_CFL
+#include "av1/common/cfl.h"
+#endif
+
+enum {
+ NEED_LEFT = 1 << 1,
+ NEED_ABOVE = 1 << 2,
+ NEED_ABOVERIGHT = 1 << 3,
+ NEED_ABOVELEFT = 1 << 4,
+ NEED_BOTTOMLEFT = 1 << 5,
+};
+
+static const uint8_t extend_modes[INTRA_MODES] = {
+ NEED_ABOVE | NEED_LEFT, // DC
+ NEED_ABOVE, // V
+ NEED_LEFT, // H
+ NEED_ABOVE | NEED_ABOVERIGHT, // D45
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D135
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D117
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D153
+ NEED_LEFT | NEED_BOTTOMLEFT, // D207
+ NEED_ABOVE | NEED_ABOVERIGHT, // D63
+#if CONFIG_ALT_INTRA
+ NEED_LEFT | NEED_ABOVE, // SMOOTH
+#endif // CONFIG_ALT_INTRA
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // TM
+};
+
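+// The orders_* tables record each block position's index in the coding order
+// within a superblock; they are used to test whether a neighboring block
+// (e.g. above-right or bottom-left) has already been coded.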
+static const uint16_t orders_128x128[1] = { 0 };
+static const uint16_t orders_128x64[2] = { 0, 1 };
+static const uint16_t orders_64x128[2] = { 0, 1 };
+static const uint16_t orders_64x64[4] = {
+ 0, 1, 2, 3,
+};
+static const uint16_t orders_64x32[8] = {
+ 0, 2, 1, 3, 4, 6, 5, 7,
+};
+static const uint16_t orders_32x64[8] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+};
+static const uint16_t orders_32x32[16] = {
+ 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15,
+};
+static const uint16_t orders_32x16[32] = {
+ 0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15,
+ 16, 18, 24, 26, 17, 19, 25, 27, 20, 22, 28, 30, 21, 23, 29, 31,
+};
+static const uint16_t orders_16x32[32] = {
+ 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
+ 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31,
+};
+static const uint16_t orders_16x16[64] = {
+ 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23,
+ 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31,
+ 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55,
+ 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63,
+};
+
+#if CONFIG_CB4X4 || CONFIG_EXT_PARTITION
+static const uint16_t orders_16x8[128] = {
+ 0, 2, 8, 10, 32, 34, 40, 42, 1, 3, 9, 11, 33, 35, 41, 43,
+ 4, 6, 12, 14, 36, 38, 44, 46, 5, 7, 13, 15, 37, 39, 45, 47,
+ 16, 18, 24, 26, 48, 50, 56, 58, 17, 19, 25, 27, 49, 51, 57, 59,
+ 20, 22, 28, 30, 52, 54, 60, 62, 21, 23, 29, 31, 53, 55, 61, 63,
+ 64, 66, 72, 74, 96, 98, 104, 106, 65, 67, 73, 75, 97, 99, 105, 107,
+ 68, 70, 76, 78, 100, 102, 108, 110, 69, 71, 77, 79, 101, 103, 109, 111,
+ 80, 82, 88, 90, 112, 114, 120, 122, 81, 83, 89, 91, 113, 115, 121, 123,
+ 84, 86, 92, 94, 116, 118, 124, 126, 85, 87, 93, 95, 117, 119, 125, 127,
+};
+static const uint16_t orders_8x16[128] = {
+ 0, 1, 2, 3, 8, 9, 10, 11, 32, 33, 34, 35, 40, 41, 42, 43,
+ 4, 5, 6, 7, 12, 13, 14, 15, 36, 37, 38, 39, 44, 45, 46, 47,
+ 16, 17, 18, 19, 24, 25, 26, 27, 48, 49, 50, 51, 56, 57, 58, 59,
+ 20, 21, 22, 23, 28, 29, 30, 31, 52, 53, 54, 55, 60, 61, 62, 63,
+ 64, 65, 66, 67, 72, 73, 74, 75, 96, 97, 98, 99, 104, 105, 106, 107,
+ 68, 69, 70, 71, 76, 77, 78, 79, 100, 101, 102, 103, 108, 109, 110, 111,
+ 80, 81, 82, 83, 88, 89, 90, 91, 112, 113, 114, 115, 120, 121, 122, 123,
+ 84, 85, 86, 87, 92, 93, 94, 95, 116, 117, 118, 119, 124, 125, 126, 127,
+};
+static const uint16_t orders_8x8[256] = {
+ 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84,
+ 85, 2, 3, 6, 7, 18, 19, 22, 23, 66, 67, 70, 71, 82, 83,
+ 86, 87, 8, 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88,
+ 89, 92, 93, 10, 11, 14, 15, 26, 27, 30, 31, 74, 75, 78, 79,
+ 90, 91, 94, 95, 32, 33, 36, 37, 48, 49, 52, 53, 96, 97, 100,
+ 101, 112, 113, 116, 117, 34, 35, 38, 39, 50, 51, 54, 55, 98, 99,
+ 102, 103, 114, 115, 118, 119, 40, 41, 44, 45, 56, 57, 60, 61, 104,
+ 105, 108, 109, 120, 121, 124, 125, 42, 43, 46, 47, 58, 59, 62, 63,
+ 106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148,
+ 149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147,
+ 150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152,
+ 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143,
+ 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164,
+ 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163,
+ 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168,
+ 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
+ 170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254,
+ 255,
+};
+
+#if CONFIG_CB4X4 && CONFIG_EXT_PARTITION
+static const uint16_t orders_4x8[512] = {
+ 0, 1, 2, 3, 8, 9, 10, 11, 32, 33, 34, 35, 40, 41, 42,
+ 43, 128, 129, 130, 131, 136, 137, 138, 139, 160, 161, 162, 163, 168, 169,
+ 170, 171, 4, 5, 6, 7, 12, 13, 14, 15, 36, 37, 38, 39, 44,
+ 45, 46, 47, 132, 133, 134, 135, 140, 141, 142, 143, 164, 165, 166, 167,
+ 172, 173, 174, 175, 16, 17, 18, 19, 24, 25, 26, 27, 48, 49, 50,
+ 51, 56, 57, 58, 59, 144, 145, 146, 147, 152, 153, 154, 155, 176, 177,
+ 178, 179, 184, 185, 186, 187, 20, 21, 22, 23, 28, 29, 30, 31, 52,
+ 53, 54, 55, 60, 61, 62, 63, 148, 149, 150, 151, 156, 157, 158, 159,
+ 180, 181, 182, 183, 188, 189, 190, 191, 64, 65, 66, 67, 72, 73, 74,
+ 75, 96, 97, 98, 99, 104, 105, 106, 107, 192, 193, 194, 195, 200, 201,
+ 202, 203, 224, 225, 226, 227, 232, 233, 234, 235, 68, 69, 70, 71, 76,
+ 77, 78, 79, 100, 101, 102, 103, 108, 109, 110, 111, 196, 197, 198, 199,
+ 204, 205, 206, 207, 228, 229, 230, 231, 236, 237, 238, 239, 80, 81, 82,
+ 83, 88, 89, 90, 91, 112, 113, 114, 115, 120, 121, 122, 123, 208, 209,
+ 210, 211, 216, 217, 218, 219, 240, 241, 242, 243, 248, 249, 250, 251, 84,
+ 85, 86, 87, 92, 93, 94, 95, 116, 117, 118, 119, 124, 125, 126, 127,
+ 212, 213, 214, 215, 220, 221, 222, 223, 244, 245, 246, 247, 252, 253, 254,
+ 255, 256, 257, 258, 259, 264, 265, 266, 267, 288, 289, 290, 291, 296, 297,
+ 298, 299, 384, 385, 386, 387, 392, 393, 394, 395, 416, 417, 418, 419, 424,
+ 425, 426, 427, 260, 261, 262, 263, 268, 269, 270, 271, 292, 293, 294, 295,
+ 300, 301, 302, 303, 388, 389, 390, 391, 396, 397, 398, 399, 420, 421, 422,
+ 423, 428, 429, 430, 431, 272, 273, 274, 275, 280, 281, 282, 283, 304, 305,
+ 306, 307, 312, 313, 314, 315, 400, 401, 402, 403, 408, 409, 410, 411, 432,
+ 433, 434, 435, 440, 441, 442, 443, 276, 277, 278, 279, 284, 285, 286, 287,
+ 308, 309, 310, 311, 316, 317, 318, 319, 404, 405, 406, 407, 412, 413, 414,
+ 415, 436, 437, 438, 439, 444, 445, 446, 447, 320, 321, 322, 323, 328, 329,
+ 330, 331, 352, 353, 354, 355, 360, 361, 362, 363, 448, 449, 450, 451, 456,
+ 457, 458, 459, 480, 481, 482, 483, 488, 489, 490, 491, 324, 325, 326, 327,
+ 332, 333, 334, 335, 356, 357, 358, 359, 364, 365, 366, 367, 452, 453, 454,
+ 455, 460, 461, 462, 463, 484, 485, 486, 487, 492, 493, 494, 495, 336, 337,
+ 338, 339, 344, 345, 346, 347, 368, 369, 370, 371, 376, 377, 378, 379, 464,
+ 465, 466, 467, 472, 473, 474, 475, 496, 497, 498, 499, 504, 505, 506, 507,
+ 340, 341, 342, 343, 348, 349, 350, 351, 372, 373, 374, 375, 380, 381, 382,
+ 383, 468, 469, 470, 471, 476, 477, 478, 479, 500, 501, 502, 503, 508, 509,
+ 510, 511,
+};
+
+static const uint16_t orders_8x4[512] = {
+ 0, 2, 8, 10, 32, 34, 40, 42, 128, 130, 136, 138, 160, 162, 168,
+ 170, 1, 3, 9, 11, 33, 35, 41, 43, 129, 131, 137, 139, 161, 163,
+ 169, 171, 4, 6, 12, 14, 36, 38, 44, 46, 132, 134, 140, 142, 164,
+ 166, 172, 174, 5, 7, 13, 15, 37, 39, 45, 47, 133, 135, 141, 143,
+ 165, 167, 173, 175, 16, 18, 24, 26, 48, 50, 56, 58, 144, 146, 152,
+ 154, 176, 178, 184, 186, 17, 19, 25, 27, 49, 51, 57, 59, 145, 147,
+ 153, 155, 177, 179, 185, 187, 20, 22, 28, 30, 52, 54, 60, 62, 148,
+ 150, 156, 158, 180, 182, 188, 190, 21, 23, 29, 31, 53, 55, 61, 63,
+ 149, 151, 157, 159, 181, 183, 189, 191, 64, 66, 72, 74, 96, 98, 104,
+ 106, 192, 194, 200, 202, 224, 226, 232, 234, 65, 67, 73, 75, 97, 99,
+ 105, 107, 193, 195, 201, 203, 225, 227, 233, 235, 68, 70, 76, 78, 100,
+ 102, 108, 110, 196, 198, 204, 206, 228, 230, 236, 238, 69, 71, 77, 79,
+ 101, 103, 109, 111, 197, 199, 205, 207, 229, 231, 237, 239, 80, 82, 88,
+ 90, 112, 114, 120, 122, 208, 210, 216, 218, 240, 242, 248, 250, 81, 83,
+ 89, 91, 113, 115, 121, 123, 209, 211, 217, 219, 241, 243, 249, 251, 84,
+ 86, 92, 94, 116, 118, 124, 126, 212, 214, 220, 222, 244, 246, 252, 254,
+ 85, 87, 93, 95, 117, 119, 125, 127, 213, 215, 221, 223, 245, 247, 253,
+ 255, 256, 258, 264, 266, 288, 290, 296, 298, 384, 386, 392, 394, 416, 418,
+ 424, 426, 257, 259, 265, 267, 289, 291, 297, 299, 385, 387, 393, 395, 417,
+ 419, 425, 427, 260, 262, 268, 270, 292, 294, 300, 302, 388, 390, 396, 398,
+ 420, 422, 428, 430, 261, 263, 269, 271, 293, 295, 301, 303, 389, 391, 397,
+ 399, 421, 423, 429, 431, 272, 274, 280, 282, 304, 306, 312, 314, 400, 402,
+ 408, 410, 432, 434, 440, 442, 273, 275, 281, 283, 305, 307, 313, 315, 401,
+ 403, 409, 411, 433, 435, 441, 443, 276, 278, 284, 286, 308, 310, 316, 318,
+ 404, 406, 412, 414, 436, 438, 444, 446, 277, 279, 285, 287, 309, 311, 317,
+ 319, 405, 407, 413, 415, 437, 439, 445, 447, 320, 322, 328, 330, 352, 354,
+ 360, 362, 448, 450, 456, 458, 480, 482, 488, 490, 321, 323, 329, 331, 353,
+ 355, 361, 363, 449, 451, 457, 459, 481, 483, 489, 491, 324, 326, 332, 334,
+ 356, 358, 364, 366, 452, 454, 460, 462, 484, 486, 492, 494, 325, 327, 333,
+ 335, 357, 359, 365, 367, 453, 455, 461, 463, 485, 487, 493, 495, 336, 338,
+ 344, 346, 368, 370, 376, 378, 464, 466, 472, 474, 496, 498, 504, 506, 337,
+ 339, 345, 347, 369, 371, 377, 379, 465, 467, 473, 475, 497, 499, 505, 507,
+ 340, 342, 348, 350, 372, 374, 380, 382, 468, 470, 476, 478, 500, 502, 508,
+ 510, 341, 343, 349, 351, 373, 375, 381, 383, 469, 471, 477, 479, 501, 503,
+ 509, 511,
+};
+
+static const uint16_t orders_4x4[1024] = {
+ 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80,
+ 81, 84, 85, 256, 257, 260, 261, 272, 273, 276, 277, 320, 321,
+ 324, 325, 336, 337, 340, 341, 2, 3, 6, 7, 18, 19, 22,
+ 23, 66, 67, 70, 71, 82, 83, 86, 87, 258, 259, 262, 263,
+ 274, 275, 278, 279, 322, 323, 326, 327, 338, 339, 342, 343, 8,
+ 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88, 89,
+ 92, 93, 264, 265, 268, 269, 280, 281, 284, 285, 328, 329, 332,
+ 333, 344, 345, 348, 349, 10, 11, 14, 15, 26, 27, 30, 31,
+ 74, 75, 78, 79, 90, 91, 94, 95, 266, 267, 270, 271, 282,
+ 283, 286, 287, 330, 331, 334, 335, 346, 347, 350, 351, 32, 33,
+ 36, 37, 48, 49, 52, 53, 96, 97, 100, 101, 112, 113, 116,
+ 117, 288, 289, 292, 293, 304, 305, 308, 309, 352, 353, 356, 357,
+ 368, 369, 372, 373, 34, 35, 38, 39, 50, 51, 54, 55, 98,
+ 99, 102, 103, 114, 115, 118, 119, 290, 291, 294, 295, 306, 307,
+ 310, 311, 354, 355, 358, 359, 370, 371, 374, 375, 40, 41, 44,
+ 45, 56, 57, 60, 61, 104, 105, 108, 109, 120, 121, 124, 125,
+ 296, 297, 300, 301, 312, 313, 316, 317, 360, 361, 364, 365, 376,
+ 377, 380, 381, 42, 43, 46, 47, 58, 59, 62, 63, 106, 107,
+ 110, 111, 122, 123, 126, 127, 298, 299, 302, 303, 314, 315, 318,
+ 319, 362, 363, 366, 367, 378, 379, 382, 383, 128, 129, 132, 133,
+ 144, 145, 148, 149, 192, 193, 196, 197, 208, 209, 212, 213, 384,
+ 385, 388, 389, 400, 401, 404, 405, 448, 449, 452, 453, 464, 465,
+ 468, 469, 130, 131, 134, 135, 146, 147, 150, 151, 194, 195, 198,
+ 199, 210, 211, 214, 215, 386, 387, 390, 391, 402, 403, 406, 407,
+ 450, 451, 454, 455, 466, 467, 470, 471, 136, 137, 140, 141, 152,
+ 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 392, 393,
+ 396, 397, 408, 409, 412, 413, 456, 457, 460, 461, 472, 473, 476,
+ 477, 138, 139, 142, 143, 154, 155, 158, 159, 202, 203, 206, 207,
+ 218, 219, 222, 223, 394, 395, 398, 399, 410, 411, 414, 415, 458,
+ 459, 462, 463, 474, 475, 478, 479, 160, 161, 164, 165, 176, 177,
+ 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 416, 417, 420,
+ 421, 432, 433, 436, 437, 480, 481, 484, 485, 496, 497, 500, 501,
+ 162, 163, 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242,
+ 243, 246, 247, 418, 419, 422, 423, 434, 435, 438, 439, 482, 483,
+ 486, 487, 498, 499, 502, 503, 168, 169, 172, 173, 184, 185, 188,
+ 189, 232, 233, 236, 237, 248, 249, 252, 253, 424, 425, 428, 429,
+ 440, 441, 444, 445, 488, 489, 492, 493, 504, 505, 508, 509, 170,
+ 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251,
+ 254, 255, 426, 427, 430, 431, 442, 443, 446, 447, 490, 491, 494,
+ 495, 506, 507, 510, 511, 512, 513, 516, 517, 528, 529, 532, 533,
+ 576, 577, 580, 581, 592, 593, 596, 597, 768, 769, 772, 773, 784,
+ 785, 788, 789, 832, 833, 836, 837, 848, 849, 852, 853, 514, 515,
+ 518, 519, 530, 531, 534, 535, 578, 579, 582, 583, 594, 595, 598,
+ 599, 770, 771, 774, 775, 786, 787, 790, 791, 834, 835, 838, 839,
+ 850, 851, 854, 855, 520, 521, 524, 525, 536, 537, 540, 541, 584,
+ 585, 588, 589, 600, 601, 604, 605, 776, 777, 780, 781, 792, 793,
+ 796, 797, 840, 841, 844, 845, 856, 857, 860, 861, 522, 523, 526,
+ 527, 538, 539, 542, 543, 586, 587, 590, 591, 602, 603, 606, 607,
+ 778, 779, 782, 783, 794, 795, 798, 799, 842, 843, 846, 847, 858,
+ 859, 862, 863, 544, 545, 548, 549, 560, 561, 564, 565, 608, 609,
+ 612, 613, 624, 625, 628, 629, 800, 801, 804, 805, 816, 817, 820,
+ 821, 864, 865, 868, 869, 880, 881, 884, 885, 546, 547, 550, 551,
+ 562, 563, 566, 567, 610, 611, 614, 615, 626, 627, 630, 631, 802,
+ 803, 806, 807, 818, 819, 822, 823, 866, 867, 870, 871, 882, 883,
+ 886, 887, 552, 553, 556, 557, 568, 569, 572, 573, 616, 617, 620,
+ 621, 632, 633, 636, 637, 808, 809, 812, 813, 824, 825, 828, 829,
+ 872, 873, 876, 877, 888, 889, 892, 893, 554, 555, 558, 559, 570,
+ 571, 574, 575, 618, 619, 622, 623, 634, 635, 638, 639, 810, 811,
+ 814, 815, 826, 827, 830, 831, 874, 875, 878, 879, 890, 891, 894,
+ 895, 640, 641, 644, 645, 656, 657, 660, 661, 704, 705, 708, 709,
+ 720, 721, 724, 725, 896, 897, 900, 901, 912, 913, 916, 917, 960,
+ 961, 964, 965, 976, 977, 980, 981, 642, 643, 646, 647, 658, 659,
+ 662, 663, 706, 707, 710, 711, 722, 723, 726, 727, 898, 899, 902,
+ 903, 914, 915, 918, 919, 962, 963, 966, 967, 978, 979, 982, 983,
+ 648, 649, 652, 653, 664, 665, 668, 669, 712, 713, 716, 717, 728,
+ 729, 732, 733, 904, 905, 908, 909, 920, 921, 924, 925, 968, 969,
+ 972, 973, 984, 985, 988, 989, 650, 651, 654, 655, 666, 667, 670,
+ 671, 714, 715, 718, 719, 730, 731, 734, 735, 906, 907, 910, 911,
+ 922, 923, 926, 927, 970, 971, 974, 975, 986, 987, 990, 991, 672,
+ 673, 676, 677, 688, 689, 692, 693, 736, 737, 740, 741, 752, 753,
+ 756, 757, 928, 929, 932, 933, 944, 945, 948, 949, 992, 993, 996,
+ 997, 1008, 1009, 1012, 1013, 674, 675, 678, 679, 690, 691, 694, 695,
+ 738, 739, 742, 743, 754, 755, 758, 759, 930, 931, 934, 935, 946,
+ 947, 950, 951, 994, 995, 998, 999, 1010, 1011, 1014, 1015, 680, 681,
+ 684, 685, 696, 697, 700, 701, 744, 745, 748, 749, 760, 761, 764,
+ 765, 936, 937, 940, 941, 952, 953, 956, 957, 1000, 1001, 1004, 1005,
+ 1016, 1017, 1020, 1021, 682, 683, 686, 687, 698, 699, 702, 703, 746,
+ 747, 750, 751, 762, 763, 766, 767, 938, 939, 942, 943, 954, 955,
+ 958, 959, 1002, 1003, 1006, 1007, 1018, 1019, 1022, 1023,
+};
+#endif // CONFIG_CB4X4 && CONFIG_EXT_PARTITION
+#endif // CONFIG_CB4X4 || CONFIG_EXT_PARTITION
+
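+// Each orders_WxH table above stores, for every WxH position in a superblock
+// (row-major), the index at which that block is coded when the superblock is
+// split recursively in Z (quadrant) order. has_top_right() and
+// has_bottom_left() below compare these indices to decide whether a
+// neighboring block has already been reconstructed.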
+#if CONFIG_EXT_PARTITION
+/* clang-format off */
+static const uint16_t *const orders[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2
+ orders_4x4, orders_4x4, orders_4x4,
+ // 4X4
+ orders_4x4,
+ // 4X8, 8X4, 8X8
+ orders_4x8, orders_8x4, orders_8x8,
+#else
+ // 4X4
+ orders_8x8,
+ // 4X8, 8X4, 8X8
+ orders_8x8, orders_8x8, orders_8x8,
+#endif
+ // 8X16, 16X8, 16X16
+ orders_8x16, orders_16x8, orders_16x16,
+ // 16X32, 32X16, 32X32
+ orders_16x32, orders_32x16, orders_32x32,
+ // 32X64, 64X32, 64X64
+ orders_32x64, orders_64x32, orders_64x64,
+ // 64x128, 128x64, 128x128
+ orders_64x128, orders_128x64, orders_128x128
+};
+/* clang-format on */
+#else
+/* clang-format off */
+static const uint16_t *const orders[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2
+ orders_8x8, orders_8x8, orders_8x8,
+ // 4X4
+ orders_8x8,
+ // 4X8, 8X4, 8X8
+ orders_8x16, orders_16x8, orders_16x16,
+#else
+ // 4X4
+ orders_16x16,
+ // 4X8, 8X4, 8X8
+ orders_16x16, orders_16x16, orders_16x16,
+#endif
+ // 8X16, 16X8, 16X16
+ orders_16x32, orders_32x16, orders_32x32,
+ // 16X32, 32X16, 32X32
+ orders_32x64, orders_64x32, orders_64x64,
+ // 32X64, 64X32, 64X64
+ orders_64x128, orders_128x64, orders_128x128
+};
+/* clang-format on */
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_PARTITION_TYPES
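+// Alternative coding orders consulted by has_top_right() when the current
+// partition is PARTITION_VERT_A; duplicated indices mark positions covered
+// by a single undivided block.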
+static const uint16_t orders_verta_64x64[4] = {
+ 0, 2, 1, 2,
+};
+static const uint16_t orders_verta_32x32[16] = {
+ 0, 2, 4, 6, 1, 2, 5, 6, 8, 10, 12, 14, 9, 10, 13, 14,
+};
+static const uint16_t orders_verta_16x16[64] = {
+ 0, 2, 4, 6, 16, 18, 20, 22, 1, 2, 5, 6, 17, 18, 21, 22,
+ 8, 10, 12, 14, 24, 26, 28, 30, 9, 10, 13, 14, 25, 26, 29, 30,
+ 32, 34, 36, 38, 48, 50, 52, 54, 33, 34, 37, 38, 49, 50, 53, 54,
+ 40, 42, 44, 46, 56, 58, 60, 62, 41, 42, 45, 46, 57, 58, 61, 62,
+};
+#if CONFIG_EXT_PARTITION || CONFIG_CB4X4
+static const uint16_t orders_verta_8x8[256] = {
+ 0, 2, 4, 6, 16, 18, 20, 22, 64, 66, 68, 70, 80, 82, 84,
+ 86, 1, 2, 5, 6, 17, 18, 21, 22, 65, 66, 69, 70, 81, 82,
+ 85, 86, 8, 10, 12, 14, 24, 26, 28, 30, 72, 74, 76, 78, 88,
+ 90, 92, 94, 9, 10, 13, 14, 25, 26, 29, 30, 73, 74, 77, 78,
+ 89, 90, 93, 94, 32, 34, 36, 38, 48, 50, 52, 54, 96, 98, 100,
+ 102, 112, 114, 116, 118, 33, 34, 37, 38, 49, 50, 53, 54, 97, 98,
+ 101, 102, 113, 114, 117, 118, 40, 42, 44, 46, 56, 58, 60, 62, 104,
+ 106, 108, 110, 120, 122, 124, 126, 41, 42, 45, 46, 57, 58, 61, 62,
+ 105, 106, 109, 110, 121, 122, 125, 126, 128, 130, 132, 134, 144, 146, 148,
+ 150, 192, 194, 196, 198, 208, 210, 212, 214, 129, 130, 133, 134, 145, 146,
+ 149, 150, 193, 194, 197, 198, 209, 210, 213, 214, 136, 138, 140, 142, 152,
+ 154, 156, 158, 200, 202, 204, 206, 216, 218, 220, 222, 137, 138, 141, 142,
+ 153, 154, 157, 158, 201, 202, 205, 206, 217, 218, 221, 222, 160, 162, 164,
+ 166, 176, 178, 180, 182, 224, 226, 228, 230, 240, 242, 244, 246, 161, 162,
+ 165, 166, 177, 178, 181, 182, 225, 226, 229, 230, 241, 242, 245, 246, 168,
+ 170, 172, 174, 184, 186, 188, 190, 232, 234, 236, 238, 248, 250, 252, 254,
+ 169, 170, 173, 174, 185, 186, 189, 190, 233, 234, 237, 238, 249, 250, 253,
+ 254,
+};
+#endif // CONFIG_EXT_PARTITION || CONFIG_CB4X4
+
+#if CONFIG_EXT_PARTITION
+/* clang-format off */
+static const uint16_t *const orders_verta[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2
+ orders_4x4, orders_4x4, orders_4x4,
+#endif
+ // 4X4
+ orders_verta_8x8,
+ // 4X8, 8X4, 8X8
+ orders_verta_8x8, orders_verta_8x8, orders_verta_8x8,
+ // 8X16, 16X8, 16X16
+ orders_8x16, orders_16x8, orders_verta_16x16,
+ // 16X32, 32X16, 32X32
+ orders_16x32, orders_32x16, orders_verta_32x32,
+ // 32X64, 64X32, 64X64
+ orders_32x64, orders_64x32, orders_verta_64x64,
+ // 64x128, 128x64, 128x128
+ orders_64x128, orders_128x64, orders_128x128
+};
+/* clang-format on */
+#else
+/* clang-format off */
+static const uint16_t *const orders_verta[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ // 2X2, 2X4, 4X2
+ orders_verta_8x8, orders_verta_8x8, orders_verta_8x8,
+ // 4X4
+ orders_verta_8x8,
+ // 4X8, 8X4, 8X8
+ orders_verta_8x8, orders_verta_8x8, orders_verta_16x16,
+#else
+ // 4X4
+ orders_verta_16x16,
+ // 4X8, 8X4, 8X8
+ orders_verta_16x16, orders_verta_16x16, orders_verta_16x16,
+#endif
+ // 8X16, 16X8, 16X16
+ orders_16x32, orders_32x16, orders_verta_32x32,
+ // 16X32, 32X16, 32X32
+ orders_32x64, orders_64x32, orders_verta_64x64,
+ // 32X64, 64X32, 64X64
+ orders_64x128, orders_128x64, orders_128x128
+};
+/* clang-format on */
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_EXT_PARTITION_TYPES
+
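+// Returns 1 if the top-right reference samples of the transform block at
+// (row_off, col_off) are available for intra prediction. For example, in the
+// 16-wide orders_8x8 grid the block at (row 1, col 1) is coded at index 3
+// while its top-right neighbor at (row 0, col 2) is coded at index 4, so its
+// top-right samples are not yet available.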
+static int has_top_right(BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int top_available, int right_available,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif
+ TX_SIZE txsz, int row_off, int col_off, int ss_x) {
+ if (!top_available || !right_available) return 0;
+
+#if !CONFIG_CB4X4
+ // TODO(bshacklett, huisu): Currently the RD loop traverses 4X8 blocks in
+ // inverted N order while in the bitstream the subblocks are stored in Z
+ // order. This discrepancy makes this function incorrect when considering 4X8
+ // blocks in the RD loop, so we disable the extended right edge for these
+ // blocks. The correct solution is to change the bitstream to store these
+ // blocks in inverted N order, and then update this function appropriately.
+ if (bsize == BLOCK_4X8 && row_off == 1) return 0;
+#endif
+
+ const int bw_unit = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1);
+ const int top_right_count_unit = tx_size_wide_unit[txsz];
+
+ // Special handling for block sizes 4x8 and 4x4.
+ if (ss_x == 0 && bw_unit < 2 && col_off == 0) return 1;
+
+  if (row_off > 0) { // Just check whether enough pixels remain on the right.
+ return col_off + top_right_count_unit < plane_bw_unit;
+ } else {
+ // All top-right pixels are in the block above, which is already available.
+ if (col_off + top_right_count_unit < plane_bw_unit) return 1;
+
+ const int bw_in_mi_log2 = mi_width_log2_lookup[bsize];
+ const int bh_in_mi_log2 = mi_height_log2_lookup[bsize];
+ const int blk_row_in_sb = (mi_row & MAX_MIB_MASK) >> bh_in_mi_log2;
+ const int blk_col_in_sb = (mi_col & MAX_MIB_MASK) >> bw_in_mi_log2;
+
+ // Top row of superblock: so top-right pixels are in the top and/or
+ // top-right superblocks, both of which are already available.
+ if (blk_row_in_sb == 0) return 1;
+
+ // Rightmost column of superblock (and not the top row): so top-right pixels
+ // fall in the right superblock, which is not available yet.
+ if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= MAX_MIB_SIZE) return 0;
+
+ // General case (neither top row nor rightmost column): check if the
+ // top-right block is coded before the current block.
+ const uint16_t *const order =
+#if CONFIG_EXT_PARTITION_TYPES
+ (partition == PARTITION_VERT_A) ? orders_verta[bsize] :
+#endif // CONFIG_EXT_PARTITION_TYPES
+ orders[bsize];
+ const int this_blk_index =
+ ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
+ blk_col_in_sb + 0;
+ const uint16_t this_blk_order = order[this_blk_index];
+ const int tr_blk_index =
+ ((blk_row_in_sb - 1) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
+ blk_col_in_sb + 1;
+ const uint16_t tr_blk_order = order[tr_blk_index];
+ return tr_blk_order < this_blk_order;
+ }
+}
+
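+// Returns 1 if the bottom-left reference samples of the transform block at
+// (row_off, col_off) are available, using the coding-order tables above.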
+static int has_bottom_left(BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int bottom_available, int left_available,
+ TX_SIZE txsz, int row_off, int col_off, int ss_y) {
+ if (!bottom_available || !left_available) return 0;
+
+ if (col_off > 0) {
+ // Bottom-left pixels are in the bottom-left block, which is not available.
+ return 0;
+ } else {
+ const int bh_unit = block_size_high[bsize] >> tx_size_high_log2[0];
+ const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1);
+ const int bottom_left_count_unit = tx_size_high_unit[txsz];
+
+#if !CONFIG_CB4X4
+ // Special handling for block sizes 8x4 and 4x4.
+ if (ss_y == 0 && bh_unit < 2 && row_off == 0) return 1;
+#endif
+
+ // All bottom-left pixels are in the left block, which is already available.
+ if (row_off + bottom_left_count_unit < plane_bh_unit) return 1;
+
+ const int bw_in_mi_log2 = mi_width_log2_lookup[bsize];
+ const int bh_in_mi_log2 = mi_height_log2_lookup[bsize];
+ const int blk_row_in_sb = (mi_row & MAX_MIB_MASK) >> bh_in_mi_log2;
+ const int blk_col_in_sb = (mi_col & MAX_MIB_MASK) >> bw_in_mi_log2;
+
+    // Leftmost column of superblock: bottom-left pixels may be in the left
+    // and/or bottom-left superblocks. But only the left superblock is
+    // available, so check if all required pixels fall in that superblock.
+ if (blk_col_in_sb == 0) {
+ const int blk_start_row_off = blk_row_in_sb << (bh_in_mi_log2 + !ss_y);
+ const int row_off_in_sb = blk_start_row_off + row_off;
+ const int sb_height_unit = MAX_MIB_SIZE << !ss_y;
+ return row_off_in_sb + bottom_left_count_unit < sb_height_unit;
+ }
+
+ // Bottom row of superblock (and not the leftmost column): so bottom-left
+ // pixels fall in the bottom superblock, which is not available yet.
+ if (((blk_row_in_sb + 1) << bh_in_mi_log2) >= MAX_MIB_SIZE) return 0;
+
+ // General case (neither leftmost column nor bottom row): check if the
+ // bottom-left block is coded before the current block.
+ const uint16_t *const order = orders[bsize];
+ const int this_blk_index =
+ ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
+ blk_col_in_sb + 0;
+ const uint16_t this_blk_order = order[this_blk_index];
+ const int bl_blk_index =
+ ((blk_row_in_sb + 1) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
+ blk_col_in_sb - 1;
+ const uint16_t bl_blk_order = order[bl_blk_index];
+ return bl_blk_order < this_blk_order;
+ }
+}
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES];
+static intra_pred_fn dc_pred[2][2][TX_SIZES];
+
+#if CONFIG_HIGHBITDEPTH
+typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd);
+static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES];
+static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES];
+#endif // CONFIG_HIGHBITDEPTH
+
+static void av1_init_intra_predictors_internal(void) {
+#if CONFIG_TX64X64
+#define INIT_NO_4X4(p, type) \
+ p[TX_8X8] = aom_##type##_predictor_8x8; \
+ p[TX_16X16] = aom_##type##_predictor_16x16; \
+ p[TX_32X32] = aom_##type##_predictor_32x32; \
+ p[TX_64X64] = aom_##type##_predictor_64x64
+#else
+#define INIT_NO_4X4(p, type) \
+ p[TX_8X8] = aom_##type##_predictor_8x8; \
+ p[TX_16X16] = aom_##type##_predictor_16x16; \
+ p[TX_32X32] = aom_##type##_predictor_32x32
+#endif // CONFIG_TX64X64
+
+#if CONFIG_CB4X4
+#define INIT_ALL_SIZES(p, type) \
+ p[TX_2X2] = aom_##type##_predictor_2x2; \
+ p[TX_4X4] = aom_##type##_predictor_4x4; \
+ INIT_NO_4X4(p, type)
+#else
+#define INIT_ALL_SIZES(p, type) \
+ p[TX_4X4] = aom_##type##_predictor_4x4; \
+ INIT_NO_4X4(p, type)
+#endif
+
+ INIT_ALL_SIZES(pred[V_PRED], v);
+ INIT_ALL_SIZES(pred[H_PRED], h);
+ INIT_ALL_SIZES(pred[D207_PRED], d207e);
+ INIT_ALL_SIZES(pred[D45_PRED], d45e);
+ INIT_ALL_SIZES(pred[D63_PRED], d63e);
+ INIT_ALL_SIZES(pred[D117_PRED], d117);
+ INIT_ALL_SIZES(pred[D135_PRED], d135);
+ INIT_ALL_SIZES(pred[D153_PRED], d153);
+
+#if CONFIG_ALT_INTRA
+ INIT_ALL_SIZES(pred[TM_PRED], paeth);
+ INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth);
+#else
+ INIT_ALL_SIZES(pred[TM_PRED], tm);
+#endif // CONFIG_ALT_INTRA
+
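+  // dc_pred is indexed as [have_left][have_above]; build_intra_predictors()
+  // uses this to pick dc_128 (neither edge), dc_top, dc_left or dc (both).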
+ INIT_ALL_SIZES(dc_pred[0][0], dc_128);
+ INIT_ALL_SIZES(dc_pred[0][1], dc_top);
+ INIT_ALL_SIZES(dc_pred[1][0], dc_left);
+ INIT_ALL_SIZES(dc_pred[1][1], dc);
+
+#if CONFIG_HIGHBITDEPTH
+ INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
+ INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
+ INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207e);
+ INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45e);
+ INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63e);
+ INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117);
+ INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135);
+ INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153);
+
+#if CONFIG_ALT_INTRA
+ INIT_ALL_SIZES(pred_high[TM_PRED], highbd_paeth);
+ INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth);
+#else
+ INIT_ALL_SIZES(pred_high[TM_PRED], highbd_tm);
+#endif // CONFIG_ALT_INTRA
+
+ INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128);
+ INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
+ INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left);
+ INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc);
+#endif // CONFIG_HIGHBITDEPTH
+
+#undef INIT_ALL_SIZES
+#undef INIT_NO_4X4
+}
+
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
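+// Interpolates a reference sample at the sub-pel position base + shift / 256,
+// either bilinearly or with an 8-tap kernel from av1_intra_filter_kernels;
+// kernel tap positions are clamped to [ref_start_idx, ref_end_idx].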
+static int intra_subpel_interp(int base, int shift, const uint8_t *ref,
+ int ref_start_idx, int ref_end_idx,
+ INTRA_FILTER filter_type) {
+ int val, k, idx, filter_idx = 0;
+ const int16_t *filter = NULL;
+
+ if (filter_type == INTRA_FILTER_LINEAR) {
+ val = ref[base] * (256 - shift) + ref[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
+ } else {
+ filter_idx = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+ filter = av1_intra_filter_kernels[filter_type][filter_idx];
+
+ if (filter_idx < (1 << SUBPEL_BITS)) {
+ val = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) {
+ idx = base + 1 - (SUBPEL_TAPS / 2) + k;
+ idx = AOMMAX(AOMMIN(idx, ref_end_idx), ref_start_idx);
+ val += ref[idx] * filter[k];
+ }
+ val = ROUND_POWER_OF_TWO(val, FILTER_BITS);
+ } else {
+ val = ref[base + 1];
+ }
+ }
+
+ return val;
+}
+#endif // CONFIG_INTRA_INTERP
+
+// Directional prediction, zone 1: 0 < angle < 90
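+// The position along the above row is tracked in 8.8 fixed point: for output
+// row r, x = (r + 1) * dx, so base = x >> 8 selects the reference sample and
+// shift = x & 0xFF is the bilinear weight of its right-hand neighbor.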
+static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left,
+#if CONFIG_INTRA_INTERP
+ INTRA_FILTER filter_type,
+#endif // CONFIG_INTRA_INTERP
+ int dx, int dy) {
+ int r, c, x, base, shift, val;
+
+ (void)left;
+ (void)dy;
+ assert(dy == 1);
+ assert(dx > 0);
+
+#if CONFIG_INTRA_INTERP
+ if (filter_type != INTRA_FILTER_LINEAR) {
+ const int pad_size = SUBPEL_TAPS >> 1;
+ int len;
+ DECLARE_ALIGNED(16, uint8_t, buf[SUBPEL_SHIFTS][MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, src[MAX_SB_SIZE + SUBPEL_TAPS]);
+ uint8_t flags[SUBPEL_SHIFTS];
+
+ memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
+ memset(src, above[0], pad_size * sizeof(above[0]));
+ memcpy(src + pad_size, above, 2 * bs * sizeof(above[0]));
+ memset(src + pad_size + 2 * bs, above[2 * bs - 1],
+ pad_size * sizeof(above[0]));
+ flags[0] = 1;
+ x = dx;
+ for (r = 0; r < bs; ++r, dst += stride, x += dx) {
+ base = x >> 8;
+ shift = x & 0xFF;
+ shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+ if (shift == SUBPEL_SHIFTS) {
+ base += 1;
+ shift = 0;
+ }
+ len = AOMMIN(bs, 2 * bs - 1 - base);
+ if (len <= 0) {
+ int i;
+ for (i = r; i < bs; ++i) {
+ memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
+ dst += stride;
+ }
+ return;
+ }
+
+ if (len <= (bs >> 1) && !flags[shift]) {
+ base = x >> 8;
+ shift = x & 0xFF;
+ for (c = 0; c < len; ++c) {
+ val = intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
+ filter_type);
+ dst[c] = clip_pixel(val);
+ ++base;
+ }
+ } else {
+ if (!flags[shift]) {
+ const int16_t *filter = av1_intra_filter_kernels[filter_type][shift];
+ aom_convolve8_horiz(src + pad_size, 2 * bs, buf[shift], 2 * bs,
+ filter, 16, NULL, 16, 2 * bs,
+ 2 * bs < 16 ? 2 : 1);
+ flags[shift] = 1;
+ }
+ memcpy(dst, shift == 0 ? src + pad_size + base : &buf[shift][base],
+ len * sizeof(dst[0]));
+ }
+
+ if (len < bs)
+ memset(dst + len, above[2 * bs - 1], (bs - len) * sizeof(dst[0]));
+ }
+ return;
+ }
+#endif // CONFIG_INTRA_INTERP
+
+ x = dx;
+ for (r = 0; r < bs; ++r, dst += stride, x += dx) {
+ base = x >> 8;
+ shift = x & 0xFF;
+
+ if (base >= 2 * bs - 1) {
+ int i;
+ for (i = r; i < bs; ++i) {
+ memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
+ dst += stride;
+ }
+ return;
+ }
+
+ for (c = 0; c < bs; ++c, ++base) {
+ if (base < 2 * bs - 1) {
+ val = above[base] * (256 - shift) + above[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
+ dst[c] = clip_pixel(val);
+ } else {
+ dst[c] = above[2 * bs - 1];
+ }
+ }
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
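+// Each output sample is projected back along the prediction angle: if the
+// projection lands at or right of the above-left corner (base1 >= -1) it is
+// interpolated from the above row, otherwise from the left column.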
+static void dr_prediction_z2(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left,
+#if CONFIG_INTRA_INTERP
+ INTRA_FILTER filter_type,
+#endif // CONFIG_INTRA_INTERP
+ int dx, int dy) {
+ int r, c, x, y, shift1, shift2, val, base1, base2;
+
+ assert(dx > 0);
+ assert(dy > 0);
+
+ x = -dx;
+ for (r = 0; r < bs; ++r, x -= dx, dst += stride) {
+ base1 = x >> 8;
+ y = (r << 8) - dy;
+ for (c = 0; c < bs; ++c, ++base1, y -= dy) {
+ if (base1 >= -1) {
+ shift1 = x & 0xFF;
+#if CONFIG_INTRA_INTERP
+ val =
+ intra_subpel_interp(base1, shift1, above, -1, bs - 1, filter_type);
+#else
+ val = above[base1] * (256 - shift1) + above[base1 + 1] * shift1;
+ val = ROUND_POWER_OF_TWO(val, 8);
+#endif // CONFIG_INTRA_INTERP
+ } else {
+ base2 = y >> 8;
+ shift2 = y & 0xFF;
+#if CONFIG_INTRA_INTERP
+ val = intra_subpel_interp(base2, shift2, left, -1, bs - 1, filter_type);
+#else
+ val = left[base2] * (256 - shift2) + left[base2 + 1] * shift2;
+ val = ROUND_POWER_OF_TWO(val, 8);
+#endif // CONFIG_INTRA_INTERP
+ }
+ dst[c] = clip_pixel(val);
+ }
+ }
+}
+
+// Directional prediction, zone 3: 180 < angle < 270
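+// Transposed analogue of zone 1: the walk proceeds down the left column one
+// output column at a time, with y = (c + 1) * dy tracked in 8.8 fixed point.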
+static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left,
+#if CONFIG_INTRA_INTERP
+ INTRA_FILTER filter_type,
+#endif // CONFIG_INTRA_INTERP
+ int dx, int dy) {
+ int r, c, y, base, shift, val;
+
+ (void)above;
+ (void)dx;
+
+ assert(dx == 1);
+ assert(dy > 0);
+
+#if CONFIG_INTRA_INTERP
+ if (filter_type != INTRA_FILTER_LINEAR) {
+ const int pad_size = SUBPEL_TAPS >> 1;
+ int len, i;
+ DECLARE_ALIGNED(16, uint8_t, buf[MAX_SB_SIZE][4 * SUBPEL_SHIFTS]);
+ DECLARE_ALIGNED(16, uint8_t, src[(MAX_SB_SIZE + SUBPEL_TAPS) * 4]);
+ uint8_t flags[SUBPEL_SHIFTS];
+
+ memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
+ for (i = 0; i < pad_size; ++i) src[4 * i] = left[0];
+ for (i = 0; i < 2 * bs; ++i) src[4 * (i + pad_size)] = left[i];
+ for (i = 0; i < pad_size; ++i)
+ src[4 * (i + 2 * bs + pad_size)] = left[2 * bs - 1];
+ flags[0] = 1;
+ y = dy;
+ for (c = 0; c < bs; ++c, y += dy) {
+ base = y >> 8;
+ shift = y & 0xFF;
+ shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+ if (shift == SUBPEL_SHIFTS) {
+ base += 1;
+ shift = 0;
+ }
+ len = AOMMIN(bs, 2 * bs - 1 - base);
+
+ if (len <= 0) {
+ for (r = 0; r < bs; ++r) {
+ dst[r * stride + c] = left[2 * bs - 1];
+ }
+ continue;
+ }
+
+ if (len <= (bs >> 1) && !flags[shift]) {
+ base = y >> 8;
+ shift = y & 0xFF;
+ for (r = 0; r < len; ++r) {
+ val = intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
+ filter_type);
+ dst[r * stride + c] = clip_pixel(val);
+ ++base;
+ }
+ } else {
+ if (!flags[shift]) {
+ const int16_t *filter = av1_intra_filter_kernels[filter_type][shift];
+          aom_convolve8_vert(src + 4 * pad_size, 4, buf[0] + 4 * shift,
+                           4 * SUBPEL_SHIFTS, NULL, 16, filter, 16, 4,
+                           2 * bs);
+ flags[shift] = 1;
+ }
+
+ if (shift == 0) {
+ for (r = 0; r < len; ++r) {
+ dst[r * stride + c] = left[r + base];
+ }
+ } else {
+ for (r = 0; r < len; ++r) {
+ dst[r * stride + c] = buf[r + base][4 * shift];
+ }
+ }
+ }
+
+ if (len < bs) {
+ for (r = len; r < bs; ++r) {
+ dst[r * stride + c] = left[2 * bs - 1];
+ }
+ }
+ }
+ return;
+ }
+#endif // CONFIG_INTRA_INTERP
+
+ y = dy;
+ for (c = 0; c < bs; ++c, y += dy) {
+ base = y >> 8;
+ shift = y & 0xFF;
+
+ for (r = 0; r < bs; ++r, ++base) {
+ if (base < 2 * bs - 1) {
+ val = left[base] * (256 - shift) + left[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
+ dst[r * stride + c] = clip_pixel(val);
+ } else {
+ for (; r < bs; ++r) dst[r * stride + c] = left[2 * bs - 1];
+ break;
+ }
+ }
+ }
+}
+
+// Get the shift (up-scaled by 256) in X w.r.t a unit change in Y. The value
+// returned is a magnitude; callers apply the sign. With t = |tan(angle)|:
+// If angle > 0 && angle < 90, dx = (int)(256 / t);
+// If angle > 90 && angle < 180, dx = (int)(256 / t), applied as -dx;
+// If angle > 180 && angle < 270, dx = 1 (unused in this zone).
+static INLINE int get_dx(int angle) {
+ if (angle > 0 && angle < 90) {
+ return dr_intra_derivative[angle];
+ } else if (angle > 90 && angle < 180) {
+ return dr_intra_derivative[180 - angle];
+ } else {
+ // In this case, we are not really going to use dx. We may return any value.
+ return 1;
+ }
+}
+
+// Get the shift (up-scaled by 256) in Y w.r.t a unit change in X. The value
+// returned is a magnitude; callers apply the sign. With t = |tan(angle)|:
+// If angle > 0 && angle < 90, dy = 1 (unused in this zone);
+// If angle > 90 && angle < 180, dy = (int)(256 * t), applied as -dy;
+// If angle > 180 && angle < 270, dy = (int)(256 * t).
+static INLINE int get_dy(int angle) {
+ if (angle > 90 && angle < 180) {
+ return dr_intra_derivative[angle - 90];
+ } else if (angle > 180 && angle < 270) {
+ return dr_intra_derivative[270 - angle];
+ } else {
+ // In this case, we are not really going to use dy. We may return any value.
+ return 1;
+ }
+}
+
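+// Top-level directional predictor: dispatches on the angle zone and falls
+// back to the plain V/H predictors for the exact 90 and 180 degree angles.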
+static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
+ const uint8_t *above, const uint8_t *left,
+#if CONFIG_INTRA_INTERP
+ INTRA_FILTER filter_type,
+#endif // CONFIG_INTRA_INTERP
+ int angle) {
+ const int dx = get_dx(angle);
+ const int dy = get_dy(angle);
+ const int bs = tx_size_wide[tx_size];
+ assert(angle > 0 && angle < 270);
+
+ if (angle > 0 && angle < 90) {
+ dr_prediction_z1(dst, stride, bs, above, left,
+#if CONFIG_INTRA_INTERP
+ filter_type,
+#endif // CONFIG_INTRA_INTERP
+ dx, dy);
+ } else if (angle > 90 && angle < 180) {
+ dr_prediction_z2(dst, stride, bs, above, left,
+#if CONFIG_INTRA_INTERP
+ filter_type,
+#endif // CONFIG_INTRA_INTERP
+ dx, dy);
+ } else if (angle > 180 && angle < 270) {
+ dr_prediction_z3(dst, stride, bs, above, left,
+#if CONFIG_INTRA_INTERP
+ filter_type,
+#endif // CONFIG_INTRA_INTERP
+ dx, dy);
+ } else if (angle == 90) {
+ pred[V_PRED][tx_size](dst, stride, above, left);
+ } else if (angle == 180) {
+ pred[H_PRED][tx_size](dst, stride, above, left);
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+#if CONFIG_INTRA_INTERP
+static int highbd_intra_subpel_interp(int base, int shift, const uint16_t *ref,
+ int ref_start_idx, int ref_end_idx,
+ INTRA_FILTER filter_type) {
+ int val, k, idx, filter_idx = 0;
+ const int16_t *filter = NULL;
+
+ if (filter_type == INTRA_FILTER_LINEAR) {
+ val = ref[base] * (256 - shift) + ref[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
+ } else {
+ filter_idx = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+ filter = av1_intra_filter_kernels[filter_type][filter_idx];
+
+ if (filter_idx < (1 << SUBPEL_BITS)) {
+ val = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) {
+ idx = base + 1 - (SUBPEL_TAPS / 2) + k;
+ idx = AOMMAX(AOMMIN(idx, ref_end_idx), ref_start_idx);
+ val += ref[idx] * filter[k];
+ }
+ val = ROUND_POWER_OF_TWO(val, FILTER_BITS);
+ } else {
+ val = ref[base + 1];
+ }
+ }
+
+ return val;
+}
+#endif // CONFIG_INTRA_INTERP
+
+// Directional prediction, zone 1: 0 < angle < 90
+static void highbd_dr_prediction_z1(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above, const uint16_t *left,
+#if CONFIG_INTRA_INTERP
+ INTRA_FILTER filter_type,
+#endif // CONFIG_INTRA_INTERP
+ int dx, int dy, int bd) {
+ int r, c, x, base, shift, val;
+
+ (void)left;
+ (void)dy;
+ assert(dy == 1);
+ assert(dx > 0);
+
+ x = dx;
+ for (r = 0; r < bs; ++r, dst += stride, x += dx) {
+ base = x >> 8;
+ shift = x & 0xFF;
+
+ if (base >= 2 * bs - 1) {
+ int i;
+ for (i = r; i < bs; ++i) {
+ aom_memset16(dst, above[2 * bs - 1], bs);
+ dst += stride;
+ }
+ return;
+ }
+
+ for (c = 0; c < bs; ++c, ++base) {
+ if (base < 2 * bs - 1) {
+#if CONFIG_INTRA_INTERP
+ val = highbd_intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
+ filter_type);
+#else
+ val = above[base] * (256 - shift) + above[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
+#endif // CONFIG_INTRA_INTERP
+ dst[c] = clip_pixel_highbd(val, bd);
+ } else {
+ dst[c] = above[2 * bs - 1];
+ }
+ }
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+static void highbd_dr_prediction_z2(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above, const uint16_t *left,
+#if CONFIG_INTRA_INTERP
+ INTRA_FILTER filter_type,
+#endif // CONFIG_INTRA_INTERP
+ int dx, int dy, int bd) {
+ int r, c, x, y, shift, val, base;
+
+ assert(dx > 0);
+ assert(dy > 0);
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ y = r + 1;
+ x = (c << 8) - y * dx;
+ base = x >> 8;
+ if (base >= -1) {
+ shift = x & 0xFF;
+#if CONFIG_INTRA_INTERP
+ val = highbd_intra_subpel_interp(base, shift, above, -1, bs - 1,
+ filter_type);
+#else
+ val = above[base] * (256 - shift) + above[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
+#endif // CONFIG_INTRA_INTERP
+ } else {
+ x = c + 1;
+ y = (r << 8) - x * dy;
+ base = y >> 8;
+ shift = y & 0xFF;
+#if CONFIG_INTRA_INTERP
+ val = highbd_intra_subpel_interp(base, shift, left, -1, bs - 1,
+ filter_type);
+#else
+ val = left[base] * (256 - shift) + left[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
+#endif // CONFIG_INTRA_INTERP
+ }
+ dst[c] = clip_pixel_highbd(val, bd);
+ }
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 3: 180 < angle < 270
+static void highbd_dr_prediction_z3(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above, const uint16_t *left,
+#if CONFIG_INTRA_INTERP
+ INTRA_FILTER filter_type,
+#endif // CONFIG_INTRA_INTERP
+ int dx, int dy, int bd) {
+ int r, c, y, base, shift, val;
+
+ (void)above;
+ (void)dx;
+ assert(dx == 1);
+ assert(dy > 0);
+
+ y = dy;
+ for (c = 0; c < bs; ++c, y += dy) {
+ base = y >> 8;
+ shift = y & 0xFF;
+
+ for (r = 0; r < bs; ++r, ++base) {
+ if (base < 2 * bs - 1) {
+#if CONFIG_INTRA_INTERP
+ val = highbd_intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
+ filter_type);
+#else
+ val = left[base] * (256 - shift) + left[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
+#endif // CONFIG_INTRA_INTERP
+ dst[r * stride + c] = clip_pixel_highbd(val, bd);
+ } else {
+ for (; r < bs; ++r) dst[r * stride + c] = left[2 * bs - 1];
+ break;
+ }
+ }
+ }
+}
+
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)left;
+ (void)bd;
+ for (r = 0; r < bs; r++) {
+ memcpy(dst, above, bs * sizeof(uint16_t));
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)above;
+ (void)bd;
+ for (r = 0; r < bs; r++) {
+ aom_memset16(dst, left[r], bs);
+ dst += stride;
+ }
+}
+
+static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above, const uint16_t *left,
+#if CONFIG_INTRA_INTERP
+ INTRA_FILTER filter,
+#endif // CONFIG_INTRA_INTERP
+ int angle, int bd) {
+ const int dx = get_dx(angle);
+ const int dy = get_dy(angle);
+ assert(angle > 0 && angle < 270);
+
+ if (angle > 0 && angle < 90) {
+ highbd_dr_prediction_z1(dst, stride, bs, above, left,
+#if CONFIG_INTRA_INTERP
+ filter,
+#endif // CONFIG_INTRA_INTERP
+ dx, dy, bd);
+ } else if (angle > 90 && angle < 180) {
+ highbd_dr_prediction_z2(dst, stride, bs, above, left,
+#if CONFIG_INTRA_INTERP
+ filter,
+#endif // CONFIG_INTRA_INTERP
+ dx, dy, bd);
+ } else if (angle > 180 && angle < 270) {
+ highbd_dr_prediction_z3(dst, stride, bs, above, left,
+#if CONFIG_INTRA_INTERP
+ filter,
+#endif // CONFIG_INTRA_INTERP
+ dx, dy, bd);
+ } else if (angle == 90) {
+ highbd_v_predictor(dst, stride, bs, above, left, bd);
+ } else if (angle == 180) {
+ highbd_h_predictor(dst, stride, bs, above, left, bd);
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_EXT_INTRA
+
+#if CONFIG_FILTER_INTRA
+#if USE_3TAP_INTRA_FILTER
+int av1_filter_intra_taps_3[TX_SIZES][INTRA_MODES][3] = {
+#if CONFIG_CB4X4
+ {
+ { 697, 836, -509 },
+ { 993, 513, -482 },
+ { 381, 984, -341 },
+ { 642, 1169, -787 },
+ { 590, 553, -119 },
+ { 762, 385, -123 },
+ { 358, 687, -21 },
+ { 411, 1083, -470 },
+ { 912, 814, -702 },
+ { 883, 902, 761 },
+ },
+#endif
+ {
+ { 697, 836, -509 },
+ { 993, 513, -482 },
+ { 381, 984, -341 },
+ { 642, 1169, -787 },
+ { 590, 553, -119 },
+ { 762, 385, -123 },
+ { 358, 687, -21 },
+ { 411, 1083, -470 },
+ { 912, 814, -702 },
+ { 883, 902, 761 },
+ },
+ {
+ { 659, 816, -451 },
+ { 980, 625, -581 },
+ { 558, 962, -496 },
+ { 681, 888, -545 },
+ { 591, 613, 180 },
+ { 778, 399, -153 },
+ { 495, 641, -112 },
+ { 671, 937, -584 },
+ { 745, 940, -661 },
+ { 839, 911, -726 },
+ },
+ {
+ { 539, 927, -442 },
+ { 1003, 714, -693 },
+ { 349, 1271, -596 },
+ { 820, 764, -560 },
+ { 524, 816, -316 },
+ { 780, 681, -437 },
+ { 586, 795, -357 },
+ { 551, 1135, -663 },
+ { 593, 1061, -630 },
+ { 974, 970, -920 },
+ },
+ {
+ { 595, 919, -490 },
+ { 945, 668, -579 },
+ { 495, 962, -433 },
+ { 385, 1551, -912 },
+ { 455, 554, 15 },
+ { 852, 478, -306 },
+ { 177, 760, -87 },
+ { -65, 1611, -522 },
+ { 815, 894, -685 },
+ { 846, 1010, -832 },
+ },
+#if CONFIG_TX64X64
+ {
+ { 595, 919, -490 },
+ { 945, 668, -579 },
+ { 495, 962, -433 },
+ { 385, 1551, -912 },
+ { 455, 554, 15 },
+ { 852, 478, -306 },
+ { 177, 760, -87 },
+ { -65, 1611, -522 },
+ { 815, 894, -685 },
+ { 846, 1010, -832 },
+ },
+#endif // CONFIG_TX64X64
+};
+#else
+int av1_filter_intra_taps_4[TX_SIZES][INTRA_MODES][4] = {
+#if CONFIG_CB4X4
+ {
+ { 735, 881, -537, -54 },
+ { 1005, 519, -488, -11 },
+ { 383, 990, -343, -6 },
+ { 442, 805, -542, 319 },
+ { 658, 616, -133, -116 },
+ { 875, 442, -141, -151 },
+ { 386, 741, -23, -80 },
+ { 390, 1027, -446, 51 },
+ { 679, 606, -523, 262 },
+ { 903, 922, -778, -23 },
+ },
+#endif
+ {
+ { 735, 881, -537, -54 },
+ { 1005, 519, -488, -11 },
+ { 383, 990, -343, -6 },
+ { 442, 805, -542, 319 },
+ { 658, 616, -133, -116 },
+ { 875, 442, -141, -151 },
+ { 386, 741, -23, -80 },
+ { 390, 1027, -446, 51 },
+ { 679, 606, -523, 262 },
+ { 903, 922, -778, -23 },
+ },
+ {
+ { 648, 803, -444, 16 },
+ { 972, 620, -576, 7 },
+ { 561, 967, -499, -5 },
+ { 585, 762, -468, 144 },
+ { 596, 619, -182, -9 },
+ { 895, 459, -176, -153 },
+ { 557, 722, -126, -129 },
+ { 601, 839, -523, 105 },
+ { 562, 709, -499, 251 },
+ { 803, 872, -695, 43 },
+ },
+ {
+ { 423, 728, -347, 111 },
+ { 963, 685, -665, 23 },
+ { 281, 1024, -480, 216 },
+ { 640, 596, -437, 78 },
+ { 429, 669, -259, 99 },
+ { 740, 646, -415, 23 },
+ { 568, 771, -346, 40 },
+ { 404, 833, -486, 209 },
+ { 398, 712, -423, 307 },
+ { 939, 935, -887, 17 },
+ },
+ {
+ { 477, 737, -393, 150 },
+ { 881, 630, -546, 67 },
+ { 506, 984, -443, -20 },
+ { 114, 459, -270, 528 },
+ { 433, 528, 14, 3 },
+ { 837, 470, -301, -30 },
+ { 181, 777, 89, -107 },
+ { -29, 716, -232, 259 },
+ { 589, 646, -495, 255 },
+ { 740, 884, -728, 77 },
+ },
+#if CONFIG_TX64X64
+ {
+ { 477, 737, -393, 150 },
+ { 881, 630, -546, 67 },
+ { 506, 984, -443, -20 },
+ { 114, 459, -270, 528 },
+ { 433, 528, 14, 3 },
+ { 837, 470, -301, -30 },
+ { 181, 777, 89, -107 },
+ { -29, 716, -232, 259 },
+ { 589, 646, -495, 255 },
+ { 740, 884, -728, 77 },
+ },
+#endif // CONFIG_TX64X64
+};
+#endif // USE_3TAP_INTRA_FILTER
+
+static INLINE TX_SIZE get_txsize_from_blocklen(int bs) {
+ switch (bs) {
+ case 4: return TX_4X4;
+ case 8: return TX_8X8;
+ case 16: return TX_16X16;
+ case 32: return TX_32X32;
+#if CONFIG_TX64X64
+ case 64: return TX_64X64;
+#endif // CONFIG_TX64X64
+ default: assert(0); return TX_INVALID;
+ }
+}
+
+#if USE_3TAP_INTRA_FILTER
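+// Recursive filter intra predictor (3-tap): border samples are mean-removed,
+// then each sample is predicted from its decoded neighbors as
+//   p[r][c] = c0 * p[r - 1][c] + c1 * p[r][c - 1] + c2 * p[r - 1][c - 1],
+// rounded by FILTER_INTRA_PREC_BITS, with the mean added back at the end.
+// The above row is read starting at index -1 (the above-left sample).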
+static void filter_intra_predictors_3tap(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left, int mode) {
+ int k, r, c;
+ int mean, ipred;
+#if CONFIG_TX64X64
+ int buffer[65][65];
+#else
+ int buffer[33][33];
+#endif // CONFIG_TX64X64
+ const TX_SIZE tx_size = get_txsize_from_blocklen(bs);
+ const int c0 = av1_filter_intra_taps_3[tx_size][mode][0];
+ const int c1 = av1_filter_intra_taps_3[tx_size][mode][1];
+ const int c2 = av1_filter_intra_taps_3[tx_size][mode][2];
+
+ k = 0;
+ mean = 0;
+ while (k < bs) {
+ mean = mean + (int)left[k];
+ mean = mean + (int)above[k];
+ k++;
+ }
+ mean = (mean + bs) / (2 * bs);
+
+ for (r = 0; r < bs; ++r) buffer[r + 1][0] = (int)left[r] - mean;
+
+ for (c = 0; c < bs + 1; ++c) buffer[0][c] = (int)above[c - 1] - mean;
+
+ for (r = 1; r < bs + 1; ++r)
+ for (c = 1; c < bs + 1; ++c) {
+ ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] +
+ c2 * buffer[r - 1][c - 1];
+ buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
+ }
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ ipred = buffer[r + 1][c + 1] + mean;
+ dst[c] = clip_pixel(ipred);
+ }
+ dst += stride;
+ }
+}
+#else
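+// 4-tap variant: additionally taps the above-right neighbor p[r - 1][c + 1],
+// so the above row and working buffer span 2 * bs + 1 columns and each row of
+// the recursion is computed one column shorter than the row above it.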
+static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left, int mode) {
+ int k, r, c;
+ int mean, ipred;
+#if CONFIG_TX64X64
+ int buffer[65][129];
+#else
+ int buffer[33][65];
+#endif // CONFIG_TX64X64
+ const TX_SIZE tx_size = get_txsize_from_blocklen(bs);
+ const int c0 = av1_filter_intra_taps_4[tx_size][mode][0];
+ const int c1 = av1_filter_intra_taps_4[tx_size][mode][1];
+ const int c2 = av1_filter_intra_taps_4[tx_size][mode][2];
+ const int c3 = av1_filter_intra_taps_4[tx_size][mode][3];
+
+ k = 0;
+ mean = 0;
+ while (k < bs) {
+ mean = mean + (int)left[k];
+ mean = mean + (int)above[k];
+ k++;
+ }
+ mean = (mean + bs) / (2 * bs);
+
+ for (r = 0; r < bs; ++r) buffer[r + 1][0] = (int)left[r] - mean;
+
+ for (c = 0; c < 2 * bs + 1; ++c) buffer[0][c] = (int)above[c - 1] - mean;
+
+ for (r = 1; r < bs + 1; ++r)
+ for (c = 1; c < 2 * bs + 1 - r; ++c) {
+ ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] +
+ c2 * buffer[r - 1][c - 1] + c3 * buffer[r - 1][c + 1];
+ buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
+ }
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ ipred = buffer[r + 1][c + 1] + mean;
+ dst[c] = clip_pixel(ipred);
+ }
+ dst += stride;
+ }
+}
+#endif // USE_3TAP_INTRA_FILTER
+
+void av1_dc_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+#if USE_3TAP_INTRA_FILTER
+ filter_intra_predictors_3tap(dst, stride, bs, above, left, DC_PRED);
+#else
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED);
+#endif
+}
+
+void av1_v_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+#if USE_3TAP_INTRA_FILTER
+ filter_intra_predictors_3tap(dst, stride, bs, above, left, V_PRED);
+#else
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED);
+#endif
+}
+
+void av1_h_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+#if USE_3TAP_INTRA_FILTER
+ filter_intra_predictors_3tap(dst, stride, bs, above, left, H_PRED);
+#else
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED);
+#endif
+}
+
+void av1_d45_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+#if USE_3TAP_INTRA_FILTER
+ filter_intra_predictors_3tap(dst, stride, bs, above, left, D45_PRED);
+#else
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED);
+#endif
+}
+
+void av1_d135_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+#if USE_3TAP_INTRA_FILTER
+ filter_intra_predictors_3tap(dst, stride, bs, above, left, D135_PRED);
+#else
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED);
+#endif
+}
+
+void av1_d117_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+#if USE_3TAP_INTRA_FILTER
+ filter_intra_predictors_3tap(dst, stride, bs, above, left, D117_PRED);
+#else
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED);
+#endif
+}
+
+void av1_d153_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+#if USE_3TAP_INTRA_FILTER
+ filter_intra_predictors_3tap(dst, stride, bs, above, left, D153_PRED);
+#else
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED);
+#endif
+}
+
+void av1_d207_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+#if USE_3TAP_INTRA_FILTER
+ filter_intra_predictors_3tap(dst, stride, bs, above, left, D207_PRED);
+#else
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED);
+#endif
+}
+
+void av1_d63_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+#if USE_3TAP_INTRA_FILTER
+ filter_intra_predictors_3tap(dst, stride, bs, above, left, D63_PRED);
+#else
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED);
+#endif
+}
+
+void av1_tm_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+#if USE_3TAP_INTRA_FILTER
+ filter_intra_predictors_3tap(dst, stride, bs, above, left, TM_PRED);
+#else
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED);
+#endif
+}
+
+static void filter_intra_predictors(FILTER_INTRA_MODE mode, uint8_t *dst,
+ ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ switch (mode) {
+ case FILTER_DC_PRED:
+ av1_dc_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case FILTER_V_PRED:
+ av1_v_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case FILTER_H_PRED:
+ av1_h_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case FILTER_D45_PRED:
+ av1_d45_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case FILTER_D135_PRED:
+ av1_d135_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case FILTER_D117_PRED:
+ av1_d117_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case FILTER_D153_PRED:
+ av1_d153_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case FILTER_D207_PRED:
+ av1_d207_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case FILTER_D63_PRED:
+ av1_d63_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case FILTER_TM_PRED:
+ av1_tm_filter_predictor(dst, stride, bs, above, left);
+ break;
+ default: assert(0);
+ }
+}
+#if CONFIG_HIGHBITDEPTH
+#if USE_3TAP_INTRA_FILTER
+static void highbd_filter_intra_predictors_3tap(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int mode,
+ int bd) {
+ int k, r, c;
+ int mean, ipred;
+#if CONFIG_TX64X64
+ int preds[65][65];
+#else
+ int preds[33][33];
+#endif // CONFIG_TX64X64
+ const TX_SIZE tx_size = get_txsize_from_blocklen(bs);
+ const int c0 = av1_filter_intra_taps_3[tx_size][mode][0];
+ const int c1 = av1_filter_intra_taps_3[tx_size][mode][1];
+ const int c2 = av1_filter_intra_taps_3[tx_size][mode][2];
+
+ k = 0;
+ mean = 0;
+ while (k < bs) {
+ mean = mean + (int)left[k];
+ mean = mean + (int)above[k];
+ k++;
+ }
+ mean = (mean + bs) / (2 * bs);
+
+ for (r = 0; r < bs; ++r) preds[r + 1][0] = (int)left[r] - mean;
+
+ for (c = 0; c < bs + 1; ++c) preds[0][c] = (int)above[c - 1] - mean;
+
+ for (r = 1; r < bs + 1; ++r)
+ for (c = 1; c < bs + 1; ++c) {
+ ipred = c0 * preds[r - 1][c] + c1 * preds[r][c - 1] +
+ c2 * preds[r - 1][c - 1];
+ preds[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
+ }
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ ipred = preds[r + 1][c + 1] + mean;
+ dst[c] = clip_pixel_highbd(ipred, bd);
+ }
+ dst += stride;
+ }
+}
+#else
+static void highbd_filter_intra_predictors_4tap(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int mode,
+ int bd) {
+ int k, r, c;
+ int mean, ipred;
+#if CONFIG_TX64X64
+ int preds[65][129];
+#else
+ int preds[33][65];
+#endif // CONFIG_TX64X64
+ const TX_SIZE tx_size = get_txsize_from_blocklen(bs);
+ const int c0 = av1_filter_intra_taps_4[tx_size][mode][0];
+ const int c1 = av1_filter_intra_taps_4[tx_size][mode][1];
+ const int c2 = av1_filter_intra_taps_4[tx_size][mode][2];
+ const int c3 = av1_filter_intra_taps_4[tx_size][mode][3];
+
+ k = 0;
+ mean = 0;
+ while (k < bs) {
+ mean = mean + (int)left[k];
+ mean = mean + (int)above[k];
+ k++;
+ }
+ mean = (mean + bs) / (2 * bs);
+
+ for (r = 0; r < bs; ++r) preds[r + 1][0] = (int)left[r] - mean;
+
+ for (c = 0; c < 2 * bs + 1; ++c) preds[0][c] = (int)above[c - 1] - mean;
+
+ for (r = 1; r < bs + 1; ++r)
+ for (c = 1; c < 2 * bs + 1 - r; ++c) {
+ ipred = c0 * preds[r - 1][c] + c1 * preds[r][c - 1] +
+ c2 * preds[r - 1][c - 1] + c3 * preds[r - 1][c + 1];
+ preds[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
+ }
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ ipred = preds[r + 1][c + 1] + mean;
+ dst[c] = clip_pixel_highbd(ipred, bd);
+ }
+ dst += stride;
+ }
+}
+#endif // USE_3TAP_INTRA_FILTER
+
+void av1_highbd_dc_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+#if USE_3TAP_INTRA_FILTER
+ highbd_filter_intra_predictors_3tap(dst, stride, bs, above, left, DC_PRED,
+ bd);
+#else
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED,
+ bd);
+#endif
+}
+
+void av1_highbd_v_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+#if USE_3TAP_INTRA_FILTER
+ highbd_filter_intra_predictors_3tap(dst, stride, bs, above, left, V_PRED, bd);
+#else
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED, bd);
+#endif
+}
+
+void av1_highbd_h_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+#if USE_3TAP_INTRA_FILTER
+ highbd_filter_intra_predictors_3tap(dst, stride, bs, above, left, H_PRED, bd);
+#else
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED, bd);
+#endif
+}
+
+void av1_highbd_d45_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+#if USE_3TAP_INTRA_FILTER
+ highbd_filter_intra_predictors_3tap(dst, stride, bs, above, left, D45_PRED,
+ bd);
+#else
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED,
+ bd);
+#endif
+}
+
+void av1_highbd_d135_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+#if USE_3TAP_INTRA_FILTER
+ highbd_filter_intra_predictors_3tap(dst, stride, bs, above, left, D135_PRED,
+ bd);
+#else
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED,
+ bd);
+#endif
+}
+
+void av1_highbd_d117_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+#if USE_3TAP_INTRA_FILTER
+ highbd_filter_intra_predictors_3tap(dst, stride, bs, above, left, D117_PRED,
+ bd);
+#else
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED,
+ bd);
+#endif
+}
+
+void av1_highbd_d153_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+#if USE_3TAP_INTRA_FILTER
+ highbd_filter_intra_predictors_3tap(dst, stride, bs, above, left, D153_PRED,
+ bd);
+#else
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED,
+ bd);
+#endif
+}
+
+void av1_highbd_d207_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+#if USE_3TAP_INTRA_FILTER
+ highbd_filter_intra_predictors_3tap(dst, stride, bs, above, left, D207_PRED,
+ bd);
+#else
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED,
+ bd);
+#endif
+}
+
+void av1_highbd_d63_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+#if USE_3TAP_INTRA_FILTER
+ highbd_filter_intra_predictors_3tap(dst, stride, bs, above, left, D63_PRED,
+ bd);
+#else
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED,
+ bd);
+#endif
+}
+
+void av1_highbd_tm_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+#if USE_3TAP_INTRA_FILTER
+ highbd_filter_intra_predictors_3tap(dst, stride, bs, above, left, TM_PRED,
+ bd);
+#else
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED,
+ bd);
+#endif
+}
+
+static void highbd_filter_intra_predictors(FILTER_INTRA_MODE mode,
+ uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ switch (mode) {
+ case FILTER_DC_PRED:
+ av1_highbd_dc_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case FILTER_V_PRED:
+ av1_highbd_v_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case FILTER_H_PRED:
+ av1_highbd_h_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case FILTER_D45_PRED:
+ av1_highbd_d45_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case FILTER_D135_PRED:
+ av1_highbd_d135_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case FILTER_D117_PRED:
+ av1_highbd_d117_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case FILTER_D153_PRED:
+ av1_highbd_d153_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case FILTER_D207_PRED:
+ av1_highbd_d207_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case FILTER_D63_PRED:
+ av1_highbd_d63_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case FILTER_TM_PRED:
+ av1_highbd_tm_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ default: assert(0);
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_HIGHBITDEPTH
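+// Builds padded above/left reference arrays for one high-bitdepth transform
+// block and dispatches to the selected predictor. n_top_px, n_topright_px,
+// n_left_px and n_bottomleft_px give the number of valid reference pixels in
+// each direction; missing pixels are extended from the last valid one or
+// filled with the base +/- 1 constants shown in the diagram below.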
+static void build_intra_predictors_high(
+ const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8,
+ int dst_stride, PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px,
+ int n_topright_px, int n_left_px, int n_bottomleft_px, int plane) {
+ int i;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ DECLARE_ALIGNED(16, uint16_t, left_data[MAX_TX_SIZE * 2 + 16]);
+ DECLARE_ALIGNED(16, uint16_t, above_data[MAX_TX_SIZE * 2 + 16]);
+ uint16_t *above_row = above_data + 16;
+ uint16_t *left_col = left_data + 16;
+ const uint16_t *const_above_row = above_row;
+ const int bs = tx_size_wide[tx_size];
+ int need_left = extend_modes[mode] & NEED_LEFT;
+ int need_above = extend_modes[mode] & NEED_ABOVE;
+ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+ const uint16_t *above_ref = ref - ref_stride;
+#if CONFIG_EXT_INTRA
+ int p_angle = 0;
+ const int is_dr_mode = av1_is_directional_mode(mode, xd->mi[0]->mbmi.sb_type);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ const FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
+ &xd->mi[0]->mbmi.filter_intra_mode_info;
+ const FILTER_INTRA_MODE filter_intra_mode =
+ filter_intra_mode_info->filter_intra_mode[plane != 0];
+#endif // CONFIG_FILTER_INTRA
+ int base = 128 << (xd->bd - 8);
+ assert(tx_size_wide[tx_size] == tx_size_high[tx_size]);
+
+ // base-1 base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
+ // base+1 A B .. Y Z
+ // base+1 C D .. W X
+ // base+1 E F .. U V
+ // base+1 G H .. S T T T T T
+ aom_memset16(left_data, base + 1, sizeof(left_data) / sizeof(*left_data));
+
+#if CONFIG_EXT_INTRA
+ if (is_dr_mode) {
+ p_angle = mode_to_angle_map[mode] +
+ xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
+ if (p_angle <= 90)
+ need_above = 1, need_left = 0, need_above_left = 1;
+ else if (p_angle < 180)
+ need_above = 1, need_left = 1, need_above_left = 1;
+ else
+ need_above = 0, need_left = 1, need_above_left = 1;
+ }
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_left = need_above = need_above_left = 1;
+#endif // CONFIG_FILTER_INTRA
+
+ (void)plane;
+ assert(n_top_px >= 0);
+ assert(n_topright_px >= 0);
+ assert(n_left_px >= 0);
+ assert(n_bottomleft_px >= 0);
+
+ if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+ const int val = need_left ? base + 1 : base - 1;
+ for (i = 0; i < bs; ++i) {
+ aom_memset16(dst, val, bs);
+ dst += dst_stride;
+ }
+ return;
+ }
+
+ // NEED_LEFT
+ if (need_left) {
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_bottom = 0;
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
+ if (is_dr_mode) need_bottom = p_angle > 180;
+#endif // CONFIG_EXT_INTRA
+#else
+ const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ i = 0;
+ if (n_left_px > 0) {
+ for (; i < n_left_px; i++) left_col[i] = ref[i * ref_stride - 1];
+ if (need_bottom && n_bottomleft_px > 0) {
+ assert(i == bs);
+ for (; i < bs + n_bottomleft_px; i++)
+ left_col[i] = ref[i * ref_stride - 1];
+ }
+ if (i < (bs << need_bottom))
+ aom_memset16(&left_col[i], left_col[i - 1], (bs << need_bottom) - i);
+ } else {
+ aom_memset16(left_col, base + 1, bs << need_bottom);
+ }
+ }
+
+ // NEED_ABOVE
+ if (need_above) {
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_right = 1;
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
+ if (is_dr_mode) need_right = p_angle < 90;
+#endif // CONFIG_EXT_INTRA
+#else
+ const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ if (n_top_px > 0) {
+ memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
+ i = n_top_px;
+ if (need_right && n_topright_px > 0) {
+ assert(n_top_px == bs);
+ memcpy(above_row + bs, above_ref + bs,
+ n_topright_px * sizeof(above_ref[0]));
+ i += n_topright_px;
+ }
+ if (i < (bs << need_right))
+ aom_memset16(&above_row[i], above_row[i - 1], (bs << need_right) - i);
+ } else {
+ aom_memset16(above_row, base - 1, bs << need_right);
+ }
+ }
+
+ if (need_above_left) {
+ above_row[-1] =
+ n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
+ left_col[-1] = above_row[-1];
+ }
+
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) {
+ highbd_filter_intra_predictors(filter_intra_mode, dst, dst_stride, bs,
+ const_above_row, left_col, xd->bd);
+ return;
+ }
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+ if (is_dr_mode) {
+#if CONFIG_INTRA_INTERP
+ INTRA_FILTER filter = INTRA_FILTER_LINEAR;
+ if (plane == 0 && av1_is_intra_filter_switchable(p_angle))
+ filter = xd->mi[0]->mbmi.intra_filter;
+#endif // CONFIG_INTRA_INTERP
+ highbd_dr_predictor(dst, dst_stride, bs, const_above_row, left_col,
+#if CONFIG_INTRA_INTERP
+ filter,
+#endif // CONFIG_INTRA_INTERP
+ p_angle, xd->bd);
+ return;
+ }
+#endif // CONFIG_EXT_INTRA
+
+ // predict
+ if (mode == DC_PRED) {
+ dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
+ dst, dst_stride, const_above_row, left_col, xd->bd);
+ } else {
+ pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col,
+ xd->bd);
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
+ int ref_stride, uint8_t *dst, int dst_stride,
+ PREDICTION_MODE mode, TX_SIZE tx_size,
+ int n_top_px, int n_topright_px,
+ int n_left_px, int n_bottomleft_px,
+ int plane) {
+ int i;
+ const uint8_t *above_ref = ref - ref_stride;
+ DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 16]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 16]);
+ uint8_t *above_row = above_data + 16;
+ uint8_t *left_col = left_data + 16;
+ const uint8_t *const_above_row = above_row;
+ const int bs = tx_size_wide[tx_size];
+ int need_left = extend_modes[mode] & NEED_LEFT;
+ int need_above = extend_modes[mode] & NEED_ABOVE;
+ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+#if CONFIG_EXT_INTRA
+ int p_angle = 0;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int is_dr_mode = av1_is_directional_mode(mode, mbmi->sb_type);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ const FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
+ &xd->mi[0]->mbmi.filter_intra_mode_info;
+ const FILTER_INTRA_MODE filter_intra_mode =
+ filter_intra_mode_info->filter_intra_mode[plane != 0];
+#endif // CONFIG_FILTER_INTRA
+ assert(tx_size_wide[tx_size] == tx_size_high[tx_size]);
+
+ // 127 127 127 .. 127 127 127 127 127 127
+ // 129 A B .. Y Z
+ // 129 C D .. W X
+ // 129 E F .. U V
+ // 129 G H .. S T T T T T
+ // ..
+ memset(left_data, 129, sizeof(left_data));
+
+#if CONFIG_EXT_INTRA
+ if (is_dr_mode) {
+ p_angle = mode_to_angle_map[mode] +
+ xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
+ if (p_angle <= 90)
+ need_above = 1, need_left = 0, need_above_left = 1;
+ else if (p_angle < 180)
+ need_above = 1, need_left = 1, need_above_left = 1;
+ else
+ need_above = 0, need_left = 1, need_above_left = 1;
+ }
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_left = need_above = need_above_left = 1;
+#endif // CONFIG_FILTER_INTRA
+
+ (void)xd;
+ (void)plane;
+ assert(n_top_px >= 0);
+ assert(n_topright_px >= 0);
+ assert(n_left_px >= 0);
+ assert(n_bottomleft_px >= 0);
+
+ if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+ const int val = need_left ? 129 : 127;
+ for (i = 0; i < bs; ++i) {
+ memset(dst, val, bs);
+ dst += dst_stride;
+ }
+ return;
+ }
+
+ // NEED_LEFT
+ if (need_left) {
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_bottom = 0;
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
+ if (is_dr_mode) need_bottom = p_angle > 180;
+#endif // CONFIG_EXT_INTRA
+#else
+ const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ i = 0;
+ if (n_left_px > 0) {
+ for (; i < n_left_px; i++) left_col[i] = ref[i * ref_stride - 1];
+ if (need_bottom && n_bottomleft_px > 0) {
+ assert(i == bs);
+ for (; i < bs + n_bottomleft_px; i++)
+ left_col[i] = ref[i * ref_stride - 1];
+ }
+ if (i < (bs << need_bottom))
+ memset(&left_col[i], left_col[i - 1], (bs << need_bottom) - i);
+ } else {
+ memset(left_col, 129, bs << need_bottom);
+ }
+ }
+
+ // NEED_ABOVE
+ if (need_above) {
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_right = 1;
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
+ if (is_dr_mode) need_right = p_angle < 90;
+#endif // CONFIG_EXT_INTRA
+#else
+ const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ if (n_top_px > 0) {
+ memcpy(above_row, above_ref, n_top_px);
+ i = n_top_px;
+ if (need_right && n_topright_px > 0) {
+ assert(n_top_px == bs);
+ memcpy(above_row + bs, above_ref + bs, n_topright_px);
+ i += n_topright_px;
+ }
+ if (i < (bs << need_right))
+ memset(&above_row[i], above_row[i - 1], (bs << need_right) - i);
+ } else {
+ memset(above_row, 127, bs << need_right);
+ }
+ }
+
+ if (need_above_left) {
+ above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
+ left_col[-1] = above_row[-1];
+ }
+
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) {
+ filter_intra_predictors(filter_intra_mode, dst, dst_stride, bs,
+ const_above_row, left_col);
+ return;
+ }
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
+ if (is_dr_mode) {
+#if CONFIG_INTRA_INTERP
+ INTRA_FILTER filter = INTRA_FILTER_LINEAR;
+ if (plane == 0 && av1_is_intra_filter_switchable(p_angle))
+ filter = xd->mi[0]->mbmi.intra_filter;
+#endif // CONFIG_INTRA_INTERP
+ dr_predictor(dst, dst_stride, tx_size, const_above_row, left_col,
+#if CONFIG_INTRA_INTERP
+ filter,
+#endif // CONFIG_INTRA_INTERP
+ p_angle);
+ return;
+ }
+#endif // CONFIG_EXT_INTRA
+
+ // predict
+ if (mode == DC_PRED) {
+#if CONFIG_CFL
+    // CFL predicts its own DC_PRED for the chromatic planes.
+ if (plane == AOM_PLANE_Y) {
+#endif
+ dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride,
+ const_above_row, left_col);
+#if CONFIG_CFL
+ }
+#endif
+
+ } else {
+ pred[mode][tx_size](dst, dst_stride, const_above_row, left_col);
+ }
+}
+
+static void predict_square_intra_block(const MACROBLOCKD *xd, int wpx, int hpx,
+ TX_SIZE tx_size, PREDICTION_MODE mode,
+ const uint8_t *ref, int ref_stride,
+ uint8_t *dst, int dst_stride,
+ int col_off, int row_off, int plane) {
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int txw = tx_size_wide_unit[tx_size];
+#if CONFIG_CB4X4 && CONFIG_CHROMA_SUB8X8
+ const int have_top = row_off || (pd->subsampling_y ? xd->chroma_up_available
+ : xd->up_available);
+ const int have_left =
+ col_off ||
+ (pd->subsampling_x ? xd->chroma_left_available : xd->left_available);
+#else
+ const int have_top = row_off || xd->up_available;
+ const int have_left = col_off || xd->left_available;
+#endif
+ const int x = col_off << tx_size_wide_log2[0];
+ const int y = row_off << tx_size_high_log2[0];
+ const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+ const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+ const int txwpx = tx_size_wide[tx_size];
+ const int txhpx = tx_size_high[tx_size];
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+ const int xr_chr_offset = (pd->subsampling_x && bsize < BLOCK_8X8) ? 2 : 0;
+ const int yd_chr_offset = (pd->subsampling_y && bsize < BLOCK_8X8) ? 2 : 0;
+#else
+ const int xr_chr_offset = 0;
+ const int yd_chr_offset = 0;
+#endif
+
+  // Distance from the right edge of this prediction block to the right
+  // edge of the frame.
+ const int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) +
+ (wpx - x - txwpx) - xr_chr_offset;
+  // Distance from the bottom edge of this prediction block to the bottom
+  // edge of the frame.
+ const int yd = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) +
+ (hpx - y - txhpx) - yd_chr_offset;
+ const int right_available =
+ (mi_col + ((col_off + txw) >> (1 - pd->subsampling_x))) <
+ xd->tile.mi_col_end;
+ const int bottom_available = (yd > 0);
+#if CONFIG_EXT_PARTITION_TYPES
+ const PARTITION_TYPE partition = xd->mi[0]->mbmi.partition;
+#endif
+
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+  // Force the chroma block size to a minimum of 4x4.
+ bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+#endif
+
+ const int have_top_right =
+ has_top_right(bsize, mi_row, mi_col, have_top, right_available,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ tx_size, row_off, col_off, pd->subsampling_x);
+ const int have_bottom_left =
+ has_bottom_left(bsize, mi_row, mi_col, bottom_available, have_left,
+ tx_size, row_off, col_off, pd->subsampling_y);
+ assert(txwpx == txhpx);
+
+#if CONFIG_PALETTE
+ if (xd->mi[0]->mbmi.palette_mode_info.palette_size[plane != 0] > 0) {
+ const int bs = tx_size_wide[tx_size];
+ const int stride = wpx;
+ int r, c;
+ const uint8_t *const map = xd->plane[plane != 0].color_index_map;
+#if CONFIG_HIGHBITDEPTH
+ uint16_t *palette = xd->mi[0]->mbmi.palette_mode_info.palette_colors +
+ plane * PALETTE_MAX_SIZE;
+#else
+ uint8_t *palette = xd->mi[0]->mbmi.palette_mode_info.palette_colors +
+ plane * PALETTE_MAX_SIZE;
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (r = 0; r < bs; ++r)
+ for (c = 0; c < bs; ++c)
+ dst16[r * dst_stride + c] = palette[map[(r + y) * stride + c + x]];
+ } else {
+ for (r = 0; r < bs; ++r)
+ for (c = 0; c < bs; ++c)
+ dst[r * dst_stride + c] =
+ (uint8_t)(palette[map[(r + y) * stride + c + x]]);
+ }
+#else
+ for (r = 0; r < bs; ++r)
+ for (c = 0; c < bs; ++c)
+ dst[r * dst_stride + c] = palette[map[(r + y) * stride + c + x]];
+#endif // CONFIG_HIGHBITDEPTH
+ return;
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ build_intra_predictors_high(
+ xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
+ have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+ have_top_right ? AOMMIN(txwpx, xr) : 0,
+ have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
+ have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
+ return;
+ }
+#endif
+ build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
+ have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+ have_top_right ? AOMMIN(txwpx, xr) : 0,
+ have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
+ have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
+}
+
+void av1_predict_intra_block_facade(MACROBLOCKD *xd, int plane, int block_idx,
+ int blk_col, int blk_row, TX_SIZE tx_size) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int block_raster_idx =
+ av1_block_index_to_raster_order(tx_size, block_idx);
+ const PREDICTION_MODE mode =
+ (plane == 0) ? get_y_mode(xd->mi[0], block_raster_idx) : mbmi->uv_mode;
+ av1_predict_intra_block(xd, pd->width, pd->height, txsize_to_bsize[tx_size],
+ mode, dst, dst_stride, dst, dst_stride, blk_col,
+ blk_row, plane);
+#if CONFIG_CFL
+ if (plane != AOM_PLANE_Y && mbmi->uv_mode == DC_PRED) {
+ if (plane == AOM_PLANE_U && blk_col == 0 && blk_row == 0) {
+ // Compute the block-level DC_PRED for both chromatic planes prior to
+ // processing the first chromatic plane in order to compute alpha_cb and
+ // alpha_cr. Note: This is not required on the decoder side because alpha
+ // is signaled.
+ cfl_dc_pred(xd, get_plane_block_size(block_idx, pd), tx_size);
+ }
+ cfl_predict_block(xd->cfl, dst, pd->dst.stride, blk_row, blk_col, tx_size,
+ xd->cfl->dc_pred[plane - 1]);
+ }
+#endif
+}
+
+void av1_predict_intra_block(const MACROBLOCKD *xd, int wpx, int hpx,
+ BLOCK_SIZE bsize, PREDICTION_MODE mode,
+ const uint8_t *ref, int ref_stride, uint8_t *dst,
+ int dst_stride, int col_off, int row_off,
+ int plane) {
+ const int block_width = block_size_wide[bsize];
+ const int block_height = block_size_high[bsize];
+ TX_SIZE tx_size = max_txsize_lookup[bsize];
+ assert(tx_size < TX_SIZES);
+ if (block_width == block_height) {
+ predict_square_intra_block(xd, wpx, hpx, tx_size, mode, ref, ref_stride,
+ dst, dst_stride, col_off, row_off, plane);
+ } else {
+#if (CONFIG_RECT_TX && (CONFIG_VAR_TX || CONFIG_EXT_TX)) || (CONFIG_EXT_INTER)
+#if CONFIG_HIGHBITDEPTH
+ uint16_t tmp16[MAX_SB_SIZE];
+#endif
+ uint8_t tmp[MAX_SB_SIZE];
+ assert((block_width == wpx && block_height == hpx) ||
+ (block_width == (wpx >> 1) && block_height == hpx) ||
+ (block_width == wpx && block_height == (hpx >> 1)));
+
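+    // A rectangular block is handled as two square sub-blocks predicted in
+    // sequence: the first sub-block's reconstructed edge is temporarily
+    // copied into 'ref' so it can serve as the neighboring row/column for
+    // the second.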
+ if (block_width < block_height) {
+ assert(block_height == (block_width << 1));
+ // Predict the top square sub-block.
+ predict_square_intra_block(xd, wpx, hpx, tx_size, mode, ref, ref_stride,
+ dst, dst_stride, col_off, row_off, plane);
+ {
+ const int half_block_height = block_height >> 1;
+ const int half_block_height_unit =
+ half_block_height >> tx_size_wide_log2[0];
+ // Cast away const to modify 'ref' temporarily; will be restored later.
+ uint8_t *src_2 = (uint8_t *)ref + half_block_height * ref_stride;
+ uint8_t *dst_2 = dst + half_block_height * dst_stride;
+ const int row_off_2 = row_off + half_block_height_unit;
+        // Save the last row of the top square sub-block as the 'above' row
+        // for the bottom square sub-block.
+ if (src_2 != dst_2 || ref_stride != dst_stride) {
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src_2_16 = CONVERT_TO_SHORTPTR(src_2);
+ uint16_t *dst_2_16 = CONVERT_TO_SHORTPTR(dst_2);
+ memcpy(tmp16, src_2_16 - ref_stride,
+ block_width * sizeof(*src_2_16));
+ memcpy(src_2_16 - ref_stride, dst_2_16 - dst_stride,
+ block_width * sizeof(*src_2_16));
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ memcpy(tmp, src_2 - ref_stride, block_width * sizeof(*src_2));
+ memcpy(src_2 - ref_stride, dst_2 - dst_stride,
+ block_width * sizeof(*src_2));
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ // Predict the bottom square sub-block.
+ predict_square_intra_block(xd, wpx, hpx, tx_size, mode, src_2,
+ ref_stride, dst_2, dst_stride, col_off,
+ row_off_2, plane);
+        // Restore the last row of the top square sub-block.
+ if (src_2 != dst_2 || ref_stride != dst_stride) {
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src_2_16 = CONVERT_TO_SHORTPTR(src_2);
+ memcpy(src_2_16 - ref_stride, tmp16,
+ block_width * sizeof(*src_2_16));
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ memcpy(src_2 - ref_stride, tmp, block_width * sizeof(*src_2));
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ }
+ } else { // block_width > block_height
+ assert(block_width == (block_height << 1));
+      // Predict the left square sub-block.
+ predict_square_intra_block(xd, wpx, hpx, tx_size, mode, ref, ref_stride,
+ dst, dst_stride, col_off, row_off, plane);
+ {
+ int i;
+ const int half_block_width = block_width >> 1;
+ const int half_block_width_unit =
+ half_block_width >> tx_size_wide_log2[0];
+ // Cast away const to modify 'ref' temporarily; will be restored later.
+ uint8_t *src_2 = (uint8_t *)ref + half_block_width;
+ uint8_t *dst_2 = dst + half_block_width;
+ const int col_off_2 = col_off + half_block_width_unit;
+        // Save the last column of the left square sub-block as the 'left'
+        // column for the right square sub-block.
+ const int save_src = src_2 != dst_2 || ref_stride != dst_stride;
+ if (save_src) {
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src_2_16 = CONVERT_TO_SHORTPTR(src_2);
+ uint16_t *dst_2_16 = CONVERT_TO_SHORTPTR(dst_2);
+ for (i = 0; i < block_height; ++i) {
+ tmp16[i] = src_2_16[i * ref_stride - 1];
+ src_2_16[i * ref_stride - 1] = dst_2_16[i * dst_stride - 1];
+ }
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ for (i = 0; i < block_height; ++i) {
+ tmp[i] = src_2[i * ref_stride - 1];
+ src_2[i * ref_stride - 1] = dst_2[i * dst_stride - 1];
+ }
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ // Predict the right square sub-block.
+ predict_square_intra_block(xd, wpx, hpx, tx_size, mode, src_2,
+ ref_stride, dst_2, dst_stride, col_off_2,
+ row_off, plane);
+        // Restore the last column of the left square sub-block.
+ if (save_src) {
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src_2_16 = CONVERT_TO_SHORTPTR(src_2);
+ for (i = 0; i < block_height; ++i) {
+ src_2_16[i * ref_stride - 1] = tmp16[i];
+ }
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ for (i = 0; i < block_height; ++i) {
+ src_2[i * ref_stride - 1] = tmp[i];
+ }
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ }
+ }
+#else
+ assert(0);
+#endif // (CONFIG_RECT_TX && (CONFIG_VAR_TX || CONFIG_EXT_TX)) ||
+ // (CONFIG_EXT_INTER)
+ }
+}
+
+void av1_init_intra_predictors(void) {
+ once(av1_init_intra_predictors_internal);
+}
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
new file mode 100644
index 000000000..7ee0c495e
--- /dev/null
+++ b/third_party/aom/av1/common/reconintra.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_RECONINTRA_H_
+#define AV1_COMMON_RECONINTRA_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_init_intra_predictors(void);
+void av1_predict_intra_block_facade(MACROBLOCKD *xd, int plane, int block_idx,
+ int blk_col, int blk_row, TX_SIZE tx_size);
+void av1_predict_intra_block(const MACROBLOCKD *xd, int bw, int bh,
+ BLOCK_SIZE bsize, PREDICTION_MODE mode,
+ const uint8_t *ref, int ref_stride, uint8_t *dst,
+ int dst_stride, int aoff, int loff, int plane);
+
+#if CONFIG_EXT_INTER
+// Mapping of interintra to intra mode for use in the intra component
+static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
+ DC_PRED, V_PRED, H_PRED, D45_PRED, D135_PRED,
+ D117_PRED, D153_PRED, D207_PRED, D63_PRED, TM_PRED
+};
+
+// Mapping of intra mode to the interintra mode
+static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = {
+ II_DC_PRED, II_V_PRED, II_H_PRED, II_D45_PRED, II_D135_PRED,
+ II_D117_PRED, II_D153_PRED, II_D207_PRED, II_D63_PRED,
+#if CONFIG_ALT_INTRA
+ II_DC_PRED, // Note: Filler value, as there's no II_SMOOTH_PRED.
+#endif // CONFIG_ALT_INTRA
+ II_TM_PRED
+};
+#endif // CONFIG_EXT_INTER
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#if CONFIG_FILTER_INTRA
+#define FILTER_INTRA_PREC_BITS 10
+extern int av1_filter_intra_taps_4[TX_SIZES][INTRA_MODES][4];
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+static INLINE int av1_is_directional_mode(PREDICTION_MODE mode,
+ BLOCK_SIZE bsize) {
+ return mode != DC_PRED && mode != TM_PRED &&
+#if CONFIG_ALT_INTRA
+ mode != SMOOTH_PRED &&
+#endif // CONFIG_ALT_INTRA
+ bsize >= BLOCK_8X8;
+}
+#endif // CONFIG_EXT_INTRA
+
+#endif // AV1_COMMON_RECONINTRA_H_
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
new file mode 100644
index 000000000..8c0d3aa09
--- /dev/null
+++ b/third_party/aom/av1/common/resize.c
@@ -0,0 +1,821 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./aom_config.h"
+#if CONFIG_HIGHBITDEPTH
+#include "aom_dsp/aom_dsp_common.h"
+#endif // CONFIG_HIGHBITDEPTH
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/resize.h"
+
+#define FILTER_BITS 7
+
+#define INTERP_TAPS 8
+#define SUBPEL_BITS 5
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define INTERP_PRECISION_BITS 32
+
+typedef int16_t interp_kernel[INTERP_TAPS];
+
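+// Each 8-tap kernel in the tables below is normalized so that its taps sum
+// to 128 (1 << FILTER_BITS); e.g. { -3, 0, 35, 64, 35, 0, -3, 0 } sums to
+// 128.
+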
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
+static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
+ { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, -1, 34, 64, 36, 1, -3, 0 },
+ { -3, -1, 32, 64, 38, 1, -3, 0 }, { -2, -2, 31, 63, 39, 2, -3, 0 },
+ { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 28, 63, 42, 3, -4, 0 },
+ { -2, -3, 27, 63, 43, 4, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 },
+ { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 },
+ { -2, -3, 21, 60, 49, 7, -4, 0 }, { -1, -4, 20, 60, 50, 8, -4, -1 },
+ { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 17, 58, 52, 10, -4, 0 },
+ { -1, -4, 16, 57, 53, 12, -4, -1 }, { -1, -4, 15, 56, 54, 13, -4, -1 },
+ { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 13, 54, 56, 15, -4, -1 },
+ { -1, -4, 12, 53, 57, 16, -4, -1 }, { 0, -4, 10, 52, 58, 17, -4, -1 },
+ { -1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 8, 50, 60, 20, -4, -1 },
+ { 0, -4, 7, 49, 60, 21, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 },
+ { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 },
+ { 0, -4, 4, 43, 63, 27, -3, -2 }, { 0, -4, 3, 42, 63, 28, -2, -2 },
+ { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 2, 39, 63, 31, -2, -2 },
+ { 0, -3, 1, 38, 64, 32, -1, -3 }, { 0, -3, 1, 36, 64, 34, -1, -3 }
+};
+
+// Filters for interpolation (0.625-band) - note this also filters integer pels.
+static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
+ { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 30, 80, 35, -8, -1, 1 },
+ { -1, -8, 28, 80, 37, -7, -2, 1 }, { 0, -8, 26, 79, 39, -7, -2, 1 },
+ { 0, -8, 24, 79, 41, -7, -2, 1 }, { 0, -8, 22, 78, 43, -6, -2, 1 },
+ { 0, -8, 20, 78, 45, -5, -3, 1 }, { 0, -8, 18, 77, 48, -5, -3, 1 },
+ { 0, -8, 16, 76, 50, -4, -3, 1 }, { 0, -8, 15, 75, 52, -3, -4, 1 },
+ { 0, -7, 13, 74, 54, -3, -4, 1 }, { 0, -7, 11, 73, 56, -2, -4, 1 },
+ { 0, -7, 10, 71, 58, -1, -4, 1 }, { 1, -7, 8, 70, 60, 0, -5, 1 },
+ { 1, -6, 6, 68, 62, 1, -5, 1 }, { 1, -6, 5, 67, 63, 2, -5, 1 },
+ { 1, -6, 4, 65, 65, 4, -6, 1 }, { 1, -5, 2, 63, 67, 5, -6, 1 },
+ { 1, -5, 1, 62, 68, 6, -6, 1 }, { 1, -5, 0, 60, 70, 8, -7, 1 },
+ { 1, -4, -1, 58, 71, 10, -7, 0 }, { 1, -4, -2, 56, 73, 11, -7, 0 },
+ { 1, -4, -3, 54, 74, 13, -7, 0 }, { 1, -4, -3, 52, 75, 15, -8, 0 },
+ { 1, -3, -4, 50, 76, 16, -8, 0 }, { 1, -3, -5, 48, 77, 18, -8, 0 },
+ { 1, -3, -5, 45, 78, 20, -8, 0 }, { 1, -2, -6, 43, 78, 22, -8, 0 },
+ { 1, -2, -7, 41, 79, 24, -8, 0 }, { 1, -2, -7, 39, 79, 26, -8, 0 },
+ { 1, -2, -7, 37, 80, 28, -8, -1 }, { 1, -1, -8, 35, 80, 30, -8, -1 },
+};
+
+// Filters for interpolation (0.75-band) - note this also filters integer pels.
+static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
+ { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 22, 96, 28, -11, 2, 0 },
+ { 2, -10, 19, 95, 31, -11, 2, 0 }, { 2, -10, 17, 95, 34, -12, 2, 0 },
+ { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -8, 12, 93, 40, -12, 1, 0 },
+ { 2, -8, 9, 92, 43, -12, 1, 1 }, { 2, -7, 7, 91, 46, -12, 1, 0 },
+ { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -6, 3, 88, 52, -12, 0, 1 },
+ { 2, -5, 1, 86, 55, -12, 0, 1 }, { 2, -5, -1, 84, 58, -11, 0, 1 },
+ { 2, -4, -2, 82, 61, -11, -1, 1 }, { 2, -4, -4, 80, 64, -10, -1, 1 },
+ { 1, -3, -5, 77, 67, -9, -1, 1 }, { 1, -3, -6, 75, 70, -8, -2, 1 },
+ { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -2, -8, 70, 75, -6, -3, 1 },
+ { 1, -1, -9, 67, 77, -5, -3, 1 }, { 1, -1, -10, 64, 80, -4, -4, 2 },
+ { 1, -1, -11, 61, 82, -2, -4, 2 }, { 1, 0, -11, 58, 84, -1, -5, 2 },
+ { 1, 0, -12, 55, 86, 1, -5, 2 }, { 1, 0, -12, 52, 88, 3, -6, 2 },
+ { 0, 1, -12, 49, 90, 5, -7, 2 }, { 0, 1, -12, 46, 91, 7, -7, 2 },
+ { 1, 1, -12, 43, 92, 9, -8, 2 }, { 0, 1, -12, 40, 93, 12, -8, 2 },
+ { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -12, 34, 95, 17, -10, 2 },
+ { 0, 2, -11, 31, 95, 19, -10, 2 }, { 0, 2, -11, 28, 96, 22, -11, 2 }
+};
+
+// Filters for interpolation (0.875-band) - note this also filters integer pels.
+static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
+ { 3, -8, 13, 112, 13, -8, 3, 0 }, { 3, -7, 10, 112, 17, -9, 3, -1 },
+ { 2, -6, 7, 111, 21, -9, 3, -1 }, { 2, -5, 4, 111, 24, -10, 3, -1 },
+ { 2, -4, 1, 110, 28, -11, 3, -1 }, { 1, -3, -1, 108, 32, -12, 4, -1 },
+ { 1, -2, -3, 106, 36, -13, 4, -1 }, { 1, -1, -6, 105, 40, -14, 4, -1 },
+ { 1, -1, -7, 102, 44, -14, 4, -1 }, { 1, 0, -9, 100, 48, -15, 4, -1 },
+ { 1, 1, -11, 97, 53, -16, 4, -1 }, { 0, 1, -12, 95, 57, -16, 4, -1 },
+ { 0, 2, -13, 91, 61, -16, 4, -1 }, { 0, 2, -14, 88, 65, -16, 4, -1 },
+ { 0, 3, -15, 84, 69, -17, 4, 0 }, { 0, 3, -16, 81, 73, -16, 3, 0 },
+ { 0, 3, -16, 77, 77, -16, 3, 0 }, { 0, 3, -16, 73, 81, -16, 3, 0 },
+ { 0, 4, -17, 69, 84, -15, 3, 0 }, { -1, 4, -16, 65, 88, -14, 2, 0 },
+ { -1, 4, -16, 61, 91, -13, 2, 0 }, { -1, 4, -16, 57, 95, -12, 1, 0 },
+ { -1, 4, -16, 53, 97, -11, 1, 1 }, { -1, 4, -15, 48, 100, -9, 0, 1 },
+ { -1, 4, -14, 44, 102, -7, -1, 1 }, { -1, 4, -14, 40, 105, -6, -1, 1 },
+ { -1, 4, -13, 36, 106, -3, -2, 1 }, { -1, 4, -12, 32, 108, -1, -3, 1 },
+ { -1, 3, -11, 28, 110, 1, -4, 2 }, { -1, 3, -10, 24, 111, 4, -5, 2 },
+ { -1, 3, -9, 21, 111, 7, -6, 2 }, { -1, 3, -9, 17, 112, 10, -7, 3 }
+};
+
+// Filters for interpolation (full-band) - no filtering for integer pixels
+static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 1, -3, 128, 3, -1, 0, 0 },
+ { -1, 2, -6, 127, 7, -2, 1, 0 }, { -1, 3, -9, 126, 12, -4, 1, 0 },
+ { -1, 4, -12, 125, 16, -5, 1, 0 }, { -1, 4, -14, 123, 20, -6, 2, 0 },
+ { -1, 5, -15, 120, 25, -8, 2, 0 }, { -1, 5, -17, 118, 30, -9, 3, -1 },
+ { -1, 6, -18, 114, 35, -10, 3, -1 }, { -1, 6, -19, 111, 41, -12, 3, -1 },
+ { -1, 6, -20, 107, 46, -13, 4, -1 }, { -1, 6, -21, 103, 52, -14, 4, -1 },
+ { -1, 6, -21, 99, 57, -16, 5, -1 }, { -1, 6, -21, 94, 63, -17, 5, -1 },
+ { -1, 6, -20, 89, 68, -18, 5, -1 }, { -1, 6, -20, 84, 73, -19, 6, -1 },
+ { -1, 6, -20, 79, 79, -20, 6, -1 }, { -1, 6, -19, 73, 84, -20, 6, -1 },
+ { -1, 5, -18, 68, 89, -20, 6, -1 }, { -1, 5, -17, 63, 94, -21, 6, -1 },
+ { -1, 5, -16, 57, 99, -21, 6, -1 }, { -1, 4, -14, 52, 103, -21, 6, -1 },
+ { -1, 4, -13, 46, 107, -20, 6, -1 }, { -1, 3, -12, 41, 111, -19, 6, -1 },
+ { -1, 3, -10, 35, 114, -18, 6, -1 }, { -1, 3, -9, 30, 118, -17, 5, -1 },
+ { 0, 2, -8, 25, 120, -15, 5, -1 }, { 0, 2, -6, 20, 123, -14, 4, -1 },
+ { 0, 1, -5, 16, 125, -12, 4, -1 }, { 0, 1, -4, 12, 126, -9, 3, -1 },
+ { 0, 1, -2, 7, 127, -6, 2, -1 }, { 0, 0, -1, 3, 128, -3, 1, 0 }
+};
+
+// Filters for factor of 2 downsampling.
+static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
+static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
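+// Both half filters are normalized so the full symmetric filter sums to
+// 128 (1 << FILTER_BITS): 2 * (56 + 12 - 3 - 1) == 128 for the even-length
+// filter, and 64 + 2 * (35 + 0 - 3) == 128 for the odd-length one.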
+
+static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
+ int outlength16 = outlength * 16;
+ if (outlength16 >= inlength * 16)
+ return filteredinterp_filters1000;
+ else if (outlength16 >= inlength * 13)
+ return filteredinterp_filters875;
+ else if (outlength16 >= inlength * 11)
+ return filteredinterp_filters750;
+ else if (outlength16 >= inlength * 9)
+ return filteredinterp_filters625;
+ else
+ return filteredinterp_filters500;
+}
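+
+// For example, scaling 100 samples down to 80 (a ratio of 0.8) selects the
+// 0.75-band filter, since 80 * 16 >= 100 * 11 but 80 * 16 < 100 * 13.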
+
+static void interpolate(const uint8_t *const input, int inlength,
+ uint8_t *output, int outlength) {
+ const int64_t delta =
+ (((uint64_t)inlength << 32) + outlength / 2) / outlength;
+ const int64_t offset =
+ inlength > outlength
+ ? (((int64_t)(inlength - outlength) << 31) + outlength / 2) /
+ outlength
+ : -(((int64_t)(outlength - inlength) << 31) + outlength / 2) /
+ outlength;
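+  // 'delta' is the input position step per output sample in 32-bit fixed
+  // point (Q32); 'offset' shifts the sampling phase so that the centers of
+  // the input and output grids coincide.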
+ uint8_t *optr = output;
+ int x, x1, x2, sum, k, int_pel, sub_pel;
+ int64_t y;
+
+ const interp_kernel *interp_filters =
+ choose_interp_filter(inlength, outlength);
+
+ x = 0;
+ y = offset;
+ while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ x = outlength - 1;
+ y = delta * x + offset;
+ while ((y >> INTERP_PRECISION_BITS) + (int64_t)(INTERP_TAPS / 2) >=
+ inlength) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
+ if (x1 > x2) {
+ for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k) {
+ const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+ sum += filter[k] *
+ input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))];
+ }
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ } else {
+ // Initial part.
+ for (x = 0, y = offset; x < x1; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0
+ ? 0
+ : int_pel - INTERP_TAPS / 2 + 1 + k)];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ // Middle part.
+ for (; x <= x2; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ // End part.
+ for (; x < outlength; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >= inlength
+ ? inlength - 1
+ : int_pel - INTERP_TAPS / 2 + 1 + k)];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ }
+}
+
+static void down2_symeven(const uint8_t *const input, int length,
+ uint8_t *output) {
+ // Actual filter len = 2 * filter_len_half.
+ const int16_t *filter = av1_down2_symeven_half_filter;
+ const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2;
+ int i, j;
+ uint8_t *optr = output;
+ int l1 = filter_len_half;
+ int l2 = (length - filter_len_half);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] +
+ input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] +
+ input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ }
+}
+
+static void down2_symodd(const uint8_t *const input, int length,
+ uint8_t *output) {
+ // Actual filter len = 2 * filter_len_half - 1.
+ const int16_t *filter = av1_down2_symodd_half_filter;
+ const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2;
+ int i, j;
+ uint8_t *optr = output;
+ int l1 = filter_len_half - 1;
+ int l2 = (length - filter_len_half + 1);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] +
+ input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ }
+}
+
+static int get_down2_length(int length, int steps) {
+ int s;
+ for (s = 0; s < steps; ++s) length = (length + 1) >> 1;
+ return length;
+}
+
+static int get_down2_steps(int in_length, int out_length) {
+ int steps = 0;
+ int proj_in_length;
+ while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) {
+ ++steps;
+ in_length = proj_in_length;
+ }
+ return steps;
+}
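+
+// For example, resizing a 1920-sample row to 500 samples performs one 2x
+// down-sampling step (1920 -> 960), since halving again (960 -> 480) would
+// undershoot 500; the remaining 960 -> 500 step is then handled by
+// interpolate() in resize_multistep().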
+
+static void resize_multistep(const uint8_t *const input, int length,
+ uint8_t *output, int olength, uint8_t *otmp) {
+ int steps;
+ if (length == olength) {
+ memcpy(output, input, sizeof(output[0]) * length);
+ return;
+ }
+ steps = get_down2_steps(length, olength);
+
+ if (steps > 0) {
+ int s;
+ uint8_t *out = NULL;
+ uint8_t *otmp2;
+ int filteredlength = length;
+
+ assert(otmp != NULL);
+ otmp2 = otmp + get_down2_length(length, 1);
+ for (s = 0; s < steps; ++s) {
+ const int proj_filteredlength = get_down2_length(filteredlength, 1);
+ const uint8_t *const in = (s == 0 ? input : out);
+ if (s == steps - 1 && proj_filteredlength == olength)
+ out = output;
+ else
+ out = (s & 1 ? otmp2 : otmp);
+ if (filteredlength & 1)
+ down2_symodd(in, filteredlength, out);
+ else
+ down2_symeven(in, filteredlength, out);
+ filteredlength = proj_filteredlength;
+ }
+ if (filteredlength != olength) {
+ interpolate(out, filteredlength, output, olength);
+ }
+ } else {
+ interpolate(input, length, output, olength);
+ }
+}
+
+static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) {
+ int i;
+ uint8_t *iptr = img;
+ uint8_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *aptr++ = *iptr;
+ }
+}
+
+static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
+ int i;
+ uint8_t *iptr = img;
+ uint8_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *iptr = *aptr++;
+ }
+}
+
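+// Resizing is separable: each row is first resized into an intermediate
+// buffer, then each intermediate column is gathered into a contiguous
+// array, resized, and scattered back into the output.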
+void av1_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2, int width2,
+ int out_stride) {
+ int i;
+ uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height);
+ uint8_t *tmpbuf =
+ (uint8_t *)malloc(sizeof(uint8_t) * (width < height ? height : width));
+ uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * height);
+ uint8_t *arrbuf2 = (uint8_t *)malloc(sizeof(uint8_t) * height2);
+ if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
+ goto Error;
+ assert(width > 0);
+ assert(height > 0);
+ assert(width2 > 0);
+ assert(height2 > 0);
+ for (i = 0; i < height; ++i)
+ resize_multistep(input + in_stride * i, width, intbuf + width2 * i, width2,
+ tmpbuf);
+ for (i = 0; i < width2; ++i) {
+ fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+ resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf);
+ fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
+ }
+
+Error:
+ free(intbuf);
+ free(tmpbuf);
+ free(arrbuf);
+ free(arrbuf2);
+}
+
+#if CONFIG_HIGHBITDEPTH
+static void highbd_interpolate(const uint16_t *const input, int inlength,
+ uint16_t *output, int outlength, int bd) {
+ const int64_t delta =
+ (((uint64_t)inlength << 32) + outlength / 2) / outlength;
+ const int64_t offset =
+ inlength > outlength
+ ? (((int64_t)(inlength - outlength) << 31) + outlength / 2) /
+ outlength
+ : -(((int64_t)(outlength - inlength) << 31) + outlength / 2) /
+ outlength;
+ uint16_t *optr = output;
+ int x, x1, x2, sum, k, int_pel, sub_pel;
+ int64_t y;
+
+ const interp_kernel *interp_filters =
+ choose_interp_filter(inlength, outlength);
+
+ x = 0;
+ y = offset;
+ while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ x = outlength - 1;
+ y = delta * x + offset;
+ while ((y >> INTERP_PRECISION_BITS) + (int64_t)(INTERP_TAPS / 2) >=
+ inlength) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
+ if (x1 > x2) {
+ for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k) {
+ const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+ sum += filter[k] *
+ input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))];
+ }
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ } else {
+ // Initial part.
+ for (x = 0, y = offset; x < x1; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0
+ ? 0
+ : int_pel - INTERP_TAPS / 2 + 1 + k)];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // Middle part.
+ for (; x <= x2; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // End part.
+ for (; x < outlength; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >= inlength
+ ? inlength - 1
+ : int_pel - INTERP_TAPS / 2 + 1 + k)];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ }
+}
+
+static void highbd_down2_symeven(const uint16_t *const input, int length,
+ uint16_t *output, int bd) {
+ // Actual filter len = 2 * filter_len_half.
+ static const int16_t *filter = av1_down2_symeven_half_filter;
+ const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2;
+ int i, j;
+ uint16_t *optr = output;
+ int l1 = filter_len_half;
+ int l2 = (length - filter_len_half);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] +
+ input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] +
+ input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ }
+}
+
+static void highbd_down2_symodd(const uint16_t *const input, int length,
+ uint16_t *output, int bd) {
+ // Actual filter len = 2 * filter_len_half - 1.
+ static const int16_t *filter = av1_down2_symodd_half_filter;
+ const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2;
+ int i, j;
+ uint16_t *optr = output;
+ int l1 = filter_len_half - 1;
+ int l2 = (length - filter_len_half + 1);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] +
+ input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ }
+}
+
+static void highbd_resize_multistep(const uint16_t *const input, int length,
+ uint16_t *output, int olength,
+ uint16_t *otmp, int bd) {
+ int steps;
+ if (length == olength) {
+ memcpy(output, input, sizeof(output[0]) * length);
+ return;
+ }
+ steps = get_down2_steps(length, olength);
+
+ if (steps > 0) {
+ int s;
+ uint16_t *out = NULL;
+ uint16_t *otmp2;
+ int filteredlength = length;
+
+ assert(otmp != NULL);
+ otmp2 = otmp + get_down2_length(length, 1);
+ for (s = 0; s < steps; ++s) {
+ const int proj_filteredlength = get_down2_length(filteredlength, 1);
+ const uint16_t *const in = (s == 0 ? input : out);
+ if (s == steps - 1 && proj_filteredlength == olength)
+ out = output;
+ else
+ out = (s & 1 ? otmp2 : otmp);
+ if (filteredlength & 1)
+ highbd_down2_symodd(in, filteredlength, out, bd);
+ else
+ highbd_down2_symeven(in, filteredlength, out, bd);
+ filteredlength = proj_filteredlength;
+ }
+ if (filteredlength != olength) {
+ highbd_interpolate(out, filteredlength, output, olength, bd);
+ }
+ } else {
+ highbd_interpolate(input, length, output, olength, bd);
+ }
+}
+
+static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len,
+ uint16_t *arr) {
+ int i;
+ uint16_t *iptr = img;
+ uint16_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *aptr++ = *iptr;
+ }
+}
+
+static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
+ uint16_t *arr) {
+ int i;
+ uint16_t *iptr = img;
+ uint16_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *iptr = *aptr++;
+ }
+}
+
+void av1_highbd_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2,
+ int width2, int out_stride, int bd) {
+ int i;
+ uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height);
+ uint16_t *tmpbuf =
+ (uint16_t *)malloc(sizeof(uint16_t) * (width < height ? height : width));
+ uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * height);
+ uint16_t *arrbuf2 = (uint16_t *)malloc(sizeof(uint16_t) * height2);
+ if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
+ goto Error;
+ for (i = 0; i < height; ++i) {
+ highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
+ intbuf + width2 * i, width2, tmpbuf, bd);
+ }
+ for (i = 0; i < width2; ++i) {
+ highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+ highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf, bd);
+ highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
+ arrbuf2);
+ }
+
+Error:
+ free(intbuf);
+ free(tmpbuf);
+ free(arrbuf);
+ free(arrbuf2);
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+void av1_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth) {
+ av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
+ owidth / 2, ouv_stride);
+ av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
+ owidth / 2, ouv_stride);
+}
+
+void av1_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth) {
+ av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2,
+ ouv_stride);
+ av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2,
+ ouv_stride);
+}
+
+void av1_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth) {
+ av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+ ouv_stride);
+ av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+ ouv_stride);
+}
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd) {
+ av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ av1_highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
+ owidth / 2, ouv_stride, bd);
+ av1_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
+ owidth / 2, ouv_stride, bd);
+}
+
+void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd) {
+ av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ av1_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight,
+ owidth / 2, ouv_stride, bd);
+ av1_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight,
+ owidth / 2, ouv_stride, bd);
+}
+
+void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd) {
+ av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ av1_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+ ouv_stride, bd);
+ av1_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+ ouv_stride, bd);
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h
new file mode 100644
index 000000000..959cda969
--- /dev/null
+++ b/third_party/aom/av1/common/resize.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RESIZE_H_
+#define AV1_ENCODER_RESIZE_H_
+
+#include <stdio.h>
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2, int width2,
+ int out_stride);
+void av1_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+void av1_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+void av1_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2,
+ int width2, int out_stride, int bd);
+void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+#endif // CONFIG_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_RESIZE_H_
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
new file mode 100644
index 000000000..b7ed9f98b
--- /dev/null
+++ b/third_party/aom/av1/common/restoration.c
@@ -0,0 +1,1401 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ */
+
+#include <math.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "./aom_scale_rtcd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/restoration.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
+#if USE_HIGHPASS_IN_SGRPROJ
+ // corner, edge, r2, eps2
+ { -1, 2, 1, 1 }, { -1, 2, 1, 2 }, { -1, 2, 1, 3 }, { -1, 2, 1, 4 },
+ { -1, 2, 1, 5 }, { -2, 3, 1, 2 }, { -2, 3, 1, 3 }, { -2, 3, 1, 4 },
+ { -2, 3, 1, 5 }, { -2, 3, 1, 6 }, { -3, 4, 1, 3 }, { -3, 4, 1, 4 },
+ { -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 }
+#else
+ // r1, eps1, r2, eps2
+ { 2, 12, 1, 4 }, { 2, 15, 1, 6 }, { 2, 18, 1, 8 }, { 2, 20, 1, 9 },
+ { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
+ { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 },
+ { 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 },
+#endif
+};
+
+typedef void (*restore_func_type)(uint8_t *data8, int width, int height,
+ int stride, RestorationInternal *rst,
+ uint8_t *dst8, int dst_stride);
+#if CONFIG_HIGHBITDEPTH
+typedef void (*restore_func_highbd_type)(uint8_t *data8, int width, int height,
+ int stride, RestorationInternal *rst,
+ int bit_depth, uint8_t *dst8,
+ int dst_stride);
+#endif // CONFIG_HIGHBITDEPTH
+
+int av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rst_info,
+ int width, int height) {
+ const int ntiles = av1_get_rest_ntiles(
+ width, height, rst_info->restoration_tilesize, NULL, NULL, NULL, NULL);
+ aom_free(rst_info->restoration_type);
+ CHECK_MEM_ERROR(cm, rst_info->restoration_type,
+ (RestorationType *)aom_malloc(
+ sizeof(*rst_info->restoration_type) * ntiles));
+ aom_free(rst_info->wiener_info);
+ CHECK_MEM_ERROR(
+ cm, rst_info->wiener_info,
+ (WienerInfo *)aom_memalign(16, sizeof(*rst_info->wiener_info) * ntiles));
+ memset(rst_info->wiener_info, 0, sizeof(*rst_info->wiener_info) * ntiles);
+ aom_free(rst_info->sgrproj_info);
+ CHECK_MEM_ERROR(
+ cm, rst_info->sgrproj_info,
+ (SgrprojInfo *)aom_malloc(sizeof(*rst_info->sgrproj_info) * ntiles));
+ return ntiles;
+}
+
+void av1_free_restoration_struct(RestorationInfo *rst_info) {
+ aom_free(rst_info->restoration_type);
+ rst_info->restoration_type = NULL;
+ aom_free(rst_info->wiener_info);
+ rst_info->wiener_info = NULL;
+ aom_free(rst_info->sgrproj_info);
+ rst_info->sgrproj_info = NULL;
+}
+
+#define MAX_RADIUS 3 // Only 1, 2, 3 allowed
+#define MAX_EPS 80 // Max value of eps
+#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
+#define SGRPROJ_MTABLE_BITS 20
+#define SGRPROJ_RECIP_BITS 12
+
+// TODO(debargha): This table can be substantially reduced since only a few
+// values are actually used.
+int sgrproj_mtable[MAX_EPS][MAX_NELEM];
+
+static void GenSgrprojVtable(void) {
+ int e, n;
+ for (e = 1; e <= MAX_EPS; ++e)
+ for (n = 1; n <= MAX_NELEM; ++n) {
+ const int n2e = n * n * e;
+ sgrproj_mtable[e - 1][n - 1] =
+ (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
+ }
+}
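+// Each entry is round(2^SGRPROJ_MTABLE_BITS / (n^2 * e)). For example, for
+// the full 3x3 window of r = 1 (n = 9) with e = 4, the entry is
+// round(2^20 / 324) = 3236.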
+
+void av1_loop_restoration_precal(void) { GenSgrprojVtable(); }
+
+static void loop_restoration_init(RestorationInternal *rst, int kf) {
+ rst->keyframe = kf;
+}
+
+void extend_frame(uint8_t *data, int width, int height, int stride) {
+ uint8_t *data_p;
+ int i;
+ for (i = 0; i < height; ++i) {
+ data_p = data + i * stride;
+ memset(data_p - WIENER_HALFWIN, data_p[0], WIENER_HALFWIN);
+ memset(data_p + width, data_p[width - 1], WIENER_HALFWIN);
+ }
+ data_p = data - WIENER_HALFWIN;
+ for (i = -WIENER_HALFWIN; i < 0; ++i) {
+ memcpy(data_p + i * stride, data_p, width + 2 * WIENER_HALFWIN);
+ }
+ for (i = height; i < height + WIENER_HALFWIN; ++i) {
+ memcpy(data_p + i * stride, data_p + (height - 1) * stride,
+ width + 2 * WIENER_HALFWIN);
+ }
+}
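+// extend_frame() replicates the edge pixels into a WIENER_HALFWIN-pixel
+// border on all four sides so that the 7-tap Wiener convolution can read
+// past the frame boundary. Note that it writes outside the width x height
+// area, so the caller's buffer must already include that border.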
+
+static void loop_copy_tile(uint8_t *data, int tile_idx, int subtile_idx,
+ int subtile_bits, int width, int height, int stride,
+ RestorationInternal *rst, uint8_t *dst,
+ int dst_stride) {
+ const int tile_width = rst->tile_width;
+ const int tile_height = rst->tile_height;
+ int i;
+ int h_start, h_end, v_start, v_end;
+ av1_get_rest_tile_limits(tile_idx, subtile_idx, subtile_bits, rst->nhtiles,
+ rst->nvtiles, tile_width, tile_height, width, height,
+ 0, 0, &h_start, &h_end, &v_start, &v_end);
+ for (i = v_start; i < v_end; ++i)
+ memcpy(dst + i * dst_stride + h_start, data + i * stride + h_start,
+ h_end - h_start);
+}
+
+static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
+ int height, int stride,
+ RestorationInternal *rst, uint8_t *dst,
+ int dst_stride) {
+ const int tile_width = rst->tile_width;
+ const int tile_height = rst->tile_height;
+ int i, j;
+ int h_start, h_end, v_start, v_end;
+ if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
+ loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
+ dst_stride);
+ return;
+ }
+ av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
+ tile_width, tile_height, width, height, 0, 0,
+ &h_start, &h_end, &v_start, &v_end);
+ // Convolve the whole tile (done in blocks here to match the requirements
+ // of the vectorized convolve functions, but the result is equivalent)
+ for (i = v_start; i < v_end; i += MAX_SB_SIZE)
+ for (j = h_start; j < h_end; j += MAX_SB_SIZE) {
+ int w = AOMMIN(MAX_SB_SIZE, (h_end - j + 15) & ~15);
+ int h = AOMMIN(MAX_SB_SIZE, (v_end - i + 15) & ~15);
+ const uint8_t *data_p = data + i * stride + j;
+ uint8_t *dst_p = dst + i * dst_stride + j;
+ aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
+ rst->rsi->wiener_info[tile_idx].hfilter, 16,
+ rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h);
+ }
+}
+
+static void loop_wiener_filter(uint8_t *data, int width, int height, int stride,
+ RestorationInternal *rst, uint8_t *dst,
+ int dst_stride) {
+ int tile_idx;
+ extend_frame(data, width, height, stride);
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst,
+ dst_stride);
+ }
+}
+
+/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
+ over the input. The window is of size (2r + 1)x(2r + 1), and we
+ specialize to r = 1, 2, 3. A default function is used for r > 3.
+
+ Each loop follows the same format: We keep a window's worth of input
+ in individual variables and select data out of that as appropriate.
+*/
+static void boxsum1(int32_t *src, int width, int height, int src_stride,
+ int sqr, int32_t *dst, int dst_stride) {
+ int i, j, a, b, c;
+
+ // Vertical sum over 3-pixel regions, from src into dst.
+ if (!sqr) {
+ for (j = 0; j < width; ++j) {
+ a = src[j];
+ b = src[src_stride + j];
+ c = src[2 * src_stride + j];
+
+ dst[j] = a + b;
+ for (i = 1; i < height - 2; ++i) {
+ // Loop invariant: At the start of each iteration,
+ // a = src[(i - 1) * src_stride + j]
+ // b = src[(i ) * src_stride + j]
+ // c = src[(i + 1) * src_stride + j]
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = src[(i + 2) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c;
+ dst[(i + 1) * dst_stride + j] = b + c;
+ }
+ } else {
+ for (j = 0; j < width; ++j) {
+ a = src[j] * src[j];
+ b = src[src_stride + j] * src[src_stride + j];
+ c = src[2 * src_stride + j] * src[2 * src_stride + j];
+
+ dst[j] = a + b;
+ for (i = 1; i < height - 2; ++i) {
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c;
+ dst[(i + 1) * dst_stride + j] = b + c;
+ }
+ }
+
+ // Horizontal sum over 3-pixel regions of dst
+ for (i = 0; i < height; ++i) {
+ a = dst[i * dst_stride];
+ b = dst[i * dst_stride + 1];
+ c = dst[i * dst_stride + 2];
+
+ dst[i * dst_stride] = a + b;
+ for (j = 1; j < width - 2; ++j) {
+      // Loop invariant: At the start of each iteration,
+      // a = dst[i * dst_stride + (j - 1)]
+      // b = dst[i * dst_stride + (j    )]
+      // c = dst[i * dst_stride + (j + 1)]
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = dst[i * dst_stride + (j + 2)];
+ }
+ dst[i * dst_stride + j] = a + b + c;
+ dst[i * dst_stride + (j + 1)] = b + c;
+ }
+}
+
+static void boxsum2(int32_t *src, int width, int height, int src_stride,
+ int sqr, int32_t *dst, int dst_stride) {
+ int i, j, a, b, c, d, e;
+
+ // Vertical sum over 5-pixel regions, from src into dst.
+ if (!sqr) {
+ for (j = 0; j < width; ++j) {
+ a = src[j];
+ b = src[src_stride + j];
+ c = src[2 * src_stride + j];
+ d = src[3 * src_stride + j];
+ e = src[4 * src_stride + j];
+
+ dst[j] = a + b + c;
+ dst[dst_stride + j] = a + b + c + d;
+ for (i = 2; i < height - 3; ++i) {
+ // Loop invariant: At the start of each iteration,
+ // a = src[(i - 2) * src_stride + j]
+ // b = src[(i - 1) * src_stride + j]
+ // c = src[(i ) * src_stride + j]
+ // d = src[(i + 1) * src_stride + j]
+ // e = src[(i + 2) * src_stride + j]
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = src[(i + 3) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[(i + 1) * dst_stride + j] = b + c + d + e;
+ dst[(i + 2) * dst_stride + j] = c + d + e;
+ }
+ } else {
+ for (j = 0; j < width; ++j) {
+ a = src[j] * src[j];
+ b = src[src_stride + j] * src[src_stride + j];
+ c = src[2 * src_stride + j] * src[2 * src_stride + j];
+ d = src[3 * src_stride + j] * src[3 * src_stride + j];
+ e = src[4 * src_stride + j] * src[4 * src_stride + j];
+
+ dst[j] = a + b + c;
+ dst[dst_stride + j] = a + b + c + d;
+ for (i = 2; i < height - 3; ++i) {
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[(i + 1) * dst_stride + j] = b + c + d + e;
+ dst[(i + 2) * dst_stride + j] = c + d + e;
+ }
+ }
+
+ // Horizontal sum over 5-pixel regions of dst
+ for (i = 0; i < height; ++i) {
+ a = dst[i * dst_stride];
+ b = dst[i * dst_stride + 1];
+ c = dst[i * dst_stride + 2];
+ d = dst[i * dst_stride + 3];
+ e = dst[i * dst_stride + 4];
+
+ dst[i * dst_stride] = a + b + c;
+ dst[i * dst_stride + 1] = a + b + c + d;
+ for (j = 2; j < width - 3; ++j) {
+      // Loop invariant: At the start of each iteration,
+      // a = dst[i * dst_stride + (j - 2)]
+      // b = dst[i * dst_stride + (j - 1)]
+      // c = dst[i * dst_stride + (j    )]
+      // d = dst[i * dst_stride + (j + 1)]
+      // e = dst[i * dst_stride + (j + 2)]
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = dst[i * dst_stride + (j + 3)];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[i * dst_stride + (j + 1)] = b + c + d + e;
+ dst[i * dst_stride + (j + 2)] = c + d + e;
+ }
+}
+
+static void boxsum3(int32_t *src, int width, int height, int src_stride,
+ int sqr, int32_t *dst, int dst_stride) {
+ int i, j, a, b, c, d, e, f, g;
+
+ // Vertical sum over 7-pixel regions, from src into dst.
+ if (!sqr) {
+ for (j = 0; j < width; ++j) {
+ a = src[j];
+ b = src[1 * src_stride + j];
+ c = src[2 * src_stride + j];
+ d = src[3 * src_stride + j];
+ e = src[4 * src_stride + j];
+ f = src[5 * src_stride + j];
+ g = src[6 * src_stride + j];
+
+ dst[j] = a + b + c + d;
+ dst[dst_stride + j] = a + b + c + d + e;
+ dst[2 * dst_stride + j] = a + b + c + d + e + f;
+ for (i = 3; i < height - 4; ++i) {
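+        // Loop invariant: At the start of each iteration,
+        // a = src[(i - 3) * src_stride + j]
+        // b = src[(i - 2) * src_stride + j]
+        // c = src[(i - 1) * src_stride + j]
+        // d = src[(i    ) * src_stride + j]
+        // e = src[(i + 1) * src_stride + j]
+        // f = src[(i + 2) * src_stride + j]
+        // g = src[(i + 3) * src_stride + j]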
+ dst[i * dst_stride + j] = a + b + c + d + e + f + g;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = f;
+ f = g;
+ g = src[(i + 4) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e + f + g;
+ dst[(i + 1) * dst_stride + j] = b + c + d + e + f + g;
+ dst[(i + 2) * dst_stride + j] = c + d + e + f + g;
+ dst[(i + 3) * dst_stride + j] = d + e + f + g;
+ }
+ } else {
+ for (j = 0; j < width; ++j) {
+ a = src[j] * src[j];
+ b = src[1 * src_stride + j] * src[1 * src_stride + j];
+ c = src[2 * src_stride + j] * src[2 * src_stride + j];
+ d = src[3 * src_stride + j] * src[3 * src_stride + j];
+ e = src[4 * src_stride + j] * src[4 * src_stride + j];
+ f = src[5 * src_stride + j] * src[5 * src_stride + j];
+ g = src[6 * src_stride + j] * src[6 * src_stride + j];
+
+ dst[j] = a + b + c + d;
+ dst[dst_stride + j] = a + b + c + d + e;
+ dst[2 * dst_stride + j] = a + b + c + d + e + f;
+ for (i = 3; i < height - 4; ++i) {
+ dst[i * dst_stride + j] = a + b + c + d + e + f + g;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = f;
+ f = g;
+ g = src[(i + 4) * src_stride + j] * src[(i + 4) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e + f + g;
+ dst[(i + 1) * dst_stride + j] = b + c + d + e + f + g;
+ dst[(i + 2) * dst_stride + j] = c + d + e + f + g;
+ dst[(i + 3) * dst_stride + j] = d + e + f + g;
+ }
+ }
+
+ // Horizontal sum over 7-pixel regions of dst
+ for (i = 0; i < height; ++i) {
+ a = dst[i * dst_stride];
+ b = dst[i * dst_stride + 1];
+ c = dst[i * dst_stride + 2];
+ d = dst[i * dst_stride + 3];
+ e = dst[i * dst_stride + 4];
+ f = dst[i * dst_stride + 5];
+ g = dst[i * dst_stride + 6];
+
+ dst[i * dst_stride] = a + b + c + d;
+ dst[i * dst_stride + 1] = a + b + c + d + e;
+ dst[i * dst_stride + 2] = a + b + c + d + e + f;
+ for (j = 3; j < width - 4; ++j) {
+ dst[i * dst_stride + j] = a + b + c + d + e + f + g;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = f;
+ f = g;
+ g = dst[i * dst_stride + (j + 4)];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e + f + g;
+ dst[i * dst_stride + (j + 1)] = b + c + d + e + f + g;
+ dst[i * dst_stride + (j + 2)] = c + d + e + f + g;
+ dst[i * dst_stride + (j + 3)] = d + e + f + g;
+ }
+}
+
+// Generic version for any r. To be removed after experiments are done.
+static void boxsumr(int32_t *src, int width, int height, int src_stride, int r,
+ int sqr, int32_t *dst, int dst_stride) {
+ int32_t *tmp = aom_malloc(width * height * sizeof(*tmp));
+ int tmp_stride = width;
+ int i, j;
+ if (sqr) {
+ for (j = 0; j < width; ++j) tmp[j] = src[j] * src[j];
+ for (j = 0; j < width; ++j)
+ for (i = 1; i < height; ++i)
+ tmp[i * tmp_stride + j] =
+ tmp[(i - 1) * tmp_stride + j] +
+ src[i * src_stride + j] * src[i * src_stride + j];
+ } else {
+ memcpy(tmp, src, sizeof(*tmp) * width);
+ for (j = 0; j < width; ++j)
+ for (i = 1; i < height; ++i)
+ tmp[i * tmp_stride + j] =
+ tmp[(i - 1) * tmp_stride + j] + src[i * src_stride + j];
+ }
+ for (i = 0; i <= r; ++i)
+ memcpy(&dst[i * dst_stride], &tmp[(i + r) * tmp_stride],
+ sizeof(*tmp) * width);
+ for (i = r + 1; i < height - r; ++i)
+ for (j = 0; j < width; ++j)
+ dst[i * dst_stride + j] =
+ tmp[(i + r) * tmp_stride + j] - tmp[(i - r - 1) * tmp_stride + j];
+ for (i = height - r; i < height; ++i)
+ for (j = 0; j < width; ++j)
+ dst[i * dst_stride + j] = tmp[(height - 1) * tmp_stride + j] -
+ tmp[(i - r - 1) * tmp_stride + j];
+
+ for (i = 0; i < height; ++i) tmp[i * tmp_stride] = dst[i * dst_stride];
+ for (i = 0; i < height; ++i)
+ for (j = 1; j < width; ++j)
+ tmp[i * tmp_stride + j] =
+          tmp[i * tmp_stride + j - 1] + dst[i * dst_stride + j];
+
+ for (j = 0; j <= r; ++j)
+ for (i = 0; i < height; ++i)
+ dst[i * dst_stride + j] = tmp[i * tmp_stride + j + r];
+ for (j = r + 1; j < width - r; ++j)
+ for (i = 0; i < height; ++i)
+ dst[i * dst_stride + j] =
+ tmp[i * tmp_stride + j + r] - tmp[i * tmp_stride + j - r - 1];
+ for (j = width - r; j < width; ++j)
+ for (i = 0; i < height; ++i)
+ dst[i * dst_stride + j] =
+ tmp[i * tmp_stride + width - 1] - tmp[i * tmp_stride + j - r - 1];
+ aom_free(tmp);
+}
+
+static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
+ int sqr, int32_t *dst, int dst_stride) {
+ if (r == 1)
+ boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
+ else if (r == 2)
+ boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
+ else if (r == 3)
+ boxsum3(src, width, height, src_stride, sqr, dst, dst_stride);
+ else
+ boxsumr(src, width, height, src_stride, r, sqr, dst, dst_stride);
+}
+
+static void boxnum(int width, int height, int r, int8_t *num, int num_stride) {
+ int i, j;
+ for (i = 0; i <= r; ++i) {
+ for (j = 0; j <= r; ++j) {
+ num[i * num_stride + j] = (r + 1 + i) * (r + 1 + j);
+ num[i * num_stride + (width - 1 - j)] = num[i * num_stride + j];
+ num[(height - 1 - i) * num_stride + j] = num[i * num_stride + j];
+ num[(height - 1 - i) * num_stride + (width - 1 - j)] =
+ num[i * num_stride + j];
+ }
+ }
+ for (j = 0; j <= r; ++j) {
+ const int val = (2 * r + 1) * (r + 1 + j);
+ for (i = r + 1; i < height - r; ++i) {
+ num[i * num_stride + j] = val;
+ num[i * num_stride + (width - 1 - j)] = val;
+ }
+ }
+ for (i = 0; i <= r; ++i) {
+ const int val = (2 * r + 1) * (r + 1 + i);
+ for (j = r + 1; j < width - r; ++j) {
+ num[i * num_stride + j] = val;
+ num[(height - 1 - i) * num_stride + j] = val;
+ }
+ }
+ for (i = r + 1; i < height - r; ++i) {
+ for (j = r + 1; j < width - r; ++j) {
+ num[i * num_stride + j] = (2 * r + 1) * (2 * r + 1);
+ }
+ }
+}
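+// boxnum() stores, for each pixel, the number of samples of the
+// (2r + 1) x (2r + 1) window that fall inside the image. E.g. for r = 1:
+// 9 for interior pixels, 6 along each edge and (r + 1)^2 = 4 at the corners.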
+
+void decode_xq(int *xqd, int *xq) {
+ xq[0] = xqd[0];
+ xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
+}
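+// By construction xq[0] + xq[1] + xqd[1] == (1 << SGRPROJ_PRJ_BITS) == 128,
+// so the pair of projection weights is coded with the residual weight
+// implied; e.g. xqd = {16, 48} decodes to xq = {16, 64}.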
+
+const int32_t x_by_xplus1[256] = {
+ 0, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
+ 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
+ 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
+ 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
+ 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
+ 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
+ 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 256,
+};
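+// x_by_xplus1[x] approximates 256 * x / (x + 1): e.g. entry 1 is 128,
+// entry 2 is round(512 / 3) = 171, with the top of the range saturating
+// to 256.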
+
+const int32_t one_by_x[MAX_NELEM] = {
+ 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
+ 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, 158,
+ 152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108, 105,
+ 102, 100, 98, 95, 93, 91, 89, 87, 85, 84
+};
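+// one_by_x[n - 1] = round(2^SGRPROJ_RECIP_BITS / n), i.e. 4096 / 1, 4096 / 2,
+// round(4096 / 3) = 1365, and so on.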
+
+static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
+ int height, int stride,
+ int bit_depth, int r, int eps,
+ int32_t *tmpbuf) {
+ int32_t *A = tmpbuf;
+ int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
+ int8_t num[RESTORATION_TILEPELS_MAX];
+ int i, j;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width + 3) & ~3) + 16;
+
+ // Don't filter tiles with dimensions < 5 on any axis
+ if ((width < 5) || (height < 5)) return;
+
+ boxsum(dgd, width, height, stride, r, 0, B, buf_stride);
+ boxsum(dgd, width, height, stride, r, 1, A, buf_stride);
+ boxnum(width, height, r, num, width);
+ assert(r <= 3);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int k = i * buf_stride + j;
+ const int n = num[i * width + j];
+
+ // a < 2^16 * n < 2^22 regardless of bit depth
+ uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
+ // b < 2^8 * n < 2^14 regardless of bit depth
+ uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
+
+ // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+ // and p itself satisfies p < 2^14 * n^2 < 2^26.
+ // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
+ // This is an artefact of rounding, and can only happen if all pixels
+ // are (almost) identical, so in this case we saturate to p=0.
+ uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
+ uint32_t s = sgrproj_mtable[eps - 1][n - 1];
+
+      // p * s < (2^14 * n^2) * round(2^20 / (n^2 * eps)) < 2^34 / eps < 2^32
+ // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
+ // (this holds even after accounting for the rounding in s)
+ const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
+
+ A[k] = x_by_xplus1[AOMMIN(z, 255)]; // < 2^8
+
+ // SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^(bit_depth) * n,
+ // one_by_x[n - 1] = round(2^12 / n)
+ // => the product here is < 2^(20 + bit_depth) <= 2^32,
+ // and B[k] is set to a value < 2^(8 + bit depth)
+ B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
+ (uint32_t)B[k] *
+ (uint32_t)one_by_x[n - 1],
+ SGRPROJ_RECIP_BITS);
+ }
+ }
+ i = 0;
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int nb = 3;
+ const int32_t a =
+ 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + A[k + buf_stride + 1];
+ const int32_t b =
+ 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ i = 0;
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int nb = 3;
+ const int32_t a =
+ 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + A[k + buf_stride - 1];
+ const int32_t b =
+ 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + B[k + buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ i = height - 1;
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int nb = 3;
+ const int32_t a =
+ 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + A[k - buf_stride + 1];
+ const int32_t b =
+ 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + B[k - buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ i = height - 1;
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int nb = 3;
+ const int32_t a =
+ 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + A[k - buf_stride - 1];
+ const int32_t b =
+ 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + B[k - buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ i = 0;
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
+ A[k + buf_stride - 1] + A[k + buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
+ B[k + buf_stride - 1] + B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ i = height - 1;
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
+ A[k - buf_stride - 1] + A[k - buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
+ B[k - buf_stride - 1] + B[k - buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ j = 0;
+ for (i = 1; i < height - 1; ++i) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
+ A[k + 1] + A[k - buf_stride + 1] + A[k + buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
+ B[k + 1] + B[k - buf_stride + 1] + B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ j = width - 1;
+ for (i = 1; i < height - 1; ++i) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
+ A[k - 1] + A[k - buf_stride - 1] + A[k + buf_stride - 1];
+ const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
+ B[k - 1] + B[k - buf_stride - 1] + B[k + buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ for (i = 1; i < height - 1; ++i) {
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int nb = 5;
+ const int32_t a =
+ (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
+ 4 +
+ (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+ A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+ 3;
+ const int32_t b =
+ (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
+ 4 +
+ (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+ B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+ 3;
+ const int32_t v = a * dgd[l] + b;
+ dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+}
+
+void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
+ int stride, int32_t *dst, int dst_stride,
+ int r, int eps, int32_t *tmpbuf) {
+ int i, j;
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ dst[i * dst_stride + j] = dgd[i * stride + j];
+ }
+ }
+ av1_selfguided_restoration_internal(dst, width, height, dst_stride, 8, r, eps,
+ tmpbuf);
+}
+
+void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride,
+ int32_t *dst, int dst_stride, int corner, int edge) {
+ int i, j;
+ const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
+
+ i = 0;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
+ }
+ i = 0;
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
+ }
+ i = height - 1;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
+ }
+ i = height - 1;
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
+ }
+ i = 0;
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
+ dgd[k + 1]);
+ }
+ i = height - 1;
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
+ dgd[k + 1]);
+ }
+ j = 0;
+ for (i = 1; i < height - 1; ++i) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+ j = width - 1;
+ for (i = 1; i < height - 1; ++i) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+ for (i = 1; i < height - 1; ++i) {
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride + 1] + dgd[k + stride + 1]);
+ }
+ }
+}
+
+void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
+ int stride, int eps, int *xqd, uint8_t *dst,
+ int dst_stride, int32_t *tmpbuf) {
+ int xq[2];
+ int32_t *flt1 = tmpbuf;
+ int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+ int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
+ int i, j;
+ assert(width * height <= RESTORATION_TILEPELS_MAX);
+#if USE_HIGHPASS_IN_SGRPROJ
+ av1_highpass_filter_c(dat, width, height, stride, flt1, width,
+ sgr_params[eps].corner, sgr_params[eps].edge);
+#else
+ av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
+ sgr_params[eps].r1, sgr_params[eps].e1, tmpbuf2);
+#endif // USE_HIGHPASS_IN_SGRPROJ
+ av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
+ sgr_params[eps].r2, sgr_params[eps].e2, tmpbuf2);
+ decode_xq(xqd, xq);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int k = i * width + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
+ const int32_t f1 = (int32_t)flt1[k] - u;
+ const int32_t f2 = (int32_t)flt2[k] - u;
+ const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+ const int16_t w =
+ (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ dst[m] = clip_pixel(w);
+ }
+ }
+}
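+// Since decode_xq() ensures xq[0] + xq[1] + xqd[1] == 128, the projection
+// above is a weighted average of flt1, flt2 and the upscaled source u with
+// weights summing to (1 << SGRPROJ_PRJ_BITS); equivalently,
+// dst = clip(dat + (xq[0] * f1 + xq[1] * f2) /
+//            2^(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)), up to rounding.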
+
+static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
+ int height, int stride,
+ RestorationInternal *rst, uint8_t *dst,
+ int dst_stride) {
+ const int tile_width = rst->tile_width;
+ const int tile_height = rst->tile_height;
+ int h_start, h_end, v_start, v_end;
+ uint8_t *data_p, *dst_p;
+
+ if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
+ loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
+ dst_stride);
+ return;
+ }
+ av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
+ tile_width, tile_height, width, height, 0, 0,
+ &h_start, &h_end, &v_start, &v_end);
+ data_p = data + h_start + v_start * stride;
+ dst_p = dst + h_start + v_start * dst_stride;
+ apply_selfguided_restoration(data_p, h_end - h_start, v_end - v_start, stride,
+ rst->rsi->sgrproj_info[tile_idx].ep,
+ rst->rsi->sgrproj_info[tile_idx].xqd, dst_p,
+ dst_stride, rst->tmpbuf);
+}
+
+static void loop_sgrproj_filter(uint8_t *data, int width, int height,
+ int stride, RestorationInternal *rst,
+ uint8_t *dst, int dst_stride) {
+ int tile_idx;
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst,
+ dst_stride);
+ }
+}
+
+static void loop_switchable_filter(uint8_t *data, int width, int height,
+ int stride, RestorationInternal *rst,
+ uint8_t *dst, int dst_stride) {
+ int tile_idx;
+ extend_frame(data, width, height, stride);
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
+ loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
+ dst_stride);
+ } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_WIENER) {
+ loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst,
+ dst_stride);
+ } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ) {
+ loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst,
+ dst_stride);
+ }
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+void extend_frame_highbd(uint16_t *data, int width, int height, int stride) {
+ uint16_t *data_p;
+ int i, j;
+ for (i = 0; i < height; ++i) {
+ data_p = data + i * stride;
+ for (j = -WIENER_HALFWIN; j < 0; ++j) data_p[j] = data_p[0];
+ for (j = width; j < width + WIENER_HALFWIN; ++j)
+ data_p[j] = data_p[width - 1];
+ }
+ data_p = data - WIENER_HALFWIN;
+ for (i = -WIENER_HALFWIN; i < 0; ++i) {
+ memcpy(data_p + i * stride, data_p,
+ (width + 2 * WIENER_HALFWIN) * sizeof(uint16_t));
+ }
+ for (i = height; i < height + WIENER_HALFWIN; ++i) {
+ memcpy(data_p + i * stride, data_p + (height - 1) * stride,
+ (width + 2 * WIENER_HALFWIN) * sizeof(uint16_t));
+ }
+}
+
+static void loop_copy_tile_highbd(uint16_t *data, int tile_idx, int subtile_idx,
+ int subtile_bits, int width, int height,
+ int stride, RestorationInternal *rst,
+ uint16_t *dst, int dst_stride) {
+ const int tile_width = rst->tile_width;
+ const int tile_height = rst->tile_height;
+ int i;
+ int h_start, h_end, v_start, v_end;
+ av1_get_rest_tile_limits(tile_idx, subtile_idx, subtile_bits, rst->nhtiles,
+ rst->nvtiles, tile_width, tile_height, width, height,
+ 0, 0, &h_start, &h_end, &v_start, &v_end);
+ for (i = v_start; i < v_end; ++i)
+ memcpy(dst + i * dst_stride + h_start, data + i * stride + h_start,
+ (h_end - h_start) * sizeof(*dst));
+}
+
+static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
+ int width, int height, int stride,
+ RestorationInternal *rst,
+ int bit_depth, uint16_t *dst,
+ int dst_stride) {
+ const int tile_width = rst->tile_width;
+ const int tile_height = rst->tile_height;
+ int h_start, h_end, v_start, v_end;
+ int i, j;
+
+ if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
+ loop_copy_tile_highbd(data, tile_idx, 0, 0, width, height, stride, rst, dst,
+ dst_stride);
+ return;
+ }
+ av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
+ tile_width, tile_height, width, height, 0, 0,
+ &h_start, &h_end, &v_start, &v_end);
+ // Convolve the whole tile (done in blocks here to match the requirements
+ // of the vectorized convolve functions, but the result is equivalent)
+ for (i = v_start; i < v_end; i += MAX_SB_SIZE)
+ for (j = h_start; j < h_end; j += MAX_SB_SIZE) {
+ int w = AOMMIN(MAX_SB_SIZE, (h_end - j + 15) & ~15);
+ int h = AOMMIN(MAX_SB_SIZE, (v_end - i + 15) & ~15);
+ const uint16_t *data_p = data + i * stride + j;
+ uint16_t *dst_p = dst + i * dst_stride + j;
+ aom_highbd_convolve8_add_src(
+ CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
+ dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
+ rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h, bit_depth);
+ }
+}
+
+static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
+ int stride, RestorationInternal *rst,
+ int bit_depth, uint8_t *dst8,
+ int dst_stride) {
+ uint16_t *data = CONVERT_TO_SHORTPTR(data8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ int tile_idx;
+ extend_frame_highbd(data, width, height, stride);
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
+ bit_depth, dst, dst_stride);
+ }
+}
+
+void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height,
+ int stride, int32_t *dst,
+ int dst_stride, int bit_depth, int r,
+ int eps, int32_t *tmpbuf) {
+ int i, j;
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ dst[i * dst_stride + j] = dgd[i * stride + j];
+ }
+ }
+ av1_selfguided_restoration_internal(dst, width, height, dst_stride, bit_depth,
+ r, eps, tmpbuf);
+}
+
+void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height,
+ int stride, int32_t *dst, int dst_stride,
+ int corner, int edge) {
+ int i, j;
+ const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
+
+ i = 0;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
+ }
+ i = 0;
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
+ }
+ i = height - 1;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
+ }
+ i = height - 1;
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
+ }
+ i = 0;
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
+ dgd[k + 1]);
+ }
+ i = height - 1;
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
+ dgd[k + 1]);
+ }
+ j = 0;
+ for (i = 1; i < height - 1; ++i) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+ j = width - 1;
+ for (i = 1; i < height - 1; ++i) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+ for (i = 1; i < height - 1; ++i) {
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride + 1] + dgd[k + stride + 1]);
+ }
+ }
+}
+
+void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height,
+ int stride, int bit_depth, int eps,
+ int *xqd, uint16_t *dst,
+ int dst_stride, int32_t *tmpbuf) {
+ int xq[2];
+ int32_t *flt1 = tmpbuf;
+ int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+ int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
+ int i, j;
+ assert(width * height <= RESTORATION_TILEPELS_MAX);
+#if USE_HIGHPASS_IN_SGRPROJ
+ av1_highpass_filter_highbd_c(dat, width, height, stride, flt1, width,
+ sgr_params[eps].corner, sgr_params[eps].edge);
+#else
+ av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt1, width,
+ bit_depth, sgr_params[eps].r1,
+ sgr_params[eps].e1, tmpbuf2);
+#endif // USE_HIGHPASS_IN_SGRPROJ
+ av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt2, width,
+ bit_depth, sgr_params[eps].r2,
+ sgr_params[eps].e2, tmpbuf2);
+ decode_xq(xqd, xq);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int k = i * width + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
+ const int32_t f1 = (int32_t)flt1[k] - u;
+ const int32_t f2 = (int32_t)flt2[k] - u;
+ const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+ const int16_t w =
+ (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
+ }
+ }
+}
+
+static void loop_sgrproj_filter_tile_highbd(uint16_t *data, int tile_idx,
+ int width, int height, int stride,
+ RestorationInternal *rst,
+ int bit_depth, uint16_t *dst,
+ int dst_stride) {
+ const int tile_width = rst->tile_width;
+ const int tile_height = rst->tile_height;
+ int h_start, h_end, v_start, v_end;
+ uint16_t *data_p, *dst_p;
+
+ if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
+ loop_copy_tile_highbd(data, tile_idx, 0, 0, width, height, stride, rst, dst,
+ dst_stride);
+ return;
+ }
+ av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
+ tile_width, tile_height, width, height, 0, 0,
+ &h_start, &h_end, &v_start, &v_end);
+ data_p = data + h_start + v_start * stride;
+ dst_p = dst + h_start + v_start * dst_stride;
+ apply_selfguided_restoration_highbd(
+ data_p, h_end - h_start, v_end - v_start, stride, bit_depth,
+ rst->rsi->sgrproj_info[tile_idx].ep, rst->rsi->sgrproj_info[tile_idx].xqd,
+ dst_p, dst_stride, rst->tmpbuf);
+}
+
+static void loop_sgrproj_filter_highbd(uint8_t *data8, int width, int height,
+ int stride, RestorationInternal *rst,
+ int bit_depth, uint8_t *dst8,
+ int dst_stride) {
+ int tile_idx;
+ uint16_t *data = CONVERT_TO_SHORTPTR(data8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
+ bit_depth, dst, dst_stride);
+ }
+}
+
+static void loop_switchable_filter_highbd(uint8_t *data8, int width, int height,
+ int stride, RestorationInternal *rst,
+ int bit_depth, uint8_t *dst8,
+ int dst_stride) {
+ uint16_t *data = CONVERT_TO_SHORTPTR(data8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ int tile_idx;
+ extend_frame_highbd(data, width, height, stride);
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
+ loop_copy_tile_highbd(data, tile_idx, 0, 0, width, height, stride, rst,
+ dst, dst_stride);
+ } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_WIENER) {
+ loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
+ bit_depth, dst, dst_stride);
+ } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ) {
+ loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride,
+ rst, bit_depth, dst, dst_stride);
+ }
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+static void loop_restoration_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ int start_mi_row, int end_mi_row,
+ int components_pattern, RestorationInfo *rsi,
+ YV12_BUFFER_CONFIG *dst) {
+ const int ywidth = frame->y_crop_width;
+ const int ystride = frame->y_stride;
+ const int uvwidth = frame->uv_crop_width;
+ const int uvstride = frame->uv_stride;
+ const int ystart = start_mi_row << MI_SIZE_LOG2;
+ const int uvstart = ystart >> cm->subsampling_y;
+ int yend = end_mi_row << MI_SIZE_LOG2;
+ int uvend = yend >> cm->subsampling_y;
+ restore_func_type restore_funcs[RESTORE_TYPES] = {
+ NULL, loop_wiener_filter, loop_sgrproj_filter, loop_switchable_filter
+ };
+#if CONFIG_HIGHBITDEPTH
+ restore_func_highbd_type restore_funcs_highbd[RESTORE_TYPES] = {
+ NULL, loop_wiener_filter_highbd, loop_sgrproj_filter_highbd,
+ loop_switchable_filter_highbd
+ };
+#endif // CONFIG_HIGHBITDEPTH
+ restore_func_type restore_func;
+#if CONFIG_HIGHBITDEPTH
+ restore_func_highbd_type restore_func_highbd;
+#endif // CONFIG_HIGHBITDEPTH
+ YV12_BUFFER_CONFIG dst_;
+
+ yend = AOMMIN(yend, cm->height);
+ uvend = AOMMIN(uvend, cm->subsampling_y ? (cm->height + 1) >> 1 : cm->height);
+
+ if (components_pattern == (1 << AOM_PLANE_Y)) {
+    // Only Y
+ if (rsi[0].frame_restoration_type == RESTORE_NONE) {
+ if (dst) aom_yv12_copy_y(frame, dst);
+ return;
+ }
+ } else if (components_pattern == (1 << AOM_PLANE_U)) {
+ // Only U
+ if (rsi[1].frame_restoration_type == RESTORE_NONE) {
+ if (dst) aom_yv12_copy_u(frame, dst);
+ return;
+ }
+ } else if (components_pattern == (1 << AOM_PLANE_V)) {
+ // Only V
+ if (rsi[2].frame_restoration_type == RESTORE_NONE) {
+ if (dst) aom_yv12_copy_v(frame, dst);
+ return;
+ }
+ } else if (components_pattern ==
+ ((1 << AOM_PLANE_Y) | (1 << AOM_PLANE_U) | (1 << AOM_PLANE_V))) {
+ // All components
+ if (rsi[0].frame_restoration_type == RESTORE_NONE &&
+ rsi[1].frame_restoration_type == RESTORE_NONE &&
+ rsi[2].frame_restoration_type == RESTORE_NONE) {
+ if (dst) aom_yv12_copy_frame(frame, dst);
+ return;
+ }
+ }
+
+ if (!dst) {
+ dst = &dst_;
+ memset(dst, 0, sizeof(YV12_BUFFER_CONFIG));
+ if (aom_realloc_frame_buffer(
+ dst, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL) < 0)
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate restoration dst buffer");
+ }
+
+ if ((components_pattern >> AOM_PLANE_Y) & 1) {
+ if (rsi[0].frame_restoration_type != RESTORE_NONE) {
+ cm->rst_internal.ntiles = av1_get_rest_ntiles(
+ cm->width, cm->height, cm->rst_info[AOM_PLANE_Y].restoration_tilesize,
+ &cm->rst_internal.tile_width, &cm->rst_internal.tile_height,
+ &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles);
+ cm->rst_internal.rsi = &rsi[0];
+ restore_func =
+ restore_funcs[cm->rst_internal.rsi->frame_restoration_type];
+#if CONFIG_HIGHBITDEPTH
+ restore_func_highbd =
+ restore_funcs_highbd[cm->rst_internal.rsi->frame_restoration_type];
+ if (cm->use_highbitdepth)
+ restore_func_highbd(
+ frame->y_buffer + ystart * ystride, ywidth, yend - ystart, ystride,
+ &cm->rst_internal, cm->bit_depth,
+ dst->y_buffer + ystart * dst->y_stride, dst->y_stride);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ restore_func(frame->y_buffer + ystart * ystride, ywidth, yend - ystart,
+ ystride, &cm->rst_internal,
+ dst->y_buffer + ystart * dst->y_stride, dst->y_stride);
+ } else {
+ aom_yv12_copy_y(frame, dst);
+ }
+ }
+
+ if ((components_pattern >> AOM_PLANE_U) & 1) {
+ if (rsi[AOM_PLANE_U].frame_restoration_type != RESTORE_NONE) {
+ cm->rst_internal.ntiles = av1_get_rest_ntiles(
+ ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x),
+ ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y),
+ cm->rst_info[AOM_PLANE_U].restoration_tilesize,
+ &cm->rst_internal.tile_width, &cm->rst_internal.tile_height,
+ &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles);
+ cm->rst_internal.rsi = &rsi[AOM_PLANE_U];
+ restore_func =
+ restore_funcs[cm->rst_internal.rsi->frame_restoration_type];
+#if CONFIG_HIGHBITDEPTH
+ restore_func_highbd =
+ restore_funcs_highbd[cm->rst_internal.rsi->frame_restoration_type];
+ if (cm->use_highbitdepth)
+ restore_func_highbd(
+ frame->u_buffer + uvstart * uvstride, uvwidth, uvend - uvstart,
+ uvstride, &cm->rst_internal, cm->bit_depth,
+ dst->u_buffer + uvstart * dst->uv_stride, dst->uv_stride);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ restore_func(frame->u_buffer + uvstart * uvstride, uvwidth,
+ uvend - uvstart, uvstride, &cm->rst_internal,
+ dst->u_buffer + uvstart * dst->uv_stride, dst->uv_stride);
+ } else {
+ aom_yv12_copy_u(frame, dst);
+ }
+ }
+
+ if ((components_pattern >> AOM_PLANE_V) & 1) {
+ if (rsi[AOM_PLANE_V].frame_restoration_type != RESTORE_NONE) {
+ cm->rst_internal.ntiles = av1_get_rest_ntiles(
+ ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x),
+ ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y),
+ cm->rst_info[AOM_PLANE_V].restoration_tilesize,
+ &cm->rst_internal.tile_width, &cm->rst_internal.tile_height,
+ &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles);
+ cm->rst_internal.rsi = &rsi[AOM_PLANE_V];
+ restore_func =
+ restore_funcs[cm->rst_internal.rsi->frame_restoration_type];
+#if CONFIG_HIGHBITDEPTH
+ restore_func_highbd =
+ restore_funcs_highbd[cm->rst_internal.rsi->frame_restoration_type];
+ if (cm->use_highbitdepth)
+ restore_func_highbd(
+ frame->v_buffer + uvstart * uvstride, uvwidth, uvend - uvstart,
+ uvstride, &cm->rst_internal, cm->bit_depth,
+ dst->v_buffer + uvstart * dst->uv_stride, dst->uv_stride);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ restore_func(frame->v_buffer + uvstart * uvstride, uvwidth,
+ uvend - uvstart, uvstride, &cm->rst_internal,
+ dst->v_buffer + uvstart * dst->uv_stride, dst->uv_stride);
+ } else {
+ aom_yv12_copy_v(frame, dst);
+ }
+ }
+
+ if (dst == &dst_) {
+ if ((components_pattern >> AOM_PLANE_Y) & 1) aom_yv12_copy_y(dst, frame);
+ if ((components_pattern >> AOM_PLANE_U) & 1) aom_yv12_copy_u(dst, frame);
+ if ((components_pattern >> AOM_PLANE_V) & 1) aom_yv12_copy_v(dst, frame);
+ aom_free_frame_buffer(dst);
+ }
+}
+
+void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ RestorationInfo *rsi, int components_pattern,
+ int partial_frame, YV12_BUFFER_CONFIG *dst) {
+ int start_mi_row, end_mi_row, mi_rows_to_filter;
+ start_mi_row = 0;
+ mi_rows_to_filter = cm->mi_rows;
+ if (partial_frame && cm->mi_rows > 8) {
+ start_mi_row = cm->mi_rows >> 1;
+ start_mi_row &= 0xfffffff8;
+ mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
+ }
+ end_mi_row = start_mi_row + mi_rows_to_filter;
+ loop_restoration_init(&cm->rst_internal, cm->frame_type == KEY_FRAME);
+ loop_restoration_rows(frame, cm, start_mi_row, end_mi_row, components_pattern,
+ rsi, dst);
+}
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
new file mode 100644
index 000000000..866f78b79
--- /dev/null
+++ b/third_party/aom/av1/common/restoration.h
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_RESTORATION_H_
+#define AV1_COMMON_RESTORATION_H_
+
+#include "aom_ports/mem.h"
+#include "./aom_config.h"
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
+#define RINT(x) ((x) < 0 ? (int)((x) - 0.5) : (int)((x) + 0.5))
+
+#define RESTORATION_TILESIZE_MAX 256
+#define RESTORATION_TILEPELS_MAX \
+ (RESTORATION_TILESIZE_MAX * RESTORATION_TILESIZE_MAX * 9 / 4)
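+// (The 9 / 4 factor allows for the last tile in each dimension, which
+// av1_get_rest_ntiles() below lets grow to up to 1.5x the nominal size.)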
+
+// Four 32-bit buffers are needed for the filter:
+// 2 tile-sized buffers for the two filtered versions of the frame
+// (flt1 and flt2), and
+// 2 working buffers (A and B) for the box sums of each filtering pass
+#define SGRPROJ_OUTBUF_SIZE \
+ ((RESTORATION_TILESIZE_MAX * 3 / 2) * (RESTORATION_TILESIZE_MAX * 3 / 2 + 16))
+#define SGRPROJ_TMPBUF_SIZE \
+ (RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t) + \
+ SGRPROJ_OUTBUF_SIZE * 2 * sizeof(int32_t))
+#define SGRPROJ_EXTBUF_SIZE (0)
+#define SGRPROJ_PARAMS_BITS 4
+#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
+#define USE_HIGHPASS_IN_SGRPROJ 0
+
+// Precision bits for projection
+#define SGRPROJ_PRJ_BITS 7
+// Extra precision bits, above the source bit depth, at which the filtered
+// outputs are generated before projection
+#define SGRPROJ_RST_BITS 4
+// Internal precision bits for core selfguided_restoration
+#define SGRPROJ_SGR_BITS 8
+#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
+
+#if USE_HIGHPASS_IN_SGRPROJ
+#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) / 8)
+#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
+#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 2)
+#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
+#else
+#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
+#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
+#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
+#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
+#endif // USE_HIGHPASS_IN_SGRPROJ
+
+#define SGRPROJ_PRJ_SUBEXP_K 4
+
+#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
+
+#define MAX_RADIUS 3 // Only 1, 2, 3 allowed
+#define MAX_EPS 80 // Max value of eps
+#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
+#define SGRPROJ_MTABLE_BITS 20
+#define SGRPROJ_RECIP_BITS 12
+
+#define WIENER_HALFWIN 3
+#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
+#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
+#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
+#define WIENER_TMPBUF_SIZE (0)
+#define WIENER_EXTBUF_SIZE (0)
+
+#define WIENER_FILT_PREC_BITS 7
+#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
+
+// Central values for the taps
+#define WIENER_FILT_TAP0_MIDV (3)
+#define WIENER_FILT_TAP1_MIDV (-7)
+#define WIENER_FILT_TAP2_MIDV (15)
+
+#define WIENER_FILT_TAP0_BITS 4
+#define WIENER_FILT_TAP1_BITS 5
+#define WIENER_FILT_TAP2_BITS 6
+
+#define WIENER_FILT_BITS \
+ ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2)
+
+#define WIENER_FILT_TAP0_MINV \
+ (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MINV \
+ (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MINV \
+ (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_MAXV \
+ (WIENER_FILT_TAP0_MIDV - 1 + (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MAXV \
+ (WIENER_FILT_TAP1_MIDV - 1 + (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MAXV \
+ (WIENER_FILT_TAP2_MIDV - 1 + (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_SUBEXP_K 1
+#define WIENER_FILT_TAP1_SUBEXP_K 2
+#define WIENER_FILT_TAP2_SUBEXP_K 3
+
+// Max of SGRPROJ_TMPBUF_SIZE and WIENER_TMPBUF_SIZE
+#define RESTORATION_TMPBUF_SIZE (SGRPROJ_TMPBUF_SIZE)
+
+// Max of SGRPROJ_EXTBUF_SIZE, WIENER_EXTBUF_SIZE
+#define RESTORATION_EXTBUF_SIZE (WIENER_EXTBUF_SIZE)
+
+// Check the assumptions of the existing code
+#if SUBPEL_TAPS != WIENER_WIN + 1
+#error "Wiener filter currently only works if SUBPEL_TAPS == WIENER_WIN + 1"
+#endif
+#if WIENER_FILT_PREC_BITS != 7
+#error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7"
+#endif
+typedef struct {
+ DECLARE_ALIGNED(16, InterpKernel, vfilter);
+ DECLARE_ALIGNED(16, InterpKernel, hfilter);
+} WienerInfo;
+
+typedef struct {
+#if USE_HIGHPASS_IN_SGRPROJ
+ int corner;
+ int edge;
+#else
+ int r1;
+ int e1;
+#endif // USE_HIGHPASS_IN_SGRPROJ
+ int r2;
+ int e2;
+} sgr_params_type;
+
+typedef struct {
+ int ep;
+ int xqd[2];
+} SgrprojInfo;
+
+typedef struct {
+ int restoration_tilesize;
+ RestorationType frame_restoration_type;
+ RestorationType *restoration_type;
+ // Wiener filter
+ WienerInfo *wiener_info;
+ // Selfguided proj filter
+ SgrprojInfo *sgrproj_info;
+} RestorationInfo;
+
+typedef struct {
+ RestorationInfo *rsi;
+ int keyframe;
+ int ntiles;
+ int tile_width, tile_height;
+ int nhtiles, nvtiles;
+ int32_t *tmpbuf;
+} RestorationInternal;
+
+static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) {
+ sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2;
+ sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2;
+}
+
+static INLINE void set_default_wiener(WienerInfo *wiener_info) {
+ wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV;
+ wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV;
+ wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV;
+ wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] =
+ -2 *
+ (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV);
+ wiener_info->vfilter[4] = wiener_info->hfilter[4] = WIENER_FILT_TAP2_MIDV;
+ wiener_info->vfilter[5] = wiener_info->hfilter[5] = WIENER_FILT_TAP1_MIDV;
+ wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV;
+}
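+// The seven default taps (3, -7, 15, -22, 15, -7, 3) sum to zero; assuming
+// the aom_convolve8_add_src variants add the unfiltered source pixel back
+// in (an implicit center weight of WIENER_FILT_STEP == 128), the default
+// filter has unit DC gain after the final rounding shift.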
+
+static INLINE int av1_get_rest_ntiles(int width, int height, int tilesize,
+ int *tile_width, int *tile_height,
+ int *nhtiles, int *nvtiles) {
+ int nhtiles_, nvtiles_;
+ int tile_width_, tile_height_;
+ tile_width_ = (tilesize < 0) ? width : AOMMIN(tilesize, width);
+ tile_height_ = (tilesize < 0) ? height : AOMMIN(tilesize, height);
+ nhtiles_ = (width + (tile_width_ >> 1)) / tile_width_;
+ nvtiles_ = (height + (tile_height_ >> 1)) / tile_height_;
+ if (tile_width) *tile_width = tile_width_;
+ if (tile_height) *tile_height = tile_height_;
+ if (nhtiles) *nhtiles = nhtiles_;
+ if (nvtiles) *nvtiles = nvtiles_;
+ return (nhtiles_ * nvtiles_);
+}
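+// Worked example: width = 1920, height = 1080, tilesize = 256 gives
+// tile_width = tile_height = 256, nhtiles = (1920 + 128) / 256 = 8 and
+// nvtiles = (1080 + 128) / 256 = 4, i.e. 32 tiles in total.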
+
+static INLINE void av1_get_rest_tile_limits(
+ int tile_idx, int subtile_idx, int subtile_bits, int nhtiles, int nvtiles,
+ int tile_width, int tile_height, int im_width, int im_height, int clamp_h,
+ int clamp_v, int *h_start, int *h_end, int *v_start, int *v_end) {
+ const int htile_idx = tile_idx % nhtiles;
+ const int vtile_idx = tile_idx / nhtiles;
+ *h_start = htile_idx * tile_width;
+ *v_start = vtile_idx * tile_height;
+ *h_end = (htile_idx < nhtiles - 1) ? *h_start + tile_width : im_width;
+ *v_end = (vtile_idx < nvtiles - 1) ? *v_start + tile_height : im_height;
+ if (subtile_bits) {
+ const int num_subtiles_1d = (1 << subtile_bits);
+ const int subtile_width = (*h_end - *h_start) >> subtile_bits;
+ const int subtile_height = (*v_end - *v_start) >> subtile_bits;
+ const int subtile_idx_h = subtile_idx & (num_subtiles_1d - 1);
+ const int subtile_idx_v = subtile_idx >> subtile_bits;
+ *h_start += subtile_idx_h * subtile_width;
+ *v_start += subtile_idx_v * subtile_height;
+ *h_end = subtile_idx_h == num_subtiles_1d - 1 ? *h_end
+ : *h_start + subtile_width;
+ *v_end = subtile_idx_v == num_subtiles_1d - 1 ? *v_end
+ : *v_start + subtile_height;
+ }
+ if (clamp_h) {
+ *h_start = AOMMAX(*h_start, clamp_h);
+ *h_end = AOMMIN(*h_end, im_width - clamp_h);
+ }
+ if (clamp_v) {
+ *v_start = AOMMAX(*v_start, clamp_v);
+ *v_end = AOMMIN(*v_end, im_height - clamp_v);
+ }
+}
+
+extern const sgr_params_type sgr_params[SGRPROJ_PARAMS];
+extern int sgrproj_mtable[MAX_EPS][MAX_NELEM];
+extern const int32_t x_by_xplus1[256];
+extern const int32_t one_by_x[MAX_NELEM];
+
+int av1_alloc_restoration_struct(struct AV1Common *cm,
+ RestorationInfo *rst_info, int width,
+ int height);
+void av1_free_restoration_struct(RestorationInfo *rst_info);
+
+void extend_frame(uint8_t *data, int width, int height, int stride);
+#if CONFIG_HIGHBITDEPTH
+void extend_frame_highbd(uint16_t *data, int width, int height, int stride);
+#endif // CONFIG_HIGHBITDEPTH
+void decode_xq(int *xqd, int *xq);
+void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ RestorationInfo *rsi, int components_pattern,
+ int partial_frame, YV12_BUFFER_CONFIG *dst);
+void av1_loop_restoration_precal(void);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_RESTORATION_H_
diff --git a/third_party/aom/av1/common/scale.c b/third_party/aom/av1/common/scale.c
new file mode 100644
index 000000000..76beaa2bd
--- /dev/null
+++ b/third_party/aom/av1/common/scale.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_dsp_rtcd.h"
+#include "av1/common/filter.h"
+#include "av1/common/scale.h"
+#include "aom_dsp/aom_filter.h"
+
+static INLINE int scaled_x(int val, const struct scale_factors *sf) {
+ return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT);
+}
+
+static INLINE int scaled_y(int val, const struct scale_factors *sf) {
+ return (int)((int64_t)val * sf->y_scale_fp >> REF_SCALE_SHIFT);
+}
+
+static int unscaled_value(int val, const struct scale_factors *sf) {
+ (void)sf;
+ return val;
+}
+
+static int get_fixed_point_scale_factor(int other_size, int this_size) {
+  // Calculate the scaling factor once for each reference frame and use the
+  // fixed-point result in both the decoding and encoding routines, so that
+  // hardware implementations can compute the factor in the device driver
+  // and replace the division with multiplication and shifting.
+ return (other_size << REF_SCALE_SHIFT) / this_size;
+}
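+/* With REF_SCALE_SHIFT == 14 (see scale.h) this is a Q14 fixed-point
+ * ratio: scaling a 1920-wide reference to a 960-wide frame gives
+ * (1920 << 14) / 960 = 2 << 14, i.e. 2.0, so one pixel of the current
+ * frame advances two pixels in the reference. */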
+
+MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
+ const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK;
+ const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK;
+ const MV32 res = { scaled_y(mv->row, sf) + y_off_q4,
+ scaled_x(mv->col, sf) + x_off_q4 };
+ return res;
+}
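+/* A sketch of the arithmetic above: with x_scale_fp = 2 << 14 (a 2x
+ * ratio), scaled_x(10, sf) = 20. The (x, y) block position contributes
+ * only its scaled sub-pixel phase (the low SUBPEL_BITS of the scaled
+ * coordinate), which re-aligns the scaled motion vector with the
+ * prediction sample grid. */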
+
+#if CONFIG_HIGHBITDEPTH
+void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
+ int other_h, int this_w, int this_h,
+ int use_highbd) {
+#else
+void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
+ int other_h, int this_w, int this_h) {
+#endif
+ if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
+ sf->x_scale_fp = REF_INVALID_SCALE;
+ sf->y_scale_fp = REF_INVALID_SCALE;
+ return;
+ }
+
+ sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
+ sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
+ sf->x_step_q4 = scaled_x(16, sf);
+ sf->y_step_q4 = scaled_y(16, sf);
+
+ if (av1_is_scaled(sf)) {
+ sf->scale_value_x = scaled_x;
+ sf->scale_value_y = scaled_y;
+ } else {
+ sf->scale_value_x = unscaled_value;
+ sf->scale_value_y = unscaled_value;
+ }
+
+  // TODO(agrange): Investigate the best choice of functions to use here
+  // for EIGHTTAP_SMOOTH. Since it is not interpolating, we need to choose
+  // what to do at full-pel offsets. The current selection, where the filter
+  // is applied in one direction only, and not at all for 0,0, seems to give
+  // the best quality, but it may be worth trying an additional mode that
+  // also applies the filter at full-pel offsets.
+ if (sf->x_step_q4 == 16) {
+ if (sf->y_step_q4 == 16) {
+ // No scaling in either direction.
+ sf->predict[0][0][0] = aom_convolve_copy;
+ sf->predict[0][0][1] = aom_convolve_avg;
+ sf->predict[0][1][0] = aom_convolve8_vert;
+ sf->predict[0][1][1] = aom_convolve8_avg_vert;
+ sf->predict[1][0][0] = aom_convolve8_horiz;
+ sf->predict[1][0][1] = aom_convolve8_avg_horiz;
+ } else {
+      // No scaling in the x direction. Must always scale in the y direction.
+ sf->predict[0][0][0] = aom_convolve8_vert;
+ sf->predict[0][0][1] = aom_convolve8_avg_vert;
+ sf->predict[0][1][0] = aom_convolve8_vert;
+ sf->predict[0][1][1] = aom_convolve8_avg_vert;
+ sf->predict[1][0][0] = aom_convolve8;
+ sf->predict[1][0][1] = aom_convolve8_avg;
+ }
+ } else {
+ if (sf->y_step_q4 == 16) {
+ // No scaling in the y direction. Must always scale in the x direction.
+ sf->predict[0][0][0] = aom_convolve8_horiz;
+ sf->predict[0][0][1] = aom_convolve8_avg_horiz;
+ sf->predict[0][1][0] = aom_convolve8;
+ sf->predict[0][1][1] = aom_convolve8_avg;
+ sf->predict[1][0][0] = aom_convolve8_horiz;
+ sf->predict[1][0][1] = aom_convolve8_avg_horiz;
+ } else {
+ // Must always scale in both directions.
+ sf->predict[0][0][0] = aom_convolve8;
+ sf->predict[0][0][1] = aom_convolve8_avg;
+ sf->predict[0][1][0] = aom_convolve8;
+ sf->predict[0][1][1] = aom_convolve8_avg;
+ sf->predict[1][0][0] = aom_convolve8;
+ sf->predict[1][0][1] = aom_convolve8_avg;
+ }
+ }
+  // 2D subpel motion always gets filtered in both directions.
+ sf->predict[1][1][0] = aom_convolve8;
+ sf->predict[1][1][1] = aom_convolve8_avg;
+
+#if CONFIG_HIGHBITDEPTH
+ if (use_highbd) {
+ if (sf->x_step_q4 == 16) {
+ if (sf->y_step_q4 == 16) {
+ // No scaling in either direction.
+ sf->highbd_predict[0][0][0] = aom_highbd_convolve_copy;
+ sf->highbd_predict[0][0][1] = aom_highbd_convolve_avg;
+ sf->highbd_predict[0][1][0] = aom_highbd_convolve8_vert;
+ sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg_vert;
+ sf->highbd_predict[1][0][0] = aom_highbd_convolve8_horiz;
+ sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg_horiz;
+ } else {
+        // No scaling in the x direction. Must always scale in the y direction.
+ sf->highbd_predict[0][0][0] = aom_highbd_convolve8_vert;
+ sf->highbd_predict[0][0][1] = aom_highbd_convolve8_avg_vert;
+ sf->highbd_predict[0][1][0] = aom_highbd_convolve8_vert;
+ sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg_vert;
+ sf->highbd_predict[1][0][0] = aom_highbd_convolve8;
+ sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg;
+ }
+ } else {
+ if (sf->y_step_q4 == 16) {
+ // No scaling in the y direction. Must always scale in the x direction.
+ sf->highbd_predict[0][0][0] = aom_highbd_convolve8_horiz;
+ sf->highbd_predict[0][0][1] = aom_highbd_convolve8_avg_horiz;
+ sf->highbd_predict[0][1][0] = aom_highbd_convolve8;
+ sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg;
+ sf->highbd_predict[1][0][0] = aom_highbd_convolve8_horiz;
+ sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg_horiz;
+ } else {
+ // Must always scale in both directions.
+ sf->highbd_predict[0][0][0] = aom_highbd_convolve8;
+ sf->highbd_predict[0][0][1] = aom_highbd_convolve8_avg;
+ sf->highbd_predict[0][1][0] = aom_highbd_convolve8;
+ sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg;
+ sf->highbd_predict[1][0][0] = aom_highbd_convolve8;
+ sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg;
+ }
+ }
+ // 2D subpel motion always gets filtered in both directions.
+ sf->highbd_predict[1][1][0] = aom_highbd_convolve8;
+ sf->highbd_predict[1][1][1] = aom_highbd_convolve8_avg;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+}
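+/* A minimal usage sketch (illustrative names, not part of this file):
+ *
+ *   struct scale_factors sf;
+ *   av1_setup_scale_factors_for_frame(&sf, ref_w, ref_h, cur_w, cur_h,
+ *                                     use_highbd);
+ *   if (av1_is_scaled(&sf)) {
+ *     // take the scaled convolve paths selected above
+ *   }
+ *
+ * When CONFIG_HIGHBITDEPTH is off, the trailing use_highbd argument is
+ * dropped, matching the alternate signature. */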
diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h
new file mode 100644
index 000000000..ea81efab0
--- /dev/null
+++ b/third_party/aom/av1/common/scale.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_SCALE_H_
+#define AV1_COMMON_SCALE_H_
+
+#include "av1/common/mv.h"
+#include "aom_dsp/aom_convolve.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define REF_SCALE_SHIFT 14
+#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
+#define REF_INVALID_SCALE -1
+
+struct scale_factors {
+  int x_scale_fp;  // horizontal fixed-point scale factor
+  int y_scale_fp;  // vertical fixed-point scale factor
+ int x_step_q4;
+ int y_step_q4;
+
+ int (*scale_value_x)(int val, const struct scale_factors *sf);
+ int (*scale_value_y)(int val, const struct scale_factors *sf);
+
+ convolve_fn_t predict[2][2][2]; // horiz, vert, avg
+#if CONFIG_HIGHBITDEPTH
+ highbd_convolve_fn_t highbd_predict[2][2][2]; // horiz, vert, avg
+#endif // CONFIG_HIGHBITDEPTH
+};
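+/* The predict tables are indexed as predict[subpel_x][subpel_y][avg]; a
+ * caller would typically select something like (illustrative, the exact
+ * call sites live in the inter-prediction code)
+ *   predict[(x_q4 & SUBPEL_MASK) != 0][(y_q4 & SUBPEL_MASK) != 0][ref]
+ * so that full-pel positions use the cheaper copy/vert/horiz kernels. */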
+
+MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
+
+#if CONFIG_HIGHBITDEPTH
+void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
+ int other_h, int this_w, int this_h,
+                                       int use_highbd);
+#else
+void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
+ int other_h, int this_w, int this_h);
+#endif // CONFIG_HIGHBITDEPTH
+
+static INLINE int av1_is_valid_scale(const struct scale_factors *sf) {
+ return sf->x_scale_fp != REF_INVALID_SCALE &&
+ sf->y_scale_fp != REF_INVALID_SCALE;
+}
+
+static INLINE int av1_is_scaled(const struct scale_factors *sf) {
+ return av1_is_valid_scale(sf) &&
+ (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
+}
+
+static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
+ int this_width, int this_height) {
+ return 2 * this_width >= ref_width && 2 * this_height >= ref_height &&
+ this_width <= 16 * ref_width && this_height <= 16 * ref_height;
+}
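+/* Equivalently: a reference frame is usable only if each of its
+ * dimensions is at most 2x and at least 1/16 of the corresponding
+ * dimension of the current frame. */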
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_SCALE_H_
diff --git a/third_party/aom/av1/common/scan.c b/third_party/aom/av1/common/scan.c
new file mode 100644
index 000000000..9ad6c0b2f
--- /dev/null
+++ b/third_party/aom/av1/common/scan.c
@@ -0,0 +1,6853 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/common_data.h"
+#include "av1/common/scan.h"
+
+#if CONFIG_CB4X4
+DECLARE_ALIGNED(16, static const int16_t, default_scan_2x2[4]) = {
+ 0, 1, 2, 3,
+};
+#endif
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = {
+ 0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15,
+};
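+/* Each scan table maps a scan position k to the raster-order coefficient
+ * index visited at step k. In default_scan_4x4 above, the first entries
+ * 0, 4, 1, 5, 8, 2 visit the DC term and then walk the low-frequency
+ * anti-diagonals of the 4x4 block in a zig-zag-like order. */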
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = {
+ 0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = {
+ 0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {
+ 0, 1, 4, 5, 2, 8, 6, 9, 10, 3, 12, 7, 13, 11, 14, 16,
+ 17, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {
+ 0, 1, 8, 9, 2, 16, 10, 17, 18, 3, 24, 11, 25, 19, 26, 4,
+ 12, 27, 20, 5, 28, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x16[64]) = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+ 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+ 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x4[64]) = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 5, 20,
+ 35, 50, 6, 21, 36, 51, 7, 22, 37, 52, 8, 23, 38, 53, 9, 24,
+ 39, 54, 10, 25, 40, 55, 11, 26, 41, 56, 12, 27, 42, 57, 13, 28,
+ 43, 58, 14, 29, 44, 59, 15, 30, 45, 60, 31, 46, 61, 47, 62, 63,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x4[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x16[64]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x4[64]) = {
+ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+ 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+ 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+ 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116,
+ 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125,
+ 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134,
+ 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143,
+ 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200,
+ 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209,
+ 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218,
+ 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227,
+ 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243,
+ 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x8[256]) = {
+ 0, 1, 32, 2, 33, 64, 3, 34, 65, 96, 4, 35, 66, 97, 128,
+ 5, 36, 67, 98, 129, 160, 6, 37, 68, 99, 130, 161, 192, 7, 38,
+ 69, 100, 131, 162, 193, 224, 8, 39, 70, 101, 132, 163, 194, 225, 9,
+ 40, 71, 102, 133, 164, 195, 226, 10, 41, 72, 103, 134, 165, 196, 227,
+ 11, 42, 73, 104, 135, 166, 197, 228, 12, 43, 74, 105, 136, 167, 198,
+ 229, 13, 44, 75, 106, 137, 168, 199, 230, 14, 45, 76, 107, 138, 169,
+ 200, 231, 15, 46, 77, 108, 139, 170, 201, 232, 16, 47, 78, 109, 140,
+ 171, 202, 233, 17, 48, 79, 110, 141, 172, 203, 234, 18, 49, 80, 111,
+ 142, 173, 204, 235, 19, 50, 81, 112, 143, 174, 205, 236, 20, 51, 82,
+ 113, 144, 175, 206, 237, 21, 52, 83, 114, 145, 176, 207, 238, 22, 53,
+ 84, 115, 146, 177, 208, 239, 23, 54, 85, 116, 147, 178, 209, 240, 24,
+ 55, 86, 117, 148, 179, 210, 241, 25, 56, 87, 118, 149, 180, 211, 242,
+ 26, 57, 88, 119, 150, 181, 212, 243, 27, 58, 89, 120, 151, 182, 213,
+ 244, 28, 59, 90, 121, 152, 183, 214, 245, 29, 60, 91, 122, 153, 184,
+ 215, 246, 30, 61, 92, 123, 154, 185, 216, 247, 31, 62, 93, 124, 155,
+ 186, 217, 248, 63, 94, 125, 156, 187, 218, 249, 95, 126, 157, 188, 219,
+ 250, 127, 158, 189, 220, 251, 159, 190, 221, 252, 191, 222, 253, 223, 254,
+ 255,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x8[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x32[256]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232,
+ 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97,
+ 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217,
+ 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82,
+ 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+ 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67,
+ 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187,
+ 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52,
+ 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172,
+ 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37,
+ 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157,
+ 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22,
+ 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142,
+ 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7,
+ 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+ 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x8[256]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225,
+ 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227,
+ 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229,
+ 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231,
+ 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233,
+ 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235,
+ 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237,
+ 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239,
+ 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241,
+ 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243,
+ 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245,
+ 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247,
+ 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249,
+ 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251,
+ 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253,
+ 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
+ 0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26,
+ 33, 19, 40, 12, 34, 27, 5, 41, 20, 48, 13, 35, 42, 28, 21, 6,
+ 49, 56, 36, 43, 29, 7, 14, 50, 57, 44, 22, 37, 15, 51, 58, 30,
+ 45, 23, 52, 59, 38, 31, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = {
+ 0, 8, 16, 1, 24, 9, 32, 17, 2, 40, 25, 10, 33, 18, 48, 3,
+ 26, 41, 11, 56, 19, 34, 4, 49, 27, 42, 12, 35, 20, 57, 50, 28,
+ 5, 43, 13, 36, 58, 51, 21, 44, 6, 29, 59, 37, 14, 52, 22, 7,
+ 45, 60, 30, 15, 38, 53, 23, 46, 31, 61, 39, 54, 47, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, row_scan_8x8[64]) = {
+ 0, 1, 2, 8, 9, 3, 16, 10, 4, 17, 11, 24, 5, 18, 25, 12,
+ 19, 26, 32, 6, 13, 20, 33, 27, 7, 34, 40, 21, 28, 41, 14, 35,
+ 48, 42, 29, 36, 49, 22, 43, 15, 56, 37, 50, 44, 30, 57, 23, 51,
+ 58, 45, 38, 52, 31, 59, 53, 46, 60, 39, 61, 47, 54, 55, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110,
+ 117, 124, 111, 118, 125, 119, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5,
+ 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52,
+ 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 9, 24, 39, 54,
+ 69, 84, 99, 114, 10, 25, 40, 55, 70, 85, 100, 115, 11, 26, 41, 56,
+ 71, 86, 101, 116, 12, 27, 42, 57, 72, 87, 102, 117, 13, 28, 43, 58,
+ 73, 88, 103, 118, 14, 29, 44, 59, 74, 89, 104, 119, 15, 30, 45, 60,
+ 75, 90, 105, 120, 31, 46, 61, 76, 91, 106, 121, 47, 62, 77, 92, 107,
+ 122, 63, 78, 93, 108, 123, 79, 94, 109, 124, 95, 110, 125, 111, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x8[128]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x16[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x8[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64,
+ 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22,
+ 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70,
+ 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177,
+ 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226,
+ 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227,
+ 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228,
+ 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229,
+ 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230,
+ 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232,
+ 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233,
+ 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234,
+ 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235,
+ 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236,
+ 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237,
+ 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238,
+ 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239,
+ 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464,
+ 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465,
+ 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466,
+ 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467,
+ 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483,
+ 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335,
+ 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396,
+ 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472,
+ 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+ 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476,
+ 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = {
+ 0, 1, 32, 2, 33, 64, 3, 34, 65, 96, 4, 35, 66, 97, 128,
+ 5, 36, 67, 98, 129, 160, 6, 37, 68, 99, 130, 161, 192, 7, 38,
+ 69, 100, 131, 162, 193, 224, 8, 39, 70, 101, 132, 163, 194, 225, 256,
+ 9, 40, 71, 102, 133, 164, 195, 226, 257, 288, 10, 41, 72, 103, 134,
+ 165, 196, 227, 258, 289, 320, 11, 42, 73, 104, 135, 166, 197, 228, 259,
+ 290, 321, 352, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353,
+ 384, 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385, 416,
+ 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, 386, 417, 448,
+ 15, 46, 77, 108, 139, 170, 201, 232, 263, 294, 325, 356, 387, 418, 449,
+ 480, 16, 47, 78, 109, 140, 171, 202, 233, 264, 295, 326, 357, 388, 419,
+ 450, 481, 17, 48, 79, 110, 141, 172, 203, 234, 265, 296, 327, 358, 389,
+ 420, 451, 482, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, 359,
+ 390, 421, 452, 483, 19, 50, 81, 112, 143, 174, 205, 236, 267, 298, 329,
+ 360, 391, 422, 453, 484, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299,
+ 330, 361, 392, 423, 454, 485, 21, 52, 83, 114, 145, 176, 207, 238, 269,
+ 300, 331, 362, 393, 424, 455, 486, 22, 53, 84, 115, 146, 177, 208, 239,
+ 270, 301, 332, 363, 394, 425, 456, 487, 23, 54, 85, 116, 147, 178, 209,
+ 240, 271, 302, 333, 364, 395, 426, 457, 488, 24, 55, 86, 117, 148, 179,
+ 210, 241, 272, 303, 334, 365, 396, 427, 458, 489, 25, 56, 87, 118, 149,
+ 180, 211, 242, 273, 304, 335, 366, 397, 428, 459, 490, 26, 57, 88, 119,
+ 150, 181, 212, 243, 274, 305, 336, 367, 398, 429, 460, 491, 27, 58, 89,
+ 120, 151, 182, 213, 244, 275, 306, 337, 368, 399, 430, 461, 492, 28, 59,
+ 90, 121, 152, 183, 214, 245, 276, 307, 338, 369, 400, 431, 462, 493, 29,
+ 60, 91, 122, 153, 184, 215, 246, 277, 308, 339, 370, 401, 432, 463, 494,
+ 30, 61, 92, 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464,
+ 495, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, 372, 403, 434,
+ 465, 496, 63, 94, 125, 156, 187, 218, 249, 280, 311, 342, 373, 404, 435,
+ 466, 497, 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467,
+ 498, 127, 158, 189, 220, 251, 282, 313, 344, 375, 406, 437, 468, 499, 159,
+ 190, 221, 252, 283, 314, 345, 376, 407, 438, 469, 500, 191, 222, 253, 284,
+ 315, 346, 377, 408, 439, 470, 501, 223, 254, 285, 316, 347, 378, 409, 440,
+ 471, 502, 255, 286, 317, 348, 379, 410, 441, 472, 503, 287, 318, 349, 380,
+ 411, 442, 473, 504, 319, 350, 381, 412, 443, 474, 505, 351, 382, 413, 444,
+ 475, 506, 383, 414, 445, 476, 507, 415, 446, 477, 508, 447, 478, 509, 479,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
+ 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193,
+ 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433,
+ 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162,
+ 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402,
+ 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131,
+ 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371,
+ 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100,
+ 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340,
+ 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69,
+ 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309,
+ 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38,
+ 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278,
+ 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7,
+ 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487,
+ 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216,
+ 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456,
+ 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185,
+ 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425,
+ 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154,
+ 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394,
+ 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123,
+ 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363,
+ 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92,
+ 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332,
+ 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61,
+ 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301,
+ 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30,
+ 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270,
+ 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479,
+ 495, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x16[512]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+ 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+ 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+ 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+ 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+ 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+ 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+ 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+ 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491,
+ 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492,
+ 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494,
+ 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495,
+ 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496,
+ 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498,
+ 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499,
+ 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501,
+ 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502,
+ 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503,
+ 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505,
+ 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506,
+ 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508,
+ 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509,
+ 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510,
+ 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x32[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65,
+ 80, 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112,
+ 21, 52, 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7,
+ 129, 38, 69, 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116,
+ 101, 131, 160, 146, 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147,
+ 176, 162, 87, 56, 25, 133, 118, 177, 148, 72, 103, 41, 163, 10, 192,
+ 178, 88, 57, 134, 149, 119, 26, 164, 73, 104, 193, 42, 179, 208, 11,
+ 135, 89, 165, 120, 150, 58, 194, 180, 27, 74, 209, 105, 151, 136, 43,
+ 90, 224, 166, 195, 181, 121, 210, 59, 12, 152, 106, 167, 196, 75, 137,
+ 225, 211, 240, 182, 122, 91, 28, 197, 13, 226, 168, 183, 153, 44, 212,
+ 138, 107, 241, 60, 29, 123, 198, 184, 227, 169, 242, 76, 213, 154, 45,
+ 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108, 77, 155, 30, 15,
+ 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140, 230, 62, 216,
+ 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141, 63, 232,
+ 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142, 219,
+ 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251,
+ 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
+ 255,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = {
+ 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18,
+ 81, 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51,
+ 129, 4, 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161,
+ 68, 115, 21, 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6,
+ 116, 193, 147, 85, 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70,
+ 7, 148, 194, 86, 179, 225, 23, 133, 39, 164, 8, 102, 210, 241, 55,
+ 195, 118, 149, 71, 180, 24, 87, 226, 134, 165, 211, 40, 103, 56, 72,
+ 150, 196, 242, 119, 9, 181, 227, 88, 166, 25, 135, 41, 104, 212, 57,
+ 151, 197, 120, 73, 243, 182, 136, 167, 213, 89, 10, 228, 105, 152, 198,
+ 26, 42, 121, 183, 244, 168, 58, 137, 229, 74, 214, 90, 153, 199, 184,
+ 11, 106, 245, 27, 122, 230, 169, 43, 215, 59, 200, 138, 185, 246, 75,
+ 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170, 60, 247, 232, 76,
+ 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202, 233, 171, 61,
+ 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125, 62, 172,
+ 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79, 126,
+ 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236,
+ 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, row_scan_16x16[256]) = {
+ 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48,
+ 20, 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9,
+ 66, 52, 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54,
+ 83, 97, 69, 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26,
+ 41, 56, 114, 100, 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72,
+ 116, 14, 87, 130, 102, 144, 73, 131, 117, 28, 58, 15, 88, 43, 145,
+ 103, 132, 146, 118, 74, 160, 89, 133, 104, 29, 59, 147, 119, 44, 161,
+ 148, 90, 105, 134, 162, 120, 176, 75, 135, 149, 30, 60, 163, 177, 45,
+ 121, 91, 106, 164, 178, 150, 192, 136, 165, 179, 31, 151, 193, 76, 122,
+ 61, 137, 194, 107, 152, 180, 208, 46, 166, 167, 195, 92, 181, 138, 209,
+ 123, 153, 224, 196, 77, 168, 210, 182, 240, 108, 197, 62, 154, 225, 183,
+ 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170, 124, 155, 199, 78,
+ 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186, 156, 229, 243,
+ 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110, 157, 245,
+ 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158, 188,
+ 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175,
+ 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
+ 255,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+ 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864,
+ 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289,
+ 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737,
+ 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162,
+ 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610,
+ 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35,
+ 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931,
+ 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356,
+ 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804,
+ 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229,
+ 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677,
+ 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102,
+ 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550,
+ 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+ 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871,
+ 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296,
+ 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744,
+ 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169,
+ 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617,
+ 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42,
+ 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938,
+ 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363,
+ 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+ 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236,
+ 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684,
+ 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109,
+ 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557,
+ 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430,
+ 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878,
+ 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303,
+ 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751,
+ 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176,
+ 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624,
+ 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49,
+ 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945,
+ 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370,
+ 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818,
+ 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243,
+ 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691,
+ 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116,
+ 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564,
+ 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437,
+ 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885,
+ 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310,
+ 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758,
+ 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183,
+ 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631,
+ 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56,
+ 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952,
+ 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377,
+ 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825,
+ 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250,
+ 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698,
+ 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123,
+ 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571,
+ 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444,
+ 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892,
+ 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317,
+ 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765,
+ 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190,
+ 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638,
+ 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63,
+ 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+ 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959,
+ 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+ 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+ 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+ 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+ 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
+ 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
+ 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
+ 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
+ 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+ 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311,
+ 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337,
+ 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350,
+ 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376,
+ 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402,
+ 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+ 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+ 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+ 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454,
+ 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467,
+ 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480,
+ 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493,
+ 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506,
+ 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519,
+ 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532,
+ 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
+ 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558,
+ 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571,
+ 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584,
+ 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+ 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610,
+ 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
+ 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636,
+ 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649,
+ 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662,
+ 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675,
+ 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
+ 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
+ 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+ 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727,
+ 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740,
+ 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753,
+ 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766,
+ 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779,
+ 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792,
+ 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805,
+ 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818,
+ 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+ 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+ 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+ 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870,
+ 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883,
+ 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896,
+ 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909,
+ 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922,
+ 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935,
+ 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948,
+ 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961,
+ 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974,
+ 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987,
+ 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000,
+ 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+ 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66,
+ 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5,
+ 162, 193, 68, 131, 37, 100, 225, 194, 256, 163, 69, 132, 6,
+ 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320,
+ 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197, 71,
+ 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292,
+ 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104,
+ 293, 41, 417, 199, 136, 262, 387, 448, 325, 356, 10, 73, 418,
+ 231, 168, 449, 294, 388, 105, 419, 263, 42, 200, 357, 450, 137,
+ 480, 74, 326, 232, 11, 389, 169, 295, 420, 106, 451, 481, 358,
+ 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, 75,
+ 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391,
+ 453, 139, 44, 234, 484, 297, 360, 171, 76, 515, 545, 266, 329,
+ 454, 13, 423, 203, 108, 546, 485, 576, 298, 235, 140, 361, 330,
+ 172, 547, 45, 455, 267, 577, 486, 77, 204, 362, 608, 14, 299,
+ 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, 610, 363,
+ 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, 111,
+ 238, 48, 143, 80, 175, 112, 207, 49, 18, 239, 81, 113, 19,
+ 50, 82, 114, 51, 83, 115, 640, 516, 392, 268, 144, 20, 672,
+ 641, 548, 517, 424, 393, 300, 269, 176, 145, 52, 21, 704, 673,
+ 642, 580, 549, 518, 456, 425, 394, 332, 301, 270, 208, 177, 146,
+ 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457,
+ 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54,
+ 23, 737, 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303,
+ 241, 210, 179, 117, 86, 55, 738, 707, 614, 583, 490, 459, 366,
+ 335, 242, 211, 118, 87, 739, 615, 491, 367, 243, 119, 768, 644,
+ 520, 396, 272, 148, 24, 800, 769, 676, 645, 552, 521, 428, 397,
+ 304, 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584,
+ 553, 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57,
+ 26, 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, 554, 523,
+ 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120,
+ 89, 58, 27, 865, 834, 803, 741, 710, 679, 617, 586, 555, 493,
+ 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835,
+ 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867,
+ 743, 619, 495, 371, 247, 123, 896, 772, 648, 524, 400, 276, 152,
+ 28, 928, 897, 804, 773, 680, 649, 556, 525, 432, 401, 308, 277,
+ 184, 153, 60, 29, 960, 929, 898, 836, 805, 774, 712, 681, 650,
+ 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, 92,
+ 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682,
+ 651, 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, 310, 279,
+ 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, 869, 838,
+ 807, 745, 714, 683, 621, 590, 559, 497, 466, 435, 373, 342, 311,
+ 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, 622,
+ 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623,
+ 499, 375, 251, 127, 900, 776, 652, 528, 404, 280, 156, 932, 901,
+ 808, 777, 684, 653, 560, 529, 436, 405, 312, 281, 188, 157, 964,
+ 933, 902, 840, 809, 778, 716, 685, 654, 592, 561, 530, 468, 437,
+ 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, 872, 841,
+ 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438,
+ 407, 376, 345, 314, 283, 252, 221, 190, 159, 997, 966, 935, 873,
+ 842, 811, 749, 718, 687, 625, 594, 563, 501, 470, 439, 377, 346,
+ 315, 253, 222, 191, 998, 967, 874, 843, 750, 719, 626, 595, 502,
+ 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, 379, 255, 904,
+ 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, 564, 533,
+ 440, 409, 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658,
+ 596, 565, 534, 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907,
+ 876, 845, 814, 783, 752, 721, 690, 659, 628, 597, 566, 535, 504,
+ 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 877, 846, 815,
+ 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, 350, 319, 1002,
+ 971, 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879,
+ 755, 631, 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785,
+ 692, 661, 568, 537, 444, 413, 972, 941, 910, 848, 817, 786, 724,
+ 693, 662, 600, 569, 538, 476, 445, 414, 1004, 973, 942, 911, 880,
+ 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477,
+ 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602,
+ 571, 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510,
+ 479, 1007, 883, 759, 635, 511, 912, 788, 664, 540, 944, 913, 820,
+ 789, 696, 665, 572, 541, 976, 945, 914, 852, 821, 790, 728, 697,
+ 666, 604, 573, 542, 1008, 977, 946, 915, 884, 853, 822, 791, 760,
+ 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823,
+ 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638,
+ 607, 1011, 887, 763, 639, 916, 792, 668, 948, 917, 824, 793, 700,
+ 669, 980, 949, 918, 856, 825, 794, 732, 701, 670, 1012, 981, 950,
+ 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982, 951, 889,
+ 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, 891,
+ 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798,
+ 1016, 985, 954, 923, 892, 861, 830, 799, 1017, 986, 955, 893, 862,
+ 831, 1018, 987, 894, 863, 1019, 895, 924, 956, 925, 988, 957, 926,
+ 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023,
+};
+
+// Scan over the two rectangular vertical partitions (the left and right
+// 16x32 halves) one after the other; see the sketch after this table.
+DECLARE_ALIGNED(16, static const int16_t, v2_scan_32x32[1024]) = {
+ 0, 1, 32, 33, 2, 64, 34, 65, 66, 3, 96, 35, 97,
+ 67, 98, 4, 128, 36, 129, 99, 68, 130, 5, 100, 131, 160,
+ 37, 161, 69, 162, 132, 101, 163, 6, 192, 38, 193, 70, 194,
+ 133, 164, 102, 195, 7, 224, 39, 165, 225, 134, 196, 71, 226,
+ 103, 227, 166, 197, 8, 256, 40, 135, 228, 257, 72, 258, 198,
+ 104, 259, 167, 229, 136, 260, 9, 288, 41, 289, 73, 199, 230,
+ 290, 168, 261, 105, 291, 137, 292, 231, 10, 200, 262, 320, 42,
+ 321, 74, 322, 169, 293, 106, 323, 232, 263, 138, 324, 201, 294,
+ 11, 352, 43, 353, 75, 170, 325, 354, 264, 107, 233, 295, 355,
+ 202, 326, 139, 356, 12, 384, 44, 265, 296, 385, 171, 357, 76,
+ 386, 234, 327, 108, 387, 203, 358, 140, 388, 297, 266, 328, 13,
+ 172, 389, 416, 45, 235, 359, 417, 77, 418, 109, 419, 204, 390,
+ 298, 329, 141, 267, 360, 420, 236, 391, 173, 421, 14, 448, 46,
+ 449, 78, 330, 450, 299, 361, 110, 205, 422, 451, 268, 392, 142,
+ 452, 237, 423, 174, 331, 362, 453, 15, 300, 393, 480, 47, 481,
+ 79, 482, 206, 454, 269, 424, 111, 483, 143, 484, 363, 332, 394,
+ 238, 455, 175, 301, 425, 485, 512, 513, 270, 456, 514, 207, 486,
+ 364, 395, 515, 333, 426, 516, 239, 487, 302, 457, 517, 396, 271,
+ 488, 544, 365, 427, 545, 518, 546, 334, 458, 547, 519, 548, 303,
+ 489, 397, 428, 549, 366, 459, 520, 576, 335, 490, 550, 577, 578,
+ 579, 521, 429, 551, 398, 460, 580, 367, 491, 581, 552, 522, 582,
+ 608, 609, 430, 461, 610, 399, 492, 553, 611, 583, 523, 612, 613,
+ 584, 554, 462, 431, 493, 614, 524, 640, 641, 642, 585, 643, 555,
+ 615, 644, 463, 494, 586, 525, 616, 645, 556, 646, 672, 617, 673,
+ 587, 674, 647, 495, 675, 526, 676, 557, 618, 648, 677, 588, 678,
+ 527, 649, 619, 704, 558, 705, 706, 679, 589, 707, 650, 708, 620,
+ 680, 709, 559, 590, 710, 651, 681, 736, 621, 737, 711, 738, 739,
+ 682, 652, 740, 712, 591, 741, 622, 683, 713, 742, 653, 768, 769,
+ 743, 770, 714, 684, 771, 623, 772, 744, 654, 773, 715, 685, 745,
+ 774, 655, 775, 800, 801, 716, 746, 802, 803, 686, 776, 804, 747,
+ 805, 717, 777, 806, 687, 748, 807, 778, 832, 833, 718, 834, 835,
+ 808, 836, 779, 749, 837, 809, 719, 838, 780, 750, 810, 839, 864,
+ 865, 866, 867, 840, 781, 868, 811, 751, 869, 841, 870, 812, 782,
+ 842, 871, 896, 897, 898, 872, 899, 813, 843, 900, 783, 901, 873,
+ 844, 902, 814, 874, 903, 928, 929, 845, 930, 904, 815, 875, 931,
+ 932, 905, 933, 846, 876, 934, 906, 935, 877, 960, 847, 961, 962,
+ 907, 936, 963, 964, 937, 878, 965, 908, 966, 938, 967, 909, 879,
+ 992, 939, 993, 968, 994, 995, 996, 910, 969, 940, 997, 998, 970,
+ 911, 941, 999, 971, 1000, 942, 1001, 972, 1002, 943, 973, 1003, 974,
+ 1004, 975, 1005, 1006, 1007, 16, 48, 80, 112, 144, 176, 17, 49,
+ 208, 81, 113, 145, 240, 177, 272, 18, 50, 209, 82, 114, 304,
+ 241, 146, 178, 273, 336, 210, 19, 51, 83, 115, 305, 242, 147,
+ 368, 179, 274, 337, 211, 20, 400, 52, 84, 306, 116, 243, 369,
+ 148, 338, 180, 275, 432, 401, 212, 21, 53, 307, 85, 370, 244,
+ 117, 464, 149, 433, 339, 276, 181, 402, 213, 308, 496, 371, 22,
+ 54, 465, 86, 245, 118, 434, 150, 340, 277, 403, 182, 528, 497,
+ 214, 466, 372, 309, 23, 55, 435, 87, 246, 119, 341, 404, 151,
+ 529, 560, 278, 498, 183, 467, 373, 215, 310, 436, 24, 56, 247,
+ 561, 88, 530, 592, 342, 120, 405, 499, 152, 279, 468, 184, 374,
+ 311, 437, 216, 562, 593, 531, 624, 25, 248, 500, 57, 406, 89,
+ 343, 121, 469, 280, 153, 594, 185, 375, 563, 625, 438, 532, 656,
+ 312, 217, 501, 407, 249, 26, 344, 58, 90, 470, 122, 595, 626,
+ 281, 564, 657, 154, 376, 533, 688, 439, 186, 313, 502, 218, 408,
+ 627, 596, 658, 250, 345, 471, 27, 59, 565, 689, 91, 123, 282,
+ 534, 720, 155, 440, 377, 187, 503, 314, 628, 659, 219, 597, 690,
+ 409, 472, 566, 721, 346, 251, 28, 60, 535, 752, 92, 124, 283,
+ 441, 378, 156, 660, 504, 629, 691, 598, 722, 188, 315, 567, 753,
+ 220, 410, 473, 347, 536, 784, 252, 29, 661, 692, 61, 93, 442,
+ 630, 723, 284, 125, 379, 505, 599, 754, 157, 316, 568, 785, 189,
+ 474, 411, 221, 537, 816, 693, 348, 662, 724, 253, 631, 755, 443,
+ 30, 600, 786, 62, 506, 94, 285, 380, 126, 569, 817, 158, 317,
+ 190, 475, 694, 725, 412, 663, 756, 538, 848, 222, 632, 787, 349,
+ 254, 601, 818, 444, 507, 31, 63, 381, 286, 95, 570, 849, 726,
+ 127, 695, 757, 664, 788, 159, 476, 318, 413, 539, 880, 191, 633,
+ 819, 223, 350, 602, 850, 508, 255, 445, 727, 758, 696, 789, 571,
+ 881, 382, 287, 665, 820, 477, 634, 851, 540, 912, 319, 414, 603,
+ 882, 759, 728, 790, 351, 509, 697, 821, 446, 572, 913, 666, 852,
+ 383, 635, 883, 478, 541, 944, 415, 760, 791, 604, 914, 729, 822,
+ 698, 853, 510, 667, 884, 447, 573, 945, 636, 915, 792, 761, 823,
+ 542, 976, 479, 730, 854, 605, 946, 699, 885, 668, 916, 511, 574,
+ 977, 793, 824, 637, 947, 762, 855, 731, 886, 543, 1008, 606, 978,
+ 700, 917, 669, 948, 575, 825, 1009, 794, 856, 763, 887, 638, 979,
+ 732, 918, 701, 949, 607, 1010, 670, 980, 826, 857, 795, 888, 764,
+ 919, 639, 1011, 733, 950, 702, 981, 858, 827, 889, 796, 920, 671,
+ 1012, 765, 951, 734, 982, 703, 1013, 859, 890, 828, 921, 797, 952,
+ 766, 983, 735, 1014, 891, 860, 922, 829, 953, 798, 984, 767, 1015,
+ 892, 923, 861, 954, 830, 985, 799, 1016, 924, 893, 955, 862, 986,
+ 831, 1017, 925, 956, 894, 987, 863, 1018, 957, 926, 988, 895, 1019,
+ 958, 989, 927, 1020, 990, 959, 1021, 991, 1022, 1023,
+};
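+
+// An illustrative sketch, not from the aom sources: a scan like
+// v2_scan_32x32 can be composed by visiting every coefficient of the left
+// 16x32 half before any coefficient of the right half.  The plain
+// anti-diagonal order used inside each half here is a simplification (the
+// table above uses a tuned order), so this shows the partition structure
+// rather than reproducing the table entry for entry.
+static void build_v2_style_scan(int16_t scan[1024]) {
+  int k = 0;
+  int half, d, r;
+  for (half = 0; half < 2; ++half) {      // 0: cols 0-15, 1: cols 16-31
+    for (d = 0; d <= 32 + 16 - 2; ++d) {  // anti-diagonals with r + c == d
+      for (r = 0; r < 32; ++r) {
+        const int c = d - r;
+        if (c >= 0 && c < 16) scan[k++] = (int16_t)(r * 32 + half * 16 + c);
+      }
+    }
+  }
+  // k == 1024 here: each of the 1024 coefficients is visited exactly once.
+}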
+
+// Scan over the two rectangular horizontal partitions (the top and bottom
+// 32x16 halves) one after the other; see the note after this table.
+DECLARE_ALIGNED(16, static const int16_t, h2_scan_32x32[1024]) = {
+ 0, 1, 32, 33, 2, 64, 34, 65, 66, 3, 96, 35, 97,
+ 67, 98, 4, 128, 36, 129, 99, 68, 130, 5, 100, 131, 160,
+ 37, 161, 69, 162, 132, 101, 163, 6, 192, 38, 193, 70, 194,
+ 133, 164, 102, 195, 7, 224, 39, 165, 225, 134, 196, 71, 226,
+ 103, 227, 166, 197, 8, 256, 40, 135, 228, 257, 72, 258, 198,
+ 104, 259, 167, 229, 136, 260, 9, 288, 41, 289, 73, 199, 230,
+ 290, 168, 261, 105, 291, 137, 292, 231, 10, 200, 262, 320, 42,
+ 321, 74, 322, 169, 293, 106, 323, 232, 263, 138, 324, 201, 294,
+ 11, 352, 43, 353, 75, 170, 325, 354, 264, 107, 233, 295, 355,
+ 202, 326, 139, 356, 12, 384, 44, 265, 296, 385, 171, 357, 76,
+ 386, 234, 327, 108, 387, 203, 358, 140, 388, 297, 266, 328, 13,
+ 172, 389, 416, 45, 235, 359, 417, 77, 418, 109, 419, 204, 390,
+ 298, 329, 141, 267, 360, 420, 236, 391, 173, 421, 14, 448, 46,
+ 449, 78, 330, 450, 299, 361, 110, 205, 422, 451, 268, 392, 142,
+ 452, 237, 423, 174, 331, 362, 453, 15, 300, 393, 480, 47, 481,
+ 79, 482, 206, 454, 269, 424, 111, 483, 143, 484, 363, 332, 394,
+ 238, 455, 175, 301, 425, 485, 16, 48, 80, 270, 456, 207, 486,
+ 112, 364, 395, 333, 426, 144, 239, 487, 302, 457, 176, 396, 17,
+ 271, 488, 49, 365, 427, 208, 81, 334, 458, 113, 145, 240, 303,
+ 489, 397, 428, 177, 366, 459, 272, 18, 50, 209, 335, 490, 82,
+ 114, 304, 241, 429, 146, 398, 460, 367, 491, 178, 273, 336, 210,
+ 19, 51, 83, 430, 461, 399, 492, 115, 305, 242, 147, 368, 179,
+ 274, 337, 462, 431, 493, 211, 20, 400, 52, 84, 306, 116, 243,
+ 369, 148, 463, 494, 338, 180, 275, 432, 401, 212, 21, 53, 307,
+ 85, 370, 244, 117, 495, 464, 149, 433, 339, 276, 181, 402, 213,
+ 308, 496, 371, 22, 54, 465, 86, 245, 118, 434, 150, 340, 277,
+ 403, 182, 497, 214, 466, 372, 309, 23, 55, 435, 87, 246, 119,
+ 341, 404, 151, 278, 498, 183, 467, 373, 215, 310, 436, 24, 56,
+ 247, 88, 342, 120, 405, 499, 152, 279, 468, 184, 374, 311, 437,
+ 216, 25, 248, 500, 57, 406, 89, 343, 121, 469, 280, 153, 185,
+ 375, 438, 312, 217, 501, 407, 249, 26, 344, 58, 90, 470, 122,
+ 281, 154, 376, 439, 186, 313, 502, 218, 408, 250, 345, 471, 27,
+ 59, 91, 123, 282, 155, 440, 377, 187, 503, 314, 219, 409, 472,
+ 346, 251, 28, 60, 92, 124, 283, 441, 378, 156, 504, 188, 315,
+ 220, 410, 473, 347, 252, 29, 61, 93, 442, 284, 125, 379, 505,
+ 157, 316, 189, 474, 411, 221, 348, 253, 443, 30, 62, 506, 94,
+ 285, 380, 126, 158, 317, 190, 475, 412, 222, 349, 254, 444, 507,
+ 31, 63, 381, 286, 95, 127, 159, 476, 318, 413, 191, 223, 350,
+ 508, 255, 445, 382, 287, 477, 319, 414, 351, 509, 446, 383, 478,
+ 415, 510, 447, 479, 511, 512, 513, 514, 515, 516, 517, 544, 545,
+ 518, 546, 547, 519, 548, 549, 520, 576, 550, 577, 578, 579, 521,
+ 551, 580, 581, 552, 522, 582, 608, 609, 610, 553, 611, 583, 523,
+ 612, 613, 584, 554, 614, 524, 640, 641, 642, 585, 643, 555, 615,
+ 644, 586, 525, 616, 645, 556, 646, 672, 617, 673, 587, 674, 647,
+ 675, 526, 676, 557, 618, 648, 677, 588, 678, 527, 649, 619, 704,
+ 558, 705, 706, 679, 589, 707, 650, 708, 620, 680, 709, 528, 559,
+ 590, 710, 651, 681, 736, 621, 737, 711, 738, 739, 682, 652, 529,
+ 560, 740, 712, 591, 741, 622, 683, 713, 742, 653, 768, 769, 561,
+ 743, 530, 592, 770, 714, 684, 771, 623, 772, 744, 654, 773, 715,
+ 685, 745, 774, 562, 593, 531, 624, 655, 775, 800, 801, 716, 746,
+ 802, 803, 686, 776, 804, 594, 563, 625, 747, 805, 717, 532, 656,
+ 777, 806, 687, 748, 807, 778, 832, 833, 718, 834, 595, 626, 835,
+ 564, 657, 808, 836, 533, 688, 779, 749, 837, 809, 719, 838, 780,
+ 627, 596, 658, 750, 810, 839, 864, 565, 689, 865, 866, 867, 534,
+ 720, 840, 781, 868, 811, 751, 869, 841, 628, 659, 597, 690, 870,
+ 812, 782, 566, 721, 842, 871, 896, 535, 752, 897, 898, 872, 899,
+ 813, 843, 660, 900, 783, 629, 691, 598, 722, 901, 873, 567, 753,
+ 844, 902, 814, 874, 536, 784, 903, 661, 692, 928, 929, 630, 723,
+ 845, 930, 904, 815, 875, 931, 599, 754, 932, 568, 785, 905, 933,
+ 846, 876, 934, 537, 816, 693, 662, 724, 906, 631, 755, 935, 877,
+ 600, 786, 960, 847, 961, 962, 907, 936, 963, 569, 817, 964, 937,
+ 694, 725, 878, 965, 908, 663, 756, 538, 848, 966, 632, 787, 938,
+ 601, 818, 967, 909, 879, 992, 939, 993, 968, 570, 849, 994, 726,
+ 695, 757, 995, 664, 788, 996, 910, 969, 539, 880, 940, 633, 819,
+ 997, 998, 602, 850, 970, 911, 941, 999, 727, 758, 696, 789, 571,
+ 881, 971, 665, 820, 1000, 634, 851, 942, 540, 912, 1001, 972, 603,
+ 882, 759, 728, 790, 1002, 697, 821, 943, 973, 572, 913, 666, 852,
+ 1003, 635, 883, 974, 541, 944, 760, 791, 1004, 604, 914, 729, 822,
+ 698, 853, 975, 667, 884, 573, 945, 1005, 636, 915, 792, 761, 823,
+ 542, 976, 1006, 730, 854, 605, 946, 699, 885, 668, 916, 1007, 574,
+ 977, 793, 824, 637, 947, 762, 855, 731, 886, 543, 1008, 606, 978,
+ 700, 917, 669, 948, 575, 825, 1009, 794, 856, 763, 887, 638, 979,
+ 732, 918, 701, 949, 607, 1010, 670, 980, 826, 857, 795, 888, 764,
+ 919, 639, 1011, 733, 950, 702, 981, 858, 827, 889, 796, 920, 671,
+ 1012, 765, 951, 734, 982, 703, 1013, 859, 890, 828, 921, 797, 952,
+ 766, 983, 735, 1014, 891, 860, 922, 829, 953, 798, 984, 767, 1015,
+ 892, 923, 861, 954, 830, 985, 799, 1016, 924, 893, 955, 862, 986,
+ 831, 1017, 925, 956, 894, 987, 863, 1018, 957, 926, 988, 895, 1019,
+ 958, 989, 927, 1020, 990, 959, 1021, 991, 1022, 1023,
+};
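+
+// h2_scan_32x32 is the transposed analogue of the v2 sketch above: its two
+// partitions are the top (rows 0-15) and bottom (rows 16-31) 32x16 halves,
+// so the hypothetical build_v2_style_scan() would instead offset rows by
+// half * 16 (with r < 16) and let columns run over the full 0-31 range.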
+
+// Scan where the top-left 16x16 quarter is scanned first; see the
+// consistency check after this table.
+DECLARE_ALIGNED(16, static const int16_t, qtr_scan_32x32[1024]) = {
+ 0, 1, 32, 33, 2, 64, 34, 65, 66, 3, 96, 35, 97,
+ 67, 98, 4, 128, 36, 129, 99, 68, 130, 5, 100, 131, 160,
+ 37, 161, 69, 162, 132, 101, 163, 6, 192, 38, 193, 70, 194,
+ 133, 164, 102, 195, 7, 224, 39, 165, 225, 134, 196, 71, 226,
+ 103, 227, 166, 197, 8, 256, 40, 135, 228, 257, 72, 258, 198,
+ 104, 259, 167, 229, 136, 260, 9, 288, 41, 289, 73, 199, 230,
+ 290, 168, 261, 105, 291, 137, 292, 231, 10, 200, 262, 320, 42,
+ 321, 74, 322, 169, 293, 106, 323, 232, 263, 138, 324, 201, 294,
+ 11, 352, 43, 353, 75, 170, 325, 354, 264, 107, 233, 295, 355,
+ 202, 326, 139, 356, 12, 384, 44, 265, 296, 385, 171, 357, 76,
+ 386, 234, 327, 108, 387, 203, 358, 140, 388, 297, 266, 328, 13,
+ 172, 389, 416, 45, 235, 359, 417, 77, 418, 109, 419, 204, 390,
+ 298, 329, 141, 267, 360, 420, 236, 391, 173, 421, 14, 448, 46,
+ 449, 78, 330, 450, 299, 361, 110, 205, 422, 451, 268, 392, 142,
+ 452, 237, 423, 174, 331, 362, 453, 15, 300, 393, 480, 47, 481,
+ 79, 482, 206, 454, 269, 424, 111, 483, 143, 484, 363, 332, 394,
+ 238, 455, 175, 301, 425, 485, 270, 456, 207, 486, 364, 395, 333,
+ 426, 239, 487, 302, 457, 396, 271, 488, 365, 427, 334, 458, 303,
+ 489, 397, 428, 366, 459, 335, 490, 429, 398, 460, 367, 491, 430,
+ 461, 399, 492, 462, 431, 493, 463, 494, 495, 16, 512, 48, 513,
+ 80, 514, 112, 515, 144, 516, 176, 517, 17, 544, 49, 545, 208,
+ 518, 81, 546, 113, 547, 145, 240, 519, 548, 177, 549, 272, 520,
+ 18, 576, 50, 209, 550, 577, 82, 578, 114, 579, 304, 521, 241,
+ 551, 146, 580, 178, 581, 273, 552, 336, 522, 210, 582, 19, 608,
+ 51, 609, 83, 610, 115, 305, 553, 611, 242, 583, 147, 368, 523,
+ 612, 179, 613, 274, 584, 337, 554, 211, 614, 20, 400, 524, 640,
+ 52, 641, 84, 642, 306, 585, 116, 643, 243, 369, 555, 615, 148,
+ 644, 338, 586, 180, 275, 432, 525, 616, 645, 401, 556, 212, 646,
+ 21, 672, 53, 307, 617, 673, 85, 370, 587, 674, 244, 647, 117,
+ 675, 464, 526, 149, 676, 433, 557, 339, 618, 276, 648, 181, 677,
+ 402, 588, 213, 678, 308, 496, 527, 649, 371, 619, 22, 704, 54,
+ 465, 558, 705, 86, 706, 245, 679, 118, 434, 589, 707, 150, 340,
+ 650, 708, 277, 403, 620, 680, 182, 709, 528, 497, 559, 214, 466,
+ 590, 710, 372, 651, 309, 681, 23, 736, 55, 435, 621, 737, 87,
+ 246, 711, 738, 119, 739, 341, 682, 404, 652, 151, 529, 560, 740,
+ 278, 712, 498, 591, 183, 741, 467, 622, 373, 683, 215, 310, 713,
+ 742, 436, 653, 24, 768, 56, 769, 247, 561, 743, 88, 530, 592,
+ 770, 342, 714, 120, 405, 684, 771, 499, 623, 152, 772, 279, 744,
+ 468, 654, 184, 773, 374, 715, 311, 437, 685, 745, 216, 774, 562,
+ 593, 531, 624, 25, 248, 500, 655, 775, 800, 57, 801, 406, 716,
+ 89, 343, 746, 802, 121, 803, 469, 686, 280, 776, 153, 804, 594,
+ 185, 375, 563, 625, 747, 805, 438, 717, 532, 656, 312, 777, 217,
+ 806, 501, 687, 407, 748, 249, 807, 26, 344, 778, 832, 58, 833,
+ 90, 470, 718, 834, 122, 595, 626, 835, 281, 564, 657, 808, 154,
+ 836, 376, 533, 688, 779, 439, 749, 186, 837, 313, 809, 502, 719,
+ 218, 838, 408, 780, 627, 596, 658, 250, 345, 471, 750, 810, 839,
+ 27, 864, 59, 565, 689, 865, 91, 866, 123, 867, 282, 534, 720,
+ 840, 155, 440, 781, 868, 377, 811, 187, 503, 751, 869, 314, 841,
+ 628, 659, 219, 597, 690, 870, 409, 812, 472, 782, 566, 721, 346,
+ 842, 251, 871, 28, 896, 60, 535, 752, 897, 92, 898, 124, 283,
+ 872, 899, 441, 813, 378, 843, 156, 660, 900, 504, 783, 629, 691,
+ 598, 722, 188, 901, 315, 873, 567, 753, 220, 410, 844, 902, 473,
+ 814, 347, 874, 536, 784, 252, 903, 29, 661, 692, 928, 61, 929,
+ 93, 442, 630, 723, 845, 930, 284, 904, 125, 379, 505, 815, 875,
+ 931, 599, 754, 157, 932, 316, 568, 785, 905, 189, 933, 474, 846,
+ 411, 876, 221, 934, 537, 816, 693, 348, 662, 724, 906, 253, 631,
+ 755, 935, 443, 877, 30, 600, 786, 960, 62, 506, 847, 961, 94,
+ 962, 285, 380, 907, 936, 126, 963, 569, 817, 158, 964, 317, 937,
+ 190, 475, 694, 725, 878, 965, 412, 908, 663, 756, 538, 848, 222,
+ 966, 632, 787, 349, 938, 254, 601, 818, 967, 444, 909, 507, 879,
+ 31, 992, 63, 381, 939, 993, 286, 968, 95, 570, 849, 994, 726,
+ 127, 695, 757, 995, 664, 788, 159, 996, 476, 910, 318, 969, 413,
+ 539, 880, 940, 191, 633, 819, 997, 223, 998, 350, 602, 850, 970,
+ 508, 911, 255, 445, 941, 999, 727, 758, 696, 789, 571, 881, 382,
+ 971, 287, 665, 820, 1000, 477, 634, 851, 942, 540, 912, 319, 1001,
+ 414, 972, 603, 882, 759, 728, 790, 351, 1002, 509, 697, 821, 943,
+ 446, 973, 572, 913, 666, 852, 383, 1003, 635, 883, 478, 974, 541,
+ 944, 415, 760, 791, 1004, 604, 914, 729, 822, 698, 853, 510, 975,
+ 667, 884, 447, 573, 945, 1005, 636, 915, 792, 761, 823, 542, 976,
+ 479, 1006, 730, 854, 605, 946, 699, 885, 668, 916, 511, 1007, 574,
+ 977, 793, 824, 637, 947, 762, 855, 731, 886, 543, 1008, 606, 978,
+ 700, 917, 669, 948, 575, 825, 1009, 794, 856, 763, 887, 638, 979,
+ 732, 918, 701, 949, 607, 1010, 670, 980, 826, 857, 795, 888, 764,
+ 919, 639, 1011, 733, 950, 702, 981, 858, 827, 889, 796, 920, 671,
+ 1012, 765, 951, 734, 982, 703, 1013, 859, 890, 828, 921, 797, 952,
+ 766, 983, 735, 1014, 891, 860, 922, 829, 953, 798, 984, 767, 1015,
+ 892, 923, 861, 954, 830, 985, 799, 1016, 924, 893, 955, 862, 986,
+ 831, 1017, 925, 956, 894, 987, 863, 1018, 957, 926, 988, 895, 1019,
+ 958, 989, 927, 1020, 990, 959, 1021, 991, 1022, 1023,
+};
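+
+// An illustrative consistency check, not from the aom sources: every 32x32
+// scan table above, qtr_scan_32x32 included, must be a permutation of
+// 0..1023, i.e. visit each coefficient exactly once.  A small harness such
+// as this one (it needs <stdlib.h>) can verify a regenerated table, e.g.
+// scan_is_permutation(qtr_scan_32x32, 1024).
+static int scan_is_permutation(const int16_t *scan, int n) {
+  unsigned char *seen = (unsigned char *)calloc(n, 1);
+  int i, ok = 1;
+  if (!seen) return 0;
+  for (i = 0; i < n; ++i) {
+    if (scan[i] < 0 || scan[i] >= n || seen[scan[i]]) {
+      ok = 0;
+    } else {
+      seen[scan[i]] = 1;
+    }
+  }
+  free(seen);
+  return ok;
+}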
+
+#if CONFIG_TX64X64
+DECLARE_ALIGNED(16, static const int16_t, default_scan_64x64[4096]) = {
+ 0, 1, 64, 65, 2, 128, 66, 129, 130, 3, 192, 67, 193,
+ 131, 194, 4, 256, 68, 257, 195, 132, 258, 5, 196, 259, 320,
+ 69, 321, 133, 322, 260, 197, 323, 6, 384, 70, 385, 134, 386,
+ 261, 324, 198, 387, 7, 448, 71, 325, 449, 262, 388, 135, 450,
+ 199, 451, 326, 389, 8, 512, 72, 263, 452, 513, 136, 514, 390,
+ 200, 515, 327, 453, 264, 516, 9, 576, 73, 577, 137, 391, 454,
+ 578, 328, 517, 201, 579, 265, 580, 455, 10, 392, 518, 640, 74,
+ 641, 138, 642, 329, 581, 202, 643, 456, 519, 266, 644, 393, 582,
+ 11, 704, 75, 705, 139, 330, 645, 706, 520, 203, 457, 583, 707,
+ 394, 646, 267, 708, 12, 768, 76, 521, 584, 769, 331, 709, 140,
+ 770, 458, 647, 204, 771, 395, 710, 268, 772, 585, 522, 648, 13,
+ 332, 773, 832, 77, 459, 711, 833, 141, 834, 205, 835, 396, 774,
+ 586, 649, 269, 523, 712, 836, 460, 775, 333, 837, 14, 896, 78,
+ 897, 142, 650, 898, 587, 713, 206, 397, 838, 899, 524, 776, 270,
+ 900, 461, 839, 334, 651, 714, 901, 15, 588, 777, 960, 79, 961,
+ 143, 962, 398, 902, 525, 840, 207, 963, 271, 964, 715, 652, 778,
+ 462, 903, 335, 589, 841, 965, 16, 1024, 80, 1025, 144, 526, 904,
+ 1026, 399, 966, 208, 716, 779, 1027, 653, 842, 272, 1028, 463, 967,
+ 590, 905, 336, 1029, 780, 17, 527, 968, 1088, 81, 717, 843, 1089,
+ 400, 1030, 145, 1090, 654, 906, 209, 1091, 273, 464, 1031, 1092, 591,
+ 969, 781, 844, 337, 1093, 718, 907, 528, 1032, 18, 1152, 82, 401,
+ 655, 970, 1094, 1153, 146, 1154, 210, 1155, 592, 1033, 465, 845, 1095,
+ 274, 782, 908, 1156, 719, 971, 338, 1157, 529, 1096, 656, 1034, 402,
+ 1158, 19, 1216, 83, 1217, 147, 846, 909, 1218, 783, 972, 211, 593,
+ 1097, 1219, 466, 1159, 275, 720, 1035, 1220, 339, 1221, 530, 1160, 657,
+ 1098, 910, 847, 973, 403, 1222, 20, 784, 1036, 1280, 84, 1281, 148,
+ 1282, 594, 1161, 212, 1283, 467, 721, 1099, 1223, 276, 1284, 911, 974,
+ 658, 1162, 340, 531, 848, 1037, 1224, 1285, 785, 1100, 404, 1286, 21,
+ 1344, 85, 595, 1225, 1345, 149, 722, 1163, 1346, 468, 1287, 213, 975,
+ 1347, 912, 1038, 277, 1348, 849, 1101, 659, 1226, 532, 1288, 341, 1349,
+ 786, 1164, 405, 1350, 596, 976, 1039, 1289, 723, 1227, 22, 1408, 86,
+ 913, 1102, 1409, 150, 1410, 469, 1351, 214, 850, 1165, 1411, 278, 660,
+ 1290, 1412, 533, 787, 1228, 1352, 342, 1413, 1040, 977, 1103, 406, 914,
+ 1166, 1414, 724, 1291, 597, 1353, 23, 1472, 87, 851, 1229, 1473, 151,
+ 470, 1415, 1474, 215, 1475, 661, 1354, 788, 1292, 279, 1041, 1104, 1476,
+ 534, 1416, 978, 1167, 343, 1477, 915, 1230, 725, 1355, 407, 598, 1417,
+ 1478, 852, 1293, 24, 1536, 88, 1537, 471, 1105, 1479, 152, 1042, 1168,
+ 1538, 662, 1418, 216, 789, 1356, 1539, 979, 1231, 280, 1540, 535, 1480,
+ 916, 1294, 344, 1541, 726, 1419, 599, 853, 1357, 1481, 408, 1542, 1106,
+ 1169, 1043, 1232, 25, 472, 980, 1295, 1543, 1600, 89, 1601, 790, 1420,
+ 153, 663, 1482, 1602, 217, 1603, 917, 1358, 536, 1544, 281, 1604, 1170,
+ 345, 727, 1107, 1233, 1483, 1605, 854, 1421, 1044, 1296, 600, 1545, 409,
+ 1606, 981, 1359, 791, 1484, 473, 1607, 26, 664, 1546, 1664, 90, 1665,
+ 154, 918, 1422, 1666, 218, 1171, 1234, 1667, 537, 1108, 1297, 1608, 282,
+ 1668, 728, 1045, 1360, 1547, 855, 1485, 346, 1669, 601, 1609, 982, 1423,
+ 410, 1670, 792, 1548, 1235, 1172, 1298, 474, 665, 919, 1486, 1610, 1671,
+ 27, 1728, 91, 1109, 1361, 1729, 155, 1730, 219, 1731, 538, 1046, 1424,
+ 1672, 283, 856, 1549, 1732, 729, 1611, 347, 983, 1487, 1733, 602, 1673,
+ 1236, 1299, 411, 1173, 1362, 1734, 793, 1612, 920, 1550, 1110, 1425, 666,
+ 1674, 475, 1735, 28, 1792, 92, 1047, 1488, 1793, 156, 1794, 220, 539,
+ 1736, 1795, 857, 1613, 730, 1675, 284, 1300, 1796, 984, 1551, 1237, 1363,
+ 1174, 1426, 348, 1797, 603, 1737, 1111, 1489, 412, 794, 1676, 1798, 921,
+ 1614, 667, 1738, 1048, 1552, 476, 1799, 29, 1301, 1364, 1856, 93, 1857,
+ 157, 858, 1238, 1427, 1677, 1858, 540, 1800, 221, 731, 985, 1615, 1739,
+ 1859, 1175, 1490, 285, 1860, 604, 1112, 1553, 1801, 349, 1861, 922, 1678,
+ 795, 1740, 413, 1862, 1049, 1616, 1365, 668, 1302, 1428, 1802, 477, 1239,
+ 1491, 1863, 859, 1741, 30, 1176, 1554, 1920, 94, 986, 1679, 1921, 158,
+ 1922, 541, 732, 1803, 1864, 222, 1923, 1113, 1617, 286, 1924, 605, 1865,
+ 350, 923, 1366, 1429, 1742, 1925, 796, 1804, 1303, 1492, 1050, 1680, 414,
+ 1926, 1240, 1555, 669, 1866, 478, 1177, 1618, 1927, 860, 1805, 987, 1743,
+ 31, 1984, 95, 733, 1867, 1985, 542, 1928, 159, 1114, 1681, 1986, 1430,
+ 223, 1367, 1493, 1987, 1304, 1556, 287, 1988, 924, 1806, 606, 1929, 797,
+ 1051, 1744, 1868, 351, 1241, 1619, 1989, 415, 1990, 670, 1178, 1682, 1930,
+ 988, 1807, 479, 861, 1869, 1991, 1431, 1494, 1368, 1557, 1115, 1745, 734,
+ 1931, 32, 2048, 96, 543, 1305, 1620, 1992, 2049, 160, 2050, 224, 2051,
+ 925, 1242, 1683, 1870, 288, 1052, 1808, 2052, 607, 1993, 798, 1932, 352,
+ 2053, 1179, 1746, 1495, 416, 1432, 1558, 2054, 671, 1994, 989, 1369, 1621,
+ 1871, 862, 1933, 480, 1116, 1809, 2055, 1306, 1684, 735, 1995, 544, 2056,
+ 33, 2112, 97, 1243, 1747, 2113, 161, 2114, 926, 1934, 1053, 1872, 225,
+ 2115, 289, 608, 799, 1496, 1559, 1996, 2057, 2116, 1180, 1810, 1433, 1622,
+ 353, 2117, 1370, 1685, 672, 2058, 417, 990, 1935, 2118, 1307, 1748, 863,
+ 1117, 1873, 1997, 481, 2119, 736, 1244, 1811, 2059, 1560, 545, 2120, 1497,
+ 1623, 34, 1054, 1936, 2176, 98, 927, 1998, 2177, 162, 1434, 1686, 2178,
+ 226, 1181, 1874, 2179, 800, 2060, 609, 1371, 1749, 2121, 290, 2180, 354,
+ 2181, 1308, 1812, 991, 1999, 673, 1118, 1937, 2122, 418, 2182, 864, 2061,
+ 1561, 1624, 1245, 1875, 482, 1498, 1687, 2183, 737, 2123, 1435, 1750, 1055,
+ 2000, 546, 928, 2062, 2184, 1182, 1938, 35, 1372, 1813, 2240, 99, 2241,
+ 163, 2242, 801, 2124, 227, 2243, 610, 2185, 291, 1309, 1876, 2244, 992,
+ 2063, 355, 1119, 1625, 2001, 2245, 1562, 1688, 674, 2186, 865, 1499, 1751,
+ 2125, 419, 1246, 1939, 2246, 1436, 1814, 483, 2247, 738, 2187, 1056, 2064,
+ 1373, 1877, 929, 1183, 2002, 2126, 547, 2248, 36, 2304, 100, 2305, 164,
+ 802, 1310, 1940, 2188, 2306, 1626, 1689, 228, 1563, 1752, 2307, 611, 2249,
+ 292, 2308, 1120, 1500, 1815, 2065, 993, 2127, 356, 2309, 1247, 2003, 675,
+ 866, 1437, 1878, 2189, 2250, 420, 2310, 1374, 1941, 484, 1057, 2128, 2311,
+ 739, 2251, 1184, 2066, 930, 1690, 2190, 1627, 1753, 548, 1564, 1816, 2312,
+ 1311, 2004, 37, 803, 2252, 2368, 101, 1501, 1879, 2369, 165, 2370, 612,
+ 2313, 229, 1121, 2129, 2371, 994, 2191, 1438, 1942, 293, 1248, 2067, 2372,
+ 357, 867, 2253, 2373, 676, 2314, 1375, 2005, 421, 1691, 1754, 2374, 1628,
+ 1817, 1058, 2192, 1185, 2130, 740, 1565, 1880, 2315, 485, 2375, 931, 2254,
+ 1312, 2068, 1502, 1943, 549, 2376, 804, 2316, 38, 2432, 102, 1122, 1439,
+ 2006, 2193, 2433, 166, 2434, 613, 995, 1249, 2131, 2255, 2377, 230, 2435,
+ 1755, 294, 1692, 1818, 2436, 868, 1376, 2069, 2317, 1629, 1881, 358, 677,
+ 2378, 2437, 1566, 1944, 422, 1186, 2194, 2438, 1059, 2256, 1313, 2132, 741,
+ 1503, 2007, 2379, 932, 2318, 486, 2439, 550, 1440, 2070, 2440, 805, 1756,
+ 1819, 2380, 1123, 2257, 1250, 1693, 1882, 2195, 39, 996, 2319, 2496, 103,
+ 2497, 167, 614, 1630, 1945, 2441, 2498, 231, 1377, 2133, 2499, 295, 1567,
+ 2008, 2500, 869, 2381, 678, 2442, 359, 2501, 1187, 2258, 1060, 2320, 1504,
+ 2071, 1314, 2196, 423, 2502, 742, 933, 2382, 2443, 1820, 487, 1757, 1883,
+ 2503, 1441, 2134, 1694, 1946, 551, 1124, 2321, 2504, 1251, 1631, 2009, 2259,
+ 806, 2444, 997, 2383, 1378, 2197, 40, 1568, 2072, 2560, 104, 2561, 615,
+ 2505, 168, 2562, 232, 2563, 870, 2445, 296, 2564, 1505, 2135, 1188, 2322,
+ 679, 2506, 360, 1061, 1315, 1821, 1884, 2260, 2384, 2565, 1758, 1947, 424,
+ 2566, 1695, 2010, 934, 1442, 2198, 2446, 743, 2507, 488, 1632, 2073, 2567,
+ 1252, 2323, 1125, 2385, 552, 2568, 807, 1569, 2136, 2508, 1379, 2261, 998,
+ 2447, 41, 616, 2569, 2624, 105, 1885, 2625, 1822, 1948, 169, 1506, 2199,
+ 2626, 233, 871, 1759, 2011, 2509, 2627, 1189, 2386, 1316, 2324, 297, 2628,
+ 680, 1062, 1696, 2074, 2448, 2570, 361, 2629, 1443, 2262, 1633, 2137, 425,
+ 935, 2510, 2630, 744, 2571, 489, 1253, 2387, 2631, 1570, 2200, 1126, 2449,
+ 1380, 2325, 1886, 1949, 808, 2572, 553, 1823, 2012, 2632, 999, 2511, 1760,
+ 2075, 1507, 2263, 617, 2633, 42, 2688, 106, 1697, 2138, 2689, 170, 1190,
+ 2450, 2690, 872, 1317, 2388, 2573, 234, 2691, 1063, 2512, 298, 1444, 2326,
+ 2692, 681, 1634, 2201, 2634, 362, 2693, 936, 2574, 426, 1950, 2694, 1571,
+ 2264, 745, 1887, 2013, 2635, 1254, 2451, 1824, 2076, 1127, 1381, 2389, 2513,
+ 490, 2695, 1761, 2139, 809, 1000, 1508, 2327, 2575, 2636, 554, 2696, 1698,
+ 2202, 1318, 2452, 618, 1191, 2514, 2697, 43, 2752, 107, 873, 1635, 2265,
+ 2637, 2753, 171, 1445, 2390, 2754, 1064, 2576, 235, 2755, 1951, 2014, 682,
+ 2698, 299, 1888, 2077, 2756, 1572, 2328, 1825, 2140, 363, 2757, 937, 2638,
+ 1255, 2515, 427, 746, 1382, 1762, 2203, 2453, 2699, 2758, 1128, 2577, 491,
+ 1509, 2391, 2759, 1699, 2266, 1001, 2639, 810, 2700, 555, 2760, 1319, 1636,
+ 2329, 2516, 2015, 1192, 1952, 2078, 2578, 1446, 2454, 619, 1889, 2141, 2761,
+ 874, 2701, 44, 2816, 108, 1065, 2640, 2817, 172, 1826, 2204, 2818, 236,
+ 1573, 2392, 2819, 683, 2762, 300, 2820, 1763, 2267, 938, 2702, 364, 1256,
+ 2579, 2821, 1383, 2517, 747, 1129, 2641, 2763, 428, 1700, 2330, 2822, 1510,
+ 2455, 492, 2016, 2079, 2823, 1002, 1953, 2142, 2703, 811, 2764, 1637, 2393,
+ 1890, 2205, 556, 1320, 2580, 2824, 1193, 1447, 2518, 2642, 1827, 2268, 620,
+ 2825, 875, 2765, 1066, 1574, 2456, 2704, 45, 1764, 2331, 2880, 109, 2881,
+ 173, 2882, 237, 2883, 684, 2826, 301, 1384, 2581, 2884, 1257, 2643, 939,
+ 1701, 2394, 2766, 2080, 365, 1511, 2017, 2143, 2519, 2885, 1130, 2705, 1954,
+ 2206, 748, 2827, 429, 2886, 1891, 2269, 1638, 2457, 493, 1003, 2767, 2887,
+ 812, 1828, 2332, 2828, 1321, 2644, 1448, 2582, 1194, 2706, 557, 2888, 1575,
+ 2520, 1765, 2395, 876, 1067, 2768, 2829, 621, 2889, 2081, 2144, 46, 2944,
+ 110, 2018, 2207, 2945, 174, 1702, 2458, 2946, 1385, 2645, 238, 685, 1258,
+ 1955, 2270, 2707, 2890, 2947, 1512, 2583, 302, 940, 2830, 2948, 1892, 2333,
+ 1131, 2769, 366, 2949, 749, 1639, 2521, 2891, 430, 2950, 1829, 2396, 1004,
+ 2831, 1322, 2708, 494, 1449, 2646, 2951, 813, 2892, 1195, 1766, 2459, 2770,
+ 1576, 2584, 2145, 558, 2082, 2208, 2952, 2019, 2271, 1068, 2832, 877, 2893,
+ 1956, 2334, 622, 1703, 2522, 2953, 1386, 2709, 47, 3008, 111, 1259, 1513,
+ 1893, 2397, 2647, 2771, 3009, 175, 3010, 686, 2954, 239, 3011, 941, 2894,
+ 303, 1132, 1640, 2585, 2833, 3012, 1830, 2460, 367, 3013, 750, 2955, 431,
+ 2146, 2209, 3014, 1450, 2710, 1323, 2083, 2272, 2772, 1005, 1767, 2523, 2895,
+ 1577, 2020, 2335, 2648, 495, 3015, 814, 1196, 2834, 2956, 1957, 2398, 559,
+ 3016, 1704, 2586, 1069, 2896, 878, 1894, 2461, 2957, 623, 1387, 2773, 3017,
+ 1514, 2711, 1260, 2835, 48, 3072, 112, 1831, 2524, 3073, 1641, 2649, 176,
+ 3074, 687, 3018, 942, 2210, 2958, 240, 3075, 1133, 2147, 2273, 2897, 304,
+ 2084, 2336, 3076, 368, 1768, 2587, 3077, 751, 2021, 2399, 3019, 1451, 2774,
+ 1324, 2836, 432, 1578, 2712, 3078, 1006, 2959, 1958, 2462, 1197, 2898, 496,
+ 815, 3020, 3079, 1705, 2650, 1895, 2525, 560, 3080, 1070, 2960, 1388, 2837,
+ 879, 1515, 2775, 3021, 2211, 2274, 1832, 2588, 624, 2148, 2337, 3081, 1261,
+ 2899, 1642, 2713, 2085, 2400, 49, 3136, 113, 3137, 688, 3082, 177, 943,
+ 1134, 2022, 2463, 2961, 3022, 3138, 241, 1769, 2651, 3139, 305, 3140, 1452,
+ 2838, 1959, 2526, 752, 1325, 1579, 2776, 2900, 3083, 369, 3141, 1007, 3023,
+ 433, 3142, 1198, 1706, 2714, 2962, 1896, 2589, 816, 3084, 497, 2275, 3143,
+ 2212, 2338, 2149, 2401, 561, 1071, 1516, 1833, 2652, 2839, 3024, 3144, 1389,
+ 2901, 2086, 2464, 880, 3085, 1643, 2777, 1262, 2963, 625, 2023, 2527, 3145,
+ 1770, 2715, 1135, 3025, 50, 944, 1960, 2590, 3086, 3200, 114, 689, 3146,
+ 3201, 178, 3202, 242, 1453, 2902, 3203, 1580, 2840, 306, 1326, 2964, 3204,
+ 2276, 2339, 753, 1897, 2653, 3147, 370, 1707, 2213, 2402, 2778, 3205, 1008,
+ 3087, 1199, 2150, 2465, 3026, 434, 3206, 817, 2087, 2528, 3148, 1834, 2716,
+ 498, 3207, 1517, 2903, 1390, 2965, 1072, 3088, 1644, 2024, 2591, 2841, 562,
+ 3208, 881, 1263, 3027, 3149, 1771, 2779, 626, 1961, 2654, 3209, 2340, 1136,
+ 3089, 2277, 2403, 945, 3150, 690, 1454, 2214, 2466, 2966, 3210, 51, 1581,
+ 2904, 3264, 115, 3265, 179, 1898, 2717, 3266, 1327, 3028, 243, 2151, 2529,
+ 3267, 1708, 2842, 307, 3268, 754, 3211, 2088, 2592, 371, 1009, 3151, 3269,
+ 1200, 3090, 1835, 2780, 435, 3270, 2025, 2655, 818, 3212, 1518, 2967, 499,
+ 1391, 1645, 2905, 3029, 3271, 1073, 3152, 1962, 2718, 563, 1264, 1772, 2341,
+ 2404, 2843, 3091, 3272, 882, 2278, 2467, 3213, 2215, 2530, 627, 3273, 2152,
+ 2593, 1137, 1899, 2781, 3153, 1582, 2968, 1455, 3030, 946, 3214, 691, 1709,
+ 2906, 3274, 52, 1328, 3092, 3328, 116, 2089, 2656, 3329, 180, 3330, 244,
+ 3331, 308, 1836, 2844, 3332, 755, 3275, 1010, 1201, 2026, 2719, 3154, 3215,
+ 372, 3333, 1519, 2405, 3031, 436, 2342, 2468, 3334, 1646, 2969, 819, 1392,
+ 3093, 3276, 2279, 2531, 1963, 2782, 500, 3335, 1773, 2907, 1074, 2216, 2594,
+ 3216, 1265, 3155, 564, 3336, 883, 2153, 2657, 3277, 1900, 2845, 628, 1583,
+ 3032, 3337, 1456, 2090, 2720, 3094, 1138, 3217, 1710, 2970, 947, 3278, 1329,
+ 3156, 692, 3338, 53, 1837, 2908, 3392, 117, 2027, 2783, 3393, 181, 2406,
+ 2469, 3394, 2343, 2532, 245, 3395, 1202, 3218, 309, 756, 2280, 2595, 3339,
+ 3396, 1011, 3279, 1520, 3095, 373, 1647, 3033, 3397, 1964, 2846, 2217, 2658,
+ 1393, 3157, 437, 1774, 2971, 3398, 820, 3340, 2154, 2721, 1075, 3280, 501,
+ 3399, 1266, 3219, 1901, 2909, 565, 884, 2091, 2784, 3341, 3400, 1584, 3096,
+ 1457, 1711, 3034, 3158, 2470, 629, 1139, 2407, 2533, 3281, 3401, 2344, 2596,
+ 2028, 2847, 948, 1330, 1838, 2972, 3220, 3342, 2281, 2659, 693, 3402, 54,
+ 3456, 118, 3457, 182, 2218, 2722, 3458, 246, 1203, 1965, 2910, 3282, 3459,
+ 1012, 1648, 3097, 3343, 757, 1521, 3159, 3403, 310, 3460, 1775, 2155, 2785,
+ 3035, 374, 1394, 3221, 3461, 438, 3462, 821, 3404, 1902, 2973, 1076, 2092,
+ 2848, 3344, 1267, 3283, 502, 2471, 2534, 3463, 2408, 2597, 1585, 2345, 2660,
+ 3160, 885, 3405, 566, 1712, 3098, 3464, 1458, 3222, 2029, 2911, 2282, 2723,
+ 1140, 1839, 3036, 3345, 630, 3465, 1331, 3284, 949, 2219, 2786, 3406, 694,
+ 1966, 2974, 3466, 55, 2156, 2849, 3520, 119, 1649, 3161, 3521, 1204, 3346,
+ 183, 1522, 3223, 3522, 1776, 3099, 247, 1013, 3407, 3523, 758, 3467, 311,
+ 3524, 1395, 2535, 3285, 2472, 2598, 2093, 2912, 375, 1903, 2409, 2661, 3037,
+ 3525, 822, 2346, 2724, 3468, 439, 3526, 1077, 1268, 3347, 3408, 503, 2283,
+ 2787, 3527, 1586, 3224, 1713, 2030, 2975, 3162, 886, 1459, 3286, 3469, 1840,
+ 3100, 567, 3528, 2220, 2850, 1141, 3409, 1332, 3348, 631, 3529, 1967, 3038,
+ 950, 3470, 2157, 2913, 2536, 2599, 695, 1650, 2473, 2662, 3225, 3530, 1523,
+ 1777, 3163, 3287, 1205, 2410, 2725, 3410, 56, 3584, 120, 3585, 184, 2094,
+ 2976, 3586, 1014, 3471, 248, 1396, 1904, 2347, 2788, 3101, 3349, 3587, 759,
+ 3531, 312, 3588, 376, 2284, 2851, 3589, 823, 3532, 1269, 2031, 3039, 3411,
+ 440, 1078, 3472, 3590, 1714, 3226, 1587, 3288, 2221, 2914, 504, 1841, 3164,
+ 3591, 1460, 3350, 887, 3533, 568, 2600, 3592, 2537, 2663, 1968, 3102, 1142,
+ 2158, 2977, 3473, 2474, 2726, 1333, 3412, 632, 3593, 2411, 2789, 951, 3534,
+ 1651, 3289, 1778, 3227, 2348, 2852, 1524, 2095, 3040, 3351, 696, 3594, 1206,
+ 3474, 1905, 3165, 57, 3648, 121, 1015, 1397, 2285, 2915, 3413, 3535, 3649,
+ 185, 3650, 760, 3595, 249, 3651, 313, 2032, 3103, 3652, 2222, 2978, 377,
+ 3653, 1270, 1715, 3290, 3475, 824, 1588, 3352, 3596, 1079, 2601, 2664, 3536,
+ 1842, 3228, 441, 2538, 2727, 3654, 1461, 2475, 2790, 3414, 505, 2159, 3041,
+ 3655, 1969, 3166, 888, 2412, 2853, 3597, 569, 3656, 1143, 3537, 1334, 3476,
+ 2349, 2916, 2096, 3104, 1652, 3353, 633, 1779, 3291, 3657, 952, 3598, 1525,
+ 3415, 1906, 2286, 2979, 3229, 697, 1207, 3538, 3658, 1398, 3477, 1016, 3599,
+ 2033, 2665, 3167, 58, 2602, 2728, 3712, 122, 2223, 3042, 3713, 186, 3714,
+ 761, 2539, 2791, 3659, 250, 3715, 314, 1716, 2476, 2854, 3354, 3716, 1589,
+ 1843, 3292, 3416, 1271, 3539, 378, 3717, 1080, 3600, 825, 2160, 3105, 3660,
+ 2413, 2917, 442, 1462, 1970, 3230, 3478, 3718, 2350, 2980, 506, 3719, 889,
+ 3661, 1144, 1335, 2097, 3168, 3540, 3601, 570, 3720, 1780, 3355, 1653, 2287,
+ 3043, 3417, 1907, 3293, 634, 953, 1526, 2666, 2729, 3479, 3662, 3721, 2603,
+ 2792, 2540, 2855, 1208, 2224, 3106, 3602, 2034, 3231, 698, 3722, 1399, 3541,
+ 2477, 2918, 1017, 3663, 59, 3776, 123, 3777, 187, 762, 1717, 2414, 2981,
+ 3418, 3723, 3778, 1844, 3356, 251, 2161, 3169, 3779, 1590, 3480, 315, 1272,
+ 3603, 3780, 1971, 3294, 1081, 2351, 3044, 3664, 379, 3781, 826, 3724, 1463,
+ 3542, 443, 3782, 2098, 3232, 2730, 2288, 3107, 507, 2667, 2793, 3783, 890,
+ 3725, 1336, 2604, 2856, 3604, 1145, 1781, 3419, 3665, 1654, 3481, 571, 1908,
+ 3357, 3784, 2541, 2919, 1527, 3543, 2225, 3170, 954, 2478, 2982, 3726, 635,
+ 2035, 3295, 3785, 1209, 3666, 1400, 3605, 2415, 3045, 699, 3786, 1018, 2162,
+ 3233, 3727, 1718, 3482, 1845, 3420, 60, 2352, 3108, 3840, 124, 1591, 3544,
+ 3841, 763, 3787, 188, 1972, 3358, 3842, 252, 3843, 1273, 3667, 2731, 2794,
+ 316, 3844, 2668, 2857, 1082, 1464, 3606, 3728, 380, 827, 2099, 2605, 2920,
+ 3296, 3788, 3845, 2289, 3171, 444, 3846, 2542, 2983, 1782, 3483, 508, 1337,
+ 3668, 3847, 891, 1655, 1909, 3421, 3545, 3789, 1146, 2479, 3046, 3729, 2226,
+ 3234, 572, 3848, 1528, 2036, 3359, 3607, 2416, 3109, 955, 3790, 636, 3849,
+ 1210, 3730, 1401, 2163, 3297, 3669, 2353, 3172, 2795, 700, 1846, 2732, 2858,
+ 3484, 3850, 1719, 3546, 1019, 2669, 2921, 3791, 1973, 3422, 1592, 3608, 2606,
+ 2984, 61, 764, 3851, 3904, 125, 3905, 189, 1274, 2290, 3235, 3731, 3906,
+ 2100, 3360, 253, 2543, 3047, 3907, 1465, 3670, 317, 1083, 3792, 3908, 828,
+ 3852, 381, 3909, 2480, 3110, 1783, 3547, 445, 1910, 2227, 3298, 3485, 3910,
+ 1656, 3609, 1338, 3732, 892, 3853, 509, 1147, 2037, 2417, 3173, 3423, 3793,
+ 3911, 1529, 3671, 573, 2796, 2859, 3912, 2733, 2922, 2164, 3361, 956, 2354,
+ 3236, 3854, 2670, 2985, 637, 3913, 1211, 1402, 3733, 3794, 1847, 2607, 3048,
+ 3548, 1720, 3610, 1974, 3486, 701, 3914, 1020, 1593, 2544, 3111, 3672, 3855,
+ 2291, 3299, 2101, 3424, 765, 1275, 3795, 3915, 62, 3968, 126, 2481, 3174,
+ 3969, 190, 1466, 3734, 3970, 254, 3971, 1084, 3856, 318, 2228, 3362, 3972,
+ 829, 1784, 3611, 3916, 1911, 3549, 382, 2418, 3237, 3973, 2860, 1657, 2797,
+ 2923, 3673, 2038, 3487, 446, 2734, 2986, 3974, 1339, 3796, 1148, 3857, 893,
+ 2671, 3049, 3917, 510, 1530, 3735, 3975, 2355, 3300, 2165, 3425, 2608, 3112,
+ 574, 3976, 957, 3918, 1848, 3612, 1403, 2545, 3175, 3797, 1212, 3858, 638,
+ 1721, 1975, 3550, 3674, 3977, 2292, 3363, 1594, 2102, 3488, 3736, 702, 2482,
+ 3238, 3978, 1021, 3919, 1276, 2861, 2924, 3859, 766, 1467, 2229, 2798, 2987,
+ 3426, 3798, 3979, 63, 4032, 127, 2419, 3301, 4033, 191, 2735, 3050, 4034,
+ 1085, 1912, 3613, 3920, 255, 1785, 3675, 4035, 319, 2672, 3113, 4036, 2039,
+ 3551, 830, 3980, 1658, 3737, 383, 4037, 1340, 2356, 3364, 3860, 2609, 3176,
+ 447, 2166, 3489, 4038, 1149, 1531, 3799, 3921, 894, 3981, 511, 4039, 2546,
+ 3239, 575, 1849, 3676, 4040, 2293, 3427, 1976, 3614, 958, 1722, 3738, 3982,
+ 1404, 3861, 1213, 2483, 3302, 3922, 2103, 3552, 639, 2925, 4041, 2862, 2988,
+ 1595, 3800, 2799, 3051, 2736, 3114, 703, 1022, 3983, 4042, 2230, 3490, 2420,
+ 3365, 1277, 2673, 3177, 3923, 1468, 3862, 767, 1913, 3677, 4043, 1786, 3739,
+ 2040, 3615, 1086, 2610, 3240, 3984, 2357, 3428, 1659, 3801, 831, 4044, 2167,
+ 3553, 1341, 3924, 2547, 3303, 1532, 3863, 1150, 3985, 895, 4045, 2294, 2926,
+ 2989, 3491, 2863, 3052, 1850, 2484, 3366, 3740, 1977, 3678, 2800, 3115, 1723,
+ 3802, 2104, 3616, 1405, 3925, 959, 2737, 3178, 4046, 1214, 3986, 1596, 3864,
+ 2421, 3429, 2231, 2674, 3241, 3554, 1023, 4047, 2611, 3304, 1278, 1469, 1914,
+ 3741, 3926, 3987, 1787, 2041, 3679, 3803, 2358, 3492, 1087, 1660, 2168, 2548,
+ 3367, 3617, 3865, 4048, 2990, 2927, 3053, 2864, 3116, 1342, 3988, 1533, 2295,
+ 2801, 3179, 3555, 3927, 2485, 3430, 1151, 4049, 1978, 2738, 3242, 3742, 1851,
+ 3804, 2105, 3680, 1724, 3866, 2675, 3305, 1406, 2422, 3493, 3989, 2232, 3618,
+ 1215, 4050, 1597, 3928, 2612, 3368, 2359, 3556, 1915, 3805, 2042, 2991, 3054,
+ 3743, 1470, 3990, 1788, 2928, 3117, 3867, 1279, 2549, 3431, 4051, 2865, 3180,
+ 2169, 3681, 1661, 3929, 2802, 3243, 2486, 3494, 2296, 3619, 2739, 3306, 1343,
+ 4052, 1534, 3991, 1979, 3806, 1852, 3868, 2676, 3369, 2106, 3744, 2423, 3557,
+ 1725, 3930, 2233, 3682, 2613, 3432, 1407, 4053, 3055, 1598, 2992, 3118, 3992,
+ 2929, 3181, 2360, 3620, 2866, 3244, 2550, 3495, 1916, 3869, 2043, 3807, 1789,
+ 2803, 3307, 3931, 1471, 2170, 3745, 4054, 2740, 3370, 1662, 2487, 3558, 3993,
+ 2297, 3683, 2677, 3433, 1535, 4055, 1980, 3870, 1853, 2107, 2424, 3621, 3808,
+ 3932, 3056, 3119, 2614, 3496, 2993, 3182, 1726, 2234, 3746, 3994, 2930, 3245,
+ 2867, 3308, 1599, 2361, 3684, 4056, 2551, 3559, 2804, 3371, 2044, 3871, 1917,
+ 3933, 2171, 3809, 1790, 2741, 3434, 3995, 2488, 3622, 2298, 3747, 1663, 4057,
+ 2678, 3497, 3120, 3057, 3183, 2994, 3246, 2425, 3685, 1981, 3934, 2108, 3872,
+ 2615, 3560, 2931, 3309, 1854, 3996, 2235, 3810, 2868, 3372, 1727, 4058, 2552,
+ 3623, 2805, 3435, 2362, 3748, 2742, 3498, 2045, 3935, 1918, 3997, 2172, 3873,
+ 2489, 3686, 1791, 4059, 3121, 3184, 2299, 2679, 3561, 3811, 3058, 3247, 2995,
+ 3310, 2932, 3373, 2426, 3749, 2616, 3624, 1982, 3998, 2109, 2869, 3436, 3936,
+ 1855, 4060, 2236, 3874, 2806, 3499, 2553, 3687, 2363, 3812, 2743, 3562, 3185,
+ 3122, 3248, 2046, 3999, 2490, 3750, 1919, 2173, 3059, 3311, 3937, 4061, 2680,
+ 3625, 2996, 3374, 2300, 3875, 2933, 3437, 2617, 3688, 2427, 3813, 2870, 3500,
+ 2110, 4000, 1983, 4062, 2807, 3563, 2237, 3938, 2554, 3751, 2364, 3876, 2744,
+ 3626, 3186, 3249, 3123, 3312, 3060, 3375, 2491, 2997, 3438, 3814, 2047, 2681,
+ 3689, 4063, 2174, 4001, 2934, 3501, 2301, 3939, 2871, 3564, 2618, 3752, 2428,
+ 3877, 2808, 3627, 2111, 4064, 2238, 3250, 4002, 2555, 3187, 3313, 3815, 3124,
+ 3376, 2745, 3690, 2365, 3940, 3061, 3439, 2998, 3502, 2492, 3878, 2682, 3753,
+ 2935, 3565, 2175, 4065, 2302, 4003, 2872, 3628, 2619, 3816, 2429, 3941, 2809,
+ 3691, 3251, 3314, 3188, 3377, 3125, 3440, 2556, 3879, 2239, 3062, 3503, 4066,
+ 2746, 3754, 2366, 4004, 2999, 3566, 2936, 3629, 2683, 3817, 2493, 3942, 2873,
+ 3692, 2303, 4067, 2620, 3880, 3315, 3252, 3378, 3189, 3441, 2430, 2810, 3755,
+ 4005, 3126, 3504, 3063, 3567, 2557, 3943, 2747, 3818, 3000, 3630, 2367, 4068,
+ 2937, 3693, 2684, 3881, 2494, 4006, 2874, 3756, 3316, 3379, 3253, 3442, 3190,
+ 3505, 2621, 3944, 3127, 3568, 2811, 3819, 2431, 4069, 3064, 3631, 2748, 3882,
+ 2558, 3001, 3694, 4007, 2938, 3757, 2685, 3945, 3380, 3317, 3443, 2495, 4070,
+ 3254, 3506, 2875, 3820, 3191, 3569, 3128, 3632, 2622, 4008, 2812, 3883, 3065,
+ 3695, 3002, 3758, 2749, 3946, 2559, 4071, 2939, 3821, 3381, 3444, 3318, 3507,
+ 2686, 3255, 3570, 4009, 2876, 3884, 3192, 3633, 3129, 3696, 2623, 4072, 2813,
+ 3947, 3066, 3759, 3003, 3822, 2750, 4010, 3445, 3382, 3508, 2940, 3885, 3319,
+ 3571, 3256, 3634, 2687, 3193, 3697, 4073, 2877, 3948, 3130, 3760, 3067, 3823,
+ 2814, 4011, 3004, 3886, 3446, 3509, 3383, 3572, 2751, 4074, 3320, 3635, 2941,
+ 3949, 3257, 3698, 3194, 3761, 2878, 4012, 3131, 3824, 3068, 3887, 2815, 4075,
+ 3510, 3447, 3573, 3005, 3950, 3384, 3636, 3321, 3699, 3258, 3762, 2942, 4013,
+ 3195, 3825, 3132, 3888, 2879, 4076, 3069, 3951, 3511, 3574, 3448, 3637, 3385,
+ 3700, 3006, 4014, 3322, 3763, 3259, 3826, 2943, 4077, 3196, 3889, 3133, 3952,
+ 3575, 3512, 3638, 3070, 4015, 3449, 3701, 3386, 3764, 3323, 3827, 3007, 4078,
+ 3260, 3890, 3197, 3953, 3134, 4016, 3576, 3639, 3513, 3702, 3450, 3765, 3071,
+ 4079, 3387, 3828, 3324, 3891, 3261, 3954, 3198, 4017, 3640, 3135, 4080, 3577,
+ 3703, 3514, 3766, 3451, 3829, 3388, 3892, 3325, 3955, 3262, 4018, 3199, 4081,
+ 3641, 3704, 3578, 3767, 3515, 3830, 3452, 3893, 3389, 3956, 3326, 4019, 3263,
+ 4082, 3705, 3642, 3768, 3579, 3831, 3516, 3894, 3453, 3957, 3390, 4020, 3327,
+ 4083, 3706, 3769, 3643, 3832, 3580, 3895, 3517, 3958, 3454, 4021, 3391, 4084,
+ 3770, 3707, 3833, 3644, 3896, 3581, 3959, 3518, 4022, 3455, 4085, 3771, 3834,
+ 3708, 3897, 3645, 3960, 3582, 4023, 3519, 4086, 3835, 3772, 3898, 3709, 3961,
+ 3646, 4024, 3583, 4087, 3836, 3899, 3773, 3962, 3710, 4025, 3647, 4088, 3900,
+ 3837, 3963, 3774, 4026, 3711, 4089, 3901, 3964, 3838, 4027, 3775, 4090, 3965,
+ 3902, 4028, 3839, 4091, 3966, 4029, 3903, 4092, 4030, 3967, 4093, 4031, 4094,
+ 4095,
+};
+#endif // CONFIG_TX64X64
+
+#if CONFIG_CB4X4
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_2x2_neighbors[5 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 1, 1, 2, 0, 0,
+};
+#endif  // CONFIG_CB4X4
+
+// Neighborhood 2-tuples for the various scans and block sizes, in
+// {top, left} order for each position in the corresponding scan order;
+// see the usage sketch after the first table below.
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 0, 1, 4, 4, 5, 5, 1, 8, 8, 5, 8, 2,
+ 2, 2, 5, 9, 12, 6, 9, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
+};
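+
+// A usage sketch, modeled on the get_coef_context() helper in entropy.h:
+// MAX_NEIGHBORS is 2, so each scan position c owns one {top, left} pair,
+// and the entropy context for the coefficient at scan position c is an
+// average of the token-cache values recorded at those two previously coded
+// positions.  The helper name below is hypothetical.
+static INLINE int neighbor_context(const int16_t *neighbors,
+                                   const uint8_t *token_cache, int c) {
+  return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
+          token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >>
+         1;
+}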
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 8, 8, 0, 0, 1, 4, 5, 8, 9, 12, 1,
+ 1, 2, 5, 6, 9, 10, 13, 2, 2, 3, 6, 7, 10, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4,
+ 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 0, 0,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t,
+ col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 4, 0, 8, 8, 1, 4, 5, 8, 5, 1, 9,
+ 12, 2, 5, 6, 9, 6, 2, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 1, 1, 1, 1, 4, 2, 2, 2, 5, 4, 5, 5,
+ 8, 3, 6, 8, 9, 6, 9, 9, 12, 7, 10, 10, 13, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6,
+ 9, 2, 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16,
+ 11, 14, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
+ 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 0,
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 1, 1,
+ 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 2, 2, 3,
+ 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4,
+ 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12,
+ 13, 16, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
+ 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 8, 1, 1, 8, 8, 2, 9, 9, 16, 10,
+ 17, 2, 2, 16, 16, 3, 10, 17, 24, 11, 18, 18, 25, 3, 3, 4, 11,
+ 19, 26, 12, 19, 4, 4, 20, 27, 5, 12, 13, 20, 21, 28, 5, 5, 6,
+ 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 0, 0, 1, 8, 9, 16, 17, 24, 1,
+ 1, 2, 9, 10, 17, 18, 25, 2, 2, 3, 10, 11, 18, 19, 26, 3, 3,
+ 4, 11, 12, 19, 20, 27, 4, 4, 5, 12, 13, 20, 21, 28, 5, 5, 6,
+ 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0,
+ 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8,
+ 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17,
+ 24, 18, 25, 19, 26, 20, 27, 21, 28, 22, 29, 23, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 4, 4, 4, 2, 2, 2, 5, 5, 8, 8,
+ 8, 3, 6, 6, 9, 9, 12, 12, 12, 7, 10, 10, 13, 13, 16, 16, 16, 11, 14,
+ 14, 17, 17, 20, 20, 20, 15, 18, 18, 21, 21, 24, 24, 24, 19, 22, 22, 25, 25,
+ 28, 28, 28, 23, 26, 26, 29, 29, 32, 32, 32, 27, 30, 30, 33, 33, 36, 36, 36,
+ 31, 34, 34, 37, 37, 40, 40, 40, 35, 38, 38, 41, 41, 44, 44, 44, 39, 42, 42,
+ 45, 45, 48, 48, 48, 43, 46, 46, 49, 49, 52, 52, 52, 47, 50, 50, 53, 53, 56,
+ 56, 56, 51, 54, 54, 57, 57, 60, 55, 58, 58, 61, 59, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2, 17, 17, 32, 32,
+ 32, 3, 3, 3, 18, 18, 33, 33, 48, 4, 4, 4, 19, 19, 34, 34, 49, 5, 5,
+ 5, 20, 20, 35, 35, 50, 6, 6, 6, 21, 21, 36, 36, 51, 7, 7, 7, 22, 22,
+ 37, 37, 52, 8, 8, 8, 23, 23, 38, 38, 53, 9, 9, 9, 24, 24, 39, 39, 54,
+ 10, 10, 10, 25, 25, 40, 40, 55, 11, 11, 11, 26, 26, 41, 41, 56, 12, 12, 12,
+ 27, 27, 42, 42, 57, 13, 13, 13, 28, 28, 43, 43, 58, 14, 14, 14, 29, 29, 44,
+ 44, 59, 15, 30, 30, 45, 45, 60, 31, 46, 46, 61, 47, 62, 0, 0
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2, 17, 17, 32, 32,
+ 32, 3, 3, 3, 18, 18, 33, 33, 48, 4, 4, 4, 19, 19, 34, 34, 49, 5, 5,
+ 5, 20, 20, 35, 35, 50, 6, 6, 6, 21, 21, 36, 36, 51, 7, 7, 7, 22, 22,
+ 37, 37, 52, 8, 8, 8, 23, 23, 38, 38, 53, 9, 9, 9, 24, 24, 39, 39, 54,
+ 10, 10, 10, 25, 25, 40, 40, 55, 11, 11, 11, 26, 26, 41, 41, 56, 12, 12, 12,
+ 27, 27, 42, 42, 57, 13, 13, 13, 28, 28, 43, 43, 58, 14, 14, 14, 29, 29, 44,
+ 44, 59, 15, 30, 30, 45, 45, 60, 31, 46, 46, 61, 47, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 0, 0, 1, 16, 2, 17,
+ 3, 18, 4, 19, 5, 20, 6, 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12,
+ 27, 13, 28, 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, 36,
+ 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, 29, 44, 30, 45, 31,
+ 46, 32, 32, 33, 48, 34, 49, 35, 50, 36, 51, 37, 52, 38, 53, 39, 54, 40, 55,
+ 41, 56, 42, 57, 43, 58, 44, 59, 45, 60, 46, 61, 47, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28, 32,
+ 32, 36, 36, 40, 40, 44, 44, 48, 48, 52, 52, 56, 56, 0, 0, 1, 4, 5, 8,
+ 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32, 33, 36, 37, 40, 41, 44, 45,
+ 48, 49, 52, 53, 56, 57, 60, 1, 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21,
+ 22, 25, 26, 29, 30, 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58,
+ 61, 2, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31, 34,
+ 35, 38, 39, 42, 43, 46, 47, 50, 51, 54, 55, 58, 59, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 1, 16, 17, 32, 33, 48, 1, 1, 2,
+ 17, 18, 33, 34, 49, 2, 2, 3, 18, 19, 34, 35, 50, 3, 3, 4, 19, 20, 35,
+ 36, 51, 4, 4, 5, 20, 21, 36, 37, 52, 5, 5, 6, 21, 22, 37, 38, 53, 6,
+ 6, 7, 22, 23, 38, 39, 54, 7, 7, 8, 23, 24, 39, 40, 55, 8, 8, 9, 24,
+ 25, 40, 41, 56, 9, 9, 10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43,
+ 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13,
+ 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2,
+ 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24,
+ 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5,
+ 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13,
+ 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14,
+ 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22,
+ 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23,
+ 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72,
+ 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80,
+ 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88,
+ 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 89, 89,
+ 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97,
+ 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98,
+ 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106,
+ 106, 113, 113, 120, 120, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107,
+ 114, 114, 121, 121, 128, 128, 128, 87, 94, 94, 101, 101, 108, 108, 115,
+ 115, 122, 122, 129, 129, 136, 136, 136, 95, 102, 102, 109, 109, 116, 116,
+ 123, 123, 130, 130, 137, 137, 144, 144, 144, 103, 110, 110, 117, 117, 124,
+ 124, 131, 131, 138, 138, 145, 145, 152, 152, 152, 111, 118, 118, 125, 125,
+ 132, 132, 139, 139, 146, 146, 153, 153, 160, 160, 160, 119, 126, 126, 133,
+ 133, 140, 140, 147, 147, 154, 154, 161, 161, 168, 168, 168, 127, 134, 134,
+ 141, 141, 148, 148, 155, 155, 162, 162, 169, 169, 176, 176, 176, 135, 142,
+ 142, 149, 149, 156, 156, 163, 163, 170, 170, 177, 177, 184, 184, 184, 143,
+ 150, 150, 157, 157, 164, 164, 171, 171, 178, 178, 185, 185, 192, 192, 192,
+ 151, 158, 158, 165, 165, 172, 172, 179, 179, 186, 186, 193, 193, 200, 200,
+ 200, 159, 166, 166, 173, 173, 180, 180, 187, 187, 194, 194, 201, 201, 208,
+ 208, 208, 167, 174, 174, 181, 181, 188, 188, 195, 195, 202, 202, 209, 209,
+ 216, 216, 216, 175, 182, 182, 189, 189, 196, 196, 203, 203, 210, 210, 217,
+ 217, 224, 224, 224, 183, 190, 190, 197, 197, 204, 204, 211, 211, 218, 218,
+ 225, 225, 232, 232, 232, 191, 198, 198, 205, 205, 212, 212, 219, 219, 226,
+ 226, 233, 233, 240, 240, 240, 199, 206, 206, 213, 213, 220, 220, 227, 227,
+ 234, 234, 241, 241, 248, 207, 214, 214, 221, 221, 228, 228, 235, 235, 242,
+ 242, 249, 215, 222, 222, 229, 229, 236, 236, 243, 243, 250, 223, 230, 230,
+ 237, 237, 244, 244, 251, 231, 238, 238, 245, 245, 252, 239, 246, 246, 253,
+ 247, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2,
+ 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96,
+ 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, 5, 5, 5,
+ 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37,
+ 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7,
+ 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 8, 8,
+ 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194, 225, 9,
+ 9, 9, 40, 40, 71, 71, 102, 102, 133, 133, 164, 164, 195, 195, 226,
+ 10, 10, 10, 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196,
+ 227, 11, 11, 11, 42, 42, 73, 73, 104, 104, 135, 135, 166, 166, 197,
+ 197, 228, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167,
+ 198, 198, 229, 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168,
+ 168, 199, 199, 230, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138,
+ 169, 169, 200, 200, 231, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139,
+ 139, 170, 170, 201, 201, 232, 16, 16, 16, 47, 47, 78, 78, 109, 109,
+ 140, 140, 171, 171, 202, 202, 233, 17, 17, 17, 48, 48, 79, 79, 110,
+ 110, 141, 141, 172, 172, 203, 203, 234, 18, 18, 18, 49, 49, 80, 80,
+ 111, 111, 142, 142, 173, 173, 204, 204, 235, 19, 19, 19, 50, 50, 81,
+ 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 20, 20, 20, 51, 51,
+ 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 21, 21, 21, 52,
+ 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238, 22, 22, 22,
+ 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208, 208, 239, 23, 23,
+ 23, 54, 54, 85, 85, 116, 116, 147, 147, 178, 178, 209, 209, 240, 24,
+ 24, 24, 55, 55, 86, 86, 117, 117, 148, 148, 179, 179, 210, 210, 241,
+ 25, 25, 25, 56, 56, 87, 87, 118, 118, 149, 149, 180, 180, 211, 211,
+ 242, 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212,
+ 212, 243, 27, 27, 27, 58, 58, 89, 89, 120, 120, 151, 151, 182, 182,
+ 213, 213, 244, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183,
+ 183, 214, 214, 245, 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153,
+ 184, 184, 215, 215, 246, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154,
+ 154, 185, 185, 216, 216, 247, 31, 62, 62, 93, 93, 124, 124, 155, 155,
+ 186, 186, 217, 217, 248, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218,
+ 218, 249, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 127, 158, 158,
+ 189, 189, 220, 220, 251, 159, 190, 190, 221, 221, 252, 191, 222, 222, 253,
+ 223, 254, 0, 0
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13,
+ 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14,
+ 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28,
+ 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29,
+ 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43,
+ 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44,
+ 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58,
+ 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59,
+ 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73,
+ 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74,
+ 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88,
+ 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89,
+ 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96,
+ 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104,
+ 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118,
+ 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119,
+ 126, 120, 120, 121, 128, 122, 129, 123, 130, 124, 131, 125, 132, 126, 133,
+ 127, 134, 128, 128, 129, 136, 130, 137, 131, 138, 132, 139, 133, 140, 134,
+ 141, 135, 142, 136, 136, 137, 144, 138, 145, 139, 146, 140, 147, 141, 148,
+ 142, 149, 143, 150, 144, 144, 145, 152, 146, 153, 147, 154, 148, 155, 149,
+ 156, 150, 157, 151, 158, 152, 152, 153, 160, 154, 161, 155, 162, 156, 163,
+ 157, 164, 158, 165, 159, 166, 160, 160, 161, 168, 162, 169, 163, 170, 164,
+ 171, 165, 172, 166, 173, 167, 174, 168, 168, 169, 176, 170, 177, 171, 178,
+ 172, 179, 173, 180, 174, 181, 175, 182, 176, 176, 177, 184, 178, 185, 179,
+ 186, 180, 187, 181, 188, 182, 189, 183, 190, 184, 184, 185, 192, 186, 193,
+ 187, 194, 188, 195, 189, 196, 190, 197, 191, 198, 192, 192, 193, 200, 194,
+ 201, 195, 202, 196, 203, 197, 204, 198, 205, 199, 206, 200, 200, 201, 208,
+ 202, 209, 203, 210, 204, 211, 205, 212, 206, 213, 207, 214, 208, 208, 209,
+ 216, 210, 217, 211, 218, 212, 219, 213, 220, 214, 221, 215, 222, 216, 216,
+ 217, 224, 218, 225, 219, 226, 220, 227, 221, 228, 222, 229, 223, 230, 224,
+ 224, 225, 232, 226, 233, 227, 234, 228, 235, 229, 236, 230, 237, 231, 238,
+ 232, 232, 233, 240, 234, 241, 235, 242, 236, 243, 237, 244, 238, 245, 239,
+ 246, 240, 240, 241, 248, 242, 249, 243, 250, 244, 251, 245, 252, 246, 253,
+ 247, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
+ 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5,
+ 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43,
+ 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20,
+ 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58,
+ 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35,
+ 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73,
+ 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50,
+ 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88,
+ 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65,
+ 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
+ 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80,
+ 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118,
+ 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95,
+ 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133,
+ 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110,
+ 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
+ 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125,
+ 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163,
+ 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140,
+ 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178,
+ 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155,
+ 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193,
+ 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170,
+ 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
+ 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185,
+ 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192,
+ 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200,
+ 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238,
+ 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215,
+ 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253,
+ 223, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48,
+ 48, 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104,
+ 112, 112, 120, 120, 128, 128, 136, 136, 144, 144, 152, 152, 160, 160, 168,
+ 168, 176, 176, 184, 184, 192, 192, 200, 200, 208, 208, 216, 216, 224, 224,
+ 232, 232, 240, 240, 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33,
+ 40, 41, 48, 49, 56, 57, 64, 65, 72, 73, 80, 81, 88, 89, 96,
+ 97, 104, 105, 112, 113, 120, 121, 128, 129, 136, 137, 144, 145, 152, 153,
+ 160, 161, 168, 169, 176, 177, 184, 185, 192, 193, 200, 201, 208, 209, 216,
+ 217, 224, 225, 232, 233, 240, 241, 248, 1, 1, 2, 9, 10, 17, 18,
+ 25, 26, 33, 34, 41, 42, 49, 50, 57, 58, 65, 66, 73, 74, 81,
+ 82, 89, 90, 97, 98, 105, 106, 113, 114, 121, 122, 129, 130, 137, 138,
+ 145, 146, 153, 154, 161, 162, 169, 170, 177, 178, 185, 186, 193, 194, 201,
+ 202, 209, 210, 217, 218, 225, 226, 233, 234, 241, 242, 249, 2, 2, 3,
+ 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59, 66,
+ 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122, 123,
+ 130, 131, 138, 139, 146, 147, 154, 155, 162, 163, 170, 171, 178, 179, 186,
+ 187, 194, 195, 202, 203, 210, 211, 218, 219, 226, 227, 234, 235, 242, 243,
+ 250, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51,
+ 52, 59, 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108,
+ 115, 116, 123, 124, 131, 132, 139, 140, 147, 148, 155, 156, 163, 164, 171,
+ 172, 179, 180, 187, 188, 195, 196, 203, 204, 211, 212, 219, 220, 227, 228,
+ 235, 236, 243, 244, 251, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36,
+ 37, 44, 45, 52, 53, 60, 61, 68, 69, 76, 77, 84, 85, 92, 93,
+ 100, 101, 108, 109, 116, 117, 124, 125, 132, 133, 140, 141, 148, 149, 156,
+ 157, 164, 165, 172, 173, 180, 181, 188, 189, 196, 197, 204, 205, 212, 213,
+ 220, 221, 228, 229, 236, 237, 244, 245, 252, 5, 5, 6, 13, 14, 21,
+ 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 62, 69, 70, 77, 78,
+ 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125, 126, 133, 134, 141,
+ 142, 149, 150, 157, 158, 165, 166, 173, 174, 181, 182, 189, 190, 197, 198,
+ 205, 206, 213, 214, 221, 222, 229, 230, 237, 238, 245, 246, 253, 6, 6,
+ 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63,
+ 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126,
+ 127, 134, 135, 142, 143, 150, 151, 158, 159, 166, 167, 174, 175, 182, 183,
+ 190, 191, 198, 199, 206, 207, 214, 215, 222, 223, 230, 231, 238, 239, 246,
+ 247, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192, 192,
+ 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161, 192, 193, 224,
+ 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161, 162, 193, 194, 225,
+ 2, 2, 3, 34, 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226,
+ 3, 3, 4, 35, 36, 67, 68, 99, 100, 131, 132, 163, 164, 195, 196, 227,
+ 4, 4, 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228,
+ 5, 5, 6, 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229,
+ 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230,
+ 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199, 200, 231,
+ 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, 169, 200, 201, 232,
+ 9, 9, 10, 41, 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233,
+ 10, 10, 11, 42, 43, 74, 75, 106, 107, 138, 139, 170, 171, 202, 203, 234,
+ 11, 11, 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235,
+ 12, 12, 13, 44, 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236,
+ 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237,
+ 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207, 238,
+ 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176, 207, 208, 239,
+ 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240,
+ 17, 17, 18, 49, 50, 81, 82, 113, 114, 145, 146, 177, 178, 209, 210, 241,
+ 18, 18, 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242,
+ 19, 19, 20, 51, 52, 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243,
+ 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244,
+ 21, 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245,
+ 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214, 215, 246,
+ 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247,
+ 24, 24, 25, 56, 57, 88, 89, 120, 121, 152, 153, 184, 185, 216, 217, 248,
+ 25, 25, 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249,
+ 26, 26, 27, 58, 59, 90, 91, 122, 123, 154, 155, 186, 187, 218, 219, 250,
+ 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251,
+ 28, 28, 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252,
+ 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222, 253,
+ 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254,
+ 0, 0
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t,
+ col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 8, 0, 16, 16, 1, 8, 24, 24, 9, 16, 9, 1, 32,
+ 32, 17, 24, 2, 9, 25, 32, 10, 17, 40, 40, 10, 2, 18, 25, 33, 40, 3, 10,
+ 48, 48, 11, 18, 26, 33, 11, 3, 41, 48, 19, 26, 34, 41, 4, 11, 27, 34, 12,
+ 19, 49, 56, 42, 49, 20, 27, 12, 4, 35, 42, 5, 12, 28, 35, 50, 57, 43, 50,
+ 13, 20, 36, 43, 13, 5, 21, 28, 51, 58, 29, 36, 6, 13, 44, 51, 14, 21, 14,
+ 6, 37, 44, 52, 59, 22, 29, 7, 14, 30, 37, 45, 52, 15, 22, 38, 45, 23, 30,
+ 53, 60, 31, 38, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0, 0,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0, 0, 1,
+ 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 1, 1, 2, 9, 10, 17,
+ 18, 25, 26, 33, 34, 41, 42, 49, 50, 57, 2, 2, 3, 10, 11, 18, 19, 26, 27,
+ 34, 35, 42, 43, 50, 51, 58, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43,
+ 44, 51, 52, 59, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53,
+ 60, 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 6, 6,
+ 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0, 0, 1,
+ 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8, 9, 16, 10, 17,
+ 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20,
+ 27, 21, 28, 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, 36,
+ 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, 37, 44, 38, 45, 39,
+ 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48,
+ 49, 56, 50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0, 0,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t,
+ row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 0, 1, 1, 8, 2, 2, 8, 9, 2, 9, 3, 3, 9,
+ 16, 3, 10, 16, 17, 4, 4, 10, 17, 17, 24, 4, 11, 11, 18, 18, 25, 24, 25,
+ 5, 5, 5, 12, 12, 19, 25, 32, 19, 26, 6, 6, 26, 33, 32, 33, 13, 20, 20,
+ 27, 33, 40, 6, 13, 27, 34, 40, 41, 34, 41, 21, 28, 28, 35, 41, 48, 14, 21,
+ 35, 42, 7, 14, 48, 49, 29, 36, 42, 49, 36, 43, 22, 29, 49, 56, 15, 22, 43,
+ 50, 50, 57, 37, 44, 30, 37, 44, 51, 23, 30, 51, 58, 45, 52, 38, 45, 52, 59,
+ 31, 38, 53, 60, 39, 46, 46, 53, 47, 54, 54, 61, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 0, 8, 8, 1, 8, 9, 1, 9, 16, 16, 17, 2, 9, 10,
+ 2, 10, 17, 17, 24, 24, 25, 3, 10, 11, 3, 18, 25, 25, 32, 11, 18, 32, 33,
+ 4, 11, 26, 33, 19, 26, 12, 4, 33, 40, 12, 19, 40, 41, 5, 12, 27, 34, 34,
+ 41, 20, 27, 13, 20, 13, 5, 41, 48, 48, 49, 28, 35, 35, 42, 21, 28, 6, 6,
+ 6, 13, 42, 49, 49, 56, 36, 43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22,
+ 29, 37, 44, 15, 22, 44, 51, 51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45,
+ 31, 38, 53, 60, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2,
+ 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24,
+ 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5,
+ 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13,
+ 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14,
+ 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22,
+ 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23,
+ 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72,
+ 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80,
+ 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88,
+ 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 89, 89,
+ 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97,
+ 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98,
+ 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106,
+ 106, 113, 113, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107, 114, 114,
+ 121, 87, 94, 94, 101, 101, 108, 108, 115, 115, 122, 95, 102, 102, 109,
+ 109, 116, 116, 123, 103, 110, 110, 117, 117, 124, 111, 118, 118, 125, 119,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2,
+ 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48,
+ 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5,
+ 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21,
+ 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7,
+ 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 8, 8,
+ 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, 113, 9,
+ 9, 9, 24, 24, 39, 39, 54, 54, 69, 69, 84, 84, 99, 99, 114,
+ 10, 10, 10, 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100,
+ 115, 11, 11, 11, 26, 26, 41, 41, 56, 56, 71, 71, 86, 86, 101,
+ 101, 116, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
+ 102, 102, 117, 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88,
+ 88, 103, 103, 118, 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74,
+ 89, 89, 104, 104, 119, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90,
+ 90, 105, 105, 120, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106,
+ 121, 47, 62, 62, 77, 77, 92, 92, 107, 107, 122, 63, 78, 78, 93,
+ 93, 108, 108, 123, 79, 94, 94, 109, 109, 124, 95, 110, 110, 125, 111,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48,
+ 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104, 112, 112,
+ 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56,
+ 57, 64, 65, 72, 73, 80, 81, 88, 89, 96, 97, 104, 105, 112, 113, 120,
+ 1, 1, 2, 9, 10, 17, 18, 25, 26, 33, 34, 41, 42, 49, 50, 57,
+ 58, 65, 66, 73, 74, 81, 82, 89, 90, 97, 98, 105, 106, 113, 114, 121,
+ 2, 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58,
+ 59, 66, 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122,
+ 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, 52, 59,
+ 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108, 115, 116, 123,
+ 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, 60,
+ 61, 68, 69, 76, 77, 84, 85, 92, 93, 100, 101, 108, 109, 116, 117, 124,
+ 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61,
+ 62, 69, 70, 77, 78, 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125,
+ 6, 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62,
+ 63, 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126,
+ 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, 96,
+ 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 96, 97, 112,
+ 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66, 81, 82, 97, 98, 113,
+ 2, 2, 3, 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114,
+ 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99, 100, 115,
+ 4, 4, 5, 20, 21, 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116,
+ 5, 5, 6, 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117,
+ 6, 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
+ 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, 104, 119,
+ 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88, 89, 104, 105, 120,
+ 9, 9, 10, 25, 26, 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121,
+ 10, 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
+ 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, 92, 107, 108, 123,
+ 12, 12, 13, 28, 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124,
+ 13, 13, 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, 126,
+ 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13,
+ 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14,
+ 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28,
+ 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29,
+ 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43,
+ 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44,
+ 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58,
+ 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59,
+ 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73,
+ 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74,
+ 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88,
+ 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89,
+ 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96,
+ 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104,
+ 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118,
+ 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2,
+ 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48,
+ 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5,
+ 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21,
+ 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7,
+ 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 112, 112,
+ 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98,
+ 113, 113, 128, 128, 128, 9, 9, 9, 24, 24, 39, 39, 54, 54, 69,
+ 69, 84, 84, 99, 99, 114, 114, 129, 129, 144, 144, 144, 10, 10, 10,
+ 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130,
+ 130, 145, 145, 160, 160, 160, 11, 11, 11, 26, 26, 41, 41, 56, 56,
+ 71, 71, 86, 86, 101, 101, 116, 116, 131, 131, 146, 146, 161, 161, 176,
+ 176, 176, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
+ 102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192,
+ 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88, 88, 103, 103,
+ 118, 118, 133, 133, 148, 148, 163, 163, 178, 178, 193, 193, 208, 208, 208,
+ 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104,
+ 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224,
+ 224, 224, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90, 90, 105, 105,
+ 120, 120, 135, 135, 150, 150, 165, 165, 180, 180, 195, 195, 210, 210, 225,
+ 225, 240, 240, 240, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106,
+ 121, 121, 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226,
+ 226, 241, 241, 256, 256, 256, 47, 62, 62, 77, 77, 92, 92, 107, 107,
+ 122, 122, 137, 137, 152, 152, 167, 167, 182, 182, 197, 197, 212, 212, 227,
+ 227, 242, 242, 257, 257, 272, 272, 272, 63, 78, 78, 93, 93, 108, 108,
+ 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198, 213, 213, 228,
+ 228, 243, 243, 258, 258, 273, 273, 288, 288, 288, 79, 94, 94, 109, 109,
+ 124, 124, 139, 139, 154, 154, 169, 169, 184, 184, 199, 199, 214, 214, 229,
+ 229, 244, 244, 259, 259, 274, 274, 289, 289, 304, 304, 304, 95, 110, 110,
+ 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230,
+ 230, 245, 245, 260, 260, 275, 275, 290, 290, 305, 305, 320, 320, 320, 111,
+ 126, 126, 141, 141, 156, 156, 171, 171, 186, 186, 201, 201, 216, 216, 231,
+ 231, 246, 246, 261, 261, 276, 276, 291, 291, 306, 306, 321, 321, 336, 336,
+ 336, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202, 202, 217, 217, 232,
+ 232, 247, 247, 262, 262, 277, 277, 292, 292, 307, 307, 322, 322, 337, 337,
+ 352, 352, 352, 143, 158, 158, 173, 173, 188, 188, 203, 203, 218, 218, 233,
+ 233, 248, 248, 263, 263, 278, 278, 293, 293, 308, 308, 323, 323, 338, 338,
+ 353, 353, 368, 368, 368, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234,
+ 234, 249, 249, 264, 264, 279, 279, 294, 294, 309, 309, 324, 324, 339, 339,
+ 354, 354, 369, 369, 384, 384, 384, 175, 190, 190, 205, 205, 220, 220, 235,
+ 235, 250, 250, 265, 265, 280, 280, 295, 295, 310, 310, 325, 325, 340, 340,
+ 355, 355, 370, 370, 385, 385, 400, 400, 400, 191, 206, 206, 221, 221, 236,
+ 236, 251, 251, 266, 266, 281, 281, 296, 296, 311, 311, 326, 326, 341, 341,
+ 356, 356, 371, 371, 386, 386, 401, 401, 416, 416, 416, 207, 222, 222, 237,
+ 237, 252, 252, 267, 267, 282, 282, 297, 297, 312, 312, 327, 327, 342, 342,
+ 357, 357, 372, 372, 387, 387, 402, 402, 417, 417, 432, 432, 432, 223, 238,
+ 238, 253, 253, 268, 268, 283, 283, 298, 298, 313, 313, 328, 328, 343, 343,
+ 358, 358, 373, 373, 388, 388, 403, 403, 418, 418, 433, 433, 448, 448, 448,
+ 239, 254, 254, 269, 269, 284, 284, 299, 299, 314, 314, 329, 329, 344, 344,
+ 359, 359, 374, 374, 389, 389, 404, 404, 419, 419, 434, 434, 449, 449, 464,
+ 464, 464, 255, 270, 270, 285, 285, 300, 300, 315, 315, 330, 330, 345, 345,
+ 360, 360, 375, 375, 390, 390, 405, 405, 420, 420, 435, 435, 450, 450, 465,
+ 465, 480, 480, 480, 271, 286, 286, 301, 301, 316, 316, 331, 331, 346, 346,
+ 361, 361, 376, 376, 391, 391, 406, 406, 421, 421, 436, 436, 451, 451, 466,
+ 466, 481, 481, 496, 287, 302, 302, 317, 317, 332, 332, 347, 347, 362, 362,
+ 377, 377, 392, 392, 407, 407, 422, 422, 437, 437, 452, 452, 467, 467, 482,
+ 482, 497, 303, 318, 318, 333, 333, 348, 348, 363, 363, 378, 378, 393, 393,
+ 408, 408, 423, 423, 438, 438, 453, 453, 468, 468, 483, 483, 498, 319, 334,
+ 334, 349, 349, 364, 364, 379, 379, 394, 394, 409, 409, 424, 424, 439, 439,
+ 454, 454, 469, 469, 484, 484, 499, 335, 350, 350, 365, 365, 380, 380, 395,
+ 395, 410, 410, 425, 425, 440, 440, 455, 455, 470, 470, 485, 485, 500, 351,
+ 366, 366, 381, 381, 396, 396, 411, 411, 426, 426, 441, 441, 456, 456, 471,
+ 471, 486, 486, 501, 367, 382, 382, 397, 397, 412, 412, 427, 427, 442, 442,
+ 457, 457, 472, 472, 487, 487, 502, 383, 398, 398, 413, 413, 428, 428, 443,
+ 443, 458, 458, 473, 473, 488, 488, 503, 399, 414, 414, 429, 429, 444, 444,
+ 459, 459, 474, 474, 489, 489, 504, 415, 430, 430, 445, 445, 460, 460, 475,
+ 475, 490, 490, 505, 431, 446, 446, 461, 461, 476, 476, 491, 491, 506, 447,
+ 462, 462, 477, 477, 492, 492, 507, 463, 478, 478, 493, 493, 508, 479, 494,
+ 494, 509, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2,
+ 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96,
+ 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, 5, 5, 5,
+ 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37,
+ 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7,
+ 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 224, 224,
+ 8, 8, 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194,
+ 225, 225, 256, 256, 256, 9, 9, 9, 40, 40, 71, 71, 102, 102, 133,
+ 133, 164, 164, 195, 195, 226, 226, 257, 257, 288, 288, 288, 10, 10, 10,
+ 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196, 227, 227, 258,
+ 258, 289, 289, 320, 320, 320, 11, 11, 11, 42, 42, 73, 73, 104, 104,
+ 135, 135, 166, 166, 197, 197, 228, 228, 259, 259, 290, 290, 321, 321, 352,
+ 352, 352, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167,
+ 198, 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384,
+ 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168, 168, 199, 199,
+ 230, 230, 261, 261, 292, 292, 323, 323, 354, 354, 385, 385, 416, 416, 416,
+ 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169, 169, 200, 200,
+ 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386, 386, 417, 417, 448,
+ 448, 448, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139, 139, 170, 170,
+ 201, 201, 232, 232, 263, 263, 294, 294, 325, 325, 356, 356, 387, 387, 418,
+ 418, 449, 449, 480, 16, 16, 16, 47, 47, 78, 78, 109, 109, 140, 140,
+ 171, 171, 202, 202, 233, 233, 264, 264, 295, 295, 326, 326, 357, 357, 388,
+ 388, 419, 419, 450, 450, 481, 17, 17, 17, 48, 48, 79, 79, 110, 110,
+ 141, 141, 172, 172, 203, 203, 234, 234, 265, 265, 296, 296, 327, 327, 358,
+ 358, 389, 389, 420, 420, 451, 451, 482, 18, 18, 18, 49, 49, 80, 80,
+ 111, 111, 142, 142, 173, 173, 204, 204, 235, 235, 266, 266, 297, 297, 328,
+ 328, 359, 359, 390, 390, 421, 421, 452, 452, 483, 19, 19, 19, 50, 50,
+ 81, 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 236, 267, 267, 298,
+ 298, 329, 329, 360, 360, 391, 391, 422, 422, 453, 453, 484, 20, 20, 20,
+ 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 237, 268,
+ 268, 299, 299, 330, 330, 361, 361, 392, 392, 423, 423, 454, 454, 485, 21,
+ 21, 21, 52, 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238,
+ 238, 269, 269, 300, 300, 331, 331, 362, 362, 393, 393, 424, 424, 455, 455,
+ 486, 22, 22, 22, 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208,
+ 208, 239, 239, 270, 270, 301, 301, 332, 332, 363, 363, 394, 394, 425, 425,
+ 456, 456, 487, 23, 23, 23, 54, 54, 85, 85, 116, 116, 147, 147, 178,
+ 178, 209, 209, 240, 240, 271, 271, 302, 302, 333, 333, 364, 364, 395, 395,
+ 426, 426, 457, 457, 488, 24, 24, 24, 55, 55, 86, 86, 117, 117, 148,
+ 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334, 334, 365, 365,
+ 396, 396, 427, 427, 458, 458, 489, 25, 25, 25, 56, 56, 87, 87, 118,
+ 118, 149, 149, 180, 180, 211, 211, 242, 242, 273, 273, 304, 304, 335, 335,
+ 366, 366, 397, 397, 428, 428, 459, 459, 490, 26, 26, 26, 57, 57, 88,
+ 88, 119, 119, 150, 150, 181, 181, 212, 212, 243, 243, 274, 274, 305, 305,
+ 336, 336, 367, 367, 398, 398, 429, 429, 460, 460, 491, 27, 27, 27, 58,
+ 58, 89, 89, 120, 120, 151, 151, 182, 182, 213, 213, 244, 244, 275, 275,
+ 306, 306, 337, 337, 368, 368, 399, 399, 430, 430, 461, 461, 492, 28, 28,
+ 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, 183, 214, 214, 245, 245,
+ 276, 276, 307, 307, 338, 338, 369, 369, 400, 400, 431, 431, 462, 462, 493,
+ 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153, 184, 184, 215, 215,
+ 246, 246, 277, 277, 308, 308, 339, 339, 370, 370, 401, 401, 432, 432, 463,
+ 463, 494, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185,
+ 216, 216, 247, 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433,
+ 433, 464, 464, 495, 31, 62, 62, 93, 93, 124, 124, 155, 155, 186, 186,
+ 217, 217, 248, 248, 279, 279, 310, 310, 341, 341, 372, 372, 403, 403, 434,
+ 434, 465, 465, 496, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218, 218,
+ 249, 249, 280, 280, 311, 311, 342, 342, 373, 373, 404, 404, 435, 435, 466,
+ 466, 497, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 250, 281, 281,
+ 312, 312, 343, 343, 374, 374, 405, 405, 436, 436, 467, 467, 498, 127, 158,
+ 158, 189, 189, 220, 220, 251, 251, 282, 282, 313, 313, 344, 344, 375, 375,
+ 406, 406, 437, 437, 468, 468, 499, 159, 190, 190, 221, 221, 252, 252, 283,
+ 283, 314, 314, 345, 345, 376, 376, 407, 407, 438, 438, 469, 469, 500, 191,
+ 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377, 377, 408, 408, 439,
+ 439, 470, 470, 501, 223, 254, 254, 285, 285, 316, 316, 347, 347, 378, 378,
+ 409, 409, 440, 440, 471, 471, 502, 255, 286, 286, 317, 317, 348, 348, 379,
+ 379, 410, 410, 441, 441, 472, 472, 503, 287, 318, 318, 349, 349, 380, 380,
+ 411, 411, 442, 442, 473, 473, 504, 319, 350, 350, 381, 381, 412, 412, 443,
+ 443, 474, 474, 505, 351, 382, 382, 413, 413, 444, 444, 475, 475, 506, 383,
+ 414, 414, 445, 445, 476, 476, 507, 415, 446, 446, 477, 477, 508, 447, 478,
+ 478, 509, 479, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96,
+ 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
+ 224, 224, 240, 240, 256, 256, 272, 272, 288, 288, 304, 304, 320, 320, 336,
+ 336, 352, 352, 368, 368, 384, 384, 400, 400, 416, 416, 432, 432, 448, 448,
+ 464, 464, 480, 480, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65,
+ 80, 81, 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192,
+ 193, 208, 209, 224, 225, 240, 241, 256, 257, 272, 273, 288, 289, 304, 305,
+ 320, 321, 336, 337, 352, 353, 368, 369, 384, 385, 400, 401, 416, 417, 432,
+ 433, 448, 449, 464, 465, 480, 481, 496, 1, 1, 2, 17, 18, 33, 34,
+ 49, 50, 65, 66, 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161,
+ 162, 177, 178, 193, 194, 209, 210, 225, 226, 241, 242, 257, 258, 273, 274,
+ 289, 290, 305, 306, 321, 322, 337, 338, 353, 354, 369, 370, 385, 386, 401,
+ 402, 417, 418, 433, 434, 449, 450, 465, 466, 481, 482, 497, 2, 2, 3,
+ 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114, 115, 130,
+ 131, 146, 147, 162, 163, 178, 179, 194, 195, 210, 211, 226, 227, 242, 243,
+ 258, 259, 274, 275, 290, 291, 306, 307, 322, 323, 338, 339, 354, 355, 370,
+ 371, 386, 387, 402, 403, 418, 419, 434, 435, 450, 451, 466, 467, 482, 483,
+ 498, 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99,
+ 100, 115, 116, 131, 132, 147, 148, 163, 164, 179, 180, 195, 196, 211, 212,
+ 227, 228, 243, 244, 259, 260, 275, 276, 291, 292, 307, 308, 323, 324, 339,
+ 340, 355, 356, 371, 372, 387, 388, 403, 404, 419, 420, 435, 436, 451, 452,
+ 467, 468, 483, 484, 499, 4, 4, 5, 20, 21, 36, 37, 52, 53, 68,
+ 69, 84, 85, 100, 101, 116, 117, 132, 133, 148, 149, 164, 165, 180, 181,
+ 196, 197, 212, 213, 228, 229, 244, 245, 260, 261, 276, 277, 292, 293, 308,
+ 309, 324, 325, 340, 341, 356, 357, 372, 373, 388, 389, 404, 405, 420, 421,
+ 436, 437, 452, 453, 468, 469, 484, 485, 500, 5, 5, 6, 21, 22, 37,
+ 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133, 134, 149, 150,
+ 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 246, 261, 262, 277,
+ 278, 293, 294, 309, 310, 325, 326, 341, 342, 357, 358, 373, 374, 389, 390,
+ 405, 406, 421, 422, 437, 438, 453, 454, 469, 470, 485, 486, 501, 6, 6,
+ 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118, 119,
+ 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231, 246,
+ 247, 262, 263, 278, 279, 294, 295, 310, 311, 326, 327, 342, 343, 358, 359,
+ 374, 375, 390, 391, 406, 407, 422, 423, 438, 439, 454, 455, 470, 471, 486,
+ 487, 502, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88,
+ 103, 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215,
+ 216, 231, 232, 247, 248, 263, 264, 279, 280, 295, 296, 311, 312, 327, 328,
+ 343, 344, 359, 360, 375, 376, 391, 392, 407, 408, 423, 424, 439, 440, 455,
+ 456, 471, 472, 487, 488, 503, 8, 8, 9, 24, 25, 40, 41, 56, 57,
+ 72, 73, 88, 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184,
+ 185, 200, 201, 216, 217, 232, 233, 248, 249, 264, 265, 280, 281, 296, 297,
+ 312, 313, 328, 329, 344, 345, 360, 361, 376, 377, 392, 393, 408, 409, 424,
+ 425, 440, 441, 456, 457, 472, 473, 488, 489, 504, 9, 9, 10, 25, 26,
+ 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121, 122, 137, 138, 153,
+ 154, 169, 170, 185, 186, 201, 202, 217, 218, 233, 234, 249, 250, 265, 266,
+ 281, 282, 297, 298, 313, 314, 329, 330, 345, 346, 361, 362, 377, 378, 393,
+ 394, 409, 410, 425, 426, 441, 442, 457, 458, 473, 474, 489, 490, 505, 10,
+ 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
+ 123, 138, 139, 154, 155, 170, 171, 186, 187, 202, 203, 218, 219, 234, 235,
+ 250, 251, 266, 267, 282, 283, 298, 299, 314, 315, 330, 331, 346, 347, 362,
+ 363, 378, 379, 394, 395, 410, 411, 426, 427, 442, 443, 458, 459, 474, 475,
+ 490, 491, 506, 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91,
+ 92, 107, 108, 123, 124, 139, 140, 155, 156, 171, 172, 187, 188, 203, 204,
+ 219, 220, 235, 236, 251, 252, 267, 268, 283, 284, 299, 300, 315, 316, 331,
+ 332, 347, 348, 363, 364, 379, 380, 395, 396, 411, 412, 427, 428, 443, 444,
+ 459, 460, 475, 476, 491, 492, 507, 12, 12, 13, 28, 29, 44, 45, 60,
+ 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141, 156, 157, 172, 173,
+ 188, 189, 204, 205, 220, 221, 236, 237, 252, 253, 268, 269, 284, 285, 300,
+ 301, 316, 317, 332, 333, 348, 349, 364, 365, 380, 381, 396, 397, 412, 413,
+ 428, 429, 444, 445, 460, 461, 476, 477, 492, 493, 508, 13, 13, 14, 29,
+ 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126, 141, 142,
+ 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253, 254, 269,
+ 270, 285, 286, 301, 302, 317, 318, 333, 334, 349, 350, 365, 366, 381, 382,
+ 397, 398, 413, 414, 429, 430, 445, 446, 461, 462, 477, 478, 493, 494, 509,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111,
+ 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
+ 239, 254, 255, 270, 271, 286, 287, 302, 303, 318, 319, 334, 335, 350, 351,
+ 366, 367, 382, 383, 398, 399, 414, 415, 430, 431, 446, 447, 462, 463, 478,
+ 479, 494, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192,
+ 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 416, 416,
+ 448, 448, 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161,
+ 192, 193, 224, 225, 256, 257, 288, 289, 320, 321, 352, 353, 384, 385, 416,
+ 417, 448, 449, 480, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130,
+ 161, 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385,
+ 386, 417, 418, 449, 450, 481, 2, 2, 3, 34, 35, 66, 67, 98, 99,
+ 130, 131, 162, 163, 194, 195, 226, 227, 258, 259, 290, 291, 322, 323, 354,
+ 355, 386, 387, 418, 419, 450, 451, 482, 3, 3, 4, 35, 36, 67, 68,
+ 99, 100, 131, 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323,
+ 324, 355, 356, 387, 388, 419, 420, 451, 452, 483, 4, 4, 5, 36, 37,
+ 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228, 229, 260, 261, 292,
+ 293, 324, 325, 356, 357, 388, 389, 420, 421, 452, 453, 484, 5, 5, 6,
+ 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229, 230, 261,
+ 262, 293, 294, 325, 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 6,
+ 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230,
+ 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422, 423, 454, 455,
+ 486, 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199,
+ 200, 231, 232, 263, 264, 295, 296, 327, 328, 359, 360, 391, 392, 423, 424,
+ 455, 456, 487, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168,
+ 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392, 393,
+ 424, 425, 456, 457, 488, 9, 9, 10, 41, 42, 73, 74, 105, 106, 137,
+ 138, 169, 170, 201, 202, 233, 234, 265, 266, 297, 298, 329, 330, 361, 362,
+ 393, 394, 425, 426, 457, 458, 489, 10, 10, 11, 42, 43, 74, 75, 106,
+ 107, 138, 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331,
+ 362, 363, 394, 395, 426, 427, 458, 459, 490, 11, 11, 12, 43, 44, 75,
+ 76, 107, 108, 139, 140, 171, 172, 203, 204, 235, 236, 267, 268, 299, 300,
+ 331, 332, 363, 364, 395, 396, 427, 428, 459, 460, 491, 12, 12, 13, 44,
+ 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269,
+ 300, 301, 332, 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 13, 13,
+ 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237, 238,
+ 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429, 430, 461, 462, 493,
+ 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207,
+ 238, 239, 270, 271, 302, 303, 334, 335, 366, 367, 398, 399, 430, 431, 462,
+ 463, 494, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176,
+ 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, 400, 431,
+ 432, 463, 464, 495, 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145,
+ 176, 177, 208, 209, 240, 241, 272, 273, 304, 305, 336, 337, 368, 369, 400,
+ 401, 432, 433, 464, 465, 496, 17, 17, 18, 49, 50, 81, 82, 113, 114,
+ 145, 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369,
+ 370, 401, 402, 433, 434, 465, 466, 497, 18, 18, 19, 50, 51, 82, 83,
+ 114, 115, 146, 147, 178, 179, 210, 211, 242, 243, 274, 275, 306, 307, 338,
+ 339, 370, 371, 402, 403, 434, 435, 466, 467, 498, 19, 19, 20, 51, 52,
+ 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307,
+ 308, 339, 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 20, 20, 21,
+ 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244, 245, 276,
+ 277, 308, 309, 340, 341, 372, 373, 404, 405, 436, 437, 468, 469, 500, 21,
+ 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245,
+ 246, 277, 278, 309, 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470,
+ 501, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214,
+ 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, 407, 438, 439,
+ 470, 471, 502, 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183,
+ 184, 215, 216, 247, 248, 279, 280, 311, 312, 343, 344, 375, 376, 407, 408,
+ 439, 440, 471, 472, 503, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152,
+ 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376, 377,
+ 408, 409, 440, 441, 472, 473, 504, 25, 25, 26, 57, 58, 89, 90, 121,
+ 122, 153, 154, 185, 186, 217, 218, 249, 250, 281, 282, 313, 314, 345, 346,
+ 377, 378, 409, 410, 441, 442, 473, 474, 505, 26, 26, 27, 58, 59, 90,
+ 91, 122, 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315,
+ 346, 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 27, 27, 28, 59,
+ 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251, 252, 283, 284,
+ 315, 316, 347, 348, 379, 380, 411, 412, 443, 444, 475, 476, 507, 28, 28,
+ 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253,
+ 284, 285, 316, 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508,
+ 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222,
+ 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, 414, 445, 446, 477,
+ 478, 509, 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191,
+ 222, 223, 254, 255, 286, 287, 318, 319, 350, 351, 382, 383, 414, 415, 446,
+ 447, 478, 479, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
+ 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
+ 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
+ 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
+ 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
+ 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
+ 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
+ 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
+ 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
+ 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
+ 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
+ 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
+ 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
+ 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
+ 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
+ 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
+ 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
+ 239, 254, 240, 240, 241, 256, 242, 257, 243, 258, 244, 259, 245, 260, 246,
+ 261, 247, 262, 248, 263, 249, 264, 250, 265, 251, 266, 252, 267, 253, 268,
+ 254, 269, 255, 270, 256, 256, 257, 272, 258, 273, 259, 274, 260, 275, 261,
+ 276, 262, 277, 263, 278, 264, 279, 265, 280, 266, 281, 267, 282, 268, 283,
+ 269, 284, 270, 285, 271, 286, 272, 272, 273, 288, 274, 289, 275, 290, 276,
+ 291, 277, 292, 278, 293, 279, 294, 280, 295, 281, 296, 282, 297, 283, 298,
+ 284, 299, 285, 300, 286, 301, 287, 302, 288, 288, 289, 304, 290, 305, 291,
+ 306, 292, 307, 293, 308, 294, 309, 295, 310, 296, 311, 297, 312, 298, 313,
+ 299, 314, 300, 315, 301, 316, 302, 317, 303, 318, 304, 304, 305, 320, 306,
+ 321, 307, 322, 308, 323, 309, 324, 310, 325, 311, 326, 312, 327, 313, 328,
+ 314, 329, 315, 330, 316, 331, 317, 332, 318, 333, 319, 334, 320, 320, 321,
+ 336, 322, 337, 323, 338, 324, 339, 325, 340, 326, 341, 327, 342, 328, 343,
+ 329, 344, 330, 345, 331, 346, 332, 347, 333, 348, 334, 349, 335, 350, 336,
+ 336, 337, 352, 338, 353, 339, 354, 340, 355, 341, 356, 342, 357, 343, 358,
+ 344, 359, 345, 360, 346, 361, 347, 362, 348, 363, 349, 364, 350, 365, 351,
+ 366, 352, 352, 353, 368, 354, 369, 355, 370, 356, 371, 357, 372, 358, 373,
+ 359, 374, 360, 375, 361, 376, 362, 377, 363, 378, 364, 379, 365, 380, 366,
+ 381, 367, 382, 368, 368, 369, 384, 370, 385, 371, 386, 372, 387, 373, 388,
+ 374, 389, 375, 390, 376, 391, 377, 392, 378, 393, 379, 394, 380, 395, 381,
+ 396, 382, 397, 383, 398, 384, 384, 385, 400, 386, 401, 387, 402, 388, 403,
+ 389, 404, 390, 405, 391, 406, 392, 407, 393, 408, 394, 409, 395, 410, 396,
+ 411, 397, 412, 398, 413, 399, 414, 400, 400, 401, 416, 402, 417, 403, 418,
+ 404, 419, 405, 420, 406, 421, 407, 422, 408, 423, 409, 424, 410, 425, 411,
+ 426, 412, 427, 413, 428, 414, 429, 415, 430, 416, 416, 417, 432, 418, 433,
+ 419, 434, 420, 435, 421, 436, 422, 437, 423, 438, 424, 439, 425, 440, 426,
+ 441, 427, 442, 428, 443, 429, 444, 430, 445, 431, 446, 432, 432, 433, 448,
+ 434, 449, 435, 450, 436, 451, 437, 452, 438, 453, 439, 454, 440, 455, 441,
+ 456, 442, 457, 443, 458, 444, 459, 445, 460, 446, 461, 447, 462, 448, 448,
+ 449, 464, 450, 465, 451, 466, 452, 467, 453, 468, 454, 469, 455, 470, 456,
+ 471, 457, 472, 458, 473, 459, 474, 460, 475, 461, 476, 462, 477, 463, 478,
+ 464, 464, 465, 480, 466, 481, 467, 482, 468, 483, 469, 484, 470, 485, 471,
+ 486, 472, 487, 473, 488, 474, 489, 475, 490, 476, 491, 477, 492, 478, 493,
+ 479, 494, 480, 480, 481, 496, 482, 497, 483, 498, 484, 499, 485, 500, 486,
+ 501, 487, 502, 488, 503, 489, 504, 490, 505, 491, 506, 492, 507, 493, 508,
+ 494, 509, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
+ 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5,
+ 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43,
+ 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20,
+ 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58,
+ 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35,
+ 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73,
+ 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50,
+ 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88,
+ 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65,
+ 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
+ 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80,
+ 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118,
+ 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95,
+ 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133,
+ 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110,
+ 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
+ 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125,
+ 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163,
+ 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140,
+ 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178,
+ 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155,
+ 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193,
+ 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170,
+ 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
+ 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185,
+ 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192,
+ 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200,
+ 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238,
+ 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215,
+ 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253,
+ 223, 254, 224, 224, 225, 256, 226, 257, 227, 258, 228, 259, 229, 260, 230,
+ 261, 231, 262, 232, 263, 233, 264, 234, 265, 235, 266, 236, 267, 237, 268,
+ 238, 269, 239, 270, 240, 271, 241, 272, 242, 273, 243, 274, 244, 275, 245,
+ 276, 246, 277, 247, 278, 248, 279, 249, 280, 250, 281, 251, 282, 252, 283,
+ 253, 284, 254, 285, 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260,
+ 291, 261, 292, 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298,
+ 268, 299, 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275,
+ 306, 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313,
+ 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320, 290,
+ 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327, 297, 328,
+ 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334, 304, 335, 305,
+ 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341, 311, 342, 312, 343,
+ 313, 344, 314, 345, 315, 346, 316, 347, 317, 348, 318, 349, 319, 350, 320,
+ 320, 321, 352, 322, 353, 323, 354, 324, 355, 325, 356, 326, 357, 327, 358,
+ 328, 359, 329, 360, 330, 361, 331, 362, 332, 363, 333, 364, 334, 365, 335,
+ 366, 336, 367, 337, 368, 338, 369, 339, 370, 340, 371, 341, 372, 342, 373,
+ 343, 374, 344, 375, 345, 376, 346, 377, 347, 378, 348, 379, 349, 380, 350,
+ 381, 351, 382, 352, 352, 353, 384, 354, 385, 355, 386, 356, 387, 357, 388,
+ 358, 389, 359, 390, 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365,
+ 396, 366, 397, 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403,
+ 373, 404, 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380,
+ 411, 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418,
+ 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425, 395,
+ 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432, 402, 433,
+ 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439, 409, 440, 410,
+ 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446, 416, 416, 417, 448,
+ 418, 449, 419, 450, 420, 451, 421, 452, 422, 453, 423, 454, 424, 455, 425,
+ 456, 426, 457, 427, 458, 428, 459, 429, 460, 430, 461, 431, 462, 432, 463,
+ 433, 464, 434, 465, 435, 466, 436, 467, 437, 468, 438, 469, 439, 470, 440,
+ 471, 441, 472, 442, 473, 443, 474, 444, 475, 445, 476, 446, 477, 447, 478,
+ 448, 448, 449, 480, 450, 481, 451, 482, 452, 483, 453, 484, 454, 485, 455,
+ 486, 456, 487, 457, 488, 458, 489, 459, 490, 460, 491, 461, 492, 462, 493,
+ 463, 494, 464, 495, 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470,
+ 501, 471, 502, 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508,
+ 478, 509, 479, 510, 0, 0
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96,
+ 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
+ 224, 224, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81,
+ 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192, 193, 208,
+ 209, 224, 225, 240, 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66,
+ 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161, 162, 177, 178, 193,
+ 194, 209, 210, 225, 226, 241, 2, 2, 3, 18, 19, 34, 35, 50, 51,
+ 66, 67, 82, 83, 98, 99, 114, 115, 130, 131, 146, 147, 162, 163, 178,
+ 179, 194, 195, 210, 211, 226, 227, 242, 3, 3, 4, 19, 20, 35, 36,
+ 51, 52, 67, 68, 83, 84, 99, 100, 115, 116, 131, 132, 147, 148, 163,
+ 164, 179, 180, 195, 196, 211, 212, 227, 228, 243, 4, 4, 5, 20, 21,
+ 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116, 117, 132, 133, 148,
+ 149, 164, 165, 180, 181, 196, 197, 212, 213, 228, 229, 244, 5, 5, 6,
+ 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133,
+ 134, 149, 150, 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 6,
+ 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
+ 119, 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231,
+ 246, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103,
+ 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, 216,
+ 231, 232, 247, 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88,
+ 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184, 185, 200, 201,
+ 216, 217, 232, 233, 248, 9, 9, 10, 25, 26, 41, 42, 57, 58, 73,
+ 74, 89, 90, 105, 106, 121, 122, 137, 138, 153, 154, 169, 170, 185, 186,
+ 201, 202, 217, 218, 233, 234, 249, 10, 10, 11, 26, 27, 42, 43, 58,
+ 59, 74, 75, 90, 91, 106, 107, 122, 123, 138, 139, 154, 155, 170, 171,
+ 186, 187, 202, 203, 218, 219, 234, 235, 250, 11, 11, 12, 27, 28, 43,
+ 44, 59, 60, 75, 76, 91, 92, 107, 108, 123, 124, 139, 140, 155, 156,
+ 171, 172, 187, 188, 203, 204, 219, 220, 235, 236, 251, 12, 12, 13, 28,
+ 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141,
+ 156, 157, 172, 173, 188, 189, 204, 205, 220, 221, 236, 237, 252, 13, 13,
+ 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126,
+ 141, 142, 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111,
+ 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
+ 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
+ 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
+ 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
+ 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
+ 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
+ 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
+ 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
+ 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
+ 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
+ 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
+ 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
+ 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
+ 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
+ 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
+ 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
+ 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
+ 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
+ 239, 254, 0, 0,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t,
+ col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 16, 0, 48, 48, 1, 16, 64,
+ 64, 17, 32, 80, 80, 33, 48, 17, 1, 49, 64, 96, 96, 2, 17,
+ 65, 80, 18, 33, 112, 112, 34, 49, 81, 96, 18, 2, 50, 65, 128,
+ 128, 3, 18, 97, 112, 19, 34, 66, 81, 144, 144, 82, 97, 35, 50,
+ 113, 128, 19, 3, 51, 66, 160, 160, 4, 19, 98, 113, 129, 144, 67,
+ 82, 20, 35, 83, 98, 114, 129, 36, 51, 176, 176, 20, 4, 145, 160,
+ 52, 67, 99, 114, 5, 20, 130, 145, 68, 83, 192, 192, 161, 176, 21,
+ 36, 115, 130, 84, 99, 37, 52, 146, 161, 208, 208, 53, 68, 21, 5,
+ 100, 115, 177, 192, 131, 146, 69, 84, 6, 21, 224, 224, 116, 131, 22,
+ 37, 162, 177, 85, 100, 147, 162, 38, 53, 193, 208, 101, 116, 54, 69,
+ 22, 6, 132, 147, 178, 193, 70, 85, 163, 178, 209, 224, 7, 22, 117,
+ 132, 23, 38, 148, 163, 23, 7, 86, 101, 194, 209, 225, 240, 39, 54,
+ 179, 194, 102, 117, 133, 148, 55, 70, 164, 179, 8, 23, 71, 86, 210,
+ 225, 118, 133, 149, 164, 195, 210, 24, 39, 87, 102, 40, 55, 56, 71,
+ 134, 149, 180, 195, 226, 241, 103, 118, 24, 8, 165, 180, 211, 226, 72,
+ 87, 150, 165, 9, 24, 119, 134, 25, 40, 88, 103, 196, 211, 41, 56,
+ 135, 150, 181, 196, 104, 119, 57, 72, 227, 242, 166, 181, 120, 135, 151,
+ 166, 197, 212, 73, 88, 25, 9, 212, 227, 89, 104, 136, 151, 182, 197,
+ 10, 25, 26, 41, 105, 120, 167, 182, 228, 243, 152, 167, 42, 57, 121,
+ 136, 213, 228, 58, 73, 198, 213, 74, 89, 137, 152, 183, 198, 168, 183,
+ 26, 10, 90, 105, 229, 244, 11, 26, 106, 121, 214, 229, 153, 168, 27,
+ 42, 199, 214, 43, 58, 184, 199, 122, 137, 169, 184, 230, 245, 59, 74,
+ 27, 11, 75, 90, 138, 153, 200, 215, 215, 230, 91, 106, 12, 27, 28,
+ 43, 185, 200, 107, 122, 154, 169, 44, 59, 231, 246, 216, 231, 60, 75,
+ 123, 138, 28, 12, 76, 91, 201, 216, 170, 185, 232, 247, 139, 154, 92,
+ 107, 13, 28, 108, 123, 29, 44, 186, 201, 217, 232, 155, 170, 45, 60,
+ 29, 13, 61, 76, 124, 139, 14, 14, 233, 248, 77, 92, 14, 29, 171,
+ 186, 140, 155, 202, 217, 30, 45, 93, 108, 109, 124, 46, 61, 156, 171,
+ 62, 77, 187, 202, 15, 30, 125, 140, 218, 233, 78, 93, 31, 46, 172,
+ 187, 47, 62, 141, 156, 94, 109, 234, 249, 203, 218, 63, 78, 110, 125,
+ 188, 203, 157, 172, 126, 141, 79, 94, 173, 188, 95, 110, 219, 234, 142,
+ 157, 204, 219, 235, 250, 111, 126, 158, 173, 127, 142, 189, 204, 220, 235,
+ 143, 158, 174, 189, 205, 220, 236, 251, 159, 174, 190, 205, 221, 236, 175,
+ 190, 237, 252, 206, 221, 222, 237, 191, 206, 238, 253, 207, 222, 223, 238,
+ 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 16, 3, 3, 2,
+ 17, 16, 17, 4, 4, 17, 32, 3, 18, 5, 5, 18, 33, 32, 33,
+ 4, 19, 33, 48, 6, 6, 19, 34, 5, 20, 34, 49, 48, 49, 7,
+ 7, 20, 35, 49, 64, 6, 21, 35, 50, 21, 36, 64, 65, 8, 8,
+ 50, 65, 36, 51, 7, 22, 22, 37, 65, 80, 51, 66, 9, 9, 37,
+ 52, 8, 23, 66, 81, 52, 67, 80, 81, 23, 38, 10, 10, 38, 53,
+ 67, 82, 81, 96, 53, 68, 9, 24, 82, 97, 68, 83, 24, 39, 96,
+ 97, 39, 54, 11, 11, 54, 69, 83, 98, 97, 112, 69, 84, 10, 25,
+ 25, 40, 40, 55, 98, 113, 84, 99, 12, 12, 55, 70, 112, 113, 70,
+ 85, 11, 26, 99, 114, 85, 100, 113, 128, 26, 41, 41, 56, 56, 71,
+ 100, 115, 13, 13, 71, 86, 114, 129, 86, 101, 128, 129, 57, 72, 115,
+ 130, 101, 116, 12, 27, 42, 57, 14, 14, 72, 87, 27, 42, 129, 144,
+ 87, 102, 116, 131, 130, 145, 102, 117, 58, 73, 144, 145, 73, 88, 117,
+ 132, 88, 103, 13, 28, 43, 58, 131, 146, 103, 118, 28, 43, 145, 160,
+ 132, 147, 74, 89, 89, 104, 118, 133, 146, 161, 104, 119, 160, 161, 59,
+ 74, 119, 134, 133, 148, 14, 29, 44, 59, 147, 162, 161, 176, 29, 44,
+ 105, 120, 75, 90, 90, 105, 148, 163, 162, 177, 134, 149, 176, 177, 120,
+ 135, 149, 164, 163, 178, 15, 30, 135, 150, 177, 192, 60, 75, 106, 121,
+ 45, 60, 121, 136, 178, 193, 91, 106, 136, 151, 164, 179, 192, 193, 30,
+ 45, 150, 165, 151, 166, 179, 194, 76, 91, 165, 180, 122, 137, 193, 208,
+ 107, 122, 137, 152, 208, 209, 180, 195, 61, 76, 152, 167, 194, 209, 166,
+ 181, 224, 224, 92, 107, 181, 196, 46, 61, 138, 153, 209, 224, 167, 182,
+ 153, 168, 195, 210, 31, 46, 123, 138, 77, 92, 168, 183, 210, 225, 196,
+ 211, 225, 240, 182, 197, 154, 169, 108, 123, 139, 154, 183, 198, 62, 77,
+ 197, 212, 169, 184, 93, 108, 211, 226, 184, 199, 47, 62, 212, 227, 226,
+ 241, 124, 139, 198, 213, 155, 170, 170, 185, 140, 155, 213, 228, 227, 242,
+ 109, 124, 78, 93, 185, 200, 228, 243, 199, 214, 200, 215, 214, 229, 125,
+ 140, 171, 186, 186, 201, 63, 78, 156, 171, 94, 109, 141, 156, 229, 244,
+ 201, 216, 215, 230, 79, 94, 230, 245, 216, 231, 110, 125, 187, 202, 231,
+ 246, 217, 232, 157, 172, 202, 217, 126, 141, 95, 110, 142, 157, 172, 187,
+ 232, 247, 111, 126, 218, 233, 203, 218, 233, 248, 173, 188, 188, 203, 127,
+ 142, 158, 173, 143, 158, 234, 249, 219, 234, 189, 204, 204, 219, 159, 174,
+ 174, 189, 235, 250, 205, 220, 175, 190, 190, 205, 220, 235, 191, 206, 221,
+ 236, 236, 251, 206, 221, 237, 252, 207, 222, 222, 237, 223, 238, 238, 253,
+ 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 0, 16, 16, 1, 16, 17, 1, 32, 32, 17,
+ 32, 2, 17, 18, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64,
+ 64, 65, 34, 49, 19, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80,
+ 81, 35, 50, 20, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 97,
+ 5, 20, 36, 51, 82, 97, 21, 36, 67, 82, 97, 112, 21, 5, 52,
+ 67, 112, 113, 37, 52, 6, 21, 83, 98, 98, 113, 68, 83, 22, 6,
+ 113, 128, 22, 37, 53, 68, 84, 99, 99, 114, 128, 129, 114, 129, 69,
+ 84, 38, 53, 7, 22, 23, 7, 129, 144, 23, 38, 54, 69, 100, 115,
+ 85, 100, 115, 130, 144, 145, 130, 145, 39, 54, 70, 85, 8, 23, 55,
+ 70, 116, 131, 101, 116, 145, 160, 24, 39, 24, 8, 86, 101, 131, 146,
+ 160, 161, 146, 161, 71, 86, 40, 55, 9, 24, 117, 132, 102, 117, 161,
+ 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162, 25, 9, 176, 177,
+ 162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118, 10, 25, 148,
+ 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 193, 26, 10,
+ 119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193, 164,
+ 179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42,
+ 74, 89, 208, 209, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43,
+ 58, 27, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136,
+ 209, 224, 195, 210, 224, 225, 166, 181, 106, 121, 75, 90, 12, 27, 181,
+ 196, 28, 12, 210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211,
+ 122, 137, 91, 106, 225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168,
+ 183, 211, 226, 153, 168, 226, 241, 60, 75, 197, 212, 138, 153, 29, 44,
+ 76, 91, 29, 13, 183, 198, 123, 138, 45, 60, 212, 227, 198, 213, 154,
+ 169, 169, 184, 227, 242, 92, 107, 61, 76, 139, 154, 14, 29, 30, 14,
+ 184, 199, 213, 228, 108, 123, 199, 214, 228, 243, 77, 92, 30, 45, 170,
+ 185, 155, 170, 185, 200, 93, 108, 124, 139, 214, 229, 46, 61, 200, 215,
+ 229, 244, 15, 30, 109, 124, 62, 77, 140, 155, 215, 230, 31, 46, 171,
+ 186, 186, 201, 201, 216, 78, 93, 230, 245, 125, 140, 47, 62, 216, 231,
+ 156, 171, 94, 109, 231, 246, 141, 156, 63, 78, 202, 217, 187, 202, 110,
+ 125, 217, 232, 172, 187, 232, 247, 79, 94, 157, 172, 126, 141, 203, 218,
+ 95, 110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203, 234,
+ 249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235, 250,
+ 174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205, 236,
+ 251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223, 238,
+ 239, 254, 0, 0,
+};
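+
+/* Sizing note: the 16x16 tables are declared 257 * MAX_NEIGHBORS (256
+ * coefficients plus one trailing { 0, 0 } pair) and the 32x32 tables below
+ * 1025 * MAX_NEIGHBORS.  The extra pair is presumably padding so the context
+ * lookup can be indexed one position past the last coded coefficient (e.g.
+ * around EOB handling) without reading out of bounds. */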
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160,
+ 192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384,
+ 416, 416, 448, 448, 480, 480, 512, 512, 544, 544, 576, 576, 608, 608,
+ 640, 640, 672, 672, 704, 704, 736, 736, 768, 768, 800, 800, 832, 832,
+ 864, 864, 896, 896, 928, 928, 960, 960, 0, 0, 1, 32, 33, 64,
+ 65, 96, 97, 128, 129, 160, 161, 192, 193, 224, 225, 256, 257, 288,
+ 289, 320, 321, 352, 353, 384, 385, 416, 417, 448, 449, 480, 481, 512,
+ 513, 544, 545, 576, 577, 608, 609, 640, 641, 672, 673, 704, 705, 736,
+ 737, 768, 769, 800, 801, 832, 833, 864, 865, 896, 897, 928, 929, 960,
+ 961, 992, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161,
+ 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385,
+ 386, 417, 418, 449, 450, 481, 482, 513, 514, 545, 546, 577, 578, 609,
+ 610, 641, 642, 673, 674, 705, 706, 737, 738, 769, 770, 801, 802, 833,
+ 834, 865, 866, 897, 898, 929, 930, 961, 962, 993, 2, 2, 3, 34,
+ 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226, 227, 258,
+ 259, 290, 291, 322, 323, 354, 355, 386, 387, 418, 419, 450, 451, 482,
+ 483, 514, 515, 546, 547, 578, 579, 610, 611, 642, 643, 674, 675, 706,
+ 707, 738, 739, 770, 771, 802, 803, 834, 835, 866, 867, 898, 899, 930,
+ 931, 962, 963, 994, 3, 3, 4, 35, 36, 67, 68, 99, 100, 131,
+ 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323, 324, 355,
+ 356, 387, 388, 419, 420, 451, 452, 483, 484, 515, 516, 547, 548, 579,
+ 580, 611, 612, 643, 644, 675, 676, 707, 708, 739, 740, 771, 772, 803,
+ 804, 835, 836, 867, 868, 899, 900, 931, 932, 963, 964, 995, 4, 4,
+ 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228,
+ 229, 260, 261, 292, 293, 324, 325, 356, 357, 388, 389, 420, 421, 452,
+ 453, 484, 485, 516, 517, 548, 549, 580, 581, 612, 613, 644, 645, 676,
+ 677, 708, 709, 740, 741, 772, 773, 804, 805, 836, 837, 868, 869, 900,
+ 901, 932, 933, 964, 965, 996, 5, 5, 6, 37, 38, 69, 70, 101,
+ 102, 133, 134, 165, 166, 197, 198, 229, 230, 261, 262, 293, 294, 325,
+ 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 486, 517, 518, 549,
+ 550, 581, 582, 613, 614, 645, 646, 677, 678, 709, 710, 741, 742, 773,
+ 774, 805, 806, 837, 838, 869, 870, 901, 902, 933, 934, 965, 966, 997,
+ 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198,
+ 199, 230, 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422,
+ 423, 454, 455, 486, 487, 518, 519, 550, 551, 582, 583, 614, 615, 646,
+ 647, 678, 679, 710, 711, 742, 743, 774, 775, 806, 807, 838, 839, 870,
+ 871, 902, 903, 934, 935, 966, 967, 998, 7, 7, 8, 39, 40, 71,
+ 72, 103, 104, 135, 136, 167, 168, 199, 200, 231, 232, 263, 264, 295,
+ 296, 327, 328, 359, 360, 391, 392, 423, 424, 455, 456, 487, 488, 519,
+ 520, 551, 552, 583, 584, 615, 616, 647, 648, 679, 680, 711, 712, 743,
+ 744, 775, 776, 807, 808, 839, 840, 871, 872, 903, 904, 935, 936, 967,
+ 968, 999, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168,
+ 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392,
+ 393, 424, 425, 456, 457, 488, 489, 520, 521, 552, 553, 584, 585, 616,
+ 617, 648, 649, 680, 681, 712, 713, 744, 745, 776, 777, 808, 809, 840,
+ 841, 872, 873, 904, 905, 936, 937, 968, 969, 1000, 9, 9, 10, 41,
+ 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233, 234, 265,
+ 266, 297, 298, 329, 330, 361, 362, 393, 394, 425, 426, 457, 458, 489,
+ 490, 521, 522, 553, 554, 585, 586, 617, 618, 649, 650, 681, 682, 713,
+ 714, 745, 746, 777, 778, 809, 810, 841, 842, 873, 874, 905, 906, 937,
+ 938, 969, 970, 1001, 10, 10, 11, 42, 43, 74, 75, 106, 107, 138,
+ 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331, 362,
+ 363, 394, 395, 426, 427, 458, 459, 490, 491, 522, 523, 554, 555, 586,
+ 587, 618, 619, 650, 651, 682, 683, 714, 715, 746, 747, 778, 779, 810,
+ 811, 842, 843, 874, 875, 906, 907, 938, 939, 970, 971, 1002, 11, 11,
+ 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235,
+ 236, 267, 268, 299, 300, 331, 332, 363, 364, 395, 396, 427, 428, 459,
+ 460, 491, 492, 523, 524, 555, 556, 587, 588, 619, 620, 651, 652, 683,
+ 684, 715, 716, 747, 748, 779, 780, 811, 812, 843, 844, 875, 876, 907,
+ 908, 939, 940, 971, 972, 1003, 12, 12, 13, 44, 45, 76, 77, 108,
+ 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269, 300, 301, 332,
+ 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 493, 524, 525, 556,
+ 557, 588, 589, 620, 621, 652, 653, 684, 685, 716, 717, 748, 749, 780,
+ 781, 812, 813, 844, 845, 876, 877, 908, 909, 940, 941, 972, 973, 1004,
+ 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205,
+ 206, 237, 238, 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429,
+ 430, 461, 462, 493, 494, 525, 526, 557, 558, 589, 590, 621, 622, 653,
+ 654, 685, 686, 717, 718, 749, 750, 781, 782, 813, 814, 845, 846, 877,
+ 878, 909, 910, 941, 942, 973, 974, 1005, 14, 14, 15, 46, 47, 78,
+ 79, 110, 111, 142, 143, 174, 175, 206, 207, 238, 239, 270, 271, 302,
+ 303, 334, 335, 366, 367, 398, 399, 430, 431, 462, 463, 494, 495, 526,
+ 527, 558, 559, 590, 591, 622, 623, 654, 655, 686, 687, 718, 719, 750,
+ 751, 782, 783, 814, 815, 846, 847, 878, 879, 910, 911, 942, 943, 974,
+ 975, 1006, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175,
+ 176, 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399,
+ 400, 431, 432, 463, 464, 495, 496, 527, 528, 559, 560, 591, 592, 623,
+ 624, 655, 656, 687, 688, 719, 720, 751, 752, 783, 784, 815, 816, 847,
+ 848, 879, 880, 911, 912, 943, 944, 975, 976, 1007, 16, 16, 17, 48,
+ 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240, 241, 272,
+ 273, 304, 305, 336, 337, 368, 369, 400, 401, 432, 433, 464, 465, 496,
+ 497, 528, 529, 560, 561, 592, 593, 624, 625, 656, 657, 688, 689, 720,
+ 721, 752, 753, 784, 785, 816, 817, 848, 849, 880, 881, 912, 913, 944,
+ 945, 976, 977, 1008, 17, 17, 18, 49, 50, 81, 82, 113, 114, 145,
+ 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369,
+ 370, 401, 402, 433, 434, 465, 466, 497, 498, 529, 530, 561, 562, 593,
+ 594, 625, 626, 657, 658, 689, 690, 721, 722, 753, 754, 785, 786, 817,
+ 818, 849, 850, 881, 882, 913, 914, 945, 946, 977, 978, 1009, 18, 18,
+ 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242,
+ 243, 274, 275, 306, 307, 338, 339, 370, 371, 402, 403, 434, 435, 466,
+ 467, 498, 499, 530, 531, 562, 563, 594, 595, 626, 627, 658, 659, 690,
+ 691, 722, 723, 754, 755, 786, 787, 818, 819, 850, 851, 882, 883, 914,
+ 915, 946, 947, 978, 979, 1010, 19, 19, 20, 51, 52, 83, 84, 115,
+ 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307, 308, 339,
+ 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 500, 531, 532, 563,
+ 564, 595, 596, 627, 628, 659, 660, 691, 692, 723, 724, 755, 756, 787,
+ 788, 819, 820, 851, 852, 883, 884, 915, 916, 947, 948, 979, 980, 1011,
+ 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212,
+ 213, 244, 245, 276, 277, 308, 309, 340, 341, 372, 373, 404, 405, 436,
+ 437, 468, 469, 500, 501, 532, 533, 564, 565, 596, 597, 628, 629, 660,
+ 661, 692, 693, 724, 725, 756, 757, 788, 789, 820, 821, 852, 853, 884,
+ 885, 916, 917, 948, 949, 980, 981, 1012, 21, 21, 22, 53, 54, 85,
+ 86, 117, 118, 149, 150, 181, 182, 213, 214, 245, 246, 277, 278, 309,
+ 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470, 501, 502, 533,
+ 534, 565, 566, 597, 598, 629, 630, 661, 662, 693, 694, 725, 726, 757,
+ 758, 789, 790, 821, 822, 853, 854, 885, 886, 917, 918, 949, 950, 981,
+ 982, 1013, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182,
+ 183, 214, 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406,
+ 407, 438, 439, 470, 471, 502, 503, 534, 535, 566, 567, 598, 599, 630,
+ 631, 662, 663, 694, 695, 726, 727, 758, 759, 790, 791, 822, 823, 854,
+ 855, 886, 887, 918, 919, 950, 951, 982, 983, 1014, 23, 23, 24, 55,
+ 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247, 248, 279,
+ 280, 311, 312, 343, 344, 375, 376, 407, 408, 439, 440, 471, 472, 503,
+ 504, 535, 536, 567, 568, 599, 600, 631, 632, 663, 664, 695, 696, 727,
+ 728, 759, 760, 791, 792, 823, 824, 855, 856, 887, 888, 919, 920, 951,
+ 952, 983, 984, 1015, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152,
+ 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376,
+ 377, 408, 409, 440, 441, 472, 473, 504, 505, 536, 537, 568, 569, 600,
+ 601, 632, 633, 664, 665, 696, 697, 728, 729, 760, 761, 792, 793, 824,
+ 825, 856, 857, 888, 889, 920, 921, 952, 953, 984, 985, 1016, 25, 25,
+ 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249,
+ 250, 281, 282, 313, 314, 345, 346, 377, 378, 409, 410, 441, 442, 473,
+ 474, 505, 506, 537, 538, 569, 570, 601, 602, 633, 634, 665, 666, 697,
+ 698, 729, 730, 761, 762, 793, 794, 825, 826, 857, 858, 889, 890, 921,
+ 922, 953, 954, 985, 986, 1017, 26, 26, 27, 58, 59, 90, 91, 122,
+ 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315, 346,
+ 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 507, 538, 539, 570,
+ 571, 602, 603, 634, 635, 666, 667, 698, 699, 730, 731, 762, 763, 794,
+ 795, 826, 827, 858, 859, 890, 891, 922, 923, 954, 955, 986, 987, 1018,
+ 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219,
+ 220, 251, 252, 283, 284, 315, 316, 347, 348, 379, 380, 411, 412, 443,
+ 444, 475, 476, 507, 508, 539, 540, 571, 572, 603, 604, 635, 636, 667,
+ 668, 699, 700, 731, 732, 763, 764, 795, 796, 827, 828, 859, 860, 891,
+ 892, 923, 924, 955, 956, 987, 988, 1019, 28, 28, 29, 60, 61, 92,
+ 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253, 284, 285, 316,
+ 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508, 509, 540,
+ 541, 572, 573, 604, 605, 636, 637, 668, 669, 700, 701, 732, 733, 764,
+ 765, 796, 797, 828, 829, 860, 861, 892, 893, 924, 925, 956, 957, 988,
+ 989, 1020, 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189,
+ 190, 221, 222, 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413,
+ 414, 445, 446, 477, 478, 509, 510, 541, 542, 573, 574, 605, 606, 637,
+ 638, 669, 670, 701, 702, 733, 734, 765, 766, 797, 798, 829, 830, 861,
+ 862, 893, 894, 925, 926, 957, 958, 989, 990, 1021, 30, 30, 31, 62,
+ 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254, 255, 286,
+ 287, 318, 319, 350, 351, 382, 383, 414, 415, 446, 447, 478, 479, 510,
+ 511, 542, 543, 574, 575, 606, 607, 638, 639, 670, 671, 702, 703, 734,
+ 735, 766, 767, 798, 799, 830, 831, 862, 863, 894, 895, 926, 927, 958,
+ 959, 990, 991, 1022, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+ 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
+ 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
+ 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26,
+ 27, 27, 28, 28, 29, 29, 30, 30, 0, 0, 1, 32, 2, 33,
+ 3, 34, 4, 35, 5, 36, 6, 37, 7, 38, 8, 39, 9, 40,
+ 10, 41, 11, 42, 12, 43, 13, 44, 14, 45, 15, 46, 16, 47,
+ 17, 48, 18, 49, 19, 50, 20, 51, 21, 52, 22, 53, 23, 54,
+ 24, 55, 25, 56, 26, 57, 27, 58, 28, 59, 29, 60, 30, 61,
+ 31, 62, 32, 32, 33, 64, 34, 65, 35, 66, 36, 67, 37, 68,
+ 38, 69, 39, 70, 40, 71, 41, 72, 42, 73, 43, 74, 44, 75,
+ 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50, 81, 51, 82,
+ 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88, 58, 89,
+ 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65, 96,
+ 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
+ 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110,
+ 80, 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117,
+ 87, 118, 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124,
+ 94, 125, 95, 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131,
+ 101, 132, 102, 133, 103, 134, 104, 135, 105, 136, 106, 137, 107, 138,
+ 108, 139, 109, 140, 110, 141, 111, 142, 112, 143, 113, 144, 114, 145,
+ 115, 146, 116, 147, 117, 148, 118, 149, 119, 150, 120, 151, 121, 152,
+ 122, 153, 123, 154, 124, 155, 125, 156, 126, 157, 127, 158, 128, 128,
+ 129, 160, 130, 161, 131, 162, 132, 163, 133, 164, 134, 165, 135, 166,
+ 136, 167, 137, 168, 138, 169, 139, 170, 140, 171, 141, 172, 142, 173,
+ 143, 174, 144, 175, 145, 176, 146, 177, 147, 178, 148, 179, 149, 180,
+ 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, 186, 156, 187,
+ 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193, 163, 194,
+ 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170, 201,
+ 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
+ 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215,
+ 185, 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222,
+ 192, 192, 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229,
+ 199, 230, 200, 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236,
+ 206, 237, 207, 238, 208, 239, 209, 240, 210, 241, 211, 242, 212, 243,
+ 213, 244, 214, 245, 215, 246, 216, 247, 217, 248, 218, 249, 219, 250,
+ 220, 251, 221, 252, 222, 253, 223, 254, 224, 224, 225, 256, 226, 257,
+ 227, 258, 228, 259, 229, 260, 230, 261, 231, 262, 232, 263, 233, 264,
+ 234, 265, 235, 266, 236, 267, 237, 268, 238, 269, 239, 270, 240, 271,
+ 241, 272, 242, 273, 243, 274, 244, 275, 245, 276, 246, 277, 247, 278,
+ 248, 279, 249, 280, 250, 281, 251, 282, 252, 283, 253, 284, 254, 285,
+ 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260, 291, 261, 292,
+ 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298, 268, 299,
+ 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275, 306,
+ 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313,
+ 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320,
+ 290, 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327,
+ 297, 328, 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334,
+ 304, 335, 305, 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341,
+ 311, 342, 312, 343, 313, 344, 314, 345, 315, 346, 316, 347, 317, 348,
+ 318, 349, 319, 350, 320, 320, 321, 352, 322, 353, 323, 354, 324, 355,
+ 325, 356, 326, 357, 327, 358, 328, 359, 329, 360, 330, 361, 331, 362,
+ 332, 363, 333, 364, 334, 365, 335, 366, 336, 367, 337, 368, 338, 369,
+ 339, 370, 340, 371, 341, 372, 342, 373, 343, 374, 344, 375, 345, 376,
+ 346, 377, 347, 378, 348, 379, 349, 380, 350, 381, 351, 382, 352, 352,
+ 353, 384, 354, 385, 355, 386, 356, 387, 357, 388, 358, 389, 359, 390,
+ 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365, 396, 366, 397,
+ 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403, 373, 404,
+ 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380, 411,
+ 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418,
+ 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425,
+ 395, 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432,
+ 402, 433, 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439,
+ 409, 440, 410, 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446,
+ 416, 416, 417, 448, 418, 449, 419, 450, 420, 451, 421, 452, 422, 453,
+ 423, 454, 424, 455, 425, 456, 426, 457, 427, 458, 428, 459, 429, 460,
+ 430, 461, 431, 462, 432, 463, 433, 464, 434, 465, 435, 466, 436, 467,
+ 437, 468, 438, 469, 439, 470, 440, 471, 441, 472, 442, 473, 443, 474,
+ 444, 475, 445, 476, 446, 477, 447, 478, 448, 448, 449, 480, 450, 481,
+ 451, 482, 452, 483, 453, 484, 454, 485, 455, 486, 456, 487, 457, 488,
+ 458, 489, 459, 490, 460, 491, 461, 492, 462, 493, 463, 494, 464, 495,
+ 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470, 501, 471, 502,
+ 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508, 478, 509,
+ 479, 510, 480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516,
+ 486, 517, 487, 518, 488, 519, 489, 520, 490, 521, 491, 522, 492, 523,
+ 493, 524, 494, 525, 495, 526, 496, 527, 497, 528, 498, 529, 499, 530,
+ 500, 531, 501, 532, 502, 533, 503, 534, 504, 535, 505, 536, 506, 537,
+ 507, 538, 508, 539, 509, 540, 510, 541, 511, 542, 512, 512, 513, 544,
+ 514, 545, 515, 546, 516, 547, 517, 548, 518, 549, 519, 550, 520, 551,
+ 521, 552, 522, 553, 523, 554, 524, 555, 525, 556, 526, 557, 527, 558,
+ 528, 559, 529, 560, 530, 561, 531, 562, 532, 563, 533, 564, 534, 565,
+ 535, 566, 536, 567, 537, 568, 538, 569, 539, 570, 540, 571, 541, 572,
+ 542, 573, 543, 574, 544, 544, 545, 576, 546, 577, 547, 578, 548, 579,
+ 549, 580, 550, 581, 551, 582, 552, 583, 553, 584, 554, 585, 555, 586,
+ 556, 587, 557, 588, 558, 589, 559, 590, 560, 591, 561, 592, 562, 593,
+ 563, 594, 564, 595, 565, 596, 566, 597, 567, 598, 568, 599, 569, 600,
+ 570, 601, 571, 602, 572, 603, 573, 604, 574, 605, 575, 606, 576, 576,
+ 577, 608, 578, 609, 579, 610, 580, 611, 581, 612, 582, 613, 583, 614,
+ 584, 615, 585, 616, 586, 617, 587, 618, 588, 619, 589, 620, 590, 621,
+ 591, 622, 592, 623, 593, 624, 594, 625, 595, 626, 596, 627, 597, 628,
+ 598, 629, 599, 630, 600, 631, 601, 632, 602, 633, 603, 634, 604, 635,
+ 605, 636, 606, 637, 607, 638, 608, 608, 609, 640, 610, 641, 611, 642,
+ 612, 643, 613, 644, 614, 645, 615, 646, 616, 647, 617, 648, 618, 649,
+ 619, 650, 620, 651, 621, 652, 622, 653, 623, 654, 624, 655, 625, 656,
+ 626, 657, 627, 658, 628, 659, 629, 660, 630, 661, 631, 662, 632, 663,
+ 633, 664, 634, 665, 635, 666, 636, 667, 637, 668, 638, 669, 639, 670,
+ 640, 640, 641, 672, 642, 673, 643, 674, 644, 675, 645, 676, 646, 677,
+ 647, 678, 648, 679, 649, 680, 650, 681, 651, 682, 652, 683, 653, 684,
+ 654, 685, 655, 686, 656, 687, 657, 688, 658, 689, 659, 690, 660, 691,
+ 661, 692, 662, 693, 663, 694, 664, 695, 665, 696, 666, 697, 667, 698,
+ 668, 699, 669, 700, 670, 701, 671, 702, 672, 672, 673, 704, 674, 705,
+ 675, 706, 676, 707, 677, 708, 678, 709, 679, 710, 680, 711, 681, 712,
+ 682, 713, 683, 714, 684, 715, 685, 716, 686, 717, 687, 718, 688, 719,
+ 689, 720, 690, 721, 691, 722, 692, 723, 693, 724, 694, 725, 695, 726,
+ 696, 727, 697, 728, 698, 729, 699, 730, 700, 731, 701, 732, 702, 733,
+ 703, 734, 704, 704, 705, 736, 706, 737, 707, 738, 708, 739, 709, 740,
+ 710, 741, 711, 742, 712, 743, 713, 744, 714, 745, 715, 746, 716, 747,
+ 717, 748, 718, 749, 719, 750, 720, 751, 721, 752, 722, 753, 723, 754,
+ 724, 755, 725, 756, 726, 757, 727, 758, 728, 759, 729, 760, 730, 761,
+ 731, 762, 732, 763, 733, 764, 734, 765, 735, 766, 736, 736, 737, 768,
+ 738, 769, 739, 770, 740, 771, 741, 772, 742, 773, 743, 774, 744, 775,
+ 745, 776, 746, 777, 747, 778, 748, 779, 749, 780, 750, 781, 751, 782,
+ 752, 783, 753, 784, 754, 785, 755, 786, 756, 787, 757, 788, 758, 789,
+ 759, 790, 760, 791, 761, 792, 762, 793, 763, 794, 764, 795, 765, 796,
+ 766, 797, 767, 798, 768, 768, 769, 800, 770, 801, 771, 802, 772, 803,
+ 773, 804, 774, 805, 775, 806, 776, 807, 777, 808, 778, 809, 779, 810,
+ 780, 811, 781, 812, 782, 813, 783, 814, 784, 815, 785, 816, 786, 817,
+ 787, 818, 788, 819, 789, 820, 790, 821, 791, 822, 792, 823, 793, 824,
+ 794, 825, 795, 826, 796, 827, 797, 828, 798, 829, 799, 830, 800, 800,
+ 801, 832, 802, 833, 803, 834, 804, 835, 805, 836, 806, 837, 807, 838,
+ 808, 839, 809, 840, 810, 841, 811, 842, 812, 843, 813, 844, 814, 845,
+ 815, 846, 816, 847, 817, 848, 818, 849, 819, 850, 820, 851, 821, 852,
+ 822, 853, 823, 854, 824, 855, 825, 856, 826, 857, 827, 858, 828, 859,
+ 829, 860, 830, 861, 831, 862, 832, 832, 833, 864, 834, 865, 835, 866,
+ 836, 867, 837, 868, 838, 869, 839, 870, 840, 871, 841, 872, 842, 873,
+ 843, 874, 844, 875, 845, 876, 846, 877, 847, 878, 848, 879, 849, 880,
+ 850, 881, 851, 882, 852, 883, 853, 884, 854, 885, 855, 886, 856, 887,
+ 857, 888, 858, 889, 859, 890, 860, 891, 861, 892, 862, 893, 863, 894,
+ 864, 864, 865, 896, 866, 897, 867, 898, 868, 899, 869, 900, 870, 901,
+ 871, 902, 872, 903, 873, 904, 874, 905, 875, 906, 876, 907, 877, 908,
+ 878, 909, 879, 910, 880, 911, 881, 912, 882, 913, 883, 914, 884, 915,
+ 885, 916, 886, 917, 887, 918, 888, 919, 889, 920, 890, 921, 891, 922,
+ 892, 923, 893, 924, 894, 925, 895, 926, 896, 896, 897, 928, 898, 929,
+ 899, 930, 900, 931, 901, 932, 902, 933, 903, 934, 904, 935, 905, 936,
+ 906, 937, 907, 938, 908, 939, 909, 940, 910, 941, 911, 942, 912, 943,
+ 913, 944, 914, 945, 915, 946, 916, 947, 917, 948, 918, 949, 919, 950,
+ 920, 951, 921, 952, 922, 953, 923, 954, 924, 955, 925, 956, 926, 957,
+ 927, 958, 928, 928, 929, 960, 930, 961, 931, 962, 932, 963, 933, 964,
+ 934, 965, 935, 966, 936, 967, 937, 968, 938, 969, 939, 970, 940, 971,
+ 941, 972, 942, 973, 943, 974, 944, 975, 945, 976, 946, 977, 947, 978,
+ 948, 979, 949, 980, 950, 981, 951, 982, 952, 983, 953, 984, 954, 985,
+ 955, 986, 956, 987, 957, 988, 958, 989, 959, 990, 960, 960, 961, 992,
+ 962, 993, 963, 994, 964, 995, 965, 996, 966, 997, 967, 998, 968, 999,
+ 969, 1000, 970, 1001, 971, 1002, 972, 1003, 973, 1004, 974, 1005, 975, 1006,
+ 976, 1007, 977, 1008, 978, 1009, 979, 1010, 980, 1011, 981, 1012, 982, 1013,
+ 983, 1014, 984, 1015, 985, 1016, 986, 1017, 987, 1018, 988, 1019, 989, 1020,
+ 990, 1021, 991, 1022, 0, 0,
+};
+#endif // CONFIG_EXT_TX
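+
+/* In the CONFIG_EXT_TX mcol/mrow tables above, each pair generally names the
+ * above and left already-coded coefficients of a raster position; where only
+ * one such neighbor exists (the first row or first column of the block) it
+ * is duplicated, so the averaging in get_coef_context() still behaves. */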
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 0, 32, 32, 1, 32, 33, 1, 64, 64,
+ 33, 64, 2, 33, 96, 96, 34, 2, 65, 96, 34, 65, 128, 128,
+ 97, 128, 3, 34, 66, 97, 35, 3, 35, 66, 98, 129, 129, 160,
+ 160, 161, 4, 35, 67, 98, 192, 192, 36, 4, 130, 161, 161, 192,
+ 36, 67, 99, 130, 5, 36, 68, 99, 193, 224, 162, 193, 224, 225,
+ 131, 162, 37, 68, 100, 131, 37, 5, 194, 225, 225, 256, 256, 257,
+ 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 38, 6, 195, 226,
+ 257, 288, 101, 132, 288, 289, 38, 69, 164, 195, 133, 164, 258, 289,
+ 227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 321, 39, 7,
+ 165, 196, 39, 70, 102, 133, 290, 321, 259, 290, 228, 259, 321, 352,
+ 352, 353, 197, 228, 134, 165, 71, 102, 8, 39, 322, 353, 291, 322,
+ 260, 291, 103, 134, 353, 384, 166, 197, 229, 260, 40, 71, 40, 8,
+ 384, 385, 135, 166, 354, 385, 323, 354, 198, 229, 292, 323, 72, 103,
+ 261, 292, 9, 40, 385, 416, 167, 198, 104, 135, 230, 261, 355, 386,
+ 416, 417, 293, 324, 324, 355, 41, 9, 41, 72, 386, 417, 199, 230,
+ 136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262,
+ 10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 449, 42, 73,
+ 294, 325, 200, 231, 42, 10, 357, 388, 137, 168, 263, 294, 388, 419,
+ 74, 105, 419, 450, 449, 480, 326, 357, 232, 263, 295, 326, 169, 200,
+ 11, 42, 106, 137, 480, 481, 450, 481, 358, 389, 264, 295, 201, 232,
+ 138, 169, 389, 420, 43, 74, 420, 451, 327, 358, 43, 11, 481, 512,
+ 233, 264, 451, 482, 296, 327, 75, 106, 170, 201, 482, 513, 512, 513,
+ 390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233, 452, 483,
+ 265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265,
+ 297, 328, 422, 453, 44, 12, 391, 422, 171, 202, 76, 107, 514, 545,
+ 453, 484, 544, 545, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329,
+ 140, 171, 515, 546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485,
+ 45, 76, 172, 203, 330, 361, 576, 577, 45, 13, 267, 298, 546, 577,
+ 77, 108, 204, 235, 455, 486, 577, 608, 299, 330, 109, 140, 547, 578,
+ 14, 45, 46, 14, 141, 172, 578, 609, 331, 362, 46, 77, 173, 204,
+ 15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46, 142, 173,
+ 47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142,
+ 48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 49, 17, 207, 238,
+ 49, 80, 81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50,
+ 51, 82, 83, 114, 608, 609, 484, 515, 360, 391, 236, 267, 112, 143,
+ 51, 19, 640, 640, 609, 640, 516, 547, 485, 516, 392, 423, 361, 392,
+ 268, 299, 237, 268, 144, 175, 113, 144, 20, 51, 52, 20, 672, 672,
+ 641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455, 393, 424,
+ 362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114, 145,
+ 52, 83, 21, 52, 53, 21, 704, 704, 673, 704, 642, 673, 611, 642,
+ 580, 611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425,
+ 363, 394, 332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208,
+ 146, 177, 115, 146, 84, 115, 53, 84, 22, 53, 54, 22, 705, 736,
+ 674, 705, 643, 674, 581, 612, 550, 581, 519, 550, 457, 488, 426, 457,
+ 395, 426, 333, 364, 302, 333, 271, 302, 209, 240, 178, 209, 147, 178,
+ 85, 116, 54, 85, 23, 54, 706, 737, 675, 706, 582, 613, 551, 582,
+ 458, 489, 427, 458, 334, 365, 303, 334, 210, 241, 179, 210, 86, 117,
+ 55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242, 87, 118,
+ 736, 737, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 55, 23,
+ 768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427,
+ 365, 396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 56, 24,
+ 800, 800, 769, 800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583,
+ 521, 552, 490, 521, 428, 459, 397, 428, 366, 397, 304, 335, 273, 304,
+ 242, 273, 180, 211, 149, 180, 118, 149, 56, 87, 25, 56, 57, 25,
+ 832, 832, 801, 832, 770, 801, 739, 770, 708, 739, 677, 708, 646, 677,
+ 615, 646, 584, 615, 553, 584, 522, 553, 491, 522, 460, 491, 429, 460,
+ 398, 429, 367, 398, 336, 367, 305, 336, 274, 305, 243, 274, 212, 243,
+ 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26, 57, 58, 26,
+ 833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585, 616,
+ 554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337,
+ 275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 58, 89, 27, 58,
+ 834, 865, 803, 834, 710, 741, 679, 710, 586, 617, 555, 586, 462, 493,
+ 431, 462, 338, 369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90,
+ 835, 866, 711, 742, 587, 618, 463, 494, 339, 370, 215, 246, 91, 122,
+ 864, 865, 740, 771, 616, 647, 492, 523, 368, 399, 244, 275, 120, 151,
+ 59, 27, 896, 896, 865, 896, 772, 803, 741, 772, 648, 679, 617, 648,
+ 524, 555, 493, 524, 400, 431, 369, 400, 276, 307, 245, 276, 152, 183,
+ 121, 152, 28, 59, 60, 28, 928, 928, 897, 928, 866, 897, 804, 835,
+ 773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587, 525, 556,
+ 494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246, 277,
+ 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 61, 29, 960, 960,
+ 929, 960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774,
+ 712, 743, 681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557,
+ 495, 526, 464, 495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340,
+ 278, 309, 247, 278, 216, 247, 185, 216, 154, 185, 123, 154, 92, 123,
+ 61, 92, 30, 61, 62, 30, 961, 992, 930, 961, 899, 930, 837, 868,
+ 806, 837, 775, 806, 713, 744, 682, 713, 651, 682, 589, 620, 558, 589,
+ 527, 558, 465, 496, 434, 465, 403, 434, 341, 372, 310, 341, 279, 310,
+ 217, 248, 186, 217, 155, 186, 93, 124, 62, 93, 31, 62, 962, 993,
+ 931, 962, 838, 869, 807, 838, 714, 745, 683, 714, 590, 621, 559, 590,
+ 466, 497, 435, 466, 342, 373, 311, 342, 218, 249, 187, 218, 94, 125,
+ 63, 94, 963, 994, 839, 870, 715, 746, 591, 622, 467, 498, 343, 374,
+ 219, 250, 95, 126, 868, 899, 744, 775, 620, 651, 496, 527, 372, 403,
+ 248, 279, 124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683,
+ 621, 652, 528, 559, 497, 528, 404, 435, 373, 404, 280, 311, 249, 280,
+ 156, 187, 125, 156, 932, 963, 901, 932, 870, 901, 808, 839, 777, 808,
+ 746, 777, 684, 715, 653, 684, 622, 653, 560, 591, 529, 560, 498, 529,
+ 436, 467, 405, 436, 374, 405, 312, 343, 281, 312, 250, 281, 188, 219,
+ 157, 188, 126, 157, 964, 995, 933, 964, 902, 933, 871, 902, 840, 871,
+ 809, 840, 778, 809, 747, 778, 716, 747, 685, 716, 654, 685, 623, 654,
+ 592, 623, 561, 592, 530, 561, 499, 530, 468, 499, 437, 468, 406, 437,
+ 375, 406, 344, 375, 313, 344, 282, 313, 251, 282, 220, 251, 189, 220,
+ 158, 189, 127, 158, 965, 996, 934, 965, 903, 934, 841, 872, 810, 841,
+ 779, 810, 717, 748, 686, 717, 655, 686, 593, 624, 562, 593, 531, 562,
+ 469, 500, 438, 469, 407, 438, 345, 376, 314, 345, 283, 314, 221, 252,
+ 190, 221, 159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749,
+ 687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377, 315, 346,
+ 222, 253, 191, 222, 967, 998, 843, 874, 719, 750, 595, 626, 471, 502,
+ 347, 378, 223, 254, 872, 903, 748, 779, 624, 655, 500, 531, 376, 407,
+ 252, 283, 904, 935, 873, 904, 780, 811, 749, 780, 656, 687, 625, 656,
+ 532, 563, 501, 532, 408, 439, 377, 408, 284, 315, 253, 284, 936, 967,
+ 905, 936, 874, 905, 812, 843, 781, 812, 750, 781, 688, 719, 657, 688,
+ 626, 657, 564, 595, 533, 564, 502, 533, 440, 471, 409, 440, 378, 409,
+ 316, 347, 285, 316, 254, 285, 968, 999, 937, 968, 906, 937, 875, 906,
+ 844, 875, 813, 844, 782, 813, 751, 782, 720, 751, 689, 720, 658, 689,
+ 627, 658, 596, 627, 565, 596, 534, 565, 503, 534, 472, 503, 441, 472,
+ 410, 441, 379, 410, 348, 379, 317, 348, 286, 317, 255, 286, 969, 1000,
+ 938, 969, 907, 938, 845, 876, 814, 845, 783, 814, 721, 752, 690, 721,
+ 659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473, 411, 442,
+ 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970, 846, 877, 815, 846,
+ 722, 753, 691, 722, 598, 629, 567, 598, 474, 505, 443, 474, 350, 381,
+ 319, 350, 971, 1002, 847, 878, 723, 754, 599, 630, 475, 506, 351, 382,
+ 876, 907, 752, 783, 628, 659, 504, 535, 380, 411, 908, 939, 877, 908,
+ 784, 815, 753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443,
+ 381, 412, 940, 971, 909, 940, 878, 909, 816, 847, 785, 816, 754, 785,
+ 692, 723, 661, 692, 630, 661, 568, 599, 537, 568, 506, 537, 444, 475,
+ 413, 444, 382, 413, 972, 1003, 941, 972, 910, 941, 879, 910, 848, 879,
+ 817, 848, 786, 817, 755, 786, 724, 755, 693, 724, 662, 693, 631, 662,
+ 600, 631, 569, 600, 538, 569, 507, 538, 476, 507, 445, 476, 414, 445,
+ 383, 414, 973, 1004, 942, 973, 911, 942, 849, 880, 818, 849, 787, 818,
+ 725, 756, 694, 725, 663, 694, 601, 632, 570, 601, 539, 570, 477, 508,
+ 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881, 819, 850, 726, 757,
+ 695, 726, 602, 633, 571, 602, 478, 509, 447, 478, 975, 1006, 851, 882,
+ 727, 758, 603, 634, 479, 510, 880, 911, 756, 787, 632, 663, 508, 539,
+ 912, 943, 881, 912, 788, 819, 757, 788, 664, 695, 633, 664, 540, 571,
+ 509, 540, 944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789,
+ 696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541, 976, 1007,
+ 945, 976, 914, 945, 883, 914, 852, 883, 821, 852, 790, 821, 759, 790,
+ 728, 759, 697, 728, 666, 697, 635, 666, 604, 635, 573, 604, 542, 573,
+ 511, 542, 977, 1008, 946, 977, 915, 946, 853, 884, 822, 853, 791, 822,
+ 729, 760, 698, 729, 667, 698, 605, 636, 574, 605, 543, 574, 978, 1009,
+ 947, 978, 854, 885, 823, 854, 730, 761, 699, 730, 606, 637, 575, 606,
+ 979, 1010, 855, 886, 731, 762, 607, 638, 884, 915, 760, 791, 636, 667,
+ 916, 947, 885, 916, 792, 823, 761, 792, 668, 699, 637, 668, 948, 979,
+ 917, 948, 886, 917, 824, 855, 793, 824, 762, 793, 700, 731, 669, 700,
+ 638, 669, 980, 1011, 949, 980, 918, 949, 887, 918, 856, 887, 825, 856,
+ 794, 825, 763, 794, 732, 763, 701, 732, 670, 701, 639, 670, 981, 1012,
+ 950, 981, 919, 950, 857, 888, 826, 857, 795, 826, 733, 764, 702, 733,
+ 671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765, 703, 734,
+ 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795, 920, 951, 889, 920,
+ 796, 827, 765, 796, 952, 983, 921, 952, 890, 921, 828, 859, 797, 828,
+ 766, 797, 984, 1015, 953, 984, 922, 953, 891, 922, 860, 891, 829, 860,
+ 798, 829, 767, 798, 985, 1016, 954, 985, 923, 954, 861, 892, 830, 861,
+ 799, 830, 986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894,
+ 892, 923, 924, 955, 893, 924, 956, 987, 925, 956, 894, 925, 988, 1019,
+ 957, 988, 926, 957, 895, 926, 989, 1020, 958, 989, 927, 958, 990, 1021,
+ 959, 990, 991, 1022, 0, 0,
+};
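+
+/* Illustrative walk over a scan table and its *_neighbors companion
+ * (hypothetical names: scan, nb, read_token, energy_class, EOB_TOKEN --
+ * a sketch of typical usage, not code from this tree):
+ *
+ *   uint8_t token_cache[32 * 32] = { 0 };
+ *   for (int c = 0; c < n_coeffs; ++c) {
+ *     const int ctx = get_coef_context(nb, token_cache, c);
+ *     const int token = read_token(ctx);      // entropy-decode one token
+ *     if (token == EOB_TOKEN) break;
+ *     token_cache[scan[c]] = energy_class[token];
+ *   }
+ */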
+
+DECLARE_ALIGNED(16, static const int16_t,
+ v2_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 32, 1, 1, 32, 32, 2, 33,
+ 33, 64, 34, 65, 2, 2, 64, 64, 3, 34, 65, 96, 35, 66,
+ 66, 97, 3, 3, 96, 96, 4, 35, 97, 128, 67, 98, 36, 67,
+ 98, 129, 4, 4, 68, 99, 99, 130, 128, 128, 5, 36, 129, 160,
+ 37, 68, 130, 161, 100, 131, 69, 100, 131, 162, 5, 5, 160, 160,
+ 6, 37, 161, 192, 38, 69, 162, 193, 101, 132, 132, 163, 70, 101,
+ 163, 194, 6, 6, 192, 192, 7, 38, 133, 164, 193, 224, 102, 133,
+ 164, 195, 39, 70, 194, 225, 71, 102, 195, 226, 134, 165, 165, 196,
+ 7, 7, 224, 224, 8, 39, 103, 134, 196, 227, 225, 256, 40, 71,
+ 226, 257, 166, 197, 72, 103, 227, 258, 135, 166, 197, 228, 104, 135,
+ 228, 259, 8, 8, 256, 256, 9, 40, 257, 288, 41, 72, 167, 198,
+ 198, 229, 258, 289, 136, 167, 229, 260, 73, 104, 259, 290, 105, 136,
+ 260, 291, 199, 230, 9, 9, 168, 199, 230, 261, 288, 288, 10, 41,
+ 289, 320, 42, 73, 290, 321, 137, 168, 261, 292, 74, 105, 291, 322,
+ 200, 231, 231, 262, 106, 137, 292, 323, 169, 200, 262, 293, 10, 10,
+ 320, 320, 11, 42, 321, 352, 43, 74, 138, 169, 293, 324, 322, 353,
+ 232, 263, 75, 106, 201, 232, 263, 294, 323, 354, 170, 201, 294, 325,
+ 107, 138, 324, 355, 11, 11, 352, 352, 12, 43, 233, 264, 264, 295,
+ 353, 384, 139, 170, 325, 356, 44, 75, 354, 385, 202, 233, 295, 326,
+ 76, 107, 355, 386, 171, 202, 326, 357, 108, 139, 356, 387, 265, 296,
+ 234, 265, 296, 327, 12, 12, 140, 171, 357, 388, 384, 384, 13, 44,
+ 203, 234, 327, 358, 385, 416, 45, 76, 386, 417, 77, 108, 387, 418,
+ 172, 203, 358, 389, 266, 297, 297, 328, 109, 140, 235, 266, 328, 359,
+ 388, 419, 204, 235, 359, 390, 141, 172, 389, 420, 13, 13, 416, 416,
+ 14, 45, 417, 448, 46, 77, 298, 329, 418, 449, 267, 298, 329, 360,
+ 78, 109, 173, 204, 390, 421, 419, 450, 236, 267, 360, 391, 110, 141,
+ 420, 451, 205, 236, 391, 422, 142, 173, 299, 330, 330, 361, 421, 452,
+ 14, 14, 268, 299, 361, 392, 448, 448, 15, 46, 449, 480, 47, 78,
+ 450, 481, 174, 205, 422, 453, 237, 268, 392, 423, 79, 110, 451, 482,
+ 111, 142, 452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454,
+ 143, 174, 269, 300, 393, 424, 453, 484, 480, 480, 481, 512, 238, 269,
+ 424, 455, 482, 513, 175, 206, 454, 485, 332, 363, 363, 394, 483, 514,
+ 301, 332, 394, 425, 484, 515, 207, 238, 455, 486, 270, 301, 425, 456,
+ 485, 516, 364, 395, 239, 270, 456, 487, 512, 512, 333, 364, 395, 426,
+ 513, 544, 486, 517, 514, 545, 302, 333, 426, 457, 515, 546, 487, 518,
+ 516, 547, 271, 302, 457, 488, 365, 396, 396, 427, 517, 548, 334, 365,
+ 427, 458, 488, 519, 544, 544, 303, 334, 458, 489, 518, 549, 545, 576,
+ 546, 577, 547, 578, 489, 520, 397, 428, 519, 550, 366, 397, 428, 459,
+ 548, 579, 335, 366, 459, 490, 549, 580, 520, 551, 490, 521, 550, 581,
+ 576, 576, 577, 608, 398, 429, 429, 460, 578, 609, 367, 398, 460, 491,
+ 521, 552, 579, 610, 551, 582, 491, 522, 580, 611, 581, 612, 552, 583,
+ 522, 553, 430, 461, 399, 430, 461, 492, 582, 613, 492, 523, 608, 608,
+ 609, 640, 610, 641, 553, 584, 611, 642, 523, 554, 583, 614, 612, 643,
+ 431, 462, 462, 493, 554, 585, 493, 524, 584, 615, 613, 644, 524, 555,
+ 614, 645, 640, 640, 585, 616, 641, 672, 555, 586, 642, 673, 615, 646,
+ 463, 494, 643, 674, 494, 525, 644, 675, 525, 556, 586, 617, 616, 647,
+ 645, 676, 556, 587, 646, 677, 495, 526, 617, 648, 587, 618, 672, 672,
+ 526, 557, 673, 704, 674, 705, 647, 678, 557, 588, 675, 706, 618, 649,
+ 676, 707, 588, 619, 648, 679, 677, 708, 527, 558, 558, 589, 678, 709,
+ 619, 650, 649, 680, 704, 704, 589, 620, 705, 736, 679, 710, 706, 737,
+ 707, 738, 650, 681, 620, 651, 708, 739, 680, 711, 559, 590, 709, 740,
+ 590, 621, 651, 682, 681, 712, 710, 741, 621, 652, 736, 736, 737, 768,
+ 711, 742, 738, 769, 682, 713, 652, 683, 739, 770, 591, 622, 740, 771,
+ 712, 743, 622, 653, 741, 772, 683, 714, 653, 684, 713, 744, 742, 773,
+ 623, 654, 743, 774, 768, 768, 769, 800, 684, 715, 714, 745, 770, 801,
+ 771, 802, 654, 685, 744, 775, 772, 803, 715, 746, 773, 804, 685, 716,
+ 745, 776, 774, 805, 655, 686, 716, 747, 775, 806, 746, 777, 800, 800,
+ 801, 832, 686, 717, 802, 833, 803, 834, 776, 807, 804, 835, 747, 778,
+ 717, 748, 805, 836, 777, 808, 687, 718, 806, 837, 748, 779, 718, 749,
+ 778, 809, 807, 838, 832, 832, 833, 864, 834, 865, 835, 866, 808, 839,
+ 749, 780, 836, 867, 779, 810, 719, 750, 837, 868, 809, 840, 838, 869,
+ 780, 811, 750, 781, 810, 841, 839, 870, 864, 864, 865, 896, 866, 897,
+ 840, 871, 867, 898, 781, 812, 811, 842, 868, 899, 751, 782, 869, 900,
+ 841, 872, 812, 843, 870, 901, 782, 813, 842, 873, 871, 902, 896, 896,
+ 897, 928, 813, 844, 898, 929, 872, 903, 783, 814, 843, 874, 899, 930,
+ 900, 931, 873, 904, 901, 932, 814, 845, 844, 875, 902, 933, 874, 905,
+ 903, 934, 845, 876, 928, 928, 815, 846, 929, 960, 930, 961, 875, 906,
+ 904, 935, 931, 962, 932, 963, 905, 936, 846, 877, 933, 964, 876, 907,
+ 934, 965, 906, 937, 935, 966, 877, 908, 847, 878, 960, 960, 907, 938,
+ 961, 992, 936, 967, 962, 993, 963, 994, 964, 995, 878, 909, 937, 968,
+ 908, 939, 965, 996, 966, 997, 938, 969, 879, 910, 909, 940, 967, 998,
+ 939, 970, 968, 999, 910, 941, 969, 1000, 940, 971, 970, 1001, 911, 942,
+ 941, 972, 971, 1002, 942, 973, 972, 1003, 943, 974, 973, 1004, 974, 1005,
+ 975, 1006, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175,
+ 16, 16, 17, 48, 176, 207, 49, 80, 81, 112, 113, 144, 208, 239,
+ 145, 176, 240, 271, 17, 17, 18, 49, 177, 208, 50, 81, 82, 113,
+ 272, 303, 209, 240, 114, 145, 146, 177, 241, 272, 304, 335, 178, 209,
+ 18, 18, 19, 50, 51, 82, 83, 114, 273, 304, 210, 241, 115, 146,
+ 336, 367, 147, 178, 242, 273, 305, 336, 179, 210, 19, 19, 368, 399,
+ 20, 51, 52, 83, 274, 305, 84, 115, 211, 242, 337, 368, 116, 147,
+ 306, 337, 148, 179, 243, 274, 400, 431, 369, 400, 180, 211, 20, 20,
+ 21, 52, 275, 306, 53, 84, 338, 369, 212, 243, 85, 116, 432, 463,
+ 117, 148, 401, 432, 307, 338, 244, 275, 149, 180, 370, 401, 181, 212,
+ 276, 307, 464, 495, 339, 370, 21, 21, 22, 53, 433, 464, 54, 85,
+ 213, 244, 86, 117, 402, 433, 118, 149, 308, 339, 245, 276, 371, 402,
+ 150, 181, 496, 527, 465, 496, 182, 213, 434, 465, 340, 371, 277, 308,
+ 22, 22, 23, 54, 403, 434, 55, 86, 214, 245, 87, 118, 309, 340,
+ 372, 403, 119, 150, 497, 528, 528, 559, 246, 277, 466, 497, 151, 182,
+ 435, 466, 341, 372, 183, 214, 278, 309, 404, 435, 23, 23, 24, 55,
+ 215, 246, 529, 560, 56, 87, 498, 529, 560, 591, 310, 341, 88, 119,
+ 373, 404, 467, 498, 120, 151, 247, 278, 436, 467, 152, 183, 342, 373,
+ 279, 310, 405, 436, 184, 215, 530, 561, 561, 592, 499, 530, 592, 623,
+ 24, 24, 216, 247, 468, 499, 25, 56, 374, 405, 57, 88, 311, 342,
+ 89, 120, 437, 468, 248, 279, 121, 152, 562, 593, 153, 184, 343, 374,
+ 531, 562, 593, 624, 406, 437, 500, 531, 624, 655, 280, 311, 185, 216,
+ 469, 500, 375, 406, 217, 248, 25, 25, 312, 343, 26, 57, 58, 89,
+ 438, 469, 90, 121, 563, 594, 594, 625, 249, 280, 532, 563, 625, 656,
+ 122, 153, 344, 375, 501, 532, 656, 687, 407, 438, 154, 185, 281, 312,
+ 470, 501, 186, 217, 376, 407, 595, 626, 564, 595, 626, 657, 218, 249,
+ 313, 344, 439, 470, 26, 26, 27, 58, 533, 564, 657, 688, 59, 90,
+ 91, 122, 250, 281, 502, 533, 688, 719, 123, 154, 408, 439, 345, 376,
+ 155, 186, 471, 502, 282, 313, 596, 627, 627, 658, 187, 218, 565, 596,
+ 658, 689, 377, 408, 440, 471, 534, 565, 689, 720, 314, 345, 219, 250,
+ 27, 27, 28, 59, 503, 534, 720, 751, 60, 91, 92, 123, 251, 282,
+ 409, 440, 346, 377, 124, 155, 628, 659, 472, 503, 597, 628, 659, 690,
+ 566, 597, 690, 721, 156, 187, 283, 314, 535, 566, 721, 752, 188, 219,
+ 378, 409, 441, 472, 315, 346, 504, 535, 752, 783, 220, 251, 28, 28,
+ 629, 660, 660, 691, 29, 60, 61, 92, 410, 441, 598, 629, 691, 722,
+ 252, 283, 93, 124, 347, 378, 473, 504, 567, 598, 722, 753, 125, 156,
+ 284, 315, 536, 567, 753, 784, 157, 188, 442, 473, 379, 410, 189, 220,
+ 505, 536, 784, 815, 661, 692, 316, 347, 630, 661, 692, 723, 221, 252,
+ 599, 630, 723, 754, 411, 442, 29, 29, 568, 599, 754, 785, 30, 61,
+ 474, 505, 62, 93, 253, 284, 348, 379, 94, 125, 537, 568, 785, 816,
+ 126, 157, 285, 316, 158, 189, 443, 474, 662, 693, 693, 724, 380, 411,
+ 631, 662, 724, 755, 506, 537, 816, 847, 190, 221, 600, 631, 755, 786,
+ 317, 348, 222, 253, 569, 600, 786, 817, 412, 443, 475, 506, 30, 30,
+ 31, 62, 349, 380, 254, 285, 63, 94, 538, 569, 817, 848, 694, 725,
+ 95, 126, 663, 694, 725, 756, 632, 663, 756, 787, 127, 158, 444, 475,
+ 286, 317, 381, 412, 507, 538, 848, 879, 159, 190, 601, 632, 787, 818,
+ 191, 222, 318, 349, 570, 601, 818, 849, 476, 507, 223, 254, 413, 444,
+ 695, 726, 726, 757, 664, 695, 757, 788, 539, 570, 849, 880, 350, 381,
+ 255, 286, 633, 664, 788, 819, 445, 476, 602, 633, 819, 850, 508, 539,
+ 880, 911, 287, 318, 382, 413, 571, 602, 850, 881, 727, 758, 696, 727,
+ 758, 789, 319, 350, 477, 508, 665, 696, 789, 820, 414, 445, 540, 571,
+ 881, 912, 634, 665, 820, 851, 351, 382, 603, 634, 851, 882, 446, 477,
+ 509, 540, 912, 943, 383, 414, 728, 759, 759, 790, 572, 603, 882, 913,
+ 697, 728, 790, 821, 666, 697, 821, 852, 478, 509, 635, 666, 852, 883,
+ 415, 446, 541, 572, 913, 944, 604, 635, 883, 914, 760, 791, 729, 760,
+ 791, 822, 510, 541, 944, 975, 447, 478, 698, 729, 822, 853, 573, 604,
+ 914, 945, 667, 698, 853, 884, 636, 667, 884, 915, 479, 510, 542, 573,
+ 945, 976, 761, 792, 792, 823, 605, 636, 915, 946, 730, 761, 823, 854,
+ 699, 730, 854, 885, 511, 542, 976, 1007, 574, 605, 946, 977, 668, 699,
+ 885, 916, 637, 668, 916, 947, 543, 574, 793, 824, 977, 1008, 762, 793,
+ 824, 855, 731, 762, 855, 886, 606, 637, 947, 978, 700, 731, 886, 917,
+ 669, 700, 917, 948, 575, 606, 978, 1009, 638, 669, 948, 979, 794, 825,
+ 825, 856, 763, 794, 856, 887, 732, 763, 887, 918, 607, 638, 979, 1010,
+ 701, 732, 918, 949, 670, 701, 949, 980, 826, 857, 795, 826, 857, 888,
+ 764, 795, 888, 919, 639, 670, 980, 1011, 733, 764, 919, 950, 702, 733,
+ 950, 981, 671, 702, 981, 1012, 827, 858, 858, 889, 796, 827, 889, 920,
+ 765, 796, 920, 951, 734, 765, 951, 982, 703, 734, 982, 1013, 859, 890,
+ 828, 859, 890, 921, 797, 828, 921, 952, 766, 797, 952, 983, 735, 766,
+ 983, 1014, 860, 891, 891, 922, 829, 860, 922, 953, 798, 829, 953, 984,
+ 767, 798, 984, 1015, 892, 923, 861, 892, 923, 954, 830, 861, 954, 985,
+ 799, 830, 985, 1016, 893, 924, 924, 955, 862, 893, 955, 986, 831, 862,
+ 986, 1017, 925, 956, 894, 925, 956, 987, 863, 894, 987, 1018, 926, 957,
+ 957, 988, 895, 926, 988, 1019, 958, 989, 927, 958, 989, 1020, 959, 990,
+ 990, 1021, 991, 1022, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ h2_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 32, 1, 1, 32, 32, 2, 33,
+ 33, 64, 34, 65, 2, 2, 64, 64, 3, 34, 65, 96, 35, 66,
+ 66, 97, 3, 3, 96, 96, 4, 35, 97, 128, 67, 98, 36, 67,
+ 98, 129, 4, 4, 68, 99, 99, 130, 128, 128, 5, 36, 129, 160,
+ 37, 68, 130, 161, 100, 131, 69, 100, 131, 162, 5, 5, 160, 160,
+ 6, 37, 161, 192, 38, 69, 162, 193, 101, 132, 132, 163, 70, 101,
+ 163, 194, 6, 6, 192, 192, 7, 38, 133, 164, 193, 224, 102, 133,
+ 164, 195, 39, 70, 194, 225, 71, 102, 195, 226, 134, 165, 165, 196,
+ 7, 7, 224, 224, 8, 39, 103, 134, 196, 227, 225, 256, 40, 71,
+ 226, 257, 166, 197, 72, 103, 227, 258, 135, 166, 197, 228, 104, 135,
+ 228, 259, 8, 8, 256, 256, 9, 40, 257, 288, 41, 72, 167, 198,
+ 198, 229, 258, 289, 136, 167, 229, 260, 73, 104, 259, 290, 105, 136,
+ 260, 291, 199, 230, 9, 9, 168, 199, 230, 261, 288, 288, 10, 41,
+ 289, 320, 42, 73, 290, 321, 137, 168, 261, 292, 74, 105, 291, 322,
+ 200, 231, 231, 262, 106, 137, 292, 323, 169, 200, 262, 293, 10, 10,
+ 320, 320, 11, 42, 321, 352, 43, 74, 138, 169, 293, 324, 322, 353,
+ 232, 263, 75, 106, 201, 232, 263, 294, 323, 354, 170, 201, 294, 325,
+ 107, 138, 324, 355, 11, 11, 352, 352, 12, 43, 233, 264, 264, 295,
+ 353, 384, 139, 170, 325, 356, 44, 75, 354, 385, 202, 233, 295, 326,
+ 76, 107, 355, 386, 171, 202, 326, 357, 108, 139, 356, 387, 265, 296,
+ 234, 265, 296, 327, 12, 12, 140, 171, 357, 388, 384, 384, 13, 44,
+ 203, 234, 327, 358, 385, 416, 45, 76, 386, 417, 77, 108, 387, 418,
+ 172, 203, 358, 389, 266, 297, 297, 328, 109, 140, 235, 266, 328, 359,
+ 388, 419, 204, 235, 359, 390, 141, 172, 389, 420, 13, 13, 416, 416,
+ 14, 45, 417, 448, 46, 77, 298, 329, 418, 449, 267, 298, 329, 360,
+ 78, 109, 173, 204, 390, 421, 419, 450, 236, 267, 360, 391, 110, 141,
+ 420, 451, 205, 236, 391, 422, 142, 173, 299, 330, 330, 361, 421, 452,
+ 14, 14, 268, 299, 361, 392, 448, 448, 15, 46, 449, 480, 47, 78,
+ 450, 481, 174, 205, 422, 453, 237, 268, 392, 423, 79, 110, 451, 482,
+ 111, 142, 452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454,
+ 143, 174, 269, 300, 393, 424, 453, 484, 15, 15, 16, 47, 48, 79,
+ 238, 269, 424, 455, 175, 206, 454, 485, 80, 111, 332, 363, 363, 394,
+ 301, 332, 394, 425, 112, 143, 207, 238, 455, 486, 270, 301, 425, 456,
+ 144, 175, 364, 395, 16, 16, 239, 270, 456, 487, 17, 48, 333, 364,
+ 395, 426, 176, 207, 49, 80, 302, 333, 426, 457, 81, 112, 113, 144,
+ 208, 239, 271, 302, 457, 488, 365, 396, 396, 427, 145, 176, 334, 365,
+ 427, 458, 240, 271, 17, 17, 18, 49, 177, 208, 303, 334, 458, 489,
+ 50, 81, 82, 113, 272, 303, 209, 240, 397, 428, 114, 145, 366, 397,
+ 428, 459, 335, 366, 459, 490, 146, 177, 241, 272, 304, 335, 178, 209,
+ 18, 18, 19, 50, 51, 82, 398, 429, 429, 460, 367, 398, 460, 491,
+ 83, 114, 273, 304, 210, 241, 115, 146, 336, 367, 147, 178, 242, 273,
+ 305, 336, 430, 461, 399, 430, 461, 492, 179, 210, 19, 19, 368, 399,
+ 20, 51, 52, 83, 274, 305, 84, 115, 211, 242, 337, 368, 116, 147,
+ 431, 462, 462, 493, 306, 337, 148, 179, 243, 274, 400, 431, 369, 400,
+ 180, 211, 20, 20, 21, 52, 275, 306, 53, 84, 338, 369, 212, 243,
+ 85, 116, 463, 494, 432, 463, 117, 148, 401, 432, 307, 338, 244, 275,
+ 149, 180, 370, 401, 181, 212, 276, 307, 464, 495, 339, 370, 21, 21,
+ 22, 53, 433, 464, 54, 85, 213, 244, 86, 117, 402, 433, 118, 149,
+ 308, 339, 245, 276, 371, 402, 150, 181, 465, 496, 182, 213, 434, 465,
+ 340, 371, 277, 308, 22, 22, 23, 54, 403, 434, 55, 86, 214, 245,
+ 87, 118, 309, 340, 372, 403, 119, 150, 246, 277, 466, 497, 151, 182,
+ 435, 466, 341, 372, 183, 214, 278, 309, 404, 435, 23, 23, 24, 55,
+ 215, 246, 56, 87, 310, 341, 88, 119, 373, 404, 467, 498, 120, 151,
+ 247, 278, 436, 467, 152, 183, 342, 373, 279, 310, 405, 436, 184, 215,
+ 24, 24, 216, 247, 468, 499, 25, 56, 374, 405, 57, 88, 311, 342,
+ 89, 120, 437, 468, 248, 279, 121, 152, 153, 184, 343, 374, 406, 437,
+ 280, 311, 185, 216, 469, 500, 375, 406, 217, 248, 25, 25, 312, 343,
+ 26, 57, 58, 89, 438, 469, 90, 121, 249, 280, 122, 153, 344, 375,
+ 407, 438, 154, 185, 281, 312, 470, 501, 186, 217, 376, 407, 218, 249,
+ 313, 344, 439, 470, 26, 26, 27, 58, 59, 90, 91, 122, 250, 281,
+ 123, 154, 408, 439, 345, 376, 155, 186, 471, 502, 282, 313, 187, 218,
+ 377, 408, 440, 471, 314, 345, 219, 250, 27, 27, 28, 59, 60, 91,
+ 92, 123, 251, 282, 409, 440, 346, 377, 124, 155, 472, 503, 156, 187,
+ 283, 314, 188, 219, 378, 409, 441, 472, 315, 346, 220, 251, 28, 28,
+ 29, 60, 61, 92, 410, 441, 252, 283, 93, 124, 347, 378, 473, 504,
+ 125, 156, 284, 315, 157, 188, 442, 473, 379, 410, 189, 220, 316, 347,
+ 221, 252, 411, 442, 29, 29, 30, 61, 474, 505, 62, 93, 253, 284,
+ 348, 379, 94, 125, 126, 157, 285, 316, 158, 189, 443, 474, 380, 411,
+ 190, 221, 317, 348, 222, 253, 412, 443, 475, 506, 30, 30, 31, 62,
+ 349, 380, 254, 285, 63, 94, 95, 126, 127, 158, 444, 475, 286, 317,
+ 381, 412, 159, 190, 191, 222, 318, 349, 476, 507, 223, 254, 413, 444,
+ 350, 381, 255, 286, 445, 476, 287, 318, 382, 413, 319, 350, 477, 508,
+ 414, 445, 351, 382, 446, 477, 383, 414, 478, 509, 415, 446, 447, 478,
+ 479, 510, 480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516,
+ 512, 512, 513, 544, 486, 517, 514, 545, 515, 546, 487, 518, 516, 547,
+ 517, 548, 488, 519, 544, 544, 518, 549, 545, 576, 546, 577, 547, 578,
+ 489, 520, 519, 550, 548, 579, 549, 580, 520, 551, 490, 521, 550, 581,
+ 576, 576, 577, 608, 578, 609, 521, 552, 579, 610, 551, 582, 491, 522,
+ 580, 611, 581, 612, 552, 583, 522, 553, 582, 613, 492, 523, 608, 608,
+ 609, 640, 610, 641, 553, 584, 611, 642, 523, 554, 583, 614, 612, 643,
+ 554, 585, 493, 524, 584, 615, 613, 644, 524, 555, 614, 645, 640, 640,
+ 585, 616, 641, 672, 555, 586, 642, 673, 615, 646, 643, 674, 494, 525,
+ 644, 675, 525, 556, 586, 617, 616, 647, 645, 676, 556, 587, 646, 677,
+ 495, 526, 617, 648, 587, 618, 672, 672, 526, 557, 673, 704, 674, 705,
+ 647, 678, 557, 588, 675, 706, 618, 649, 676, 707, 588, 619, 648, 679,
+ 677, 708, 496, 527, 527, 558, 558, 589, 678, 709, 619, 650, 649, 680,
+ 704, 704, 589, 620, 705, 736, 679, 710, 706, 737, 707, 738, 650, 681,
+ 620, 651, 497, 528, 528, 559, 708, 739, 680, 711, 559, 590, 709, 740,
+ 590, 621, 651, 682, 681, 712, 710, 741, 621, 652, 736, 736, 737, 768,
+ 529, 560, 711, 742, 498, 529, 560, 591, 738, 769, 682, 713, 652, 683,
+ 739, 770, 591, 622, 740, 771, 712, 743, 622, 653, 741, 772, 683, 714,
+ 653, 684, 713, 744, 742, 773, 530, 561, 561, 592, 499, 530, 592, 623,
+ 623, 654, 743, 774, 768, 768, 769, 800, 684, 715, 714, 745, 770, 801,
+ 771, 802, 654, 685, 744, 775, 772, 803, 562, 593, 531, 562, 593, 624,
+ 715, 746, 773, 804, 685, 716, 500, 531, 624, 655, 745, 776, 774, 805,
+ 655, 686, 716, 747, 775, 806, 746, 777, 800, 800, 801, 832, 686, 717,
+ 802, 833, 563, 594, 594, 625, 803, 834, 532, 563, 625, 656, 776, 807,
+ 804, 835, 501, 532, 656, 687, 747, 778, 717, 748, 805, 836, 777, 808,
+ 687, 718, 806, 837, 748, 779, 595, 626, 564, 595, 626, 657, 718, 749,
+ 778, 809, 807, 838, 832, 832, 533, 564, 657, 688, 833, 864, 834, 865,
+ 835, 866, 502, 533, 688, 719, 808, 839, 749, 780, 836, 867, 779, 810,
+ 719, 750, 837, 868, 809, 840, 596, 627, 627, 658, 565, 596, 658, 689,
+ 838, 869, 780, 811, 750, 781, 534, 565, 689, 720, 810, 841, 839, 870,
+ 864, 864, 503, 534, 720, 751, 865, 896, 866, 897, 840, 871, 867, 898,
+ 781, 812, 811, 842, 628, 659, 868, 899, 751, 782, 597, 628, 659, 690,
+ 566, 597, 690, 721, 869, 900, 841, 872, 535, 566, 721, 752, 812, 843,
+ 870, 901, 782, 813, 842, 873, 504, 535, 752, 783, 871, 902, 629, 660,
+ 660, 691, 896, 896, 897, 928, 598, 629, 691, 722, 813, 844, 898, 929,
+ 872, 903, 783, 814, 843, 874, 899, 930, 567, 598, 722, 753, 900, 931,
+ 536, 567, 753, 784, 873, 904, 901, 932, 814, 845, 844, 875, 902, 933,
+ 505, 536, 784, 815, 661, 692, 630, 661, 692, 723, 874, 905, 599, 630,
+ 723, 754, 903, 934, 845, 876, 568, 599, 754, 785, 928, 928, 815, 846,
+ 929, 960, 930, 961, 875, 906, 904, 935, 931, 962, 537, 568, 785, 816,
+ 932, 963, 905, 936, 662, 693, 693, 724, 846, 877, 933, 964, 876, 907,
+ 631, 662, 724, 755, 506, 537, 816, 847, 934, 965, 600, 631, 755, 786,
+ 906, 937, 569, 600, 786, 817, 935, 966, 877, 908, 847, 878, 960, 960,
+ 907, 938, 961, 992, 936, 967, 538, 569, 817, 848, 962, 993, 694, 725,
+ 663, 694, 725, 756, 963, 994, 632, 663, 756, 787, 964, 995, 878, 909,
+ 937, 968, 507, 538, 848, 879, 908, 939, 601, 632, 787, 818, 965, 996,
+ 966, 997, 570, 601, 818, 849, 938, 969, 879, 910, 909, 940, 967, 998,
+ 695, 726, 726, 757, 664, 695, 757, 788, 539, 570, 849, 880, 939, 970,
+ 633, 664, 788, 819, 968, 999, 602, 633, 819, 850, 910, 941, 508, 539,
+ 880, 911, 969, 1000, 940, 971, 571, 602, 850, 881, 727, 758, 696, 727,
+ 758, 789, 970, 1001, 665, 696, 789, 820, 911, 942, 941, 972, 540, 571,
+ 881, 912, 634, 665, 820, 851, 971, 1002, 603, 634, 851, 882, 942, 973,
+ 509, 540, 912, 943, 728, 759, 759, 790, 972, 1003, 572, 603, 882, 913,
+ 697, 728, 790, 821, 666, 697, 821, 852, 943, 974, 635, 666, 852, 883,
+ 541, 572, 913, 944, 973, 1004, 604, 635, 883, 914, 760, 791, 729, 760,
+ 791, 822, 510, 541, 944, 975, 974, 1005, 698, 729, 822, 853, 573, 604,
+ 914, 945, 667, 698, 853, 884, 636, 667, 884, 915, 975, 1006, 542, 573,
+ 945, 976, 761, 792, 792, 823, 605, 636, 915, 946, 730, 761, 823, 854,
+ 699, 730, 854, 885, 511, 542, 976, 1007, 574, 605, 946, 977, 668, 699,
+ 885, 916, 637, 668, 916, 947, 543, 574, 793, 824, 977, 1008, 762, 793,
+ 824, 855, 731, 762, 855, 886, 606, 637, 947, 978, 700, 731, 886, 917,
+ 669, 700, 917, 948, 575, 606, 978, 1009, 638, 669, 948, 979, 794, 825,
+ 825, 856, 763, 794, 856, 887, 732, 763, 887, 918, 607, 638, 979, 1010,
+ 701, 732, 918, 949, 670, 701, 949, 980, 826, 857, 795, 826, 857, 888,
+ 764, 795, 888, 919, 639, 670, 980, 1011, 733, 764, 919, 950, 702, 733,
+ 950, 981, 671, 702, 981, 1012, 827, 858, 858, 889, 796, 827, 889, 920,
+ 765, 796, 920, 951, 734, 765, 951, 982, 703, 734, 982, 1013, 859, 890,
+ 828, 859, 890, 921, 797, 828, 921, 952, 766, 797, 952, 983, 735, 766,
+ 983, 1014, 860, 891, 891, 922, 829, 860, 922, 953, 798, 829, 953, 984,
+ 767, 798, 984, 1015, 892, 923, 861, 892, 923, 954, 830, 861, 954, 985,
+ 799, 830, 985, 1016, 893, 924, 924, 955, 862, 893, 955, 986, 831, 862,
+ 986, 1017, 925, 956, 894, 925, 956, 987, 863, 894, 987, 1018, 926, 957,
+ 957, 988, 895, 926, 988, 1019, 958, 989, 927, 958, 989, 1020, 959, 990,
+ 990, 1021, 991, 1022, 0, 0
+};
+
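+/* Each *_neighbors table stores, for every position in scan order, the two
+ * previously scanned positions (MAX_NEIGHBORS = 2) whose coded tokens form
+ * the entropy context of the current coefficient; a 32x32 table therefore
+ * holds 1025 = 32 * 32 + 1 pairs, the extra trailing { 0, 0 } pair acting
+ * as a sentinel, as in the table above.
+ *
+ * Minimal usage sketch, assuming a raster-order token_cache[] of coded
+ * token magnitudes and mirroring get_coef_context() in
+ * av1/common/entropy.h; the helper name is illustrative, not part of the
+ * imported data:
+ *
+ * static inline int neighbor_ctx_sketch(const int16_t *neighbors,
+ *                                       const uint8_t *token_cache, int c) {
+ *   // Average the tokens of the two stored neighbors, rounding up.
+ *   return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
+ *           token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
+ * }
+ */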
+DECLARE_ALIGNED(16, static const int16_t,
+ qtr_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 32, 1, 1, 32, 32, 2, 33,
+ 33, 64, 34, 65, 2, 2, 64, 64, 3, 34, 65, 96, 35, 66,
+ 66, 97, 3, 3, 96, 96, 4, 35, 97, 128, 67, 98, 36, 67,
+ 98, 129, 4, 4, 68, 99, 99, 130, 128, 128, 5, 36, 129, 160,
+ 37, 68, 130, 161, 100, 131, 69, 100, 131, 162, 5, 5, 160, 160,
+ 6, 37, 161, 192, 38, 69, 162, 193, 101, 132, 132, 163, 70, 101,
+ 163, 194, 6, 6, 192, 192, 7, 38, 133, 164, 193, 224, 102, 133,
+ 164, 195, 39, 70, 194, 225, 71, 102, 195, 226, 134, 165, 165, 196,
+ 7, 7, 224, 224, 8, 39, 103, 134, 196, 227, 225, 256, 40, 71,
+ 226, 257, 166, 197, 72, 103, 227, 258, 135, 166, 197, 228, 104, 135,
+ 228, 259, 8, 8, 256, 256, 9, 40, 257, 288, 41, 72, 167, 198,
+ 198, 229, 258, 289, 136, 167, 229, 260, 73, 104, 259, 290, 105, 136,
+ 260, 291, 199, 230, 9, 9, 168, 199, 230, 261, 288, 288, 10, 41,
+ 289, 320, 42, 73, 290, 321, 137, 168, 261, 292, 74, 105, 291, 322,
+ 200, 231, 231, 262, 106, 137, 292, 323, 169, 200, 262, 293, 10, 10,
+ 320, 320, 11, 42, 321, 352, 43, 74, 138, 169, 293, 324, 322, 353,
+ 232, 263, 75, 106, 201, 232, 263, 294, 323, 354, 170, 201, 294, 325,
+ 107, 138, 324, 355, 11, 11, 352, 352, 12, 43, 233, 264, 264, 295,
+ 353, 384, 139, 170, 325, 356, 44, 75, 354, 385, 202, 233, 295, 326,
+ 76, 107, 355, 386, 171, 202, 326, 357, 108, 139, 356, 387, 265, 296,
+ 234, 265, 296, 327, 12, 12, 140, 171, 357, 388, 384, 384, 13, 44,
+ 203, 234, 327, 358, 385, 416, 45, 76, 386, 417, 77, 108, 387, 418,
+ 172, 203, 358, 389, 266, 297, 297, 328, 109, 140, 235, 266, 328, 359,
+ 388, 419, 204, 235, 359, 390, 141, 172, 389, 420, 13, 13, 416, 416,
+ 14, 45, 417, 448, 46, 77, 298, 329, 418, 449, 267, 298, 329, 360,
+ 78, 109, 173, 204, 390, 421, 419, 450, 236, 267, 360, 391, 110, 141,
+ 420, 451, 205, 236, 391, 422, 142, 173, 299, 330, 330, 361, 421, 452,
+ 14, 14, 268, 299, 361, 392, 448, 448, 15, 46, 449, 480, 47, 78,
+ 450, 481, 174, 205, 422, 453, 237, 268, 392, 423, 79, 110, 451, 482,
+ 111, 142, 452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454,
+ 143, 174, 269, 300, 393, 424, 453, 484, 238, 269, 424, 455, 175, 206,
+ 454, 485, 332, 363, 363, 394, 301, 332, 394, 425, 207, 238, 455, 486,
+ 270, 301, 425, 456, 364, 395, 239, 270, 456, 487, 333, 364, 395, 426,
+ 302, 333, 426, 457, 271, 302, 457, 488, 365, 396, 396, 427, 334, 365,
+ 427, 458, 303, 334, 458, 489, 397, 428, 366, 397, 428, 459, 335, 366,
+ 459, 490, 398, 429, 429, 460, 367, 398, 460, 491, 430, 461, 399, 430,
+ 461, 492, 431, 462, 462, 493, 463, 494, 15, 15, 480, 480, 16, 47,
+ 481, 512, 48, 79, 482, 513, 80, 111, 483, 514, 112, 143, 484, 515,
+ 144, 175, 485, 516, 16, 16, 512, 512, 17, 48, 513, 544, 176, 207,
+ 486, 517, 49, 80, 514, 545, 81, 112, 515, 546, 113, 144, 208, 239,
+ 487, 518, 516, 547, 145, 176, 517, 548, 240, 271, 488, 519, 17, 17,
+ 544, 544, 18, 49, 177, 208, 518, 549, 545, 576, 50, 81, 546, 577,
+ 82, 113, 547, 578, 272, 303, 489, 520, 209, 240, 519, 550, 114, 145,
+ 548, 579, 146, 177, 549, 580, 241, 272, 520, 551, 304, 335, 490, 521,
+ 178, 209, 550, 581, 18, 18, 576, 576, 19, 50, 577, 608, 51, 82,
+ 578, 609, 83, 114, 273, 304, 521, 552, 579, 610, 210, 241, 551, 582,
+ 115, 146, 336, 367, 491, 522, 580, 611, 147, 178, 581, 612, 242, 273,
+ 552, 583, 305, 336, 522, 553, 179, 210, 582, 613, 19, 19, 368, 399,
+ 492, 523, 608, 608, 20, 51, 609, 640, 52, 83, 610, 641, 274, 305,
+ 553, 584, 84, 115, 611, 642, 211, 242, 337, 368, 523, 554, 583, 614,
+ 116, 147, 612, 643, 306, 337, 554, 585, 148, 179, 243, 274, 400, 431,
+ 493, 524, 584, 615, 613, 644, 369, 400, 524, 555, 180, 211, 614, 645,
+ 20, 20, 640, 640, 21, 52, 275, 306, 585, 616, 641, 672, 53, 84,
+ 338, 369, 555, 586, 642, 673, 212, 243, 615, 646, 85, 116, 643, 674,
+ 432, 463, 494, 525, 117, 148, 644, 675, 401, 432, 525, 556, 307, 338,
+ 586, 617, 244, 275, 616, 647, 149, 180, 645, 676, 370, 401, 556, 587,
+ 181, 212, 646, 677, 276, 307, 464, 495, 495, 526, 617, 648, 339, 370,
+ 587, 618, 21, 21, 672, 672, 22, 53, 433, 464, 526, 557, 673, 704,
+ 54, 85, 674, 705, 213, 244, 647, 678, 86, 117, 402, 433, 557, 588,
+ 675, 706, 118, 149, 308, 339, 618, 649, 676, 707, 245, 276, 371, 402,
+ 588, 619, 648, 679, 150, 181, 677, 708, 496, 527, 465, 496, 527, 558,
+ 182, 213, 434, 465, 558, 589, 678, 709, 340, 371, 619, 650, 277, 308,
+ 649, 680, 22, 22, 704, 704, 23, 54, 403, 434, 589, 620, 705, 736,
+ 55, 86, 214, 245, 679, 710, 706, 737, 87, 118, 707, 738, 309, 340,
+ 650, 681, 372, 403, 620, 651, 119, 150, 497, 528, 528, 559, 708, 739,
+ 246, 277, 680, 711, 466, 497, 559, 590, 151, 182, 709, 740, 435, 466,
+ 590, 621, 341, 372, 651, 682, 183, 214, 278, 309, 681, 712, 710, 741,
+ 404, 435, 621, 652, 23, 23, 736, 736, 24, 55, 737, 768, 215, 246,
+ 529, 560, 711, 742, 56, 87, 498, 529, 560, 591, 738, 769, 310, 341,
+ 682, 713, 88, 119, 373, 404, 652, 683, 739, 770, 467, 498, 591, 622,
+ 120, 151, 740, 771, 247, 278, 712, 743, 436, 467, 622, 653, 152, 183,
+ 741, 772, 342, 373, 683, 714, 279, 310, 405, 436, 653, 684, 713, 744,
+ 184, 215, 742, 773, 530, 561, 561, 592, 499, 530, 592, 623, 24, 24,
+ 216, 247, 468, 499, 623, 654, 743, 774, 768, 768, 25, 56, 769, 800,
+ 374, 405, 684, 715, 57, 88, 311, 342, 714, 745, 770, 801, 89, 120,
+ 771, 802, 437, 468, 654, 685, 248, 279, 744, 775, 121, 152, 772, 803,
+ 562, 593, 153, 184, 343, 374, 531, 562, 593, 624, 715, 746, 773, 804,
+ 406, 437, 685, 716, 500, 531, 624, 655, 280, 311, 745, 776, 185, 216,
+ 774, 805, 469, 500, 655, 686, 375, 406, 716, 747, 217, 248, 775, 806,
+ 25, 25, 312, 343, 746, 777, 800, 800, 26, 57, 801, 832, 58, 89,
+ 438, 469, 686, 717, 802, 833, 90, 121, 563, 594, 594, 625, 803, 834,
+ 249, 280, 532, 563, 625, 656, 776, 807, 122, 153, 804, 835, 344, 375,
+ 501, 532, 656, 687, 747, 778, 407, 438, 717, 748, 154, 185, 805, 836,
+ 281, 312, 777, 808, 470, 501, 687, 718, 186, 217, 806, 837, 376, 407,
+ 748, 779, 595, 626, 564, 595, 626, 657, 218, 249, 313, 344, 439, 470,
+ 718, 749, 778, 809, 807, 838, 26, 26, 832, 832, 27, 58, 533, 564,
+ 657, 688, 833, 864, 59, 90, 834, 865, 91, 122, 835, 866, 250, 281,
+ 502, 533, 688, 719, 808, 839, 123, 154, 408, 439, 749, 780, 836, 867,
+ 345, 376, 779, 810, 155, 186, 471, 502, 719, 750, 837, 868, 282, 313,
+ 809, 840, 596, 627, 627, 658, 187, 218, 565, 596, 658, 689, 838, 869,
+ 377, 408, 780, 811, 440, 471, 750, 781, 534, 565, 689, 720, 314, 345,
+ 810, 841, 219, 250, 839, 870, 27, 27, 864, 864, 28, 59, 503, 534,
+ 720, 751, 865, 896, 60, 91, 866, 897, 92, 123, 251, 282, 840, 871,
+ 867, 898, 409, 440, 781, 812, 346, 377, 811, 842, 124, 155, 628, 659,
+ 868, 899, 472, 503, 751, 782, 597, 628, 659, 690, 566, 597, 690, 721,
+ 156, 187, 869, 900, 283, 314, 841, 872, 535, 566, 721, 752, 188, 219,
+ 378, 409, 812, 843, 870, 901, 441, 472, 782, 813, 315, 346, 842, 873,
+ 504, 535, 752, 783, 220, 251, 871, 902, 28, 28, 629, 660, 660, 691,
+ 896, 896, 29, 60, 897, 928, 61, 92, 410, 441, 598, 629, 691, 722,
+ 813, 844, 898, 929, 252, 283, 872, 903, 93, 124, 347, 378, 473, 504,
+ 783, 814, 843, 874, 899, 930, 567, 598, 722, 753, 125, 156, 900, 931,
+ 284, 315, 536, 567, 753, 784, 873, 904, 157, 188, 901, 932, 442, 473,
+ 814, 845, 379, 410, 844, 875, 189, 220, 902, 933, 505, 536, 784, 815,
+ 661, 692, 316, 347, 630, 661, 692, 723, 874, 905, 221, 252, 599, 630,
+ 723, 754, 903, 934, 411, 442, 845, 876, 29, 29, 568, 599, 754, 785,
+ 928, 928, 30, 61, 474, 505, 815, 846, 929, 960, 62, 93, 930, 961,
+ 253, 284, 348, 379, 875, 906, 904, 935, 94, 125, 931, 962, 537, 568,
+ 785, 816, 126, 157, 932, 963, 285, 316, 905, 936, 158, 189, 443, 474,
+ 662, 693, 693, 724, 846, 877, 933, 964, 380, 411, 876, 907, 631, 662,
+ 724, 755, 506, 537, 816, 847, 190, 221, 934, 965, 600, 631, 755, 786,
+ 317, 348, 906, 937, 222, 253, 569, 600, 786, 817, 935, 966, 412, 443,
+ 877, 908, 475, 506, 847, 878, 30, 30, 960, 960, 31, 62, 349, 380,
+ 907, 938, 961, 992, 254, 285, 936, 967, 63, 94, 538, 569, 817, 848,
+ 962, 993, 694, 725, 95, 126, 663, 694, 725, 756, 963, 994, 632, 663,
+ 756, 787, 127, 158, 964, 995, 444, 475, 878, 909, 286, 317, 937, 968,
+ 381, 412, 507, 538, 848, 879, 908, 939, 159, 190, 601, 632, 787, 818,
+ 965, 996, 191, 222, 966, 997, 318, 349, 570, 601, 818, 849, 938, 969,
+ 476, 507, 879, 910, 223, 254, 413, 444, 909, 940, 967, 998, 695, 726,
+ 726, 757, 664, 695, 757, 788, 539, 570, 849, 880, 350, 381, 939, 970,
+ 255, 286, 633, 664, 788, 819, 968, 999, 445, 476, 602, 633, 819, 850,
+ 910, 941, 508, 539, 880, 911, 287, 318, 969, 1000, 382, 413, 940, 971,
+ 571, 602, 850, 881, 727, 758, 696, 727, 758, 789, 319, 350, 970, 1001,
+ 477, 508, 665, 696, 789, 820, 911, 942, 414, 445, 941, 972, 540, 571,
+ 881, 912, 634, 665, 820, 851, 351, 382, 971, 1002, 603, 634, 851, 882,
+ 446, 477, 942, 973, 509, 540, 912, 943, 383, 414, 728, 759, 759, 790,
+ 972, 1003, 572, 603, 882, 913, 697, 728, 790, 821, 666, 697, 821, 852,
+ 478, 509, 943, 974, 635, 666, 852, 883, 415, 446, 541, 572, 913, 944,
+ 973, 1004, 604, 635, 883, 914, 760, 791, 729, 760, 791, 822, 510, 541,
+ 944, 975, 447, 478, 974, 1005, 698, 729, 822, 853, 573, 604, 914, 945,
+ 667, 698, 853, 884, 636, 667, 884, 915, 479, 510, 975, 1006, 542, 573,
+ 945, 976, 761, 792, 792, 823, 605, 636, 915, 946, 730, 761, 823, 854,
+ 699, 730, 854, 885, 511, 542, 976, 1007, 574, 605, 946, 977, 668, 699,
+ 885, 916, 637, 668, 916, 947, 543, 574, 793, 824, 977, 1008, 762, 793,
+ 824, 855, 731, 762, 855, 886, 606, 637, 947, 978, 700, 731, 886, 917,
+ 669, 700, 917, 948, 575, 606, 978, 1009, 638, 669, 948, 979, 794, 825,
+ 825, 856, 763, 794, 856, 887, 732, 763, 887, 918, 607, 638, 979, 1010,
+ 701, 732, 918, 949, 670, 701, 949, 980, 826, 857, 795, 826, 857, 888,
+ 764, 795, 888, 919, 639, 670, 980, 1011, 733, 764, 919, 950, 702, 733,
+ 950, 981, 671, 702, 981, 1012, 827, 858, 858, 889, 796, 827, 889, 920,
+ 765, 796, 920, 951, 734, 765, 951, 982, 703, 734, 982, 1013, 859, 890,
+ 828, 859, 890, 921, 797, 828, 921, 952, 766, 797, 952, 983, 735, 766,
+ 983, 1014, 860, 891, 891, 922, 829, 860, 922, 953, 798, 829, 953, 984,
+ 767, 798, 984, 1015, 892, 923, 861, 892, 923, 954, 830, 861, 954, 985,
+ 799, 830, 985, 1016, 893, 924, 924, 955, 862, 893, 955, 986, 831, 862,
+ 986, 1017, 925, 956, 894, 925, 956, 987, 863, 894, 987, 1018, 926, 957,
+ 957, 988, 895, 926, 988, 1019, 958, 989, 927, 958, 989, 1020, 959, 990,
+ 990, 1021, 991, 1022, 0, 0
+};
+
+#if CONFIG_TX64X64
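+/* 64x64 scan: 4097 = 64 * 64 + 1 neighbor pairs, again terminated by a
+ * { 0, 0 } sentinel pair. */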
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_64x64_neighbors[4097 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 64, 1, 1, 64, 64, 2,
+ 65, 65, 128, 66, 129, 2, 2, 128, 128, 3, 66, 129, 192,
+ 67, 130, 130, 193, 3, 3, 192, 192, 4, 67, 193, 256, 131,
+ 194, 68, 131, 194, 257, 4, 4, 132, 195, 195, 258, 256, 256,
+ 5, 68, 257, 320, 69, 132, 258, 321, 196, 259, 133, 196, 259,
+ 322, 5, 5, 320, 320, 6, 69, 321, 384, 70, 133, 322, 385,
+ 197, 260, 260, 323, 134, 197, 323, 386, 6, 6, 384, 384, 7,
+ 70, 261, 324, 385, 448, 198, 261, 324, 387, 71, 134, 386, 449,
+ 135, 198, 387, 450, 262, 325, 325, 388, 7, 7, 448, 448, 8,
+ 71, 199, 262, 388, 451, 449, 512, 72, 135, 450, 513, 326, 389,
+ 136, 199, 451, 514, 263, 326, 389, 452, 200, 263, 452, 515, 8,
+ 8, 512, 512, 9, 72, 513, 576, 73, 136, 327, 390, 390, 453,
+ 514, 577, 264, 327, 453, 516, 137, 200, 515, 578, 201, 264, 516,
+ 579, 391, 454, 9, 9, 328, 391, 454, 517, 576, 576, 10, 73,
+ 577, 640, 74, 137, 578, 641, 265, 328, 517, 580, 138, 201, 579,
+ 642, 392, 455, 455, 518, 202, 265, 580, 643, 329, 392, 518, 581,
+ 10, 10, 640, 640, 11, 74, 641, 704, 75, 138, 266, 329, 581,
+ 644, 642, 705, 456, 519, 139, 202, 393, 456, 519, 582, 643, 706,
+ 330, 393, 582, 645, 203, 266, 644, 707, 11, 11, 704, 704, 12,
+ 75, 457, 520, 520, 583, 705, 768, 267, 330, 645, 708, 76, 139,
+ 706, 769, 394, 457, 583, 646, 140, 203, 707, 770, 331, 394, 646,
+ 709, 204, 267, 708, 771, 521, 584, 458, 521, 584, 647, 12, 12,
+ 268, 331, 709, 772, 768, 768, 13, 76, 395, 458, 647, 710, 769,
+ 832, 77, 140, 770, 833, 141, 204, 771, 834, 332, 395, 710, 773,
+ 522, 585, 585, 648, 205, 268, 459, 522, 648, 711, 772, 835, 396,
+ 459, 711, 774, 269, 332, 773, 836, 13, 13, 832, 832, 14, 77,
+ 833, 896, 78, 141, 586, 649, 834, 897, 523, 586, 649, 712, 142,
+ 205, 333, 396, 774, 837, 835, 898, 460, 523, 712, 775, 206, 269,
+ 836, 899, 397, 460, 775, 838, 270, 333, 587, 650, 650, 713, 837,
+ 900, 14, 14, 524, 587, 713, 776, 896, 896, 15, 78, 897, 960,
+ 79, 142, 898, 961, 334, 397, 838, 901, 461, 524, 776, 839, 143,
+ 206, 899, 962, 207, 270, 900, 963, 651, 714, 588, 651, 714, 777,
+ 398, 461, 839, 902, 271, 334, 525, 588, 777, 840, 901, 964, 15,
+ 15, 960, 960, 16, 79, 961, 1024, 80, 143, 462, 525, 840, 903,
+ 962, 1025, 335, 398, 902, 965, 144, 207, 652, 715, 715, 778, 963,
+ 1026, 589, 652, 778, 841, 208, 271, 964, 1027, 399, 462, 903, 966,
+ 526, 589, 841, 904, 272, 335, 965, 1028, 716, 779, 16, 16, 463,
+ 526, 904, 967, 1024, 1024, 17, 80, 653, 716, 779, 842, 1025, 1088,
+ 336, 399, 966, 1029, 81, 144, 1026, 1089, 590, 653, 842, 905, 145,
+ 208, 1027, 1090, 209, 272, 400, 463, 967, 1030, 1028, 1091, 527, 590,
+ 905, 968, 717, 780, 780, 843, 273, 336, 1029, 1092, 654, 717, 843,
+ 906, 464, 527, 968, 1031, 17, 17, 1088, 1088, 18, 81, 337, 400,
+ 591, 654, 906, 969, 1030, 1093, 1089, 1152, 82, 145, 1090, 1153, 146,
+ 209, 1091, 1154, 528, 591, 969, 1032, 401, 464, 781, 844, 1031, 1094,
+ 210, 273, 718, 781, 844, 907, 1092, 1155, 655, 718, 907, 970, 274,
+ 337, 1093, 1156, 465, 528, 1032, 1095, 592, 655, 970, 1033, 338, 401,
+ 1094, 1157, 18, 18, 1152, 1152, 19, 82, 1153, 1216, 83, 146, 782,
+ 845, 845, 908, 1154, 1217, 719, 782, 908, 971, 147, 210, 529, 592,
+ 1033, 1096, 1155, 1218, 402, 465, 1095, 1158, 211, 274, 656, 719, 971,
+ 1034, 1156, 1219, 275, 338, 1157, 1220, 466, 529, 1096, 1159, 593, 656,
+ 1034, 1097, 846, 909, 783, 846, 909, 972, 339, 402, 1158, 1221, 19,
+ 19, 720, 783, 972, 1035, 1216, 1216, 20, 83, 1217, 1280, 84, 147,
+ 1218, 1281, 530, 593, 1097, 1160, 148, 211, 1219, 1282, 403, 466, 657,
+ 720, 1035, 1098, 1159, 1222, 212, 275, 1220, 1283, 847, 910, 910, 973,
+ 594, 657, 1098, 1161, 276, 339, 467, 530, 784, 847, 973, 1036, 1160,
+ 1223, 1221, 1284, 721, 784, 1036, 1099, 340, 403, 1222, 1285, 20, 20,
+ 1280, 1280, 21, 84, 531, 594, 1161, 1224, 1281, 1344, 85, 148, 658,
+ 721, 1099, 1162, 1282, 1345, 404, 467, 1223, 1286, 149, 212, 911, 974,
+ 1283, 1346, 848, 911, 974, 1037, 213, 276, 1284, 1347, 785, 848, 1037,
+ 1100, 595, 658, 1162, 1225, 468, 531, 1224, 1287, 277, 340, 1285, 1348,
+ 722, 785, 1100, 1163, 341, 404, 1286, 1349, 532, 595, 912, 975, 975,
+ 1038, 1225, 1288, 659, 722, 1163, 1226, 21, 21, 1344, 1344, 22, 85,
+ 849, 912, 1038, 1101, 1345, 1408, 86, 149, 1346, 1409, 405, 468, 1287,
+ 1350, 150, 213, 786, 849, 1101, 1164, 1347, 1410, 214, 277, 596, 659,
+ 1226, 1289, 1348, 1411, 469, 532, 723, 786, 1164, 1227, 1288, 1351, 278,
+ 341, 1349, 1412, 976, 1039, 913, 976, 1039, 1102, 342, 405, 850, 913,
+ 1102, 1165, 1350, 1413, 660, 723, 1227, 1290, 533, 596, 1289, 1352, 22,
+ 22, 1408, 1408, 23, 86, 787, 850, 1165, 1228, 1409, 1472, 87, 150,
+ 406, 469, 1351, 1414, 1410, 1473, 151, 214, 1411, 1474, 597, 660, 1290,
+ 1353, 724, 787, 1228, 1291, 215, 278, 977, 1040, 1040, 1103, 1412, 1475,
+ 470, 533, 1352, 1415, 914, 977, 1103, 1166, 279, 342, 1413, 1476, 851,
+ 914, 1166, 1229, 661, 724, 1291, 1354, 343, 406, 534, 597, 1353, 1416,
+ 1414, 1477, 788, 851, 1229, 1292, 23, 23, 1472, 1472, 24, 87, 1473,
+ 1536, 407, 470, 1041, 1104, 1415, 1478, 88, 151, 978, 1041, 1104, 1167,
+ 1474, 1537, 598, 661, 1354, 1417, 152, 215, 725, 788, 1292, 1355, 1475,
+ 1538, 915, 978, 1167, 1230, 216, 279, 1476, 1539, 471, 534, 1416, 1479,
+ 852, 915, 1230, 1293, 280, 343, 1477, 1540, 662, 725, 1355, 1418, 535,
+ 598, 789, 852, 1293, 1356, 1417, 1480, 344, 407, 1478, 1541, 1042, 1105,
+ 1105, 1168, 979, 1042, 1168, 1231, 24, 24, 408, 471, 916, 979, 1231,
+ 1294, 1479, 1542, 1536, 1536, 25, 88, 1537, 1600, 726, 789, 1356, 1419,
+ 89, 152, 599, 662, 1418, 1481, 1538, 1601, 153, 216, 1539, 1602, 853,
+ 916, 1294, 1357, 472, 535, 1480, 1543, 217, 280, 1540, 1603, 1106, 1169,
+ 281, 344, 663, 726, 1043, 1106, 1169, 1232, 1419, 1482, 1541, 1604, 790,
+ 853, 1357, 1420, 980, 1043, 1232, 1295, 536, 599, 1481, 1544, 345, 408,
+ 1542, 1605, 917, 980, 1295, 1358, 727, 790, 1420, 1483, 409, 472, 1543,
+ 1606, 25, 25, 600, 663, 1482, 1545, 1600, 1600, 26, 89, 1601, 1664,
+ 90, 153, 854, 917, 1358, 1421, 1602, 1665, 154, 217, 1107, 1170, 1170,
+ 1233, 1603, 1666, 473, 536, 1044, 1107, 1233, 1296, 1544, 1607, 218, 281,
+ 1604, 1667, 664, 727, 981, 1044, 1296, 1359, 1483, 1546, 791, 854, 1421,
+ 1484, 282, 345, 1605, 1668, 537, 600, 1545, 1608, 918, 981, 1359, 1422,
+ 346, 409, 1606, 1669, 728, 791, 1484, 1547, 1171, 1234, 1108, 1171, 1234,
+ 1297, 410, 473, 601, 664, 855, 918, 1422, 1485, 1546, 1609, 1607, 1670,
+ 26, 26, 1664, 1664, 27, 90, 1045, 1108, 1297, 1360, 1665, 1728, 91,
+ 154, 1666, 1729, 155, 218, 1667, 1730, 474, 537, 982, 1045, 1360, 1423,
+ 1608, 1671, 219, 282, 792, 855, 1485, 1548, 1668, 1731, 665, 728, 1547,
+ 1610, 283, 346, 919, 982, 1423, 1486, 1669, 1732, 538, 601, 1609, 1672,
+ 1172, 1235, 1235, 1298, 347, 410, 1109, 1172, 1298, 1361, 1670, 1733, 729,
+ 792, 1548, 1611, 856, 919, 1486, 1549, 1046, 1109, 1361, 1424, 602, 665,
+ 1610, 1673, 411, 474, 1671, 1734, 27, 27, 1728, 1728, 28, 91, 983,
+ 1046, 1424, 1487, 1729, 1792, 92, 155, 1730, 1793, 156, 219, 475, 538,
+ 1672, 1735, 1731, 1794, 793, 856, 1549, 1612, 666, 729, 1611, 1674, 220,
+ 283, 1236, 1299, 1732, 1795, 920, 983, 1487, 1550, 1173, 1236, 1299, 1362,
+ 1110, 1173, 1362, 1425, 284, 347, 1733, 1796, 539, 602, 1673, 1736, 1047,
+ 1110, 1425, 1488, 348, 411, 730, 793, 1612, 1675, 1734, 1797, 857, 920,
+ 1550, 1613, 603, 666, 1674, 1737, 984, 1047, 1488, 1551, 412, 475, 1735,
+ 1798, 28, 28, 1237, 1300, 1300, 1363, 1792, 1792, 29, 92, 1793, 1856,
+ 93, 156, 794, 857, 1174, 1237, 1363, 1426, 1613, 1676, 1794, 1857, 476,
+ 539, 1736, 1799, 157, 220, 667, 730, 921, 984, 1551, 1614, 1675, 1738,
+ 1795, 1858, 1111, 1174, 1426, 1489, 221, 284, 1796, 1859, 540, 603, 1048,
+ 1111, 1489, 1552, 1737, 1800, 285, 348, 1797, 1860, 858, 921, 1614, 1677,
+ 731, 794, 1676, 1739, 349, 412, 1798, 1861, 985, 1048, 1552, 1615, 1301,
+ 1364, 604, 667, 1238, 1301, 1364, 1427, 1738, 1801, 413, 476, 1175, 1238,
+ 1427, 1490, 1799, 1862, 795, 858, 1677, 1740, 29, 29, 1112, 1175, 1490,
+ 1553, 1856, 1856, 30, 93, 922, 985, 1615, 1678, 1857, 1920, 94, 157,
+ 1858, 1921, 477, 540, 668, 731, 1739, 1802, 1800, 1863, 158, 221, 1859,
+ 1922, 1049, 1112, 1553, 1616, 222, 285, 1860, 1923, 541, 604, 1801, 1864,
+ 286, 349, 859, 922, 1302, 1365, 1365, 1428, 1678, 1741, 1861, 1924, 732,
+ 795, 1740, 1803, 1239, 1302, 1428, 1491, 986, 1049, 1616, 1679, 350, 413,
+ 1862, 1925, 1176, 1239, 1491, 1554, 605, 668, 1802, 1865, 414, 477, 1113,
+ 1176, 1554, 1617, 1863, 1926, 796, 859, 1741, 1804, 923, 986, 1679, 1742,
+ 30, 30, 1920, 1920, 31, 94, 669, 732, 1803, 1866, 1921, 1984, 478,
+ 541, 1864, 1927, 95, 158, 1050, 1113, 1617, 1680, 1922, 1985, 1366, 1429,
+ 159, 222, 1303, 1366, 1429, 1492, 1923, 1986, 1240, 1303, 1492, 1555, 223,
+ 286, 1924, 1987, 860, 923, 1742, 1805, 542, 605, 1865, 1928, 733, 796,
+ 987, 1050, 1680, 1743, 1804, 1867, 287, 350, 1177, 1240, 1555, 1618, 1925,
+ 1988, 351, 414, 1926, 1989, 606, 669, 1114, 1177, 1618, 1681, 1866, 1929,
+ 924, 987, 1743, 1806, 415, 478, 797, 860, 1805, 1868, 1927, 1990, 1367,
+ 1430, 1430, 1493, 1304, 1367, 1493, 1556, 1051, 1114, 1681, 1744, 670, 733,
+ 1867, 1930, 31, 31, 1984, 1984, 32, 95, 479, 542, 1241, 1304, 1556,
+ 1619, 1928, 1991, 1985, 2048, 96, 159, 1986, 2049, 160, 223, 1987, 2050,
+ 861, 924, 1178, 1241, 1619, 1682, 1806, 1869, 224, 287, 988, 1051, 1744,
+ 1807, 1988, 2051, 543, 606, 1929, 1992, 734, 797, 1868, 1931, 288, 351,
+ 1989, 2052, 1115, 1178, 1682, 1745, 1431, 1494, 352, 415, 1368, 1431, 1494,
+ 1557, 1990, 2053, 607, 670, 1930, 1993, 925, 988, 1305, 1368, 1557, 1620,
+ 1807, 1870, 798, 861, 1869, 1932, 416, 479, 1052, 1115, 1745, 1808, 1991,
+ 2054, 1242, 1305, 1620, 1683, 671, 734, 1931, 1994, 480, 543, 1992, 2055,
+ 32, 32, 2048, 2048, 33, 96, 1179, 1242, 1683, 1746, 2049, 2112, 97,
+ 160, 2050, 2113, 862, 925, 1870, 1933, 989, 1052, 1808, 1871, 161, 224,
+ 2051, 2114, 225, 288, 544, 607, 735, 798, 1432, 1495, 1495, 1558, 1932,
+ 1995, 1993, 2056, 2052, 2115, 1116, 1179, 1746, 1809, 1369, 1432, 1558, 1621,
+ 289, 352, 2053, 2116, 1306, 1369, 1621, 1684, 608, 671, 1994, 2057, 353,
+ 416, 926, 989, 1871, 1934, 2054, 2117, 1243, 1306, 1684, 1747, 799, 862,
+ 1053, 1116, 1809, 1872, 1933, 1996, 417, 480, 2055, 2118, 672, 735, 1180,
+ 1243, 1747, 1810, 1995, 2058, 1496, 1559, 481, 544, 2056, 2119, 1433, 1496,
+ 1559, 1622, 33, 33, 990, 1053, 1872, 1935, 2112, 2112, 34, 97, 863,
+ 926, 1934, 1997, 2113, 2176, 98, 161, 1370, 1433, 1622, 1685, 2114, 2177,
+ 162, 225, 1117, 1180, 1810, 1873, 2115, 2178, 736, 799, 1996, 2059, 545,
+ 608, 1307, 1370, 1685, 1748, 2057, 2120, 226, 289, 2116, 2179, 290, 353,
+ 2117, 2180, 1244, 1307, 1748, 1811, 927, 990, 1935, 1998, 609, 672, 1054,
+ 1117, 1873, 1936, 2058, 2121, 354, 417, 2118, 2181, 800, 863, 1997, 2060,
+ 1497, 1560, 1560, 1623, 1181, 1244, 1811, 1874, 418, 481, 1434, 1497, 1623,
+ 1686, 2119, 2182, 673, 736, 2059, 2122, 1371, 1434, 1686, 1749, 991, 1054,
+ 1936, 1999, 482, 545, 864, 927, 1998, 2061, 2120, 2183, 1118, 1181, 1874,
+ 1937, 34, 34, 1308, 1371, 1749, 1812, 2176, 2176, 35, 98, 2177, 2240,
+ 99, 162, 2178, 2241, 737, 800, 2060, 2123, 163, 226, 2179, 2242, 546,
+ 609, 2121, 2184, 227, 290, 1245, 1308, 1812, 1875, 2180, 2243, 928, 991,
+ 1999, 2062, 291, 354, 1055, 1118, 1561, 1624, 1937, 2000, 2181, 2244, 1498,
+ 1561, 1624, 1687, 610, 673, 2122, 2185, 801, 864, 1435, 1498, 1687, 1750,
+ 2061, 2124, 355, 418, 1182, 1245, 1875, 1938, 2182, 2245, 1372, 1435, 1750,
+ 1813, 419, 482, 2183, 2246, 674, 737, 2123, 2186, 992, 1055, 2000, 2063,
+ 1309, 1372, 1813, 1876, 865, 928, 1119, 1182, 1938, 2001, 2062, 2125, 483,
+ 546, 2184, 2247, 35, 35, 2240, 2240, 36, 99, 2241, 2304, 100, 163,
+ 738, 801, 1246, 1309, 1876, 1939, 2124, 2187, 2242, 2305, 1562, 1625, 1625,
+ 1688, 164, 227, 1499, 1562, 1688, 1751, 2243, 2306, 547, 610, 2185, 2248,
+ 228, 291, 2244, 2307, 1056, 1119, 1436, 1499, 1751, 1814, 2001, 2064, 929,
+ 992, 2063, 2126, 292, 355, 2245, 2308, 1183, 1246, 1939, 2002, 611, 674,
+ 802, 865, 1373, 1436, 1814, 1877, 2125, 2188, 2186, 2249, 356, 419, 2246,
+ 2309, 1310, 1373, 1877, 1940, 420, 483, 993, 1056, 2064, 2127, 2247, 2310,
+ 675, 738, 2187, 2250, 1120, 1183, 2002, 2065, 866, 929, 1626, 1689, 2126,
+ 2189, 1563, 1626, 1689, 1752, 484, 547, 1500, 1563, 1752, 1815, 2248, 2311,
+ 1247, 1310, 1940, 2003, 36, 36, 739, 802, 2188, 2251, 2304, 2304, 37,
+ 100, 1437, 1500, 1815, 1878, 2305, 2368, 101, 164, 2306, 2369, 548, 611,
+ 2249, 2312, 165, 228, 1057, 1120, 2065, 2128, 2307, 2370, 930, 993, 2127,
+ 2190, 1374, 1437, 1878, 1941, 229, 292, 1184, 1247, 2003, 2066, 2308, 2371,
+ 293, 356, 803, 866, 2189, 2252, 2309, 2372, 612, 675, 2250, 2313, 1311,
+ 1374, 1941, 2004, 357, 420, 1627, 1690, 1690, 1753, 2310, 2373, 1564, 1627,
+ 1753, 1816, 994, 1057, 2128, 2191, 1121, 1184, 2066, 2129, 676, 739, 1501,
+ 1564, 1816, 1879, 2251, 2314, 421, 484, 2311, 2374, 867, 930, 2190, 2253,
+ 1248, 1311, 2004, 2067, 1438, 1501, 1879, 1942, 485, 548, 2312, 2375, 740,
+ 803, 2252, 2315, 37, 37, 2368, 2368, 38, 101, 1058, 1121, 1375, 1438,
+ 1942, 2005, 2129, 2192, 2369, 2432, 102, 165, 2370, 2433, 549, 612, 931,
+ 994, 1185, 1248, 2067, 2130, 2191, 2254, 2313, 2376, 166, 229, 2371, 2434,
+ 1691, 1754, 230, 293, 1628, 1691, 1754, 1817, 2372, 2435, 804, 867, 1312,
+ 1375, 2005, 2068, 2253, 2316, 1565, 1628, 1817, 1880, 294, 357, 613, 676,
+ 2314, 2377, 2373, 2436, 1502, 1565, 1880, 1943, 358, 421, 1122, 1185, 2130,
+ 2193, 2374, 2437, 995, 1058, 2192, 2255, 1249, 1312, 2068, 2131, 677, 740,
+ 1439, 1502, 1943, 2006, 2315, 2378, 868, 931, 2254, 2317, 422, 485, 2375,
+ 2438, 486, 549, 1376, 1439, 2006, 2069, 2376, 2439, 741, 804, 1692, 1755,
+ 1755, 1818, 2316, 2379, 1059, 1122, 2193, 2256, 1186, 1249, 1629, 1692, 1818,
+ 1881, 2131, 2194, 38, 38, 932, 995, 2255, 2318, 2432, 2432, 39, 102,
+ 2433, 2496, 103, 166, 550, 613, 1566, 1629, 1881, 1944, 2377, 2440, 2434,
+ 2497, 167, 230, 1313, 1376, 2069, 2132, 2435, 2498, 231, 294, 1503, 1566,
+ 1944, 2007, 2436, 2499, 805, 868, 2317, 2380, 614, 677, 2378, 2441, 295,
+ 358, 2437, 2500, 1123, 1186, 2194, 2257, 996, 1059, 2256, 2319, 1440, 1503,
+ 2007, 2070, 1250, 1313, 2132, 2195, 359, 422, 2438, 2501, 678, 741, 869,
+ 932, 2318, 2381, 2379, 2442, 1756, 1819, 423, 486, 1693, 1756, 1819, 1882,
+ 2439, 2502, 1377, 1440, 2070, 2133, 1630, 1693, 1882, 1945, 487, 550, 1060,
+ 1123, 2257, 2320, 2440, 2503, 1187, 1250, 1567, 1630, 1945, 2008, 2195, 2258,
+ 742, 805, 2380, 2443, 933, 996, 2319, 2382, 1314, 1377, 2133, 2196, 39,
+ 39, 1504, 1567, 2008, 2071, 2496, 2496, 40, 103, 2497, 2560, 551, 614,
+ 2441, 2504, 104, 167, 2498, 2561, 168, 231, 2499, 2562, 806, 869, 2381,
+ 2444, 232, 295, 2500, 2563, 1441, 1504, 2071, 2134, 1124, 1187, 2258, 2321,
+ 615, 678, 2442, 2505, 296, 359, 997, 1060, 1251, 1314, 1757, 1820, 1820,
+ 1883, 2196, 2259, 2320, 2383, 2501, 2564, 1694, 1757, 1883, 1946, 360, 423,
+ 2502, 2565, 1631, 1694, 1946, 2009, 870, 933, 1378, 1441, 2134, 2197, 2382,
+ 2445, 679, 742, 2443, 2506, 424, 487, 1568, 1631, 2009, 2072, 2503, 2566,
+ 1188, 1251, 2259, 2322, 1061, 1124, 2321, 2384, 488, 551, 2504, 2567, 743,
+ 806, 1505, 1568, 2072, 2135, 2444, 2507, 1315, 1378, 2197, 2260, 934, 997,
+ 2383, 2446, 40, 40, 552, 615, 2505, 2568, 2560, 2560, 41, 104, 1821,
+ 1884, 2561, 2624, 1758, 1821, 1884, 1947, 105, 168, 1442, 1505, 2135, 2198,
+ 2562, 2625, 169, 232, 807, 870, 1695, 1758, 1947, 2010, 2445, 2508, 2563,
+ 2626, 1125, 1188, 2322, 2385, 1252, 1315, 2260, 2323, 233, 296, 2564, 2627,
+ 616, 679, 998, 1061, 1632, 1695, 2010, 2073, 2384, 2447, 2506, 2569, 297,
+ 360, 2565, 2628, 1379, 1442, 2198, 2261, 1569, 1632, 2073, 2136, 361, 424,
+ 871, 934, 2446, 2509, 2566, 2629, 680, 743, 2507, 2570, 425, 488, 1189,
+ 1252, 2323, 2386, 2567, 2630, 1506, 1569, 2136, 2199, 1062, 1125, 2385, 2448,
+ 1316, 1379, 2261, 2324, 1822, 1885, 1885, 1948, 744, 807, 2508, 2571, 489,
+ 552, 1759, 1822, 1948, 2011, 2568, 2631, 935, 998, 2447, 2510, 1696, 1759,
+ 2011, 2074, 1443, 1506, 2199, 2262, 553, 616, 2569, 2632, 41, 41, 2624,
+ 2624, 42, 105, 1633, 1696, 2074, 2137, 2625, 2688, 106, 169, 1126, 1189,
+ 2386, 2449, 2626, 2689, 808, 871, 1253, 1316, 2324, 2387, 2509, 2572, 170,
+ 233, 2627, 2690, 999, 1062, 2448, 2511, 234, 297, 1380, 1443, 2262, 2325,
+ 2628, 2691, 617, 680, 1570, 1633, 2137, 2200, 2570, 2633, 298, 361, 2629,
+ 2692, 872, 935, 2510, 2573, 362, 425, 1886, 1949, 2630, 2693, 1507, 1570,
+ 2200, 2263, 681, 744, 1823, 1886, 1949, 2012, 2571, 2634, 1190, 1253, 2387,
+ 2450, 1760, 1823, 2012, 2075, 1063, 1126, 1317, 1380, 2325, 2388, 2449, 2512,
+ 426, 489, 2631, 2694, 1697, 1760, 2075, 2138, 745, 808, 936, 999, 1444,
+ 1507, 2263, 2326, 2511, 2574, 2572, 2635, 490, 553, 2632, 2695, 1634, 1697,
+ 2138, 2201, 1254, 1317, 2388, 2451, 554, 617, 1127, 1190, 2450, 2513, 2633,
+ 2696, 42, 42, 2688, 2688, 43, 106, 809, 872, 1571, 1634, 2201, 2264,
+ 2573, 2636, 2689, 2752, 107, 170, 1381, 1444, 2326, 2389, 2690, 2753, 1000,
+ 1063, 2512, 2575, 171, 234, 2691, 2754, 1887, 1950, 1950, 2013, 618, 681,
+ 2634, 2697, 235, 298, 1824, 1887, 2013, 2076, 2692, 2755, 1508, 1571, 2264,
+ 2327, 1761, 1824, 2076, 2139, 299, 362, 2693, 2756, 873, 936, 2574, 2637,
+ 1191, 1254, 2451, 2514, 363, 426, 682, 745, 1318, 1381, 1698, 1761, 2139,
+ 2202, 2389, 2452, 2635, 2698, 2694, 2757, 1064, 1127, 2513, 2576, 427, 490,
+ 1445, 1508, 2327, 2390, 2695, 2758, 1635, 1698, 2202, 2265, 937, 1000, 2575,
+ 2638, 746, 809, 2636, 2699, 491, 554, 2696, 2759, 1255, 1318, 1572, 1635,
+ 2265, 2328, 2452, 2515, 1951, 2014, 1128, 1191, 1888, 1951, 2014, 2077, 2514,
+ 2577, 1382, 1445, 2390, 2453, 555, 618, 1825, 1888, 2077, 2140, 2697, 2760,
+ 810, 873, 2637, 2700, 43, 43, 2752, 2752, 44, 107, 1001, 1064, 2576,
+ 2639, 2753, 2816, 108, 171, 1762, 1825, 2140, 2203, 2754, 2817, 172, 235,
+ 1509, 1572, 2328, 2391, 2755, 2818, 619, 682, 2698, 2761, 236, 299, 2756,
+ 2819, 1699, 1762, 2203, 2266, 874, 937, 2638, 2701, 300, 363, 1192, 1255,
+ 2515, 2578, 2757, 2820, 1319, 1382, 2453, 2516, 683, 746, 1065, 1128, 2577,
+ 2640, 2699, 2762, 364, 427, 1636, 1699, 2266, 2329, 2758, 2821, 1446, 1509,
+ 2391, 2454, 428, 491, 1952, 2015, 2015, 2078, 2759, 2822, 938, 1001, 1889,
+ 1952, 2078, 2141, 2639, 2702, 747, 810, 2700, 2763, 1573, 1636, 2329, 2392,
+ 1826, 1889, 2141, 2204, 492, 555, 1256, 1319, 2516, 2579, 2760, 2823, 1129,
+ 1192, 1383, 1446, 2454, 2517, 2578, 2641, 1763, 1826, 2204, 2267, 556, 619,
+ 2761, 2824, 811, 874, 2701, 2764, 1002, 1065, 1510, 1573, 2392, 2455, 2640,
+ 2703, 44, 44, 1700, 1763, 2267, 2330, 2816, 2816, 45, 108, 2817, 2880,
+ 109, 172, 2818, 2881, 173, 236, 2819, 2882, 620, 683, 2762, 2825, 237,
+ 300, 1320, 1383, 2517, 2580, 2820, 2883, 1193, 1256, 2579, 2642, 875, 938,
+ 1637, 1700, 2330, 2393, 2702, 2765, 2016, 2079, 301, 364, 1447, 1510, 1953,
+ 2016, 2079, 2142, 2455, 2518, 2821, 2884, 1066, 1129, 2641, 2704, 1890, 1953,
+ 2142, 2205, 684, 747, 2763, 2826, 365, 428, 2822, 2885, 1827, 1890, 2205,
+ 2268, 1574, 1637, 2393, 2456, 429, 492, 939, 1002, 2703, 2766, 2823, 2886,
+ 748, 811, 1764, 1827, 2268, 2331, 2764, 2827, 1257, 1320, 2580, 2643, 1384,
+ 1447, 2518, 2581, 1130, 1193, 2642, 2705, 493, 556, 2824, 2887, 1511, 1574,
+ 2456, 2519, 1701, 1764, 2331, 2394, 812, 875, 1003, 1066, 2704, 2767, 2765,
+ 2828, 557, 620, 2825, 2888, 2017, 2080, 2080, 2143, 45, 45, 2880, 2880,
+ 46, 109, 1954, 2017, 2143, 2206, 2881, 2944, 110, 173, 1638, 1701, 2394,
+ 2457, 2882, 2945, 1321, 1384, 2581, 2644, 174, 237, 621, 684, 1194, 1257,
+ 1891, 1954, 2206, 2269, 2643, 2706, 2826, 2889, 2883, 2946, 1448, 1511, 2519,
+ 2582, 238, 301, 876, 939, 2766, 2829, 2884, 2947, 1828, 1891, 2269, 2332,
+ 1067, 1130, 2705, 2768, 302, 365, 2885, 2948, 685, 748, 1575, 1638, 2457,
+ 2520, 2827, 2890, 366, 429, 2886, 2949, 1765, 1828, 2332, 2395, 940, 1003,
+ 2767, 2830, 1258, 1321, 2644, 2707, 430, 493, 1385, 1448, 2582, 2645, 2887,
+ 2950, 749, 812, 2828, 2891, 1131, 1194, 1702, 1765, 2395, 2458, 2706, 2769,
+ 1512, 1575, 2520, 2583, 2081, 2144, 494, 557, 2018, 2081, 2144, 2207, 2888,
+ 2951, 1955, 2018, 2207, 2270, 1004, 1067, 2768, 2831, 813, 876, 2829, 2892,
+ 1892, 1955, 2270, 2333, 558, 621, 1639, 1702, 2458, 2521, 2889, 2952, 1322,
+ 1385, 2645, 2708, 46, 46, 2944, 2944, 47, 110, 1195, 1258, 1449, 1512,
+ 1829, 1892, 2333, 2396, 2583, 2646, 2707, 2770, 2945, 3008, 111, 174, 2946,
+ 3009, 622, 685, 2890, 2953, 175, 238, 2947, 3010, 877, 940, 2830, 2893,
+ 239, 302, 1068, 1131, 1576, 1639, 2521, 2584, 2769, 2832, 2948, 3011, 1766,
+ 1829, 2396, 2459, 303, 366, 2949, 3012, 686, 749, 2891, 2954, 367, 430,
+ 2082, 2145, 2145, 2208, 2950, 3013, 1386, 1449, 2646, 2709, 1259, 1322, 2019,
+ 2082, 2208, 2271, 2708, 2771, 941, 1004, 1703, 1766, 2459, 2522, 2831, 2894,
+ 1513, 1576, 1956, 2019, 2271, 2334, 2584, 2647, 431, 494, 2951, 3014, 750,
+ 813, 1132, 1195, 2770, 2833, 2892, 2955, 1893, 1956, 2334, 2397, 495, 558,
+ 2952, 3015, 1640, 1703, 2522, 2585, 1005, 1068, 2832, 2895, 814, 877, 1830,
+ 1893, 2397, 2460, 2893, 2956, 559, 622, 1323, 1386, 2709, 2772, 2953, 3016,
+ 1450, 1513, 2647, 2710, 1196, 1259, 2771, 2834, 47, 47, 3008, 3008, 48,
+ 111, 1767, 1830, 2460, 2523, 3009, 3072, 1577, 1640, 2585, 2648, 112, 175,
+ 3010, 3073, 623, 686, 2954, 3017, 878, 941, 2146, 2209, 2894, 2957, 176,
+ 239, 3011, 3074, 1069, 1132, 2083, 2146, 2209, 2272, 2833, 2896, 240, 303,
+ 2020, 2083, 2272, 2335, 3012, 3075, 304, 367, 1704, 1767, 2523, 2586, 3013,
+ 3076, 687, 750, 1957, 2020, 2335, 2398, 2955, 3018, 1387, 1450, 2710, 2773,
+ 1260, 1323, 2772, 2835, 368, 431, 1514, 1577, 2648, 2711, 3014, 3077, 942,
+ 1005, 2895, 2958, 1894, 1957, 2398, 2461, 1133, 1196, 2834, 2897, 432, 495,
+ 751, 814, 2956, 3019, 3015, 3078, 1641, 1704, 2586, 2649, 1831, 1894, 2461,
+ 2524, 496, 559, 3016, 3079, 1006, 1069, 2896, 2959, 1324, 1387, 2773, 2836,
+ 815, 878, 1451, 1514, 2711, 2774, 2957, 3020, 2147, 2210, 2210, 2273, 1768,
+ 1831, 2524, 2587, 560, 623, 2084, 2147, 2273, 2336, 3017, 3080, 1197, 1260,
+ 2835, 2898, 1578, 1641, 2649, 2712, 2021, 2084, 2336, 2399, 48, 48, 3072,
+ 3072, 49, 112, 3073, 3136, 624, 687, 3018, 3081, 113, 176, 879, 942,
+ 1070, 1133, 1958, 2021, 2399, 2462, 2897, 2960, 2958, 3021, 3074, 3137, 177,
+ 240, 1705, 1768, 2587, 2650, 3075, 3138, 241, 304, 3076, 3139, 1388, 1451,
+ 2774, 2837, 1895, 1958, 2462, 2525, 688, 751, 1261, 1324, 1515, 1578, 2712,
+ 2775, 2836, 2899, 3019, 3082, 305, 368, 3077, 3140, 943, 1006, 2959, 3022,
+ 369, 432, 3078, 3141, 1134, 1197, 1642, 1705, 2650, 2713, 2898, 2961, 1832,
+ 1895, 2525, 2588, 752, 815, 3020, 3083, 433, 496, 2211, 2274, 3079, 3142,
+ 2148, 2211, 2274, 2337, 2085, 2148, 2337, 2400, 497, 560, 1007, 1070, 1452,
+ 1515, 1769, 1832, 2588, 2651, 2775, 2838, 2960, 3023, 3080, 3143, 1325, 1388,
+ 2837, 2900, 2022, 2085, 2400, 2463, 816, 879, 3021, 3084, 1579, 1642, 2713,
+ 2776, 1198, 1261, 2899, 2962, 561, 624, 1959, 2022, 2463, 2526, 3081, 3144,
+ 1706, 1769, 2651, 2714, 1071, 1134, 2961, 3024, 49, 49, 880, 943, 1896,
+ 1959, 2526, 2589, 3022, 3085, 3136, 3136, 50, 113, 625, 688, 3082, 3145,
+ 3137, 3200, 114, 177, 3138, 3201, 178, 241, 1389, 1452, 2838, 2901, 3139,
+ 3202, 1516, 1579, 2776, 2839, 242, 305, 1262, 1325, 2900, 2963, 3140, 3203,
+ 2212, 2275, 2275, 2338, 689, 752, 1833, 1896, 2589, 2652, 3083, 3146, 306,
+ 369, 1643, 1706, 2149, 2212, 2338, 2401, 2714, 2777, 3141, 3204, 944, 1007,
+ 3023, 3086, 1135, 1198, 2086, 2149, 2401, 2464, 2962, 3025, 370, 433, 3142,
+ 3205, 753, 816, 2023, 2086, 2464, 2527, 3084, 3147, 1770, 1833, 2652, 2715,
+ 434, 497, 3143, 3206, 1453, 1516, 2839, 2902, 1326, 1389, 2901, 2964, 1008,
+ 1071, 3024, 3087, 1580, 1643, 1960, 2023, 2527, 2590, 2777, 2840, 498, 561,
+ 3144, 3207, 817, 880, 1199, 1262, 2963, 3026, 3085, 3148, 1707, 1770, 2715,
+ 2778, 562, 625, 1897, 1960, 2590, 2653, 3145, 3208, 2276, 2339, 1072, 1135,
+ 3025, 3088, 2213, 2276, 2339, 2402, 881, 944, 3086, 3149, 626, 689, 1390,
+ 1453, 2150, 2213, 2402, 2465, 2902, 2965, 3146, 3209, 50, 50, 1517, 1580,
+ 2840, 2903, 3200, 3200, 51, 114, 3201, 3264, 115, 178, 1834, 1897, 2653,
+ 2716, 3202, 3265, 1263, 1326, 2964, 3027, 179, 242, 2087, 2150, 2465, 2528,
+ 3203, 3266, 1644, 1707, 2778, 2841, 243, 306, 3204, 3267, 690, 753, 3147,
+ 3210, 2024, 2087, 2528, 2591, 307, 370, 945, 1008, 3087, 3150, 3205, 3268,
+ 1136, 1199, 3026, 3089, 1771, 1834, 2716, 2779, 371, 434, 3206, 3269, 1961,
+ 2024, 2591, 2654, 754, 817, 3148, 3211, 1454, 1517, 2903, 2966, 435, 498,
+ 1327, 1390, 1581, 1644, 2841, 2904, 2965, 3028, 3207, 3270, 1009, 1072, 3088,
+ 3151, 1898, 1961, 2654, 2717, 499, 562, 1200, 1263, 1708, 1771, 2277, 2340,
+ 2340, 2403, 2779, 2842, 3027, 3090, 3208, 3271, 818, 881, 2214, 2277, 2403,
+ 2466, 3149, 3212, 2151, 2214, 2466, 2529, 563, 626, 3209, 3272, 2088, 2151,
+ 2529, 2592, 1073, 1136, 1835, 1898, 2717, 2780, 3089, 3152, 1518, 1581, 2904,
+ 2967, 1391, 1454, 2966, 3029, 882, 945, 3150, 3213, 627, 690, 1645, 1708,
+ 2842, 2905, 3210, 3273, 51, 51, 1264, 1327, 3028, 3091, 3264, 3264, 52,
+ 115, 2025, 2088, 2592, 2655, 3265, 3328, 116, 179, 3266, 3329, 180, 243,
+ 3267, 3330, 244, 307, 1772, 1835, 2780, 2843, 3268, 3331, 691, 754, 3211,
+ 3274, 946, 1009, 1137, 1200, 1962, 2025, 2655, 2718, 3090, 3153, 3151, 3214,
+ 308, 371, 3269, 3332, 1455, 1518, 2341, 2404, 2967, 3030, 372, 435, 2278,
+ 2341, 2404, 2467, 3270, 3333, 1582, 1645, 2905, 2968, 755, 818, 1328, 1391,
+ 3029, 3092, 3212, 3275, 2215, 2278, 2467, 2530, 1899, 1962, 2718, 2781, 436,
+ 499, 3271, 3334, 1709, 1772, 2843, 2906, 1010, 1073, 2152, 2215, 2530, 2593,
+ 3152, 3215, 1201, 1264, 3091, 3154, 500, 563, 3272, 3335, 819, 882, 2089,
+ 2152, 2593, 2656, 3213, 3276, 1836, 1899, 2781, 2844, 564, 627, 1519, 1582,
+ 2968, 3031, 3273, 3336, 1392, 1455, 2026, 2089, 2656, 2719, 3030, 3093, 1074,
+ 1137, 3153, 3216, 1646, 1709, 2906, 2969, 883, 946, 3214, 3277, 1265, 1328,
+ 3092, 3155, 628, 691, 3274, 3337, 52, 52, 1773, 1836, 2844, 2907, 3328,
+ 3328, 53, 116, 1963, 2026, 2719, 2782, 3329, 3392, 117, 180, 2342, 2405,
+ 2405, 2468, 3330, 3393, 2279, 2342, 2468, 2531, 181, 244, 3331, 3394, 1138,
+ 1201, 3154, 3217, 245, 308, 692, 755, 2216, 2279, 2531, 2594, 3275, 3338,
+ 3332, 3395, 947, 1010, 3215, 3278, 1456, 1519, 3031, 3094, 309, 372, 1583,
+ 1646, 2969, 3032, 3333, 3396, 1900, 1963, 2782, 2845, 2153, 2216, 2594, 2657,
+ 1329, 1392, 3093, 3156, 373, 436, 1710, 1773, 2907, 2970, 3334, 3397, 756,
+ 819, 3276, 3339, 2090, 2153, 2657, 2720, 1011, 1074, 3216, 3279, 437, 500,
+ 3335, 3398, 1202, 1265, 3155, 3218, 1837, 1900, 2845, 2908, 501, 564, 820,
+ 883, 2027, 2090, 2720, 2783, 3277, 3340, 3336, 3399, 1520, 1583, 3032, 3095,
+ 1393, 1456, 1647, 1710, 2970, 3033, 3094, 3157, 2406, 2469, 565, 628, 1075,
+ 1138, 2343, 2406, 2469, 2532, 3217, 3280, 3337, 3400, 2280, 2343, 2532, 2595,
+ 1964, 2027, 2783, 2846, 884, 947, 1266, 1329, 1774, 1837, 2908, 2971, 3156,
+ 3219, 3278, 3341, 2217, 2280, 2595, 2658, 629, 692, 3338, 3401, 53, 53,
+ 3392, 3392, 54, 117, 3393, 3456, 118, 181, 2154, 2217, 2658, 2721, 3394,
+ 3457, 182, 245, 1139, 1202, 1901, 1964, 2846, 2909, 3218, 3281, 3395, 3458,
+ 948, 1011, 1584, 1647, 3033, 3096, 3279, 3342, 693, 756, 1457, 1520, 3095,
+ 3158, 3339, 3402, 246, 309, 3396, 3459, 1711, 1774, 2091, 2154, 2721, 2784,
+ 2971, 3034, 310, 373, 1330, 1393, 3157, 3220, 3397, 3460, 374, 437, 3398,
+ 3461, 757, 820, 3340, 3403, 1838, 1901, 2909, 2972, 1012, 1075, 2028, 2091,
+ 2784, 2847, 3280, 3343, 1203, 1266, 3219, 3282, 438, 501, 2407, 2470, 2470,
+ 2533, 3399, 3462, 2344, 2407, 2533, 2596, 1521, 1584, 2281, 2344, 2596, 2659,
+ 3096, 3159, 821, 884, 3341, 3404, 502, 565, 1648, 1711, 3034, 3097, 3400,
+ 3463, 1394, 1457, 3158, 3221, 1965, 2028, 2847, 2910, 2218, 2281, 2659, 2722,
+ 1076, 1139, 1775, 1838, 2972, 3035, 3281, 3344, 566, 629, 3401, 3464, 1267,
+ 1330, 3220, 3283, 885, 948, 2155, 2218, 2722, 2785, 3342, 3405, 630, 693,
+ 1902, 1965, 2910, 2973, 3402, 3465, 54, 54, 2092, 2155, 2785, 2848, 3456,
+ 3456, 55, 118, 1585, 1648, 3097, 3160, 3457, 3520, 1140, 1203, 3282, 3345,
+ 119, 182, 1458, 1521, 3159, 3222, 3458, 3521, 1712, 1775, 3035, 3098, 183,
+ 246, 949, 1012, 3343, 3406, 3459, 3522, 694, 757, 3403, 3466, 247, 310,
+ 3460, 3523, 1331, 1394, 2471, 2534, 3221, 3284, 2408, 2471, 2534, 2597, 2029,
+ 2092, 2848, 2911, 311, 374, 1839, 1902, 2345, 2408, 2597, 2660, 2973, 3036,
+ 3461, 3524, 758, 821, 2282, 2345, 2660, 2723, 3404, 3467, 375, 438, 3462,
+ 3525, 1013, 1076, 1204, 1267, 3283, 3346, 3344, 3407, 439, 502, 2219, 2282,
+ 2723, 2786, 3463, 3526, 1522, 1585, 3160, 3223, 1649, 1712, 1966, 2029, 2911,
+ 2974, 3098, 3161, 822, 885, 1395, 1458, 3222, 3285, 3405, 3468, 1776, 1839,
+ 3036, 3099, 503, 566, 3464, 3527, 2156, 2219, 2786, 2849, 1077, 1140, 3345,
+ 3408, 1268, 1331, 3284, 3347, 567, 630, 3465, 3528, 1903, 1966, 2974, 3037,
+ 886, 949, 3406, 3469, 2093, 2156, 2849, 2912, 2472, 2535, 2535, 2598, 631,
+ 694, 1586, 1649, 2409, 2472, 2598, 2661, 3161, 3224, 3466, 3529, 1459, 1522,
+ 1713, 1776, 3099, 3162, 3223, 3286, 1141, 1204, 2346, 2409, 2661, 2724, 3346,
+ 3409, 55, 55, 3520, 3520, 56, 119, 3521, 3584, 120, 183, 2030, 2093,
+ 2912, 2975, 3522, 3585, 950, 1013, 3407, 3470, 184, 247, 1332, 1395, 1840,
+ 1903, 2283, 2346, 2724, 2787, 3037, 3100, 3285, 3348, 3523, 3586, 695, 758,
+ 3467, 3530, 248, 311, 3524, 3587, 312, 375, 2220, 2283, 2787, 2850, 3525,
+ 3588, 759, 822, 3468, 3531, 1205, 1268, 1967, 2030, 2975, 3038, 3347, 3410,
+ 376, 439, 1014, 1077, 3408, 3471, 3526, 3589, 1650, 1713, 3162, 3225, 1523,
+ 1586, 3224, 3287, 2157, 2220, 2850, 2913, 440, 503, 1777, 1840, 3100, 3163,
+ 3527, 3590, 1396, 1459, 3286, 3349, 823, 886, 3469, 3532, 504, 567, 2536,
+ 2599, 3528, 3591, 2473, 2536, 2599, 2662, 1904, 1967, 3038, 3101, 1078, 1141,
+ 2094, 2157, 2913, 2976, 3409, 3472, 2410, 2473, 2662, 2725, 1269, 1332, 3348,
+ 3411, 568, 631, 3529, 3592, 2347, 2410, 2725, 2788, 887, 950, 3470, 3533,
+ 1587, 1650, 3225, 3288, 1714, 1777, 3163, 3226, 2284, 2347, 2788, 2851, 1460,
+ 1523, 2031, 2094, 2976, 3039, 3287, 3350, 632, 695, 3530, 3593, 1142, 1205,
+ 3410, 3473, 1841, 1904, 3101, 3164, 56, 56, 3584, 3584, 57, 120, 951,
+ 1014, 1333, 1396, 2221, 2284, 2851, 2914, 3349, 3412, 3471, 3534, 3585, 3648,
+ 121, 184, 3586, 3649, 696, 759, 3531, 3594, 185, 248, 3587, 3650, 249,
+ 312, 1968, 2031, 3039, 3102, 3588, 3651, 2158, 2221, 2914, 2977, 313, 376,
+ 3589, 3652, 1206, 1269, 1651, 1714, 3226, 3289, 3411, 3474, 760, 823, 1524,
+ 1587, 3288, 3351, 3532, 3595, 1015, 1078, 2537, 2600, 2600, 2663, 3472, 3535,
+ 1778, 1841, 3164, 3227, 377, 440, 2474, 2537, 2663, 2726, 3590, 3653, 1397,
+ 1460, 2411, 2474, 2726, 2789, 3350, 3413, 441, 504, 2095, 2158, 2977, 3040,
+ 3591, 3654, 1905, 1968, 3102, 3165, 824, 887, 2348, 2411, 2789, 2852, 3533,
+ 3596, 505, 568, 3592, 3655, 1079, 1142, 3473, 3536, 1270, 1333, 3412, 3475,
+ 2285, 2348, 2852, 2915, 2032, 2095, 3040, 3103, 1588, 1651, 3289, 3352, 569,
+ 632, 1715, 1778, 3227, 3290, 3593, 3656, 888, 951, 3534, 3597, 1461, 1524,
+ 3351, 3414, 1842, 1905, 2222, 2285, 2915, 2978, 3165, 3228, 633, 696, 1143,
+ 1206, 3474, 3537, 3594, 3657, 1334, 1397, 3413, 3476, 952, 1015, 3535, 3598,
+ 1969, 2032, 2601, 2664, 3103, 3166, 57, 57, 2538, 2601, 2664, 2727, 3648,
+ 3648, 58, 121, 2159, 2222, 2978, 3041, 3649, 3712, 122, 185, 3650, 3713,
+ 697, 760, 2475, 2538, 2727, 2790, 3595, 3658, 186, 249, 3651, 3714, 250,
+ 313, 1652, 1715, 2412, 2475, 2790, 2853, 3290, 3353, 3652, 3715, 1525, 1588,
+ 1779, 1842, 3228, 3291, 3352, 3415, 1207, 1270, 3475, 3538, 314, 377, 3653,
+ 3716, 1016, 1079, 3536, 3599, 761, 824, 2096, 2159, 3041, 3104, 3596, 3659,
+ 2349, 2412, 2853, 2916, 378, 441, 1398, 1461, 1906, 1969, 3166, 3229, 3414,
+ 3477, 3654, 3717, 2286, 2349, 2916, 2979, 442, 505, 3655, 3718, 825, 888,
+ 3597, 3660, 1080, 1143, 1271, 1334, 2033, 2096, 3104, 3167, 3476, 3539, 3537,
+ 3600, 506, 569, 3656, 3719, 1716, 1779, 3291, 3354, 1589, 1652, 2223, 2286,
+ 2979, 3042, 3353, 3416, 1843, 1906, 3229, 3292, 570, 633, 889, 952, 1462,
+ 1525, 2602, 2665, 2665, 2728, 3415, 3478, 3598, 3661, 3657, 3720, 2539, 2602,
+ 2728, 2791, 2476, 2539, 2791, 2854, 1144, 1207, 2160, 2223, 3042, 3105, 3538,
+ 3601, 1970, 2033, 3167, 3230, 634, 697, 3658, 3721, 1335, 1398, 3477, 3540,
+ 2413, 2476, 2854, 2917, 953, 1016, 3599, 3662, 58, 58, 3712, 3712, 59,
+ 122, 3713, 3776, 123, 186, 698, 761, 1653, 1716, 2350, 2413, 2917, 2980,
+ 3354, 3417, 3659, 3722, 3714, 3777, 1780, 1843, 3292, 3355, 187, 250, 2097,
+ 2160, 3105, 3168, 3715, 3778, 1526, 1589, 3416, 3479, 251, 314, 1208, 1271,
+ 3539, 3602, 3716, 3779, 1907, 1970, 3230, 3293, 1017, 1080, 2287, 2350, 2980,
+ 3043, 3600, 3663, 315, 378, 3717, 3780, 762, 825, 3660, 3723, 1399, 1462,
+ 3478, 3541, 379, 442, 3718, 3781, 2034, 2097, 3168, 3231, 2666, 2729, 2224,
+ 2287, 3043, 3106, 443, 506, 2603, 2666, 2729, 2792, 3719, 3782, 826, 889,
+ 3661, 3724, 1272, 1335, 2540, 2603, 2792, 2855, 3540, 3603, 1081, 1144, 1717,
+ 1780, 3355, 3418, 3601, 3664, 1590, 1653, 3417, 3480, 507, 570, 1844, 1907,
+ 3293, 3356, 3720, 3783, 2477, 2540, 2855, 2918, 1463, 1526, 3479, 3542, 2161,
+ 2224, 3106, 3169, 890, 953, 2414, 2477, 2918, 2981, 3662, 3725, 571, 634,
+ 1971, 2034, 3231, 3294, 3721, 3784, 1145, 1208, 3602, 3665, 1336, 1399, 3541,
+ 3604, 2351, 2414, 2981, 3044, 635, 698, 3722, 3785, 954, 1017, 2098, 2161,
+ 3169, 3232, 3663, 3726, 1654, 1717, 3418, 3481, 1781, 1844, 3356, 3419, 59,
+ 59, 2288, 2351, 3044, 3107, 3776, 3776, 60, 123, 1527, 1590, 3480, 3543,
+ 3777, 3840, 699, 762, 3723, 3786, 124, 187, 1908, 1971, 3294, 3357, 3778,
+ 3841, 188, 251, 3779, 3842, 1209, 1272, 3603, 3666, 2667, 2730, 2730, 2793,
+ 252, 315, 3780, 3843, 2604, 2667, 2793, 2856, 1018, 1081, 1400, 1463, 3542,
+ 3605, 3664, 3727, 316, 379, 763, 826, 2035, 2098, 2541, 2604, 2856, 2919,
+ 3232, 3295, 3724, 3787, 3781, 3844, 2225, 2288, 3107, 3170, 380, 443, 3782,
+ 3845, 2478, 2541, 2919, 2982, 1718, 1781, 3419, 3482, 444, 507, 1273, 1336,
+ 3604, 3667, 3783, 3846, 827, 890, 1591, 1654, 1845, 1908, 3357, 3420, 3481,
+ 3544, 3725, 3788, 1082, 1145, 2415, 2478, 2982, 3045, 3665, 3728, 2162, 2225,
+ 3170, 3233, 508, 571, 3784, 3847, 1464, 1527, 1972, 2035, 3295, 3358, 3543,
+ 3606, 2352, 2415, 3045, 3108, 891, 954, 3726, 3789, 572, 635, 3785, 3848,
+ 1146, 1209, 3666, 3729, 1337, 1400, 2099, 2162, 3233, 3296, 3605, 3668, 2289,
+ 2352, 3108, 3171, 2731, 2794, 636, 699, 1782, 1845, 2668, 2731, 2794, 2857,
+ 3420, 3483, 3786, 3849, 1655, 1718, 3482, 3545, 955, 1018, 2605, 2668, 2857,
+ 2920, 3727, 3790, 1909, 1972, 3358, 3421, 1528, 1591, 3544, 3607, 2542, 2605,
+ 2920, 2983, 60, 60, 700, 763, 3787, 3850, 3840, 3840, 61, 124, 3841,
+ 3904, 125, 188, 1210, 1273, 2226, 2289, 3171, 3234, 3667, 3730, 3842, 3905,
+ 2036, 2099, 3296, 3359, 189, 252, 2479, 2542, 2983, 3046, 3843, 3906, 1401,
+ 1464, 3606, 3669, 253, 316, 1019, 1082, 3728, 3791, 3844, 3907, 764, 827,
+ 3788, 3851, 317, 380, 3845, 3908, 2416, 2479, 3046, 3109, 1719, 1782, 3483,
+ 3546, 381, 444, 1846, 1909, 2163, 2226, 3234, 3297, 3421, 3484, 3846, 3909,
+ 1592, 1655, 3545, 3608, 1274, 1337, 3668, 3731, 828, 891, 3789, 3852, 445,
+ 508, 1083, 1146, 1973, 2036, 2353, 2416, 3109, 3172, 3359, 3422, 3729, 3792,
+ 3847, 3910, 1465, 1528, 3607, 3670, 509, 572, 2732, 2795, 2795, 2858, 3848,
+ 3911, 2669, 2732, 2858, 2921, 2100, 2163, 3297, 3360, 892, 955, 2290, 2353,
+ 3172, 3235, 3790, 3853, 2606, 2669, 2921, 2984, 573, 636, 3849, 3912, 1147,
+ 1210, 1338, 1401, 3669, 3732, 3730, 3793, 1783, 1846, 2543, 2606, 2984, 3047,
+ 3484, 3547, 1656, 1719, 3546, 3609, 1910, 1973, 3422, 3485, 637, 700, 3850,
+ 3913, 956, 1019, 1529, 1592, 2480, 2543, 3047, 3110, 3608, 3671, 3791, 3854,
+ 2227, 2290, 3235, 3298, 2037, 2100, 3360, 3423, 701, 764, 1211, 1274, 3731,
+ 3794, 3851, 3914, 61, 61, 3904, 3904, 62, 125, 2417, 2480, 3110, 3173,
+ 3905, 3968, 126, 189, 1402, 1465, 3670, 3733, 3906, 3969, 190, 253, 3907,
+ 3970, 1020, 1083, 3792, 3855, 254, 317, 2164, 2227, 3298, 3361, 3908, 3971,
+ 765, 828, 1720, 1783, 3547, 3610, 3852, 3915, 1847, 1910, 3485, 3548, 318,
+ 381, 2354, 2417, 3173, 3236, 3909, 3972, 2796, 2859, 1593, 1656, 2733, 2796,
+ 2859, 2922, 3609, 3672, 1974, 2037, 3423, 3486, 382, 445, 2670, 2733, 2922,
+ 2985, 3910, 3973, 1275, 1338, 3732, 3795, 1084, 1147, 3793, 3856, 829, 892,
+ 2607, 2670, 2985, 3048, 3853, 3916, 446, 509, 1466, 1529, 3671, 3734, 3911,
+ 3974, 2291, 2354, 3236, 3299, 2101, 2164, 3361, 3424, 2544, 2607, 3048, 3111,
+ 510, 573, 3912, 3975, 893, 956, 3854, 3917, 1784, 1847, 3548, 3611, 1339,
+ 1402, 2481, 2544, 3111, 3174, 3733, 3796, 1148, 1211, 3794, 3857, 574, 637,
+ 1657, 1720, 1911, 1974, 3486, 3549, 3610, 3673, 3913, 3976, 2228, 2291, 3299,
+ 3362, 1530, 1593, 2038, 2101, 3424, 3487, 3672, 3735, 638, 701, 2418, 2481,
+ 3174, 3237, 3914, 3977, 957, 1020, 3855, 3918, 1212, 1275, 2797, 2860, 2860,
+ 2923, 3795, 3858, 702, 765, 1403, 1466, 2165, 2228, 2734, 2797, 2923, 2986,
+ 3362, 3425, 3734, 3797, 3915, 3978, 62, 62, 3968, 3968, 63, 126, 2355,
+ 2418, 3237, 3300, 3969, 4032, 127, 190, 2671, 2734, 2986, 3049, 3970, 4033,
+ 1021, 1084, 1848, 1911, 3549, 3612, 3856, 3919, 191, 254, 1721, 1784, 3611,
+ 3674, 3971, 4034, 255, 318, 2608, 2671, 3049, 3112, 3972, 4035, 1975, 2038,
+ 3487, 3550, 766, 829, 3916, 3979, 1594, 1657, 3673, 3736, 319, 382, 3973,
+ 4036, 1276, 1339, 2292, 2355, 3300, 3363, 3796, 3859, 2545, 2608, 3112, 3175,
+ 383, 446, 2102, 2165, 3425, 3488, 3974, 4037, 1085, 1148, 1467, 1530, 3735,
+ 3798, 3857, 3920, 830, 893, 3917, 3980, 447, 510, 3975, 4038, 2482, 2545,
+ 3175, 3238, 511, 574, 1785, 1848, 3612, 3675, 3976, 4039, 2229, 2292, 3363,
+ 3426, 1912, 1975, 3550, 3613, 894, 957, 1658, 1721, 3674, 3737, 3918, 3981,
+ 1340, 1403, 3797, 3860, 1149, 1212, 2419, 2482, 3238, 3301, 3858, 3921, 2039,
+ 2102, 3488, 3551, 575, 638, 2861, 2924, 3977, 4040, 2798, 2861, 2924, 2987,
+ 1531, 1594, 3736, 3799, 2735, 2798, 2987, 3050, 2672, 2735, 3050, 3113, 639,
+ 702, 958, 1021, 3919, 3982, 3978, 4041, 2166, 2229, 3426, 3489, 2356, 2419,
+ 3301, 3364, 1213, 1276, 2609, 2672, 3113, 3176, 3859, 3922, 1404, 1467, 3798,
+ 3861, 703, 766, 1849, 1912, 3613, 3676, 3979, 4042, 1722, 1785, 3675, 3738,
+ 1976, 2039, 3551, 3614, 1022, 1085, 2546, 2609, 3176, 3239, 3920, 3983, 2293,
+ 2356, 3364, 3427, 1595, 1658, 3737, 3800, 767, 830, 3980, 4043, 2103, 2166,
+ 3489, 3552, 1277, 1340, 3860, 3923, 2483, 2546, 3239, 3302, 1468, 1531, 3799,
+ 3862, 1086, 1149, 3921, 3984, 831, 894, 3981, 4044, 2230, 2293, 2862, 2925,
+ 2925, 2988, 3427, 3490, 2799, 2862, 2988, 3051, 1786, 1849, 2420, 2483, 3302,
+ 3365, 3676, 3739, 1913, 1976, 3614, 3677, 2736, 2799, 3051, 3114, 1659, 1722,
+ 3738, 3801, 2040, 2103, 3552, 3615, 1341, 1404, 3861, 3924, 895, 958, 2673,
+ 2736, 3114, 3177, 3982, 4045, 1150, 1213, 3922, 3985, 1532, 1595, 3800, 3863,
+ 2357, 2420, 3365, 3428, 2167, 2230, 2610, 2673, 3177, 3240, 3490, 3553, 959,
+ 1022, 3983, 4046, 2547, 2610, 3240, 3303, 1214, 1277, 1405, 1468, 1850, 1913,
+ 3677, 3740, 3862, 3925, 3923, 3986, 1723, 1786, 1977, 2040, 3615, 3678, 3739,
+ 3802, 2294, 2357, 3428, 3491, 1023, 1086, 1596, 1659, 2104, 2167, 2484, 2547,
+ 3303, 3366, 3553, 3616, 3801, 3864, 3984, 4047, 2926, 2989, 2863, 2926, 2989,
+ 3052, 2800, 2863, 3052, 3115, 1278, 1341, 3924, 3987, 1469, 1532, 2231, 2294,
+ 2737, 2800, 3115, 3178, 3491, 3554, 3863, 3926, 2421, 2484, 3366, 3429, 1087,
+ 1150, 3985, 4048, 1914, 1977, 2674, 2737, 3178, 3241, 3678, 3741, 1787, 1850,
+ 3740, 3803, 2041, 2104, 3616, 3679, 1660, 1723, 3802, 3865, 2611, 2674, 3241,
+ 3304, 1342, 1405, 2358, 2421, 3429, 3492, 3925, 3988, 2168, 2231, 3554, 3617,
+ 1151, 1214, 3986, 4049, 1533, 1596, 3864, 3927, 2548, 2611, 3304, 3367, 2295,
+ 2358, 3492, 3555, 1851, 1914, 3741, 3804, 1978, 2041, 2927, 2990, 2990, 3053,
+ 3679, 3742, 1406, 1469, 3926, 3989, 1724, 1787, 2864, 2927, 3053, 3116, 3803,
+ 3866, 1215, 1278, 2485, 2548, 3367, 3430, 3987, 4050, 2801, 2864, 3116, 3179,
+ 2105, 2168, 3617, 3680, 1597, 1660, 3865, 3928, 2738, 2801, 3179, 3242, 2422,
+ 2485, 3430, 3493, 2232, 2295, 3555, 3618, 2675, 2738, 3242, 3305, 1279, 1342,
+ 3988, 4051, 1470, 1533, 3927, 3990, 1915, 1978, 3742, 3805, 1788, 1851, 3804,
+ 3867, 2612, 2675, 3305, 3368, 2042, 2105, 3680, 3743, 2359, 2422, 3493, 3556,
+ 1661, 1724, 3866, 3929, 2169, 2232, 3618, 3681, 2549, 2612, 3368, 3431, 1343,
+ 1406, 3989, 4052, 2991, 3054, 1534, 1597, 2928, 2991, 3054, 3117, 3928, 3991,
+ 2865, 2928, 3117, 3180, 2296, 2359, 3556, 3619, 2802, 2865, 3180, 3243, 2486,
+ 2549, 3431, 3494, 1852, 1915, 3805, 3868, 1979, 2042, 3743, 3806, 1725, 1788,
+ 2739, 2802, 3243, 3306, 3867, 3930, 1407, 1470, 2106, 2169, 3681, 3744, 3990,
+ 4053, 2676, 2739, 3306, 3369, 1598, 1661, 2423, 2486, 3494, 3557, 3929, 3992,
+ 2233, 2296, 3619, 3682, 2613, 2676, 3369, 3432, 1471, 1534, 3991, 4054, 1916,
+ 1979, 3806, 3869, 1789, 1852, 2043, 2106, 2360, 2423, 3557, 3620, 3744, 3807,
+ 3868, 3931, 2992, 3055, 3055, 3118, 2550, 2613, 3432, 3495, 2929, 2992, 3118,
+ 3181, 1662, 1725, 2170, 2233, 3682, 3745, 3930, 3993, 2866, 2929, 3181, 3244,
+ 2803, 2866, 3244, 3307, 1535, 1598, 2297, 2360, 3620, 3683, 3992, 4055, 2487,
+ 2550, 3495, 3558, 2740, 2803, 3307, 3370, 1980, 2043, 3807, 3870, 1853, 1916,
+ 3869, 3932, 2107, 2170, 3745, 3808, 1726, 1789, 2677, 2740, 3370, 3433, 3931,
+ 3994, 2424, 2487, 3558, 3621, 2234, 2297, 3683, 3746, 1599, 1662, 3993, 4056,
+ 2614, 2677, 3433, 3496, 3056, 3119, 2993, 3056, 3119, 3182, 2930, 2993, 3182,
+ 3245, 2361, 2424, 3621, 3684, 1917, 1980, 3870, 3933, 2044, 2107, 3808, 3871,
+ 2551, 2614, 3496, 3559, 2867, 2930, 3245, 3308, 1790, 1853, 3932, 3995, 2171,
+ 2234, 3746, 3809, 2804, 2867, 3308, 3371, 1663, 1726, 3994, 4057, 2488, 2551,
+ 3559, 3622, 2741, 2804, 3371, 3434, 2298, 2361, 3684, 3747, 2678, 2741, 3434,
+ 3497, 1981, 2044, 3871, 3934, 1854, 1917, 3933, 3996, 2108, 2171, 3809, 3872,
+ 2425, 2488, 3622, 3685, 1727, 1790, 3995, 4058, 3057, 3120, 3120, 3183, 2235,
+ 2298, 2615, 2678, 3497, 3560, 3747, 3810, 2994, 3057, 3183, 3246, 2931, 2994,
+ 3246, 3309, 2868, 2931, 3309, 3372, 2362, 2425, 3685, 3748, 2552, 2615, 3560,
+ 3623, 1918, 1981, 3934, 3997, 2045, 2108, 2805, 2868, 3372, 3435, 3872, 3935,
+ 1791, 1854, 3996, 4059, 2172, 2235, 3810, 3873, 2742, 2805, 3435, 3498, 2489,
+ 2552, 3623, 3686, 2299, 2362, 3748, 3811, 2679, 2742, 3498, 3561, 3121, 3184,
+ 3058, 3121, 3184, 3247, 1982, 2045, 3935, 3998, 2426, 2489, 3686, 3749, 1855,
+ 1918, 2109, 2172, 2995, 3058, 3247, 3310, 3873, 3936, 3997, 4060, 2616, 2679,
+ 3561, 3624, 2932, 2995, 3310, 3373, 2236, 2299, 3811, 3874, 2869, 2932, 3373,
+ 3436, 2553, 2616, 3624, 3687, 2363, 2426, 3749, 3812, 2806, 2869, 3436, 3499,
+ 2046, 2109, 3936, 3999, 1919, 1982, 3998, 4061, 2743, 2806, 3499, 3562, 2173,
+ 2236, 3874, 3937, 2490, 2553, 3687, 3750, 2300, 2363, 3812, 3875, 2680, 2743,
+ 3562, 3625, 3122, 3185, 3185, 3248, 3059, 3122, 3248, 3311, 2996, 3059, 3311,
+ 3374, 2427, 2490, 2933, 2996, 3374, 3437, 3750, 3813, 1983, 2046, 2617, 2680,
+ 3625, 3688, 3999, 4062, 2110, 2173, 3937, 4000, 2870, 2933, 3437, 3500, 2237,
+ 2300, 3875, 3938, 2807, 2870, 3500, 3563, 2554, 2617, 3688, 3751, 2364, 2427,
+ 3813, 3876, 2744, 2807, 3563, 3626, 2047, 2110, 4000, 4063, 2174, 2237, 3186,
+ 3249, 3938, 4001, 2491, 2554, 3123, 3186, 3249, 3312, 3751, 3814, 3060, 3123,
+ 3312, 3375, 2681, 2744, 3626, 3689, 2301, 2364, 3876, 3939, 2997, 3060, 3375,
+ 3438, 2934, 2997, 3438, 3501, 2428, 2491, 3814, 3877, 2618, 2681, 3689, 3752,
+ 2871, 2934, 3501, 3564, 2111, 2174, 4001, 4064, 2238, 2301, 3939, 4002, 2808,
+ 2871, 3564, 3627, 2555, 2618, 3752, 3815, 2365, 2428, 3877, 3940, 2745, 2808,
+ 3627, 3690, 3187, 3250, 3250, 3313, 3124, 3187, 3313, 3376, 3061, 3124, 3376,
+ 3439, 2492, 2555, 3815, 3878, 2175, 2238, 2998, 3061, 3439, 3502, 4002, 4065,
+ 2682, 2745, 3690, 3753, 2302, 2365, 3940, 4003, 2935, 2998, 3502, 3565, 2872,
+ 2935, 3565, 3628, 2619, 2682, 3753, 3816, 2429, 2492, 3878, 3941, 2809, 2872,
+ 3628, 3691, 2239, 2302, 4003, 4066, 2556, 2619, 3816, 3879, 3251, 3314, 3188,
+ 3251, 3314, 3377, 3125, 3188, 3377, 3440, 2366, 2429, 2746, 2809, 3691, 3754,
+ 3941, 4004, 3062, 3125, 3440, 3503, 2999, 3062, 3503, 3566, 2493, 2556, 3879,
+ 3942, 2683, 2746, 3754, 3817, 2936, 2999, 3566, 3629, 2303, 2366, 4004, 4067,
+ 2873, 2936, 3629, 3692, 2620, 2683, 3817, 3880, 2430, 2493, 3942, 4005, 2810,
+ 2873, 3692, 3755, 3252, 3315, 3315, 3378, 3189, 3252, 3378, 3441, 3126, 3189,
+ 3441, 3504, 2557, 2620, 3880, 3943, 3063, 3126, 3504, 3567, 2747, 2810, 3755,
+ 3818, 2367, 2430, 4005, 4068, 3000, 3063, 3567, 3630, 2684, 2747, 3818, 3881,
+ 2494, 2557, 2937, 3000, 3630, 3693, 3943, 4006, 2874, 2937, 3693, 3756, 2621,
+ 2684, 3881, 3944, 3316, 3379, 3253, 3316, 3379, 3442, 2431, 2494, 4006, 4069,
+ 3190, 3253, 3442, 3505, 2811, 2874, 3756, 3819, 3127, 3190, 3505, 3568, 3064,
+ 3127, 3568, 3631, 2558, 2621, 3944, 4007, 2748, 2811, 3819, 3882, 3001, 3064,
+ 3631, 3694, 2938, 3001, 3694, 3757, 2685, 2748, 3882, 3945, 2495, 2558, 4007,
+ 4070, 2875, 2938, 3757, 3820, 3317, 3380, 3380, 3443, 3254, 3317, 3443, 3506,
+ 2622, 2685, 3191, 3254, 3506, 3569, 3945, 4008, 2812, 2875, 3820, 3883, 3128,
+ 3191, 3569, 3632, 3065, 3128, 3632, 3695, 2559, 2622, 4008, 4071, 2749, 2812,
+ 3883, 3946, 3002, 3065, 3695, 3758, 2939, 3002, 3758, 3821, 2686, 2749, 3946,
+ 4009, 3381, 3444, 3318, 3381, 3444, 3507, 2876, 2939, 3821, 3884, 3255, 3318,
+ 3507, 3570, 3192, 3255, 3570, 3633, 2623, 2686, 3129, 3192, 3633, 3696, 4009,
+ 4072, 2813, 2876, 3884, 3947, 3066, 3129, 3696, 3759, 3003, 3066, 3759, 3822,
+ 2750, 2813, 3947, 4010, 2940, 3003, 3822, 3885, 3382, 3445, 3445, 3508, 3319,
+ 3382, 3508, 3571, 2687, 2750, 4010, 4073, 3256, 3319, 3571, 3634, 2877, 2940,
+ 3885, 3948, 3193, 3256, 3634, 3697, 3130, 3193, 3697, 3760, 2814, 2877, 3948,
+ 4011, 3067, 3130, 3760, 3823, 3004, 3067, 3823, 3886, 2751, 2814, 4011, 4074,
+ 3446, 3509, 3383, 3446, 3509, 3572, 2941, 3004, 3886, 3949, 3320, 3383, 3572,
+ 3635, 3257, 3320, 3635, 3698, 3194, 3257, 3698, 3761, 2878, 2941, 3949, 4012,
+ 3131, 3194, 3761, 3824, 3068, 3131, 3824, 3887, 2815, 2878, 4012, 4075, 3005,
+ 3068, 3887, 3950, 3447, 3510, 3510, 3573, 3384, 3447, 3573, 3636, 3321, 3384,
+ 3636, 3699, 2942, 3005, 3950, 4013, 3258, 3321, 3699, 3762, 3195, 3258, 3762,
+ 3825, 2879, 2942, 4013, 4076, 3132, 3195, 3825, 3888, 3069, 3132, 3888, 3951,
+ 3511, 3574, 3448, 3511, 3574, 3637, 3006, 3069, 3951, 4014, 3385, 3448, 3637,
+ 3700, 3322, 3385, 3700, 3763, 3259, 3322, 3763, 3826, 2943, 3006, 4014, 4077,
+ 3196, 3259, 3826, 3889, 3133, 3196, 3889, 3952, 3070, 3133, 3952, 4015, 3512,
+ 3575, 3575, 3638, 3449, 3512, 3638, 3701, 3386, 3449, 3701, 3764, 3007, 3070,
+ 4015, 4078, 3323, 3386, 3764, 3827, 3260, 3323, 3827, 3890, 3197, 3260, 3890,
+ 3953, 3134, 3197, 3953, 4016, 3576, 3639, 3071, 3134, 4016, 4079, 3513, 3576,
+ 3639, 3702, 3450, 3513, 3702, 3765, 3387, 3450, 3765, 3828, 3324, 3387, 3828,
+ 3891, 3261, 3324, 3891, 3954, 3198, 3261, 3954, 4017, 3135, 3198, 4017, 4080,
+ 3577, 3640, 3640, 3703, 3514, 3577, 3703, 3766, 3451, 3514, 3766, 3829, 3388,
+ 3451, 3829, 3892, 3325, 3388, 3892, 3955, 3262, 3325, 3955, 4018, 3199, 3262,
+ 4018, 4081, 3641, 3704, 3578, 3641, 3704, 3767, 3515, 3578, 3767, 3830, 3452,
+ 3515, 3830, 3893, 3389, 3452, 3893, 3956, 3326, 3389, 3956, 4019, 3263, 3326,
+ 4019, 4082, 3642, 3705, 3705, 3768, 3579, 3642, 3768, 3831, 3516, 3579, 3831,
+ 3894, 3453, 3516, 3894, 3957, 3390, 3453, 3957, 4020, 3327, 3390, 4020, 4083,
+ 3706, 3769, 3643, 3706, 3769, 3832, 3580, 3643, 3832, 3895, 3517, 3580, 3895,
+ 3958, 3454, 3517, 3958, 4021, 3391, 3454, 4021, 4084, 3707, 3770, 3770, 3833,
+ 3644, 3707, 3833, 3896, 3581, 3644, 3896, 3959, 3518, 3581, 3959, 4022, 3455,
+ 3518, 4022, 4085, 3771, 3834, 3708, 3771, 3834, 3897, 3645, 3708, 3897, 3960,
+ 3582, 3645, 3960, 4023, 3519, 3582, 4023, 4086, 3772, 3835, 3835, 3898, 3709,
+ 3772, 3898, 3961, 3646, 3709, 3961, 4024, 3583, 3646, 4024, 4087, 3836, 3899,
+ 3773, 3836, 3899, 3962, 3710, 3773, 3962, 4025, 3647, 3710, 4025, 4088, 3837,
+ 3900, 3900, 3963, 3774, 3837, 3963, 4026, 3711, 3774, 4026, 4089, 3901, 3964,
+ 3838, 3901, 3964, 4027, 3775, 3838, 4027, 4090, 3902, 3965, 3965, 4028, 3839,
+ 3902, 4028, 4091, 3966, 4029, 3903, 3966, 4029, 4092, 3967, 4030, 4030, 4093,
+ 4031, 4094, 0, 0,
+};
+#endif // CONFIG_TX64X64
+
+#if CONFIG_CB4X4
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_2x2[4]) = { 0, 1, 2,
+ 3 };
+#endif
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = {
+ 0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
+};
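+
+/* Each av1_*_iscan_* table in this file is an inverse scan: indexed by a
+ * coefficient's raster (row-major) position, it gives that coefficient's
+ * position in the 1-D scan order, so the matching forward scan is the
+ * inverse permutation (scan[iscan[i]] == i for all i).  Below is a minimal
+ * sketch of recovering a forward scan from one of these tables; the helper
+ * name iscan_to_scan is illustrative, not part of the aom API. */
+static void iscan_to_scan(const int16_t *iscan, int16_t *scan, int n) {
+  /* Raster position i is the iscan[i]-th coefficient visited, so scan slot
+   * iscan[i] holds raster position i. */
+  for (int i = 0; i < n; ++i) scan[iscan[i]] = (int16_t)i;
+}
+/* e.g. iscan_to_scan(av1_default_iscan_4x4, scan4, 16) recovers the zig-zag
+ * forward scan matching the table above. */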
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = {
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x4[16]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+#endif // CONFIG_EXT_TX
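+
+/* Unlike the tuned default scans, the mrow/mcol tables follow closed forms.
+ * Reading a WxH table name as W columns by H rows (the convention the
+ * default tables in this file follow), mrow is plain raster order
+ * (iscan[i] == i) and mcol visits columns left to right
+ * (iscan[r * W + c] == c * H + r).  A sketch that regenerates an mcol table
+ * under that reading; gen_mcol_iscan is an illustrative helper, not an aom
+ * API. */
+static void gen_mcol_iscan(int16_t *iscan, int w, int h) {
+  for (int r = 0; r < h; ++r)
+    for (int c = 0; c < w; ++c)
+      iscan[r * w + c] = (int16_t)(c * h + r);  // column-major visit order
+}
+/* e.g. gen_mcol_iscan(buf, 4, 4) reproduces av1_mcol_iscan_4x4 above. */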
+
+DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_4x4[16]) = {
+ 0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_row_iscan_4x4[16]) = {
+ 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = {
+ 0, 1, 4, 9, 2, 3, 6, 11, 5, 7, 8, 13, 10, 12, 14, 17,
+ 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x8[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = {
+ 0, 1, 4, 9, 15, 19, 24, 28, 2, 3, 6, 11, 16, 21, 25, 29,
+ 5, 7, 8, 13, 18, 22, 26, 30, 10, 12, 14, 17, 20, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x16[64]) = {
+ 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18,
+ 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 30, 25, 28, 31, 34,
+ 29, 32, 35, 38, 33, 36, 39, 42, 37, 40, 43, 46, 41, 44, 47, 50,
+ 45, 48, 51, 54, 49, 52, 55, 58, 53, 56, 59, 61, 57, 60, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x4[64]) = {
+ 0, 1, 3, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54,
+ 2, 4, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 58,
+ 5, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 59, 61,
+ 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 60, 62, 63,
+};
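+
+/* The default scans for the larger rectangular sizes (4x16 and 16x4 above,
+ * and the rectangular default tables from 8x32 through 32x16 below) appear
+ * to be plain anti-diagonal scans: diagonals of constant r + c are visited
+ * in increasing order, each from its top-right cell downward.  The 4x8/8x4
+ * and square default tables do not follow this pattern.  A sketch that
+ * regenerates such a table under that assumption; gen_diag_iscan is an
+ * illustrative helper, not an aom API. */
+static void gen_diag_iscan(int16_t *iscan, int w, int h) {
+  int order = 0;
+  for (int d = 0; d < w + h - 1; ++d) {
+    // Walk the diagonal r + c == d from its smallest valid row downward.
+    const int r_min = (d < w) ? 0 : d - w + 1;
+    const int r_max = (d < h) ? d : h - 1;
+    for (int r = r_min; r <= r_max; ++r)
+      iscan[r * w + (d - r)] = (int16_t)order++;
+  }
+}
+/* e.g. gen_diag_iscan(buf, 16, 4) reproduces av1_default_iscan_16x4 above. */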
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x16[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x4[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x16[64]) = {
+ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+ 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+ 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+ 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x4[64]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29,
+ 36, 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38,
+ 45, 52, 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47,
+ 54, 61, 68, 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56,
+ 63, 70, 77, 84, 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65,
+ 72, 79, 86, 93, 100, 59, 66, 73, 80, 87, 94, 101, 108, 67, 74,
+ 81, 88, 95, 102, 109, 116, 75, 82, 89, 96, 103, 110, 117, 124, 83,
+ 90, 97, 104, 111, 118, 125, 132, 91, 98, 105, 112, 119, 126, 133, 140,
+ 99, 106, 113, 120, 127, 134, 141, 148, 107, 114, 121, 128, 135, 142, 149,
+ 156, 115, 122, 129, 136, 143, 150, 157, 164, 123, 130, 137, 144, 151, 158,
+ 165, 172, 131, 138, 145, 152, 159, 166, 173, 180, 139, 146, 153, 160, 167,
+ 174, 181, 188, 147, 154, 161, 168, 175, 182, 189, 196, 155, 162, 169, 176,
+ 183, 190, 197, 204, 163, 170, 177, 184, 191, 198, 205, 212, 171, 178, 185,
+ 192, 199, 206, 213, 220, 179, 186, 193, 200, 207, 214, 221, 228, 187, 194,
+ 201, 208, 215, 222, 229, 235, 195, 202, 209, 216, 223, 230, 236, 241, 203,
+ 210, 217, 224, 231, 237, 242, 246, 211, 218, 225, 232, 238, 243, 247, 250,
+ 219, 226, 233, 239, 244, 248, 251, 253, 227, 234, 240, 245, 249, 252, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x8[256]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 36, 44, 52, 60, 68, 76, 84,
+ 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204,
+ 212, 220, 2, 4, 7, 11, 16, 22, 29, 37, 45, 53, 61, 69, 77,
+ 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, 165, 173, 181, 189, 197,
+ 205, 213, 221, 228, 5, 8, 12, 17, 23, 30, 38, 46, 54, 62, 70,
+ 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190,
+ 198, 206, 214, 222, 229, 235, 9, 13, 18, 24, 31, 39, 47, 55, 63,
+ 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, 183,
+ 191, 199, 207, 215, 223, 230, 236, 241, 14, 19, 25, 32, 40, 48, 56,
+ 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176,
+ 184, 192, 200, 208, 216, 224, 231, 237, 242, 246, 20, 26, 33, 41, 49,
+ 57, 65, 73, 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169,
+ 177, 185, 193, 201, 209, 217, 225, 232, 238, 243, 247, 250, 27, 34, 42,
+ 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 130, 138, 146, 154, 162,
+ 170, 178, 186, 194, 202, 210, 218, 226, 233, 239, 244, 248, 251, 253, 35,
+ 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155,
+ 163, 171, 179, 187, 195, 203, 211, 219, 227, 234, 240, 245, 249, 252, 254,
+ 255,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x32[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x8[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x32[256]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225,
+ 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227,
+ 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229,
+ 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231,
+ 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233,
+ 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235,
+ 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237,
+ 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239,
+ 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241,
+ 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243,
+ 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245,
+ 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247,
+ 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249,
+ 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251,
+ 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253,
+ 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x8[256]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232,
+ 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97,
+ 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217,
+ 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82,
+ 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+ 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67,
+ 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187,
+ 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52,
+ 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172,
+ 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37,
+ 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157,
+ 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22,
+ 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142,
+ 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7,
+ 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+ 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247,
+ 255,
+};
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_8x8[64]) = {
+ 0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51,
+ 2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56,
+ 6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60,
+ 14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_row_iscan_8x8[64]) = {
+ 0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39,
+ 6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52,
+ 18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59,
+ 32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8[64]) = {
+ 0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44,
+ 3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53,
+ 12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60,
+ 25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36,
+ 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52,
+ 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, 54, 61, 68,
+ 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, 63, 70, 77, 84,
+ 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, 72, 79, 86, 93, 100,
+ 59, 66, 73, 80, 87, 94, 101, 107, 67, 74, 81, 88, 95, 102, 108, 113,
+ 75, 82, 89, 96, 103, 109, 114, 118, 83, 90, 97, 104, 110, 115, 119, 122,
+ 91, 98, 105, 111, 116, 120, 123, 125, 99, 106, 112, 117, 121, 124, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 36, 44, 52, 60, 68, 76, 84, 92,
+ 2, 4, 7, 11, 16, 22, 29, 37, 45, 53, 61, 69, 77, 85, 93, 100,
+ 5, 8, 12, 17, 23, 30, 38, 46, 54, 62, 70, 78, 86, 94, 101, 107,
+ 9, 13, 18, 24, 31, 39, 47, 55, 63, 71, 79, 87, 95, 102, 108, 113,
+ 14, 19, 25, 32, 40, 48, 56, 64, 72, 80, 88, 96, 103, 109, 114, 118,
+ 20, 26, 33, 41, 49, 57, 65, 73, 81, 89, 97, 104, 110, 115, 119, 122,
+ 27, 34, 42, 50, 58, 66, 74, 82, 90, 98, 105, 111, 116, 120, 123, 125,
+ 35, 43, 51, 59, 67, 75, 83, 91, 99, 106, 112, 117, 121, 124, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x8[128]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x16[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x8[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105,
+ 120, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, 106,
+ 121, 136, 5, 8, 12, 17, 23, 30, 38, 47, 57, 68, 80, 93, 107,
+ 122, 137, 152, 9, 13, 18, 24, 31, 39, 48, 58, 69, 81, 94, 108,
+ 123, 138, 153, 168, 14, 19, 25, 32, 40, 49, 59, 70, 82, 95, 109,
+ 124, 139, 154, 169, 184, 20, 26, 33, 41, 50, 60, 71, 83, 96, 110,
+ 125, 140, 155, 170, 185, 200, 27, 34, 42, 51, 61, 72, 84, 97, 111,
+ 126, 141, 156, 171, 186, 201, 216, 35, 43, 52, 62, 73, 85, 98, 112,
+ 127, 142, 157, 172, 187, 202, 217, 232, 44, 53, 63, 74, 86, 99, 113,
+ 128, 143, 158, 173, 188, 203, 218, 233, 248, 54, 64, 75, 87, 100, 114,
+ 129, 144, 159, 174, 189, 204, 219, 234, 249, 264, 65, 76, 88, 101, 115,
+ 130, 145, 160, 175, 190, 205, 220, 235, 250, 265, 280, 77, 89, 102, 116,
+ 131, 146, 161, 176, 191, 206, 221, 236, 251, 266, 281, 296, 90, 103, 117,
+ 132, 147, 162, 177, 192, 207, 222, 237, 252, 267, 282, 297, 312, 104, 118,
+ 133, 148, 163, 178, 193, 208, 223, 238, 253, 268, 283, 298, 313, 328, 119,
+ 134, 149, 164, 179, 194, 209, 224, 239, 254, 269, 284, 299, 314, 329, 344,
+ 135, 150, 165, 180, 195, 210, 225, 240, 255, 270, 285, 300, 315, 330, 345,
+ 360, 151, 166, 181, 196, 211, 226, 241, 256, 271, 286, 301, 316, 331, 346,
+ 361, 376, 167, 182, 197, 212, 227, 242, 257, 272, 287, 302, 317, 332, 347,
+ 362, 377, 392, 183, 198, 213, 228, 243, 258, 273, 288, 303, 318, 333, 348,
+ 363, 378, 393, 407, 199, 214, 229, 244, 259, 274, 289, 304, 319, 334, 349,
+ 364, 379, 394, 408, 421, 215, 230, 245, 260, 275, 290, 305, 320, 335, 350,
+ 365, 380, 395, 409, 422, 434, 231, 246, 261, 276, 291, 306, 321, 336, 351,
+ 366, 381, 396, 410, 423, 435, 446, 247, 262, 277, 292, 307, 322, 337, 352,
+ 367, 382, 397, 411, 424, 436, 447, 457, 263, 278, 293, 308, 323, 338, 353,
+ 368, 383, 398, 412, 425, 437, 448, 458, 467, 279, 294, 309, 324, 339, 354,
+ 369, 384, 399, 413, 426, 438, 449, 459, 468, 476, 295, 310, 325, 340, 355,
+ 370, 385, 400, 414, 427, 439, 450, 460, 469, 477, 484, 311, 326, 341, 356,
+ 371, 386, 401, 415, 428, 440, 451, 461, 470, 478, 485, 491, 327, 342, 357,
+ 372, 387, 402, 416, 429, 441, 452, 462, 471, 479, 486, 492, 497, 343, 358,
+ 373, 388, 403, 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 359,
+ 374, 389, 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506,
+ 375, 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507,
+ 509, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105,
+ 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344,
+ 360, 376, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92,
+ 106, 121, 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329,
+ 345, 361, 377, 392, 5, 8, 12, 17, 23, 30, 38, 47, 57, 68, 80,
+ 93, 107, 122, 138, 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314,
+ 330, 346, 362, 378, 393, 407, 9, 13, 18, 24, 31, 39, 48, 58, 69,
+ 81, 94, 108, 123, 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299,
+ 315, 331, 347, 363, 379, 394, 408, 421, 14, 19, 25, 32, 40, 49, 59,
+ 70, 82, 95, 109, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284,
+ 300, 316, 332, 348, 364, 380, 395, 409, 422, 434, 20, 26, 33, 41, 50,
+ 60, 71, 83, 96, 110, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269,
+ 285, 301, 317, 333, 349, 365, 381, 396, 410, 423, 435, 446, 27, 34, 42,
+ 51, 61, 72, 84, 97, 111, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 270, 286, 302, 318, 334, 350, 366, 382, 397, 411, 424, 436, 447, 457, 35,
+ 43, 52, 62, 73, 85, 98, 112, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 398, 412, 425, 437, 448, 458,
+ 467, 44, 53, 63, 74, 86, 99, 113, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 399, 413, 426, 438, 449,
+ 459, 468, 476, 54, 64, 75, 87, 100, 114, 129, 145, 161, 177, 193, 209,
+ 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 400, 414, 427, 439,
+ 450, 460, 469, 477, 484, 65, 76, 88, 101, 115, 130, 146, 162, 178, 194,
+ 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 401, 415, 428,
+ 440, 451, 461, 470, 478, 485, 491, 77, 89, 102, 116, 131, 147, 163, 179,
+ 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, 387, 402, 416,
+ 429, 441, 452, 462, 471, 479, 486, 492, 497, 90, 103, 117, 132, 148, 164,
+ 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, 356, 372, 388, 403,
+ 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 104, 118, 133, 149,
+ 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, 341, 357, 373, 389,
+ 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506, 119, 134,
+ 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, 358, 374,
+ 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507, 509,
+ 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359,
+ 375, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+ 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+ 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+ 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+ 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+ 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+ 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+ 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+ 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491,
+ 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492,
+ 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494,
+ 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495,
+ 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496,
+ 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498,
+ 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499,
+ 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501,
+ 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502,
+ 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503,
+ 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505,
+ 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506,
+ 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508,
+ 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509,
+ 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510,
+ 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x16[512]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
+ 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193,
+ 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433,
+ 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162,
+ 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402,
+ 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131,
+ 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371,
+ 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100,
+ 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340,
+ 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69,
+ 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309,
+ 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38,
+ 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278,
+ 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7,
+ 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487,
+ 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216,
+ 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456,
+ 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185,
+ 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425,
+ 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154,
+ 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394,
+ 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123,
+ 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363,
+ 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92,
+ 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332,
+ 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61,
+ 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301,
+ 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30,
+ 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270,
+ 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479,
+ 495, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x32[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_16x16[256]) = {
+ 0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198,
+ 1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212,
+ 2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216,
+ 3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218,
+ 5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223,
+ 7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228,
+ 9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230,
+ 13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235,
+ 17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237,
+ 22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240,
+ 27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244,
+ 33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247,
+ 42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251,
+ 50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253,
+ 57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254,
+ 65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_row_iscan_16x16[256]) = {
+ 0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76,
+ 86, 3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99,
+ 115, 130, 8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103,
+ 119, 142, 167, 14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100,
+ 116, 135, 161, 185, 21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94,
+ 112, 133, 154, 179, 205, 28, 34, 39, 45, 50, 58, 67, 77, 87, 96,
+ 106, 121, 146, 169, 196, 212, 41, 46, 49, 56, 63, 70, 79, 90, 98,
+ 107, 122, 138, 159, 182, 207, 222, 52, 57, 62, 69, 75, 83, 93, 102,
+ 110, 120, 134, 150, 176, 195, 215, 226, 66, 71, 78, 82, 91, 97, 108,
+ 113, 127, 136, 148, 168, 188, 202, 221, 232, 80, 89, 92, 101, 105, 114,
+ 125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95, 104, 109, 117, 123,
+ 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129,
+ 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137,
+ 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149,
+ 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152,
+ 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253,
+ 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x16[256]) = {
+ 0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166,
+ 179, 1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154,
+ 178, 196, 3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148,
+ 164, 186, 201, 6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127,
+ 153, 169, 193, 208, 10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114,
+ 133, 161, 176, 198, 214, 15, 21, 26, 34, 43, 52, 65, 77, 91, 106,
+ 120, 140, 165, 185, 205, 221, 22, 27, 32, 41, 48, 60, 73, 85, 99,
+ 116, 130, 151, 175, 190, 211, 225, 29, 35, 42, 49, 59, 69, 81, 95,
+ 108, 125, 139, 155, 182, 197, 217, 229, 38, 45, 51, 61, 68, 80, 93,
+ 105, 118, 134, 150, 168, 191, 207, 223, 234, 50, 56, 63, 74, 83, 94,
+ 109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62, 70, 76, 87, 97,
+ 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75, 82, 90, 102,
+ 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89, 100, 111,
+ 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115,
+ 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121,
+ 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254,
+ 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253,
+ 255,
+};
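+
+/* A typical consumer of an inverse scan: given a block of coefficients in
+ * raster order, the end-of-block (eob) is one past the scan position of the
+ * last nonzero coefficient.  A hedged sketch of that computation; find_eob
+ * and the plain int32_t coefficient type are illustrative, not aom's
+ * quantizer interface. */
+static int find_eob(const int32_t *coeff, const int16_t *iscan, int n) {
+  int eob = 0;
+  for (int i = 0; i < n; ++i)
+    if (coeff[i] && iscan[i] + 1 > eob) eob = iscan[i] + 1;  // scan pos + 1
+  return eob;
+}
+/* e.g. find_eob(block, av1_default_iscan_16x16, 256) */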
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+ 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864,
+ 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289,
+ 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737,
+ 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162,
+ 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610,
+ 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35,
+ 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931,
+ 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356,
+ 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804,
+ 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229,
+ 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677,
+ 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102,
+ 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550,
+ 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+ 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871,
+ 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296,
+ 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744,
+ 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169,
+ 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617,
+ 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42,
+ 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938,
+ 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363,
+ 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+ 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236,
+ 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684,
+ 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109,
+ 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557,
+ 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430,
+ 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878,
+ 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303,
+ 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751,
+ 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176,
+ 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624,
+ 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49,
+ 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945,
+ 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370,
+ 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818,
+ 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243,
+ 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691,
+ 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116,
+ 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564,
+ 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437,
+ 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885,
+ 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310,
+ 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758,
+ 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183,
+ 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631,
+ 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56,
+ 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952,
+ 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377,
+ 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825,
+ 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250,
+ 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698,
+ 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123,
+ 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571,
+ 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444,
+ 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892,
+ 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317,
+ 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765,
+ 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190,
+ 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638,
+ 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63,
+ 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+ 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959,
+ 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+ 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+ 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+ 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+ 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
+ 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
+ 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
+ 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
+ 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+ 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311,
+ 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337,
+ 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350,
+ 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376,
+ 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402,
+ 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+ 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+ 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+ 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454,
+ 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467,
+ 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480,
+ 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493,
+ 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506,
+ 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519,
+ 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532,
+ 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
+ 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558,
+ 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571,
+ 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584,
+ 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+ 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610,
+ 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
+ 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636,
+ 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649,
+ 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662,
+ 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675,
+ 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
+ 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
+ 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+ 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727,
+ 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740,
+ 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753,
+ 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766,
+ 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779,
+ 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792,
+ 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805,
+ 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818,
+ 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+ 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+ 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+ 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870,
+ 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883,
+ 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896,
+ 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909,
+ 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922,
+ 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935,
+ 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948,
+ 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961,
+ 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974,
+ 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987,
+ 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000,
+ 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+ 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = {
+ 0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145,
+ 170, 193, 204, 210, 219, 229, 233, 245, 257, 275, 299, 342, 356,
+ 377, 405, 455, 471, 495, 527, 1, 4, 8, 15, 22, 30, 45,
+ 58, 74, 92, 112, 133, 158, 184, 203, 215, 222, 228, 234, 237,
+ 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551, 3,
+ 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189,
+ 208, 217, 224, 231, 235, 238, 273, 297, 316, 329, 375, 403, 425,
+ 440, 493, 525, 550, 567, 6, 11, 16, 23, 31, 43, 60, 73,
+ 90, 109, 126, 150, 173, 196, 211, 220, 226, 232, 236, 239, 296,
+ 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575, 9, 14,
+ 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214,
+ 223, 244, 255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523,
+ 582, 596, 617, 645, 13, 20, 26, 35, 44, 54, 72, 85, 105,
+ 123, 140, 163, 182, 205, 216, 225, 254, 271, 294, 314, 353, 373,
+ 400, 423, 468, 491, 522, 548, 595, 616, 644, 666, 21, 27, 33,
+ 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227,
+ 270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615,
+ 643, 665, 680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139,
+ 159, 178, 197, 212, 221, 230, 292, 312, 326, 334, 398, 421, 437,
+ 446, 520, 546, 564, 574, 642, 664, 679, 687, 34, 40, 46, 56,
+ 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291, 340,
+ 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705,
+ 723, 747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177,
+ 194, 252, 268, 290, 311, 351, 370, 396, 420, 466, 488, 518, 545,
+ 593, 613, 640, 663, 704, 722, 746, 765, 51, 59, 66, 76, 89,
+ 99, 119, 131, 149, 168, 181, 200, 267, 289, 310, 325, 369, 395,
+ 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721, 745, 764,
+ 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207,
+ 288, 309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638,
+ 661, 677, 686, 744, 763, 776, 783, 70, 79, 86, 97, 108, 122,
+ 137, 155, 242, 251, 266, 287, 339, 350, 368, 393, 452, 465, 486,
+ 515, 580, 592, 611, 637, 692, 703, 720, 743, 788, 798, 813, 833,
+ 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286, 308, 349,
+ 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719,
+ 742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169,
+ 185, 264, 285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561,
+ 609, 635, 659, 676, 718, 741, 761, 775, 811, 831, 847, 858, 117,
+ 128, 136, 148, 160, 175, 188, 198, 284, 306, 322, 332, 390, 415,
+ 433, 444, 512, 540, 560, 572, 634, 658, 675, 685, 740, 760, 774,
+ 782, 830, 846, 857, 863, 135, 146, 152, 165, 241, 249, 263, 283,
+ 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633, 691,
+ 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166,
+ 174, 183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510,
+ 539, 589, 607, 632, 657, 700, 716, 738, 759, 795, 809, 828, 845,
+ 874, 886, 902, 915, 176, 187, 195, 202, 261, 281, 304, 321, 363,
+ 387, 413, 432, 481, 509, 538, 559, 606, 631, 656, 674, 715, 737,
+ 758, 773, 808, 827, 844, 856, 885, 901, 914, 923, 192, 199, 206,
+ 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537, 558, 571,
+ 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900,
+ 913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461,
+ 480, 507, 578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807,
+ 825, 866, 873, 884, 899, 930, 936, 945, 957, 246, 259, 278, 302,
+ 345, 361, 384, 411, 460, 479, 506, 536, 587, 604, 628, 654, 698,
+ 713, 734, 756, 793, 806, 824, 842, 872, 883, 898, 912, 935, 944,
+ 956, 966, 258, 277, 301, 319, 360, 383, 410, 430, 478, 505, 535,
+ 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841, 854,
+ 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382,
+ 409, 429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754,
+ 770, 780, 822, 840, 853, 861, 896, 910, 920, 926, 954, 964, 971,
+ 975, 336, 344, 359, 381, 449, 459, 477, 503, 577, 586, 602, 625,
+ 689, 697, 711, 731, 785, 792, 804, 821, 865, 871, 881, 895, 929,
+ 934, 942, 953, 977, 981, 987, 995, 343, 358, 380, 408, 458, 476,
+ 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791, 803, 820,
+ 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001,
+ 357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709,
+ 729, 752, 769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951,
+ 962, 970, 985, 993, 1000, 1005, 378, 406, 427, 441, 500, 531, 554,
+ 569, 622, 649, 669, 682, 728, 751, 768, 779, 818, 837, 851, 860,
+ 892, 907, 918, 925, 950, 961, 969, 974, 992, 999, 1004, 1007, 448,
+ 457, 474, 499, 576, 584, 599, 621, 688, 695, 708, 727, 784, 790,
+ 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979, 984,
+ 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648,
+ 694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931,
+ 938, 948, 960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497,
+ 529, 553, 597, 619, 647, 668, 706, 725, 749, 767, 799, 815, 835,
+ 850, 876, 889, 905, 917, 937, 947, 959, 968, 982, 989, 997, 1003,
+ 1011, 1015, 1019, 1022, 496, 528, 552, 568, 618, 646, 667, 681, 724,
+ 748, 766, 778, 814, 834, 849, 859, 888, 904, 916, 924, 946, 958,
+ 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021, 1023,
+};
+
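+/* Variant 32x32 inverse scan that visits the left half of the block first:
+ * scan positions 0..511 fall in columns 0..15 and positions 512..1023 in
+ * columns 16..31, as can be read directly off the table. */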
+DECLARE_ALIGNED(16, static const int16_t, av1_v2_iscan_32x32[1024]) = {
+ 0, 1, 4, 9, 15, 22, 33, 43, 56, 71, 86, 104, 121,
+ 142, 166, 189, 512, 518, 527, 539, 551, 566, 584, 602, 621, 644,
+ 668, 695, 721, 748, 780, 811, 2, 3, 6, 11, 17, 26, 35,
+ 45, 58, 73, 90, 106, 123, 146, 168, 193, 513, 519, 528, 540,
+ 553, 567, 585, 603, 622, 647, 670, 696, 722, 751, 783, 812, 5,
+ 7, 8, 13, 20, 28, 37, 50, 62, 75, 92, 108, 129, 150,
+ 170, 195, 514, 521, 530, 541, 554, 569, 587, 605, 625, 649, 671,
+ 699, 725, 752, 785, 815, 10, 12, 14, 19, 23, 31, 41, 52,
+ 65, 81, 96, 113, 133, 152, 175, 201, 515, 522, 531, 542, 556,
+ 572, 589, 607, 629, 651, 673, 700, 726, 757, 788, 819, 16, 18,
+ 21, 24, 30, 39, 48, 59, 69, 83, 100, 119, 137, 158, 181,
+ 203, 516, 523, 534, 545, 559, 574, 591, 610, 632, 654, 679, 704,
+ 730, 762, 791, 824, 25, 27, 29, 32, 40, 46, 54, 67, 79,
+ 94, 109, 127, 143, 164, 185, 210, 517, 525, 535, 547, 561, 578,
+ 595, 615, 635, 656, 684, 707, 737, 766, 793, 830, 34, 36, 38,
+ 42, 49, 55, 64, 76, 87, 102, 117, 135, 154, 176, 197, 219,
+ 520, 529, 538, 550, 565, 580, 598, 618, 639, 664, 687, 712, 741,
+ 769, 802, 833, 44, 47, 51, 53, 60, 68, 77, 85, 98, 114,
+ 131, 147, 162, 183, 208, 227, 524, 533, 544, 557, 571, 588, 606,
+ 623, 645, 667, 692, 720, 747, 776, 806, 838, 57, 61, 63, 66,
+ 70, 80, 88, 99, 112, 124, 140, 159, 179, 199, 216, 233, 526,
+ 536, 548, 562, 577, 593, 613, 633, 653, 676, 701, 727, 756, 786,
+ 814, 847, 72, 74, 78, 82, 84, 95, 103, 115, 125, 139, 156,
+ 173, 190, 211, 229, 246, 532, 543, 555, 568, 581, 601, 619, 637,
+ 663, 685, 709, 738, 763, 792, 826, 855, 89, 91, 93, 97, 101,
+ 110, 118, 132, 141, 157, 171, 186, 206, 224, 241, 255, 537, 549,
+ 560, 576, 592, 608, 628, 650, 669, 693, 719, 744, 773, 805, 834,
+ 862, 105, 107, 111, 116, 120, 128, 136, 148, 160, 174, 187, 205,
+ 221, 236, 251, 267, 546, 558, 570, 583, 600, 617, 636, 657, 680,
+ 706, 729, 758, 787, 813, 846, 871, 122, 126, 130, 134, 138, 144,
+ 155, 163, 180, 191, 207, 222, 232, 248, 264, 278, 552, 564, 579,
+ 594, 609, 630, 648, 666, 688, 715, 742, 768, 797, 827, 856, 877,
+ 145, 149, 151, 153, 161, 165, 177, 184, 200, 212, 225, 237, 249,
+ 262, 275, 289, 563, 575, 590, 604, 620, 638, 660, 683, 705, 728,
+ 753, 779, 809, 839, 866, 889, 167, 169, 172, 178, 182, 188, 198,
+ 209, 217, 230, 242, 252, 265, 276, 288, 301, 573, 586, 599, 616,
+ 634, 652, 672, 694, 716, 743, 767, 794, 825, 850, 874, 899, 192,
+ 194, 196, 202, 204, 213, 220, 228, 234, 247, 256, 268, 279, 290,
+ 302, 315, 582, 597, 614, 631, 646, 665, 686, 708, 732, 759, 784,
+ 810, 837, 863, 886, 908, 214, 215, 218, 223, 226, 231, 239, 244,
+ 253, 261, 271, 283, 292, 304, 317, 325, 596, 611, 626, 642, 661,
+ 681, 702, 723, 745, 770, 800, 828, 853, 875, 897, 919, 235, 238,
+ 240, 243, 245, 250, 257, 263, 270, 280, 287, 298, 307, 319, 329,
+ 340, 612, 624, 640, 658, 677, 697, 717, 739, 764, 789, 816, 844,
+ 867, 890, 909, 927, 254, 258, 259, 260, 266, 269, 272, 282, 286,
+ 296, 303, 312, 323, 333, 341, 355, 627, 641, 655, 674, 690, 713,
+ 735, 760, 781, 807, 835, 857, 880, 902, 921, 940, 273, 274, 277,
+ 281, 284, 285, 291, 299, 305, 310, 320, 327, 337, 346, 357, 369,
+ 643, 659, 675, 689, 710, 733, 754, 777, 803, 831, 851, 872, 892,
+ 913, 934, 950, 293, 294, 295, 297, 300, 306, 308, 314, 321, 326,
+ 335, 343, 352, 361, 372, 378, 662, 678, 691, 711, 731, 749, 774,
+ 798, 822, 848, 869, 887, 906, 925, 942, 961, 309, 311, 313, 316,
+ 318, 322, 324, 332, 338, 344, 351, 358, 367, 375, 386, 394, 682,
+ 698, 714, 734, 750, 772, 795, 820, 842, 864, 884, 904, 923, 938,
+ 954, 967, 328, 330, 331, 334, 336, 339, 342, 348, 354, 359, 366,
+ 374, 382, 391, 400, 409, 703, 718, 736, 755, 775, 796, 818, 840,
+ 860, 882, 900, 917, 936, 952, 965, 977, 345, 347, 349, 350, 353,
+ 356, 360, 364, 371, 376, 383, 389, 395, 406, 412, 423, 724, 740,
+ 761, 778, 799, 821, 841, 859, 878, 895, 915, 932, 948, 963, 975,
+ 986, 362, 363, 365, 368, 370, 373, 377, 379, 387, 392, 397, 405,
+ 411, 420, 428, 439, 746, 765, 782, 804, 823, 843, 861, 879, 894,
+ 911, 930, 946, 959, 973, 984, 994, 380, 381, 384, 385, 388, 390,
+ 393, 396, 403, 408, 413, 422, 427, 436, 444, 452, 771, 790, 808,
+ 832, 849, 865, 883, 896, 912, 928, 944, 957, 971, 982, 992, 1001,
+ 398, 399, 401, 402, 404, 407, 410, 414, 419, 425, 429, 437, 442,
+ 449, 458, 465, 801, 817, 836, 852, 870, 885, 901, 916, 931, 945,
+ 956, 969, 980, 990, 999, 1007, 415, 416, 417, 418, 421, 424, 426,
+ 430, 434, 441, 445, 453, 459, 463, 473, 480, 829, 845, 858, 873,
+ 888, 905, 918, 933, 947, 958, 970, 979, 988, 997, 1005, 1012, 431,
+ 432, 433, 435, 438, 440, 443, 446, 451, 456, 461, 468, 475, 479,
+ 488, 494, 854, 868, 881, 893, 907, 924, 937, 949, 960, 972, 981,
+ 989, 996, 1003, 1010, 1016, 447, 448, 450, 454, 455, 457, 460, 462,
+ 469, 472, 477, 482, 490, 495, 499, 503, 876, 891, 903, 914, 926,
+ 939, 953, 964, 974, 983, 991, 998, 1004, 1009, 1014, 1019, 464, 466,
+ 467, 470, 471, 474, 476, 478, 484, 489, 493, 497, 501, 504, 506,
+ 508, 898, 910, 922, 935, 943, 955, 966, 976, 985, 993, 1000, 1006,
+ 1011, 1015, 1018, 1021, 481, 483, 485, 486, 487, 491, 492, 496, 498,
+ 500, 502, 505, 507, 509, 510, 511, 920, 929, 941, 951, 962, 968,
+ 978, 987, 995, 1002, 1008, 1013, 1017, 1020, 1022, 1023,
+};
+
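+/* Variant 32x32 inverse scan that visits the top half of the block first:
+ * scan positions 0..511 fall in rows 0..15 and positions 512..1023 in
+ * rows 16..31. */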
+DECLARE_ALIGNED(16, static const int16_t, av1_h2_iscan_32x32[1024]) = {
+ 0, 1, 4, 9, 15, 22, 33, 43, 56, 71, 86, 104, 121,
+ 142, 166, 189, 214, 233, 254, 273, 292, 309, 328, 345, 362, 378,
+ 397, 415, 431, 447, 464, 481, 2, 3, 6, 11, 17, 26, 35,
+ 45, 58, 73, 90, 106, 123, 146, 168, 193, 215, 236, 255, 274,
+ 294, 310, 329, 346, 363, 381, 399, 416, 432, 448, 465, 482, 5,
+ 7, 8, 13, 20, 28, 37, 50, 62, 75, 92, 108, 129, 150,
+ 170, 195, 216, 240, 259, 275, 295, 312, 331, 348, 365, 383, 400,
+ 417, 433, 449, 467, 485, 10, 12, 14, 19, 23, 31, 41, 52,
+ 65, 81, 96, 113, 133, 152, 175, 201, 221, 243, 260, 280, 297,
+ 315, 333, 350, 367, 385, 402, 418, 434, 452, 470, 486, 16, 18,
+ 21, 24, 30, 39, 48, 59, 69, 83, 100, 119, 137, 158, 181,
+ 203, 226, 244, 264, 283, 300, 318, 335, 353, 370, 388, 404, 420,
+ 438, 455, 471, 487, 25, 27, 29, 32, 40, 46, 54, 67, 79,
+ 94, 109, 127, 143, 164, 185, 210, 231, 250, 269, 285, 304, 322,
+ 339, 356, 373, 389, 407, 423, 440, 457, 473, 491, 34, 36, 38,
+ 42, 49, 55, 64, 76, 87, 102, 117, 135, 154, 176, 197, 219,
+ 239, 256, 272, 291, 308, 324, 341, 359, 377, 393, 410, 426, 442,
+ 460, 476, 492, 44, 47, 51, 53, 60, 68, 77, 85, 98, 114,
+ 131, 147, 162, 183, 208, 227, 245, 262, 282, 298, 314, 332, 349,
+ 364, 379, 396, 412, 430, 446, 462, 478, 495, 57, 61, 63, 66,
+ 70, 80, 88, 99, 112, 124, 140, 159, 179, 199, 217, 234, 253,
+ 270, 286, 305, 321, 337, 354, 371, 387, 403, 419, 435, 451, 468,
+ 484, 498, 72, 74, 78, 82, 84, 95, 103, 115, 125, 139, 156,
+ 173, 190, 211, 229, 246, 261, 281, 296, 311, 325, 344, 360, 375,
+ 392, 408, 425, 441, 456, 472, 489, 500, 89, 91, 93, 97, 101,
+ 110, 118, 132, 141, 157, 171, 186, 206, 224, 241, 257, 271, 287,
+ 303, 320, 336, 351, 366, 384, 398, 413, 429, 445, 461, 477, 493,
+ 502, 105, 107, 111, 116, 120, 128, 136, 148, 160, 174, 187, 205,
+ 222, 237, 251, 267, 284, 299, 313, 327, 343, 358, 374, 390, 405,
+ 422, 437, 453, 469, 483, 497, 505, 122, 126, 130, 134, 138, 144,
+ 155, 163, 180, 191, 207, 223, 232, 248, 265, 278, 293, 307, 323,
+ 338, 352, 368, 382, 395, 411, 427, 443, 459, 475, 490, 501, 507,
+ 145, 149, 151, 153, 161, 165, 177, 184, 200, 212, 225, 238, 249,
+ 263, 276, 289, 306, 319, 334, 347, 361, 376, 391, 406, 421, 436,
+ 450, 463, 479, 496, 504, 509, 167, 169, 172, 178, 182, 188, 198,
+ 209, 218, 230, 242, 252, 266, 277, 288, 301, 317, 330, 342, 357,
+ 372, 386, 401, 414, 428, 444, 458, 474, 488, 499, 506, 510, 192,
+ 194, 196, 202, 204, 213, 220, 228, 235, 247, 258, 268, 279, 290,
+ 302, 316, 326, 340, 355, 369, 380, 394, 409, 424, 439, 454, 466,
+ 480, 494, 503, 508, 511, 512, 513, 514, 515, 516, 517, 520, 523,
+ 526, 532, 537, 545, 551, 561, 573, 581, 596, 610, 625, 642, 661,
+ 680, 701, 722, 745, 770, 800, 827, 853, 875, 897, 919, 518, 519,
+ 521, 522, 524, 525, 528, 533, 536, 542, 549, 557, 564, 575, 585,
+ 597, 611, 623, 640, 656, 676, 696, 717, 739, 763, 789, 815, 844,
+ 867, 889, 909, 927, 527, 529, 530, 531, 534, 535, 538, 544, 548,
+ 555, 560, 569, 579, 589, 598, 614, 626, 641, 655, 673, 690, 712,
+ 735, 760, 780, 806, 834, 857, 880, 902, 921, 940, 539, 540, 541,
+ 543, 546, 547, 550, 558, 562, 567, 576, 583, 593, 603, 616, 631,
+ 643, 657, 674, 689, 710, 733, 752, 776, 803, 830, 850, 872, 892,
+ 913, 934, 950, 552, 553, 554, 556, 559, 563, 565, 571, 577, 582,
+ 591, 600, 609, 620, 634, 644, 662, 677, 691, 711, 730, 748, 773,
+ 798, 822, 847, 869, 887, 906, 925, 942, 961, 566, 568, 570, 572,
+ 574, 578, 580, 588, 594, 601, 608, 617, 629, 637, 652, 665, 681,
+ 697, 713, 734, 749, 772, 793, 819, 842, 863, 884, 904, 923, 938,
+ 954, 967, 584, 586, 587, 590, 592, 595, 599, 605, 613, 618, 628,
+ 636, 648, 660, 671, 686, 702, 718, 736, 753, 774, 794, 818, 840,
+ 860, 882, 900, 917, 936, 952, 965, 977, 602, 604, 606, 607, 612,
+ 615, 619, 624, 633, 638, 649, 658, 666, 683, 692, 707, 723, 740,
+ 761, 777, 799, 820, 841, 859, 877, 895, 915, 932, 948, 963, 975,
+ 986, 621, 622, 627, 630, 632, 635, 639, 645, 653, 663, 668, 682,
+ 688, 704, 716, 732, 746, 764, 781, 804, 823, 843, 861, 878, 894,
+ 911, 930, 946, 959, 973, 984, 994, 646, 647, 650, 651, 654, 659,
+ 664, 667, 678, 685, 693, 706, 715, 728, 743, 757, 771, 790, 807,
+ 831, 848, 864, 883, 896, 912, 928, 944, 957, 971, 982, 992, 1001,
+ 669, 670, 672, 675, 679, 684, 687, 694, 703, 709, 719, 729, 741,
+ 754, 767, 783, 801, 816, 835, 851, 870, 885, 901, 916, 931, 945,
+ 956, 969, 980, 990, 999, 1007, 695, 698, 699, 700, 705, 708, 714,
+ 720, 726, 738, 744, 758, 768, 779, 795, 810, 828, 845, 858, 873,
+ 888, 905, 918, 933, 947, 958, 970, 979, 988, 997, 1005, 1012, 721,
+ 724, 725, 727, 731, 737, 742, 747, 756, 765, 775, 786, 797, 809,
+ 825, 837, 854, 868, 881, 893, 907, 924, 937, 949, 960, 972, 981,
+ 989, 996, 1003, 1010, 1016, 750, 751, 755, 759, 762, 766, 769, 778,
+ 787, 792, 805, 812, 829, 838, 852, 865, 876, 890, 903, 914, 926,
+ 939, 953, 964, 974, 983, 991, 998, 1004, 1009, 1014, 1019, 782, 784,
+ 785, 788, 791, 796, 802, 808, 814, 826, 836, 846, 856, 866, 874,
+ 886, 898, 910, 922, 935, 943, 955, 966, 976, 985, 993, 1000, 1006,
+ 1011, 1015, 1018, 1021, 811, 813, 817, 821, 824, 832, 833, 839, 849,
+ 855, 862, 871, 879, 891, 899, 908, 920, 929, 941, 951, 962, 968,
+ 978, 987, 995, 1002, 1008, 1013, 1017, 1020, 1022, 1023,
+};
+
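+/* Variant 32x32 inverse scan that visits the top-left quarter first: scan
+ * positions 0..255 fall in the 16x16 top-left corner, and the remaining
+ * three quarters are covered by positions 256..1023. */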
+DECLARE_ALIGNED(16, static const int16_t, av1_qtr_iscan_32x32[1024]) = {
+ 0, 1, 4, 9, 15, 22, 33, 43, 56, 71, 86, 104, 121,
+ 142, 166, 189, 256, 268, 286, 310, 334, 364, 400, 435, 471, 510,
+ 553, 598, 640, 683, 732, 780, 2, 3, 6, 11, 17, 26, 35,
+ 45, 58, 73, 90, 106, 123, 146, 168, 193, 258, 270, 288, 312,
+ 338, 366, 402, 437, 473, 516, 557, 600, 642, 687, 736, 782, 5,
+ 7, 8, 13, 20, 28, 37, 50, 62, 75, 92, 108, 129, 150,
+ 170, 195, 260, 274, 292, 314, 340, 370, 406, 441, 478, 520, 559,
+ 604, 646, 689, 740, 788, 10, 12, 14, 19, 23, 31, 41, 52,
+ 65, 81, 96, 113, 133, 152, 175, 201, 262, 276, 294, 316, 344,
+ 376, 410, 445, 484, 524, 563, 606, 648, 697, 746, 793, 16, 18,
+ 21, 24, 30, 39, 48, 59, 69, 83, 100, 119, 137, 158, 181,
+ 203, 264, 278, 300, 322, 350, 380, 414, 451, 490, 530, 571, 612,
+ 656, 705, 750, 799, 25, 27, 29, 32, 40, 46, 54, 67, 79,
+ 94, 109, 127, 143, 164, 185, 210, 266, 282, 302, 326, 354, 388,
+ 422, 459, 496, 533, 579, 618, 665, 711, 754, 809, 34, 36, 38,
+ 42, 49, 55, 64, 76, 87, 102, 117, 135, 154, 176, 197, 216,
+ 272, 289, 308, 332, 362, 392, 427, 465, 504, 545, 585, 626, 671,
+ 717, 766, 813, 44, 47, 51, 53, 60, 68, 77, 85, 98, 114,
+ 131, 147, 162, 183, 208, 222, 279, 298, 320, 346, 374, 408, 442,
+ 475, 511, 551, 592, 638, 681, 726, 772, 821, 57, 61, 63, 66,
+ 70, 80, 88, 99, 112, 124, 140, 159, 179, 199, 214, 227, 284,
+ 304, 328, 355, 386, 418, 455, 492, 528, 567, 608, 649, 695, 742,
+ 786, 833, 72, 74, 78, 82, 84, 95, 103, 115, 125, 139, 156,
+ 173, 190, 211, 224, 233, 296, 317, 342, 367, 394, 433, 466, 500,
+ 543, 581, 622, 667, 707, 752, 803, 843, 89, 91, 93, 97, 101,
+ 110, 118, 132, 141, 157, 171, 186, 206, 220, 231, 239, 306, 330,
+ 352, 384, 415, 447, 482, 521, 554, 593, 636, 677, 722, 770, 815,
+ 852, 105, 107, 111, 116, 120, 128, 136, 148, 160, 174, 187, 205,
+ 218, 229, 237, 244, 323, 347, 371, 398, 431, 463, 498, 534, 573,
+ 616, 654, 698, 743, 783, 831, 864, 122, 126, 130, 134, 138, 144,
+ 155, 163, 180, 191, 207, 219, 226, 235, 242, 248, 335, 360, 390,
+ 419, 449, 485, 518, 549, 587, 630, 672, 715, 760, 805, 845, 872,
+ 145, 149, 151, 153, 161, 165, 177, 184, 200, 212, 221, 230, 236,
+ 241, 246, 251, 356, 382, 411, 438, 469, 501, 539, 577, 613, 652,
+ 690, 730, 776, 822, 858, 886, 167, 169, 172, 178, 182, 188, 198,
+ 209, 215, 225, 232, 238, 243, 247, 250, 253, 378, 403, 428, 461,
+ 494, 526, 560, 594, 632, 675, 713, 755, 801, 837, 868, 897, 192,
+ 194, 196, 202, 204, 213, 217, 223, 228, 234, 240, 245, 249, 252,
+ 254, 255, 395, 425, 457, 488, 512, 547, 583, 619, 659, 699, 737,
+ 778, 819, 854, 882, 907, 257, 259, 261, 263, 265, 267, 273, 280,
+ 285, 297, 307, 324, 336, 357, 379, 396, 424, 452, 479, 508, 541,
+ 574, 609, 643, 679, 719, 764, 806, 841, 870, 895, 919, 269, 271,
+ 275, 277, 281, 283, 290, 299, 305, 318, 331, 348, 361, 383, 404,
+ 426, 453, 476, 506, 535, 568, 601, 634, 669, 708, 748, 789, 829,
+ 860, 887, 909, 927, 287, 291, 293, 295, 301, 303, 309, 321, 329,
+ 343, 353, 372, 391, 412, 429, 458, 480, 507, 532, 564, 590, 627,
+ 663, 703, 733, 773, 816, 847, 876, 901, 921, 940, 311, 313, 315,
+ 319, 325, 327, 333, 349, 358, 368, 385, 399, 420, 439, 462, 489,
+ 509, 536, 565, 589, 624, 661, 691, 727, 768, 810, 838, 866, 890,
+ 913, 934, 950, 337, 339, 341, 345, 351, 359, 363, 375, 387, 397,
+ 416, 432, 450, 470, 495, 513, 542, 569, 591, 625, 657, 684, 723,
+ 762, 797, 834, 862, 884, 905, 925, 942, 961, 365, 369, 373, 377,
+ 381, 389, 393, 409, 421, 434, 448, 464, 486, 502, 527, 548, 575,
+ 602, 628, 662, 685, 721, 756, 794, 827, 855, 880, 903, 923, 938,
+ 954, 967, 401, 405, 407, 413, 417, 423, 430, 443, 456, 467, 483,
+ 499, 519, 540, 561, 584, 610, 635, 664, 692, 724, 757, 792, 825,
+ 850, 878, 899, 917, 936, 952, 965, 977, 436, 440, 444, 446, 454,
+ 460, 468, 477, 493, 503, 522, 537, 550, 578, 595, 620, 644, 670,
+ 704, 728, 763, 795, 826, 849, 873, 893, 915, 932, 948, 963, 975,
+ 986, 472, 474, 481, 487, 491, 497, 505, 514, 529, 544, 555, 576,
+ 588, 614, 633, 660, 680, 709, 734, 769, 798, 828, 851, 874, 892,
+ 911, 930, 946, 959, 973, 984, 994, 515, 517, 523, 525, 531, 538,
+ 546, 552, 570, 582, 596, 617, 631, 653, 676, 700, 720, 749, 774,
+ 811, 835, 856, 879, 894, 912, 928, 944, 957, 971, 982, 992, 1001,
+ 556, 558, 562, 566, 572, 580, 586, 597, 611, 623, 637, 655, 673,
+ 693, 714, 738, 765, 790, 817, 839, 863, 881, 900, 916, 931, 945,
+ 956, 969, 980, 990, 999, 1007, 599, 603, 605, 607, 615, 621, 629,
+ 639, 650, 668, 678, 701, 716, 731, 758, 779, 807, 830, 848, 867,
+ 885, 904, 918, 933, 947, 958, 970, 979, 988, 997, 1005, 1012, 641,
+ 645, 647, 651, 658, 666, 674, 682, 696, 710, 725, 744, 761, 777,
+ 802, 820, 842, 861, 877, 891, 906, 924, 937, 949, 960, 972, 981,
+ 989, 996, 1003, 1010, 1016, 686, 688, 694, 702, 706, 712, 718, 729,
+ 745, 753, 771, 784, 808, 823, 840, 857, 871, 888, 902, 914, 926,
+ 939, 953, 964, 974, 983, 991, 998, 1004, 1009, 1014, 1019, 735, 739,
+ 741, 747, 751, 759, 767, 775, 787, 804, 818, 832, 846, 859, 869,
+ 883, 896, 910, 922, 935, 943, 955, 966, 976, 985, 993, 1000, 1006,
+ 1011, 1015, 1018, 1021, 781, 785, 791, 796, 800, 812, 814, 824, 836,
+ 844, 853, 865, 875, 889, 898, 908, 920, 929, 941, 951, 962, 968,
+ 978, 987, 995, 1002, 1008, 1013, 1017, 1020, 1022, 1023,
+};
+
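+/* Default inverse scan for the 64x64 transform (4096 coefficients), only
+ * compiled in when 64x64 transforms are enabled via CONFIG_TX64X64.  It
+ * follows the same inverse-permutation convention as the smaller tables. */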
+#if CONFIG_TX64X64
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_64x64[4096]) = {
+ 0, 1, 4, 9, 15, 22, 33, 43, 56, 71, 86, 104, 121,
+ 142, 166, 189, 214, 239, 269, 300, 331, 363, 400, 435, 471, 510,
+ 553, 598, 640, 683, 732, 780, 833, 884, 937, 995, 1048, 1107, 1165,
+ 1230, 1293, 1353, 1422, 1489, 1562, 1632, 1701, 1776, 1850, 1929, 2006, 2091,
+ 2173, 2252, 2339, 2421, 2516, 2603, 2694, 2786, 2879, 2978, 3076, 3175, 2,
+ 3, 6, 11, 17, 26, 35, 45, 58, 73, 90, 106, 123, 146,
+ 168, 193, 216, 243, 271, 302, 335, 365, 402, 437, 473, 516, 557,
+ 600, 642, 687, 736, 782, 835, 886, 941, 999, 1050, 1111, 1167, 1234,
+ 1297, 1357, 1424, 1491, 1564, 1636, 1703, 1778, 1852, 1931, 2012, 2095, 2177,
+ 2256, 2341, 2425, 2518, 2605, 2698, 2788, 2883, 2982, 3078, 3177, 5, 7,
+ 8, 13, 20, 28, 37, 50, 62, 75, 92, 108, 129, 150, 170,
+ 195, 218, 249, 277, 304, 337, 369, 406, 441, 478, 520, 559, 604,
+ 646, 689, 740, 788, 841, 890, 945, 1001, 1052, 1115, 1173, 1236, 1301,
+ 1362, 1428, 1497, 1568, 1638, 1707, 1786, 1858, 1935, 2016, 2097, 2181, 2260,
+ 2343, 2431, 2520, 2613, 2702, 2790, 2889, 2984, 3082, 3181, 10, 12, 14,
+ 19, 23, 31, 41, 52, 65, 81, 96, 113, 133, 152, 175, 201,
+ 224, 253, 279, 310, 341, 375, 410, 445, 484, 524, 563, 606, 648,
+ 697, 746, 793, 843, 896, 949, 1005, 1060, 1119, 1181, 1242, 1303, 1366,
+ 1436, 1503, 1572, 1640, 1713, 1790, 1865, 1943, 2018, 2103, 2183, 2266, 2347,
+ 2437, 2526, 2617, 2708, 2800, 2893, 2992, 3086, 3189, 16, 18, 21, 24,
+ 30, 39, 48, 59, 69, 83, 100, 119, 137, 158, 181, 203, 230,
+ 255, 286, 316, 347, 380, 414, 451, 490, 530, 571, 612, 656, 705,
+ 750, 799, 849, 898, 959, 1009, 1066, 1127, 1184, 1246, 1307, 1376, 1440,
+ 1509, 1578, 1644, 1723, 1794, 1871, 1947, 2024, 2109, 2185, 2270, 2361, 2443,
+ 2536, 2619, 2710, 2806, 2899, 2998, 3090, 3193, 25, 27, 29, 32, 40,
+ 46, 54, 67, 79, 94, 109, 127, 143, 164, 185, 210, 236, 263,
+ 292, 320, 353, 388, 422, 459, 496, 533, 579, 618, 665, 711, 754,
+ 809, 857, 910, 961, 1015, 1074, 1131, 1194, 1254, 1315, 1384, 1448, 1517,
+ 1584, 1655, 1731, 1802, 1875, 1959, 2034, 2115, 2197, 2280, 2367, 2452, 2538,
+ 2625, 2722, 2816, 2907, 3004, 3100, 3203, 34, 36, 38, 42, 49, 55,
+ 64, 76, 87, 102, 117, 135, 154, 176, 197, 222, 247, 272, 298,
+ 329, 361, 392, 427, 465, 504, 545, 585, 626, 671, 717, 766, 813,
+ 862, 916, 971, 1028, 1084, 1139, 1200, 1264, 1325, 1390, 1452, 1523, 1594,
+ 1667, 1737, 1806, 1887, 1963, 2046, 2123, 2202, 2290, 2371, 2462, 2548, 2641,
+ 2732, 2822, 2917, 3010, 3111, 3211, 44, 47, 51, 53, 60, 68, 77,
+ 85, 98, 114, 131, 147, 162, 183, 208, 232, 256, 283, 314, 343,
+ 373, 408, 442, 475, 511, 551, 592, 638, 681, 726, 772, 821, 874,
+ 926, 979, 1034, 1088, 1153, 1214, 1271, 1335, 1396, 1469, 1533, 1600, 1673,
+ 1745, 1824, 1897, 1973, 2054, 2131, 2216, 2300, 2383, 2468, 2558, 2649, 2740,
+ 2829, 2923, 3022, 3123, 3221, 57, 61, 63, 66, 70, 80, 88, 99,
+ 112, 124, 140, 159, 179, 199, 219, 240, 267, 294, 322, 354, 386,
+ 418, 455, 492, 528, 567, 608, 649, 695, 742, 786, 836, 882, 933,
+ 989, 1046, 1101, 1161, 1216, 1279, 1343, 1410, 1479, 1543, 1614, 1687, 1758,
+ 1832, 1905, 1980, 2066, 2141, 2226, 2306, 2395, 2484, 2566, 2659, 2750, 2845,
+ 2939, 3032, 3133, 3225, 72, 74, 78, 82, 84, 95, 103, 115, 125,
+ 139, 156, 173, 190, 211, 234, 259, 281, 311, 339, 366, 394, 433,
+ 466, 500, 543, 581, 622, 667, 707, 752, 803, 853, 899, 955, 1007,
+ 1064, 1117, 1175, 1237, 1299, 1354, 1420, 1485, 1556, 1624, 1697, 1770, 1842,
+ 1919, 1998, 2074, 2155, 2234, 2319, 2409, 2492, 2581, 2671, 2760, 2859, 2949,
+ 3046, 3145, 3245, 89, 91, 93, 97, 101, 110, 118, 132, 141, 157,
+ 171, 186, 206, 228, 251, 273, 296, 324, 351, 384, 415, 447, 482,
+ 521, 554, 593, 636, 677, 722, 770, 815, 866, 914, 967, 1022, 1078,
+ 1135, 1195, 1252, 1313, 1378, 1444, 1507, 1576, 1642, 1714, 1788, 1860, 1933,
+ 2013, 2085, 2169, 2250, 2337, 2417, 2502, 2597, 2683, 2778, 2869, 2960, 3060,
+ 3157, 3256, 105, 107, 111, 116, 120, 128, 136, 148, 160, 174, 187,
+ 205, 225, 244, 265, 290, 317, 344, 370, 398, 431, 463, 498, 534,
+ 573, 616, 654, 698, 743, 783, 831, 880, 928, 983, 1036, 1092, 1149,
+ 1208, 1266, 1333, 1394, 1457, 1524, 1590, 1665, 1733, 1804, 1879, 1953, 2030,
+ 2111, 2189, 2271, 2357, 2441, 2534, 2615, 2704, 2791, 2887, 2979, 3072, 3167,
+ 3270, 122, 126, 130, 134, 138, 144, 155, 163, 180, 191, 207, 226,
+ 238, 261, 287, 308, 332, 359, 390, 419, 449, 485, 518, 549, 587,
+ 630, 672, 715, 760, 805, 855, 900, 953, 1003, 1053, 1108, 1163, 1220,
+ 1287, 1345, 1408, 1473, 1541, 1608, 1677, 1749, 1826, 1898, 1971, 2048, 2127,
+ 2208, 2294, 2373, 2458, 2542, 2631, 2726, 2818, 2908, 3002, 3094, 3199, 3286,
+ 145, 149, 151, 153, 161, 165, 177, 184, 200, 212, 229, 245, 262,
+ 284, 305, 327, 355, 382, 411, 438, 469, 501, 539, 577, 613, 652,
+ 690, 730, 776, 822, 872, 922, 973, 1024, 1079, 1132, 1188, 1250, 1305,
+ 1367, 1432, 1492, 1560, 1626, 1693, 1766, 1838, 1911, 1992, 2068, 2149, 2228,
+ 2307, 2393, 2478, 2564, 2655, 2742, 2833, 2927, 3020, 3119, 3219, 3298, 167,
+ 169, 172, 178, 182, 188, 198, 209, 220, 235, 252, 266, 288, 306,
+ 326, 349, 378, 403, 428, 461, 494, 526, 560, 594, 632, 675, 713,
+ 755, 801, 845, 892, 942, 990, 1042, 1096, 1155, 1212, 1267, 1329, 1391,
+ 1450, 1519, 1582, 1650, 1724, 1792, 1862, 1936, 2007, 2083, 2167, 2246, 2329,
+ 2413, 2496, 2585, 2675, 2761, 2855, 2947, 3040, 3135, 3233, 3320, 192, 194,
+ 196, 202, 204, 213, 223, 233, 241, 260, 274, 291, 309, 328, 350,
+ 376, 395, 425, 457, 488, 512, 547, 583, 619, 659, 699, 737, 778,
+ 819, 868, 917, 965, 1013, 1072, 1123, 1176, 1231, 1289, 1351, 1414, 1474,
+ 1539, 1604, 1674, 1741, 1816, 1891, 1961, 2040, 2116, 2191, 2276, 2353, 2438,
+ 2524, 2606, 2689, 2784, 2871, 2968, 3062, 3161, 3257, 3334, 215, 217, 221,
+ 227, 231, 237, 248, 257, 268, 282, 297, 318, 333, 356, 379, 396,
+ 424, 452, 479, 508, 541, 574, 609, 643, 679, 719, 764, 806, 850,
+ 894, 938, 987, 1038, 1089, 1145, 1204, 1258, 1316, 1379, 1438, 1501, 1565,
+ 1628, 1694, 1764, 1836, 1907, 1981, 2060, 2137, 2220, 2298, 2377, 2464, 2549,
+ 2635, 2724, 2812, 2903, 2999, 3088, 3185, 3278, 3350, 242, 246, 250, 254,
+ 258, 264, 275, 285, 295, 312, 325, 345, 360, 383, 404, 426, 453,
+ 476, 506, 535, 568, 601, 634, 669, 708, 748, 789, 829, 875, 923,
+ 968, 1016, 1068, 1120, 1168, 1224, 1280, 1341, 1402, 1465, 1531, 1591, 1661,
+ 1729, 1795, 1867, 1937, 2004, 2079, 2159, 2242, 2320, 2405, 2488, 2573, 2661,
+ 2744, 2839, 2933, 3023, 3117, 3215, 3296, 3373, 270, 276, 278, 280, 289,
+ 293, 299, 315, 323, 340, 352, 371, 391, 412, 429, 458, 480, 507,
+ 532, 564, 590, 627, 663, 703, 733, 773, 816, 859, 906, 950, 993,
+ 1043, 1094, 1147, 1201, 1256, 1311, 1372, 1429, 1486, 1550, 1618, 1685, 1751,
+ 1827, 1895, 1965, 2042, 2119, 2192, 2268, 2348, 2429, 2512, 2599, 2684, 2772,
+ 2863, 2951, 3048, 3143, 3239, 3324, 3393, 301, 303, 307, 313, 319, 321,
+ 330, 346, 357, 367, 385, 399, 420, 439, 462, 489, 509, 536, 565,
+ 589, 624, 661, 691, 727, 768, 810, 846, 887, 929, 977, 1029, 1076,
+ 1128, 1177, 1226, 1283, 1339, 1397, 1461, 1521, 1585, 1648, 1715, 1779, 1848,
+ 1923, 1996, 2069, 2142, 2224, 2302, 2381, 2465, 2544, 2627, 2720, 2807, 2895,
+ 2985, 3073, 3163, 3264, 3338, 3413, 334, 336, 338, 342, 348, 358, 362,
+ 374, 387, 397, 416, 432, 450, 470, 495, 513, 542, 569, 591, 625,
+ 657, 684, 723, 762, 797, 837, 878, 920, 963, 1010, 1054, 1105, 1157,
+ 1206, 1262, 1317, 1374, 1433, 1483, 1545, 1615, 1681, 1743, 1812, 1885, 1954,
+ 2025, 2101, 2174, 2248, 2330, 2411, 2490, 2579, 2663, 2745, 2835, 2924, 3018,
+ 3115, 3205, 3290, 3363, 3431, 364, 368, 372, 377, 381, 389, 393, 409,
+ 421, 434, 448, 464, 486, 502, 527, 548, 575, 602, 628, 662, 685,
+ 721, 756, 794, 827, 869, 912, 956, 996, 1040, 1086, 1137, 1189, 1243,
+ 1291, 1349, 1404, 1466, 1525, 1588, 1645, 1711, 1774, 1843, 1909, 1988, 2058,
+ 2132, 2209, 2288, 2368, 2445, 2527, 2607, 2687, 2780, 2865, 2953, 3049, 3139,
+ 3237, 3318, 3387, 3451, 401, 405, 407, 413, 417, 423, 430, 443, 456,
+ 467, 483, 499, 519, 540, 561, 584, 610, 635, 664, 692, 724, 757,
+ 792, 825, 863, 908, 946, 985, 1032, 1080, 1125, 1169, 1217, 1275, 1330,
+ 1386, 1441, 1498, 1554, 1619, 1683, 1746, 1810, 1883, 1949, 2019, 2086, 2165,
+ 2238, 2314, 2399, 2479, 2562, 2645, 2733, 2820, 2904, 2996, 3083, 3168, 3268,
+ 3339, 3407, 3474, 436, 440, 444, 446, 454, 460, 468, 477, 493, 503,
+ 522, 537, 550, 578, 595, 620, 644, 670, 704, 728, 763, 795, 826,
+ 861, 901, 935, 980, 1025, 1069, 1112, 1159, 1209, 1260, 1309, 1363, 1418,
+ 1475, 1534, 1598, 1656, 1721, 1780, 1846, 1912, 1982, 2056, 2129, 2199, 2278,
+ 2358, 2432, 2508, 2593, 2677, 2762, 2851, 2941, 3030, 3124, 3216, 3294, 3365,
+ 3433, 3488, 472, 474, 481, 487, 491, 497, 505, 514, 529, 544, 555,
+ 576, 588, 614, 633, 660, 680, 709, 734, 769, 798, 828, 864, 902,
+ 932, 975, 1020, 1061, 1102, 1150, 1198, 1247, 1294, 1346, 1400, 1455, 1513,
+ 1573, 1629, 1689, 1755, 1820, 1888, 1955, 2022, 2092, 2163, 2235, 2312, 2389,
+ 2472, 2554, 2632, 2716, 2804, 2884, 2974, 3063, 3153, 3250, 3326, 3395, 3454,
+ 3512, 515, 517, 523, 525, 531, 538, 546, 552, 570, 582, 596, 617,
+ 631, 653, 676, 700, 720, 749, 774, 811, 838, 870, 909, 936, 976,
+ 1017, 1058, 1099, 1143, 1192, 1238, 1284, 1336, 1388, 1445, 1493, 1546, 1610,
+ 1671, 1734, 1796, 1856, 1925, 1994, 2062, 2133, 2206, 2281, 2354, 2426, 2503,
+ 2587, 2669, 2754, 2843, 2928, 3016, 3105, 3201, 3284, 3351, 3421, 3480, 3534,
+ 556, 558, 562, 566, 572, 580, 586, 597, 611, 623, 637, 655, 673,
+ 693, 714, 738, 765, 790, 817, 847, 879, 913, 947, 981, 1021, 1059,
+ 1097, 1140, 1185, 1227, 1277, 1327, 1380, 1425, 1481, 1537, 1595, 1651, 1708,
+ 1771, 1834, 1901, 1966, 2035, 2107, 2170, 2244, 2315, 2396, 2474, 2552, 2628,
+ 2711, 2792, 2875, 2966, 3056, 3146, 3234, 3314, 3383, 3445, 3504, 3559, 599,
+ 603, 605, 607, 615, 621, 629, 639, 650, 668, 678, 701, 716, 731,
+ 758, 779, 807, 830, 860, 888, 921, 957, 986, 1026, 1062, 1100, 1141,
+ 1183, 1221, 1272, 1323, 1368, 1416, 1471, 1526, 1580, 1633, 1691, 1752, 1817,
+ 1876, 1944, 2002, 2072, 2143, 2218, 2291, 2363, 2435, 2509, 2589, 2672, 2752,
+ 2840, 2921, 3008, 3095, 3190, 3274, 3344, 3409, 3470, 3526, 3577, 641, 645,
+ 647, 651, 658, 666, 674, 682, 696, 710, 725, 744, 761, 777, 802,
+ 820, 851, 876, 907, 930, 964, 997, 1033, 1070, 1103, 1144, 1186, 1222,
+ 1270, 1318, 1360, 1411, 1463, 1515, 1569, 1622, 1678, 1739, 1800, 1853, 1917,
+ 1983, 2052, 2121, 2186, 2253, 2331, 2406, 2482, 2559, 2639, 2717, 2798, 2877,
+ 2961, 3052, 3137, 3226, 3306, 3379, 3437, 3492, 3553, 3601, 686, 688, 694,
+ 702, 706, 712, 718, 729, 745, 753, 771, 784, 808, 823, 848, 871,
+ 895, 924, 951, 978, 1011, 1041, 1081, 1113, 1151, 1193, 1228, 1273, 1319,
+ 1358, 1406, 1458, 1510, 1557, 1612, 1669, 1727, 1781, 1839, 1903, 1969, 2031,
+ 2098, 2160, 2232, 2304, 2375, 2453, 2528, 2601, 2679, 2758, 2846, 2929, 3011,
+ 3098, 3186, 3271, 3340, 3401, 3466, 3522, 3571, 3620, 735, 739, 741, 747,
+ 751, 759, 767, 775, 787, 804, 818, 832, 856, 873, 893, 918, 939,
+ 969, 994, 1030, 1055, 1087, 1126, 1160, 1199, 1239, 1278, 1324, 1361, 1407,
+ 1453, 1505, 1551, 1605, 1663, 1716, 1768, 1830, 1893, 1951, 2008, 2075, 2139,
+ 2214, 2284, 2349, 2418, 2494, 2571, 2653, 2734, 2810, 2890, 2972, 3058, 3147,
+ 3231, 3310, 3375, 3435, 3490, 3545, 3595, 3642, 781, 785, 791, 796, 800,
+ 812, 814, 824, 839, 854, 867, 881, 903, 925, 943, 966, 988, 1018,
+ 1044, 1077, 1106, 1138, 1170, 1210, 1248, 1285, 1328, 1369, 1412, 1459, 1506,
+ 1549, 1601, 1657, 1704, 1762, 1821, 1880, 1938, 1999, 2063, 2125, 2193, 2257,
+ 2327, 2401, 2475, 2545, 2620, 2691, 2776, 2860, 2942, 3024, 3109, 3197, 3276,
+ 3345, 3403, 3468, 3520, 3569, 3616, 3664, 834, 840, 842, 844, 852, 858,
+ 865, 877, 883, 904, 915, 931, 954, 974, 991, 1014, 1039, 1071, 1095,
+ 1129, 1158, 1190, 1218, 1261, 1295, 1337, 1381, 1417, 1464, 1511, 1552, 1602,
+ 1654, 1699, 1759, 1813, 1872, 1927, 1990, 2049, 2113, 2178, 2239, 2308, 2378,
+ 2450, 2521, 2594, 2667, 2746, 2824, 2909, 2990, 3070, 3154, 3243, 3316, 3381,
+ 3441, 3493, 3547, 3597, 3640, 3682, 885, 889, 891, 897, 905, 911, 919,
+ 927, 934, 958, 970, 984, 1004, 1027, 1045, 1073, 1090, 1121, 1148, 1178,
+ 1207, 1244, 1276, 1310, 1347, 1389, 1426, 1472, 1516, 1558, 1606, 1658, 1700,
+ 1757, 1807, 1868, 1920, 1978, 2043, 2104, 2157, 2229, 2296, 2364, 2422, 2498,
+ 2574, 2650, 2727, 2801, 2872, 2954, 3038, 3129, 3212, 3288, 3352, 3419, 3475,
+ 3524, 3573, 3621, 3668, 3707, 940, 944, 948, 952, 960, 962, 972, 982,
+ 992, 1008, 1023, 1037, 1056, 1082, 1098, 1124, 1146, 1171, 1202, 1229, 1263,
+ 1292, 1331, 1364, 1401, 1446, 1482, 1527, 1570, 1613, 1664, 1705, 1760, 1808,
+ 1863, 1915, 1976, 2036, 2087, 2153, 2221, 2286, 2344, 2414, 2486, 2556, 2623,
+ 2699, 2773, 2853, 2937, 3012, 3091, 3169, 3260, 3330, 3391, 3447, 3505, 3555,
+ 3603, 3646, 3684, 3727, 998, 1000, 1002, 1006, 1012, 1019, 1031, 1035, 1047,
+ 1065, 1083, 1093, 1109, 1133, 1156, 1179, 1205, 1225, 1257, 1286, 1320, 1350,
+ 1387, 1419, 1456, 1494, 1538, 1581, 1623, 1670, 1717, 1763, 1814, 1869, 1916,
+ 1974, 2028, 2081, 2150, 2212, 2272, 2335, 2403, 2469, 2539, 2608, 2680, 2755,
+ 2827, 2915, 2986, 3068, 3151, 3229, 3300, 3366, 3427, 3484, 3532, 3581, 3630,
+ 3672, 3709, 3745, 1049, 1051, 1057, 1063, 1067, 1075, 1085, 1091, 1104, 1118,
+ 1136, 1152, 1164, 1191, 1213, 1232, 1259, 1281, 1312, 1340, 1375, 1405, 1442,
+ 1476, 1514, 1547, 1596, 1634, 1679, 1728, 1769, 1822, 1873, 1921, 1977, 2029,
+ 2078, 2144, 2203, 2264, 2325, 2390, 2459, 2529, 2591, 2665, 2738, 2813, 2880,
+ 2957, 3041, 3127, 3206, 3282, 3348, 3399, 3460, 3513, 3565, 3609, 3650, 3695,
+ 3733, 3768, 1110, 1114, 1116, 1122, 1130, 1134, 1142, 1154, 1162, 1180, 1196,
+ 1211, 1223, 1251, 1268, 1290, 1321, 1342, 1373, 1398, 1434, 1467, 1499, 1535,
+ 1574, 1611, 1652, 1692, 1740, 1782, 1831, 1881, 1928, 1979, 2037, 2082, 2145,
+ 2200, 2261, 2321, 2387, 2454, 2513, 2583, 2656, 2730, 2793, 2867, 2945, 3025,
+ 3101, 3178, 3262, 3328, 3388, 3443, 3494, 3543, 3591, 3636, 3678, 3715, 3754,
+ 3790, 1166, 1172, 1174, 1182, 1187, 1197, 1203, 1215, 1219, 1240, 1253, 1269,
+ 1288, 1306, 1332, 1352, 1382, 1403, 1430, 1462, 1484, 1528, 1555, 1599, 1630,
+ 1672, 1709, 1753, 1801, 1840, 1894, 1939, 1991, 2044, 2088, 2151, 2204, 2262,
+ 2318, 2384, 2448, 2504, 2577, 2646, 2712, 2782, 2856, 2934, 3006, 3079, 3158,
+ 3240, 3307, 3371, 3425, 3481, 3530, 3575, 3618, 3660, 3701, 3741, 3774, 3807,
+ 1233, 1235, 1241, 1245, 1249, 1255, 1265, 1274, 1282, 1300, 1314, 1334, 1348,
+ 1370, 1392, 1415, 1439, 1468, 1487, 1522, 1548, 1589, 1620, 1659, 1690, 1735,
+ 1772, 1818, 1854, 1904, 1952, 2000, 2050, 2105, 2154, 2213, 2265, 2322, 2385,
+ 2446, 2500, 2569, 2642, 2705, 2770, 2849, 2919, 2993, 3064, 3140, 3223, 3292,
+ 3353, 3414, 3464, 3516, 3561, 3607, 3648, 3687, 3725, 3762, 3796, 3827, 1296,
+ 1298, 1302, 1304, 1308, 1322, 1326, 1338, 1344, 1355, 1383, 1395, 1409, 1435,
+ 1451, 1477, 1502, 1532, 1553, 1586, 1616, 1646, 1684, 1722, 1756, 1797, 1835,
+ 1877, 1918, 1970, 2009, 2064, 2114, 2158, 2222, 2273, 2326, 2388, 2449, 2501,
+ 2567, 2636, 2695, 2768, 2836, 2910, 2976, 3053, 3131, 3209, 3279, 3336, 3397,
+ 3449, 3500, 3549, 3593, 3634, 3676, 3713, 3747, 3784, 3817, 3845, 1356, 1359,
+ 1365, 1371, 1377, 1385, 1393, 1399, 1413, 1421, 1447, 1460, 1478, 1495, 1520,
+ 1540, 1566, 1592, 1621, 1649, 1682, 1712, 1747, 1783, 1823, 1857, 1902, 1945,
+ 1984, 2032, 2076, 2126, 2179, 2230, 2287, 2336, 2391, 2455, 2505, 2570, 2637,
+ 2692, 2763, 2830, 2901, 2969, 3044, 3120, 3194, 3265, 3331, 3385, 3439, 3486,
+ 3536, 3582, 3626, 3665, 3703, 3739, 3772, 3802, 3835, 3864, 1423, 1427, 1431,
+ 1437, 1443, 1449, 1454, 1470, 1480, 1488, 1508, 1529, 1542, 1561, 1583, 1607,
+ 1631, 1662, 1686, 1718, 1744, 1775, 1811, 1847, 1889, 1926, 1967, 2003, 2053,
+ 2099, 2140, 2194, 2240, 2297, 2345, 2404, 2460, 2514, 2578, 2643, 2696, 2764,
+ 2826, 2897, 2962, 3036, 3112, 3182, 3254, 3321, 3376, 3429, 3478, 3527, 3567,
+ 3611, 3652, 3693, 3731, 3764, 3794, 3825, 3853, 3882, 1490, 1496, 1500, 1504,
+ 1512, 1518, 1530, 1536, 1544, 1559, 1577, 1593, 1609, 1627, 1653, 1675, 1695,
+ 1730, 1754, 1784, 1815, 1844, 1884, 1913, 1956, 1995, 2038, 2073, 2122, 2161,
+ 2215, 2258, 2309, 2365, 2415, 2470, 2530, 2584, 2647, 2706, 2769, 2831, 2898,
+ 2959, 3033, 3106, 3170, 3252, 3312, 3367, 3423, 3471, 3518, 3563, 3605, 3644,
+ 3680, 3717, 3755, 3788, 3819, 3847, 3874, 3898, 1563, 1567, 1571, 1575, 1579,
+ 1587, 1597, 1603, 1617, 1625, 1643, 1666, 1680, 1696, 1725, 1742, 1765, 1798,
+ 1828, 1849, 1886, 1910, 1950, 1985, 2023, 2065, 2108, 2146, 2187, 2233, 2285,
+ 2328, 2379, 2423, 2487, 2540, 2592, 2657, 2713, 2771, 2837, 2902, 2963, 3034,
+ 3104, 3164, 3248, 3304, 3361, 3417, 3462, 3510, 3557, 3598, 3638, 3674, 3711,
+ 3743, 3776, 3811, 3839, 3868, 3892, 3917, 1635, 1637, 1639, 1641, 1647, 1660,
+ 1668, 1676, 1688, 1698, 1719, 1736, 1750, 1767, 1793, 1819, 1837, 1870, 1896,
+ 1924, 1957, 1989, 2020, 2057, 2093, 2134, 2171, 2219, 2254, 2305, 2350, 2402,
+ 2451, 2499, 2557, 2609, 2666, 2731, 2783, 2850, 2911, 2970, 3037, 3107, 3165,
+ 3246, 3301, 3359, 3410, 3458, 3508, 3551, 3589, 3632, 3670, 3705, 3737, 3770,
+ 3800, 3829, 3858, 3886, 3911, 3933, 1702, 1706, 1710, 1720, 1726, 1732, 1738,
+ 1748, 1761, 1773, 1789, 1805, 1829, 1841, 1864, 1892, 1908, 1940, 1968, 1997,
+ 2026, 2059, 2089, 2130, 2164, 2207, 2245, 2292, 2332, 2376, 2419, 2476, 2522,
+ 2575, 2624, 2681, 2739, 2794, 2857, 2920, 2977, 3045, 3113, 3171, 3249, 3302,
+ 3358, 3404, 3455, 3502, 3541, 3587, 3628, 3661, 3699, 3735, 3766, 3797, 3823,
+ 3851, 3876, 3903, 3927, 3950, 1777, 1785, 1787, 1791, 1799, 1803, 1809, 1825,
+ 1833, 1845, 1861, 1882, 1899, 1914, 1941, 1962, 1986, 2005, 2045, 2070, 2102,
+ 2135, 2166, 2201, 2236, 2282, 2316, 2366, 2407, 2456, 2495, 2546, 2595, 2651,
+ 2700, 2756, 2814, 2868, 2935, 2994, 3054, 3121, 3183, 3253, 3305, 3360, 3405,
+ 3453, 3498, 3539, 3585, 3622, 3658, 3697, 3728, 3760, 3792, 3821, 3849, 3872,
+ 3896, 3919, 3942, 3964, 1851, 1855, 1859, 1866, 1874, 1878, 1890, 1900, 1906,
+ 1922, 1934, 1958, 1972, 1993, 2010, 2041, 2061, 2080, 2120, 2147, 2175, 2210,
+ 2241, 2279, 2313, 2355, 2397, 2436, 2483, 2531, 2572, 2621, 2668, 2728, 2774,
+ 2828, 2881, 2946, 3007, 3065, 3132, 3195, 3255, 3313, 3362, 3411, 3456, 3499,
+ 3538, 3579, 3614, 3656, 3691, 3723, 3758, 3786, 3815, 3843, 3870, 3894, 3915,
+ 3937, 3956, 3975, 1930, 1932, 1942, 1946, 1948, 1960, 1964, 1975, 1987, 2001,
+ 2014, 2033, 2051, 2071, 2084, 2117, 2138, 2162, 2195, 2225, 2249, 2289, 2317,
+ 2359, 2392, 2427, 2477, 2510, 2560, 2602, 2654, 2693, 2747, 2802, 2854, 2916,
+ 2958, 3026, 3080, 3141, 3210, 3266, 3322, 3368, 3418, 3459, 3503, 3540, 3580,
+ 3613, 3654, 3688, 3721, 3752, 3782, 3813, 3841, 3865, 3890, 3913, 3935, 3954,
+ 3972, 3989, 2011, 2015, 2017, 2021, 2027, 2039, 2047, 2055, 2067, 2077, 2090,
+ 2112, 2128, 2152, 2168, 2196, 2223, 2243, 2269, 2303, 2333, 2369, 2400, 2433,
+ 2473, 2506, 2553, 2590, 2640, 2682, 2735, 2777, 2825, 2873, 2938, 2987, 3042,
+ 3102, 3159, 3224, 3280, 3332, 3377, 3424, 3463, 3509, 3542, 3586, 3615, 3655,
+ 3685, 3719, 3750, 3780, 3809, 3836, 3862, 3888, 3909, 3931, 3952, 3970, 3987,
+ 4003, 2094, 2096, 2100, 2106, 2110, 2118, 2124, 2136, 2148, 2156, 2172, 2190,
+ 2211, 2231, 2247, 2277, 2299, 2323, 2351, 2382, 2412, 2447, 2480, 2511, 2555,
+ 2588, 2629, 2673, 2718, 2759, 2811, 2861, 2912, 2955, 3013, 3069, 3128, 3179,
+ 3241, 3293, 3337, 3386, 3430, 3472, 3511, 3552, 3588, 3623, 3657, 3689, 3720,
+ 3749, 3778, 3805, 3833, 3860, 3884, 3907, 3929, 3948, 3968, 3985, 4001, 4016,
+ 2176, 2180, 2182, 2184, 2188, 2198, 2205, 2217, 2227, 2237, 2251, 2274, 2295,
+ 2310, 2334, 2356, 2380, 2408, 2430, 2466, 2491, 2532, 2563, 2596, 2633, 2670,
+ 2714, 2753, 2799, 2847, 2891, 2943, 2991, 3039, 3092, 3152, 3207, 3263, 3308,
+ 3354, 3398, 3440, 3479, 3519, 3558, 3590, 3629, 3659, 3692, 3722, 3751, 3779,
+ 3804, 3831, 3856, 3880, 3905, 3925, 3946, 3966, 3983, 3999, 4014, 4028, 2255,
+ 2259, 2263, 2267, 2275, 2283, 2293, 2301, 2311, 2324, 2338, 2360, 2374, 2394,
+ 2416, 2439, 2467, 2489, 2515, 2547, 2580, 2610, 2648, 2678, 2719, 2757, 2795,
+ 2841, 2878, 2930, 2973, 3027, 3071, 3130, 3172, 3230, 3283, 3329, 3372, 3415,
+ 3450, 3487, 3528, 3564, 3599, 3633, 3662, 3698, 3724, 3753, 3781, 3806, 3832,
+ 3855, 3878, 3901, 3923, 3944, 3962, 3981, 3997, 4012, 4026, 4039, 2340, 2342,
+ 2346, 2352, 2362, 2370, 2372, 2386, 2398, 2410, 2420, 2442, 2461, 2481, 2497,
+ 2525, 2550, 2576, 2600, 2630, 2664, 2688, 2736, 2765, 2805, 2844, 2876, 2922,
+ 2964, 3014, 3059, 3110, 3155, 3213, 3261, 3303, 3349, 3389, 3426, 3465, 3501,
+ 3537, 3568, 3606, 3639, 3671, 3700, 3729, 3759, 3783, 3810, 3834, 3857, 3879,
+ 3900, 3921, 3940, 3960, 3979, 3995, 4010, 4024, 4037, 4049, 2424, 2428, 2434,
+ 2440, 2444, 2457, 2463, 2471, 2485, 2493, 2507, 2535, 2543, 2565, 2586, 2611,
+ 2638, 2662, 2685, 2721, 2748, 2781, 2821, 2852, 2885, 2931, 2967, 3009, 3055,
+ 3099, 3148, 3198, 3244, 3289, 3333, 3369, 3400, 3444, 3482, 3517, 3550, 3583,
+ 3612, 3645, 3675, 3706, 3736, 3761, 3787, 3814, 3837, 3861, 3881, 3902, 3922,
+ 3939, 3958, 3977, 3993, 4008, 4022, 4035, 4047, 4058, 2517, 2519, 2523, 2533,
+ 2537, 2541, 2551, 2561, 2568, 2582, 2598, 2616, 2634, 2658, 2676, 2690, 2725,
+ 2749, 2775, 2808, 2838, 2866, 2905, 2944, 2975, 3017, 3057, 3096, 3138, 3187,
+ 3232, 3277, 3317, 3355, 3392, 3428, 3461, 3495, 3531, 3562, 3594, 3627, 3653,
+ 3681, 3712, 3738, 3767, 3793, 3816, 3842, 3863, 3885, 3906, 3924, 3941, 3959,
+ 3974, 3991, 4006, 4020, 4033, 4045, 4056, 4066, 2604, 2612, 2614, 2618, 2622,
+ 2626, 2644, 2652, 2660, 2674, 2686, 2707, 2729, 2743, 2766, 2785, 2815, 2842,
+ 2864, 2896, 2925, 2956, 2997, 3031, 3066, 3108, 3149, 3191, 3227, 3272, 3311,
+ 3346, 3382, 3420, 3448, 3485, 3514, 3544, 3576, 3608, 3635, 3666, 3694, 3718,
+ 3744, 3771, 3798, 3822, 3844, 3866, 3889, 3908, 3926, 3945, 3961, 3978, 3992,
+ 4005, 4018, 4031, 4043, 4054, 4064, 4073, 2697, 2701, 2703, 2709, 2715, 2723,
+ 2737, 2741, 2751, 2767, 2779, 2796, 2819, 2834, 2858, 2874, 2906, 2936, 2952,
+ 2988, 3019, 3050, 3084, 3125, 3156, 3202, 3235, 3275, 3309, 3341, 3378, 3406,
+ 3442, 3476, 3506, 3533, 3566, 3592, 3619, 3649, 3677, 3704, 3732, 3756, 3777,
+ 3801, 3824, 3850, 3871, 3891, 3910, 3930, 3947, 3963, 3980, 3994, 4007, 4019,
+ 4030, 4041, 4052, 4062, 4071, 4079, 2787, 2789, 2797, 2803, 2809, 2817, 2823,
+ 2832, 2848, 2862, 2870, 2888, 2913, 2932, 2948, 2971, 3000, 3028, 3051, 3074,
+ 3116, 3142, 3173, 3217, 3251, 3285, 3315, 3347, 3380, 3402, 3436, 3469, 3496,
+ 3525, 3556, 3584, 3610, 3637, 3663, 3690, 3714, 3740, 3765, 3789, 3812, 3830,
+ 3852, 3873, 3895, 3914, 3932, 3949, 3967, 3982, 3996, 4009, 4021, 4032, 4042,
+ 4051, 4060, 4069, 4077, 4084, 2882, 2886, 2892, 2894, 2900, 2914, 2918, 2926,
+ 2940, 2950, 2965, 2980, 3003, 3021, 3043, 3067, 3089, 3118, 3144, 3166, 3208,
+ 3238, 3269, 3295, 3327, 3356, 3384, 3412, 3438, 3467, 3491, 3521, 3548, 3574,
+ 3604, 3631, 3651, 3679, 3702, 3726, 3748, 3773, 3795, 3820, 3840, 3859, 3877,
+ 3897, 3916, 3936, 3953, 3969, 3984, 3998, 4011, 4023, 4034, 4044, 4053, 4061,
+ 4068, 4075, 4082, 4088, 2981, 2983, 2989, 2995, 3001, 3005, 3015, 3029, 3035,
+ 3047, 3061, 3075, 3097, 3122, 3136, 3162, 3188, 3218, 3242, 3267, 3291, 3319,
+ 3342, 3370, 3396, 3422, 3446, 3473, 3497, 3523, 3546, 3570, 3600, 3624, 3647,
+ 3673, 3696, 3716, 3742, 3763, 3785, 3803, 3826, 3848, 3869, 3887, 3904, 3920,
+ 3938, 3955, 3971, 3986, 4000, 4013, 4025, 4036, 4046, 4055, 4063, 4070, 4076,
+ 4081, 4086, 4091, 3077, 3081, 3085, 3087, 3093, 3103, 3114, 3126, 3134, 3150,
+ 3160, 3174, 3200, 3220, 3236, 3258, 3281, 3297, 3325, 3343, 3364, 3390, 3408,
+ 3434, 3457, 3483, 3507, 3529, 3554, 3572, 3596, 3617, 3641, 3669, 3686, 3710,
+ 3734, 3757, 3775, 3799, 3818, 3838, 3854, 3875, 3893, 3912, 3928, 3943, 3957,
+ 3973, 3988, 4002, 4015, 4027, 4038, 4048, 4057, 4065, 4072, 4078, 4083, 4087,
+ 4090, 4093, 3176, 3180, 3184, 3192, 3196, 3204, 3214, 3222, 3228, 3247, 3259,
+ 3273, 3287, 3299, 3323, 3335, 3357, 3374, 3394, 3416, 3432, 3452, 3477, 3489,
+ 3515, 3535, 3560, 3578, 3602, 3625, 3643, 3667, 3683, 3708, 3730, 3746, 3769,
+ 3791, 3808, 3828, 3846, 3867, 3883, 3899, 3918, 3934, 3951, 3965, 3976, 3990,
+ 4004, 4017, 4029, 4040, 4050, 4059, 4067, 4074, 4080, 4085, 4089, 4092, 4094,
+ 4095,
+};
+#endif // CONFIG_TX64X64
+
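+/* Scan orders indexed by transform size.  Each SCAN_ORDER entry pairs a
+ * forward scan (scan position -> coefficient index), its inverse (the
+ * iscan tables above), and a neighbor table used when deriving coding
+ * contexts.  A scan/iscan pair must be mutually inverse permutations; a
+ * minimal consistency check, shown only as an illustrative sketch (not
+ * part of the library):
+ *
+ *   static int scan_order_is_consistent(const SCAN_ORDER *so, int ncoeffs) {
+ *     int i;
+ *     for (i = 0; i < ncoeffs; ++i)  // ncoeffs = 1024 for a 32x32 block
+ *       if (so->iscan[so->scan[i]] != i) return 0;
+ *     return 1;
+ *   }
+ */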
+const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = {
+#if CONFIG_CB4X4
+ { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
+#endif // CONFIG_CB4X4
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
+#if CONFIG_TX64X64
+ { default_scan_64x64, av1_default_iscan_64x64, default_scan_64x64_neighbors },
+#endif // CONFIG_TX64X64
+};
+
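+/* Intra scan orders, indexed first by transform size and then by transform
+ * type.  The first four entries per size cover the base transform types;
+ * the entries under CONFIG_EXT_TX cover the extended set, mixing default,
+ * (m)row and (m)col scans -- and, at 32x32, the h2/v2/qtr variants. */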
+const SCAN_ORDER av1_intra_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
+#if CONFIG_CB4X4
+ {
+ // TX_2X2
+ { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
+ { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
+ { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
+ { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+#endif // CONFIG_CB4X4
+ {
+ // TX_4X4
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_8X8
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
+ { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
+ { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
+ { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
+ { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
+ { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
+ { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_16X16
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
+ { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
+ { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
+ { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
+ { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
+ { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
+ { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_32X32
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
+ { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+#if CONFIG_EXT_TX
+ { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
+ { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+#if CONFIG_TX64X64
+ {
+ // TX_64X64
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+#endif // CONFIG_TX64X64
+ {
+ // TX_4X8
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_8X4
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_8X16
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_16X8
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_16X32
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_32X16
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+};
+
+const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
+#if CONFIG_CB4X4
+ {
+ // TX_2X2
+ { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
+ { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
+ { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
+ { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+#endif
+ {
+ // TX_4X4
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_8X8
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_16X16
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_32X32
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
+ { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+#if CONFIG_EXT_TX
+ { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
+ { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+#if CONFIG_TX64X64
+ {
+ // TX_64X64
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+ { default_scan_64x64, av1_default_iscan_64x64,
+ default_scan_64x64_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+#endif // CONFIG_TX64X64
+ {
+ // TX_4X8
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_8X4
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_8X16
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_16X8
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_16X32
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_32X16
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_4X16
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_16X4
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_8X32
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_32X8
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+};
+
+#if CONFIG_ADAPT_SCAN
+// TX_32X32 has 1024 coefficients, whose indices can be represented in 10
+// bits
+#define COEFF_IDX_BITS (10 + CONFIG_TX64X64)
+#define COEFF_IDX_SIZE (1 << COEFF_IDX_BITS)
+#define COEFF_IDX_MASK (COEFF_IDX_SIZE - 1)
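+
+// A quick worked example of the macros above (illustrative only): with
+// CONFIG_TX64X64 == 0, COEFF_IDX_BITS == 10, so COEFF_IDX_SIZE == 1 << 10
+// == 1024 (one slot per 32x32 coefficient) and COEFF_IDX_MASK == 0x3FF,
+// which extracts the low 10 scan-index bits packed by av1_augment_prob()
+// below. CONFIG_TX64X64 simply widens the index by one bit.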
+
+static uint32_t *get_non_zero_prob(FRAME_CONTEXT *fc, TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ switch (tx_size) {
+#if CONFIG_CB4X4
+ case TX_2X2: return fc->non_zero_prob_2x2[tx_type];
+#endif
+ case TX_4X4: return fc->non_zero_prob_4X4[tx_type];
+ case TX_8X8: return fc->non_zero_prob_8X8[tx_type];
+ case TX_16X16: return fc->non_zero_prob_16X16[tx_type];
+ case TX_32X32: return fc->non_zero_prob_32X32[tx_type];
+#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ case TX_4X8: return fc->non_zero_prob_4X8[tx_type];
+ case TX_8X4: return fc->non_zero_prob_8X4[tx_type];
+ case TX_8X16: return fc->non_zero_prob_8X16[tx_type];
+ case TX_16X8: return fc->non_zero_prob_16X8[tx_type];
+ case TX_16X32: return fc->non_zero_prob_16X32[tx_type];
+ case TX_32X16: return fc->non_zero_prob_32X16[tx_type];
+#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ default: assert(0); return NULL;
+ }
+}
+
+static int16_t *get_adapt_scan(FRAME_CONTEXT *fc, TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ switch (tx_size) {
+#if CONFIG_CB4X4
+ case TX_2X2: return fc->scan_2x2[tx_type];
+#endif
+ case TX_4X4: return fc->scan_4X4[tx_type];
+ case TX_8X8: return fc->scan_8X8[tx_type];
+ case TX_16X16: return fc->scan_16X16[tx_type];
+ case TX_32X32: return fc->scan_32X32[tx_type];
+#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ case TX_4X8: return fc->scan_4X8[tx_type];
+ case TX_8X4: return fc->scan_8X4[tx_type];
+ case TX_8X16: return fc->scan_8X16[tx_type];
+ case TX_16X8: return fc->scan_16X8[tx_type];
+ case TX_16X32: return fc->scan_16X32[tx_type];
+ case TX_32X16: return fc->scan_32X16[tx_type];
+#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ default: assert(0); return NULL;
+ }
+}
+
+static int16_t *get_adapt_iscan(FRAME_CONTEXT *fc, TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ switch (tx_size) {
+#if CONFIG_CB4X4
+ case TX_2X2: return fc->iscan_2x2[tx_type];
+#endif
+ case TX_4X4: return fc->iscan_4X4[tx_type];
+ case TX_8X8: return fc->iscan_8X8[tx_type];
+ case TX_16X16: return fc->iscan_16X16[tx_type];
+ case TX_32X32: return fc->iscan_32X32[tx_type];
+#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ case TX_4X8: return fc->iscan_4X8[tx_type];
+ case TX_8X4: return fc->iscan_8X4[tx_type];
+ case TX_8X16: return fc->iscan_8X16[tx_type];
+ case TX_16X8: return fc->iscan_16X8[tx_type];
+ case TX_16X32: return fc->iscan_16X32[tx_type];
+ case TX_32X16: return fc->iscan_32X16[tx_type];
+#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ default: assert(0); return NULL;
+ }
+}
+
+static int16_t *get_adapt_nb(FRAME_CONTEXT *fc, TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ switch (tx_size) {
+#if CONFIG_CB4X4
+ case TX_2X2: return fc->nb_2x2[tx_type];
+#endif
+ case TX_4X4: return fc->nb_4X4[tx_type];
+ case TX_8X8: return fc->nb_8X8[tx_type];
+ case TX_16X16: return fc->nb_16X16[tx_type];
+ case TX_32X32: return fc->nb_32X32[tx_type];
+#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ case TX_4X8: return fc->nb_4X8[tx_type];
+ case TX_8X4: return fc->nb_8X4[tx_type];
+ case TX_8X16: return fc->nb_8X16[tx_type];
+ case TX_16X8: return fc->nb_16X8[tx_type];
+ case TX_16X32: return fc->nb_16X32[tx_type];
+ case TX_32X16: return fc->nb_32X16[tx_type];
+#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ default: assert(0); return NULL;
+ }
+}
+
+static uint32_t *get_non_zero_counts(FRAME_COUNTS *counts, TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ switch (tx_size) {
+#if CONFIG_CB4X4
+ case TX_2X2: return counts->non_zero_count_2x2[tx_type];
+#endif
+ case TX_4X4: return counts->non_zero_count_4X4[tx_type];
+ case TX_8X8: return counts->non_zero_count_8X8[tx_type];
+ case TX_16X16: return counts->non_zero_count_16X16[tx_type];
+ case TX_32X32: return counts->non_zero_count_32X32[tx_type];
+#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ case TX_4X8: return counts->non_zero_count_4x8[tx_type];
+ case TX_8X4: return counts->non_zero_count_8x4[tx_type];
+ case TX_8X16: return counts->non_zero_count_8x16[tx_type];
+ case TX_16X8: return counts->non_zero_count_16x8[tx_type];
+ case TX_16X32: return counts->non_zero_count_16x32[tx_type];
+ case TX_32X16: return counts->non_zero_count_32x16[tx_type];
+#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ default: assert(0); return NULL;
+ }
+}
+
+static INLINE int clamp_64(int64_t value, int low, int high) {
+ return value < low ? low : (value > high ? high : (int)value);
+}
+
+static void update_scan_prob(AV1_COMMON *cm, TX_SIZE tx_size, TX_TYPE tx_type,
+ int rate_16) {
+ FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ uint32_t *prev_non_zero_prob = get_non_zero_prob(pre_fc, tx_size, tx_type);
+ uint32_t *non_zero_prob = get_non_zero_prob(cm->fc, tx_size, tx_type);
+ uint32_t *non_zero_count = get_non_zero_counts(&cm->counts, tx_size, tx_type);
+ const int tx2d_size = tx_size_2d[tx_size];
+ unsigned int block_num = cm->counts.txb_count[tx_size][tx_type];
+ int i;
+ for (i = 0; i < tx2d_size; i++) {
+ int64_t curr_prob =
+ block_num == 0 ? 0 : (non_zero_count[i] << 16) / block_num;
+ int64_t prev_prob = prev_non_zero_prob[i];
+ int64_t pred_prob =
+ (curr_prob * rate_16 + prev_prob * ((1 << 16) - rate_16)) >> 16;
+ // TODO(angiebird): reduce the bit usage of probabilities and remove
+ // clamp_64()
+ non_zero_prob[i] = clamp_64(pred_prob, 0, UINT16_MAX);
+ }
+}
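+
+// Worked example of the Q16 blend above (illustrative; the rate value is
+// assumed here, the real one is passed in by the caller): take
+// rate_16 == 1 << 14 (0.25 in Q16), prev_prob == 32768 (0.5), and 30 of
+// 100 blocks with a nonzero coefficient at position i. Then
+//   curr_prob = (30 << 16) / 100 = 19660
+//   pred_prob = (19660 * 16384 + 32768 * 49152) >> 16 = 29491  (~0.45)
+// so each update moves the stored probability a quarter of the way toward
+// the newly observed frequency.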
+
+static void update_scan_count(int16_t *scan, int max_scan,
+ const tran_low_t *dqcoeffs,
+ uint32_t *non_zero_count) {
+ int i;
+ for (i = 0; i < max_scan; ++i) {
+ int coeff_idx = scan[i];
+ non_zero_count[coeff_idx] += (dqcoeffs[coeff_idx] != 0);
+ }
+}
+
+void av1_update_scan_count_facade(AV1_COMMON *cm, FRAME_COUNTS *counts,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ const tran_low_t *dqcoeffs, int max_scan) {
+ int16_t *scan = get_adapt_scan(cm->fc, tx_size, tx_type);
+ uint32_t *non_zero_count = get_non_zero_counts(counts, tx_size, tx_type);
+ update_scan_count(scan, max_scan, dqcoeffs, non_zero_count);
+ ++counts->txb_count[tx_size][tx_type];
+}
+
+// Descending comparator for qsort(). After av1_augment_prob() every packed
+// value is unique, so the missing "equal" case can never occur.
+static int cmp_prob(const void *a, const void *b) {
+  return *(const uint32_t *)b > *(const uint32_t *)a ? 1 : -1;
+}
+
+void av1_augment_prob(TX_SIZE tx_size, TX_TYPE tx_type, uint32_t *prob) {
+ // TODO(angiebird): check if we need is_inter here
+ const SCAN_ORDER *sc = get_default_scan(tx_size, tx_type, 0);
+ const int tx1d_wide = tx_size_wide[tx_size];
+ const int tx1d_high = tx_size_high[tx_size];
+ int r, c;
+ for (r = 0; r < tx1d_high; r++) {
+ for (c = 0; c < tx1d_wide; c++) {
+ const int idx = r * tx1d_wide + c;
+ const uint32_t mask_16 = ((1 << 16) - 1);
+ const uint32_t tie_breaker = ~((uint32_t)sc->iscan[idx]);
+      // Bit layout of prob[idx], high to low:
+      //   prob: 16 bits | dummy: 6 bits | scan_idx: 10 bits
+ prob[idx] = (prob[idx] << 16) | (mask_16 & tie_breaker);
+ }
+ }
+}
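+
+// Packing example (illustrative): in a 4x4 block, a coefficient with
+// prob[idx] == 0x8000 and default iscan[idx] == 3 packs as
+//   tie_breaker = ~3u, masked to 16 bits -> 0xFFFC
+//   prob[idx]   = (0x8000 << 16) | 0xFFFC == 0x8000FFFC
+// Sorting the packed words in descending order ranks by probability first;
+// among equal probabilities the larger ~iscan value (i.e. the smaller
+// default scan index) sorts first, the tie-break documented in scan.h.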
+
+// Topological sort, implemented as a depth-first search: a coefficient is
+// emitted only after its above and left neighbors have been emitted.
+static void dfs_scan(int tx1d_size, int *scan_idx, int coeff_idx, int16_t *scan,
+ int16_t *iscan) {
+ const int r = coeff_idx / tx1d_size;
+ const int c = coeff_idx % tx1d_size;
+
+ if (iscan[coeff_idx] != -1) return;
+
+ if (r > 0) dfs_scan(tx1d_size, scan_idx, coeff_idx - tx1d_size, scan, iscan);
+
+ if (c > 0) dfs_scan(tx1d_size, scan_idx, coeff_idx - 1, scan, iscan);
+
+ scan[*scan_idx] = coeff_idx;
+ iscan[coeff_idx] = *scan_idx;
+ ++(*scan_idx);
+}
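+
+// Trace example (illustrative): in a 2x2 block (tx1d_size == 2), calling
+// dfs_scan() on coeff_idx 3 (r == 1, c == 1) first resolves its above
+// neighbor 1 (which in turn resolves 0), then its left neighbor 2, and
+// only then emits 3, yielding scan order 0, 1, 2, 3. This establishes the
+// above/left-before-self property that av1_update_neighbors() relies on.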
+
+void av1_update_neighbors(int tx_size, const int16_t *scan,
+ const int16_t *iscan, int16_t *neighbors) {
+ const int tx1d_wide = tx_size_wide[tx_size];
+ const int tx1d_high = tx_size_high[tx_size];
+ const int tx2d_size = tx_size_2d[tx_size];
+ int scan_idx;
+ for (scan_idx = 0; scan_idx < tx2d_size; ++scan_idx) {
+ const int coeff_idx = scan[scan_idx];
+ const int r = coeff_idx / tx1d_wide;
+ const int c = coeff_idx % tx1d_wide;
+ const int nb_offset_r[5] = { -1, 0, -1, -1, 1 };
+ const int nb_offset_c[5] = { 0, -1, -1, 1, -1 };
+ const int nb_num = 5;
+ int nb_count = 0;
+ int nb_idx;
+
+ for (nb_idx = 0; nb_idx < nb_num; ++nb_idx) {
+ if (nb_count < 2) {
+ int nb_r = r + nb_offset_r[nb_idx];
+ int nb_c = c + nb_offset_c[nb_idx];
+ int nb_coeff_idx = nb_r * tx1d_wide + nb_c;
+ int valid_pos =
+ nb_r >= 0 && nb_r < tx1d_high && nb_c >= 0 && nb_c < tx1d_wide;
+ if (valid_pos && iscan[nb_coeff_idx] < scan_idx) {
+ neighbors[scan_idx * MAX_NEIGHBORS + nb_count] = nb_coeff_idx;
+ ++nb_count;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (nb_count == 1) {
+ neighbors[scan_idx * MAX_NEIGHBORS + 1] =
+ neighbors[scan_idx * MAX_NEIGHBORS + 0];
+ } else if (nb_count == 0) {
+ neighbors[scan_idx * MAX_NEIGHBORS + 0] = scan[0];
+ neighbors[scan_idx * MAX_NEIGHBORS + 1] = scan[0];
+ }
+ }
+ neighbors[tx2d_size * MAX_NEIGHBORS + 0] = scan[0];
+ neighbors[tx2d_size * MAX_NEIGHBORS + 1] = scan[0];
+}
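+
+// Boundary behavior (illustrative note): the first scanned position has no
+// already-scanned neighbors, so both of its neighbors[] slots fall back to
+// scan[0]; a position with a single neighbor duplicates it; and the
+// sentinel entry at tx2d_size also points at scan[0]. As a result,
+// get_coef_context() in scan.h never reads an uninitialized slot.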
+
+void av1_update_sort_order(TX_SIZE tx_size, TX_TYPE tx_type,
+ const uint32_t *non_zero_prob, int16_t *sort_order) {
+ const SCAN_ORDER *sc = get_default_scan(tx_size, tx_type, 0);
+ uint32_t temp[COEFF_IDX_SIZE];
+ const int tx2d_size = tx_size_2d[tx_size];
+ int sort_idx;
+ assert(tx2d_size <= COEFF_IDX_SIZE);
+ memcpy(temp, non_zero_prob, tx2d_size * sizeof(*non_zero_prob));
+ av1_augment_prob(tx_size, tx_type, temp);
+ qsort(temp, tx2d_size, sizeof(*temp), cmp_prob);
+ for (sort_idx = 0; sort_idx < tx2d_size; ++sort_idx) {
+ const int default_scan_idx =
+ (temp[sort_idx] & COEFF_IDX_MASK) ^ COEFF_IDX_MASK;
+ const int coeff_idx = sc->scan[default_scan_idx];
+ sort_order[sort_idx] = coeff_idx;
+ }
+}
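+
+// Unpacking example (illustrative): the low 16 bits of each sorted word
+// hold mask_16 & ~iscan, so for iscan == 3 in a 4x4 block
+//   (temp & COEFF_IDX_MASK) ^ COEFF_IDX_MASK == (0xFFFC & 0x3FF) ^ 0x3FF
+//                                            == 0x3FC ^ 0x3FF == 3
+// XOR-ing with the mask undoes the bitwise NOT within the low
+// COEFF_IDX_BITS, recovering the default scan index.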
+
+void av1_update_scan_order(TX_SIZE tx_size, int16_t *sort_order, int16_t *scan,
+ int16_t *iscan) {
+ int coeff_idx;
+ int scan_idx;
+ int sort_idx;
+ const int tx1d_size = tx_size_wide[tx_size];
+ const int tx2d_size = tx_size_2d[tx_size];
+
+ for (coeff_idx = 0; coeff_idx < tx2d_size; ++coeff_idx) {
+ iscan[coeff_idx] = -1;
+ }
+
+ scan_idx = 0;
+ for (sort_idx = 0; sort_idx < tx2d_size; ++sort_idx) {
+ coeff_idx = sort_order[sort_idx];
+ dfs_scan(tx1d_size, &scan_idx, coeff_idx, scan, iscan);
+ }
+}
+
+static void update_scan_order_facade(AV1_COMMON *cm, TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ int16_t sort_order[COEFF_IDX_SIZE];
+ uint32_t *non_zero_prob = get_non_zero_prob(cm->fc, tx_size, tx_type);
+ int16_t *scan = get_adapt_scan(cm->fc, tx_size, tx_type);
+ int16_t *iscan = get_adapt_iscan(cm->fc, tx_size, tx_type);
+ int16_t *nb = get_adapt_nb(cm->fc, tx_size, tx_type);
+ assert(tx_size_2d[tx_size] <= COEFF_IDX_SIZE);
+ av1_update_sort_order(tx_size, tx_type, non_zero_prob, sort_order);
+ av1_update_scan_order(tx_size, sort_order, scan, iscan);
+ av1_update_neighbors(tx_size, scan, iscan, nb);
+}
+
+static void update_eob_threshold(AV1_COMMON *cm, TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ int i, row, col, row_limit, col_limit, cal_idx = 0;
+ const int tx_width = tx_size_wide[tx_size];
+ const int tx_height = tx_size_high[tx_size];
+
+  // Rows span tx_height and cols span tx_width.
+  row_limit = tx_height >> 1;
+  col_limit = tx_width >> 1;
+
+ if (tx_width >= 8 && tx_height >= 8) {
+ SCAN_ORDER *sc = &cm->fc->sc[tx_size][tx_type];
+ int16_t *threshold = &cm->fc->eob_threshold[tx_size][tx_type][0];
+ const int tx2d_size = tx_size_2d[tx_size];
+
+ while (cal_idx < EOB_THRESHOLD_NUM) {
+ for (i = 0; i < tx2d_size; ++i) {
+        // scan[] holds row-major indices: idx == row * tx_width + col.
+        row = sc->scan[i] / tx_width;
+        col = sc->scan[i] % tx_width;
+ if (row >= row_limit || col >= col_limit) break;
+ }
+ row_limit >>= 1;
+ col_limit >>= 1;
+ threshold[cal_idx] = i;
+ cal_idx++;
+ }
+ }
+}
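+
+// Interpretation (an illustrative reading, not normative): after this
+// update, eob_threshold[t] is the first scan position that falls outside
+// the top-left corner of (tx_height >> (t + 1)) rows by
+// (tx_width >> (t + 1)) cols, so an EOB below threshold t means every
+// nonzero coefficient lives inside that corner.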
+
+void av1_init_scan_order(AV1_COMMON *cm) {
+ TX_SIZE tx_size;
+ TX_TYPE tx_type;
+ for (tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
+#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ if (tx_size > TX_32X16) continue;
+#else
+ if (tx_size >= TX_SIZES) continue;
+#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+ uint32_t *non_zero_prob = get_non_zero_prob(cm->fc, tx_size, tx_type);
+ const int tx2d_size = tx_size_2d[tx_size];
+ int i;
+ SCAN_ORDER *sc = &cm->fc->sc[tx_size][tx_type];
+ for (i = 0; i < tx2d_size; ++i) {
+ non_zero_prob[i] = (1 << 16) / 2; // init non_zero_prob to 0.5
+ }
+ update_scan_order_facade(cm, tx_size, tx_type);
+ sc->scan = get_adapt_scan(cm->fc, tx_size, tx_type);
+ sc->iscan = get_adapt_iscan(cm->fc, tx_size, tx_type);
+ sc->neighbors = get_adapt_nb(cm->fc, tx_size, tx_type);
+ update_eob_threshold(cm, tx_size, tx_type);
+ }
+ }
+}
+
+void av1_adapt_scan_order(AV1_COMMON *cm) {
+ TX_SIZE tx_size;
+ for (tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
+#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ if (tx_size > TX_32X16) continue;
+#else
+ if (tx_size >= TX_SIZES) continue;
+#endif // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+ TX_TYPE tx_type;
+ for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+ update_scan_prob(cm, tx_size, tx_type, ADAPT_SCAN_UPDATE_RATE_16);
+ update_scan_order_facade(cm, tx_size, tx_type);
+ update_eob_threshold(cm, tx_size, tx_type);
+ }
+ }
+}
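+
+// Per-frame pipeline (illustrative summary): av1_adapt_scan_order() 1)
+// blends the frame's observed nonzero frequencies into the stored
+// probabilities, 2) re-sorts and topologically re-orders the scan, and
+// 3) recomputes the EOB thresholds. Encoder and decoder run the same
+// update on the same decoded coefficients, so their adapted scans stay in
+// sync without any extra signaling.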
+
+void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd) {
+ xd->eob_threshold_md = (const EobThresholdMD *)cm->fc->eob_threshold;
+}
+#endif // CONFIG_ADAPT_SCAN
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
new file mode 100644
index 000000000..ecef11368
--- /dev/null
+++ b/third_party/aom/av1/common/scan.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_SCAN_H_
+#define AV1_COMMON_SCAN_H_
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NEIGHBORS 2
+
+extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
+extern const SCAN_ORDER av1_intra_scan_orders[TX_SIZES_ALL][TX_TYPES];
+extern const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES];
+
+#if CONFIG_ADAPT_SCAN
+void av1_update_scan_count_facade(AV1_COMMON *cm, FRAME_COUNTS *counts,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ const tran_low_t *dqcoeffs, int max_scan);
+
+// Embed the default scan index (a proxy for r + c) alongside each nonzero
+// probability, so that when the probabilities are sorted, a tie is broken
+// in favor of the coefficient with the smaller r + c, which is scanned
+// first.
+void av1_augment_prob(TX_SIZE tx_size, TX_TYPE tx_type, uint32_t *prob);
+
+// Sort the nonzero probabilities (via qsort, descending) to obtain a sort
+// order.
+void av1_update_sort_order(TX_SIZE tx_size, TX_TYPE tx_type,
+ const uint32_t *non_zero_prob, int16_t *sort_order);
+
+// Apply a topological sort to the probability-sorted order so that each
+// coefficient's above and left neighbors are guaranteed to be scanned
+// before the coefficient itself.
+void av1_update_scan_order(TX_SIZE tx_size, int16_t *sort_order, int16_t *scan,
+ int16_t *iscan);
+
+// For each coeff_idx in scan[], record its already-scanned above and left
+// neighbors in neighbors[].
+void av1_update_neighbors(int tx_size, const int16_t *scan,
+ const int16_t *iscan, int16_t *neighbors);
+void av1_init_scan_order(AV1_COMMON *cm);
+void av1_adapt_scan_order(AV1_COMMON *cm);
+#endif // CONFIG_ADAPT_SCAN
+void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+static INLINE int get_coef_context(const int16_t *neighbors,
+ const uint8_t *token_cache, int c) {
+ return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
+ token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >>
+ 1;
+}
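+
+// Example (illustrative): if the two neighbors of position c have cached
+// token values 2 and 1, the context is (1 + 2 + 1) >> 1 == 2, roughly the
+// rounded-up average of the neighboring token magnitudes.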
+
+static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size,
+ TX_TYPE tx_type,
+ int is_inter) {
+#if CONFIG_EXT_TX || CONFIG_VAR_TX
+ return is_inter ? &av1_inter_scan_orders[tx_size][tx_type]
+ : &av1_intra_scan_orders[tx_size][tx_type];
+#else
+ (void)is_inter;
+ return &av1_intra_scan_orders[tx_size][tx_type];
+#endif // CONFIG_EXT_TX || CONFIG_VAR_TX
+}
+
+static INLINE const SCAN_ORDER *get_scan(const AV1_COMMON *cm, TX_SIZE tx_size,
+ TX_TYPE tx_type, int is_inter) {
+#if CONFIG_ADAPT_SCAN
+ (void)is_inter;
+ return &cm->fc->sc[tx_size][tx_type];
+#else // CONFIG_ADAPT_SCAN
+ (void)cm;
+ return get_default_scan(tx_size, tx_type, is_inter);
+#endif // CONFIG_ADAPT_SCAN
+}
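+
+// Typical call pattern (an illustrative sketch; eob and token_cache are
+// assumed to come from the surrounding coder):
+//
+//   const SCAN_ORDER *so = get_scan(cm, tx_size, tx_type, is_inter);
+//   int i;
+//   for (i = 0; i < eob; ++i) {
+//     const int coeff_idx = so->scan[i];
+//     const int ctx = get_coef_context(so->neighbors, token_cache, i);
+//     /* code or decode the token at coeff_idx under context ctx */
+//   }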
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_SCAN_H_
diff --git a/third_party/aom/av1/common/seg_common.c b/third_party/aom/av1/common/seg_common.c
new file mode 100644
index 000000000..21a853629
--- /dev/null
+++ b/third_party/aom/av1/common/seg_common.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/quant_common.h"
+
+static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 };
+
+static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, MAX_LOOP_FILTER, 3,
+ 0 };
+
+// These functions provide access to new segment level features.
+// Eventually these functions may be "optimized out", but for the moment
+// the coding mechanism is still subject to change, so they provide a
+// convenient single point of change.
+
+void av1_clearall_segfeatures(struct segmentation *seg) {
+ av1_zero(seg->feature_data);
+ av1_zero(seg->feature_mask);
+}
+
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_mask[segment_id] |= 1 << feature_id;
+}
+
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
+ return seg_feature_data_max[feature_id];
+}
+
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+ return seg_feature_data_signed[feature_id];
+}
+
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id, int seg_data) {
+ assert(seg_data <= seg_feature_data_max[feature_id]);
+ if (seg_data < 0) {
+ assert(seg_feature_data_signed[feature_id]);
+ assert(-seg_data <= seg_feature_data_max[feature_id]);
+ }
+
+ seg->feature_data[segment_id][feature_id] = seg_data;
+}
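+
+// Minimal usage sketch (illustrative, not library code):
+//
+//   struct segmentation seg = { 0 };
+//   av1_clearall_segfeatures(&seg);
+//   seg.enabled = 1;
+//   av1_enable_segfeature(&seg, 1 /* segment_id */, SEG_LVL_ALT_LF);
+//   av1_set_segdata(&seg, 1, SEG_LVL_ALT_LF, -6);  // signed feature
+//   // segfeature_active(&seg, 1, SEG_LVL_ALT_LF) is now nonzero and
+//   // get_segdata(&seg, 1, SEG_LVL_ALT_LF) returns -6.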
+
+const aom_tree_index av1_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
+ 2, 4, 6, 8, 10, 12, 0, -1, -2, -3, -4, -5, -6, -7
+};
+
+// TBD? Functions to read and write segment data with range / validity checking
diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h
new file mode 100644
index 000000000..03ed38e79
--- /dev/null
+++ b/third_party/aom/av1/common/seg_common.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_SEG_COMMON_H_
+#define AV1_COMMON_SEG_COMMON_H_
+
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SEGMENT_DELTADATA 0
+#define SEGMENT_ABSDATA 1
+
+#define MAX_SEGMENTS 8
+#define SEG_TREE_PROBS (MAX_SEGMENTS - 1)
+
+#define PREDICTION_PROBS 3
+
+// Segment level features.
+typedef enum {
+  SEG_LVL_ALT_Q = 0,      // Use alternate quantizer
+  SEG_LVL_ALT_LF = 1,     // Use alternate loop filter value
+  SEG_LVL_REF_FRAME = 2,  // Optional segment reference frame
+  SEG_LVL_SKIP = 3,       // Optional segment (0,0) + skip mode
+  SEG_LVL_MAX = 4         // Number of features supported
+} SEG_LVL_FEATURES;
+
+struct segmentation {
+ uint8_t enabled;
+ uint8_t update_map;
+ uint8_t update_data;
+ uint8_t abs_delta;
+ uint8_t temporal_update;
+
+ int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
+ unsigned int feature_mask[MAX_SEGMENTS];
+};
+
+struct segmentation_probs {
+ aom_prob tree_probs[SEG_TREE_PROBS];
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob tree_cdf[CDF_SIZE(MAX_SEGMENTS)];
+#endif
+ aom_prob pred_probs[PREDICTION_PROBS];
+};
+
+static INLINE int segfeature_active(const struct segmentation *seg,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
+}
+
+void av1_clearall_segfeatures(struct segmentation *seg);
+
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
+
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
+
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id, int seg_data);
+
+static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ return seg->feature_data[segment_id][feature_id];
+}
+
+extern const aom_tree_index av1_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_SEG_COMMON_H_
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
new file mode 100644
index 000000000..ca8b1b3bd
--- /dev/null
+++ b/third_party/aom/av1/common/thread_common.c
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/reconinter.h"
+
+#if CONFIG_MULTITHREAD
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+ const int kMaxTryLocks = 4000;
+ int locked = 0;
+ int i;
+
+ for (i = 0; i < kMaxTryLocks; ++i) {
+ if (!pthread_mutex_trylock(mutex)) {
+ locked = 1;
+ break;
+ }
+ }
+
+ if (!locked) pthread_mutex_lock(mutex);
+}
+#endif // CONFIG_MULTITHREAD
+
+static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];
+ mutex_lock(mutex);
+
+ while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
+ pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c,
+ const int sb_cols) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+ int cur;
+  // Only signal when enough superblocks have been filtered for the next
+  // row to run.
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = sb_cols + nsync;
+ }
+
+ if (sig) {
+ mutex_lock(&lf_sync->mutex_[r]);
+
+ lf_sync->cur_sb_col[r] = cur;
+
+ pthread_cond_signal(&lf_sync->cond_[r]);
+ pthread_mutex_unlock(&lf_sync->mutex_[r]);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+#endif // CONFIG_MULTITHREAD
+}
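+
+// Protocol sketch (illustrative): with sync_range nsync == 4, the worker
+// filtering row r publishes its progress via sync_write() every 4th
+// superblock column; the worker on row r + 1 blocks in sync_read() at
+// column c until cur_sb_col[r] reaches at least c + 4, i.e. until row r
+// is safely ahead of it. The final write of sb_cols + nsync unblocks any
+// remaining waiter regardless of its column.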
+
+#if !CONFIG_EXT_PARTITION_TYPES
+static INLINE enum lf_path get_loop_filter_path(
+ int y_only, struct macroblockd_plane planes[MAX_MB_PLANE]) {
+ if (y_only)
+ return LF_PATH_444;
+ else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+ return LF_PATH_420;
+ else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+ return LF_PATH_444;
+ else
+ return LF_PATH_SLOW;
+}
+
+static INLINE void loop_filter_block_plane_ver(
+ AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane,
+ MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path,
+ LOOP_FILTER_MASK *lfm) {
+ if (plane == 0) {
+ av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, lfm);
+ } else {
+ switch (path) {
+ case LF_PATH_420:
+ av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_444:
+ av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_SLOW:
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi, mi_row,
+ mi_col);
+ break;
+ }
+ }
+}
+
+static INLINE void loop_filter_block_plane_hor(
+ AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane,
+ MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path,
+ LOOP_FILTER_MASK *lfm) {
+ if (plane == 0) {
+ av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, lfm);
+ } else {
+ switch (path) {
+ case LF_PATH_420:
+ av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_444:
+ av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_SLOW:
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi, mi_row,
+ mi_col);
+ break;
+ }
+ }
+}
+#endif // !CONFIG_EXT_PARTITION_TYPES
+
+// Row-based multi-threaded loopfilter hook
+#if CONFIG_PARALLEL_DEBLOCKING
+static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync,
+ LFWorkerData *const lf_data) {
+ const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+ int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+ enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif
+ for (mi_row = lf_data->start; mi_row < lf_data->stop;
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+ MODE_INFO **const mi =
+ lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
+
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
+ LOOP_FILTER_MASK lfm;
+ int plane;
+
+ av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size,
+ lf_data->frame_buffer, mi_row, mi_col);
+ av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+ lf_data->cm->mi_stride, &lfm);
+
+#if CONFIG_EXT_PARTITION_TYPES
+ for (plane = 0; plane < num_planes; ++plane)
+ av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+#else
+
+ for (plane = 0; plane < num_planes; ++plane)
+ loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane,
+ mi + mi_col, mi_row, mi_col, path, &lfm);
+#endif
+ }
+ }
+ return 1;
+}
+
+static int loop_filter_hor_row_worker(AV1LfSync *const lf_sync,
+ LFWorkerData *const lf_data) {
+ const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+ const int sb_cols =
+ mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
+ int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+ enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif
+
+ for (mi_row = lf_data->start; mi_row < lf_data->stop;
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+ MODE_INFO **const mi =
+ lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
+
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
+ const int r = mi_row >> lf_data->cm->mib_size_log2;
+ const int c = mi_col >> lf_data->cm->mib_size_log2;
+ LOOP_FILTER_MASK lfm;
+ int plane;
+
+ // TODO(wenhao.zhang@intel.com): For better parallelization, reorder
+ // the outer loop to column-based and remove the synchronizations here.
+ sync_read(lf_sync, r, c);
+
+ av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size,
+ lf_data->frame_buffer, mi_row, mi_col);
+ av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+ lf_data->cm->mi_stride, &lfm);
+#if CONFIG_EXT_PARTITION_TYPES
+ for (plane = 0; plane < num_planes; ++plane)
+ av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+#else
+ for (plane = 0; plane < num_planes; ++plane)
+ loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane,
+ mi + mi_col, mi_row, mi_col, path, &lfm);
+#endif
+ sync_write(lf_sync, r, c, sb_cols);
+ }
+ }
+ return 1;
+}
+#else // CONFIG_PARALLEL_DEBLOCKING
+static int loop_filter_row_worker(AV1LfSync *const lf_sync,
+ LFWorkerData *const lf_data) {
+ const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+ const int sb_cols =
+ mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
+ int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+ enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif // !CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_EXT_PARTITION
+ printf(
+ "STOPPING: This code has not been modified to work with the "
+ "extended coding unit size experiment");
+ exit(EXIT_FAILURE);
+#endif // CONFIG_EXT_PARTITION
+
+ for (mi_row = lf_data->start; mi_row < lf_data->stop;
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+ MODE_INFO **const mi =
+ lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
+
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
+ const int r = mi_row >> lf_data->cm->mib_size_log2;
+ const int c = mi_col >> lf_data->cm->mib_size_log2;
+#if !CONFIG_EXT_PARTITION_TYPES
+ LOOP_FILTER_MASK lfm;
+#endif
+ int plane;
+
+ sync_read(lf_sync, r, c);
+
+ av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size,
+ lf_data->frame_buffer, mi_row, mi_col);
+#if CONFIG_EXT_PARTITION_TYPES
+ for (plane = 0; plane < num_planes; ++plane) {
+ av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+ av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+ }
+#else
+ av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+ lf_data->cm->mi_stride, &lfm);
+
+ for (plane = 0; plane < num_planes; ++plane) {
+ loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane,
+ mi + mi_col, mi_row, mi_col, path, &lfm);
+ loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane,
+ mi + mi_col, mi_row, mi_col, path, &lfm);
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
+ sync_write(lf_sync, r, c, sb_cols);
+ }
+ }
+ return 1;
+}
+#endif // CONFIG_PARALLEL_DEBLOCKING
+
+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int start, int stop, int y_only,
+ AVxWorker *workers, int nworkers,
+ AV1LfSync *lf_sync) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  // Number of superblock rows
+ const int sb_rows = mi_rows_aligned_to_sb(cm) >> cm->mib_size_log2;
+  // The decoder may allocate more threads than the number of tile columns,
+  // based on user input.
+ const int tile_cols = cm->tile_cols;
+ const int num_workers = AOMMIN(nworkers, tile_cols);
+ int i;
+
+#if CONFIG_EXT_PARTITION
+ printf(
+ "STOPPING: This code has not been modified to work with the "
+ "extended coding unit size experiment");
+ exit(EXIT_FAILURE);
+#endif // CONFIG_EXT_PARTITION
+
+ if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+ num_workers > lf_sync->num_workers) {
+ av1_loop_filter_dealloc(lf_sync);
+ av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+ }
+
+// Set up loopfilter thread data.
+// The decoder is capping num_workers because it has been observed that using
+// more threads on the loopfilter than there are cores will hurt performance
+// on Android. This is because the system will only schedule the tile decode
+// workers on cores equal to the number of tile columns. Then if the decoder
+// tries to use more threads for the loopfilter, it will hurt performance
+// because of contention. If the multithreading code changes in the future
+// then the number of workers used by the loopfilter should be revisited.
+
+#if CONFIG_PARALLEL_DEBLOCKING
+ // Initialize cur_sb_col to -1 for all SB rows.
+ memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
+ // Filter all the vertical edges in the whole frame
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+ worker->hook = (AVxWorkerHook)loop_filter_ver_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
+
+ // Loopfilter data
+ av1_loop_filter_data_reset(lf_data, frame, cm, planes);
+ lf_data->start = start + i * cm->mib_size;
+ lf_data->stop = stop;
+ lf_data->y_only = y_only;
+
+ // Start loopfiltering
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait till all rows are finished
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+
+ memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+ // Filter all the horizontal edges in the whole frame
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+ worker->hook = (AVxWorkerHook)loop_filter_hor_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
+
+ // Loopfilter data
+ av1_loop_filter_data_reset(lf_data, frame, cm, planes);
+ lf_data->start = start + i * cm->mib_size;
+ lf_data->stop = stop;
+ lf_data->y_only = y_only;
+
+ // Start loopfiltering
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait till all rows are finished
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+#else // CONFIG_PARALLEL_DEBLOCKING
+ // Initialize cur_sb_col to -1 for all SB rows.
+ memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+ worker->hook = (AVxWorkerHook)loop_filter_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
+
+ // Loopfilter data
+ av1_loop_filter_data_reset(lf_data, frame, cm, planes);
+ lf_data->start = start + i * cm->mib_size;
+ lf_data->stop = stop;
+ lf_data->y_only = y_only;
+
+ // Start loopfiltering
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait till all rows are finished
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+#endif // CONFIG_PARALLEL_DEBLOCKING
+}
+
+void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int frame_filter_level, int y_only,
+ int partial_frame, AVxWorker *workers,
+ int num_workers, AV1LfSync *lf_sync) {
+ int start_mi_row, end_mi_row, mi_rows_to_filter;
+
+ if (!frame_filter_level) return;
+
+ start_mi_row = 0;
+ mi_rows_to_filter = cm->mi_rows;
+ if (partial_frame && cm->mi_rows > 8) {
+ start_mi_row = cm->mi_rows >> 1;
+ start_mi_row &= 0xfffffff8;
+ mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
+ }
+ end_mi_row = start_mi_row + mi_rows_to_filter;
+ av1_loop_filter_frame_init(cm, frame_filter_level);
+
+ loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row, y_only,
+ workers, num_workers, lf_sync);
+}
+
+// Set up nsync by width.
+static INLINE int get_sync_range(int width) {
+  // nsync numbers are picked by testing. For example, for 4k
+  // video, using 4 gives the best performance.
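+  // sync_range controls how often row workers synchronize: a worker on
+  // superblock row r publishes its progress through cur_sb_col/sync_write(),
+  // and blocks until the row above has advanced roughly sync_range columns
+  // past its own position. Larger values therefore mean fewer, coarser
+  // synchronizations.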
+ if (width < 640)
+ return 1;
+ else if (width <= 1280)
+ return 2;
+ else if (width <= 4096)
+ return 4;
+ else
+ return 8;
+}
+
+// Allocate memory for lf row synchronization
+void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
+ int width, int num_workers) {
+ lf_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+ {
+ int i;
+
+ CHECK_MEM_ERROR(cm, lf_sync->mutex_,
+ aom_malloc(sizeof(*lf_sync->mutex_) * rows));
+ if (lf_sync->mutex_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lf_sync->cond_,
+ aom_malloc(sizeof(*lf_sync->cond_) * rows));
+ if (lf_sync->cond_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&lf_sync->cond_[i], NULL);
+ }
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+
+ CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+ aom_malloc(num_workers * sizeof(*lf_sync->lfdata)));
+ lf_sync->num_workers = num_workers;
+
+ CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+ aom_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
+
+ // Set up nsync.
+ lf_sync->sync_range = get_sync_range(width);
+}
+
+// Deallocate lf synchronization related mutex and data
+void av1_loop_filter_dealloc(AV1LfSync *lf_sync) {
+ if (lf_sync != NULL) {
+#if CONFIG_MULTITHREAD
+ int i;
+
+ if (lf_sync->mutex_ != NULL) {
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_mutex_destroy(&lf_sync->mutex_[i]);
+ }
+ aom_free(lf_sync->mutex_);
+ }
+ if (lf_sync->cond_ != NULL) {
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_cond_destroy(&lf_sync->cond_[i]);
+ }
+ aom_free(lf_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+ aom_free(lf_sync->lfdata);
+ aom_free(lf_sync->cur_sb_col);
+    // Clear the structure: the source of this call may be a resize, in which
+    // case it will be followed by an _alloc() that can fail.
+ av1_zero(*lf_sync);
+ }
+}
+
+// Accumulate frame counts. FRAME_COUNTS consists solely of 'unsigned int'
+// members, so we treat it as an array and sum over the whole length.
+void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
+ FRAME_COUNTS *counts) {
+ unsigned int *const acc = (unsigned int *)acc_counts;
+ const unsigned int *const cnt = (unsigned int *)counts;
+
+ const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int);
+ unsigned int i;
+
+ for (i = 0; i < n_counts; i++) acc[i] += cnt[i];
+}
diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h
new file mode 100644
index 000000000..7b57ae8f3
--- /dev/null
+++ b/third_party/aom/av1/common/thread_common.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_LOOPFILTER_THREAD_H_
+#define AV1_COMMON_LOOPFILTER_THREAD_H_
+#include "./aom_config.h"
+#include "av1/common/av1_loopfilter.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct FRAME_COUNTS;
+
+// Loopfilter row synchronization
+typedef struct AV1LfSyncData {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_;
+ pthread_cond_t *cond_;
+#endif
+  // For each superblock row, records the column of the most recently
+  // loop-filtered superblock.
+ int *cur_sb_col;
+  // The optimal sync_range for different resolutions and platforms should be
+  // determined by testing. Currently, it is chosen to be a power-of-two number.
+ int sync_range;
+ int rows;
+
+ // Row-based parallel loopfilter data
+ LFWorkerData *lfdata;
+ int num_workers;
+} AV1LfSync;
+
+// Allocate memory for loopfilter row synchronization.
+void av1_loop_filter_alloc(AV1LfSync *lf_sync, struct AV1Common *cm, int rows,
+ int width, int num_workers);
+
+// Deallocate loopfilter synchronization related mutex and data.
+void av1_loop_filter_dealloc(AV1LfSync *lf_sync);
+
+// Multi-threaded loopfilter that uses the tile threads.
+void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int frame_filter_level, int y_only,
+ int partial_frame, AVxWorker *workers,
+ int num_workers, AV1LfSync *lf_sync);
+
+void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
+ struct FRAME_COUNTS *counts);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_LOOPFILTER_THREAD_H_
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
new file mode 100644
index 000000000..b8008ac2e
--- /dev/null
+++ b/third_party/aom/av1/common/tile_common.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/tile_common.h"
+#include "av1/common/onyxc_int.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
+ tile->mi_row_start = row * cm->tile_height;
+ tile->mi_row_end = AOMMIN(tile->mi_row_start + cm->tile_height, cm->mi_rows);
+}
+
+void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
+ tile->mi_col_start = col * cm->tile_width;
+ tile->mi_col_end = AOMMIN(tile->mi_col_start + cm->tile_width, cm->mi_cols);
+}
+
+#if CONFIG_DEPENDENT_HORZTILES && CONFIG_TILE_GROUPS
+void av1_tile_set_tg_boundary(TileInfo *tile, const AV1_COMMON *const cm,
+ int row, int col) {
+ if (row < cm->tile_rows - 1) {
+ tile->tg_horz_boundary =
+ col >= cm->tile_group_start_col[row][col]
+ ? (row == cm->tile_group_start_row[row][col] ? 1 : 0)
+ : (row == cm->tile_group_start_row[row + 1][col] ? 1 : 0);
+ } else {
+ assert(col >= cm->tile_group_start_col[row][col]);
+ tile->tg_horz_boundary =
+ (row == cm->tile_group_start_row[row][col] ? 1 : 0);
+ }
+}
+#endif
+void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) {
+ av1_tile_set_row(tile, cm, row);
+ av1_tile_set_col(tile, cm, col);
+#if CONFIG_DEPENDENT_HORZTILES && CONFIG_TILE_GROUPS
+ av1_tile_set_tg_boundary(tile, cm, row, col);
+#endif
+}
+
+#if !CONFIG_EXT_TILE
+
+#if CONFIG_EXT_PARTITION
+#define MIN_TILE_WIDTH_MAX_SB 2
+#define MAX_TILE_WIDTH_MAX_SB 32
+#else
+#define MIN_TILE_WIDTH_MAX_SB 4
+#define MAX_TILE_WIDTH_MAX_SB 64
+#endif // CONFIG_EXT_PARTITION
+
+static int get_min_log2_tile_cols(int max_sb_cols) {
+ int min_log2 = 0;
+ while ((MAX_TILE_WIDTH_MAX_SB << min_log2) < max_sb_cols) ++min_log2;
+ return min_log2;
+}
+
+static int get_max_log2_tile_cols(int max_sb_cols) {
+ int max_log2 = 1;
+ while ((max_sb_cols >> max_log2) >= MIN_TILE_WIDTH_MAX_SB) ++max_log2;
+ return max_log2 - 1;
+}
+
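+// Together, [min_log2, max_log2] brackets the power-of-two tile-column counts
+// that keep every tile between MIN_ and MAX_TILE_WIDTH_MAX_SB superblocks
+// wide. For example, with the non-CONFIG_EXT_PARTITION values above, a frame
+// 128 superblocks wide allows between 1 << 1 = 2 and 1 << 5 = 32 tile
+// columns.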
+void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
+ int *max_log2_tile_cols) {
+ const int max_sb_cols =
+ ALIGN_POWER_OF_TWO(mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+ *min_log2_tile_cols = get_min_log2_tile_cols(max_sb_cols);
+ *max_log2_tile_cols = get_max_log2_tile_cols(max_sb_cols);
+ assert(*min_log2_tile_cols <= *max_log2_tile_cols);
+}
+#endif // !CONFIG_EXT_TILE
+
+void av1_update_boundary_info(const struct AV1Common *cm,
+ const TileInfo *const tile_info, int mi_row,
+ int mi_col) {
+ int row, col;
+ for (row = mi_row; row < (mi_row + cm->mib_size); row++)
+ for (col = mi_col; col < (mi_col + cm->mib_size); col++) {
+ MODE_INFO *const mi = cm->mi + row * cm->mi_stride + col;
+ mi->mbmi.boundary_info = 0;
+ if (cm->tile_cols * cm->tile_rows > 1) {
+#if CONFIG_DEPENDENT_HORZTILES
+#if CONFIG_TILE_GROUPS
+        if (row == tile_info->mi_row_start &&
+            (!cm->dependent_horz_tiles || tile_info->tg_horz_boundary))
+#else
+        if (row == tile_info->mi_row_start && !cm->dependent_horz_tiles)
+#endif  // CONFIG_TILE_GROUPS
+#else
+ if (row == tile_info->mi_row_start)
+#endif // CONFIG_DEPENDENT_HORZTILES
+ mi->mbmi.boundary_info |= TILE_ABOVE_BOUNDARY;
+ if (col == tile_info->mi_col_start)
+ mi->mbmi.boundary_info |= TILE_LEFT_BOUNDARY;
+ if ((row + 1) >= tile_info->mi_row_end)
+ mi->mbmi.boundary_info |= TILE_BOTTOM_BOUNDARY;
+ if ((col + 1) >= tile_info->mi_col_end)
+ mi->mbmi.boundary_info |= TILE_RIGHT_BOUNDARY;
+ }
+      // A frame boundary is treated as a tile boundary
+ if (row == 0)
+ mi->mbmi.boundary_info |= FRAME_ABOVE_BOUNDARY | TILE_ABOVE_BOUNDARY;
+ if (col == 0)
+ mi->mbmi.boundary_info |= FRAME_LEFT_BOUNDARY | TILE_LEFT_BOUNDARY;
+ if ((row + 1) >= cm->mi_rows)
+ mi->mbmi.boundary_info |= FRAME_BOTTOM_BOUNDARY | TILE_BOTTOM_BOUNDARY;
+ if ((col + 1) >= cm->mi_cols)
+ mi->mbmi.boundary_info |= FRAME_RIGHT_BOUNDARY | TILE_RIGHT_BOUNDARY;
+ }
+}
+
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+int av1_disable_loopfilter_on_tile_boundary(const struct AV1Common *cm) {
+ return (!cm->loop_filter_across_tiles_enabled &&
+ (cm->tile_cols * cm->tile_rows > 1));
+}
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h
new file mode 100644
index 000000000..617dda202
--- /dev/null
+++ b/third_party/aom/av1/common/tile_common.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_TILE_COMMON_H_
+#define AV1_COMMON_TILE_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./aom_config.h"
+
+struct AV1Common;
+
+#if CONFIG_TILE_GROUPS
+#define DEFAULT_MAX_NUM_TG 1
+#endif
+
+typedef struct TileInfo {
+ int mi_row_start, mi_row_end;
+ int mi_col_start, mi_col_end;
+ int tg_horz_boundary;
+} TileInfo;
+
+// Initializes 'tile->mi_(row|col)_(start|end)' for (row, col), based on
+// 'cm->tile_(width|height)' and 'cm->mi_(rows|cols)'.
+void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
+ int col);
+
+void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
+void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
+#if CONFIG_DEPENDENT_HORZTILES && CONFIG_TILE_GROUPS
+void av1_tile_set_tg_boundary(TileInfo *tile, const struct AV1Common *cm,
+ int row, int col);
+#endif
+void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
+ int *max_log2_tile_cols);
+
+void av1_update_boundary_info(const struct AV1Common *cm,
+ const TileInfo *const tile_info, int mi_row,
+ int mi_col);
+
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+int av1_disable_loopfilter_on_tile_boundary(const struct AV1Common *cm);
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_TILE_COMMON_H_
diff --git a/third_party/aom/av1/common/txb_common.c b/third_party/aom/av1/common/txb_common.c
new file mode 100644
index 000000000..08a685b59
--- /dev/null
+++ b/third_party/aom/av1/common/txb_common.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom/aom_integer.h"
+#include "av1/common/onyxc_int.h"
+
+const int16_t av1_coeff_band_4x4[16] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15 };
+
+const int16_t av1_coeff_band_8x8[64] = {
+ 0, 1, 2, 2, 3, 3, 4, 4, 5, 6, 2, 2, 3, 3, 4, 4,
+ 7, 7, 8, 8, 9, 9, 10, 10, 7, 7, 8, 8, 9, 9, 10, 10,
+ 11, 11, 12, 12, 13, 13, 14, 14, 11, 11, 12, 12, 13, 13, 14, 14,
+ 15, 15, 16, 16, 17, 17, 18, 18, 15, 15, 16, 16, 17, 17, 18, 18,
+};
+
+const int16_t av1_coeff_band_16x16[256] = {
+ 0, 1, 4, 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 2, 3, 4,
+ 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7,
+ 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7, 7, 7, 8,
+ 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12,
+ 13, 13, 13, 13, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13,
+ 13, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 10, 10,
+ 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15,
+ 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15,
+ 16, 16, 16, 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
+ 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17,
+ 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18,
+ 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, 18, 18, 18,
+ 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, 18, 18, 18, 19, 19, 19,
+ 19, 20, 20, 20, 20, 21, 21, 21, 21,
+};
+
+const int16_t av1_coeff_band_32x32[1024] = {
+ 0, 1, 4, 4, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
+ 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 2, 3, 4, 4, 7, 7,
+ 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12,
+ 12, 12, 12, 12, 12, 12, 12, 5, 5, 6, 6, 7, 7, 7, 7, 10, 10, 10, 10,
+ 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12,
+ 12, 5, 5, 6, 6, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11,
+ 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8, 9,
+ 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11,
+ 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10,
+ 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+ 12, 12, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11,
+ 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8,
+ 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11,
+ 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14,
+ 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16,
+ 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14,
+ 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13,
+ 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+ 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14,
+ 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16,
+ 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14,
+ 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13,
+ 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15,
+ 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13,
+ 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16,
+ 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14,
+ 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 17,
+ 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
+ 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17,
+ 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20,
+ 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18,
+ 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
+ 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19,
+ 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17,
+ 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20,
+ 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18,
+ 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20,
+ 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
+ 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17,
+ 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
+ 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24,
+ 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21,
+ 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23,
+ 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24,
+ 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21,
+ 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24,
+ 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22,
+ 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
+};
+
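+// After each frame, every binary probability below is pulled toward the
+// frequency observed in 'counts'; count_sat and update_factor bound the
+// adaptation rate (see merge_probs).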
+void av1_adapt_txb_probs(AV1_COMMON *cm, unsigned int count_sat,
+ unsigned int update_factor) {
+ FRAME_CONTEXT *fc = cm->fc;
+ const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ const FRAME_COUNTS *counts = &cm->counts;
+ TX_SIZE tx_size;
+ int plane, ctx, level;
+
+ // Update probability models for transform block skip flag
+ for (tx_size = 0; tx_size < TX_SIZES; ++tx_size)
+ for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
+ fc->txb_skip[tx_size][ctx] = mode_mv_merge_probs(
+ pre_fc->txb_skip[tx_size][ctx], counts->txb_skip[tx_size][ctx]);
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane)
+ for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
+ fc->dc_sign[plane][ctx] = mode_mv_merge_probs(
+ pre_fc->dc_sign[plane][ctx], counts->dc_sign[plane][ctx]);
+ // Update probability models for non-zero coefficient map and eob flag.
+ for (level = 0; level < NUM_BASE_LEVELS; ++level)
+ for (tx_size = 0; tx_size < TX_SIZES; ++tx_size)
+ for (plane = 0; plane < PLANE_TYPES; ++plane)
+ for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx)
+ fc->coeff_base[tx_size][plane][level][ctx] =
+ merge_probs(pre_fc->coeff_base[tx_size][plane][level][ctx],
+ counts->coeff_base[tx_size][plane][level][ctx],
+ count_sat, update_factor);
+
+ for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
+ fc->nz_map[tx_size][plane][ctx] = merge_probs(
+ pre_fc->nz_map[tx_size][plane][ctx],
+ counts->nz_map[tx_size][plane][ctx], count_sat, update_factor);
+ }
+
+ for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) {
+ fc->eob_flag[tx_size][plane][ctx] = merge_probs(
+ pre_fc->eob_flag[tx_size][plane][ctx],
+ counts->eob_flag[tx_size][plane][ctx], count_sat, update_factor);
+ }
+ }
+ }
+
+ for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+ for (plane = 0; plane < PLANE_TYPES; ++plane)
+ for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx)
+ fc->coeff_lps[tx_size][plane][ctx] = merge_probs(
+ pre_fc->coeff_lps[tx_size][plane][ctx],
+ counts->coeff_lps[tx_size][plane][ctx], count_sat, update_factor);
+ }
+}
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
new file mode 100644
index 000000000..cdd9ca26e
--- /dev/null
+++ b/third_party/aom/av1/common/txb_common.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_TXB_COMMON_H_
+#define AV1_COMMON_TXB_COMMON_H_
+extern const int16_t av1_coeff_band_4x4[16];
+
+extern const int16_t av1_coeff_band_8x8[64];
+
+extern const int16_t av1_coeff_band_16x16[256];
+
+extern const int16_t av1_coeff_band_32x32[1024];
+
+typedef struct txb_ctx {
+ int txb_skip_ctx;
+ int dc_sign_ctx;
+} TXB_CTX;
+
+#define BASE_CONTEXT_POSITION_NUM 12
+static const int base_ref_offset[BASE_CONTEXT_POSITION_NUM][2] = {
+  /* clang-format off */
+ { -2, 0 }, { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 0, 1 },
+ { 0, 2 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 2, 0 }
+  /* clang-format on */
+};
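+// The 12 base-level neighbors form a diamond around the current coefficient
+// 'o' (rows top to bottom, columns left to right):
+//     . . x . .
+//     . x x x .
+//     x x o x x
+//     . x x x .
+//     . . x . .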
+
+static INLINE int get_base_ctx(const tran_low_t *tcoeffs,
+ int c, // raster order
+ const int bwl, const int level) {
+ const int row = c >> bwl;
+ const int col = c - (row << bwl);
+ const int stride = 1 << bwl;
+ const int level_minus_1 = level - 1;
+ int ctx = 0;
+ int mag = 0;
+ int idx;
+ int ctx_idx = -1;
+ tran_low_t abs_coeff;
+
+ ctx = 0;
+ for (idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) {
+ int ref_row = row + base_ref_offset[idx][0];
+ int ref_col = col + base_ref_offset[idx][1];
+ int pos = (ref_row << bwl) + ref_col;
+
+ if (ref_row < 0 || ref_col < 0 || ref_row >= stride || ref_col >= stride)
+ continue;
+
+ abs_coeff = abs(tcoeffs[pos]);
+ ctx += abs_coeff > level_minus_1;
+
+ if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0)
+ mag |= abs_coeff > level;
+ }
+ ctx = (ctx + 1) >> 1;
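+  // Fold the position class into the final index: DC occupies 0-7, the rest
+  // of the top row 8-17, the rest of the left column 18-27, and interior
+  // positions the remainder, matching the asserts below.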
+ if (row == 0 && col == 0) {
+ ctx_idx = (ctx << 1) + mag;
+ assert(ctx_idx < 8);
+ } else if (row == 0) {
+ ctx_idx = 8 + (ctx << 1) + mag;
+ assert(ctx_idx < 18);
+ } else if (col == 0) {
+ ctx_idx = 8 + 10 + (ctx << 1) + mag;
+ assert(ctx_idx < 28);
+ } else {
+ ctx_idx = 8 + 10 + 10 + (ctx << 1) + mag;
+ assert(ctx_idx < COEFF_BASE_CONTEXTS);
+ }
+ return ctx_idx;
+}
+
+#define BR_CONTEXT_POSITION_NUM 8 // Base range coefficient context
+static const int br_ref_offset[BR_CONTEXT_POSITION_NUM][2] = {
+  /* clang-format off */
+ { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -1 },
+ { 0, 1 }, { 1, -1 }, { 1, 0 }, { 1, 1 },
+  /* clang-format on */
+};
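+// br_ref_offset covers the 8 immediate neighbors of the current coefficient,
+// i.e. a 3x3 window with the center removed.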
+
+static const int br_level_map[9] = {
+ 0, 0, 1, 1, 2, 2, 3, 3, 3,
+};
+
+static INLINE int get_level_ctx(const tran_low_t *tcoeffs,
+ const int c, // raster order
+ const int bwl) {
+ const int row = c >> bwl;
+ const int col = c - (row << bwl);
+ const int stride = 1 << bwl;
+ const int level_minus_1 = NUM_BASE_LEVELS;
+ int ctx = 0;
+ int idx;
+ tran_low_t abs_coeff;
+ int mag = 0, offset = 0;
+
+ for (idx = 0; idx < BR_CONTEXT_POSITION_NUM; ++idx) {
+ int ref_row = row + br_ref_offset[idx][0];
+ int ref_col = col + br_ref_offset[idx][1];
+ int pos = (ref_row << bwl) + ref_col;
+
+ if (ref_row < 0 || ref_col < 0 || ref_row >= stride || ref_col >= stride)
+ continue;
+
+ abs_coeff = abs(tcoeffs[pos]);
+ ctx += abs_coeff > level_minus_1;
+
+ if (br_ref_offset[idx][0] >= 0 && br_ref_offset[idx][1] >= 0)
+ mag = AOMMAX(mag, abs_coeff);
+ }
+
+ if (mag <= 1)
+ offset = 0;
+ else if (mag <= 3)
+ offset = 1;
+ else if (mag <= 6)
+ offset = 2;
+ else
+ offset = 3;
+
+ ctx = br_level_map[ctx];
+
+ ctx += offset * BR_TMP_OFFSET;
+
+ // DC: 0 - 1
+ if (row == 0 && col == 0) return ctx;
+
+ // Top row: 2 - 4
+ if (row == 0) return 2 + ctx;
+
+ // Left column: 5 - 7
+ if (col == 0) return 5 + ctx;
+
+ // others: 8 - 11
+ return 8 + ctx;
+}
+
+static const int sig_ref_offset[11][2] = {
+ { -2, -1 }, { -2, 0 }, { -2, 1 }, { -1, -2 }, { -1, -1 }, { -1, 0 },
+ { -1, 1 }, { 0, -2 }, { 0, -1 }, { 1, -2 }, { 1, -1 },
+};
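+// The 11 nz-map neighbors cluster above and to the left of the current
+// coefficient 'o'; txb_mask gates which of them have already been coded
+// along the scan order:
+//     . x x x .
+//     x x x x .
+//     x x o . .
+//     x x . . .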
+
+static INLINE int get_nz_map_ctx(const tran_low_t *tcoeffs,
+ const uint8_t *txb_mask,
+ const int c, // raster order
+ const int bwl) {
+ const int row = c >> bwl;
+ const int col = c - (row << bwl);
+ int ctx = 0;
+ int idx;
+ int stride = 1 << bwl;
+
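+  // The returned context encodes the position class, as the early returns
+  // and asserts show: 0 for DC, 1-2 and 3-4 for the two neighbors of DC,
+  // 5-7 for (1, 1), 8-10 for the rest of the top row, 11-13 for the rest of
+  // the left column, and 14-19 for interior positions.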
+ if (row == 0 && col == 0) return 0;
+
+ if (row == 0 && col == 1) return 1 + (tcoeffs[0] != 0);
+
+ if (row == 1 && col == 0) return 3 + (tcoeffs[0] != 0);
+
+ if (row == 1 && col == 1) {
+ int pos;
+ ctx = (tcoeffs[0] != 0);
+
+ if (txb_mask[1]) ctx += (tcoeffs[1] != 0);
+ pos = 1 << bwl;
+ if (txb_mask[pos]) ctx += (tcoeffs[pos] != 0);
+
+ ctx = (ctx + 1) >> 1;
+
+ assert(5 + ctx <= 7);
+
+ return 5 + ctx;
+ }
+
+ for (idx = 0; idx < 11; ++idx) {
+ int ref_row = row + sig_ref_offset[idx][0];
+ int ref_col = col + sig_ref_offset[idx][1];
+ int pos;
+
+ if (ref_row < 0 || ref_col < 0 || ref_row >= stride || ref_col >= stride)
+ continue;
+
+ pos = (ref_row << bwl) + ref_col;
+
+ if (txb_mask[pos]) ctx += (tcoeffs[pos] != 0);
+ }
+
+ if (row == 0) {
+ ctx = (ctx + 1) >> 1;
+
+ assert(ctx < 3);
+ return 8 + ctx;
+ }
+
+ if (col == 0) {
+ ctx = (ctx + 1) >> 1;
+
+ assert(ctx < 3);
+ return 11 + ctx;
+ }
+
+ ctx >>= 1;
+
+ assert(14 + ctx < 20);
+
+ return 14 + ctx;
+}
+
+static INLINE int get_eob_ctx(const tran_low_t *tcoeffs,
+ const int c, // raster order
+ const int bwl) {
+ (void)tcoeffs;
+ if (bwl == 2) return av1_coeff_band_4x4[c];
+ if (bwl == 3) return av1_coeff_band_8x8[c];
+ if (bwl == 4) return av1_coeff_band_16x16[c];
+ if (bwl == 5) return av1_coeff_band_32x32[c];
+
+ assert(0);
+ return 0;
+}
+
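+// The DC sign is packed into 'cul_level' above the cumulative-level bits:
+// (1 << COEFF_CONTEXT_BITS) marks a negative DC, (2 << COEFF_CONTEXT_BITS) a
+// positive one, and 0 a zero DC. get_txb_ctx() below unpacks the same field
+// from the entropy contexts.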
+static INLINE void set_dc_sign(int *cul_level, tran_low_t v) {
+ if (v < 0)
+ *cul_level |= 1 << COEFF_CONTEXT_BITS;
+ else if (v > 0)
+ *cul_level += 2 << COEFF_CONTEXT_BITS;
+}
+
+static INLINE int get_dc_sign_ctx(int dc_sign) {
+ int dc_sign_ctx = 0;
+ if (dc_sign < 0)
+ dc_sign_ctx = 1;
+ else if (dc_sign > 0)
+ dc_sign_ctx = 2;
+
+ return dc_sign_ctx;
+}
+
+static INLINE void get_txb_ctx(BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int plane, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l, TXB_CTX *txb_ctx) {
+ const int tx_size_in_blocks = 1 << tx_size;
+ int ctx_offset = (plane == 0) ? 0 : 7;
+ int k;
+
+ if (plane_bsize > txsize_to_bsize[tx_size]) ctx_offset += 3;
+
+ int dc_sign = 0;
+ for (k = 0; k < tx_size_in_blocks; ++k) {
+ int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS;
+ if (sign == 1)
+ --dc_sign;
+ else if (sign == 2)
+ ++dc_sign;
+ else if (sign != 0)
+ assert(0);
+
+ sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS;
+ if (sign == 1)
+ --dc_sign;
+ else if (sign == 2)
+ ++dc_sign;
+ else if (sign != 0)
+ assert(0);
+ }
+ txb_ctx->dc_sign_ctx = get_dc_sign_ctx(dc_sign);
+
+ if (plane == 0) {
+ int top = 0;
+ int left = 0;
+ for (k = 0; k < tx_size_in_blocks; ++k) {
+ top = AOMMAX(top, ((uint8_t)a[k] & COEFF_CONTEXT_MASK));
+ left = AOMMAX(left, ((uint8_t)l[k] & COEFF_CONTEXT_MASK));
+ }
+ top = AOMMIN(top, 255);
+ left = AOMMIN(left, 255);
+
+ if (plane_bsize == txsize_to_bsize[tx_size])
+ txb_ctx->txb_skip_ctx = 0;
+ else if (top == 0 && left == 0)
+ txb_ctx->txb_skip_ctx = 1;
+ else if (top == 0 || left == 0)
+ txb_ctx->txb_skip_ctx = 2 + (AOMMAX(top, left) > 3);
+ else if (AOMMAX(top, left) <= 3)
+ txb_ctx->txb_skip_ctx = 4;
+ else if (AOMMIN(top, left) <= 3)
+ txb_ctx->txb_skip_ctx = 5;
+ else
+ txb_ctx->txb_skip_ctx = 6;
+ } else {
+ int ctx_base = get_entropy_context(tx_size, a, l);
+ txb_ctx->txb_skip_ctx = ctx_offset + ctx_base;
+ }
+}
+
+void av1_adapt_txb_probs(AV1_COMMON *cm, unsigned int count_sat,
+ unsigned int update_factor);
+#endif // AV1_COMMON_TXB_COMMON_H_
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
new file mode 100644
index 000000000..9d13dc705
--- /dev/null
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -0,0 +1,1773 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+
+/* clang-format off */
+static const int error_measure_lut[512] = {
+ // pow 0.7
+ 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
+ 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
+ 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
+ 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
+ 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
+ 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
+ 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
+ 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
+ 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
+ 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
+ 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
+ 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
+ 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
+ 11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
+ 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
+ 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
+ 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666,
+ 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
+ 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
+ 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
+ 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
+ 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
+ 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
+ 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
+ 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
+ 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
+ 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
+ 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
+ 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
+ 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
+ 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
+ 1323, 1187, 1045, 894, 731, 550, 339, 0,
+ 339, 550, 731, 894, 1045, 1187, 1323, 1452,
+ 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
+ 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
+ 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
+ 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
+ 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
+ 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
+ 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
+ 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
+ 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
+ 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
+ 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
+ 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
+ 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
+ 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
+ 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113,
+ 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
+ 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
+ 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
+ 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
+ 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
+ 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
+ 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
+ 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
+ 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
+ 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
+ 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
+ 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
+ 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
+ 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
+ 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
+ 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
+};
+/* clang-format on */
+
+static ProjectPointsFunc get_project_points_type(TransformationType type) {
+ switch (type) {
+ case HOMOGRAPHY: return project_points_homography;
+ case AFFINE: return project_points_affine;
+ case ROTZOOM: return project_points_rotzoom;
+ case TRANSLATION: return project_points_translation;
+ default: assert(0); return NULL;
+ }
+}
+
+void project_points_translation(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const int x = *(points++), y = *(points++);
+ if (subsampling_x)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ ((x * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[0]),
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ ((x * (1 << WARPEDMODEL_PREC_BITS)) + mat[0]), WARPEDDIFF_PREC_BITS);
+ if (subsampling_y)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ ((y * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[1]),
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ ((y * (1 << WARPEDMODEL_PREC_BITS))) + mat[1], WARPEDDIFF_PREC_BITS);
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+void project_points_rotzoom(int32_t *mat, int *points, int *proj, const int n,
+ const int stride_points, const int stride_proj,
+ const int subsampling_x, const int subsampling_y) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const int x = *(points++), y = *(points++);
+ if (subsampling_x)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
+ WARPEDDIFF_PREC_BITS);
+ if (subsampling_y)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ -mat[3] * 2 * x + mat[2] * 2 * y + mat[1] +
+ (-mat[3] + mat[2] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(-mat[3] * x + mat[2] * y + mat[1],
+ WARPEDDIFF_PREC_BITS);
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+void project_points_affine(int32_t *mat, int *points, int *proj, const int n,
+ const int stride_points, const int stride_proj,
+ const int subsampling_x, const int subsampling_y) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const int x = *(points++), y = *(points++);
+ if (subsampling_x)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
+ WARPEDDIFF_PREC_BITS);
+ if (subsampling_y)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * x + mat[5] * 2 * y + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[4] * x + mat[5] * y + mat[1],
+ WARPEDDIFF_PREC_BITS);
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+void project_points_hortrapezoid(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y) {
+ int i;
+ int64_t x, y, Z;
+ int64_t xp, yp;
+ for (i = 0; i < n; ++i) {
+ x = *(points++), y = *(points++);
+ x = (subsampling_x ? 4 * x + 1 : 2 * x);
+ y = (subsampling_y ? 4 * y + 1 : 2 * y);
+
+ Z = (mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
+ xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) *
+ (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+ WARPEDMODEL_PREC_BITS));
+ yp = (mat[5] * y + 2 * mat[1]) *
+ (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+ WARPEDMODEL_PREC_BITS));
+
+ xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
+ yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
+
+ if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
+ if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
+ *(proj++) = (int)xp;
+ *(proj++) = (int)yp;
+
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+void project_points_vertrapezoid(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y) {
+ int i;
+ int64_t x, y, Z;
+ int64_t xp, yp;
+ for (i = 0; i < n; ++i) {
+ x = *(points++), y = *(points++);
+ x = (subsampling_x ? 4 * x + 1 : 2 * x);
+ y = (subsampling_y ? 4 * y + 1 : 2 * y);
+
+ Z = (mat[6] * x + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
+ xp = (mat[2] * x + 2 * mat[0]) *
+ (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+ WARPEDMODEL_PREC_BITS));
+ yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) *
+ (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+ WARPEDMODEL_PREC_BITS));
+
+ xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
+ yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
+
+ if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
+ if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
+ *(proj++) = (int)xp;
+ *(proj++) = (int)yp;
+
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
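+// project_points_homography applies the full 3x3 model. With mat[0..5] in
+// WARPEDMODEL_PREC_BITS precision and mat[6..7] in
+// WARPEDMODEL_ROW3HOMO_PREC_BITS precision, each point maps, in real terms, to
+//   x' = (mat[2] * x + mat[3] * y + mat[0]) / (mat[6] * x + mat[7] * y + 1)
+//   y' = (mat[4] * x + mat[5] * y + mat[1]) / (mat[6] * x + mat[7] * y + 1)
+// with the result scaled by 1 << WARPEDPIXEL_PREC_BITS.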
+void project_points_homography(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y) {
+ int i;
+ int64_t x, y, Z;
+ int64_t xp, yp;
+ for (i = 0; i < n; ++i) {
+ x = *(points++), y = *(points++);
+ x = (subsampling_x ? 4 * x + 1 : 2 * x);
+ y = (subsampling_y ? 4 * y + 1 : 2 * y);
+
+ Z = (mat[6] * x + mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
+ xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) *
+ (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+ WARPEDMODEL_PREC_BITS));
+ yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) *
+ (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+ WARPEDMODEL_PREC_BITS));
+
+ xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
+ yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
+
+ if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
+ if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
+ *(proj++) = (int)xp;
+ *(proj++) = (int)yp;
+
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+// 'points' are at the original scale; the output 'proj' values are scaled up
+// by 1 << WARPEDPIXEL_PREC_BITS.
+void project_points(WarpedMotionParams *wm_params, int *points, int *proj,
+ const int n, const int stride_points, const int stride_proj,
+ const int subsampling_x, const int subsampling_y) {
+ switch (wm_params->wmtype) {
+ case AFFINE:
+ project_points_affine(wm_params->wmmat, points, proj, n, stride_points,
+ stride_proj, subsampling_x, subsampling_y);
+ break;
+ case ROTZOOM:
+ project_points_rotzoom(wm_params->wmmat, points, proj, n, stride_points,
+ stride_proj, subsampling_x, subsampling_y);
+ break;
+ case HOMOGRAPHY:
+ project_points_homography(wm_params->wmmat, points, proj, n,
+ stride_points, stride_proj, subsampling_x,
+ subsampling_y);
+ break;
+ default: assert(0 && "Invalid warped motion type!"); return;
+ }
+}
+
+static const int16_t
+ filter_ntap[WARPEDPIXEL_PREC_SHIFTS][WARPEDPIXEL_FILTER_TAPS] = {
+#if WARPEDPIXEL_PREC_BITS == 6
+ { 0, 0, 128, 0, 0, 0 }, { 0, -1, 128, 2, -1, 0 },
+ { 1, -3, 127, 4, -1, 0 }, { 1, -4, 126, 6, -2, 1 },
+ { 1, -5, 126, 8, -3, 1 }, { 1, -6, 125, 11, -4, 1 },
+ { 1, -7, 124, 13, -4, 1 }, { 2, -8, 123, 15, -5, 1 },
+ { 2, -9, 122, 18, -6, 1 }, { 2, -10, 121, 20, -6, 1 },
+ { 2, -11, 120, 22, -7, 2 }, { 2, -12, 119, 25, -8, 2 },
+ { 3, -13, 117, 27, -8, 2 }, { 3, -13, 116, 29, -9, 2 },
+ { 3, -14, 114, 32, -10, 3 }, { 3, -15, 113, 35, -10, 2 },
+ { 3, -15, 111, 37, -11, 3 }, { 3, -16, 109, 40, -11, 3 },
+ { 3, -16, 108, 42, -12, 3 }, { 4, -17, 106, 45, -13, 3 },
+ { 4, -17, 104, 47, -13, 3 }, { 4, -17, 102, 50, -14, 3 },
+ { 4, -17, 100, 52, -14, 3 }, { 4, -18, 98, 55, -15, 4 },
+ { 4, -18, 96, 58, -15, 3 }, { 4, -18, 94, 60, -16, 4 },
+ { 4, -18, 91, 63, -16, 4 }, { 4, -18, 89, 65, -16, 4 },
+ { 4, -18, 87, 68, -17, 4 }, { 4, -18, 85, 70, -17, 4 },
+ { 4, -18, 82, 73, -17, 4 }, { 4, -18, 80, 75, -17, 4 },
+ { 4, -18, 78, 78, -18, 4 }, { 4, -17, 75, 80, -18, 4 },
+ { 4, -17, 73, 82, -18, 4 }, { 4, -17, 70, 85, -18, 4 },
+ { 4, -17, 68, 87, -18, 4 }, { 4, -16, 65, 89, -18, 4 },
+ { 4, -16, 63, 91, -18, 4 }, { 4, -16, 60, 94, -18, 4 },
+ { 3, -15, 58, 96, -18, 4 }, { 4, -15, 55, 98, -18, 4 },
+ { 3, -14, 52, 100, -17, 4 }, { 3, -14, 50, 102, -17, 4 },
+ { 3, -13, 47, 104, -17, 4 }, { 3, -13, 45, 106, -17, 4 },
+ { 3, -12, 42, 108, -16, 3 }, { 3, -11, 40, 109, -16, 3 },
+ { 3, -11, 37, 111, -15, 3 }, { 2, -10, 35, 113, -15, 3 },
+ { 3, -10, 32, 114, -14, 3 }, { 2, -9, 29, 116, -13, 3 },
+ { 2, -8, 27, 117, -13, 3 }, { 2, -8, 25, 119, -12, 2 },
+ { 2, -7, 22, 120, -11, 2 }, { 1, -6, 20, 121, -10, 2 },
+ { 1, -6, 18, 122, -9, 2 }, { 1, -5, 15, 123, -8, 2 },
+ { 1, -4, 13, 124, -7, 1 }, { 1, -4, 11, 125, -6, 1 },
+ { 1, -3, 8, 126, -5, 1 }, { 1, -2, 6, 126, -4, 1 },
+ { 0, -1, 4, 127, -3, 1 }, { 0, -1, 2, 128, -1, 0 },
+#elif WARPEDPIXEL_PREC_BITS == 5
+ { 0, 0, 128, 0, 0, 0 }, { 1, -3, 127, 4, -1, 0 },
+ { 1, -5, 126, 8, -3, 1 }, { 1, -7, 124, 13, -4, 1 },
+ { 2, -9, 122, 18, -6, 1 }, { 2, -11, 120, 22, -7, 2 },
+ { 3, -13, 117, 27, -8, 2 }, { 3, -14, 114, 32, -10, 3 },
+ { 3, -15, 111, 37, -11, 3 }, { 3, -16, 108, 42, -12, 3 },
+ { 4, -17, 104, 47, -13, 3 }, { 4, -17, 100, 52, -14, 3 },
+ { 4, -18, 96, 58, -15, 3 }, { 4, -18, 91, 63, -16, 4 },
+ { 4, -18, 87, 68, -17, 4 }, { 4, -18, 82, 73, -17, 4 },
+ { 4, -18, 78, 78, -18, 4 }, { 4, -17, 73, 82, -18, 4 },
+ { 4, -17, 68, 87, -18, 4 }, { 4, -16, 63, 91, -18, 4 },
+ { 3, -15, 58, 96, -18, 4 }, { 3, -14, 52, 100, -17, 4 },
+ { 3, -13, 47, 104, -17, 4 }, { 3, -12, 42, 108, -16, 3 },
+ { 3, -11, 37, 111, -15, 3 }, { 3, -10, 32, 114, -14, 3 },
+ { 2, -8, 27, 117, -13, 3 }, { 2, -7, 22, 120, -11, 2 },
+ { 1, -6, 18, 122, -9, 2 }, { 1, -4, 13, 124, -7, 1 },
+ { 1, -3, 8, 126, -5, 1 }, { 0, -1, 4, 127, -3, 1 },
+#endif // WARPEDPIXEL_PREC_BITS == 6
+ };
+
+static int32_t do_ntap_filter(int32_t *p, int x) {
+ int i;
+ int32_t sum = 0;
+ for (i = 0; i < WARPEDPIXEL_FILTER_TAPS; ++i) {
+ sum += p[i - WARPEDPIXEL_FILTER_TAPS / 2 + 1] * filter_ntap[x][i];
+ }
+ return sum;
+}
+
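+// do_cubic_filter evaluates a Catmull-Rom cubic through p[-1..2] at the
+// fractional position t = x / (1 << WARPEDPIXEL_PREC_BITS):
+//   f(t) = (2*p[0] + (p[1] - p[-1])*t
+//           + (2*p[-1] - 5*p[0] + 4*p[1] - p[2])*t^2
+//           + (3*(p[0] - p[1]) + p[2] - p[-1])*t^3) / 2
+// computed in integer arithmetic and scaled to WARPEDPIXEL_FILTER_BITS
+// precision.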
+static int32_t do_cubic_filter(int32_t *p, int x) {
+ if (x == 0) {
+ return p[0] * (1 << WARPEDPIXEL_FILTER_BITS);
+ } else if (x == (1 << WARPEDPIXEL_PREC_BITS)) {
+ return p[1] * (1 << WARPEDPIXEL_FILTER_BITS);
+ } else {
+ const int64_t v1 = (int64_t)x * x * x * (3 * (p[0] - p[1]) + p[2] - p[-1]);
+ const int64_t v2 =
+ (int64_t)x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]);
+ const int64_t v3 = x * (p[1] - p[-1]);
+ const int64_t v4 = 2 * p[0];
+ return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
+ (v4 * (1 << (3 * WARPEDPIXEL_PREC_BITS))) +
+ (v3 * (1 << (2 * WARPEDPIXEL_PREC_BITS))) +
+ (v2 * (1 << WARPEDPIXEL_PREC_BITS)) + v1,
+ 3 * WARPEDPIXEL_PREC_BITS + 1 - WARPEDPIXEL_FILTER_BITS);
+ }
+}
+
+static INLINE void get_subcolumn(int taps, uint8_t *ref, int32_t *col,
+ int stride, int x, int y_start) {
+ int i;
+ for (i = 0; i < taps; ++i) {
+ col[i] = ref[(i + y_start) * stride + x];
+ }
+}
+
+static uint8_t bi_ntap_filter(uint8_t *ref, int x, int y, int stride) {
+ int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
+ int k;
+ int i = (int)x >> WARPEDPIXEL_PREC_BITS;
+ int j = (int)y >> WARPEDPIXEL_PREC_BITS;
+ for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
+ int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
+ get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
+ i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
+ j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
+ arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+ y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
+ }
+ val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+ x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
+ val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+ return (uint8_t)clip_pixel(val);
+}
+
+static uint8_t bi_cubic_filter(uint8_t *ref, int x, int y, int stride) {
+ int32_t val, arr[4];
+ int k;
+ int i = (int)x >> WARPEDPIXEL_PREC_BITS;
+ int j = (int)y >> WARPEDPIXEL_PREC_BITS;
+ for (k = 0; k < 4; ++k) {
+ int32_t arr_temp[4];
+ get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
+ arr[k] =
+ do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
+ }
+ val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
+ val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+ return (uint8_t)clip_pixel(val);
+}
+
+static uint8_t bi_linear_filter(uint8_t *ref, int x, int y, int stride) {
+ const int ix = x >> WARPEDPIXEL_PREC_BITS;
+ const int iy = y >> WARPEDPIXEL_PREC_BITS;
+ const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+ const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
+ int32_t val;
+ val = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
+ (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
+ ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[(iy + 1) * stride + ix + 1] * sy * sx,
+ WARPEDPIXEL_PREC_BITS * 2);
+ return (uint8_t)clip_pixel(val);
+}
+
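+// warp_interpolate clamps samples that fall outside the reference frame to
+// the nearest edge or corner pixel, and otherwise picks the widest filter
+// the local support allows: the 6-tap filter in the interior, bicubic closer
+// to the border, and bilinear right at the border.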
+static uint8_t warp_interpolate(uint8_t *ref, int x, int y, int width,
+ int height, int stride) {
+ int ix = x >> WARPEDPIXEL_PREC_BITS;
+ int iy = y >> WARPEDPIXEL_PREC_BITS;
+ int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+ int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
+ int32_t v;
+
+ if (ix < 0 && iy < 0)
+ return ref[0];
+ else if (ix < 0 && iy >= height - 1)
+ return ref[(height - 1) * stride];
+ else if (ix >= width - 1 && iy < 0)
+ return ref[width - 1];
+ else if (ix >= width - 1 && iy >= height - 1)
+ return ref[(height - 1) * stride + (width - 1)];
+ else if (ix < 0) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+ ref[(iy + 1) * stride] * sy,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel(v);
+ } else if (iy < 0) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel(v);
+ } else if (ix >= width - 1) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+ ref[(iy + 1) * stride + width - 1] * sy,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel(v);
+ } else if (iy >= height - 1) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[(height - 1) * stride + ix + 1] * sx,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel(v);
+ } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+ iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+ ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
+ iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
+ return bi_ntap_filter(ref, x, y, stride);
+ } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
+ return bi_cubic_filter(ref, x, y, stride);
+ } else {
+ return bi_linear_filter(ref, x, y, stride);
+ }
+}
+
+// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
+// at a time. The zoom/rotation/shear in the model are applied to the
+// "fractional" position of each pixel, which therefore varies within
+// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
+// We need an extra 2 taps to fit this in, for a total of 8 taps.
+/* clang-format off */
+const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
+#if WARPEDPIXEL_PREC_BITS == 6
+ // [-1, 0)
+ { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
+ { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
+ { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 },
+ { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 },
+ { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 },
+ { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 },
+ { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 },
+ { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 },
+ { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 },
+ { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 },
+ { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 },
+ { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 },
+ { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 },
+ { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 },
+ { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 },
+ { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 },
+ { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 },
+ { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 },
+ { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 },
+ { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 },
+ { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 },
+ { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 },
+ { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 },
+ { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 },
+ { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 },
+ { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 },
+ { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 },
+ { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 },
+ { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 },
+ { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 },
+ { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 },
+ { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 },
+
+ // [0, 1)
+ { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0},
+ { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0},
+ { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1},
+ {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1},
+ {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1},
+ {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2},
+ {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2},
+ {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2},
+ {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2},
+ {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2},
+ {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2},
+ {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2},
+ {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2},
+ {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2},
+ {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1},
+ {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2},
+ {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1},
+ {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1},
+ { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0},
+ { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0},
+
+ // [1, 2)
+ { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 },
+ { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 },
+ { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 },
+ { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 },
+ { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 },
+ { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 },
+ { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 },
+ { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 },
+ { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 },
+ { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 },
+ { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 },
+ { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 },
+ { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 },
+ { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 },
+ { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 },
+ { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 },
+ { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 },
+ { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 },
+ { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 },
+ { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 },
+ { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 },
+ { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 },
+ { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 },
+ { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 },
+ { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 },
+ { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 },
+ { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 },
+ { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 },
+ { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 },
+ { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 },
+ { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 },
+ { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 },
+
+#elif WARPEDPIXEL_PREC_BITS == 5
+ // [-1, 0)
+ {0, 0, 127, 1, 0, 0, 0, 0}, {1, -3, 127, 4, -1, 0, 0, 0},
+ {1, -5, 126, 8, -3, 1, 0, 0}, {1, -7, 124, 13, -4, 1, 0, 0},
+ {2, -9, 122, 18, -6, 1, 0, 0}, {2, -11, 120, 22, -7, 2, 0, 0},
+ {3, -13, 117, 27, -8, 2, 0, 0}, {3, -14, 114, 32, -10, 3, 0, 0},
+ {3, -15, 111, 37, -11, 3, 0, 0}, {3, -16, 108, 42, -12, 3, 0, 0},
+ {4, -17, 104, 47, -13, 3, 0, 0}, {4, -17, 100, 52, -14, 3, 0, 0},
+ {4, -18, 96, 58, -15, 3, 0, 0}, {4, -18, 91, 63, -16, 4, 0, 0},
+ {4, -18, 87, 68, -17, 4, 0, 0}, {4, -18, 82, 73, -17, 4, 0, 0},
+ {4, -18, 78, 78, -18, 4, 0, 0}, {4, -17, 73, 82, -18, 4, 0, 0},
+ {4, -17, 68, 87, -18, 4, 0, 0}, {4, -16, 63, 91, -18, 4, 0, 0},
+ {3, -15, 58, 96, -18, 4, 0, 0}, {3, -14, 52, 100, -17, 4, 0, 0},
+ {3, -13, 47, 104, -17, 4, 0, 0}, {3, -12, 42, 108, -16, 3, 0, 0},
+ {3, -11, 37, 111, -15, 3, 0, 0}, {3, -10, 32, 114, -14, 3, 0, 0},
+ {2, -8, 27, 117, -13, 3, 0, 0}, {2, -7, 22, 120, -11, 2, 0, 0},
+ {1, -6, 18, 122, -9, 2, 0, 0}, {1, -4, 13, 124, -7, 1, 0, 0},
+ {1, -3, 8, 126, -5, 1, 0, 0}, {0, -1, 4, 127, -3, 1, 0, 0},
+ // [0, 1)
+ { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 1, -3, 127, 4, -2, 1, 0},
+ { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 3, -8, 125, 13, -5, 2, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -13, 121, 23, -8, 3, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1}, {-2, 6, -17, 116, 33, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 110, 43, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 102, 54, -17, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 94, 64, -19, 7, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -21, 84, 74, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 74, 84, -21, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 7, -19, 64, 94, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -17, 54, 102, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 43, 110, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 33, 116, -17, 6, -2},
+ {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 3, -8, 23, 121, -13, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 2, -5, 13, 125, -8, 3, -1},
+ { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 4, 127, -3, 1, 0},
+ // [1, 2)
+ {0, 0, 0, 1, 127, 0, 0, 0}, {0, 0, 1, -3, 127, 4, -1, 0},
+ {0, 0, 1, -5, 126, 8, -3, 1}, {0, 0, 1, -7, 124, 13, -4, 1},
+ {0, 0, 2, -9, 122, 18, -6, 1}, {0, 0, 2, -11, 120, 22, -7, 2},
+ {0, 0, 3, -13, 117, 27, -8, 2}, {0, 0, 3, -14, 114, 32, -10, 3},
+ {0, 0, 3, -15, 111, 37, -11, 3}, {0, 0, 3, -16, 108, 42, -12, 3},
+ {0, 0, 4, -17, 104, 47, -13, 3}, {0, 0, 4, -17, 100, 52, -14, 3},
+ {0, 0, 4, -18, 96, 58, -15, 3}, {0, 0, 4, -18, 91, 63, -16, 4},
+ {0, 0, 4, -18, 87, 68, -17, 4}, {0, 0, 4, -18, 82, 73, -17, 4},
+ {0, 0, 4, -18, 78, 78, -18, 4}, {0, 0, 4, -17, 73, 82, -18, 4},
+ {0, 0, 4, -17, 68, 87, -18, 4}, {0, 0, 4, -16, 63, 91, -18, 4},
+ {0, 0, 3, -15, 58, 96, -18, 4}, {0, 0, 3, -14, 52, 100, -17, 4},
+ {0, 0, 3, -13, 47, 104, -17, 4}, {0, 0, 3, -12, 42, 108, -16, 3},
+ {0, 0, 3, -11, 37, 111, -15, 3}, {0, 0, 3, -10, 32, 114, -14, 3},
+ {0, 0, 2, -8, 27, 117, -13, 3}, {0, 0, 2, -7, 22, 120, -11, 2},
+ {0, 0, 1, -6, 18, 122, -9, 2}, {0, 0, 1, -4, 13, 124, -7, 1},
+ {0, 0, 1, -3, 8, 126, -5, 1}, {0, 0, 0, -1, 4, 127, -3, 1},
+
+#endif // WARPEDPIXEL_PREC_BITS == 6
+
+ // dummy
+ { 0, 0, 0, 0, 1, 127, 0, 0 },
+};
+
+/* clang-format on */
+
+#define DIV_LUT_PREC_BITS 14
+#define DIV_LUT_BITS 8
+#define DIV_LUT_NUM (1 << DIV_LUT_BITS)
+
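+// Each entry holds the reciprocal of a value in [1, 2) in Q14 fixed point:
+// div_lut[f] is approximately (1 << 22) / (256 + f), so entry 0 is 16384
+// (= 1/1.0 in Q14) and entry 256 is 8192 (= 1/2.0 in Q14).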
+static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192,
+};
+
+static INLINE int16_t saturate_int16(int32_t v) {
+ if (v > 32767)
+ return 32767;
+ else if (v < -32768)
+ return -32768;
+ return v;
+}
+
+#if CONFIG_WARPED_MOTION
+// Decomposes a divisor D such that 1/D is approximately y/2^shift, where y
+// is returned at DIV_LUT_PREC_BITS precision, along with the shift.
+static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
+ int64_t e, f;
+ *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
+ : get_msb((unsigned int)D));
+ // e is obtained from D after resetting the most significant 1 bit.
+ e = D - ((uint64_t)1 << *shift);
+ // Get the most significant DIV_LUT_BITS (8) bits of e into f
+ if (*shift > DIV_LUT_BITS)
+ f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
+ else
+ f = e << (DIV_LUT_BITS - *shift);
+ assert(f <= DIV_LUT_NUM);
+ *shift += DIV_LUT_PREC_BITS;
+ // Use f as lookup into the precomputed table of multipliers
+ return div_lut[f];
+}
+#endif // CONFIG_WARPED_MOTION
+
+static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
+ int32_t e, f;
+ *shift = get_msb(D);
+ // e is obtained from D after resetting the most significant 1 bit.
+ e = D - ((uint32_t)1 << *shift);
+ // Get the most significant DIV_LUT_BITS (8) bits of e into f
+ if (*shift > DIV_LUT_BITS)
+ f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
+ else
+ f = e << (DIV_LUT_BITS - *shift);
+ assert(f <= DIV_LUT_NUM);
+ *shift += DIV_LUT_PREC_BITS;
+ // Use f as lookup into the precomputed table of multipliers
+ return div_lut[f];
+}
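+// Illustrative use (a sketch, not part of the library): given the pair
+// (y, shift) produced above, an integer division n / D can be approximated
+// by a multiply and a rounded shift:
+//
+//   int16_t shift;
+//   int16_t y = resolve_divisor_32(D, &shift);  // assumes D > 0
+//   int32_t quotient =
+//       (int32_t)ROUND_POWER_OF_TWO_SIGNED_64((int64_t)n * y, shift);
+//
+// get_shear_params() and find_affine_int() below use this exact pattern to
+// divide by mat[2] and by the least-squares determinant, respectively.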
+
+static int is_affine_valid(WarpedMotionParams *wm) {
+ const int32_t *mat = wm->wmmat;
+ return (mat[2] > 0);
+}
+
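+// The shear filters can only represent fractional offsets of at most +/-1
+// pixel: the horizontal pass evaluates offsets alpha * l + beta * k with
+// l in [-4, 3] and k in [-7, 7], while the vertical pass evaluates
+// gamma * l + delta * k with l, k in [-4, 3]. This yields the 4/7 and 4/4
+// weightings below, in units of (1 << WARPEDMODEL_PREC_BITS).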
+static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+ if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
+ (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
+ return 0;
+ else
+ return 1;
+}
+
+// Returns 1 on success or 0 on an invalid affine set
+int get_shear_params(WarpedMotionParams *wm) {
+ const int32_t *mat = wm->wmmat;
+ if (!is_affine_valid(wm)) return 0;
+ wm->alpha =
+ clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
+ wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
+ int16_t shift;
+ int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
+ int64_t v;
+ v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
+ wm->gamma =
+ clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
+ v = ((int64_t)mat[3] * mat[4]) * y;
+ wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
+ (1 << WARPEDMODEL_PREC_BITS),
+ INT16_MIN, INT16_MAX);
+ if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
+ return 0;
+ return 1;
+}
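+// Consistency check for the factorization used above (real-valued, with
+// a, b, c, d = wmmat[2..5] scaled by 2^-WARPEDMODEL_PREC_BITS):
+//   [ a b ]   [   1      0     ]   [ 1+alpha  beta ]
+//   [ c d ] = [ gamma  1+delta ] * [    0      1   ]
+// expands to row 0 = (1+alpha, beta) and
+// row 1 = (gamma*(1+alpha), gamma*beta + 1+delta), which is solved by
+//   alpha = a - 1, beta = b, gamma = c / a, delta = d - b*c/a - 1,
+// matching the fixed-point expressions in get_shear_params().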
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE void highbd_get_subcolumn(int taps, uint16_t *ref, int32_t *col,
+ int stride, int x, int y_start) {
+ int i;
+ for (i = 0; i < taps; ++i) {
+ col[i] = ref[(i + y_start) * stride + x];
+ }
+}
+
+static uint16_t highbd_bi_ntap_filter(uint16_t *ref, int x, int y, int stride,
+ int bd) {
+ int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
+ int k;
+ int i = (int)x >> WARPEDPIXEL_PREC_BITS;
+ int j = (int)y >> WARPEDPIXEL_PREC_BITS;
+ for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
+ int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
+ highbd_get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
+ i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
+ j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
+ arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+ y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
+ }
+ val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+ x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
+ val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+ return (uint16_t)clip_pixel_highbd(val, bd);
+}
+
+static uint16_t highbd_bi_cubic_filter(uint16_t *ref, int x, int y, int stride,
+ int bd) {
+ int32_t val, arr[4];
+ int k;
+ int i = (int)x >> WARPEDPIXEL_PREC_BITS;
+ int j = (int)y >> WARPEDPIXEL_PREC_BITS;
+ for (k = 0; k < 4; ++k) {
+ int32_t arr_temp[4];
+ highbd_get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
+ arr[k] =
+ do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
+ }
+ val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
+ val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+ return (uint16_t)clip_pixel_highbd(val, bd);
+}
+
+static uint16_t highbd_bi_linear_filter(uint16_t *ref, int x, int y, int stride,
+ int bd) {
+ const int ix = x >> WARPEDPIXEL_PREC_BITS;
+ const int iy = y >> WARPEDPIXEL_PREC_BITS;
+ const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+ const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
+ int32_t val;
+ val = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
+ (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
+ ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[(iy + 1) * stride + ix + 1] * sy * sx,
+ WARPEDPIXEL_PREC_BITS * 2);
+ return (uint16_t)clip_pixel_highbd(val, bd);
+}
+
+static uint16_t highbd_warp_interpolate(uint16_t *ref, int x, int y, int width,
+ int height, int stride, int bd) {
+ int ix = x >> WARPEDPIXEL_PREC_BITS;
+ int iy = y >> WARPEDPIXEL_PREC_BITS;
+ int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+ int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
+ int32_t v;
+
+ if (ix < 0 && iy < 0)
+ return ref[0];
+ else if (ix < 0 && iy > height - 1)
+ return ref[(height - 1) * stride];
+ else if (ix > width - 1 && iy < 0)
+ return ref[width - 1];
+ else if (ix > width - 1 && iy > height - 1)
+ return ref[(height - 1) * stride + (width - 1)];
+ else if (ix < 0) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+ ref[(iy + 1) * stride] * sy,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel_highbd(v, bd);
+ } else if (iy < 0) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel_highbd(v, bd);
+ } else if (ix > width - 1) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+ ref[(iy + 1) * stride + width - 1] * sy,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel_highbd(v, bd);
+ } else if (iy > height - 1) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[(height - 1) * stride + ix + 1] * sx,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel_highbd(v, bd);
+ } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+ iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+ ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
+ iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
+ return highbd_bi_ntap_filter(ref, x, y, stride, bd);
+ } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
+ return highbd_bi_cubic_filter(ref, x, y, stride, bd);
+ } else {
+ return highbd_bi_linear_filter(ref, x, y, stride, bd);
+ }
+}
+
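+// Evaluates the 8-bit error_measure_lut at bit depth bd by linear
+// interpolation: the top bits of |err| select two adjacent LUT entries and
+// the low (bd - 8) bits blend between them.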
+static INLINE int highbd_error_measure(int err, int bd) {
+ const int b = bd - 8;
+ const int bmask = (1 << b) - 1;
+ const int v = (1 << b);
+ int e1, e2;
+ err = abs(err);
+ e1 = err >> b;
+ e2 = err & bmask;
+ return error_measure_lut[255 + e1] * (v - e2) +
+ error_measure_lut[256 + e1] * e2;
+}
+
+static void highbd_warp_plane_old(WarpedMotionParams *wm, uint8_t *ref8,
+ int width, int height, int stride,
+ uint8_t *pred8, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ int x_scale, int y_scale, int bd,
+ int ref_frm) {
+ int i, j;
+ ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ if (projectpoints == NULL) return;
+ for (i = p_row; i < p_row + p_height; ++i) {
+ for (j = p_col; j < p_col + p_width; ++j) {
+ int in[2], out[2];
+ in[0] = j;
+ in[1] = i;
+ projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
+ out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
+ out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
+ if (ref_frm)
+ pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
+ pred[(j - p_col) + (i - p_row) * p_stride] +
+ highbd_warp_interpolate(ref, out[0], out[1], width, height,
+ stride, bd),
+ 1);
+ else
+ pred[(j - p_col) + (i - p_row) * p_stride] = highbd_warp_interpolate(
+ ref, out[0], out[1], width, height, stride, bd);
+ }
+ }
+}
+
+// Note: For an explanation of the warp algorithm, see the comment
+// above warp_plane()
+//
+// Note also: The "worst case" in terms of modulus of the data stored into 'tmp'
+// (ie, the result of 'sum' in the horizontal filter) occurs when:
+// coeffs = { -2, 8, -22, 87, 72, -21, 8, -2}, and
+// ref    = { 0, 4095, 0, 4095, 4095, 0, 4095, 0} (i.e. maximum 12-bit samples)
+// Before rounding, this gives sum = (8 + 87 + 72 + 8) * 4095 = 716625. After
+// rounding,
+// HORSHEAR_REDUCE_PREC_BITS = 4 => sum = 44789 > 2^15
+// HORSHEAR_REDUCE_PREC_BITS = 5 => sum = 22395 < 2^15
+//
+// So, as long as HORSHEAR_REDUCE_PREC_BITS >= 5, we can safely use a 16-bit
+// intermediate array.
+void av1_highbd_warp_affine_c(int32_t *mat, uint16_t *ref, int width,
+ int height, int stride, uint16_t *pred, int p_col,
+ int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x,
+ int subsampling_y, int bd, int ref_frm,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+#if HORSHEAR_REDUCE_PREC_BITS >= 5
+ int16_t tmp[15 * 8];
+#else
+ int32_t tmp[15 * 8];
+#endif
+ int i, j, k, l, m;
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = p_row; i < p_row + p_height; i += 8) {
+ for (j = p_col; j < p_col + p_width; j += 8) {
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Horizontal filter
+ for (k = -7; k < 8; ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ if (ix4 <= -7) {
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else if (ix4 >= width + 6) {
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else {
+ int sx = sx4 + alpha * (-4) + beta * k;
+
+ for (l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += ref[iy * stride + ix + m] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
+#if HORSHEAR_REDUCE_PREC_BITS >= 5
+ tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+#else
+ tmp[(k + 7) * 8 + (l + 4)] = sum;
+#endif
+ sx += alpha;
+ }
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k;
+ for (l = -4; l < 4; ++l) {
+ uint16_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
+ sum = clip_pixel_highbd(
+ ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS), bd);
+ if (ref_frm)
+ *p = ROUND_POWER_OF_TWO(*p + sum, 1);
+ else
+ *p = sum;
+ sy += gamma;
+ }
+ }
+ }
+ }
+}
+
+static void highbd_warp_plane(WarpedMotionParams *wm, uint8_t *ref8, int width,
+ int height, int stride, uint8_t *pred8, int p_col,
+ int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x,
+ int subsampling_y, int x_scale, int y_scale,
+ int bd, int ref_frm) {
+ if (wm->wmtype == ROTZOOM) {
+ wm->wmmat[5] = wm->wmmat[2];
+ wm->wmmat[4] = -wm->wmmat[3];
+ }
+ if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && x_scale == 16 &&
+ y_scale == 16) {
+ int32_t *mat = wm->wmmat;
+ const int16_t alpha = wm->alpha;
+ const int16_t beta = wm->beta;
+ const int16_t gamma = wm->gamma;
+ const int16_t delta = wm->delta;
+
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, bd, ref_frm, alpha, beta, gamma,
+ delta);
+ } else {
+ highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, x_scale, y_scale, bd, ref_frm);
+ }
+}
+
+static double highbd_warp_erroradv(WarpedMotionParams *wm, uint8_t *ref8,
+ int width, int height, int stride,
+ uint8_t *dst8, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ int x_scale, int y_scale, int bd) {
+ int gm_err = 0, no_gm_err = 0;
+ int64_t gm_sumerr = 0, no_gm_sumerr = 0;
+ int i, j;
+ uint16_t *tmp = aom_malloc(p_width * p_height * sizeof(*tmp));
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_warp_plane(wm, ref8, width, height, stride, CONVERT_TO_BYTEPTR(tmp),
+ p_col, p_row, p_width, p_height, p_width, subsampling_x,
+ subsampling_y, x_scale, y_scale, bd, 0);
+ for (i = 0; i < p_height; ++i) {
+ for (j = 0; j < p_width; ++j) {
+ gm_err = dst[j + i * p_stride] - tmp[j + i * p_width];
+ no_gm_err =
+ dst[j + i * p_stride] - ref[(j + p_col) + (i + p_row) * stride];
+ gm_sumerr += highbd_error_measure(gm_err, bd);
+ no_gm_sumerr += highbd_error_measure(no_gm_err, bd);
+ }
+ }
+ aom_free(tmp);
+ return (double)gm_sumerr / no_gm_sumerr;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+static INLINE int error_measure(int err) {
+ return error_measure_lut[255 + err];
+}
+
+static void warp_plane_old(WarpedMotionParams *wm, uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int x_scale,
+ int y_scale, int ref_frm) {
+ int i, j;
+ ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
+ if (projectpoints == NULL) return;
+ for (i = p_row; i < p_row + p_height; ++i) {
+ for (j = p_col; j < p_col + p_width; ++j) {
+ int in[2], out[2];
+ in[0] = j;
+ in[1] = i;
+ projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
+ out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
+ out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
+ if (ref_frm)
+ pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
+ pred[(j - p_col) + (i - p_row) * p_stride] +
+ warp_interpolate(ref, out[0], out[1], width, height, stride),
+ 1);
+ else
+ pred[(j - p_col) + (i - p_row) * p_stride] =
+ warp_interpolate(ref, out[0], out[1], width, height, stride);
+ }
+ }
+}
+
+/* The warp filter for ROTZOOM and AFFINE models works as follows:
+ * Split the input into 8x8 blocks
+ * For each block, project the point (4, 4) within the block, to get the
+ overall block position. Split into integer and fractional coordinates,
+ maintaining full WARPEDMODEL precision
+ * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
+ variable horizontal offset. This means that, while the rows of the
+ intermediate buffer align with the rows of the *reference* image, the
+ columns align with the columns of the *destination* image.
+ * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
+ destination is too small we crop the output at this stage). Each pixel has
+ a variable vertical offset, so that the resulting rows are aligned with
+ the rows of the destination image.
+
+ To accomplish these alignments, we factor the warp matrix as a
+ product of two shear / asymmetric zoom matrices:
+ / a b \  =  /   1       0    \ * / 1+alpha  beta \
+ \ c d /     \ gamma  1+delta /   \    0      1   /
+ where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
+ The second shear (with alpha and beta) is applied by the horizontal filter,
+ then the first shear (with gamma and delta) is applied by the vertical
+ filter.
+
+ The only limitation is that, to fit this in a fixed 8-tap filter size,
+ the fractional pixel offsets must be at most +-1. Since the horizontal filter
+ generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
+ within the block, the parameters must satisfy
+ 4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 4 * |delta| <= 1
+ for this filter to be applicable.
+
+ Note: warp_affine() assumes that the caller has done all of the relevant
+ checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
+ are set appropriately (if using a ROTZOOM model), and that alpha, beta,
+ gamma, delta are all in range.
+
+ TODO(david.barker): Maybe support scaled references?
+*/
+void av1_warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
+ int stride, uint8_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int ref_frm,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+ int16_t tmp[15 * 8];
+ int i, j, k, l, m;
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = p_row; i < p_row + p_height; i += 8) {
+ for (j = p_col; j < p_col + p_width; j += 8) {
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Horizontal filter
+ for (k = -7; k < 8; ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ if (ix4 <= -7) {
+ // In this case, the rightmost pixel sampled is in column
+ // ix4 + 3 + 7 - 3 = ix4 + 7 <= 0, ie. the entire block
+ // will sample only from the leftmost column
+ // (once border extension is taken into account)
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else if (ix4 >= width + 6) {
+ // In this case, the leftmost pixel sampled is in column
+ // ix4 - 4 + 0 - 3 = ix4 - 7 >= width - 1, ie. the entire block
+ // will sample only from the rightmost column
+ // (once border extension is taken into account)
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else {
+ // If we get here, then
+ // the leftmost pixel sampled is
+ // ix4 - 4 + 0 - 3 = ix4 - 7 >= -13
+ // and the rightmost pixel sampled is at most
+ // ix4 + 3 + 7 - 3 = ix4 + 7 <= width + 12
+ // So, assuming that border extension has been done, we
+ // don't need to explicitly clamp values.
+ int sx = sx4 + alpha * (-4) + beta * k;
+
+ for (l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ // At this point, sx = sx4 + alpha * l + beta * k
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += ref[iy * stride + ix + m] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
+ tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+ sx += alpha;
+ }
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k;
+ for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+ uint8_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ // At this point, sy = sy4 + gamma * l + delta * k
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
+ sum = clip_pixel(ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS));
+ if (ref_frm)
+ *p = ROUND_POWER_OF_TWO(*p + sum, 1);
+ else
+ *p = sum;
+ sy += gamma;
+ }
+ }
+ }
+ }
+}
+
+static void warp_plane(WarpedMotionParams *wm, uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int x_scale,
+ int y_scale, int ref_frm) {
+ if (wm->wmtype == ROTZOOM) {
+ wm->wmmat[5] = wm->wmmat[2];
+ wm->wmmat[4] = -wm->wmmat[3];
+ }
+ if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && x_scale == 16 &&
+ y_scale == 16) {
+ int32_t *mat = wm->wmmat;
+ const int16_t alpha = wm->alpha;
+ const int16_t beta = wm->beta;
+ const int16_t gamma = wm->gamma;
+ const int16_t delta = wm->delta;
+
+ av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x, subsampling_y,
+ ref_frm, alpha, beta, gamma, delta);
+ } else {
+ warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+ p_height, p_stride, subsampling_x, subsampling_y, x_scale,
+ y_scale, ref_frm);
+ }
+}
+
+static double warp_erroradv(WarpedMotionParams *wm, uint8_t *ref, int width,
+ int height, int stride, uint8_t *dst, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int x_scale,
+ int y_scale) {
+ int gm_err = 0, no_gm_err = 0;
+ int64_t gm_sumerr = 0, no_gm_sumerr = 0;  // 64-bit, as in the highbd path
+ int i, j;
+ uint8_t *tmp = aom_malloc(p_width * p_height);
+ warp_plane(wm, ref, width, height, stride, tmp, p_col, p_row, p_width,
+ p_height, p_width, subsampling_x, subsampling_y, x_scale, y_scale,
+ 0);
+
+ for (i = 0; i < p_height; ++i) {
+ for (j = 0; j < p_width; ++j) {
+ gm_err = dst[j + i * p_stride] - tmp[j + i * p_width];
+ no_gm_err =
+ dst[j + i * p_stride] - ref[(j + p_col) + (i + p_row) * stride];
+ gm_sumerr += error_measure(gm_err);
+ no_gm_sumerr += error_measure(no_gm_err);
+ }
+ }
+
+ aom_free(tmp);
+ return (double)gm_sumerr / no_gm_sumerr;
+}
+
+double av1_warp_erroradv(WarpedMotionParams *wm,
+#if CONFIG_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_HIGHBITDEPTH
+ uint8_t *ref, int width, int height, int stride,
+ uint8_t *dst, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int x_scale, int y_scale) {
+ if (wm->wmtype <= AFFINE)
+ if (!get_shear_params(wm)) return 1;
+#if CONFIG_HIGHBITDEPTH
+ if (use_hbd)
+ return highbd_warp_erroradv(
+ wm, ref, width, height, stride, dst, p_col, p_row, p_width, p_height,
+ p_stride, subsampling_x, subsampling_y, x_scale, y_scale, bd);
+#endif // CONFIG_HIGHBITDEPTH
+ return warp_erroradv(wm, ref, width, height, stride, dst, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, x_scale, y_scale);
+}
+
+void av1_warp_plane(WarpedMotionParams *wm,
+#if CONFIG_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_HIGHBITDEPTH
+ uint8_t *ref, int width, int height, int stride,
+ uint8_t *pred, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int x_scale, int y_scale, int ref_frm) {
+#if CONFIG_HIGHBITDEPTH
+ if (use_hbd)
+ highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x, subsampling_y,
+ x_scale, y_scale, bd, ref_frm);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+ p_height, p_stride, subsampling_x, subsampling_y, x_scale,
+ y_scale, ref_frm);
+}
+
+#if CONFIG_WARPED_MOTION
+#define LEAST_SQUARES_ORDER 2
+
+#define LS_MV_MAX 256 // max mv in 1/8-pel
+#define LS_STEP 2
+
+// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
+// the precision needed is:
+// (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
+// (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
+// 1 [for sign] +
+// LEAST_SQUARES_SAMPLES_MAX_BITS
+// [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
+// With MAX_SB_SIZE_LOG2 == 6 this sums to 9 + 10 + 1 + 3 = 23
+#define LS_MAT_RANGE_BITS \
+ ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
+
+// Bit-depth reduction from the full-range
+#define LS_MAT_DOWN_BITS 2
+
+// Bit range of A, Bx and By after downshifting
+#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
+#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
+#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
+
+#define LS_SUM(a) ((a)*4 + LS_STEP * 2)
+#define LS_SQUARE(a) \
+ (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
+#define LS_PRODUCT1(a, b) \
+ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> 2)
+#define LS_PRODUCT2(a, b) \
+ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
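+// These macros average each least-squares statistic over the sample offsets
+// {0, LS_STEP}: LS_SQUARE(a) equals (a^2 + (a + LS_STEP)^2) / 2,
+// LS_PRODUCT1(a, b) averages (a + r) * (b + s) over all four (r, s) in
+// {0, LS_STEP}^2, and LS_PRODUCT2 averages only the matched offsets
+// (r == s). (Derived from the expansions above; the intent is presumably to
+// reduce bias from the integer sample grid.)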
+
+#if LEAST_SQUARES_ORDER == 2
+static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
+ int mvy, int mvx, WarpedMotionParams *wm, int mi_row,
+ int mi_col) {
+ int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
+ int32_t Bx[2] = { 0, 0 };
+ int32_t By[2] = { 0, 0 };
+ int i, n = 0;
+
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int suy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1) * 8;
+ const int sux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1) * 8;
+ const int duy = suy + mvy;
+ const int dux = sux + mvx;
+
+ // Assume the center pixel of the block has exactly the same motion vector
+ // as transmitted for the block. First shift the origin of the source
+ // points to the block center, and the origin of the destination points to
+ // the block center offset by the transmitted motion vector.
+ // Let (xi, yi) denote the source points and (xi', yi') denote the
+ // destination points after origin shifting, for i = 0, 1, 2, ..., n-1.
+ // Then if P = [x0, y0,
+ //              x1, y1,
+ //              x2, y2,
+ // ....
+ // ]
+ // q = [x0', x1', x2', ... ]'
+ // r = [y0', y1', y2', ... ]'
+ // the least squares problems that need to be solved are:
+ // [h1, h2]' = inv(P'P)P'q and
+ // [h3, h4]' = inv(P'P)P'r
+ // where the affine transformation is given by:
+ // x' = h1.x + h2.y
+ // y' = h3.x + h4.y
+ //
+ // The loop below computes: A = P'P, Bx = P'q, By = P'r
+ // We need to just compute inv(A).Bx and inv(A).By for the solutions.
+ int sx, sy, dx, dy;
+ // Contribution from neighbor block
+ for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) {
+ dx = pts2[i * 2] - dux;
+ dy = pts2[i * 2 + 1] - duy;
+ sx = pts1[i * 2] - sux;
+ sy = pts1[i * 2 + 1] - suy;
+ if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+ A[0][0] += LS_SQUARE(sx);
+ A[0][1] += LS_PRODUCT1(sx, sy);
+ A[1][1] += LS_SQUARE(sy);
+ Bx[0] += LS_PRODUCT2(sx, dx);
+ Bx[1] += LS_PRODUCT1(sy, dx);
+ By[0] += LS_PRODUCT1(sx, dy);
+ By[1] += LS_PRODUCT2(sy, dy);
+ n++;
+ }
+ }
+ int downshift;
+ if (n >= 4)
+ downshift = LS_MAT_DOWN_BITS;
+ else if (n >= 2)
+ downshift = LS_MAT_DOWN_BITS - 1;
+ else
+ downshift = LS_MAT_DOWN_BITS - 2;
+
+ // Reduce precision by downshift bits
+ A[0][0] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][0], downshift), LS_MAT_MIN,
+ LS_MAT_MAX);
+ A[0][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][1], downshift), LS_MAT_MIN,
+ LS_MAT_MAX);
+ A[1][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[1][1], downshift), LS_MAT_MIN,
+ LS_MAT_MAX);
+ Bx[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[0], downshift), LS_MAT_MIN,
+ LS_MAT_MAX);
+ Bx[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[1], downshift), LS_MAT_MIN,
+ LS_MAT_MAX);
+ By[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[0], downshift), LS_MAT_MIN,
+ LS_MAT_MAX);
+ By[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[1], downshift), LS_MAT_MIN,
+ LS_MAT_MAX);
+
+ int64_t Px[2], Py[2], Det;
+ int16_t iDet, shift;
+
+ // These, divided by Det, are the least squares solutions
+ Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
+ Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
+ Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
+ Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
+
+ // Compute Determinant of A
+ Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
+ if (Det == 0) return 1;
+ iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
+ shift -= WARPEDMODEL_PREC_BITS;
+ if (shift < 0) {
+ iDet <<= (-shift);
+ shift = 0;
+ }
+
+ int64_t v;
+ v = Px[0] * (int64_t)iDet;
+ wm->wmmat[2] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
+ v = Px[1] * (int64_t)iDet;
+ wm->wmmat[3] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
+ v = ((int64_t)dux * (1 << WARPEDMODEL_PREC_BITS)) -
+ (int64_t)sux * wm->wmmat[2] - (int64_t)suy * wm->wmmat[3];
+ wm->wmmat[0] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3));
+
+ v = Py[0] * (int64_t)iDet;
+ wm->wmmat[4] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
+ v = Py[1] * (int64_t)iDet;
+ wm->wmmat[5] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
+ v = ((int64_t)duy * (1 << WARPEDMODEL_PREC_BITS)) -
+ (int64_t)sux * wm->wmmat[4] - (int64_t)suy * wm->wmmat[5];
+ wm->wmmat[1] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3));
+
+ wm->wmmat[6] = wm->wmmat[7] = 0;
+
+ // Clamp values
+ wm->wmmat[0] = clamp(wm->wmmat[0], -WARPEDMODEL_TRANS_CLAMP,
+ WARPEDMODEL_TRANS_CLAMP - 1);
+ wm->wmmat[1] = clamp(wm->wmmat[1], -WARPEDMODEL_TRANS_CLAMP,
+ WARPEDMODEL_TRANS_CLAMP - 1);
+ wm->wmmat[2] = clamp(wm->wmmat[2], -WARPEDMODEL_DIAGAFFINE_CLAMP,
+ WARPEDMODEL_DIAGAFFINE_CLAMP - 1);
+ wm->wmmat[5] = clamp(wm->wmmat[5], -WARPEDMODEL_DIAGAFFINE_CLAMP,
+ WARPEDMODEL_DIAGAFFINE_CLAMP - 1);
+ wm->wmmat[3] = clamp(wm->wmmat[3], -WARPEDMODEL_NONDIAGAFFINE_CLAMP,
+ WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+ wm->wmmat[4] = clamp(wm->wmmat[4], -WARPEDMODEL_NONDIAGAFFINE_CLAMP,
+ WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+ return 0;
+}
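+// Note: Px and Py above are adj(A) * Bx and adj(A) * By, the Cramer's-rule
+// numerators of the 2x2 system, so [h1 h2]' = Px / Det and
+// [h3 h4]' = Py / Det; the division by Det is carried out with
+// resolve_divisor_64() and a rounded shift, as described earlier.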
+
+#else
+
+static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
+ int mvy, int mvx, WarpedMotionParams *wm, int mi_row,
+ int mi_col) {
+ int32_t A[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };
+ int32_t Bx[3] = { 0, 0, 0 };
+ int32_t By[3] = { 0, 0, 0 };
+ int i, n = 0, off;
+
+ int64_t C00, C01, C02, C11, C12, C22;
+ int64_t Px[3], Py[3];
+ int64_t Det, v;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int cy_offset = AOMMAX(bh, MI_SIZE) / 2 - 1;
+ const int cx_offset = AOMMAX(bw, MI_SIZE) / 2 - 1;
+
+ // Offsets to make the values in the arrays smaller
+ const int ux = mi_col * MI_SIZE * 8, uy = mi_row * MI_SIZE * 8;
+ // Let source points (xi, yi) map to destination points (xi', yi'),
+ // for i = 0, 1, 2, ..., n-1.
+ // Then if P = [x0, y0, 1,
+ // x1, y1, 1
+ // x2, y2, 1,
+ // ....
+ // ]
+ // q = [x0', x1', x2', ... ]'
+ // r = [y0', y1', y2', ... ]'
+ // the least squares problems that need to be solved are:
+ // [h1, h2, dx]' = inv(P'P)P'q and
+ // [h3, h4, dy]' = inv(P'P)P'r
+ // where the affine transformation is given by:
+ // x' = h1.x + h2.y + dx
+ // y' = h3.x + h4.y + dy
+ //
+ // The loop below computes: A = P'P, Bx = P'q, By = P'r
+ // We need to just compute inv(A).Bx and inv(A).By for the solutions.
+ //
+ int sx, sy, dx, dy;
+ // Contribution from sample in current block
+ sx = cx_offset * 8;
+ sy = cy_offset * 8;
+ dx = sx + mvx;
+ dy = sy + mvy;
+ if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+ A[0][0] += LS_SQUARE(sx);
+ A[0][1] += LS_PRODUCT1(sx, sy);
+ A[0][2] += LS_SUM(sx);
+ A[1][1] += LS_SQUARE(sy);
+ A[1][2] += LS_SUM(sy);
+ A[2][2] += 4;
+ Bx[0] += LS_PRODUCT2(sx, dx);
+ Bx[1] += LS_PRODUCT1(sy, dx);
+ Bx[2] += LS_SUM(dx);
+ By[0] += LS_PRODUCT1(sx, dy);
+ By[1] += LS_PRODUCT2(sy, dy);
+ By[2] += LS_SUM(dy);
+ n++;
+ }
+ // Contribution from neighbor block
+ for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) {
+ dx = pts2[i * 2] - ux;
+ dy = pts2[i * 2 + 1] - uy;
+ sx = pts1[i * 2] - ux;
+ sy = pts1[i * 2 + 1] - uy;
+ if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+ A[0][0] += LS_SQUARE(sx);
+ A[0][1] += LS_PRODUCT1(sx, sy);
+ A[0][2] += LS_SUM(sx);
+ A[1][1] += LS_SQUARE(sy);
+ A[1][2] += LS_SUM(sy);
+ A[2][2] += 4;
+ Bx[0] += LS_PRODUCT2(sx, dx);
+ Bx[1] += LS_PRODUCT1(sy, dx);
+ Bx[2] += LS_SUM(dx);
+ By[0] += LS_PRODUCT1(sx, dy);
+ By[1] += LS_PRODUCT2(sy, dy);
+ By[2] += LS_SUM(dy);
+ n++;
+ }
+ }
+ // Compute Cofactors of A
+ C00 = (int64_t)A[1][1] * A[2][2] - (int64_t)A[1][2] * A[1][2];
+ C01 = (int64_t)A[1][2] * A[0][2] - (int64_t)A[0][1] * A[2][2];
+ C02 = (int64_t)A[0][1] * A[1][2] - (int64_t)A[0][2] * A[1][1];
+ C11 = (int64_t)A[0][0] * A[2][2] - (int64_t)A[0][2] * A[0][2];
+ C12 = (int64_t)A[0][1] * A[0][2] - (int64_t)A[0][0] * A[1][2];
+ C22 = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
+
+ // Scale by 1/64
+ C00 = ROUND_POWER_OF_TWO_SIGNED(C00, 6);
+ C01 = ROUND_POWER_OF_TWO_SIGNED(C01, 6);
+ C02 = ROUND_POWER_OF_TWO_SIGNED(C02, 6);
+ C11 = ROUND_POWER_OF_TWO_SIGNED(C11, 6);
+ C12 = ROUND_POWER_OF_TWO_SIGNED(C12, 6);
+ C22 = ROUND_POWER_OF_TWO_SIGNED(C22, 6);
+
+ // Compute Determinant of A
+ Det = C00 * A[0][0] + C01 * A[0][1] + C02 * A[0][2];
+ if (Det == 0) return 1;
+
+ // These, divided by Det, are the least squares solutions
+ Px[0] = C00 * Bx[0] + C01 * Bx[1] + C02 * Bx[2];
+ Px[1] = C01 * Bx[0] + C11 * Bx[1] + C12 * Bx[2];
+ Px[2] = C02 * Bx[0] + C12 * Bx[1] + C22 * Bx[2];
+ Py[0] = C00 * By[0] + C01 * By[1] + C02 * By[2];
+ Py[1] = C01 * By[0] + C11 * By[1] + C12 * By[2];
+ Py[2] = C02 * By[0] + C12 * By[1] + C22 * By[2];
+
+ int16_t shift;
+ int64_t iDet;
+ iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
+ shift -= WARPEDMODEL_PREC_BITS;
+ if (shift < 0) {
+ iDet <<= (-shift);
+ shift = 0;
+ }
+
+ v = Px[0] * iDet;
+ wm->wmmat[2] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
+ v = Px[1] * iDet;
+ wm->wmmat[3] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
+ v = Px[2] * iDet;
+ wm->wmmat[0] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift + 3);
+ // Adjust x displacement for the offset
+ off = (ux << WARPEDMODEL_PREC_BITS) - ux * wm->wmmat[2] - uy * wm->wmmat[3];
+ wm->wmmat[0] += ROUND_POWER_OF_TWO_SIGNED(off, 3);
+
+ v = Py[0] * iDet;
+ wm->wmmat[4] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
+ v = Py[1] * iDet;
+ wm->wmmat[5] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
+ v = Py[2] * iDet;
+ wm->wmmat[1] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift + 3);
+ // Adjust y displacement for the offset
+ off = (uy << WARPEDMODEL_PREC_BITS) - ux * wm->wmmat[4] - uy * wm->wmmat[5];
+ wm->wmmat[1] += ROUND_POWER_OF_TWO_SIGNED(off, 3);
+ wm->wmmat[6] = wm->wmmat[7] = 0;
+
+ // Clamp values
+ wm->wmmat[0] = clamp(wm->wmmat[0], -WARPEDMODEL_TRANS_CLAMP,
+ WARPEDMODEL_TRANS_CLAMP - 1);
+ wm->wmmat[1] = clamp(wm->wmmat[1], -WARPEDMODEL_TRANS_CLAMP,
+ WARPEDMODEL_TRANS_CLAMP - 1);
+ wm->wmmat[2] = clamp(wm->wmmat[2], -WARPEDMODEL_DIAGAFFINE_CLAMP,
+ WARPEDMODEL_DIAGAFFINE_CLAMP - 1);
+ wm->wmmat[5] = clamp(wm->wmmat[5], -WARPEDMODEL_DIAGAFFINE_CLAMP,
+ WARPEDMODEL_DIAGAFFINE_CLAMP - 1);
+ wm->wmmat[3] = clamp(wm->wmmat[3], -WARPEDMODEL_NONDIAGAFFINE_CLAMP,
+ WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+ wm->wmmat[4] = clamp(wm->wmmat[4], -WARPEDMODEL_NONDIAGAFFINE_CLAMP,
+ WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+
+ return 0;
+}
+#endif // LEAST_SQUARES_ORDER == 2
+
+int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
+ int mvx, WarpedMotionParams *wm_params, int mi_row,
+ int mi_col) {
+ int result = 1;
+ switch (wm_params->wmtype) {
+ case AFFINE:
+ result = find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params,
+ mi_row, mi_col);
+ break;
+ default: assert(0 && "Invalid warped motion type!"); return 1;
+ }
+ if (result == 0) {
+ if (wm_params->wmtype == ROTZOOM) {
+ wm_params->wmmat[5] = wm_params->wmmat[2];
+ wm_params->wmmat[4] = -wm_params->wmmat[3];
+ }
+ if (wm_params->wmtype == AFFINE || wm_params->wmtype == ROTZOOM) {
+ // check compatibility with the fast warp filter
+ if (!get_shear_params(wm_params)) return 1;
+ }
+ }
+
+ return result;
+}
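+// Illustrative call sequence (a sketch; everything except find_projection()
+// and av1_warp_plane() is assumed): a caller that has collected np motion
+// samples (pts1 -> pts2) for a block might do roughly
+//
+//   wm.wmtype = AFFINE;
+//   if (find_projection(np, pts1, pts2, bsize, mv.row, mv.col, &wm,
+//                       mi_row, mi_col) == 0) {
+//     av1_warp_plane(&wm, /* remaining arguments as in the header */);
+//   }
+//
+// since find_projection() returns 0 on success and nonzero on failure.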
+#endif // CONFIG_WARPED_MOTION
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
new file mode 100644
index 000000000..dfd8dae34
--- /dev/null
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_WARPED_MOTION_H_
+#define AV1_COMMON_WARPED_MOTION_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/mv.h"
+
+#define MAX_PARAMDIM 9
+#if CONFIG_WARPED_MOTION
+#define SAMPLES_ARRAY_SIZE ((2 * MAX_MIB_SIZE + 2) * 2)
+
+#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
+#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
+
+#define DEFAULT_WMTYPE AFFINE
+#endif // CONFIG_WARPED_MOTION
+
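+// The warped filter table covers fractional offsets in [-1, 2):
+// WARPEDPIXEL_PREC_SHIFTS entries for each of the sub-ranges [-1, 0),
+// [0, 1) and [1, 2), plus one trailing dummy row (see warped_motion.c).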
+extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+
+typedef void (*ProjectPointsFunc)(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj,
+ const int subsampling_x,
+ const int subsampling_y);
+
+void project_points_translation(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y);
+
+void project_points_rotzoom(int32_t *mat, int *points, int *proj, const int n,
+ const int stride_points, const int stride_proj,
+ const int subsampling_x, const int subsampling_y);
+
+void project_points_affine(int32_t *mat, int *points, int *proj, const int n,
+ const int stride_points, const int stride_proj,
+ const int subsampling_x, const int subsampling_y);
+
+void project_points_hortrapezoid(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y);
+void project_points_vertrapezoid(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y);
+void project_points_homography(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y);
+
+void project_points(WarpedMotionParams *wm_params, int *points, int *proj,
+ const int n, const int stride_points, const int stride_proj,
+ const int subsampling_x, const int subsampling_y);
+
+double av1_warp_erroradv(WarpedMotionParams *wm,
+#if CONFIG_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_HIGHBITDEPTH
+ uint8_t *ref, int width, int height, int stride,
+ uint8_t *dst, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int x_scale, int y_scale);
+
+void av1_warp_plane(WarpedMotionParams *wm,
+#if CONFIG_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_HIGHBITDEPTH
+ uint8_t *ref, int width, int height, int stride,
+ uint8_t *pred, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int x_scale, int y_scale, int ref_frm);
+
+int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
+ int mvx, WarpedMotionParams *wm_params, int mi_row,
+ int mi_col);
+
+int get_shear_params(WarpedMotionParams *wm);
+#endif // AV1_COMMON_WARPED_MOTION_H_
diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
new file mode 100644
index 000000000..91102bbaf
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
@@ -0,0 +1,1029 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "av1/common/filter.h"
+
+#define WIDTH_BOUND (16)
+#define HEIGHT_BOUND (16)
+
+#if CONFIG_DUAL_FILTER
+DECLARE_ALIGNED(16, static int8_t,
+ sub_pel_filters_12sharp_signal_dir[15][2][16]);
+
+DECLARE_ALIGNED(16, static int8_t,
+ sub_pel_filters_12sharp_ver_signal_dir[15][6][16]);
+#endif // CONFIG_DUAL_FILTER
+
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static int8_t,
+ sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]);
+
+DECLARE_ALIGNED(16, static int8_t,
+ sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]);
+#endif
+
+typedef int8_t (*SubpelFilterCoeffs)[16];
+
+static INLINE SubpelFilterCoeffs
+get_subpel_filter_signal_dir(const InterpFilterParams p, int index) {
+#if CONFIG_DUAL_FILTER
+ if (p.interp_filter == MULTITAP_SHARP) {
+ return &sub_pel_filters_12sharp_signal_dir[index][0];
+ }
+#endif
+#if USE_TEMPORALFILTER_12TAP
+ if (p.interp_filter == TEMPORALFILTER_12TAP) {
+ return &sub_pel_filters_temporalfilter_12_signal_dir[index][0];
+ }
+#endif
+ (void)p;
+ (void)index;
+ return NULL;
+}
+
+static INLINE SubpelFilterCoeffs
+get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
+#if CONFIG_DUAL_FILTER
+ if (p.interp_filter == MULTITAP_SHARP) {
+ return &sub_pel_filters_12sharp_ver_signal_dir[index][0];
+ }
+#endif
+#if USE_TEMPORALFILTER_12TAP
+ if (p.interp_filter == TEMPORALFILTER_12TAP) {
+ return &sub_pel_filters_temporalfilter_12_ver_signal_dir[index][0];
+ }
+#endif
+ (void)p;
+ (void)index;
+ return NULL;
+}
+
+static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
+ __m128i t0, t1;
+
+ t0 = _mm_unpacklo_epi16(in[0], in[1]);
+ t1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+ out[0] = _mm_unpacklo_epi32(t0, t1);
+ out[1] = _mm_srli_si128(out[0], 8);
+ out[2] = _mm_unpackhi_epi32(t0, t1);
+ out[3] = _mm_srli_si128(out[2], 8);
+
+ t0 = _mm_unpackhi_epi16(in[0], in[1]);
+ t1 = _mm_unpackhi_epi16(in[2], in[3]);
+
+ out[4] = _mm_unpacklo_epi32(t0, t1);
+ out[5] = _mm_srli_si128(out[4], 8);
+ // Note: We ignore out[6] and out[7] because
+ // they're zero vectors.
+}
+
+typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst);
+
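+// Rounds and averages the filtered 16-bit values in *x with the 8-bit pixels
+// already stored at src, i.e. (x + src + 1) >> 1 per lane; apparently used
+// when the destination already holds a first prediction that a second one is
+// being averaged into.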
+static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i y = _mm_loadl_epi64((__m128i const *)src);
+ y = _mm_unpacklo_epi8(y, zero);
+ y = _mm_add_epi16(*x, y);
+ y = _mm_add_epi16(y, one);
+ y = _mm_srai_epi16(y, 1);
+ y = _mm_packus_epi16(y, y);
+ return y;
+}
+
+static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) {
+ uint32_t temp;
+ __m128i u = _mm_packus_epi16(*x, *x);
+ temp = _mm_cvtsi128_si32(u);
+ *(uint16_t *)dst = (uint16_t)temp;
+}
+
+static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) {
+ uint32_t temp;
+ __m128i y = accumulate_store(x, dst);
+ temp = _mm_cvtsi128_si32(y);
+ *(uint16_t *)dst = (uint16_t)temp;
+}
+
+static store_pixel_t store2pixelTab[2] = { store_2_pixel_only,
+ accumulate_store_2_pixel };
+
+static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) {
+ __m128i u = _mm_packus_epi16(*x, *x);
+ *(int *)dst = _mm_cvtsi128_si32(u);
+}
+
+static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) {
+ __m128i y = accumulate_store(x, dst);
+ *(int *)dst = _mm_cvtsi128_si32(y);
+}
+
+static store_pixel_t store4pixelTab[2] = { store_4_pixel_only,
+ accumulate_store_4_pixel };
+
+static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store_func, uint8_t *dst) {
+ __m128i sumPairRow[4];
+ __m128i sumPairCol[8];
+ __m128i pixel;
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i zero = _mm_setzero_si128();
+
+ if (10 == tapsNum) {
+ src -= 1;
+ }
+
+ pixel = _mm_loadu_si128((__m128i const *)src);
+ sumPairRow[0] = _mm_maddubs_epi16(pixel, f[0]);
+ sumPairRow[2] = _mm_maddubs_epi16(pixel, f[1]);
+ sumPairRow[2] = _mm_srli_si128(sumPairRow[2], 2);
+
+ pixel = _mm_loadu_si128((__m128i const *)(src + 1));
+ sumPairRow[1] = _mm_maddubs_epi16(pixel, f[0]);
+ sumPairRow[3] = _mm_maddubs_epi16(pixel, f[1]);
+ sumPairRow[3] = _mm_srli_si128(sumPairRow[3], 2);
+
+ transpose_4x8(sumPairRow, sumPairCol);
+
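+ // Accumulate with saturating adds; splitting the middle pair into its min
+ // and max fixes the order in which the terms are added (smaller before
+ // larger), presumably so that saturation behaves consistently with the
+ // scalar reference.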
+ sumPairRow[0] = _mm_adds_epi16(sumPairCol[0], sumPairCol[1]);
+ sumPairRow[1] = _mm_adds_epi16(sumPairCol[4], sumPairCol[5]);
+
+ sumPairRow[2] = _mm_min_epi16(sumPairCol[2], sumPairCol[3]);
+ sumPairRow[3] = _mm_max_epi16(sumPairCol[2], sumPairCol[3]);
+
+ sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[1]);
+ sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[2]);
+ sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]);
+
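+ // _mm_mulhrs_epi16 with 1 << 8 computes (v + 64) >> 7 per lane, a rounded
+ // descale of the Q7 filter accumulator back to pixel range.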
+ sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
+ sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]);
+ sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero);
+
+ store_func(&sumPairRow[1], dst);
+}
+
+static void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store, uint8_t *buf) {
+ horiz_w4_ssse3(src, f, tapsNum, store, buf);
+ src += 4;
+ buf += 4;
+ horiz_w4_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store, uint8_t *buf) {
+ horiz_w8_ssse3(src, f, tapsNum, store, buf);
+ src += 8;
+ buf += 8;
+ horiz_w8_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store, uint8_t *buf) {
+ horiz_w16_ssse3(src, f, tapsNum, store, buf);
+ src += 16;
+ buf += 16;
+ horiz_w16_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store, uint8_t *buf) {
+ horiz_w32_ssse3(src, f, tapsNum, store, buf);
+ src += 32;
+ buf += 32;
+ horiz_w32_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+ store_pixel_t store, uint8_t *buf) {
+ horiz_w64_ssse3(src, f, tapsNum, store, buf);
+ src += 64;
+ buf += 64;
+ horiz_w64_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void (*horizTab[6])(const uint8_t *, const __m128i *, int, store_pixel_t,
+ uint8_t *) = {
+ horiz_w4_ssse3, horiz_w8_ssse3, horiz_w16_ssse3,
+ horiz_w32_ssse3, horiz_w64_ssse3, horiz_w128_ssse3,
+};
+
+static void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum,
+ int width, store_pixel_t store, uint8_t *dst) {
+ switch (width) {
+ // Note:
+ // For width=2 and 4, store function must be different
+ case 2:
+ case 4: horizTab[0](src, f, tapsNum, store, dst); break;
+ case 8: horizTab[1](src, f, tapsNum, store, dst); break;
+ case 16: horizTab[2](src, f, tapsNum, store, dst); break;
+ case 32: horizTab[3](src, f, tapsNum, store, dst); break;
+ case 64: horizTab[4](src, f, tapsNum, store, dst); break;
+ case 128: horizTab[5](src, f, tapsNum, store, dst); break;
+ default: assert(0);
+ }
+}
+
+// Vertical 8-pixel parallel
+typedef void (*transpose_to_dst_t)(const uint16_t *src, int src_stride,
+ uint8_t *dst, int dst_stride);
+
+static INLINE void transpose8x8_direct_to_dst(const uint16_t *src,
+ int src_stride, uint8_t *dst,
+ int dst_stride) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ __m128i v0, v1, v2, v3;
+
+ __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ u0 = _mm_mulhrs_epi16(u0, k_256);
+ u1 = _mm_mulhrs_epi16(u1, k_256);
+ u2 = _mm_mulhrs_epi16(u2, k_256);
+ u3 = _mm_mulhrs_epi16(u3, k_256);
+ u4 = _mm_mulhrs_epi16(u4, k_256);
+ u5 = _mm_mulhrs_epi16(u5, k_256);
+ u6 = _mm_mulhrs_epi16(u6, k_256);
+ u7 = _mm_mulhrs_epi16(u7, k_256);
+
+ v0 = _mm_packus_epi16(u0, u1);
+ v1 = _mm_packus_epi16(u2, u3);
+ v2 = _mm_packus_epi16(u4, u5);
+ v3 = _mm_packus_epi16(u6, u7);
+
+ u0 = _mm_unpacklo_epi8(v0, v1);
+ u1 = _mm_unpackhi_epi8(v0, v1);
+ u2 = _mm_unpacklo_epi8(v2, v3);
+ u3 = _mm_unpackhi_epi8(v2, v3);
+
+ u4 = _mm_unpacklo_epi8(u0, u1);
+ u5 = _mm_unpacklo_epi8(u2, u3);
+ u6 = _mm_unpackhi_epi8(u0, u1);
+ u7 = _mm_unpackhi_epi8(u2, u3);
+
+ u0 = _mm_unpacklo_epi32(u4, u5);
+ u1 = _mm_unpackhi_epi32(u4, u5);
+ u2 = _mm_unpacklo_epi32(u6, u7);
+ u3 = _mm_unpackhi_epi32(u6, u7);
+
+ u4 = _mm_srli_si128(u0, 8);
+ u5 = _mm_srli_si128(u1, 8);
+ u6 = _mm_srli_si128(u2, 8);
+ u7 = _mm_srli_si128(u3, 8);
+
+ _mm_storel_epi64((__m128i *)dst, u0);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
+}
+
+static INLINE void transpose8x8_accumu_to_dst(const uint16_t *src,
+ int src_stride, uint8_t *dst,
+ int dst_stride) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+ __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ u0 = _mm_mulhrs_epi16(u0, k_256);
+ u1 = _mm_mulhrs_epi16(u1, k_256);
+ u2 = _mm_mulhrs_epi16(u2, k_256);
+ u3 = _mm_mulhrs_epi16(u3, k_256);
+ u4 = _mm_mulhrs_epi16(u4, k_256);
+ u5 = _mm_mulhrs_epi16(u5, k_256);
+ u6 = _mm_mulhrs_epi16(u6, k_256);
+ u7 = _mm_mulhrs_epi16(u7, k_256);
+
+ v0 = _mm_packus_epi16(u0, u1);
+ v1 = _mm_packus_epi16(u2, u3);
+ v2 = _mm_packus_epi16(u4, u5);
+ v3 = _mm_packus_epi16(u6, u7);
+
+ u0 = _mm_unpacklo_epi8(v0, v1);
+ u1 = _mm_unpackhi_epi8(v0, v1);
+ u2 = _mm_unpacklo_epi8(v2, v3);
+ u3 = _mm_unpackhi_epi8(v2, v3);
+
+ u4 = _mm_unpacklo_epi8(u0, u1);
+ u5 = _mm_unpacklo_epi8(u2, u3);
+ u6 = _mm_unpackhi_epi8(u0, u1);
+ u7 = _mm_unpackhi_epi8(u2, u3);
+
+ u0 = _mm_unpacklo_epi32(u4, u5);
+ u1 = _mm_unpackhi_epi32(u4, u5);
+ u2 = _mm_unpacklo_epi32(u6, u7);
+ u3 = _mm_unpackhi_epi32(u6, u7);
+
+ u4 = _mm_srli_si128(u0, 8);
+ u5 = _mm_srli_si128(u1, 8);
+ u6 = _mm_srli_si128(u2, 8);
+ u7 = _mm_srli_si128(u3, 8);
+
+ v0 = _mm_loadl_epi64((__m128i const *)(dst + 0 * dst_stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(dst + 1 * dst_stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+ v4 = _mm_loadl_epi64((__m128i const *)(dst + 4 * dst_stride));
+ v5 = _mm_loadl_epi64((__m128i const *)(dst + 5 * dst_stride));
+ v6 = _mm_loadl_epi64((__m128i const *)(dst + 6 * dst_stride));
+ v7 = _mm_loadl_epi64((__m128i const *)(dst + 7 * dst_stride));
+
+ u0 = _mm_unpacklo_epi8(u0, zero);
+ u1 = _mm_unpacklo_epi8(u1, zero);
+ u2 = _mm_unpacklo_epi8(u2, zero);
+ u3 = _mm_unpacklo_epi8(u3, zero);
+ u4 = _mm_unpacklo_epi8(u4, zero);
+ u5 = _mm_unpacklo_epi8(u5, zero);
+ u6 = _mm_unpacklo_epi8(u6, zero);
+ u7 = _mm_unpacklo_epi8(u7, zero);
+
+ v0 = _mm_unpacklo_epi8(v0, zero);
+ v1 = _mm_unpacklo_epi8(v1, zero);
+ v2 = _mm_unpacklo_epi8(v2, zero);
+ v3 = _mm_unpacklo_epi8(v3, zero);
+ v4 = _mm_unpacklo_epi8(v4, zero);
+ v5 = _mm_unpacklo_epi8(v5, zero);
+ v6 = _mm_unpacklo_epi8(v6, zero);
+ v7 = _mm_unpacklo_epi8(v7, zero);
+
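+  // Average the filtered rows with the existing dst: (new + old + 1) >> 1.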
+ v0 = _mm_adds_epi16(u0, v0);
+ v1 = _mm_adds_epi16(u4, v1);
+ v2 = _mm_adds_epi16(u1, v2);
+ v3 = _mm_adds_epi16(u5, v3);
+ v4 = _mm_adds_epi16(u2, v4);
+ v5 = _mm_adds_epi16(u6, v5);
+ v6 = _mm_adds_epi16(u3, v6);
+ v7 = _mm_adds_epi16(u7, v7);
+
+ v0 = _mm_adds_epi16(v0, one);
+ v1 = _mm_adds_epi16(v1, one);
+ v2 = _mm_adds_epi16(v2, one);
+ v3 = _mm_adds_epi16(v3, one);
+ v4 = _mm_adds_epi16(v4, one);
+ v5 = _mm_adds_epi16(v5, one);
+ v6 = _mm_adds_epi16(v6, one);
+ v7 = _mm_adds_epi16(v7, one);
+
+ v0 = _mm_srai_epi16(v0, 1);
+ v1 = _mm_srai_epi16(v1, 1);
+ v2 = _mm_srai_epi16(v2, 1);
+ v3 = _mm_srai_epi16(v3, 1);
+ v4 = _mm_srai_epi16(v4, 1);
+ v5 = _mm_srai_epi16(v5, 1);
+ v6 = _mm_srai_epi16(v6, 1);
+ v7 = _mm_srai_epi16(v7, 1);
+
+ u0 = _mm_packus_epi16(v0, v1);
+ u1 = _mm_packus_epi16(v2, v3);
+ u2 = _mm_packus_epi16(v4, v5);
+ u3 = _mm_packus_epi16(v6, v7);
+
+ u4 = _mm_srli_si128(u0, 8);
+ u5 = _mm_srli_si128(u1, 8);
+ u6 = _mm_srli_si128(u2, 8);
+ u7 = _mm_srli_si128(u3, 8);
+
+ _mm_storel_epi64((__m128i *)dst, u0);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
+}
+
+static transpose_to_dst_t trans8x8Tab[2] = { transpose8x8_direct_to_dst,
+ transpose8x8_accumu_to_dst };
+
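+// Transpose eight 16-byte rows into vectors of interleaved column pairs,
+// ready for _mm_maddubs_epi16 against the paired filter coefficients.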
+static INLINE void transpose_8x16(const __m128i *in, __m128i *out) {
+ __m128i t0, t1, t2, t3, u0, u1;
+
+ t0 = _mm_unpacklo_epi16(in[0], in[1]);
+ t1 = _mm_unpacklo_epi16(in[2], in[3]);
+ t2 = _mm_unpacklo_epi16(in[4], in[5]);
+ t3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ u0 = _mm_unpacklo_epi32(t0, t1);
+ u1 = _mm_unpacklo_epi32(t2, t3);
+
+ out[0] = _mm_unpacklo_epi64(u0, u1);
+ out[1] = _mm_unpackhi_epi64(u0, u1);
+
+ u0 = _mm_unpackhi_epi32(t0, t1);
+ u1 = _mm_unpackhi_epi32(t2, t3);
+
+ out[2] = _mm_unpacklo_epi64(u0, u1);
+ out[3] = _mm_unpackhi_epi64(u0, u1);
+
+ t0 = _mm_unpackhi_epi16(in[0], in[1]);
+ t1 = _mm_unpackhi_epi16(in[2], in[3]);
+ t2 = _mm_unpackhi_epi16(in[4], in[5]);
+ t3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ u0 = _mm_unpacklo_epi32(t0, t1);
+ u1 = _mm_unpacklo_epi32(t2, t3);
+
+ out[4] = _mm_unpacklo_epi64(u0, u1);
+ out[5] = _mm_unpackhi_epi64(u0, u1);
+
+ // Ignore out[6] and out[7]
+ // they're zero vectors.
+}
+
+static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ __m128i *f, int tapsNum, uint16_t *buf) {
+ __m128i s[8], t[6];
+ __m128i min_x2x3, max_x2x3;
+ __m128i temp;
+
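+  // A 10-tap filter is zero-padded to the 12-tap layout (one zero tap on
+  // each side), so its load window starts one pixel earlier.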
+ if (tapsNum == 10) {
+ src_ptr -= 1;
+ }
+ s[0] = _mm_loadu_si128((const __m128i *)src_ptr);
+ s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+ s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+ s[6] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+ s[7] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+  // Transpose: each vector now holds column pixel pairs instead of a row.
+ transpose_8x16(s, t);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ s[0] = _mm_maddubs_epi16(t[0], f[0]);
+ s[1] = _mm_maddubs_epi16(t[1], f[1]);
+ s[2] = _mm_maddubs_epi16(t[2], f[2]);
+ s[3] = _mm_maddubs_epi16(t[3], f[3]);
+ s[4] = _mm_maddubs_epi16(t[4], f[4]);
+ s[5] = _mm_maddubs_epi16(t[5], f[5]);
+
+ // add and saturate the results together
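+  // The two centre taps carry the largest coefficients; summing their
+  // products via min then max presumably keeps the 16-bit saturating adds
+  // consistent with the reference ordering.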
+ min_x2x3 = _mm_min_epi16(s[2], s[3]);
+ max_x2x3 = _mm_max_epi16(s[2], s[3]);
+ temp = _mm_adds_epi16(s[0], s[1]);
+ temp = _mm_adds_epi16(temp, s[5]);
+ temp = _mm_adds_epi16(temp, s[4]);
+
+ temp = _mm_adds_epi16(temp, min_x2x3);
+ temp = _mm_adds_epi16(temp, max_x2x3);
+
+ _mm_storeu_si128((__m128i *)buf, temp);
+}
+
+// Vertical 4-pixel parallel
+static INLINE void transpose4x4_direct_to_dst(const uint16_t *src,
+ int src_stride, uint8_t *dst,
+ int dst_stride) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ __m128i v0, v1, v2, v3;
+
+ // TODO(luoyi): two loads, 8 elements per load (two bytes per element)
+ __m128i u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+ __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+ __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+ __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_unpacklo_epi16(u0, u1);
+ v1 = _mm_unpacklo_epi16(u2, u3);
+
+ v2 = _mm_unpacklo_epi32(v0, v1);
+ v3 = _mm_unpackhi_epi32(v0, v1);
+
+ u0 = _mm_mulhrs_epi16(v2, k_256);
+ u1 = _mm_mulhrs_epi16(v3, k_256);
+
+ u0 = _mm_packus_epi16(u0, u1);
+ u1 = _mm_srli_si128(u0, 4);
+ u2 = _mm_srli_si128(u0, 8);
+ u3 = _mm_srli_si128(u0, 12);
+
+ *(int *)(dst) = _mm_cvtsi128_si32(u0);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
+ *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
+ *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
+}
+
+static INLINE void transpose4x4_accumu_to_dst(const uint16_t *src,
+ int src_stride, uint8_t *dst,
+ int dst_stride) {
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+
+ __m128i v0, v1, v2, v3;
+
+ __m128i u0 = _mm_loadl_epi64((__m128i const *)(src));
+ __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + src_stride));
+ __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+ __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_unpacklo_epi16(u0, u1);
+ v1 = _mm_unpacklo_epi16(u2, u3);
+
+ v2 = _mm_unpacklo_epi32(v0, v1);
+ v3 = _mm_unpackhi_epi32(v0, v1);
+
+ u0 = _mm_mulhrs_epi16(v2, k_256);
+ u1 = _mm_mulhrs_epi16(v3, k_256);
+
+ u2 = _mm_packus_epi16(u0, u1);
+ u0 = _mm_unpacklo_epi8(u2, zero);
+ u1 = _mm_unpackhi_epi8(u2, zero);
+
+ // load pixel values
+ v0 = _mm_loadl_epi64((__m128i const *)(dst));
+ v1 = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+
+ v0 = _mm_unpacklo_epi8(v0, zero);
+ v1 = _mm_unpacklo_epi8(v1, zero);
+ v2 = _mm_unpacklo_epi8(v2, zero);
+ v3 = _mm_unpacklo_epi8(v3, zero);
+
+ v0 = _mm_unpacklo_epi64(v0, v1);
+ v1 = _mm_unpacklo_epi64(v2, v3);
+
+ u0 = _mm_adds_epi16(u0, v0);
+ u1 = _mm_adds_epi16(u1, v1);
+
+ u0 = _mm_adds_epi16(u0, one);
+ u1 = _mm_adds_epi16(u1, one);
+
+ u0 = _mm_srai_epi16(u0, 1);
+ u1 = _mm_srai_epi16(u1, 1);
+
+ // saturation and pack to pixels
+ u0 = _mm_packus_epi16(u0, u1);
+ u1 = _mm_srli_si128(u0, 4);
+ u2 = _mm_srli_si128(u0, 8);
+ u3 = _mm_srli_si128(u0, 12);
+
+ *(int *)(dst) = _mm_cvtsi128_si32(u0);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
+ *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
+ *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
+}
+
+static transpose_to_dst_t trans4x4Tab[2] = { transpose4x4_direct_to_dst,
+ transpose4x4_accumu_to_dst };
+
+static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ __m128i *f, int tapsNum, uint16_t *buf) {
+ __m128i A, B, C, D;
+ __m128i tr0_0, tr0_1, s1s0, s3s2, s5s4, s7s6, s9s8, sbsa;
+ __m128i x0, x1, x2, x3, x4, x5;
+ __m128i min_x2x3, max_x2x3, temp;
+
+ if (tapsNum == 10) {
+ src_ptr -= 1;
+ }
+ A = _mm_loadu_si128((const __m128i *)src_ptr);
+ B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+ C = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ D = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+
+  // Transpose: each vector now holds column pixel pairs instead of a row.
+ // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+ tr0_0 = _mm_unpacklo_epi16(A, B);
+ // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+ tr0_1 = _mm_unpacklo_epi16(C, D);
+ // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+ s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+ s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ // 02 03 12 13 22 23 32 33
+ s3s2 = _mm_srli_si128(s1s0, 8);
+ // 06 07 16 17 26 27 36 37
+ s7s6 = _mm_srli_si128(s5s4, 8);
+
+ tr0_0 = _mm_unpackhi_epi16(A, B);
+ tr0_1 = _mm_unpackhi_epi16(C, D);
+ s9s8 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ sbsa = _mm_srli_si128(s9s8, 8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ x0 = _mm_maddubs_epi16(s1s0, f[0]);
+ x1 = _mm_maddubs_epi16(s3s2, f[1]);
+ x2 = _mm_maddubs_epi16(s5s4, f[2]);
+ x3 = _mm_maddubs_epi16(s7s6, f[3]);
+ x4 = _mm_maddubs_epi16(s9s8, f[4]);
+ x5 = _mm_maddubs_epi16(sbsa, f[5]);
+ // add and saturate the results together
+ min_x2x3 = _mm_min_epi16(x2, x3);
+ max_x2x3 = _mm_max_epi16(x2, x3);
+ temp = _mm_adds_epi16(x0, x1);
+ temp = _mm_adds_epi16(temp, x5);
+ temp = _mm_adds_epi16(temp, x4);
+
+ temp = _mm_adds_epi16(temp, min_x2x3);
+ temp = _mm_adds_epi16(temp, max_x2x3);
+ _mm_storel_epi64((__m128i *)buf, temp);
+}
+
+// Note:
+// This function assumes:
+// (1) a 10- or 12-tap filter, and
+// (2) x_step_q4 == 16, i.e. the subpel filter is fixed for the whole call.
+
+void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_x_q4, int x_step_q4,
+ ConvolveParams *conv_params) {
+ DECLARE_ALIGNED(16, uint16_t, temp[8 * 8]);
+ __m128i verf[6];
+ __m128i horf[2];
+ SubpelFilterCoeffs hCoeffs, vCoeffs;
+ const uint8_t *src_ptr;
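+  // conv_params->ref selects the store flavor: index 0 writes directly,
+  // index 1 averages with dst, as used for the second prediction of a
+  // compound.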
+ store_pixel_t store2p = store2pixelTab[conv_params->ref];
+ store_pixel_t store4p = store4pixelTab[conv_params->ref];
+ transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->ref];
+ transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->ref];
+
+ const int tapsNum = filter_params.taps;
+ int block_height, block_residu;
+ int i, col, count;
+ (void)x_step_q4;
+
+ if (0 == subpel_x_q4 || 16 != x_step_q4) {
+ av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_x_q4, x_step_q4, conv_params);
+ return;
+ }
+
+ hCoeffs = get_subpel_filter_signal_dir(filter_params, subpel_x_q4 - 1);
+ vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
+
+ if (!hCoeffs || !vCoeffs) {
+ av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_x_q4, x_step_q4, conv_params);
+ return;
+ }
+
+ verf[0] = *((const __m128i *)(vCoeffs));
+ verf[1] = *((const __m128i *)(vCoeffs + 1));
+ verf[2] = *((const __m128i *)(vCoeffs + 2));
+ verf[3] = *((const __m128i *)(vCoeffs + 3));
+ verf[4] = *((const __m128i *)(vCoeffs + 4));
+ verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+ horf[0] = *((const __m128i *)(hCoeffs));
+ horf[1] = *((const __m128i *)(hCoeffs + 1));
+
+ count = 0;
+
+  // tapsNum is the filter length; step src back to the first tap.
+ src -= (tapsNum >> 1) - 1;
+ src_ptr = src;
+ if (w > WIDTH_BOUND && h > HEIGHT_BOUND) {
+ // 8-pixels parallel
+ block_height = h >> 3;
+ block_residu = h & 7;
+
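+    // Each pass filters one source column across 8 rows; eight passes fill
+    // an 8x8 tile in temp, which transpose_8x8 then writes to dst.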
+ do {
+ for (col = 0; col < w; col += 8) {
+ for (i = 0; i < 8; ++i) {
+ filter_horiz_v8p_ssse3(src_ptr, src_stride, verf, tapsNum,
+ temp + (i * 8));
+ src_ptr += 1;
+ }
+ transpose_8x8(temp, 8, dst + col, dst_stride);
+ }
+ count++;
+ src_ptr = src + count * src_stride * 8;
+ dst += dst_stride * 8;
+ } while (count < block_height);
+
+ for (i = 0; i < block_residu; ++i) {
+ filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ }
+ } else {
+ if (w > 2) {
+ // 4-pixels parallel
+ block_height = h >> 2;
+ block_residu = h & 3;
+
+ do {
+ for (col = 0; col < w; col += 4) {
+ for (i = 0; i < 4; ++i) {
+ filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
+ temp + (i * 4));
+ src_ptr += 1;
+ }
+ transpose_4x4(temp, 4, dst + col, dst_stride);
+ }
+ count++;
+ src_ptr = src + count * src_stride * 4;
+ dst += dst_stride * 4;
+ } while (count < block_height);
+
+ for (i = 0; i < block_residu; ++i) {
+ filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ }
+ } else {
+ for (i = 0; i < h; i++) {
+ filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ }
+ }
+ }
+}
+
+// Vertical convolution filtering
+static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) {
+ __m128i u = _mm_packus_epi16(*x, *x);
+ _mm_storel_epi64((__m128i *)dst, u);
+}
+
+static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) {
+ __m128i y = accumulate_store(x, dst);
+ _mm_storel_epi64((__m128i *)dst, y);
+}
+
+static store_pixel_t store8pixelTab[2] = { store_8_pixel_only,
+ accumulate_store_8_pixel };
+
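+// Filter 8 horizontally adjacent pixels across up to 12 rows. A 10-tap
+// filter reuses the 12-tap path: row s[0] is zeroed and the coefficient
+// table holds zeros for the unused taps.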
+static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
+ int tapsNum, __m128i *f) {
+ __m128i s[12];
+ const __m128i k_256 = _mm_set1_epi16(1 << 8);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i min_x2x3, max_x2x3, sum;
+ int i = 0;
+ int r = 0;
+
+ if (10 == tapsNum) {
+ i += 1;
+ s[0] = zero;
+ }
+ while (i < 12) {
+ s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
+ i += 1;
+ r += 1;
+ }
+
+ s[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ s[2] = _mm_unpacklo_epi8(s[2], s[3]);
+ s[4] = _mm_unpacklo_epi8(s[4], s[5]);
+ s[6] = _mm_unpacklo_epi8(s[6], s[7]);
+ s[8] = _mm_unpacklo_epi8(s[8], s[9]);
+ s[10] = _mm_unpacklo_epi8(s[10], s[11]);
+
+ s[0] = _mm_maddubs_epi16(s[0], f[0]);
+ s[2] = _mm_maddubs_epi16(s[2], f[1]);
+ s[4] = _mm_maddubs_epi16(s[4], f[2]);
+ s[6] = _mm_maddubs_epi16(s[6], f[3]);
+ s[8] = _mm_maddubs_epi16(s[8], f[4]);
+ s[10] = _mm_maddubs_epi16(s[10], f[5]);
+
+ min_x2x3 = _mm_min_epi16(s[4], s[6]);
+ max_x2x3 = _mm_max_epi16(s[4], s[6]);
+ sum = _mm_adds_epi16(s[0], s[2]);
+ sum = _mm_adds_epi16(sum, s[10]);
+ sum = _mm_adds_epi16(sum, s[8]);
+
+ sum = _mm_adds_epi16(sum, min_x2x3);
+ sum = _mm_adds_epi16(sum, max_x2x3);
+
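+  // Round off the filter scale, clamp to 8 bits, then re-widen to 16 bits
+  // so the store callback can either write or average against dst.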
+ sum = _mm_mulhrs_epi16(sum, k_256);
+ sum = _mm_packus_epi16(sum, sum);
+ sum = _mm_unpacklo_epi8(sum, zero);
+ return sum;
+}
+
+static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
+ __m128i *f, int tapsNum,
+ store_pixel_t store_func,
+ uint8_t *dst) {
+ __m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f);
+ store_func(&sum, dst);
+}
+
+static void filter_vert_compute_small(const uint8_t *src, int src_stride,
+ __m128i *f, int tapsNum,
+ store_pixel_t store_func, int h,
+ uint8_t *dst, int dst_stride) {
+ int rowIndex = 0;
+ do {
+ filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func,
+ dst);
+ rowIndex++;
+ src += src_stride;
+ dst += dst_stride;
+ } while (rowIndex < h);
+}
+
+static void filter_vert_compute_large(const uint8_t *src, int src_stride,
+ __m128i *f, int tapsNum,
+ store_pixel_t store_func, int w, int h,
+ uint8_t *dst, int dst_stride) {
+ int col;
+ int rowIndex = 0;
+ const uint8_t *src_ptr = src;
+ uint8_t *dst_ptr = dst;
+
+ do {
+ for (col = 0; col < w; col += 8) {
+ filter_vert_horiz_parallel_ssse3(src_ptr, src_stride, f, tapsNum,
+ store_func, dst_ptr);
+ src_ptr += 8;
+ dst_ptr += 8;
+ }
+ rowIndex++;
+ src_ptr = src + rowIndex * src_stride;
+ dst_ptr = dst + rowIndex * dst_stride;
+ } while (rowIndex < h);
+}
+
+void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams filter_params,
+ const int subpel_y_q4, int y_step_q4,
+ ConvolveParams *conv_params) {
+ __m128i verf[6];
+ SubpelFilterCoeffs vCoeffs;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr = dst;
+ store_pixel_t store2p = store2pixelTab[conv_params->ref];
+ store_pixel_t store4p = store4pixelTab[conv_params->ref];
+ store_pixel_t store8p = store8pixelTab[conv_params->ref];
+ const int tapsNum = filter_params.taps;
+
+ if (0 == subpel_y_q4 || 16 != y_step_q4) {
+ av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_y_q4, y_step_q4, conv_params);
+ return;
+ }
+
+ vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
+
+ if (!vCoeffs) {
+ av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+ subpel_y_q4, y_step_q4, conv_params);
+ return;
+ }
+
+ verf[0] = *((const __m128i *)(vCoeffs));
+ verf[1] = *((const __m128i *)(vCoeffs + 1));
+ verf[2] = *((const __m128i *)(vCoeffs + 2));
+ verf[3] = *((const __m128i *)(vCoeffs + 3));
+ verf[4] = *((const __m128i *)(vCoeffs + 4));
+ verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+ src -= src_stride * ((tapsNum >> 1) - 1);
+ src_ptr = src;
+
+ if (w > 4) {
+ filter_vert_compute_large(src_ptr, src_stride, verf, tapsNum, store8p, w, h,
+ dst_ptr, dst_stride);
+ } else if (4 == w) {
+ filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store4p, h,
+ dst_ptr, dst_stride);
+ } else if (2 == w) {
+ filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store2p, h,
+ dst_ptr, dst_stride);
+ } else {
+ assert(0);
+ }
+}
+
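+// Build per-subpel-shift coefficient rows for the SSSE3 kernels. Each
+// horizontal entry stores the taps twice, the second copy shifted by two
+// bytes, zero-padded so every row spans a full 16-byte register.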
+static void init_simd_horiz_filter(const int16_t *filter_ptr, int taps,
+ int8_t (*simd_horiz_filter)[2][16]) {
+ int shift;
+ int offset = (12 - taps) / 2;
+ const int16_t *filter_row;
+ for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
+ int i;
+ filter_row = filter_ptr + shift * taps;
+ for (i = 0; i < offset; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
+
+ for (i = 0; i < offset + 2; ++i) simd_horiz_filter[shift - 1][1][i] = 0;
+
+ for (i = 0; i < taps; ++i) {
+ simd_horiz_filter[shift - 1][0][i + offset] = (int8_t)filter_row[i];
+ simd_horiz_filter[shift - 1][1][i + offset + 2] = (int8_t)filter_row[i];
+ }
+
+ for (i = offset + taps; i < 16; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
+
+ for (i = offset + 2 + taps; i < 16; ++i)
+ simd_horiz_filter[shift - 1][1][i] = 0;
+ }
+}
+
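+// Vertical rows interleave adjacent tap pairs (c = i * 2 + (j % 2) - offset)
+// to line up with the row pairs formed before _mm_maddubs_epi16.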
+static void init_simd_vert_filter(const int16_t *filter_ptr, int taps,
+ int8_t (*simd_vert_filter)[6][16]) {
+ int shift;
+ int offset = (12 - taps) / 2;
+ const int16_t *filter_row;
+ for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
+ int i;
+ filter_row = filter_ptr + shift * taps;
+ for (i = 0; i < 6; ++i) {
+ int j;
+ for (j = 0; j < 16; ++j) {
+ int c = i * 2 + (j % 2) - offset;
+ if (c >= 0 && c < taps)
+ simd_vert_filter[shift - 1][i][j] = (int8_t)filter_row[c];
+ else
+ simd_vert_filter[shift - 1][i][j] = 0;
+ }
+ }
+ }
+}
+
+typedef struct SimdFilter {
+ InterpFilter interp_filter;
+ int8_t (*simd_horiz_filter)[2][16];
+ int8_t (*simd_vert_filter)[6][16];
+} SimdFilter;
+
+#if CONFIG_DUAL_FILTER
+#define MULTITAP_FILTER_NUM 1
+SimdFilter simd_filters[MULTITAP_FILTER_NUM] = {
+ { MULTITAP_SHARP, &sub_pel_filters_12sharp_signal_dir[0],
+ &sub_pel_filters_12sharp_ver_signal_dir[0] },
+};
+#endif
+
+#if USE_TEMPORALFILTER_12TAP
+SimdFilter temporal_simd_filter = {
+ TEMPORALFILTER_12TAP, &sub_pel_filters_temporalfilter_12_signal_dir[0],
+ &sub_pel_filters_temporalfilter_12_ver_signal_dir[0]
+};
+#endif
+
+void av1_lowbd_convolve_init_ssse3(void) {
+#if USE_TEMPORALFILTER_12TAP
+ {
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(temporal_simd_filter.interp_filter);
+ int taps = filter_params.taps;
+ const int16_t *filter_ptr = filter_params.filter_ptr;
+ init_simd_horiz_filter(filter_ptr, taps,
+ temporal_simd_filter.simd_horiz_filter);
+ init_simd_vert_filter(filter_ptr, taps,
+ temporal_simd_filter.simd_vert_filter);
+ }
+#endif
+#if CONFIG_DUAL_FILTER
+ {
+ int i;
+ for (i = 0; i < MULTITAP_FILTER_NUM; ++i) {
+ InterpFilter interp_filter = simd_filters[i].interp_filter;
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+ int taps = filter_params.taps;
+ const int16_t *filter_ptr = filter_params.filter_ptr;
+ init_simd_horiz_filter(filter_ptr, taps,
+ simd_filters[i].simd_horiz_filter);
+ init_simd_vert_filter(filter_ptr, taps, simd_filters[i].simd_vert_filter);
+ }
+ }
+#endif
+ return;
+}
diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
new file mode 100644
index 000000000..d04b667f1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
@@ -0,0 +1,839 @@
+#include "av1/common/x86/av1_txfm1d_sse4.h"
+
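+// 32-point forward DCT: four transform columns per __m128i lane. The stages
+// ping-pong between buf0 and buf1, with each rotation implemented as a
+// btf_32_sse4_1_type0/1 butterfly at that stage's cosine bit depth.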
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 32;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[32];
+ __m128i buf1[32];
+ int col_num = txfm_size / num_per_128;
+ int bit;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+    // stage 0
+ int32_t stage_idx = 0;
+ int j;
+ for (j = 0; j < 32; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+
+ // stage 1
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[0], buf0[31]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[1], buf0[30]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[2], buf0[29]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[3], buf0[28]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[4], buf0[27]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[5], buf0[26]);
+ buf1[6] = _mm_add_epi32(buf0[6], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[6], buf0[25]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[24]);
+ buf1[24] = _mm_sub_epi32(buf0[7], buf0[24]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[8], buf0[23]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[9], buf0[22]);
+ buf1[10] = _mm_add_epi32(buf0[10], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[10], buf0[21]);
+ buf1[11] = _mm_add_epi32(buf0[11], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[11], buf0[20]);
+ buf1[12] = _mm_add_epi32(buf0[12], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[12], buf0[19]);
+ buf1[13] = _mm_add_epi32(buf0[13], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[13], buf0[18]);
+ buf1[14] = _mm_add_epi32(buf0[14], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[14], buf0[17]);
+ buf1[15] = _mm_add_epi32(buf0[15], buf0[16]);
+ buf1[16] = _mm_sub_epi32(buf0[15], buf0[16]);
+
+ // stage 2
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
+ buf0[6], bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
+ buf1[1], bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
+ buf1[3], bit);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+ buf1[14], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+ bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
+ buf0[6], bit);
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8],
+ buf1[15], bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+ buf1[14], bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+ buf1[12], bit);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+ buf0[31], bit);
+ btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+ buf0[30], bit);
+ btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
+ btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+ buf0[28], bit);
+ btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
+ btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+ buf0[24], bit);
+
+ // stage 9
+ stage_idx++;
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[16];
+ buf1[2] = buf0[8];
+ buf1[3] = buf0[24];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[20];
+ buf1[6] = buf0[12];
+ buf1[7] = buf0[28];
+ buf1[8] = buf0[2];
+ buf1[9] = buf0[18];
+ buf1[10] = buf0[10];
+ buf1[11] = buf0[26];
+ buf1[12] = buf0[6];
+ buf1[13] = buf0[22];
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[30];
+ buf1[16] = buf0[1];
+ buf1[17] = buf0[17];
+ buf1[18] = buf0[9];
+ buf1[19] = buf0[25];
+ buf1[20] = buf0[5];
+ buf1[21] = buf0[21];
+ buf1[22] = buf0[13];
+ buf1[23] = buf0[29];
+ buf1[24] = buf0[3];
+ buf1[25] = buf0[19];
+ buf1[26] = buf0[11];
+ buf1[27] = buf0[27];
+ buf1[28] = buf0[7];
+ buf1[29] = buf0[23];
+ buf1[30] = buf0[15];
+ buf1[31] = buf0[31];
+
+ for (j = 0; j < 32; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
+
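+// 4-point forward ADST: each __m128i row carries all four transform columns.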
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 4;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[4];
+ __m128i buf1[4];
+ int col_num = txfm_size / num_per_128;
+ int bit;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+    // stage 0
+ int32_t stage_idx = 0;
+ int j;
+ for (j = 0; j < 4; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+
+ // stage 1
+ stage_idx++;
+ buf1[0] = buf0[3];
+ buf1[1] = buf0[0];
+ buf1[2] = buf0[1];
+ buf1[3] = buf0[2];
+
+ // stage 2
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+ bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
+
+ // stage 3
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+ buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+ buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+
+ // stage 4
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
+
+ // stage 5
+ stage_idx++;
+ buf1[0] = buf0[0];
+ buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+ buf1[2] = buf0[3];
+ buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+ for (j = 0; j < 4; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
+
+void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 32;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[32];
+ __m128i buf1[32];
+ int col_num = txfm_size / num_per_128;
+ int bit;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+    // stage 0
+ int32_t stage_idx = 0;
+ int j;
+ for (j = 0; j < 32; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+
+ // stage 1
+ stage_idx++;
+ buf1[0] = buf0[31];
+ buf1[1] = buf0[0];
+ buf1[2] = buf0[29];
+ buf1[3] = buf0[2];
+ buf1[4] = buf0[27];
+ buf1[5] = buf0[4];
+ buf1[6] = buf0[25];
+ buf1[7] = buf0[6];
+ buf1[8] = buf0[23];
+ buf1[9] = buf0[8];
+ buf1[10] = buf0[21];
+ buf1[11] = buf0[10];
+ buf1[12] = buf0[19];
+ buf1[13] = buf0[12];
+ buf1[14] = buf0[17];
+ buf1[15] = buf0[14];
+ buf1[16] = buf0[15];
+ buf1[17] = buf0[16];
+ buf1[18] = buf0[13];
+ buf1[19] = buf0[18];
+ buf1[20] = buf0[11];
+ buf1[21] = buf0[20];
+ buf1[22] = buf0[9];
+ buf1[23] = buf0[22];
+ buf1[24] = buf0[7];
+ buf1[25] = buf0[24];
+ buf1[26] = buf0[5];
+ buf1[27] = buf0[26];
+ buf1[28] = buf0[3];
+ buf1[29] = buf0[28];
+ buf1[30] = buf0[1];
+ buf1[31] = buf0[30];
+
+ // stage 2
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
+ bit);
+ btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
+ bit);
+ btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
+ bit);
+ btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
+ btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8],
+ buf0[9], bit);
+ btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
+ buf0[17], bit);
+ btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
+ btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 3
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[16]);
+ buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]);
+ buf1[6] = _mm_add_epi32(buf0[6], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[24]);
+ buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]);
+ buf1[10] = _mm_add_epi32(buf0[10], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]);
+ buf1[11] = _mm_add_epi32(buf0[11], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]);
+ buf1[12] = _mm_add_epi32(buf0[12], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]);
+ buf1[13] = _mm_add_epi32(buf0[13], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]);
+ buf1[14] = _mm_add_epi32(buf0[14], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]);
+ buf1[15] = _mm_add_epi32(buf0[15], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]);
+
+ // stage 4
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
+ buf0[17], bit);
+ btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
+ btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 5
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
+ buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
+ buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
+ buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
+ buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
+ buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
+ buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
+ buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
+ buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
+ buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[24]);
+ buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]);
+ buf1[21] = _mm_add_epi32(buf0[21], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]);
+
+ // stage 6
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
+ bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ buf0[21] = buf1[21];
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 7
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
+ buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
+ buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
+ buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
+ buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
+ buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
+ buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]);
+ buf1[26] = _mm_add_epi32(buf0[26], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]);
+
+ // stage 8
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+ buf0[5], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[26] = buf1[26];
+ buf0[27] = buf1[27];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 9
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+ buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+ buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
+ buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
+ buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
+ buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
+ buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
+ buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
+ buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]);
+ buf1[21] = _mm_add_epi32(buf0[21], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]);
+
+ // stage 10
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
+ buf0[20] = buf1[20];
+ buf0[21] = buf1[21];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 11
+ stage_idx++;
+ buf1[0] = buf0[0];
+ buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]);
+ buf1[2] = buf0[24];
+ buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
+ buf1[4] = buf0[12];
+ buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]);
+ buf1[6] = buf0[20];
+ buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
+ buf1[8] = buf0[6];
+ buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]);
+ buf1[10] = buf0[30];
+ buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
+ buf1[12] = buf0[10];
+ buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]);
+ buf1[14] = buf0[18];
+ buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+ buf1[16] = buf0[3];
+ buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]);
+ buf1[18] = buf0[27];
+ buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
+ buf1[20] = buf0[15];
+ buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]);
+ buf1[22] = buf0[23];
+ buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
+ buf1[24] = buf0[5];
+ buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]);
+ buf1[26] = buf0[29];
+ buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
+ buf1[28] = buf0[9];
+ buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]);
+ buf1[30] = buf0[17];
+ buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+ for (j = 0; j < 32; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
new file mode 100644
index 000000000..78c261374
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm1d_sse4.h"
+
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+ const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+ int r, c;
+ for (r = 0; r < txfm1d_size; r++) {
+ for (c = 0; c < txfm1d_size; c++) {
+ output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
+ }
+ }
+}
+
+typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+    case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1;
+    case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1;
+ default: assert(0);
+ }
+ return NULL;
+}
+
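+// 2-D forward transform: pre-shift, column transform, shift, transpose, row
+// transform, shift, then transpose back to row-major output.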
+static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+ const int stride, const TXFM_2D_CFG *cfg,
+ int32_t *txfm_buf) {
+ const int txfm_size = cfg->txfm_size;
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->stage_range_row;
+ const int8_t *cos_bit_col = cfg->cos_bit_col;
+ const int8_t *cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+ int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+ txfm_size);
+ round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+ txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+ round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+ transpose_32(txfm_size, buf_128, out_128);
+}
+
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, int tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32);
+ (void)bd;
+ fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, int tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type);
+ (void)bd;
+ fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
+}
diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
new file mode 100644
index 000000000..cf6249bdc
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/filter.h"
+
+#if CONFIG_DUAL_FILTER
+DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]);
+#endif
+
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static int16_t, subpel_temporalfilter[15][6][8]);
+#endif
+
+typedef int16_t (*HbdSubpelFilterCoeffs)[8];
+
+typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
+ int src_stride, uint16_t *dst, int dst_stride,
+ int bd);
+
+static INLINE HbdSubpelFilterCoeffs
+hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
+#if CONFIG_DUAL_FILTER
+ if (p.interp_filter == MULTITAP_SHARP) {
+ return &subpel_filters_sharp[index][0];
+ }
+#endif
+#if USE_TEMPORALFILTER_12TAP
+ if (p.interp_filter == TEMPORALFILTER_12TAP) {
+ return &subpel_temporalfilter[index][0];
+ }
+#endif
+ (void)p;
+ (void)index;
+ return NULL;
+}
+
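+// Packs a (zero-padded, up to 12-tap) filter for each subpel shift into 6
+// rows of 8 int16_t: row r holds the tap pair {2r, 2r + 1} repeated four
+// times, which matches the unpacklo_epi16/madd_epi16 pattern used by the
+// filter kernels below.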
+static void init_simd_filter(const int16_t *filter_ptr, int taps,
+ int16_t (*simd_filter)[6][8]) {
+ int shift;
+ int offset = (12 - taps) / 2;
+ for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
+ const int16_t *filter_row = filter_ptr + shift * taps;
+ int i, j;
+ for (i = 0; i < 12; ++i) {
+ for (j = 0; j < 4; ++j) {
+ int r = i / 2;
+ int c = j * 2 + (i % 2);
+ if (i - offset >= 0 && i - offset < taps)
+ simd_filter[shift - 1][r][c] = filter_row[i - offset];
+ else
+ simd_filter[shift - 1][r][c] = 0;
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_init_sse4_1(void) {
+#if USE_TEMPORALFILTER_12TAP
+ {
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(TEMPORALFILTER_12TAP);
+ int taps = filter_params.taps;
+ const int16_t *filter_ptr = filter_params.filter_ptr;
+ init_simd_filter(filter_ptr, taps, subpel_temporalfilter);
+ }
+#endif
+#if CONFIG_DUAL_FILTER
+ {
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(MULTITAP_SHARP);
+ int taps = filter_params.taps;
+ const int16_t *filter_ptr = filter_params.filter_ptr;
+ init_simd_filter(filter_ptr, taps, subpel_filters_sharp);
+ }
+#endif
+}
+
+// pixelsNum == 0: write all 4 rows of pixels
+// pixelsNum == 1/2/3: write only the residual 1/2/3 rows
+static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
+ int dst_stride) {
+ if (2 == width) {
+ if (0 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+ *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
+ } else if (1 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ } else if (2 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ } else if (3 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+ }
+ } else {
+ if (0 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
+ } else if (1 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ } else if (2 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ } else if (3 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+ }
+ }
+}
+
+// Clip 16-bit pixels to the valid range for bit depth bd (10 or 12).
+static void highbd_clip(__m128i *p, int numVecs, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+ int i;
+
+ for (i = 0; i < numVecs; i++) {
+ mask = _mm_cmpgt_epi16(p[i], max);
+ clamped = _mm_andnot_si128(mask, p[i]);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ p[i] = _mm_and_si128(clamped, mask);
+ }
+}
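+
+// A scalar equivalent of highbd_clip, as a minimal sketch for reference
+// (hypothetical helper, not used by the kernels below): each 16-bit value
+// is clamped to [0, (1 << bd) - 1].
+static INLINE int16_t highbd_clip_pixel_sketch(int16_t v, int bd) {
+  const int16_t max = (int16_t)((1 << bd) - 1);
+  return v < 0 ? 0 : (v > max ? max : v);
+}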
+
+static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
+ __m128i v0, v1;
+ __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+
+ u[0] = _mm_loadu_si128((__m128i const *)src);
+ u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
+ u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ u[0] = _mm_add_epi32(u[0], rnd);
+ u[1] = _mm_add_epi32(u[1], rnd);
+ u[2] = _mm_add_epi32(u[2], rnd);
+ u[3] = _mm_add_epi32(u[3], rnd);
+
+ u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
+ u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
+ u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
+ u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
+
+ u[0] = _mm_packus_epi32(u[0], u[1]);
+ u[1] = _mm_packus_epi32(u[2], u[3]);
+
+ highbd_clip(u, 2, bd);
+
+ v0 = _mm_unpacklo_epi16(u[0], u[1]);
+ v1 = _mm_unpackhi_epi16(u[0], u[1]);
+
+ u[0] = _mm_unpacklo_epi16(v0, v1);
+ u[2] = _mm_unpackhi_epi16(v0, v1);
+
+ u[1] = _mm_srli_si128(u[0], 8);
+ u[3] = _mm_srli_si128(u[2], 8);
+}
+
+// pixelsNum = 0 : all 4 rows of pixels will be saved.
+// pixelsNum = 1/2/3 : the residual 1/2/3 rows of pixels will be saved.
+void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int bd) {
+ __m128i u[4];
+ transClipPixel(src, src_stride, u, bd);
+ writePixel(u, width, pixelsNum, dst, dst_stride);
+}
+
+void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src,
+ int src_stride, uint16_t *dst, int dst_stride,
+ int bd) {
+ __m128i u[4], v[4];
+ const __m128i ones = _mm_set1_epi16(1);
+
+ transClipPixel(src, src_stride, u, bd);
+
+ v[0] = _mm_loadl_epi64((__m128i const *)dst);
+ v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
+ v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+ v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+
+ u[0] = _mm_add_epi16(u[0], v[0]);
+ u[1] = _mm_add_epi16(u[1], v[1]);
+ u[2] = _mm_add_epi16(u[2], v[2]);
+ u[3] = _mm_add_epi16(u[3], v[3]);
+
+ u[0] = _mm_add_epi16(u[0], ones);
+ u[1] = _mm_add_epi16(u[1], ones);
+ u[2] = _mm_add_epi16(u[2], ones);
+ u[3] = _mm_add_epi16(u[3], ones);
+
+ u[0] = _mm_srai_epi16(u[0], 1);
+ u[1] = _mm_srai_epi16(u[1], 1);
+ u[2] = _mm_srai_epi16(u[2], 1);
+ u[3] = _mm_srai_epi16(u[3], 1);
+
+ writePixel(u, width, pixelsNum, dst, dst_stride);
+}
+
+static TransposeSave transSaveTab[2] = { trans_save_4x4, trans_accum_save_4x4 };
+
+static INLINE void transpose_pair(__m128i *in, __m128i *out) {
+ __m128i x0, x1;
+
+ x0 = _mm_unpacklo_epi32(in[0], in[1]);
+ x1 = _mm_unpacklo_epi32(in[2], in[3]);
+
+ out[0] = _mm_unpacklo_epi64(x0, x1);
+ out[1] = _mm_unpackhi_epi64(x0, x1);
+
+ x0 = _mm_unpackhi_epi32(in[0], in[1]);
+ x1 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ out[2] = _mm_unpacklo_epi64(x0, x1);
+ out[3] = _mm_unpackhi_epi64(x0, x1);
+
+ x0 = _mm_unpacklo_epi32(in[4], in[5]);
+ x1 = _mm_unpacklo_epi32(in[6], in[7]);
+
+ out[4] = _mm_unpacklo_epi64(x0, x1);
+ out[5] = _mm_unpackhi_epi64(x0, x1);
+}
+
+static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f,
+ int tapsNum, uint32_t *buf) {
+ __m128i u[8], v[6];
+
+ if (tapsNum == 10) {
+ src -= 1;
+ }
+
+ u[0] = _mm_loadu_si128((__m128i const *)src);
+ u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
+ u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ u[4] = _mm_loadu_si128((__m128i const *)(src + 8));
+ u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8));
+ u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8));
+ u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8));
+
+ transpose_pair(u, v);
+
+ u[0] = _mm_madd_epi16(v[0], f[0]);
+ u[1] = _mm_madd_epi16(v[1], f[1]);
+ u[2] = _mm_madd_epi16(v[2], f[2]);
+ u[3] = _mm_madd_epi16(v[3], f[3]);
+ u[4] = _mm_madd_epi16(v[4], f[4]);
+ u[5] = _mm_madd_epi16(v[5], f[5]);
+
+ u[6] = _mm_min_epi32(u[2], u[3]);
+ u[7] = _mm_max_epi32(u[2], u[3]);
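+  // u[6] + u[7] == u[2] + u[3]; the min/max split fixes the order in which
+  // these partial sums enter the accumulation below.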
+
+ u[0] = _mm_add_epi32(u[0], u[1]);
+ u[0] = _mm_add_epi32(u[0], u[5]);
+ u[0] = _mm_add_epi32(u[0], u[4]);
+ u[0] = _mm_add_epi32(u[0], u[6]);
+ u[0] = _mm_add_epi32(u[0], u[7]);
+
+ _mm_storeu_si128((__m128i *)buf, u[0]);
+}
+
+void av1_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h,
+ const InterpFilterParams filter_params,
+ const int subpel_x_q4, int x_step_q4,
+ int avg, int bd) {
+ DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]);
+ __m128i verf[6];
+ HbdSubpelFilterCoeffs vCoeffs;
+ const uint16_t *srcPtr;
+ const int tapsNum = filter_params.taps;
+ int i, col, count, blkResidu, blkHeight;
+ TransposeSave transSave = transSaveTab[avg];
+ (void)x_step_q4;
+
+ if (0 == subpel_x_q4 || 16 != x_step_q4) {
+ av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_x_q4, x_step_q4, avg, bd);
+ return;
+ }
+
+ vCoeffs =
+ hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
+ if (!vCoeffs) {
+ av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_x_q4, x_step_q4, avg, bd);
+ return;
+ }
+
+ verf[0] = *((const __m128i *)(vCoeffs));
+ verf[1] = *((const __m128i *)(vCoeffs + 1));
+ verf[2] = *((const __m128i *)(vCoeffs + 2));
+ verf[3] = *((const __m128i *)(vCoeffs + 3));
+ verf[4] = *((const __m128i *)(vCoeffs + 4));
+ verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+ src -= (tapsNum >> 1) - 1;
+ srcPtr = src;
+
+ count = 0;
+ blkHeight = h >> 2;
+ blkResidu = h & 3;
+
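+  // The frame is processed in 4-row blocks: each highbd_filter_horiz call
+  // filters one horizontal position across four rows into one row of temp,
+  // and transSave transposes the finished 4x4 tile into dst.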
+ while (blkHeight != 0) {
+ for (col = 0; col < w; col += 4) {
+ for (i = 0; i < 4; ++i) {
+ highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
+ srcPtr += 1;
+ }
+ transSave(w, 0, temp, 4, dst + col, dst_stride, bd);
+ }
+ count++;
+ srcPtr = src + count * src_stride * 4;
+ dst += dst_stride * 4;
+ blkHeight--;
+ }
+
+ if (blkResidu == 0) return;
+
+ for (col = 0; col < w; col += 4) {
+ for (i = 0; i < 4; ++i) {
+ highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
+ srcPtr += 1;
+ }
+ transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd);
+ }
+}
+
+// Vertical convolutional filter
+
+typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
+
+static void highbdRndingPacks(__m128i *u) {
+ __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+ u[0] = _mm_add_epi32(u[0], rnd);
+ u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
+ u[0] = _mm_packus_epi32(u[0], u[0]);
+}
+
+static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]);
+}
+
+static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+ __m128i v = _mm_loadl_epi64((__m128i const *)dst);
+ const __m128i ones = _mm_set1_epi16(1);
+
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+
+ v = _mm_add_epi16(v, u[0]);
+ v = _mm_add_epi16(v, ones);
+ v = _mm_srai_epi16(v, 1);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(v);
+}
+
+WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum };
+
+static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+}
+
+static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+ __m128i v = _mm_loadl_epi64((__m128i const *)dst);
+ const __m128i ones = _mm_set1_epi16(1);
+
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+
+ v = _mm_add_epi16(v, u[0]);
+ v = _mm_add_epi16(v, ones);
+ v = _mm_srai_epi16(v, 1);
+ _mm_storel_epi64((__m128i *)dst, v);
+}
+
+WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum };
+
+static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
+ const __m128i *f, int taps,
+ uint16_t *dst, WritePixels saveFunc,
+ int bd) {
+ __m128i s[12];
+ __m128i zero = _mm_setzero_si128();
+ int i = 0;
+ int r = 0;
+
+ // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
+ if (10 == taps) {
+ i += 1;
+ s[0] = zero;
+ }
+ while (i < 12) {
+ s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
+ i += 1;
+ r += 1;
+ }
+
+ s[0] = _mm_unpacklo_epi16(s[0], s[1]);
+ s[2] = _mm_unpacklo_epi16(s[2], s[3]);
+ s[4] = _mm_unpacklo_epi16(s[4], s[5]);
+ s[6] = _mm_unpacklo_epi16(s[6], s[7]);
+ s[8] = _mm_unpacklo_epi16(s[8], s[9]);
+ s[10] = _mm_unpacklo_epi16(s[10], s[11]);
+
+ s[0] = _mm_madd_epi16(s[0], f[0]);
+ s[2] = _mm_madd_epi16(s[2], f[1]);
+ s[4] = _mm_madd_epi16(s[4], f[2]);
+ s[6] = _mm_madd_epi16(s[6], f[3]);
+ s[8] = _mm_madd_epi16(s[8], f[4]);
+ s[10] = _mm_madd_epi16(s[10], f[5]);
+
+ s[1] = _mm_min_epi32(s[4], s[6]);
+ s[3] = _mm_max_epi32(s[4], s[6]);
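+  // As in the horizontal filter, s[1] + s[3] == s[4] + s[6]; the min/max
+  // split fixes the accumulation order.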
+
+ s[0] = _mm_add_epi32(s[0], s[2]);
+ s[0] = _mm_add_epi32(s[0], s[10]);
+ s[0] = _mm_add_epi32(s[0], s[8]);
+ s[0] = _mm_add_epi32(s[0], s[1]);
+ s[0] = _mm_add_epi32(s[0], s[3]);
+
+ saveFunc(s, bd, dst);
+}
+
+static void highbd_filter_vert_compute_large(const uint16_t *src,
+ int src_stride, const __m128i *f,
+ int taps, int w, int h,
+ uint16_t *dst, int dst_stride,
+ int avg, int bd) {
+ int col;
+ int rowIndex = 0;
+ const uint16_t *src_ptr = src;
+ uint16_t *dst_ptr = dst;
+ const int step = 4;
+ WritePixels write4pixels = write4pixelsTab[avg];
+
+ do {
+ for (col = 0; col < w; col += step) {
+ filter_vert_horiz_parallel(src_ptr, src_stride, f, taps, dst_ptr,
+ write4pixels, bd);
+ src_ptr += step;
+ dst_ptr += step;
+ }
+ rowIndex++;
+ src_ptr = src + rowIndex * src_stride;
+ dst_ptr = dst + rowIndex * dst_stride;
+ } while (rowIndex < h);
+}
+
+static void highbd_filter_vert_compute_small(const uint16_t *src,
+ int src_stride, const __m128i *f,
+ int taps, int w, int h,
+ uint16_t *dst, int dst_stride,
+ int avg, int bd) {
+ int rowIndex = 0;
+ WritePixels write2pixels = write2pixelsTab[avg];
+ (void)w;
+
+ do {
+ filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels, bd);
+ rowIndex++;
+ src += src_stride;
+ dst += dst_stride;
+ } while (rowIndex < h);
+}
+
+void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h,
+ const InterpFilterParams filter_params,
+ const int subpel_y_q4, int y_step_q4,
+ int avg, int bd) {
+ __m128i verf[6];
+ HbdSubpelFilterCoeffs vCoeffs;
+ const int tapsNum = filter_params.taps;
+
+ if (0 == subpel_y_q4 || 16 != y_step_q4) {
+ av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_y_q4, y_step_q4, avg, bd);
+ return;
+ }
+
+ vCoeffs =
+ hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
+ if (!vCoeffs) {
+ av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_y_q4, y_step_q4, avg, bd);
+ return;
+ }
+
+ verf[0] = *((const __m128i *)(vCoeffs));
+ verf[1] = *((const __m128i *)(vCoeffs + 1));
+ verf[2] = *((const __m128i *)(vCoeffs + 2));
+ verf[3] = *((const __m128i *)(vCoeffs + 3));
+ verf[4] = *((const __m128i *)(vCoeffs + 4));
+ verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+ src -= src_stride * ((tapsNum >> 1) - 1);
+
+ if (w > 2) {
+ highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h, dst,
+ dst_stride, avg, bd);
+ } else {
+ highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h, dst,
+ dst_stride, avg, bd);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
new file mode 100644
index 000000000..af7afb7ee
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,144 @@
+#ifndef AV1_TXFM1D_SSE4_H_
+#define AV1_TXFM1D_SSE4_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+ __m128i *output) {
+ __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+ __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+ output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// The entire input block can be represented as a grid of 4x4 blocks.
+// Each 4x4 block can be represented by 4 vertical __m128i registers.
+// We first transpose each 4x4 block internally,
+// then transpose the grid.
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+ __m128i *output) {
+ const int num_per_128 = 4;
+ const int row_size = txfm_size;
+ const int col_size = txfm_size / num_per_128;
+ int r, c;
+
+ // transpose each 4x4 block internally
+ for (r = 0; r < row_size; r += 4) {
+ for (c = 0; c < col_size; c++) {
+ transpose_32_4x4(col_size, &input[r * col_size + c],
+ &output[c * 4 * col_size + r / 4]);
+ }
+ }
+}
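+
+// A scalar reference for the transpose above, as a minimal sketch
+// (hypothetical helper, illustration only): a plain txfm_size x txfm_size
+// transpose of int32_t values without the 4x4 blocking.
+static INLINE void transpose_32_scalar_sketch(int txfm_size,
+                                              const int32_t *input,
+                                              int32_t *output) {
+  int r, c;
+  for (r = 0; r < txfm_size; r++) {
+    for (c = 0; c < txfm_size; c++) {
+      output[c * txfm_size + r] = input[r * txfm_size + c];
+    }
+  }
+}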
+
+static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
+ __m128i tmp, round;
+ round = _mm_set1_epi32(1 << (bit - 1));
+ tmp = _mm_add_epi32(vec, round);
+ return _mm_srai_epi32(tmp, bit);
+}
+
+static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
+ const int size, const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = round_shift_32_sse4_1(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \
+ ww0 = _mm_set1_epi32(w0); \
+ ww1 = _mm_set1_epi32(w1); \
+ in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = round_shift_32_sse4_1(out0, bit); \
+ in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = round_shift_32_sse4_1(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \
+ ww0 = _mm_set1_epi32(w0); \
+ ww1 = _mm_set1_epi32(w1); \
+ in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = round_shift_32_sse4_1(out0, bit); \
+ in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in1_w0, in0_w1); \
+ out1 = round_shift_32_sse4_1(out1, bit); \
+ } while (0)
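+
+// In scalar terms (for reference), with round_shift(x, bit) =
+// (x + (1 << (bit - 1))) >> bit, the two butterflies compute:
+//   type0: out0 = round_shift(in0 * w0 + in1 * w1, bit)
+//          out1 = round_shift(in0 * w1 - in1 * w0, bit)
+//   type1: out0 = round_shift(in0 * w0 + in1 * w1, bit)
+//          out1 = round_shift(in1 * w0 - in0 * w1, bit)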
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AV1_TXFM1D_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c
new file mode 100644
index 000000000..4f77da446
--- /dev/null
+++ b/third_party/aom/av1/common/x86/filterintra_sse4.c
@@ -0,0 +1,898 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#if USE_3TAP_INTRA_FILTER
+void filterintra_sse4_3tap_dummy_func(void);
+void filterintra_sse4_3tap_dummy_func(void) {}
+#else
+
+static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left,
+ __m128i *sum) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u0 = _mm_unpacklo_epi8(a, zero);
+ __m128i u1 = _mm_unpacklo_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(u0, u1);
+}
+
+static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsSmall(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 4;
+ sum_value >>= 3;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsSmall(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 8;
+ sum_value >>= 4;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left,
+ __m128i *sum) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u0 = _mm_unpacklo_epi8(a, zero);
+ __m128i u1 = _mm_unpacklo_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(u0, u1);
+
+ u0 = _mm_unpackhi_epi8(a, zero);
+ u1 = _mm_unpackhi_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(sum[0], u0);
+ sum[0] = _mm_add_epi16(sum[0], u1);
+}
+
+static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsLarge(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 16;
+ sum_value >>= 5;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int GetMeanValue32x32(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector[2], u;
+ uint16_t sum_value;
+
+ AddPixelsLarge(above, left, &sum_vector[0]);
+ AddPixelsLarge(above + 16, left + 16, &sum_vector[1]);
+
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector[0], 2);
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
+
+ sum_value = _mm_extract_epi16(sum_vector[0], 0);
+ sum_value += 32;
+ sum_value >>= 6;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+// Note:
+// params[4] : the mean value, repeated in all 4 int32_t lanes
+//
+static INLINE int CalcRefPixelsMeanValue(const uint8_t *above,
+ const uint8_t *left, int bs,
+ __m128i *params) {
+ int meanValue = 0;
+ switch (bs) {
+ case 4: meanValue = GetMeanValue4x4(above, left, params); break;
+ case 8: meanValue = GetMeanValue8x8(above, left, params); break;
+ case 16: meanValue = GetMeanValue16x16(above, left, params); break;
+ case 32: meanValue = GetMeanValue32x32(above, left, params); break;
+ default: assert(0);
+ }
+ return meanValue;
+}
+
+// Note:
+// params[0-3] : 4-tap filter coefficients (int32_t per coefficient)
+//
+static INLINE void GetIntraFilterParams(int bs, int mode, __m128i *params) {
+ const TX_SIZE tx_size =
+ (bs == 32) ? TX_32X32
+ : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
+  // c0
+  params[0] = _mm_set1_epi32(av1_filter_intra_taps_4[tx_size][mode][0]);
+  // c1
+  params[1] = _mm_set1_epi32(av1_filter_intra_taps_4[tx_size][mode][1]);
+  // c2
+  params[2] = _mm_set1_epi32(av1_filter_intra_taps_4[tx_size][mode][2]);
+  // c3
+  params[3] = _mm_set1_epi32(av1_filter_intra_taps_4[tx_size][mode][3]);
+}
+
+static const int maxBlkSize = 32;
+
+static INLINE void SavePred4x4(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+ __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + predStride));
+ __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * predStride));
+ __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * predStride));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ *((int *)dst) = _mm_cvtsi128_si32(p0);
+ p0 = _mm_srli_si128(p0, 4);
+ *((int *)(dst + stride)) = _mm_cvtsi128_si32(p0);
+ p0 = _mm_srli_si128(p0, 4);
+ *((int *)(dst + 2 * stride)) = _mm_cvtsi128_si32(p0);
+ p0 = _mm_srli_si128(p0, 4);
+ *((int *)(dst + 3 * stride)) = _mm_cvtsi128_si32(p0);
+}
+
+static void SavePred8x8(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0, p1, p2, p3;
+ int r = 0;
+
+ while (r < 8) {
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ r += 1;
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += stride;
+ p0 = _mm_srli_si128(p0, 8);
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += stride;
+ r += 1;
+ }
+}
+
+static void SavePred16x16(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0, p1, p2, p3;
+ int r = 0;
+
+ while (r < 16) {
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ p0 = _mm_srli_si128(p0, 8);
+ _mm_storel_epi64((__m128i *)(dst + 8), p0);
+ dst += stride;
+ r += 1;
+ }
+}
+
+static void SavePred32x32(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+ int r = 0;
+
+ while (r < 32) {
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
+
+ p4 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 16));
+ p5 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 20));
+ p6 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 24));
+ p7 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 28));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p4 = _mm_add_epi32(p4, mean[0]);
+ p5 = _mm_add_epi32(p5, mean[0]);
+ p6 = _mm_add_epi32(p6, mean[0]);
+ p7 = _mm_add_epi32(p7, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ p4 = _mm_packus_epi32(p4, p5);
+ p5 = _mm_packus_epi32(p6, p7);
+ p4 = _mm_packus_epi16(p4, p5);
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ p0 = _mm_srli_si128(p0, 8);
+ _mm_storel_epi64((__m128i *)(dst + 8), p0);
+
+ _mm_storel_epi64((__m128i *)(dst + 16), p4);
+ p4 = _mm_srli_si128(p4, 8);
+ _mm_storel_epi64((__m128i *)(dst + 24), p4);
+
+ dst += stride;
+ r += 1;
+ }
+}
+
+static void SavePrediction(int *pred, const __m128i *mean, int bs, uint8_t *dst,
+ ptrdiff_t stride) {
+ switch (bs) {
+ case 4: SavePred4x4(pred, mean, dst, stride); break;
+ case 8: SavePred8x8(pred, mean, dst, stride); break;
+ case 16: SavePred16x16(pred, mean, dst, stride); break;
+ case 32: SavePred32x32(pred, mean, dst, stride); break;
+ default: assert(0);
+ }
+}
+
+typedef void (*ProducePixelsFunc)(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride);
+
+static void ProduceFourPixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
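+  // The three neighbor-tap products are vectorized, but the c0 * x term is
+  // a serial chain: each newly produced pixel becomes the x input for the
+  // next column, so the four results are extracted one lane at a time.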
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+
+ sum = _mm_extract_epi32(u0, 2);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 3) = x;
+
+ sum = _mm_extract_epi32(u0, 3);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 4) = x;
+}
+
+static void ProduceThreePixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+
+ sum = _mm_extract_epi32(u0, 2);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 3) = x;
+}
+
+static void ProduceTwoPixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+}
+
+static void ProduceOnePixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+}
+
+static ProducePixelsFunc prodPixelsFuncTab[4] = {
+ ProduceOnePixels, ProduceTwoPixels, ProduceThreePixels, ProduceFourPixels
+};
+
+static void ProducePixels(int *pred, const __m128i *prm, int remain) {
+ __m128i p[3];
+ const int predStride = (maxBlkSize << 1) + 1;
+ int index;
+
+ p[0] = _mm_loadu_si128((const __m128i *)pred);
+ p[1] = _mm_loadu_si128((const __m128i *)(pred + 1));
+ p[2] = _mm_loadu_si128((const __m128i *)(pred + 2));
+
+ if (remain <= 2) {
+ return;
+ }
+ if (remain > 5) {
+ index = 3;
+ } else {
+ index = remain - 3;
+ }
+ prodPixelsFuncTab[index](p, prm, pred, predStride);
+}
+
+// Note:
+// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c;
+// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1.
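+// For example, with bs = 4 and r = 0: colBound = 8, so the c-loop visits
+// c = 0 and c = 4 with remainings = 9 and 5, producing four and then three
+// new pixels (7 = 2 * bs - r - 1 in total).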
+static void GeneratePrediction(const uint8_t *above, const uint8_t *left,
+ const int bs, const __m128i *prm, int meanValue,
+ uint8_t *dst, ptrdiff_t stride) {
+ int pred[33][65];
+ int r, c, colBound;
+ int remainings;
+
+ for (r = 0; r < bs; ++r) {
+ pred[r + 1][0] = (int)left[r] - meanValue;
+ }
+
+ above -= 1;
+ for (c = 0; c < 2 * bs + 1; ++c) {
+ pred[0][c] = (int)above[c] - meanValue;
+ }
+
+ r = 0;
+ c = 0;
+ while (r < bs) {
+ colBound = (bs << 1) - r;
+ for (c = 0; c < colBound; c += 4) {
+ remainings = colBound - c + 1;
+ ProducePixels(&pred[r][c], prm, remainings);
+ }
+ r += 1;
+ }
+
+ SavePrediction(&pred[1][1], &prm[4], bs, dst, stride);
+}
+
+static void FilterPrediction(const uint8_t *above, const uint8_t *left, int bs,
+ __m128i *prm, uint8_t *dst, ptrdiff_t stride) {
+ int meanValue = 0;
+ meanValue = CalcRefPixelsMeanValue(above, left, bs, &prm[4]);
+ GeneratePrediction(above, left, bs, prm, meanValue, dst, stride);
+}
+
+void av1_dc_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, DC_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_v_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, V_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_h_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, H_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d45_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D45_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d135_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D135_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d117_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D117_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d153_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D153_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d207_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D207_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d63_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D63_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, TM_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+// ============== High Bit Depth ==============
+#if CONFIG_HIGHBITDEPTH
+static INLINE int HighbdGetMeanValue4x4(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+ (void)bd;
+
+ sum_vector = _mm_add_epi16(a, l);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 4;
+ sum_value >>= 3;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int HighbdGetMeanValue8x8(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+ (void)bd;
+
+ sum_vector = _mm_add_epi16(a, l);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 8;
+ sum_value >>= 4;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+// Note:
+// Process 16 pixels each from above and left at 10-bit depth; the second
+// group of 8 pixels is accumulated onto the sum of the first 8.
+static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left,
+ __m128i *sum) {
+ __m128i a = _mm_loadu_si128((const __m128i *)above);
+ __m128i l = _mm_loadu_si128((const __m128i *)left);
+ sum[0] = _mm_add_epi16(a, l);
+ a = _mm_loadu_si128((const __m128i *)(above + 8));
+ l = _mm_loadu_si128((const __m128i *)(left + 8));
+ sum[0] = _mm_add_epi16(sum[0], a);
+ sum[0] = _mm_add_epi16(sum[0], l);
+}
+
+// Note:
+// Process 16 pixels each from above and left at 12-bit depth; values are
+// widened to 32-bit lanes before accumulation.
+static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left,
+ __m128i *sum) {
+ __m128i a = _mm_loadu_si128((const __m128i *)above);
+ __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i v0, v1;
+
+ v0 = _mm_unpacklo_epi16(a, zero);
+ v1 = _mm_unpacklo_epi16(l, zero);
+ sum[0] = _mm_add_epi32(v0, v1);
+
+ v0 = _mm_unpackhi_epi16(a, zero);
+ v1 = _mm_unpackhi_epi16(l, zero);
+ sum[0] = _mm_add_epi32(sum[0], v0);
+ sum[0] = _mm_add_epi32(sum[0], v1);
+
+ a = _mm_loadu_si128((const __m128i *)(above + 8));
+ l = _mm_loadu_si128((const __m128i *)(left + 8));
+
+ v0 = _mm_unpacklo_epi16(a, zero);
+ v1 = _mm_unpacklo_epi16(l, zero);
+ sum[0] = _mm_add_epi32(sum[0], v0);
+ sum[0] = _mm_add_epi32(sum[0], v1);
+
+ v0 = _mm_unpackhi_epi16(a, zero);
+ v1 = _mm_unpackhi_epi16(l, zero);
+ sum[0] = _mm_add_epi32(sum[0], v0);
+ sum[0] = _mm_add_epi32(sum[0], v1);
+}
+
+static INLINE int HighbdGetMeanValue16x16(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint32_t sum_value = 0;
+
+ if (10 == bd) {
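+    // At 10 bits, the 32 reference pixels sum to at most 32 * 1023 = 32736,
+    // so 16-bit lanes are wide enough; the 12-bit path below widens to
+    // 32-bit lanes instead.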
+ AddPixels10bit(above, left, &sum_vector);
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ } else if (12 == bd) {
+ AddPixels12bit(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi32(sum_vector, zero);
+ u = _mm_srli_si128(sum_vector, 4);
+ sum_vector = _mm_add_epi32(u, sum_vector);
+ sum_value = _mm_extract_epi32(sum_vector, 0);
+ }
+
+ sum_value += 16;
+ sum_value >>= 5;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int HighbdGetMeanValue32x32(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector[2], u;
+ uint32_t sum_value = 0;
+
+ if (10 == bd) {
+ AddPixels10bit(above, left, &sum_vector[0]);
+ AddPixels10bit(above + 16, left + 16, &sum_vector[1]);
+
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector[0], 2);
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
+ sum_value = _mm_extract_epi16(sum_vector[0], 0);
+ } else if (12 == bd) {
+ AddPixels12bit(above, left, &sum_vector[0]);
+ AddPixels12bit(above + 16, left + 16, &sum_vector[1]);
+
+ sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]);
+ sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero);
+ u = _mm_srli_si128(sum_vector[0], 4);
+ sum_vector[0] = _mm_add_epi32(u, sum_vector[0]);
+ sum_value = _mm_extract_epi32(sum_vector[0], 0);
+ }
+
+ sum_value += 32;
+ sum_value >>= 6;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+// Note:
+// params[4] : the mean value, repeated in all 4 int32_t lanes
+//
+static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above,
+ const uint16_t *left, int bs,
+ const int bd, __m128i *params) {
+ int meanValue = 0;
+ switch (bs) {
+ case 4: meanValue = HighbdGetMeanValue4x4(above, left, bd, params); break;
+ case 8: meanValue = HighbdGetMeanValue8x8(above, left, bd, params); break;
+ case 16:
+ meanValue = HighbdGetMeanValue16x16(above, left, bd, params);
+ break;
+ case 32:
+ meanValue = HighbdGetMeanValue32x32(above, left, bd, params);
+ break;
+ default: assert(0);
+ }
+ return meanValue;
+}
+
+// Note:
+// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c;
+// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1.
+static void HighbdGeneratePrediction(const uint16_t *above,
+ const uint16_t *left, const int bs,
+ const int bd, const __m128i *prm,
+ int meanValue, uint16_t *dst,
+ ptrdiff_t stride) {
+ int pred[33][65];
+ int r, c, colBound;
+ int remainings;
+ int ipred;
+
+ for (r = 0; r < bs; ++r) {
+ pred[r + 1][0] = (int)left[r] - meanValue;
+ }
+
+ above -= 1;
+ for (c = 0; c < 2 * bs + 1; ++c) {
+ pred[0][c] = (int)above[c] - meanValue;
+ }
+
+ r = 0;
+ c = 0;
+ while (r < bs) {
+ colBound = (bs << 1) - r;
+ for (c = 0; c < colBound; c += 4) {
+ remainings = colBound - c + 1;
+ ProducePixels(&pred[r][c], prm, remainings);
+ }
+ r += 1;
+ }
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ ipred = pred[r + 1][c + 1] + meanValue;
+ dst[c] = clip_pixel_highbd(ipred, bd);
+ }
+ dst += stride;
+ }
+}
+
+static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left,
+ int bs, const int bd, __m128i *prm,
+ uint16_t *dst, ptrdiff_t stride) {
+ int meanValue = 0;
+ meanValue = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, &prm[4]);
+ HighbdGeneratePrediction(above, left, bs, bd, prm, meanValue, dst, stride);
+}
+
+void av1_highbd_dc_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, DC_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_v_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, V_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_h_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, H_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d45_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D45_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d135_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D135_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d117_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D117_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d153_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D153_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d207_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D207_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d63_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D63_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_tm_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, TM_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+#endif // USE_3TAP_INTRA_FILTER
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
new file mode 100644
index 000000000..d10f1ccc2
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -0,0 +1,557 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+
+// Note:
+// 32x4 = 128 __m256i registers hold the coefficients of a 32x32 block.
+// For high bit depth each coefficient is 4 bytes, and each __m256i
+// register holds 8 coefficients, so every row needs 4 registers and
+// there are 32 rows in total.
+// Register layout:
+// v0, v1, v2, v3,
+// v4, v5, v6, v7,
+// ... ...
+// v124, v125, v126, v127
+
+static void transpose_32x32_8x8(const __m256i *in, __m256i *out) {
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i x0, x1;
+
+ u0 = _mm256_unpacklo_epi32(in[0], in[4]);
+ u1 = _mm256_unpackhi_epi32(in[0], in[4]);
+
+ u2 = _mm256_unpacklo_epi32(in[8], in[12]);
+ u3 = _mm256_unpackhi_epi32(in[8], in[12]);
+
+ u4 = _mm256_unpacklo_epi32(in[16], in[20]);
+ u5 = _mm256_unpackhi_epi32(in[16], in[20]);
+
+ u6 = _mm256_unpacklo_epi32(in[24], in[28]);
+ u7 = _mm256_unpackhi_epi32(in[24], in[28]);
+
+ x0 = _mm256_unpacklo_epi64(u0, u2);
+ x1 = _mm256_unpacklo_epi64(u4, u6);
+ out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[16] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u0, u2);
+ x1 = _mm256_unpackhi_epi64(u4, u6);
+ out[4] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[20] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpacklo_epi64(u1, u3);
+ x1 = _mm256_unpacklo_epi64(u5, u7);
+ out[8] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[24] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u1, u3);
+ x1 = _mm256_unpackhi_epi64(u5, u7);
+ out[12] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[28] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
+
+static void transpose_32x32_16x16(const __m256i *in, __m256i *out) {
+ transpose_32x32_8x8(&in[0], &out[0]);
+ transpose_32x32_8x8(&in[1], &out[32]);
+ transpose_32x32_8x8(&in[32], &out[1]);
+ transpose_32x32_8x8(&in[33], &out[33]);
+}
+
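+// The 32x32 transpose is assembled from 16x16 quadrants: the diagonal
+// quadrants transpose in place and the off-diagonal quadrants swap.
+// Offsets are in units of __m256i with 4 registers per row, so quadrant
+// (0,1) starts at offset 2 and quadrant (1,0) at offset 64.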
+static void transpose_32x32(const __m256i *in, __m256i *out) {
+ transpose_32x32_16x16(&in[0], &out[0]);
+ transpose_32x32_16x16(&in[2], &out[64]);
+ transpose_32x32_16x16(&in[64], &out[2]);
+ transpose_32x32_16x16(&in[66], &out[66]);
+}
+
+static void load_buffer_32x32(const int32_t *coeff, __m256i *in) {
+ int i;
+ for (i = 0; i < 128; ++i) {
+ in[i] = _mm256_loadu_si256((const __m256i *)coeff);
+ coeff += 8;
+ }
+}
+
+static void round_shift_32x32(__m256i *in, int shift) {
+ __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
+ int i = 0;
+
+ while (i < 128) {
+ in[i] = _mm256_add_epi32(in[i], rnding);
+ in[i] = _mm256_srai_epi32(in[i], shift);
+ i++;
+ }
+}
+
+static __m256i highbd_clamp_epi32(__m256i x, int bd) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
+ __m256i clamped, mask;
+
+ mask = _mm256_cmpgt_epi16(x, max);
+ clamped = _mm256_andnot_si256(mask, x);
+ mask = _mm256_and_si256(mask, max);
+ clamped = _mm256_or_si256(mask, clamped);
+ mask = _mm256_cmpgt_epi16(clamped, zero);
+ clamped = _mm256_and_si256(clamped, mask);
+
+ return clamped;
+}
+
+static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m256i u0, u1, x0, x1, x2, x3, v0, v1, v2, v3;
+ const __m256i zero = _mm256_setzero_si256();
+ int i = 0;
+ (void)fliplr;
+ (void)flipud;
+
+ round_shift_32x32(in, shift);
+
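+  // Each iteration reconstructs one 32-pixel row: the 16-bit predictor
+  // already in output is widened to 32 bits, the shifted transform values
+  // are added on top, and the sums are packed and clamped back to bd-bit
+  // pixels.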
+ while (i < 128) {
+ u0 = _mm256_loadu_si256((const __m256i *)output);
+ u1 = _mm256_loadu_si256((const __m256i *)(output + 16));
+
+ x0 = _mm256_unpacklo_epi16(u0, zero);
+ x1 = _mm256_unpackhi_epi16(u0, zero);
+ x2 = _mm256_unpacklo_epi16(u1, zero);
+ x3 = _mm256_unpackhi_epi16(u1, zero);
+
+ v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20);
+ v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31);
+ v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20);
+ v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31);
+
+ v0 = _mm256_add_epi32(v0, x0);
+ v1 = _mm256_add_epi32(v1, x1);
+ v2 = _mm256_add_epi32(v2, x2);
+ v3 = _mm256_add_epi32(v3, x3);
+
+ v0 = _mm256_packus_epi32(v0, v1);
+ v2 = _mm256_packus_epi32(v2, v3);
+
+ v0 = highbd_clamp_epi32(v0, bd);
+ v2 = highbd_clamp_epi32(v2, bd);
+
+ _mm256_storeu_si256((__m256i *)output, v0);
+ _mm256_storeu_si256((__m256i *)(output + 16), v2);
+ output += stride;
+ i += 4;
+ }
+}
+
+static INLINE __m256i half_btf_avx2(__m256i w0, __m256i n0, __m256i w1,
+ __m256i n1, __m256i rounding, int bit) {
+ __m256i x, y;
+
+ x = _mm256_mullo_epi32(w0, n0);
+ y = _mm256_mullo_epi32(w1, n1);
+ x = _mm256_add_epi32(x, y);
+ x = _mm256_add_epi32(x, rounding);
+ x = _mm256_srai_epi32(x, bit);
+ return x;
+}
+
+static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ __m256i bf1[32], bf0[32];
+ int col;
+
+ for (col = 0; col < 4; ++col) {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0 * 4 + col];
+ bf1[1] = in[16 * 4 + col];
+ bf1[2] = in[8 * 4 + col];
+ bf1[3] = in[24 * 4 + col];
+ bf1[4] = in[4 * 4 + col];
+ bf1[5] = in[20 * 4 + col];
+ bf1[6] = in[12 * 4 + col];
+ bf1[7] = in[28 * 4 + col];
+ bf1[8] = in[2 * 4 + col];
+ bf1[9] = in[18 * 4 + col];
+ bf1[10] = in[10 * 4 + col];
+ bf1[11] = in[26 * 4 + col];
+ bf1[12] = in[6 * 4 + col];
+ bf1[13] = in[22 * 4 + col];
+ bf1[14] = in[14 * 4 + col];
+ bf1[15] = in[30 * 4 + col];
+ bf1[16] = in[1 * 4 + col];
+ bf1[17] = in[17 * 4 + col];
+ bf1[18] = in[9 * 4 + col];
+ bf1[19] = in[25 * 4 + col];
+ bf1[20] = in[5 * 4 + col];
+ bf1[21] = in[21 * 4 + col];
+ bf1[22] = in[13 * 4 + col];
+ bf1[23] = in[29 * 4 + col];
+ bf1[24] = in[3 * 4 + col];
+ bf1[25] = in[19 * 4 + col];
+ bf1[26] = in[11 * 4 + col];
+ bf1[27] = in[27 * 4 + col];
+ bf1[28] = in[7 * 4 + col];
+ bf1[29] = in[23 * 4 + col];
+ bf1[30] = in[15 * 4 + col];
+ bf1[31] = in[31 * 4 + col];
+
+ // stage 2
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] = half_btf_avx2(cospi62, bf1[16], cospim2, bf1[31], rounding, bit);
+ bf0[17] = half_btf_avx2(cospi30, bf1[17], cospim34, bf1[30], rounding, bit);
+ bf0[18] = half_btf_avx2(cospi46, bf1[18], cospim18, bf1[29], rounding, bit);
+ bf0[19] = half_btf_avx2(cospi14, bf1[19], cospim50, bf1[28], rounding, bit);
+ bf0[20] = half_btf_avx2(cospi54, bf1[20], cospim10, bf1[27], rounding, bit);
+ bf0[21] = half_btf_avx2(cospi22, bf1[21], cospim42, bf1[26], rounding, bit);
+ bf0[22] = half_btf_avx2(cospi38, bf1[22], cospim26, bf1[25], rounding, bit);
+ bf0[23] = half_btf_avx2(cospi6, bf1[23], cospim58, bf1[24], rounding, bit);
+ bf0[24] = half_btf_avx2(cospi58, bf1[23], cospi6, bf1[24], rounding, bit);
+ bf0[25] = half_btf_avx2(cospi26, bf1[22], cospi38, bf1[25], rounding, bit);
+ bf0[26] = half_btf_avx2(cospi42, bf1[21], cospi22, bf1[26], rounding, bit);
+ bf0[27] = half_btf_avx2(cospi10, bf1[20], cospi54, bf1[27], rounding, bit);
+ bf0[28] = half_btf_avx2(cospi50, bf1[19], cospi14, bf1[28], rounding, bit);
+ bf0[29] = half_btf_avx2(cospi18, bf1[18], cospi46, bf1[29], rounding, bit);
+ bf0[30] = half_btf_avx2(cospi34, bf1[17], cospi30, bf1[30], rounding, bit);
+ bf0[31] = half_btf_avx2(cospi2, bf1[16], cospi62, bf1[31], rounding, bit);
+
+ // stage 3
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf_avx2(cospi60, bf0[8], cospim4, bf0[15], rounding, bit);
+ bf1[9] = half_btf_avx2(cospi28, bf0[9], cospim36, bf0[14], rounding, bit);
+ bf1[10] = half_btf_avx2(cospi44, bf0[10], cospim20, bf0[13], rounding, bit);
+ bf1[11] = half_btf_avx2(cospi12, bf0[11], cospim52, bf0[12], rounding, bit);
+ bf1[12] = half_btf_avx2(cospi52, bf0[11], cospi12, bf0[12], rounding, bit);
+ bf1[13] = half_btf_avx2(cospi20, bf0[10], cospi44, bf0[13], rounding, bit);
+ bf1[14] = half_btf_avx2(cospi36, bf0[9], cospi28, bf0[14], rounding, bit);
+ bf1[15] = half_btf_avx2(cospi4, bf0[8], cospi60, bf0[15], rounding, bit);
+ bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]);
+ bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]);
+ bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]);
+ bf1[19] = _mm256_add_epi32(bf0[18], bf0[19]);
+ bf1[20] = _mm256_add_epi32(bf0[20], bf0[21]);
+ bf1[21] = _mm256_sub_epi32(bf0[20], bf0[21]);
+ bf1[22] = _mm256_sub_epi32(bf0[23], bf0[22]);
+ bf1[23] = _mm256_add_epi32(bf0[22], bf0[23]);
+ bf1[24] = _mm256_add_epi32(bf0[24], bf0[25]);
+ bf1[25] = _mm256_sub_epi32(bf0[24], bf0[25]);
+ bf1[26] = _mm256_sub_epi32(bf0[27], bf0[26]);
+ bf1[27] = _mm256_add_epi32(bf0[26], bf0[27]);
+ bf1[28] = _mm256_add_epi32(bf0[28], bf0[29]);
+ bf1[29] = _mm256_sub_epi32(bf0[28], bf0[29]);
+ bf1[30] = _mm256_sub_epi32(bf0[31], bf0[30]);
+ bf1[31] = _mm256_add_epi32(bf0[30], bf0[31]);
+
+ // stage 4
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = half_btf_avx2(cospi56, bf1[4], cospim8, bf1[7], rounding, bit);
+ bf0[5] = half_btf_avx2(cospi24, bf1[5], cospim40, bf1[6], rounding, bit);
+ bf0[6] = half_btf_avx2(cospi40, bf1[5], cospi24, bf1[6], rounding, bit);
+ bf0[7] = half_btf_avx2(cospi8, bf1[4], cospi56, bf1[7], rounding, bit);
+ bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]);
+ bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]);
+ bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]);
+ bf0[11] = _mm256_add_epi32(bf1[10], bf1[11]);
+ bf0[12] = _mm256_add_epi32(bf1[12], bf1[13]);
+ bf0[13] = _mm256_sub_epi32(bf1[12], bf1[13]);
+ bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]);
+ bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]);
+ bf0[16] = bf1[16];
+ bf0[17] = half_btf_avx2(cospim8, bf1[17], cospi56, bf1[30], rounding, bit);
+ bf0[18] = half_btf_avx2(cospim56, bf1[18], cospim8, bf1[29], rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] = half_btf_avx2(cospim40, bf1[21], cospi24, bf1[26], rounding, bit);
+    bf0[22] =
+        half_btf_avx2(cospim24, bf1[22], cospim40, bf1[25], rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = half_btf_avx2(cospim40, bf1[22], cospi24, bf1[25], rounding, bit);
+ bf0[26] = half_btf_avx2(cospi24, bf1[21], cospi40, bf1[26], rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] = half_btf_avx2(cospim8, bf1[18], cospi56, bf1[29], rounding, bit);
+ bf0[30] = half_btf_avx2(cospi56, bf1[17], cospi8, bf1[30], rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5
+ bf1[0] = half_btf_avx2(cospi32, bf0[0], cospi32, bf0[1], rounding, bit);
+ bf1[1] = half_btf_avx2(cospi32, bf0[0], cospim32, bf0[1], rounding, bit);
+ bf1[2] = half_btf_avx2(cospi48, bf0[2], cospim16, bf0[3], rounding, bit);
+ bf1[3] = half_btf_avx2(cospi16, bf0[2], cospi48, bf0[3], rounding, bit);
+ bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]);
+ bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]);
+ bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]);
+ bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf_avx2(cospim16, bf0[9], cospi48, bf0[14], rounding, bit);
+ bf1[10] =
+ half_btf_avx2(cospim48, bf0[10], cospim16, bf0[13], rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf_avx2(cospim16, bf0[10], cospi48, bf0[13], rounding, bit);
+ bf1[14] = half_btf_avx2(cospi48, bf0[9], cospi16, bf0[14], rounding, bit);
+ bf1[15] = bf0[15];
+ bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]);
+ bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]);
+ bf1[18] = _mm256_sub_epi32(bf0[17], bf0[18]);
+ bf1[19] = _mm256_sub_epi32(bf0[16], bf0[19]);
+ bf1[20] = _mm256_sub_epi32(bf0[23], bf0[20]);
+ bf1[21] = _mm256_sub_epi32(bf0[22], bf0[21]);
+ bf1[22] = _mm256_add_epi32(bf0[21], bf0[22]);
+ bf1[23] = _mm256_add_epi32(bf0[20], bf0[23]);
+ bf1[24] = _mm256_add_epi32(bf0[24], bf0[27]);
+ bf1[25] = _mm256_add_epi32(bf0[25], bf0[26]);
+ bf1[26] = _mm256_sub_epi32(bf0[25], bf0[26]);
+ bf1[27] = _mm256_sub_epi32(bf0[24], bf0[27]);
+ bf1[28] = _mm256_sub_epi32(bf0[31], bf0[28]);
+ bf1[29] = _mm256_sub_epi32(bf0[30], bf0[29]);
+ bf1[30] = _mm256_add_epi32(bf0[29], bf0[30]);
+ bf1[31] = _mm256_add_epi32(bf0[28], bf0[31]);
+
+ // stage 6
+ bf0[0] = _mm256_add_epi32(bf1[0], bf1[3]);
+ bf0[1] = _mm256_add_epi32(bf1[1], bf1[2]);
+ bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]);
+ bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]);
+ bf0[4] = bf1[4];
+ bf0[5] = half_btf_avx2(cospim32, bf1[5], cospi32, bf1[6], rounding, bit);
+ bf0[6] = half_btf_avx2(cospi32, bf1[5], cospi32, bf1[6], rounding, bit);
+ bf0[7] = bf1[7];
+ bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]);
+ bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]);
+ bf0[10] = _mm256_sub_epi32(bf1[9], bf1[10]);
+ bf0[11] = _mm256_sub_epi32(bf1[8], bf1[11]);
+ bf0[12] = _mm256_sub_epi32(bf1[15], bf1[12]);
+ bf0[13] = _mm256_sub_epi32(bf1[14], bf1[13]);
+ bf0[14] = _mm256_add_epi32(bf1[13], bf1[14]);
+ bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = half_btf_avx2(cospim16, bf1[18], cospi48, bf1[29], rounding, bit);
+ bf0[19] = half_btf_avx2(cospim16, bf1[19], cospi48, bf1[28], rounding, bit);
+ bf0[20] =
+ half_btf_avx2(cospim48, bf1[20], cospim16, bf1[27], rounding, bit);
+ bf0[21] =
+ half_btf_avx2(cospim48, bf1[21], cospim16, bf1[26], rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] = half_btf_avx2(cospim16, bf1[21], cospi48, bf1[26], rounding, bit);
+ bf0[27] = half_btf_avx2(cospim16, bf1[20], cospi48, bf1[27], rounding, bit);
+ bf0[28] = half_btf_avx2(cospi48, bf1[19], cospi16, bf1[28], rounding, bit);
+ bf0[29] = half_btf_avx2(cospi48, bf1[18], cospi16, bf1[29], rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7
+ bf1[0] = _mm256_add_epi32(bf0[0], bf0[7]);
+ bf1[1] = _mm256_add_epi32(bf0[1], bf0[6]);
+ bf1[2] = _mm256_add_epi32(bf0[2], bf0[5]);
+ bf1[3] = _mm256_add_epi32(bf0[3], bf0[4]);
+ bf1[4] = _mm256_sub_epi32(bf0[3], bf0[4]);
+ bf1[5] = _mm256_sub_epi32(bf0[2], bf0[5]);
+ bf1[6] = _mm256_sub_epi32(bf0[1], bf0[6]);
+ bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf_avx2(cospim32, bf0[10], cospi32, bf0[13], rounding, bit);
+ bf1[11] = half_btf_avx2(cospim32, bf0[11], cospi32, bf0[12], rounding, bit);
+ bf1[12] = half_btf_avx2(cospi32, bf0[11], cospi32, bf0[12], rounding, bit);
+ bf1[13] = half_btf_avx2(cospi32, bf0[10], cospi32, bf0[13], rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]);
+ bf1[17] = _mm256_add_epi32(bf0[17], bf0[22]);
+ bf1[18] = _mm256_add_epi32(bf0[18], bf0[21]);
+ bf1[19] = _mm256_add_epi32(bf0[19], bf0[20]);
+ bf1[20] = _mm256_sub_epi32(bf0[19], bf0[20]);
+ bf1[21] = _mm256_sub_epi32(bf0[18], bf0[21]);
+ bf1[22] = _mm256_sub_epi32(bf0[17], bf0[22]);
+ bf1[23] = _mm256_sub_epi32(bf0[16], bf0[23]);
+ bf1[24] = _mm256_sub_epi32(bf0[31], bf0[24]);
+ bf1[25] = _mm256_sub_epi32(bf0[30], bf0[25]);
+ bf1[26] = _mm256_sub_epi32(bf0[29], bf0[26]);
+ bf1[27] = _mm256_sub_epi32(bf0[28], bf0[27]);
+ bf1[28] = _mm256_add_epi32(bf0[27], bf0[28]);
+ bf1[29] = _mm256_add_epi32(bf0[26], bf0[29]);
+ bf1[30] = _mm256_add_epi32(bf0[25], bf0[30]);
+ bf1[31] = _mm256_add_epi32(bf0[24], bf0[31]);
+
+ // stage 8
+ bf0[0] = _mm256_add_epi32(bf1[0], bf1[15]);
+ bf0[1] = _mm256_add_epi32(bf1[1], bf1[14]);
+ bf0[2] = _mm256_add_epi32(bf1[2], bf1[13]);
+ bf0[3] = _mm256_add_epi32(bf1[3], bf1[12]);
+ bf0[4] = _mm256_add_epi32(bf1[4], bf1[11]);
+ bf0[5] = _mm256_add_epi32(bf1[5], bf1[10]);
+ bf0[6] = _mm256_add_epi32(bf1[6], bf1[9]);
+ bf0[7] = _mm256_add_epi32(bf1[7], bf1[8]);
+ bf0[8] = _mm256_sub_epi32(bf1[7], bf1[8]);
+ bf0[9] = _mm256_sub_epi32(bf1[6], bf1[9]);
+ bf0[10] = _mm256_sub_epi32(bf1[5], bf1[10]);
+ bf0[11] = _mm256_sub_epi32(bf1[4], bf1[11]);
+ bf0[12] = _mm256_sub_epi32(bf1[3], bf1[12]);
+ bf0[13] = _mm256_sub_epi32(bf1[2], bf1[13]);
+ bf0[14] = _mm256_sub_epi32(bf1[1], bf1[14]);
+ bf0[15] = _mm256_sub_epi32(bf1[0], bf1[15]);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] = half_btf_avx2(cospim32, bf1[20], cospi32, bf1[27], rounding, bit);
+ bf0[21] = half_btf_avx2(cospim32, bf1[21], cospi32, bf1[26], rounding, bit);
+ bf0[22] = half_btf_avx2(cospim32, bf1[22], cospi32, bf1[25], rounding, bit);
+ bf0[23] = half_btf_avx2(cospim32, bf1[23], cospi32, bf1[24], rounding, bit);
+ bf0[24] = half_btf_avx2(cospi32, bf1[23], cospi32, bf1[24], rounding, bit);
+ bf0[25] = half_btf_avx2(cospi32, bf1[22], cospi32, bf1[25], rounding, bit);
+ bf0[26] = half_btf_avx2(cospi32, bf1[21], cospi32, bf1[26], rounding, bit);
+ bf0[27] = half_btf_avx2(cospi32, bf1[20], cospi32, bf1[27], rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9
+ out[0 * 4 + col] = _mm256_add_epi32(bf0[0], bf0[31]);
+ out[1 * 4 + col] = _mm256_add_epi32(bf0[1], bf0[30]);
+ out[2 * 4 + col] = _mm256_add_epi32(bf0[2], bf0[29]);
+ out[3 * 4 + col] = _mm256_add_epi32(bf0[3], bf0[28]);
+ out[4 * 4 + col] = _mm256_add_epi32(bf0[4], bf0[27]);
+ out[5 * 4 + col] = _mm256_add_epi32(bf0[5], bf0[26]);
+ out[6 * 4 + col] = _mm256_add_epi32(bf0[6], bf0[25]);
+ out[7 * 4 + col] = _mm256_add_epi32(bf0[7], bf0[24]);
+ out[8 * 4 + col] = _mm256_add_epi32(bf0[8], bf0[23]);
+ out[9 * 4 + col] = _mm256_add_epi32(bf0[9], bf0[22]);
+ out[10 * 4 + col] = _mm256_add_epi32(bf0[10], bf0[21]);
+ out[11 * 4 + col] = _mm256_add_epi32(bf0[11], bf0[20]);
+ out[12 * 4 + col] = _mm256_add_epi32(bf0[12], bf0[19]);
+ out[13 * 4 + col] = _mm256_add_epi32(bf0[13], bf0[18]);
+ out[14 * 4 + col] = _mm256_add_epi32(bf0[14], bf0[17]);
+ out[15 * 4 + col] = _mm256_add_epi32(bf0[15], bf0[16]);
+ out[16 * 4 + col] = _mm256_sub_epi32(bf0[15], bf0[16]);
+ out[17 * 4 + col] = _mm256_sub_epi32(bf0[14], bf0[17]);
+ out[18 * 4 + col] = _mm256_sub_epi32(bf0[13], bf0[18]);
+ out[19 * 4 + col] = _mm256_sub_epi32(bf0[12], bf0[19]);
+ out[20 * 4 + col] = _mm256_sub_epi32(bf0[11], bf0[20]);
+ out[21 * 4 + col] = _mm256_sub_epi32(bf0[10], bf0[21]);
+ out[22 * 4 + col] = _mm256_sub_epi32(bf0[9], bf0[22]);
+ out[23 * 4 + col] = _mm256_sub_epi32(bf0[8], bf0[23]);
+ out[24 * 4 + col] = _mm256_sub_epi32(bf0[7], bf0[24]);
+ out[25 * 4 + col] = _mm256_sub_epi32(bf0[6], bf0[25]);
+ out[26 * 4 + col] = _mm256_sub_epi32(bf0[5], bf0[26]);
+ out[27 * 4 + col] = _mm256_sub_epi32(bf0[4], bf0[27]);
+ out[28 * 4 + col] = _mm256_sub_epi32(bf0[3], bf0[28]);
+ out[29 * 4 + col] = _mm256_sub_epi32(bf0[2], bf0[29]);
+ out[30 * 4 + col] = _mm256_sub_epi32(bf0[1], bf0[30]);
+ out[31 * 4 + col] = _mm256_sub_epi32(bf0[0], bf0[31]);
+ }
+}
+
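+// 2-D flow: load, transpose, row transform, intermediate round shift by
+// shift[0], transpose back, column transform; write_buffer_32x32 applies
+// the final shift and adds the prediction.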
+void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ __m256i in[128], out[128];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &inv_txfm_2d_cfg_dct_dct_32;
+ load_buffer_32x32(coeff, in);
+ transpose_32x32(in, out);
+ idct32_avx2(out, in, cfg->cos_bit_row[2]);
+ round_shift_32x32(in, -cfg->shift[0]);
+ transpose_32x32(in, out);
+ idct32_avx2(out, in, cfg->cos_bit_col[2]);
+ write_buffer_32x32(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ default: assert(0);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
new file mode 100644
index 000000000..24b2760b9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -0,0 +1,1398 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+
+static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+ in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+ in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+ in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+}
+
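+// 4-point inverse DCT. The unpacklo/unpackhi pairs followed by the epi64
+// unpacks transpose the 4x4 block in-register, so applying this function
+// twice covers both the row and the column pass without a separate
+// transpose step.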
+static void idct4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3, x, y;
+
+ v0 = _mm_unpacklo_epi32(in[0], in[1]);
+ v1 = _mm_unpackhi_epi32(in[0], in[1]);
+ v2 = _mm_unpacklo_epi32(in[2], in[3]);
+ v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ u0 = _mm_unpacklo_epi64(v0, v2);
+ u1 = _mm_unpackhi_epi64(v0, v2);
+ u2 = _mm_unpacklo_epi64(v1, v3);
+ u3 = _mm_unpackhi_epi64(v1, v3);
+
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u2, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u1, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u1, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ in[0] = _mm_add_epi32(v0, v3);
+ in[1] = _mm_add_epi32(v1, v2);
+ in[2] = _mm_sub_epi32(v1, v2);
+ in[3] = _mm_sub_epi32(v0, v3);
+}
+
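+// 4-point inverse ADST. Uses the same in-register transpose as
+// idct4x4_sse4_1; stage 1 negates inputs 1 and 3, and stage 4 writes the
+// outputs in permuted order.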
+static void iadst4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3, x, y;
+
+ v0 = _mm_unpacklo_epi32(in[0], in[1]);
+ v1 = _mm_unpackhi_epi32(in[0], in[1]);
+ v2 = _mm_unpacklo_epi32(in[2], in[3]);
+ v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ u0 = _mm_unpacklo_epi64(v0, v2);
+ u1 = _mm_unpackhi_epi64(v0, v2);
+ u2 = _mm_unpacklo_epi64(v1, v3);
+ u3 = _mm_unpackhi_epi64(v1, v3);
+
+ // stage 0
+ // stage 1
+ u1 = _mm_sub_epi32(zero, u1);
+ u3 = _mm_sub_epi32(zero, u3);
+
+ // stage 2
+ v0 = u0;
+ v1 = u3;
+ x = _mm_mullo_epi32(u1, cospi32);
+ y = _mm_mullo_epi32(u2, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ // stage 3
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+
+ // stage 4
+ x = _mm_mullo_epi32(u0, cospi8);
+ y = _mm_mullo_epi32(u1, cospi56);
+ in[3] = _mm_add_epi32(x, y);
+ in[3] = _mm_add_epi32(in[3], rnding);
+ in[3] = _mm_srai_epi32(in[3], bit);
+
+ x = _mm_mullo_epi32(u0, cospi56);
+ y = _mm_mullo_epi32(u1, cospim8);
+ in[0] = _mm_add_epi32(x, y);
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[0] = _mm_srai_epi32(in[0], bit);
+
+ x = _mm_mullo_epi32(u2, cospi40);
+ y = _mm_mullo_epi32(u3, cospi24);
+ in[1] = _mm_add_epi32(x, y);
+ in[1] = _mm_add_epi32(in[1], rnding);
+ in[1] = _mm_srai_epi32(in[1], bit);
+
+ x = _mm_mullo_epi32(u2, cospi24);
+ y = _mm_mullo_epi32(u3, cospim40);
+ in[2] = _mm_add_epi32(x, y);
+ in[2] = _mm_add_epi32(in[2], rnding);
+ in[2] = _mm_srai_epi32(in[2], bit);
+}
+
+static INLINE void round_shift_4x4(__m128i *in, int shift) {
+ __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[1] = _mm_add_epi32(in[1], rnding);
+ in[2] = _mm_add_epi32(in[2], rnding);
+ in[3] = _mm_add_epi32(in[3], rnding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+}
+
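+// Clamps each signed 16-bit lane of u to the pixel range [0, (1 << bd) - 1].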
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+
+ mask = _mm_cmpgt_epi16(u, max);
+ clamped = _mm_andnot_si128(mask, u);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ clamped = _mm_and_si128(clamped, mask);
+
+ return clamped;
+}
+
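+// Rounds the residual in `in`, applies the optional horizontal/vertical
+// flips, adds the prediction already present in `output`, clamps to the
+// bit depth, and stores the reconstructed 4x4 block.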
+static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ round_shift_4x4(in, shift);
+
+ v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
+
+ v0 = _mm_unpacklo_epi16(v0, zero);
+ v1 = _mm_unpacklo_epi16(v1, zero);
+ v2 = _mm_unpacklo_epi16(v2, zero);
+ v3 = _mm_unpacklo_epi16(v3, zero);
+
+ if (fliplr) {
+ in[0] = _mm_shuffle_epi32(in[0], 0x1B);
+ in[1] = _mm_shuffle_epi32(in[1], 0x1B);
+ in[2] = _mm_shuffle_epi32(in[2], 0x1B);
+ in[3] = _mm_shuffle_epi32(in[3], 0x1B);
+ }
+
+ if (flipud) {
+ u0 = _mm_add_epi32(in[3], v0);
+ u1 = _mm_add_epi32(in[2], v1);
+ u2 = _mm_add_epi32(in[1], v2);
+ u3 = _mm_add_epi32(in[0], v3);
+ } else {
+ u0 = _mm_add_epi32(in[0], v0);
+ u1 = _mm_add_epi32(in[1], v1);
+ u2 = _mm_add_epi32(in[2], v2);
+ u3 = _mm_add_epi32(in[3], v3);
+ }
+
+ v0 = _mm_packus_epi32(u0, u1);
+ v2 = _mm_packus_epi32(u2, u3);
+
+ u0 = highbd_clamp_epi16(v0, bd);
+ u2 = highbd_clamp_epi16(v2, bd);
+
+ v0 = _mm_unpacklo_epi64(u0, u0);
+ v1 = _mm_unpackhi_epi64(u0, u0);
+ v2 = _mm_unpacklo_epi64(u2, u2);
+ v3 = _mm_unpackhi_epi64(u2, u2);
+
+ _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
+ _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
+ _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
+ _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
+}
+
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ __m128i in[4];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &inv_txfm_2d_cfg_dct_dct_4;
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_4;
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case DCT_ADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_4;
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+// 8x8
+static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+ in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+ in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+ in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+ in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
+ in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
+ in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
+ in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
+ in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
+ in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
+ in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
+ in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
+ in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
+ in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
+ in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
+ in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
+}
+
+static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+  // Note:
+  // Even-indexed vectors: 0, 2, ..., 14
+  // Odd-indexed vectors: 1, 3, ..., 15
+  // One even vector plus one odd vector holds one row of 8 coefficients,
+  // so the 16 vectors cover all 8 rows of the 8x8 block.
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = in[0 * 2 + col];
+ u1 = in[4 * 2 + col];
+ u2 = in[2 * 2 + col];
+ u3 = in[6 * 2 + col];
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ v4 = _mm_add_epi32(u4, u5);
+ v5 = _mm_sub_epi32(u4, u5);
+ v6 = _mm_sub_epi32(u7, u6);
+ v7 = _mm_add_epi32(u6, u7);
+
+ // stage 4
+ u0 = _mm_add_epi32(v0, v3);
+ u1 = _mm_add_epi32(v1, v2);
+ u2 = _mm_sub_epi32(v1, v2);
+ u3 = _mm_sub_epi32(v0, v3);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5
+ out[0 * 2 + col] = _mm_add_epi32(u0, u7);
+ out[1 * 2 + col] = _mm_add_epi32(u1, u6);
+ out[2 * 2 + col] = _mm_add_epi32(u2, u5);
+ out[3 * 2 + col] = _mm_add_epi32(u3, u4);
+ out[4 * 2 + col] = _mm_sub_epi32(u3, u4);
+ out[5 * 2 + col] = _mm_sub_epi32(u2, u5);
+ out[6 * 2 + col] = _mm_sub_epi32(u1, u6);
+ out[7 * 2 + col] = _mm_sub_epi32(u0, u7);
+ }
+}
+
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+  // Note:
+  // Even-indexed vectors: 0, 2, ..., 14
+  // Odd-indexed vectors: 1, 3, ..., 15
+  // One even vector plus one odd vector holds one row of 8 coefficients,
+  // so the 16 vectors cover all 8 rows of the 8x8 block.
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1
+ u0 = in[2 * 0 + col];
+ u1 = _mm_sub_epi32(zero, in[2 * 7 + col]);
+ u2 = _mm_sub_epi32(zero, in[2 * 3 + col]);
+ u3 = in[2 * 4 + col];
+ u4 = _mm_sub_epi32(zero, in[2 * 1 + col]);
+ u5 = in[2 * 6 + col];
+ u6 = in[2 * 2 + col];
+ u7 = _mm_sub_epi32(zero, in[2 * 5 + col]);
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+
+ x = _mm_mullo_epi32(u2, cospi32);
+ y = _mm_mullo_epi32(u3, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ v4 = u4;
+ v5 = u5;
+
+ x = _mm_mullo_epi32(u6, cospi32);
+ y = _mm_mullo_epi32(u7, cospi32);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ v7 = _mm_sub_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 3
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+ u4 = _mm_add_epi32(v4, v6);
+ u5 = _mm_add_epi32(v5, v7);
+ u6 = _mm_sub_epi32(v4, v6);
+ u7 = _mm_sub_epi32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm_mullo_epi32(u4, cospi16);
+ y = _mm_mullo_epi32(u5, cospi48);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi48);
+ y = _mm_mullo_epi32(u5, cospim16);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospim48);
+ y = _mm_mullo_epi32(u7, cospi16);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi16);
+ y = _mm_mullo_epi32(u7, cospi48);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 5
+ u0 = _mm_add_epi32(v0, v4);
+ u1 = _mm_add_epi32(v1, v5);
+ u2 = _mm_add_epi32(v2, v6);
+ u3 = _mm_add_epi32(v3, v7);
+ u4 = _mm_sub_epi32(v0, v4);
+ u5 = _mm_sub_epi32(v1, v5);
+ u6 = _mm_sub_epi32(v2, v6);
+ u7 = _mm_sub_epi32(v3, v7);
+
+ // stage 6
+ x = _mm_mullo_epi32(u0, cospi4);
+ y = _mm_mullo_epi32(u1, cospi60);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ x = _mm_mullo_epi32(u0, cospi60);
+ y = _mm_mullo_epi32(u1, cospim4);
+ v1 = _mm_add_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi20);
+ y = _mm_mullo_epi32(u3, cospi44);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi44);
+ y = _mm_mullo_epi32(u3, cospim20);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ x = _mm_mullo_epi32(u4, cospi36);
+ y = _mm_mullo_epi32(u5, cospi28);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi28);
+ y = _mm_mullo_epi32(u5, cospim36);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospi52);
+ y = _mm_mullo_epi32(u7, cospi12);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi12);
+ y = _mm_mullo_epi32(u7, cospim52);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 7
+ out[2 * 0 + col] = v1;
+ out[2 * 1 + col] = v6;
+ out[2 * 2 + col] = v3;
+ out[2 * 3 + col] = v4;
+ out[2 * 4 + col] = v5;
+ out[2 * 5 + col] = v2;
+ out[2 * 6 + col] = v7;
+ out[2 * 7 + col] = v0;
+ }
+}
+
+static void round_shift_8x8(__m128i *in, int shift) {
+ round_shift_4x4(&in[0], shift);
+ round_shift_4x4(&in[4], shift);
+ round_shift_4x4(&in[8], shift);
+ round_shift_4x4(&in[12], shift);
+}
+
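+// Adds one 8-wide row of 32-bit residuals (res_lo/res_hi) to the 16-bit
+// prediction row, optionally mirroring the row for fliplr, and clamps the
+// result to the bit depth.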
+static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
+ int fliplr, int bd) {
+ __m128i x0, x1;
+ const __m128i zero = _mm_setzero_si128();
+
+ x0 = _mm_unpacklo_epi16(pred, zero);
+ x1 = _mm_unpackhi_epi16(pred, zero);
+
+ if (fliplr) {
+ res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
+ res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
+ x0 = _mm_add_epi32(res_hi, x0);
+ x1 = _mm_add_epi32(res_lo, x1);
+
+ } else {
+ x0 = _mm_add_epi32(res_lo, x0);
+ x1 = _mm_add_epi32(res_hi, x1);
+ }
+
+ x0 = _mm_packus_epi32(x0, x1);
+ return highbd_clamp_epi16(x0, bd);
+}
+
+static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+ round_shift_8x8(in, shift);
+
+ v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
+ v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
+ v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
+ v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
+ v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
+ v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
+ v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
+ v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));
+
+ if (flipud) {
+ u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
+ u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
+ u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
+ u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
+ u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
+ u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
+ u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
+ u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
+ } else {
+ u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
+ u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
+ u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
+ u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
+ u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
+ u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
+ u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
+ u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
+ }
+
+ _mm_store_si128((__m128i *)(output + 0 * stride), u0);
+ _mm_store_si128((__m128i *)(output + 1 * stride), u1);
+ _mm_store_si128((__m128i *)(output + 2 * stride), u2);
+ _mm_store_si128((__m128i *)(output + 3 * stride), u3);
+ _mm_store_si128((__m128i *)(output + 4 * stride), u4);
+ _mm_store_si128((__m128i *)(output + 5 * stride), u5);
+ _mm_store_si128((__m128i *)(output + 6 * stride), u6);
+ _mm_store_si128((__m128i *)(output + 7 * stride), u7);
+}
+
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ __m128i in[16], out[16];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &inv_txfm_2d_cfg_dct_dct_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case DCT_ADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+// 16x16
+static void load_buffer_16x16(const int32_t *coeff, __m128i *in) {
+ int i;
+ for (i = 0; i < 64; ++i) {
+ in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2)));
+ }
+}
+
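+// The 16x16 block is stored as 64 vectors, four per row. Copies the 8x8
+// quadrant whose top-left vector index is `col`: two adjacent vectors from
+// each of eight consecutive rows.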
+static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
+ int col) {
+ int i;
+ for (i = 0; i < 16; i += 2) {
+ in8x8[i] = in[col];
+ in8x8[i + 1] = in[col + 1];
+ col += 4;
+ }
+}
+
+static void swap_addr(uint16_t **output1, uint16_t **output2) {
+ uint16_t *tmp;
+ tmp = *output1;
+ *output1 = *output2;
+ *output2 = tmp;
+}
+
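+// Writes the 16x16 result as four 8x8 quadrants. Flips are handled by
+// swapping the quadrant base pointers here and then flipping within each
+// quadrant in write_buffer_8x8.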
+static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m128i in8x8[16];
+ uint16_t *leftUp = &output[0];
+ uint16_t *rightUp = &output[8];
+ uint16_t *leftDown = &output[8 * stride];
+ uint16_t *rightDown = &output[8 * stride + 8];
+
+ if (fliplr) {
+ swap_addr(&leftUp, &rightUp);
+ swap_addr(&leftDown, &rightDown);
+ }
+
+ if (flipud) {
+ swap_addr(&leftUp, &leftDown);
+ swap_addr(&rightUp, &rightDown);
+ }
+
+ // Left-up quarter
+ assign_8x8_input_from_16x16(in, in8x8, 0);
+ write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd);
+
+ // Right-up quarter
+ assign_8x8_input_from_16x16(in, in8x8, 2);
+ write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd);
+
+ // Left-down quarter
+ assign_8x8_input_from_16x16(in, in8x8, 32);
+ write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd);
+
+ // Right-down quarter
+ assign_8x8_input_from_16x16(in, in8x8, 34);
+ write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd);
+}
+
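+// 16-point inverse DCT. in[] holds the 16x16 block as 16 rows of four
+// 4-lane vectors; each loop iteration transforms one 4-lane column group.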
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < 4; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = in[0 * 4 + col];
+ u[1] = in[8 * 4 + col];
+ u[2] = in[4 * 4 + col];
+ u[3] = in[12 * 4 + col];
+ u[4] = in[2 * 4 + col];
+ u[5] = in[10 * 4 + col];
+ u[6] = in[6 * 4 + col];
+ u[7] = in[14 * 4 + col];
+ u[8] = in[1 * 4 + col];
+ u[9] = in[9 * 4 + col];
+ u[10] = in[5 * 4 + col];
+ u[11] = in[13 * 4 + col];
+ u[12] = in[3 * 4 + col];
+ u[13] = in[11 * 4 + col];
+ u[14] = in[7 * 4 + col];
+ u[15] = in[15 * 4 + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = half_btf_sse4_1(cospi60, u[8], cospim4, u[15], rnding, bit);
+ v[9] = half_btf_sse4_1(cospi28, u[9], cospim36, u[14], rnding, bit);
+ v[10] = half_btf_sse4_1(cospi44, u[10], cospim20, u[13], rnding, bit);
+ v[11] = half_btf_sse4_1(cospi12, u[11], cospim52, u[12], rnding, bit);
+ v[12] = half_btf_sse4_1(cospi52, u[11], cospi12, u[12], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi20, u[10], cospi44, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospi36, u[9], cospi28, u[14], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi4, u[8], cospi60, u[15], rnding, bit);
+
+ // stage 3
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ u[4] = half_btf_sse4_1(cospi56, v[4], cospim8, v[7], rnding, bit);
+ u[5] = half_btf_sse4_1(cospi24, v[5], cospim40, v[6], rnding, bit);
+ u[6] = half_btf_sse4_1(cospi40, v[5], cospi24, v[6], rnding, bit);
+ u[7] = half_btf_sse4_1(cospi8, v[4], cospi56, v[7], rnding, bit);
+ u[8] = _mm_add_epi32(v[8], v[9]);
+ u[9] = _mm_sub_epi32(v[8], v[9]);
+ u[10] = _mm_sub_epi32(v[11], v[10]);
+ u[11] = _mm_add_epi32(v[10], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[13]);
+ u[13] = _mm_sub_epi32(v[12], v[13]);
+ u[14] = _mm_sub_epi32(v[15], v[14]);
+ u[15] = _mm_add_epi32(v[14], v[15]);
+
+ // stage 4
+ x = _mm_mullo_epi32(u[0], cospi32);
+ y = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(x, y);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(x, y);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = half_btf_sse4_1(cospi48, u[2], cospim16, u[3], rnding, bit);
+ v[3] = half_btf_sse4_1(cospi16, u[2], cospi48, u[3], rnding, bit);
+ v[4] = _mm_add_epi32(u[4], u[5]);
+ v[5] = _mm_sub_epi32(u[4], u[5]);
+ v[6] = _mm_sub_epi32(u[7], u[6]);
+ v[7] = _mm_add_epi32(u[6], u[7]);
+ v[8] = u[8];
+ v[9] = half_btf_sse4_1(cospim16, u[9], cospi48, u[14], rnding, bit);
+ v[10] = half_btf_sse4_1(cospim48, u[10], cospim16, u[13], rnding, bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[13] = half_btf_sse4_1(cospim16, u[10], cospi48, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospi48, u[9], cospi16, u[14], rnding, bit);
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = _mm_add_epi32(v[0], v[3]);
+ u[1] = _mm_add_epi32(v[1], v[2]);
+ u[2] = _mm_sub_epi32(v[1], v[2]);
+ u[3] = _mm_sub_epi32(v[0], v[3]);
+ u[4] = v[4];
+
+ x = _mm_mullo_epi32(v[5], cospi32);
+ y = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm_add_epi32(v[8], v[11]);
+ u[9] = _mm_add_epi32(v[9], v[10]);
+ u[10] = _mm_sub_epi32(v[9], v[10]);
+ u[11] = _mm_sub_epi32(v[8], v[11]);
+ u[12] = _mm_sub_epi32(v[15], v[12]);
+ u[13] = _mm_sub_epi32(v[14], v[13]);
+ u[14] = _mm_add_epi32(v[13], v[14]);
+ u[15] = _mm_add_epi32(v[12], v[15]);
+
+ // stage 6
+ v[0] = _mm_add_epi32(u[0], u[7]);
+ v[1] = _mm_add_epi32(u[1], u[6]);
+ v[2] = _mm_add_epi32(u[2], u[5]);
+ v[3] = _mm_add_epi32(u[3], u[4]);
+ v[4] = _mm_sub_epi32(u[3], u[4]);
+ v[5] = _mm_sub_epi32(u[2], u[5]);
+ v[6] = _mm_sub_epi32(u[1], u[6]);
+ v[7] = _mm_sub_epi32(u[0], u[7]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_sub_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_add_epi32(x, y);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_add_epi32(x, y);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 7
+ out[0 * 4 + col] = _mm_add_epi32(v[0], v[15]);
+ out[1 * 4 + col] = _mm_add_epi32(v[1], v[14]);
+ out[2 * 4 + col] = _mm_add_epi32(v[2], v[13]);
+ out[3 * 4 + col] = _mm_add_epi32(v[3], v[12]);
+ out[4 * 4 + col] = _mm_add_epi32(v[4], v[11]);
+ out[5 * 4 + col] = _mm_add_epi32(v[5], v[10]);
+ out[6 * 4 + col] = _mm_add_epi32(v[6], v[9]);
+ out[7 * 4 + col] = _mm_add_epi32(v[7], v[8]);
+ out[8 * 4 + col] = _mm_sub_epi32(v[7], v[8]);
+ out[9 * 4 + col] = _mm_sub_epi32(v[6], v[9]);
+ out[10 * 4 + col] = _mm_sub_epi32(v[5], v[10]);
+ out[11 * 4 + col] = _mm_sub_epi32(v[4], v[11]);
+ out[12 * 4 + col] = _mm_sub_epi32(v[3], v[12]);
+ out[13 * 4 + col] = _mm_sub_epi32(v[2], v[13]);
+ out[14 * 4 + col] = _mm_sub_epi32(v[1], v[14]);
+ out[15 * 4 + col] = _mm_sub_epi32(v[0], v[15]);
+ }
+}
+
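+// 16-point inverse ADST. Stage 1 negates half of the inputs and stage 9
+// applies the final output permutation of the reference transform.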
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < 4; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = in[0 * 4 + col];
+ u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
+ u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
+ u[3] = in[8 * 4 + col];
+ u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
+ u[5] = in[12 * 4 + col];
+ u[6] = in[4 * 4 + col];
+ u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
+ u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
+ u[9] = in[14 * 4 + col];
+ u[10] = in[6 * 4 + col];
+ u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
+ u[12] = in[2 * 4 + col];
+ u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
+ u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
+ u[15] = in[10 * 4 + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm_mullo_epi32(u[2], cospi32);
+ y = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(x, y);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(x, y);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm_mullo_epi32(u[6], cospi32);
+ y = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(x, y);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(x, y);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(x, y);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(x, y);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm_mullo_epi32(u[14], cospi32);
+ y = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(x, y);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(x, y);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = half_btf_sse4_1(cospi16, u[4], cospi48, u[5], rnding, bit);
+ v[5] = half_btf_sse4_1(cospi48, u[4], cospim16, u[5], rnding, bit);
+ v[6] = half_btf_sse4_1(cospim48, u[6], cospi16, u[7], rnding, bit);
+ v[7] = half_btf_sse4_1(cospi16, u[6], cospi48, u[7], rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = half_btf_sse4_1(cospi16, u[12], cospi48, u[13], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi48, u[12], cospim16, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospim48, u[14], cospi16, u[15], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi16, u[14], cospi48, u[15], rnding, bit);
+
+ // stage 5
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = half_btf_sse4_1(cospi8, u[8], cospi56, u[9], rnding, bit);
+ v[9] = half_btf_sse4_1(cospi56, u[8], cospim8, u[9], rnding, bit);
+ v[10] = half_btf_sse4_1(cospi40, u[10], cospi24, u[11], rnding, bit);
+ v[11] = half_btf_sse4_1(cospi24, u[10], cospim40, u[11], rnding, bit);
+ v[12] = half_btf_sse4_1(cospim56, u[12], cospi8, u[13], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi8, u[12], cospi56, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospim24, u[14], cospi40, u[15], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi40, u[14], cospi24, u[15], rnding, bit);
+
+ // stage 7
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ // stage 8
+ v[0] = half_btf_sse4_1(cospi2, u[0], cospi62, u[1], rnding, bit);
+ v[1] = half_btf_sse4_1(cospi62, u[0], cospim2, u[1], rnding, bit);
+ v[2] = half_btf_sse4_1(cospi10, u[2], cospi54, u[3], rnding, bit);
+ v[3] = half_btf_sse4_1(cospi54, u[2], cospim10, u[3], rnding, bit);
+ v[4] = half_btf_sse4_1(cospi18, u[4], cospi46, u[5], rnding, bit);
+ v[5] = half_btf_sse4_1(cospi46, u[4], cospim18, u[5], rnding, bit);
+ v[6] = half_btf_sse4_1(cospi26, u[6], cospi38, u[7], rnding, bit);
+ v[7] = half_btf_sse4_1(cospi38, u[6], cospim26, u[7], rnding, bit);
+ v[8] = half_btf_sse4_1(cospi34, u[8], cospi30, u[9], rnding, bit);
+ v[9] = half_btf_sse4_1(cospi30, u[8], cospim34, u[9], rnding, bit);
+ v[10] = half_btf_sse4_1(cospi42, u[10], cospi22, u[11], rnding, bit);
+ v[11] = half_btf_sse4_1(cospi22, u[10], cospim42, u[11], rnding, bit);
+ v[12] = half_btf_sse4_1(cospi50, u[12], cospi14, u[13], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi14, u[12], cospim50, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospi58, u[14], cospi6, u[15], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi6, u[14], cospim58, u[15], rnding, bit);
+
+ // stage 9
+ out[0 * 4 + col] = v[1];
+ out[1 * 4 + col] = v[14];
+ out[2 * 4 + col] = v[3];
+ out[3 * 4 + col] = v[12];
+ out[4 * 4 + col] = v[5];
+ out[5 * 4 + col] = v[10];
+ out[6 * 4 + col] = v[7];
+ out[7 * 4 + col] = v[8];
+ out[8 * 4 + col] = v[9];
+ out[9 * 4 + col] = v[6];
+ out[10 * 4 + col] = v[11];
+ out[11 * 4 + col] = v[4];
+ out[12 * 4 + col] = v[13];
+ out[13 * 4 + col] = v[2];
+ out[14 * 4 + col] = v[15];
+ out[15 * 4 + col] = v[0];
+ }
+}
+
+static void round_shift_16x16(__m128i *in, int shift) {
+ round_shift_8x8(&in[0], shift);
+ round_shift_8x8(&in[16], shift);
+ round_shift_8x8(&in[32], shift);
+ round_shift_8x8(&in[48], shift);
+}
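+
+/* Each round_shift_8x8() call is assumed to apply, per 32-bit lane, the
+ * usual symmetric rounding shift; the callers below pass -cfg->shift[0],
+ * a positive amount, since the config stores the intermediate shift as a
+ * negative value. A one-lane C sketch:
+ *
+ *   static int32_t round_shift_scalar(int32_t x, int shift) {
+ *     return (x + (1 << (shift - 1))) >> shift;
+ *   }
+ */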
+
+void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ __m128i in[64], out[64];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &inv_txfm_2d_cfg_dct_dct_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case DCT_ADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+#endif
+ default: assert(0);
+ }
+}
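+
+/* All cases above share one 2-D pipeline and differ only in the 1-D
+ * kernel chosen per pass and the flip flags handed to the writer; a
+ * condensed sketch, where row_txfm/col_txfm stand for the selected
+ * idct16x16_sse4_1/iadst16x16_sse4_1 pair:
+ *
+ *   load_buffer_16x16(coeff, in);
+ *   transpose_16x16(in, out);
+ *   row_txfm(out, in, cfg->cos_bit_row[2]);
+ *   round_shift_16x16(in, -cfg->shift[0]);    // intermediate rounding
+ *   transpose_16x16(in, out);
+ *   col_txfm(out, in, cfg->cos_bit_col[2]);
+ *   write_buffer_16x16(in, output, stride, fliplr, flipud,
+ *                      -cfg->shift[1], bd);   // final shift, clamp to bd
+ */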
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
new file mode 100644
index 000000000..bc96defe3
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _HIGHBD_TXFM_UTILITY_SSE4_H
+#define _HIGHBD_TXFM_UTILITY_SSE4_H
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+ do { \
+ __m128i u0, u1, u2, u3; \
+ u0 = _mm_unpacklo_epi32(x0, x1); \
+ u1 = _mm_unpackhi_epi32(x0, x1); \
+ u2 = _mm_unpacklo_epi32(x2, x3); \
+ u3 = _mm_unpackhi_epi32(x2, x3); \
+ y0 = _mm_unpacklo_epi64(u0, u2); \
+ y1 = _mm_unpackhi_epi64(u0, u2); \
+ y2 = _mm_unpacklo_epi64(u1, u3); \
+ y3 = _mm_unpackhi_epi64(u1, u3); \
+ } while (0)
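+
+/* TRANSPOSE_4X4 transposes a 4x4 tile of 32-bit lanes in two unpack
+ * rounds; with input rows {a0 a1 a2 a3} ... {d0 d1 d2 d3}:
+ *
+ *   u0 = a0 b0 a1 b1    u2 = c0 d0 c1 d1        y0 = a0 b0 c0 d0
+ *   u1 = a2 b2 a3 b3    u3 = c2 d2 c3 d3   ->   y1 = a1 b1 c1 d1
+ *                                               y2 = a2 b2 c2 d2
+ *                                               y3 = a3 b3 c3 d3
+ */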
+
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
+ TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+ TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+ TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+ TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+ out[15]);
+}
+
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
+ // Upper left 8x8
+ TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
+ TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
+ out[28]);
+ TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+ out[13]);
+ TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+ out[29]);
+
+ // Upper right 8x8
+ TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+ out[44]);
+ TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+ out[60]);
+ TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+ out[45]);
+ TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+ out[61]);
+
+ // Lower left 8x8
+ TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+ out[14]);
+ TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+ out[30]);
+ TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+ out[15]);
+ TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+ out[31]);
+ // Lower right 8x8
+ TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+ out[46]);
+ TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+ out[62]);
+ TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+ out[47]);
+ TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+ out[63]);
+}
+
+// Note:
+// rounding = 1 << (bit - 1)
+static INLINE __m128i half_btf_sse4_1(__m128i w0, __m128i n0, __m128i w1,
+ __m128i n1, __m128i rounding, int bit) {
+ __m128i x, y;
+
+ x = _mm_mullo_epi32(w0, n0);
+ y = _mm_mullo_epi32(w1, n1);
+ x = _mm_add_epi32(x, y);
+ x = _mm_add_epi32(x, rounding);
+ x = _mm_srai_epi32(x, bit);
+ return x;
+}
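+
+/* Per 32-bit lane, half_btf_sse4_1() evaluates the scalar half butterfly
+ * (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit, with the rounding term
+ * precomputed in 'rounding'. A one-lane C sketch (like _mm_mullo_epi32,
+ * it keeps only the low 32 bits of each product):
+ *
+ *   static int32_t half_btf_scalar(int32_t w0, int32_t n0, int32_t w1,
+ *                                  int32_t n1, int bit) {
+ *     return (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit;
+ *   }
+ */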
+
+#endif // _HIGHBD_TXFM_UTILITY_SSE4_H
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
new file mode 100644
index 000000000..c25db88b7
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+
+static const __m128i *const filter = (const __m128i *const)warped_filter;
+
+/* SSSE3 version of the rotzoom/affine warp filter */
+void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
+ int height, int stride, uint16_t *pred,
+ int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int bd, int ref_frm,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+#if HORSHEAR_REDUCE_PREC_BITS >= 5
+ __m128i tmp[15];
+#else
+#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
+#endif
+ int i, j, k;
+
+  /* Note: For this code to work, the left/right frame borders need to be
+     extended by at least 13 pixels each. By the time we get here, other
+     code will have set up this border, but the explicit check below is
+     kept (commented out) so that it can be re-enabled for debugging.
+  */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ // (x, y) coordinates of the center of this block in the destination
+ // image
+ int32_t dst_x = p_col + j + 4;
+ int32_t dst_y = p_row + i + 4;
+
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Horizontal filter
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ } else if (ix4 >= width + 6) {
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ } else {
+ int sx = sx4 + alpha * (-4) + beta * k +
+ // Include rounding and offset here
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ // Load source pixels
+ __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ // Filter even-index pixels
+ __m128i tmp_0 = filter[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_2 = filter[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_4 = filter[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_6 = filter[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS];
+
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
+ __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
+ __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
+ __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
+ __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
+ __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
+ __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
+ __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
+ __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ __m128i round_const =
+ _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+
+ // Calculate filtered results
+ __m128i res_0 = _mm_madd_epi16(src, coeff_0);
+ __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
+ __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
+ __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
+
+ // Filter odd-index pixels
+ __m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_3 = filter[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_5 = filter[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_7 = filter[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS];
+
+ __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
+ __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
+ __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
+ __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
+
+ // Combine results into one register.
+ // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
+ // as this order helps with the vertical filter.
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k +
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ __m128i *src = tmp + (k + 4);
+ __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ // Filter even-index pixels
+ __m128i tmp_0 = filter[(sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_2 = filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_4 = filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_6 = filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS];
+
+ __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ __m128i tmp_1 = filter[(sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_3 = filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_5 = filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_7 = filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS];
+
+ __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+      // Round and pack into 16 bits
+ __m128i round_const =
+ _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+
+ __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+ __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ // Clamp res_16bit to the range [0, 2^bd - 1]
+ __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
+ __m128i zero = _mm_setzero_si128();
+ res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
+
+ // Store, blending with 'pred' if needed
+ __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p));
+ _mm_storel_epi64(p, res_16bit);
+ } else {
+ if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p));
+ _mm_storeu_si128(p, res_16bit);
+ }
+ }
+ }
+ }
+}
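+
+/* The per-block source position above is fixed-point affine warping with
+ * WARPEDMODEL_PREC_BITS fractional bits in mat[]; the non-subsampled
+ * path reduces to:
+ *
+ *   int32_t x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];
+ *   int32_t y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];
+ *   int ix4 = x4 >> WARPEDMODEL_PREC_BITS;              // integer part
+ *   int sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);  // fraction
+ *
+ * and the fraction then selects the per-column filter kernels through
+ * (sx + k * alpha) >> WARPEDDIFF_PREC_BITS.
+ */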
diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
new file mode 100644
index 000000000..efc8d1e24
--- /dev/null
+++ b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // avx2
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
+#if CONFIG_HIGHBITDEPTH
+ *in = _mm256_setr_epi16(
+ (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+ (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+ (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+ (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+ (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+ (int16_t)coeff[15]);
+#else
+ *in = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
+
+static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
+ int i = 0;
+ while (i < 16) {
+ load_coeff(coeff + (i << 4), &in[i]);
+ i += 1;
+ }
+}
+
+static void recon_and_store(const __m256i *res, uint8_t *output) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x = _mm_loadu_si128((__m128i const *)output);
+ __m128i p0 = _mm_unpacklo_epi8(x, zero);
+ __m128i p1 = _mm_unpackhi_epi8(x, zero);
+
+ p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
+ p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
+ x = _mm_packus_epi16(p0, p1);
+ _mm_storeu_si128((__m128i *)output, x);
+}
+
+#define IDCT_ROUNDING_POS (6)
+
+static void write_buffer_16x16(__m256i *in, const int stride, uint8_t *output) {
+ const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
+ int i = 0;
+
+ while (i < 16) {
+ in[i] = _mm256_add_epi16(in[i], rounding);
+ in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
+ recon_and_store(&in[i], output + i * stride);
+ i += 1;
+ }
+}
+
+static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
+ const __m256i *c0, const __m256i *c1,
+ __m256i *b0, __m256i *b1) {
+ __m256i x0, x1;
+ x0 = _mm256_unpacklo_epi16(*a0, *a1);
+ x1 = _mm256_unpackhi_epi16(*a0, *a1);
+ *b0 = butter_fly(x0, x1, *c0);
+ *b1 = butter_fly(x0, x1, *c1);
+}
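+
+/* unpack_butter_fly() interleaves two 16-lane rows and hands the pairs
+ * to butter_fly() from txfm_common_avx2.h, which is assumed to perform
+ * the usual _mm256_madd_epi16 multiply-accumulate with DCT_CONST
+ * rounding; per output lane, with (w0, w1) the pair packed into *c0:
+ *
+ *   b0 = ROUND_POWER_OF_TWO(a0 * w0 + a1 * w1, DCT_CONST_BITS);
+ *
+ * and likewise b1 with the pair in *c1.
+ */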
+
+static void idct16_avx2(__m256i *in) {
+ const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
+ const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64);
+ const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64);
+ const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m256i t0, t1, t2, t3, t4, t5, t6, t7;
+
+ // stage 1, (0-7)
+ u0 = in[0];
+ u1 = in[8];
+ u2 = in[4];
+ u3 = in[12];
+ u4 = in[2];
+ u5 = in[10];
+ u6 = in[6];
+ u7 = in[14];
+
+ // stage 2, (0-7)
+ // stage 3, (0-7)
+ t0 = u0;
+ t1 = u1;
+ t2 = u2;
+ t3 = u3;
+ unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7);
+ unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6);
+
+ // stage 4, (0-7)
+ unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1);
+ unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3);
+ u4 = _mm256_add_epi16(t4, t5);
+ u5 = _mm256_sub_epi16(t4, t5);
+ u6 = _mm256_sub_epi16(t7, t6);
+ u7 = _mm256_add_epi16(t7, t6);
+
+ // stage 5, (0-7)
+ t0 = _mm256_add_epi16(u0, u3);
+ t1 = _mm256_add_epi16(u1, u2);
+ t2 = _mm256_sub_epi16(u1, u2);
+ t3 = _mm256_sub_epi16(u0, u3);
+ t4 = u4;
+ t7 = u7;
+ unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
+
+ // stage 6, (0-7)
+ u0 = _mm256_add_epi16(t0, t7);
+ u1 = _mm256_add_epi16(t1, t6);
+ u2 = _mm256_add_epi16(t2, t5);
+ u3 = _mm256_add_epi16(t3, t4);
+ u4 = _mm256_sub_epi16(t3, t4);
+ u5 = _mm256_sub_epi16(t2, t5);
+ u6 = _mm256_sub_epi16(t1, t6);
+ u7 = _mm256_sub_epi16(t0, t7);
+
+ // stage 1, (8-15)
+ v0 = in[1];
+ v1 = in[9];
+ v2 = in[5];
+ v3 = in[13];
+ v4 = in[3];
+ v5 = in[11];
+ v6 = in[7];
+ v7 = in[15];
+
+ // stage 2, (8-15)
+ unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7);
+ unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6);
+ unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5);
+ unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4);
+
+ // stage 3, (8-15)
+ v0 = _mm256_add_epi16(t0, t1);
+ v1 = _mm256_sub_epi16(t0, t1);
+ v2 = _mm256_sub_epi16(t3, t2);
+ v3 = _mm256_add_epi16(t2, t3);
+ v4 = _mm256_add_epi16(t4, t5);
+ v5 = _mm256_sub_epi16(t4, t5);
+ v6 = _mm256_sub_epi16(t7, t6);
+ v7 = _mm256_add_epi16(t6, t7);
+
+ // stage 4, (8-15)
+ t0 = v0;
+ t7 = v7;
+ t3 = v3;
+ t4 = v4;
+ unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
+ unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
+
+ // stage 5, (8-15)
+ v0 = _mm256_add_epi16(t0, t3);
+ v1 = _mm256_add_epi16(t1, t2);
+ v2 = _mm256_sub_epi16(t1, t2);
+ v3 = _mm256_sub_epi16(t0, t3);
+ v4 = _mm256_sub_epi16(t7, t4);
+ v5 = _mm256_sub_epi16(t6, t5);
+ v6 = _mm256_add_epi16(t6, t5);
+ v7 = _mm256_add_epi16(t7, t4);
+
+ // stage 6, (8-15)
+ t0 = v0;
+ t1 = v1;
+ t6 = v6;
+ t7 = v7;
+ unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5);
+ unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4);
+
+ // stage 7
+ in[0] = _mm256_add_epi16(u0, t7);
+ in[1] = _mm256_add_epi16(u1, t6);
+ in[2] = _mm256_add_epi16(u2, t5);
+ in[3] = _mm256_add_epi16(u3, t4);
+ in[4] = _mm256_add_epi16(u4, t3);
+ in[5] = _mm256_add_epi16(u5, t2);
+ in[6] = _mm256_add_epi16(u6, t1);
+ in[7] = _mm256_add_epi16(u7, t0);
+ in[8] = _mm256_sub_epi16(u7, t0);
+ in[9] = _mm256_sub_epi16(u6, t1);
+ in[10] = _mm256_sub_epi16(u5, t2);
+ in[11] = _mm256_sub_epi16(u4, t3);
+ in[12] = _mm256_sub_epi16(u3, t4);
+ in[13] = _mm256_sub_epi16(u2, t5);
+ in[14] = _mm256_sub_epi16(u1, t6);
+ in[15] = _mm256_sub_epi16(u0, t7);
+}
+
+static void idct16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ idct16_avx2(in);
+}
+
+static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1,
+ const __m256i *c0, const __m256i *c1,
+ __m256i *b) {
+ __m256i x0, x1;
+ x0 = _mm256_unpacklo_epi16(*a0, *a1);
+ x1 = _mm256_unpackhi_epi16(*a0, *a1);
+ b[0] = _mm256_madd_epi16(x0, *c0);
+ b[1] = _mm256_madd_epi16(x1, *c0);
+ b[2] = _mm256_madd_epi16(x0, *c1);
+ b[3] = _mm256_madd_epi16(x1, *c1);
+}
+
+static INLINE void group_rounding(__m256i *a, int num) {
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ int i;
+ for (i = 0; i < num; ++i) {
+ a[i] = _mm256_add_epi32(a[i], dct_rounding);
+ a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS);
+ }
+}
+
+static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+ __m256i x[4];
+ x[0] = _mm256_add_epi32(a[0], b[0]);
+ x[1] = _mm256_add_epi32(a[1], b[1]);
+ x[2] = _mm256_add_epi32(a[2], b[2]);
+ x[3] = _mm256_add_epi32(a[3], b[3]);
+
+ group_rounding(x, 4);
+
+ out[0] = _mm256_packs_epi32(x[0], x[1]);
+ out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+ __m256i x[4];
+ x[0] = _mm256_sub_epi32(a[0], b[0]);
+ x[1] = _mm256_sub_epi32(a[1], b[1]);
+ x[2] = _mm256_sub_epi32(a[2], b[2]);
+ x[3] = _mm256_sub_epi32(a[3], b[3]);
+
+ group_rounding(x, 4);
+
+ out[0] = _mm256_packs_epi32(x[0], x[1]);
+ out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void butterfly_rnd(__m256i *a, __m256i *out) {
+ group_rounding(a, 4);
+ out[0] = _mm256_packs_epi32(a[0], a[1]);
+ out[1] = _mm256_packs_epi32(a[2], a[3]);
+}
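+
+/* butterfly_32b() leaves its four _mm256_madd_epi16 results at 32-bit
+ * precision; add_rnd()/sub_rnd() then combine two such results at full
+ * precision, round once, and pack back to 16 bits. Per lane:
+ *
+ *   out = (int16_t)ROUND_POWER_OF_TWO(a + b, DCT_CONST_BITS);  // add_rnd
+ *   out = (int16_t)ROUND_POWER_OF_TWO(a - b, DCT_CONST_BITS);  // sub_rnd
+ *
+ * Deferring the rounding until after the add/subtract keeps full
+ * precision through each butterfly stage.
+ */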
+
+static void iadst16_avx2(__m256i *in) {
+ const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
+ const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
+ const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
+ const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
+ const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
+ const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
+ const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
+ const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
+ const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i x[16], s[16];
+ __m256i u[4], v[4];
+
+ // stage 1
+ butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u);
+ butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v);
+ add_rnd(u, v, &x[0]);
+ sub_rnd(u, v, &x[8]);
+
+ butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u);
+ butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v);
+ add_rnd(u, v, &x[2]);
+ sub_rnd(u, v, &x[10]);
+
+ butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u);
+ butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v);
+ add_rnd(u, v, &x[4]);
+ sub_rnd(u, v, &x[12]);
+
+ butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u);
+ butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v);
+ add_rnd(u, v, &x[6]);
+ sub_rnd(u, v, &x[14]);
+
+ // stage 2
+ s[0] = _mm256_add_epi16(x[0], x[4]);
+ s[1] = _mm256_add_epi16(x[1], x[5]);
+ s[2] = _mm256_add_epi16(x[2], x[6]);
+ s[3] = _mm256_add_epi16(x[3], x[7]);
+ s[4] = _mm256_sub_epi16(x[0], x[4]);
+ s[5] = _mm256_sub_epi16(x[1], x[5]);
+ s[6] = _mm256_sub_epi16(x[2], x[6]);
+ s[7] = _mm256_sub_epi16(x[3], x[7]);
+ butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u);
+ butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v);
+ add_rnd(u, v, &s[8]);
+ sub_rnd(u, v, &s[12]);
+
+ butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u);
+ butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v);
+ add_rnd(u, v, &s[10]);
+ sub_rnd(u, v, &s[14]);
+
+ // stage 3
+ x[0] = _mm256_add_epi16(s[0], s[2]);
+ x[1] = _mm256_add_epi16(s[1], s[3]);
+ x[2] = _mm256_sub_epi16(s[0], s[2]);
+ x[3] = _mm256_sub_epi16(s[1], s[3]);
+
+ x[8] = _mm256_add_epi16(s[8], s[10]);
+ x[9] = _mm256_add_epi16(s[9], s[11]);
+ x[10] = _mm256_sub_epi16(s[8], s[10]);
+ x[11] = _mm256_sub_epi16(s[9], s[11]);
+
+ butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u);
+ butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v);
+ add_rnd(u, v, &x[4]);
+ sub_rnd(u, v, &x[6]);
+
+ butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u);
+ butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v);
+ add_rnd(u, v, &x[12]);
+ sub_rnd(u, v, &x[14]);
+
+ // stage 4
+ butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u);
+ butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v);
+ butterfly_rnd(u, &x[2]);
+ butterfly_rnd(v, &x[6]);
+
+ butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u);
+ butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v);
+ butterfly_rnd(u, &x[10]);
+ butterfly_rnd(v, &x[14]);
+
+ in[0] = x[0];
+ in[1] = _mm256_sub_epi16(zero, x[8]);
+ in[2] = x[12];
+ in[3] = _mm256_sub_epi16(zero, x[4]);
+ in[4] = x[6];
+ in[5] = x[14];
+ in[6] = x[10];
+ in[7] = x[2];
+ in[8] = x[3];
+ in[9] = x[11];
+ in[10] = x[15];
+ in[11] = x[7];
+ in[12] = x[5];
+ in[13] = _mm256_sub_epi16(zero, x[13]);
+ in[14] = x[9];
+ in[15] = _mm256_sub_epi16(zero, x[1]);
+}
+
+static void iadst16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ iadst16_avx2(in);
+}
+
+#if CONFIG_EXT_TX
+static void flip_row(__m256i *in, int rows) {
+ int i;
+ for (i = 0; i < rows; ++i) {
+ mm256_reverse_epi16(&in[i]);
+ }
+}
+
+static void flip_col(uint8_t **dest, int *stride, int rows) {
+ *dest = *dest + (rows - 1) * (*stride);
+ *stride = -*stride;
+}
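+
+/* The FLIPADST variants reuse the plain ADST kernels and mirror the
+ * result instead: flip_row() reverses the 16 lanes of every row in
+ * place, while flip_col() makes the writer walk bottom-up by
+ * retargeting the destination:
+ *
+ *   dest += (rows - 1) * stride;  // start at the last row
+ *   stride = -stride;             // then step upwards
+ */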
+
+static void iidtx16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ txfm_scaling16_avx2(Sqrt2, in);
+}
+#endif
+
+void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m256i in[16];
+
+ load_buffer_16x16(input, in);
+ switch (tx_type) {
+ case DCT_DCT:
+ idct16(in);
+ idct16(in);
+ break;
+ case ADST_DCT:
+ idct16(in);
+ iadst16(in);
+ break;
+ case DCT_ADST:
+ iadst16(in);
+ idct16(in);
+ break;
+ case ADST_ADST:
+ iadst16(in);
+ iadst16(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ idct16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case DCT_FLIPADST:
+ iadst16(in);
+ idct16(in);
+ flip_row(in, 16);
+ break;
+ case FLIPADST_FLIPADST:
+ iadst16(in);
+ iadst16(in);
+ flip_row(in, 16);
+ flip_col(&dest, &stride, 16);
+ break;
+ case ADST_FLIPADST:
+ iadst16(in);
+ iadst16(in);
+ flip_row(in, 16);
+ break;
+ case FLIPADST_ADST:
+ iadst16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case IDTX:
+ iidtx16(in);
+ iidtx16(in);
+ break;
+ case V_DCT:
+ iidtx16(in);
+ idct16(in);
+ break;
+ case H_DCT:
+ idct16(in);
+ iidtx16(in);
+ break;
+ case V_ADST:
+ iidtx16(in);
+ iadst16(in);
+ break;
+ case H_ADST:
+ iadst16(in);
+ iidtx16(in);
+ break;
+ case V_FLIPADST:
+ iidtx16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case H_FLIPADST:
+ iadst16(in);
+ iidtx16(in);
+ flip_row(in, 16);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_16x16(in, stride, dest);
+}
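+
+/* Since idct16()/iadst16()/iidtx16() each transpose before transforming,
+ * two back-to-back calls yield the full 2-D inverse (one per dimension),
+ * and write_buffer_16x16() applies the single final rounding (>> 6)
+ * during reconstruction. The ADST_DCT path, for example, reduces to:
+ *
+ *   load_buffer_16x16(input, in);
+ *   idct16(in);
+ *   iadst16(in);
+ *   write_buffer_16x16(in, stride, dest);
+ */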
diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c
new file mode 100644
index 000000000..522e8988c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/idct_intrin_sse2.c
@@ -0,0 +1,1402 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "aom_dsp/x86/inv_txfm_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
+
+#if CONFIG_EXT_TX
+static INLINE void fliplr_4x4(__m128i in[2]) {
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
+}
+
+static INLINE void fliplr_8x8(__m128i in[8]) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+}
+
+static INLINE void fliplr_16x8(__m128i in[16]) {
+ fliplr_8x8(&in[0]);
+ fliplr_8x8(&in[8]);
+}
+
+#define FLIPLR_16x16(in0, in1) \
+ do { \
+ __m128i *tmp; \
+ fliplr_16x8(in0); \
+ fliplr_16x8(in1); \
+ tmp = (in0); \
+ (in0) = (in1); \
+ (in1) = tmp; \
+ } while (0)
+
+#define FLIPUD_PTR(dest, stride, size) \
+ do { \
+ (dest) = (dest) + ((size)-1) * (stride); \
+ (stride) = -(stride); \
+ } while (0)
+#endif
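+
+/* fliplr_4x4() uses the shuffle immediate 0x1b (binary 00 01 10 11),
+ * which reorders four 16-bit lanes as 3,2,1,0; applied to both halves
+ * of a register it mirrors the two 4-sample rows held there. FLIPUD_PTR
+ * flips vertically at zero cost by retargeting the destination pointer:
+ *
+ *   dest += (size - 1) * stride;  // start at the bottom row
+ *   stride = -stride;             // write upwards
+ */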
+
+void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[2];
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i eight = _mm_set1_epi16(8);
+
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 8);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ aom_idct4_sse2(in);
+ aom_idct4_sse2(in);
+ break;
+ case ADST_DCT:
+ aom_idct4_sse2(in);
+ aom_iadst4_sse2(in);
+ break;
+ case DCT_ADST:
+ aom_iadst4_sse2(in);
+ aom_idct4_sse2(in);
+ break;
+ case ADST_ADST:
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ aom_idct4_sse2(in);
+ aom_iadst4_sse2(in);
+ FLIPUD_PTR(dest, stride, 4);
+ break;
+ case DCT_FLIPADST:
+ aom_iadst4_sse2(in);
+ aom_idct4_sse2(in);
+ fliplr_4x4(in);
+ break;
+ case FLIPADST_FLIPADST:
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ FLIPUD_PTR(dest, stride, 4);
+ fliplr_4x4(in);
+ break;
+ case ADST_FLIPADST:
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ fliplr_4x4(in);
+ break;
+ case FLIPADST_ADST:
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ FLIPUD_PTR(dest, stride, 4);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+
+ // Final round and shift
+ in[0] = _mm_add_epi16(in[0], eight);
+ in[1] = _mm_add_epi16(in[1], eight);
+
+ in[0] = _mm_srai_epi16(in[0], 4);
+ in[1] = _mm_srai_epi16(in[1], 4);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+ __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ d0 = _mm_unpacklo_epi32(d0, d1);
+ d2 = _mm_unpacklo_epi32(d2, d3);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, in[0]);
+ d2 = _mm_add_epi16(d2, in[1]);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store result[0]
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store result[1]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store result[2]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ // store result[3]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ }
+}
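+
+/* The tail above computes residual = ROUND_POWER_OF_TWO(x, 4) per lane
+ * (add 8, shift right by 4); reconstruction then widens dest to 16 bits,
+ * adds the residual, and clips back to 8 bits via _mm_packus_epi16. In
+ * scalar terms:
+ *
+ *   dest[i] = clip_pixel(dest[i] + ROUND_POWER_OF_TWO(res[i], 4));
+ */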
+
+void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[8];
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+
+ // load input data
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 8 * 1);
+ in[2] = load_input_data(input + 8 * 2);
+ in[3] = load_input_data(input + 8 * 3);
+ in[4] = load_input_data(input + 8 * 4);
+ in[5] = load_input_data(input + 8 * 5);
+ in[6] = load_input_data(input + 8 * 6);
+ in[7] = load_input_data(input + 8 * 7);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ aom_idct8_sse2(in);
+ aom_idct8_sse2(in);
+ break;
+ case ADST_DCT:
+ aom_idct8_sse2(in);
+ aom_iadst8_sse2(in);
+ break;
+ case DCT_ADST:
+ aom_iadst8_sse2(in);
+ aom_idct8_sse2(in);
+ break;
+ case ADST_ADST:
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ aom_idct8_sse2(in);
+ aom_iadst8_sse2(in);
+ FLIPUD_PTR(dest, stride, 8);
+ break;
+ case DCT_FLIPADST:
+ aom_iadst8_sse2(in);
+ aom_idct8_sse2(in);
+ fliplr_8x8(in);
+ break;
+ case FLIPADST_FLIPADST:
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ FLIPUD_PTR(dest, stride, 8);
+ fliplr_8x8(in);
+ break;
+ case ADST_FLIPADST:
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ fliplr_8x8(in);
+ break;
+ case FLIPADST_ADST:
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ FLIPUD_PTR(dest, stride, 8);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 5);
+ in[1] = _mm_srai_epi16(in[1], 5);
+ in[2] = _mm_srai_epi16(in[2], 5);
+ in[3] = _mm_srai_epi16(in[3], 5);
+ in[4] = _mm_srai_epi16(in[4], 5);
+ in[5] = _mm_srai_epi16(in[5], 5);
+ in[6] = _mm_srai_epi16(in[6], 5);
+ in[7] = _mm_srai_epi16(in[7], 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
+}
+
+#if CONFIG_EXT_TX
+static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ idtx16_8col(in0);
+ idtx16_8col(in1);
+}
+#endif // CONFIG_EXT_TX
+
+void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in[32];
+ __m128i *in0 = &in[0];
+ __m128i *in1 = &in[16];
+
+ load_buffer_8x16(input, in0);
+ input += 8;
+ load_buffer_8x16(input, in1);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case ADST_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+ case DCT_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case ADST_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case DCT_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case FLIPADST_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case ADST_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case FLIPADST_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case IDTX:
+ iidtx16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_DCT:
+ iidtx16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case H_DCT:
+ aom_idct16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_ADST:
+ iidtx16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+ case H_ADST:
+ aom_iadst16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_FLIPADST:
+ iidtx16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case H_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+
+ write_buffer_8x16(dest, in0, stride);
+ dest += 8;
+ write_buffer_8x16(dest, in1, stride);
+}
+
+#if CONFIG_EXT_TX
+static void iidtx8_sse2(__m128i *in) {
+ in[0] = _mm_slli_epi16(in[0], 1);
+ in[1] = _mm_slli_epi16(in[1], 1);
+ in[2] = _mm_slli_epi16(in[2], 1);
+ in[3] = _mm_slli_epi16(in[3], 1);
+ in[4] = _mm_slli_epi16(in[4], 1);
+ in[5] = _mm_slli_epi16(in[5], 1);
+ in[6] = _mm_slli_epi16(in[6], 1);
+ in[7] = _mm_slli_epi16(in[7], 1);
+}
+
+static INLINE void iidtx4_sse2(__m128i *in) {
+ const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
+
+ const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
+ const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
+ const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
+ const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
+
+ const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
+
+ in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
+ in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
+}
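+
+/* The identity "transforms" only rescale, keeping the 2-D gain in step
+ * with the DCT/ADST paths: iidtx8_sse2() doubles each coefficient, while
+ * iidtx4_sse2() multiplies by Sqrt2 in mullo/mulhi halves, rounds, and
+ * packs back. Per lane, assuming Sqrt2 holds sqrt(2) in DCT_CONST_BITS
+ * fixed point:
+ *
+ *   x = (int16_t)ROUND_POWER_OF_TWO(x * Sqrt2, DCT_CONST_BITS);
+ */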
+
+// Reverse each row of an 8x8 array (in-place left/right flip)
+static INLINE void flip_buffer_lr_8x8(__m128i *in) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+}
+#endif // CONFIG_EXT_TX
+
+void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in[16];
+
+ in[0] = load_input_data(input + 0 * 8);
+ in[1] = load_input_data(input + 1 * 8);
+ in[2] = load_input_data(input + 2 * 8);
+ in[3] = load_input_data(input + 3 * 8);
+ in[4] = load_input_data(input + 4 * 8);
+ in[5] = load_input_data(input + 5 * 8);
+ in[6] = load_input_data(input + 6 * 8);
+ in[7] = load_input_data(input + 7 * 8);
+
+ in[8] = load_input_data(input + 8 * 8);
+ in[9] = load_input_data(input + 9 * 8);
+ in[10] = load_input_data(input + 10 * 8);
+ in[11] = load_input_data(input + 11 * 8);
+ in[12] = load_input_data(input + 12 * 8);
+ in[13] = load_input_data(input + 13 * 8);
+ in[14] = load_input_data(input + 14 * 8);
+ in[15] = load_input_data(input + 15 * 8);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct8_sse2(in);
+ array_transpose_8x8(in, in);
+ aom_idct8_sse2(in + 8);
+ array_transpose_8x8(in + 8, in + 8);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ aom_iadst8_sse2(in);
+ array_transpose_8x8(in, in);
+ aom_iadst8_sse2(in + 8);
+ array_transpose_8x8(in + 8, in + 8);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+ iidtx8_sse2(in);
+ iidtx8_sse2(in + 8);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ scale_sqrt2_8x8(in);
+ scale_sqrt2_8x8(in + 8);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ idct16_8col(in);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ iadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX: idtx16_8col(in); break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case H_DCT:
+#endif
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ write_buffer_8x16(dest, in, stride);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x16(dest, in, stride);
+ break;
+ case FLIPADST_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x16(dest + stride * 15, in, -stride);
+ break;
+#endif
+ default: assert(0); break;
+ }
+}
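+
+/* The 8x16 transform is rectangular, so both 8-wide halves are rescaled
+ * by sqrt(2) between the row and column passes to keep the overall 2-D
+ * gain consistent with the square transforms; scale_sqrt2_8x8() is
+ * assumed to apply, per coefficient:
+ *
+ *   x = ROUND_POWER_OF_TWO(x * Sqrt2, DCT_CONST_BITS);
+ */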
+
+static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in,
+ int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
+}
+
+void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in[16];
+
+ // Transpose 16x8 input into in[]
+ in[0] = load_input_data(input + 0 * 16);
+ in[1] = load_input_data(input + 1 * 16);
+ in[2] = load_input_data(input + 2 * 16);
+ in[3] = load_input_data(input + 3 * 16);
+ in[4] = load_input_data(input + 4 * 16);
+ in[5] = load_input_data(input + 5 * 16);
+ in[6] = load_input_data(input + 6 * 16);
+ in[7] = load_input_data(input + 7 * 16);
+ array_transpose_8x8(in, in);
+
+ in[8] = load_input_data(input + 8 + 0 * 16);
+ in[9] = load_input_data(input + 8 + 1 * 16);
+ in[10] = load_input_data(input + 8 + 2 * 16);
+ in[11] = load_input_data(input + 8 + 3 * 16);
+ in[12] = load_input_data(input + 8 + 4 * 16);
+ in[13] = load_input_data(input + 8 + 5 * 16);
+ in[14] = load_input_data(input + 8 + 6 * 16);
+ in[15] = load_input_data(input + 8 + 7 * 16);
+ array_transpose_8x8(in + 8, in + 8);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ idct16_8col(in);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ iadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX: idtx16_8col(in); break;
+#endif
+ default: assert(0); break;
+ }
+
+ // Scale
+ scale_sqrt2_8x8(in);
+ scale_sqrt2_8x8(in + 8);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct8_sse2(in);
+ aom_idct8_sse2(in + 8);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in + 8);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+ iidtx8_sse2(in);
+ iidtx8_sse2(in + 8);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ write_buffer_8x8_round6(dest, in, stride);
+ write_buffer_8x8_round6(dest + 8, in + 8, stride);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST:
+ write_buffer_8x8_round6(dest + stride * 7, in, -stride);
+ write_buffer_8x8_round6(dest + stride * 7 + 8, in + 8, -stride);
+ break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x8_round6(dest, in + 8, stride);
+ write_buffer_8x8_round6(dest + 8, in, stride);
+ break;
+ case FLIPADST_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride);
+ write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride);
+ break;
+#endif
+ default: assert(0); break;
+ }
+}
+
+static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in,
+ int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 5);
+ in[1] = _mm_srai_epi16(in[1], 5);
+ in[2] = _mm_srai_epi16(in[2], 5);
+ in[3] = _mm_srai_epi16(in[3], 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+}
+
+void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[8];
+
+ in[0] = load_input_data(input + 0 * 8);
+ in[1] = load_input_data(input + 1 * 8);
+ in[2] = load_input_data(input + 2 * 8);
+ in[3] = load_input_data(input + 3 * 8);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct8_sse2(in);
+ break;
+ case DCT_ADST:
+ case ADST_ADST: aom_iadst8_sse2(in); break;
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST: aom_iadst8_sse2(in); break;
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+      case IDTX:
+        iidtx8_sse2(in);
+        array_transpose_8x8(in, in);
+        break;
+#endif
+ default: assert(0); break;
+ }
+
+ scale_sqrt2_8x8(in);
+
+ // Repack data. We pack into the bottom half of 'in'
+ // so that the next repacking stage can pack into the
+ // top half without overwriting anything
+ in[7] = _mm_unpacklo_epi64(in[6], in[7]);
+ in[6] = _mm_unpacklo_epi64(in[4], in[5]);
+ in[5] = _mm_unpacklo_epi64(in[2], in[3]);
+ in[4] = _mm_unpacklo_epi64(in[0], in[1]);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct4_sse2(in + 4);
+ aom_idct4_sse2(in + 6);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst4_sse2(in + 4);
+ aom_iadst4_sse2(in + 6);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ iidtx4_sse2(in + 4);
+ array_transpose_4x4(in + 4);
+ iidtx4_sse2(in + 6);
+ array_transpose_4x4(in + 6);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ // Repack data
+ in[0] = _mm_unpacklo_epi64(in[4], in[6]);
+ in[1] = _mm_unpackhi_epi64(in[4], in[6]);
+ in[2] = _mm_unpacklo_epi64(in[5], in[7]);
+ in[3] = _mm_unpackhi_epi64(in[5], in[7]);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX: break;
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 4); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ break;
+ case FLIPADST_FLIPADST:
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ FLIPUD_PTR(dest, stride, 4);
+#endif
+ break;
+ default: assert(0); break;
+ }
+ write_buffer_8x4_round5(dest, in, stride);
+}
+
+static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in,
+ int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 5);
+ in[1] = _mm_srai_epi16(in[1], 5);
+ in[2] = _mm_srai_epi16(in[2], 5);
+ in[3] = _mm_srai_epi16(in[3], 5);
+
+ // Reconstruction and Store
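+  // Each output row here is only four pixels wide, so RECON_AND_STORE
+  // (which processes eight pixels per row) cannot be used. Instead, pairs
+  // of rows are interleaved, widened to 16 bits, added to the transform
+  // output and stored back four bytes at a time.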
+ {
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+ __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ __m128i d4 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 4));
+ __m128i d5 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 5));
+ __m128i d6 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 6));
+ __m128i d7 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 7));
+
+ d0 = _mm_unpacklo_epi32(d0, d1);
+ d2 = _mm_unpacklo_epi32(d2, d3);
+ d4 = _mm_unpacklo_epi32(d4, d5);
+ d6 = _mm_unpacklo_epi32(d6, d7);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d4 = _mm_unpacklo_epi8(d4, zero);
+ d6 = _mm_unpacklo_epi8(d6, zero);
+ d0 = _mm_add_epi16(d0, in[0]);
+ d2 = _mm_add_epi16(d2, in[1]);
+ d4 = _mm_add_epi16(d4, in[2]);
+ d6 = _mm_add_epi16(d6, in[3]);
+
+ d0 = _mm_packus_epi16(d0, d2);
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_packus_epi16(d4, d6);
+ *(int *)(dest + stride * 4) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 5) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 6) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 7) = _mm_cvtsi128_si32(d0);
+ }
+}
+
+void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[8];
+
+ // Load rows, packed two per element of 'in'.
+ // We pack into the bottom half of 'in' so that the
+ // later repacking stage can pack into the
+ // top half without overwriting anything
+ in[4] = load_input_data(input + 0 * 8);
+ in[5] = load_input_data(input + 1 * 8);
+ in[6] = load_input_data(input + 2 * 8);
+ in[7] = load_input_data(input + 3 * 8);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct4_sse2(in + 4);
+ aom_idct4_sse2(in + 6);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ aom_iadst4_sse2(in + 4);
+ aom_iadst4_sse2(in + 6);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+ iidtx4_sse2(in + 4);
+ array_transpose_4x4(in + 4);
+ iidtx4_sse2(in + 6);
+ array_transpose_4x4(in + 6);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ scale_sqrt2_8x4(in + 4);
+
+ // Repack data
+ in[0] = _mm_unpacklo_epi64(in[4], in[6]);
+ in[1] = _mm_unpackhi_epi64(in[4], in[6]);
+ in[2] = _mm_unpacklo_epi64(in[5], in[7]);
+ in[3] = _mm_unpackhi_epi64(in[5], in[7]);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct8_sse2(in);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst8_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ iidtx8_sse2(in);
+ array_transpose_8x8(in, in);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
+ in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
+ in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
+ in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
+ break;
+ case FLIPADST_FLIPADST:
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
+ in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
+ in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
+ in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
+ FLIPUD_PTR(dest, stride, 8);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ in[0] = _mm_unpacklo_epi64(in[0], in[1]);
+ in[1] = _mm_unpacklo_epi64(in[2], in[3]);
+ in[2] = _mm_unpacklo_epi64(in[4], in[5]);
+ in[3] = _mm_unpacklo_epi64(in[6], in[7]);
+ write_buffer_4x8_round5(dest, in, stride);
+}
+
+// Note: The 32-point transforms below process 16 columns at a time, taking
+// their input as four 8x16 blocks (each stored as a __m128i[16]) which are
+// the four quadrants of the overall 16x32 input buffer:
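+//
+//                 cols 0-7   cols 8-15
+//   rows  0-15:      tl          tr
+//   rows 16-31:      bl          br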
+static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ array_transpose_16x16(tl, tr);
+ array_transpose_16x16(bl, br);
+ idct32_8col(tl, bl);
+ idct32_8col(tr, br);
+}
+
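+// The "half-right" 32-point inverse transform: the bottom half of the
+// input passes through a scaled identity (a shift left by 2, plus a
+// transpose) to form the top half of the output, while the top half of the
+// input is scaled by sqrt(2) and run through a full 16-point iDCT to form
+// the bottom half.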
+static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ __m128i tmpl[16], tmpr[16];
+ int i;
+
+ // Copy the top half of the input to temporary storage
+ for (i = 0; i < 16; ++i) {
+ tmpl[i] = tl[i];
+ tmpr[i] = tr[i];
+ }
+
+ // Generate the top half of the output
+ for (i = 0; i < 16; ++i) {
+ tl[i] = _mm_slli_epi16(bl[i], 2);
+ tr[i] = _mm_slli_epi16(br[i], 2);
+ }
+ array_transpose_16x16(tl, tr);
+
+ // Copy the temporary storage back to the bottom half of the input
+ for (i = 0; i < 16; ++i) {
+ bl[i] = tmpl[i];
+ br[i] = tmpr[i];
+ }
+
+ // Generate the bottom half of the output
+ scale_sqrt2_8x16(bl);
+ scale_sqrt2_8x16(br);
+ aom_idct16_sse2(bl, br); // Includes a transposition
+}
+
+#if CONFIG_EXT_TX
+static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ int i;
+ array_transpose_16x16(tl, tr);
+ array_transpose_16x16(bl, br);
+ for (i = 0; i < 16; ++i) {
+ tl[i] = _mm_slli_epi16(tl[i], 2);
+ tr[i] = _mm_slli_epi16(tr[i], 2);
+ bl[i] = _mm_slli_epi16(bl[i], 2);
+ br[i] = _mm_slli_epi16(br[i], 2);
+ }
+}
+#endif // CONFIG_EXT_TX
+
+static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
+ __m128i *intr, __m128i *inbl,
+ __m128i *inbr, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ intl[i] = _mm_adds_epi16(intl[i], final_rounding);
+ intr[i] = _mm_adds_epi16(intr[i], final_rounding);
+ inbl[i] = _mm_adds_epi16(inbl[i], final_rounding);
+ inbr[i] = _mm_adds_epi16(inbr[i], final_rounding);
+ intl[i] = _mm_srai_epi16(intl[i], 6);
+ intr[i] = _mm_srai_epi16(intr[i], 6);
+ inbl[i] = _mm_srai_epi16(inbl[i], 6);
+ inbr[i] = _mm_srai_epi16(inbr[i], 6);
+ RECON_AND_STORE(dest + i * stride + 0, intl[i]);
+ RECON_AND_STORE(dest + i * stride + 8, intr[i]);
+ RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]);
+ RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]);
+ }
+}
+
+void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i intl[16], intr[16], inbl[16], inbr[16];
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ intl[i] = load_input_data(input + i * 16 + 0);
+ intr[i] = load_input_data(input + i * 16 + 8);
+ inbl[i] = load_input_data(input + (i + 16) * 16 + 0);
+ inbr[i] = load_input_data(input + (i + 16) * 16 + 8);
+ }
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct16_sse2(intl, intr);
+ aom_idct16_sse2(inbl, inbr);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ aom_iadst16_sse2(intl, intr);
+ aom_iadst16_sse2(inbl, inbr);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+ iidtx16_sse2(intl, intr);
+ iidtx16_sse2(inbl, inbr);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ scale_sqrt2_8x16(intl);
+ scale_sqrt2_8x16(intr);
+ scale_sqrt2_8x16(inbl);
+ scale_sqrt2_8x16(inbr);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ idct32_16col(intl, intr, inbl, inbr);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ ihalfright32_16col(intl, intr, inbl, inbr);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp = intl[i];
+ intl[i] = mm_reverse_epi16(intr[i]);
+ intr[i] = mm_reverse_epi16(tmp);
+ tmp = inbl[i];
+ inbl[i] = mm_reverse_epi16(inbr[i]);
+ inbr[i] = mm_reverse_epi16(tmp);
+ }
+ break;
+ case FLIPADST_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp = intl[i];
+ intl[i] = mm_reverse_epi16(intr[i]);
+ intr[i] = mm_reverse_epi16(tmp);
+ tmp = inbl[i];
+ inbl[i] = mm_reverse_epi16(inbr[i]);
+ inbr[i] = mm_reverse_epi16(tmp);
+ }
+ FLIPUD_PTR(dest, stride, 32);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
+}
+
+static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
+ __m128i *in1, __m128i *in2,
+ __m128i *in3, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ in0[i] = _mm_adds_epi16(in0[i], final_rounding);
+ in1[i] = _mm_adds_epi16(in1[i], final_rounding);
+ in2[i] = _mm_adds_epi16(in2[i], final_rounding);
+ in3[i] = _mm_adds_epi16(in3[i], final_rounding);
+ in0[i] = _mm_srai_epi16(in0[i], 6);
+ in1[i] = _mm_srai_epi16(in1[i], 6);
+ in2[i] = _mm_srai_epi16(in2[i], 6);
+ in3[i] = _mm_srai_epi16(in3[i], 6);
+ RECON_AND_STORE(dest + i * stride + 0, in0[i]);
+ RECON_AND_STORE(dest + i * stride + 8, in1[i]);
+ RECON_AND_STORE(dest + i * stride + 16, in2[i]);
+ RECON_AND_STORE(dest + i * stride + 24, in3[i]);
+ }
+}
+
+void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in0[16], in1[16], in2[16], in3[16];
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ in0[i] = load_input_data(input + i * 32 + 0);
+ in1[i] = load_input_data(input + i * 32 + 8);
+ in2[i] = load_input_data(input + i * 32 + 16);
+ in3[i] = load_input_data(input + i * 32 + 24);
+ }
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ idct32_16col(in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ ihalfright32_16col(in0, in1, in2, in3);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
+#endif
+ default: assert(0); break;
+ }
+
+ scale_sqrt2_8x16(in0);
+ scale_sqrt2_8x16(in1);
+ scale_sqrt2_8x16(in2);
+ scale_sqrt2_8x16(in3);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct16_sse2(in0, in1);
+ aom_idct16_sse2(in2, in3);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in2, in3);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ iidtx16_sse2(in0, in1);
+ iidtx16_sse2(in2, in3);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp1 = in0[i];
+ __m128i tmp2 = in1[i];
+ in0[i] = mm_reverse_epi16(in3[i]);
+ in1[i] = mm_reverse_epi16(in2[i]);
+ in2[i] = mm_reverse_epi16(tmp2);
+ in3[i] = mm_reverse_epi16(tmp1);
+ }
+ break;
+ case FLIPADST_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp1 = in0[i];
+ __m128i tmp2 = in1[i];
+ in0[i] = mm_reverse_epi16(in3[i]);
+ in1[i] = mm_reverse_epi16(in2[i]);
+ in2[i] = mm_reverse_epi16(tmp2);
+ in3[i] = mm_reverse_epi16(tmp1);
+ }
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride);
+}
diff --git a/third_party/aom/av1/common/x86/pvq_sse4.c b/third_party/aom/av1/common/x86/pvq_sse4.c
new file mode 100644
index 000000000..b3ed9efdf
--- /dev/null
+++ b/third_party/aom/av1/common/x86/pvq_sse4.c
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <float.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/x86/pvq_sse4.h"
+#include "../odintrin.h"
+#include "av1/common/pvq.h"
+
+#define EPSILON 1e-15f
+
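+/* Broadcast the sum of all four lanes to every lane: for x = {a, b, c, d},
+   the first shuffled add yields {a+c, b+d, a+c, b+d} and the second yields
+   a+b+c+d in every lane. horizontal_sum_epi32 below is the integer
+   analogue. */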
+static __m128 horizontal_sum_ps(__m128 x) {
+ x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2)));
+ x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)));
+ return x;
+}
+
+static __m128i horizontal_sum_epi32(__m128i x) {
+ x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
+ x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)));
+ return x;
+}
+
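+/* Scalar wrapper around _mm_rsqrt_ss, an approximate reciprocal square
+   root with a relative error on the order of 2^-12. This search is not
+   bit exact anyway, so the approximation is acceptable. */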
+static INLINE float rsqrtf(float x) {
+ float y;
+ _mm_store_ss(&y, _mm_rsqrt_ss(_mm_load_ss(&x)));
+ return y;
+}
+
+/** Find the codepoint on the given PSphere closest to the desired
+ * vector. This is a single-precision SIMD implementation of the PVQ
+ * search. It is close to the pvq_search_rdo_double_c implementation, but
+ * it is not bit exact and performs slightly worse on PSNR. One reason is
+ * that this code runs more RDO iterations than the C code. It also uses
+ * single-precision floating-point math, whereas the C version uses double
+ * precision.
+ *
+ * @param [in] xcoeff input vector to quantize (x in the math doc)
+ * @param [in] n number of dimensions
+ * @param [in] k number of pulses
+ * @param [out] ypulse optimal codevector found (y in the math doc)
+ * @param [in] g2 multiplier for the distortion (typically squared
+ * gain units)
+ * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
+ * @param [in] prev_k number of pulses already in ypulse that we should
+ * reuse for the search (or 0 for a new search)
+ * @return cosine of the angle between x and y (between 0 and 1)
+ */
+double pvq_search_rdo_double_sse4_1(const od_val16 *xcoeff, int n, int k,
+ int *ypulse, double g2,
+ double pvq_norm_lambda, int prev_k) {
+ int i, j;
+ int reuse_pulses = prev_k > 0 && prev_k <= k;
+ /* TODO - This blows our 8kB stack space budget and should be fixed when
+ converting PVQ to fixed point. */
+ float xx = 0, xy = 0, yy = 0;
+ float x[MAXN + 3];
+ float y[MAXN + 3];
+ float sign_y[MAXN + 3];
+ for (i = 0; i < n; i++) {
+ float tmp = (float)xcoeff[i];
+ xx += tmp * tmp;
+ x[i] = xcoeff[i];
+ }
+
+ x[n] = x[n + 1] = x[n + 2] = 0;
+ ypulse[n] = ypulse[n + 1] = ypulse[n + 2] = 0;
+
+ __m128 sums = _mm_setzero_ps();
+ for (i = 0; i < n; i += 4) {
+ __m128 x4 = _mm_loadu_ps(&x[i]);
+ __m128 s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
+ /* Save the sign, we'll put it back later. */
+ _mm_storeu_ps(&sign_y[i], s4);
+ /* Get rid of the sign. */
+ x4 = _mm_andnot_ps(_mm_set_ps1(-0.f), x4);
+ sums = _mm_add_ps(sums, x4);
+ if (!reuse_pulses) {
+ /* Clear y and ypulse in case we don't do the projection. */
+ _mm_storeu_ps(&y[i], _mm_setzero_ps());
+ _mm_storeu_si128((__m128i *)&ypulse[i], _mm_setzero_si128());
+ }
+ _mm_storeu_ps(&x[i], x4);
+ }
+ sums = horizontal_sum_ps(sums);
+ int pulses_left = k;
+ {
+ __m128i pulses_sum;
+ __m128 yy4, xy4;
+ xy4 = yy4 = _mm_setzero_ps();
+ pulses_sum = _mm_setzero_si128();
+ if (reuse_pulses) {
+ /* We reuse pulses from a previous search so we don't have to search them
+ again. */
+ for (j = 0; j < n; j += 4) {
+ __m128 x4, y4;
+ __m128i iy4;
+ iy4 = _mm_abs_epi32(_mm_loadu_si128((__m128i *)&ypulse[j]));
+ pulses_sum = _mm_add_epi32(pulses_sum, iy4);
+ _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
+ y4 = _mm_cvtepi32_ps(iy4);
+ x4 = _mm_loadu_ps(&x[j]);
+ xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
+ yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
+        /* Double the y[] vector so we don't have to do it in the search
+           loop. */
+ _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
+ }
+ pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
+ xy4 = horizontal_sum_ps(xy4);
+ xy = _mm_cvtss_f32(xy4);
+ yy4 = horizontal_sum_ps(yy4);
+ yy = _mm_cvtss_f32(yy4);
+ } else if (k > (n >> 1)) {
+ /* Do a pre-search by projecting on the pyramid. */
+ __m128 rcp4;
+ float sum = _mm_cvtss_f32(sums);
+      /* If x is too small, just replace it with a pulse at 0. This
+         prevents infinities and NaNs from causing too many pulses to be
+         allocated. Here, 64 is an approximation of infinity. */
+ if (sum <= EPSILON) {
+ x[0] = 1.f;
+ for (i = 1; i < n; i++) {
+ x[i] = 0;
+ }
+ sums = _mm_set_ps1(1.f);
+ }
+ /* Using k + e with e < 1 guarantees we cannot get more than k pulses. */
+ rcp4 = _mm_mul_ps(_mm_set_ps1((float)k + .8f), _mm_rcp_ps(sums));
+ xy4 = yy4 = _mm_setzero_ps();
+ pulses_sum = _mm_setzero_si128();
+ for (j = 0; j < n; j += 4) {
+ __m128 rx4, x4, y4;
+ __m128i iy4;
+ x4 = _mm_loadu_ps(&x[j]);
+ rx4 = _mm_mul_ps(x4, rcp4);
+ iy4 = _mm_cvttps_epi32(rx4);
+ pulses_sum = _mm_add_epi32(pulses_sum, iy4);
+ _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
+ y4 = _mm_cvtepi32_ps(iy4);
+ xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
+ yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
+        /* Double the y[] vector so we don't have to do it in the search
+           loop. */
+ _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
+ }
+ pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
+ xy = _mm_cvtss_f32(horizontal_sum_ps(xy4));
+ yy = _mm_cvtss_f32(horizontal_sum_ps(yy4));
+ }
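+    /* Sentinels for the padded tail lanes (the loops round n up to a
+       multiple of 4): y > 0 keeps the rsqrt argument positive, and together
+       with the huge rate penalty set below these lanes can never win the
+       argmax in the search loop. */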
+ x[n] = x[n + 1] = x[n + 2] = -100;
+ y[n] = y[n + 1] = y[n + 2] = 100;
+ }
+
+ /* This should never happen. */
+ OD_ASSERT(pulses_left <= n + 3);
+
+ float lambda_delta_rate[MAXN + 3];
+ if (pulses_left) {
+ /* Hoist lambda to avoid the multiply in the loop. */
+ float lambda =
+ 0.5f * sqrtf(xx) * (float)pvq_norm_lambda / (FLT_MIN + (float)g2);
+ float delta_rate = 3.f / n;
+ __m128 count = _mm_set_ps(3, 2, 1, 0);
+ for (i = 0; i < n; i += 4) {
+ _mm_storeu_ps(&lambda_delta_rate[i],
+ _mm_mul_ps(count, _mm_set_ps1(lambda * delta_rate)));
+ count = _mm_add_ps(count, _mm_set_ps(4, 4, 4, 4));
+ }
+ }
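+  /* Rate sentinel for the padded tail lanes: an effectively infinite
+     penalty keeps them out of the argmax below. */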
+ lambda_delta_rate[n] = lambda_delta_rate[n + 1] = lambda_delta_rate[n + 2] =
+ 1e30f;
+
+ for (i = 0; i < pulses_left; i++) {
+ int best_id = 0;
+ __m128 xy4, yy4;
+ __m128 max, max2;
+ __m128i count;
+ __m128i pos;
+
+ /* The squared magnitude term gets added anyway, so we might as well
+ add it outside the loop. */
+ yy = yy + 1;
+ xy4 = _mm_load1_ps(&xy);
+ yy4 = _mm_load1_ps(&yy);
+ max = _mm_setzero_ps();
+ pos = _mm_setzero_si128();
+ count = _mm_set_epi32(3, 2, 1, 0);
+ for (j = 0; j < n; j += 4) {
+ __m128 x4, y4, r4;
+ x4 = _mm_loadu_ps(&x[j]);
+ y4 = _mm_loadu_ps(&y[j]);
+ x4 = _mm_add_ps(x4, xy4);
+ y4 = _mm_add_ps(y4, yy4);
+ y4 = _mm_rsqrt_ps(y4);
+ r4 = _mm_mul_ps(x4, y4);
+ /* Subtract lambda. */
+ r4 = _mm_sub_ps(r4, _mm_loadu_ps(&lambda_delta_rate[j]));
+ /* Update the index of the max. */
+ pos = _mm_max_epi16(
+ pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
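+      /* The 16-bit max is safe on these 32-bit lanes: the indices are small
+         non-negative values, so the upper 16 bits of every lane are zero. */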
+ /* Update the max. */
+ max = _mm_max_ps(max, r4);
+ /* Update the indices (+4) */
+ count = _mm_add_epi32(count, _mm_set_epi32(4, 4, 4, 4));
+ }
+ /* Horizontal max. */
+ max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
+ max2 =
+ _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
+    /* Now that max2 contains the max at all positions, look at which
+       value(s) of the partial max are equal to the global max. */
+ pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
+ pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
+ pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
+ best_id = _mm_cvtsi128_si32(pos);
+ OD_ASSERT(best_id < n);
+ /* Updating the sums of the new pulse(s) */
+ xy = xy + x[best_id];
+ /* We're multiplying y[j] by two so we don't have to do it here. */
+ yy = yy + y[best_id];
+ /* Only now that we've made the final choice, update y/ypulse. */
+ /* Multiplying y[j] by 2 so we don't have to do it everywhere else. */
+ y[best_id] += 2;
+ ypulse[best_id]++;
+ }
+
+ /* Put the original sign back. */
+ for (i = 0; i < n; i += 4) {
+ __m128i y4;
+ __m128i s4;
+ y4 = _mm_loadu_si128((__m128i *)&ypulse[i]);
+ s4 = _mm_castps_si128(_mm_loadu_ps(&sign_y[i]));
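+    /* Each lane of s4 is 0 or all ones (-1). (y + s) ^ s gives y when
+       s == 0 and ~(y - 1) == -y when s == -1: a branchless conditional
+       negate. */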
+ y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
+ _mm_storeu_si128((__m128i *)&ypulse[i], y4);
+ }
+ return xy * rsqrtf(xx * yy + FLT_MIN);
+}
diff --git a/third_party/aom/av1/common/x86/pvq_sse4.h b/third_party/aom/av1/common/x86/pvq_sse4.h
new file mode 100644
index 000000000..3c4ce8543
--- /dev/null
+++ b/third_party/aom/av1/common/x86/pvq_sse4.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_PVQ_X86_SSE4_H_
+#define AOM_COMMON_PVQ_X86_SSE4_H_
+#endif // AOM_COMMON_PVQ_X86_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
new file mode 100644
index 000000000..260faa8c9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -0,0 +1,1805 @@
+#include <smmintrin.h>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "av1/common/restoration.h"
+
+/* Calculate four consecutive entries of the intermediate A and B arrays
+ (corresponding to the first loop in the C version of
+ av1_selfguided_restoration)
+*/
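+/* In the notation of the C code, each of the four entries computes:
+     p = n * sum_sq - sum * sum     (n^2 times the window variance)
+     z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS), clamped to 255
+     A = x_by_xplus1[z]
+     B = ROUND_POWER_OF_TWO((SGRPROJ_SGR - A) * sum * one_over_n,
+                            SGRPROJ_RECIP_BITS)
+*/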
+static void calc_block(__m128i sum, __m128i sum_sq, __m128i n,
+ __m128i one_over_n, __m128i s, int bit_depth, int idx,
+ int32_t *A, int32_t *B) {
+ __m128i a, b, p;
+#if CONFIG_HIGHBITDEPTH
+ if (bit_depth > 8) {
+ __m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1);
+ __m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1);
+ __m128i shift_a = _mm_set_epi64x(0, 2 * (bit_depth - 8));
+ __m128i shift_b = _mm_set_epi64x(0, bit_depth - 8);
+ a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a), shift_a);
+ b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b), shift_b);
+ a = _mm_mullo_epi32(a, n);
+ b = _mm_mullo_epi32(b, b);
+ p = _mm_sub_epi32(_mm_max_epi32(a, b), b);
+ } else {
+#endif
+ (void)bit_depth;
+ a = _mm_mullo_epi32(sum_sq, n);
+ b = _mm_mullo_epi32(sum, sum);
+ p = _mm_sub_epi32(a, b);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+
+ __m128i rounding_z = _mm_set1_epi32((1 << SGRPROJ_MTABLE_BITS) >> 1);
+ __m128i z = _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rounding_z),
+ SGRPROJ_MTABLE_BITS);
+ z = _mm_min_epi32(z, _mm_set1_epi32(255));
+
+ // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+ // gather using scalar loads.
+ __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
+ x_by_xplus1[_mm_extract_epi32(z, 2)],
+ x_by_xplus1[_mm_extract_epi32(z, 1)],
+ x_by_xplus1[_mm_extract_epi32(z, 0)]);
+
+ _mm_storeu_si128((__m128i *)&A[idx], a_res);
+
+ __m128i rounding_res = _mm_set1_epi32((1 << SGRPROJ_RECIP_BITS) >> 1);
+ __m128i a_complement = _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
+ __m128i b_int =
+ _mm_mullo_epi32(a_complement, _mm_mullo_epi32(sum, one_over_n));
+ __m128i b_res =
+ _mm_srli_epi32(_mm_add_epi32(b_int, rounding_res), SGRPROJ_RECIP_BITS);
+
+ _mm_storeu_si128((__m128i *)&B[idx], b_res);
+}
+
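+// Each of the _v routines below computes the vertical window sums B (of
+// pixels) and A (of squared pixels), four columns at a time, using a
+// running add/subtract: one row enters and one row leaves the window per
+// output row, with clamping at the top and bottom edges of the frame.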
+static void selfguided_restoration_1_v(uint8_t *src, int width, int height,
+ int src_stride, int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ // Vertical sum
+ // When the width is not a multiple of 4, we know that 'stride' is rounded up
+ // to a multiple of 4. So it is safe for this loop to calculate extra columns
+ // at the right-hand edge of the frame.
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp;
+
+ a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j]));
+ b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j]));
+
+ sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b));
+ tmp = _mm_unpacklo_epi16(a, b);
+ sum_sq = _mm_madd_epi16(tmp, tmp);
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 1; i < height - 2; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ y = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j]));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+ }
+}
+
+static void selfguided_restoration_1_h(int32_t *A, int32_t *B, int width,
+ int height, int buf_stride, int eps,
+ int bit_depth) {
+ int i, j;
+
+ // Horizontal sum
+ int width_extend = (width + 3) & ~3;
+ for (i = 0; i < height; ++i) {
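+    // h is the height of the clamped vertical window used by the column
+    // sums above: 3 for interior rows, smaller near the top/bottom edges.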
+ int h = AOMMIN(2, height - i) + AOMMIN(1, i);
+
+ __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
+ __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
+ __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
+ __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
+
+ // Note: The _mm_slli_si128 call sets up a register containing
+ // {0, A[i * buf_stride], ..., A[i * buf_stride + 2]},
+ // so that the first element of 'sum' (which should only add two values
+ // together) ends up calculated correctly.
+ __m128i sum_ = _mm_add_epi32(_mm_slli_si128(b1, 4),
+ _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)));
+ __m128i sum_sq_ = _mm_add_epi32(
+ _mm_slli_si128(a1, 4), _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)));
+ __m128i n = _mm_set_epi32(3 * h, 3 * h, 3 * h, 2 * h);
+ __m128i one_over_n =
+ _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[3 * h - 1],
+ one_by_x[3 * h - 1], one_by_x[2 * h - 1]);
+ __m128i s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1],
+ sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][2 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A,
+ B);
+
+ n = _mm_set1_epi32(3 * h);
+ one_over_n = _mm_set1_epi32(one_by_x[3 * h - 1]);
+ s = _mm_set1_epi32(sgrproj_mtable[eps - 1][3 * h - 1]);
+
+ // Re-align a1 and b1 so that they start at index i * buf_stride + 3
+ a2 = _mm_alignr_epi8(a2, a1, 12);
+ b2 = _mm_alignr_epi8(b2, b1, 12);
+
+ // Note: When the width is not a multiple of 4, this loop may end up
+ // writing to the last 4 columns of the frame, potentially with incorrect
+ // values (especially for r=2 and r=3).
+ // This is fine, since we fix up those values in the block after this
+ // loop, and in exchange we never have more than four values to
+ // write / fix up after this loop finishes.
+ for (j = 4; j < width_extend - 4; j += 4) {
+ a1 = a2;
+ b1 = b2;
+ a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]);
+ b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]);
+ /* Loop invariant: At this point,
+ a1 = original A[i * buf_stride + j - 1 : i * buf_stride + j + 3]
+ a2 = original A[i * buf_stride + j + 3 : i * buf_stride + j + 7]
+ and similar for b1,b2 and B
+ */
+ sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
+ _mm_alignr_epi8(b2, b1, 8)));
+ sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
+ _mm_alignr_epi8(a2, a1, 8)));
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+ __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]);
+ __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]);
+
+ j = width - 4;
+ switch (width % 4) {
+ case 0:
+ a1 = a2;
+ b1 = b2;
+ a2 = a3;
+ b2 = b3;
+ break;
+ case 1:
+ a1 = _mm_alignr_epi8(a2, a1, 4);
+ b1 = _mm_alignr_epi8(b2, b1, 4);
+ a2 = _mm_alignr_epi8(a3, a2, 4);
+ b2 = _mm_alignr_epi8(b3, b2, 4);
+ break;
+ case 2:
+ a1 = _mm_alignr_epi8(a2, a1, 8);
+ b1 = _mm_alignr_epi8(b2, b1, 8);
+ a2 = _mm_alignr_epi8(a3, a2, 8);
+ b2 = _mm_alignr_epi8(b3, b2, 8);
+ break;
+ case 3:
+ a1 = _mm_alignr_epi8(a2, a1, 12);
+ b1 = _mm_alignr_epi8(b2, b1, 12);
+ a2 = _mm_alignr_epi8(a3, a2, 12);
+ b2 = _mm_alignr_epi8(b3, b2, 12);
+ break;
+ }
+
+ // Zero out the data loaded from "off the edge" of the array
+ __m128i zero = _mm_setzero_si128();
+ a2 = _mm_blend_epi16(a2, zero, 0xfc);
+ b2 = _mm_blend_epi16(b2, zero, 0xfc);
+
+ sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
+ _mm_alignr_epi8(b2, b1, 8)));
+ sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
+ _mm_alignr_epi8(a2, a1, 8)));
+ n = _mm_set_epi32(2 * h, 3 * h, 3 * h, 3 * h);
+ one_over_n = _mm_set_epi32(one_by_x[2 * h - 1], one_by_x[3 * h - 1],
+ one_by_x[3 * h - 1], one_by_x[3 * h - 1]);
+ s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][2 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1],
+ sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+}
+
+static void selfguided_restoration_2_v(uint8_t *src, int width, int height,
+ int src_stride, int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ // Vertical sum
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, c, c2, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp;
+
+ a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j]));
+ b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j]));
+ c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
+
+ sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c));
+ // Important: Since c may be up to 2^8, the result on squaring may
+ // be up to 2^16. So we need to zero-extend, not sign-extend.
+ c2 = _mm_cvtepu16_epi32(_mm_mullo_epi16(c, c));
+ tmp = _mm_unpacklo_epi16(a, b);
+ sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2);
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 2; i < height - 3; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)&src[(i - 2) * src_stride + j])));
+ y = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)&src[(i + 3) * src_stride + j])));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
+ }
+}
+
+static void selfguided_restoration_2_h(int32_t *A, int32_t *B, int width,
+ int height, int buf_stride, int eps,
+ int bit_depth) {
+ int i, j;
+
+ // Horizontal sum
+ int width_extend = (width + 3) & ~3;
+ for (i = 0; i < height; ++i) {
+ int h = AOMMIN(3, height - i) + AOMMIN(2, i);
+
+ __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
+ __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
+ __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
+ __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
+
+ __m128i sum_ = _mm_add_epi32(
+ _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_si128(b1, 8), _mm_slli_si128(b1, 4)),
+ _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4))),
+ _mm_alignr_epi8(b2, b1, 8));
+ __m128i sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_si128(a1, 8), _mm_slli_si128(a1, 4)),
+ _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4))),
+ _mm_alignr_epi8(a2, a1, 8));
+
+ __m128i n = _mm_set_epi32(5 * h, 5 * h, 4 * h, 3 * h);
+ __m128i one_over_n =
+ _mm_set_epi32(one_by_x[5 * h - 1], one_by_x[5 * h - 1],
+ one_by_x[4 * h - 1], one_by_x[3 * h - 1]);
+ __m128i s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1],
+ sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A,
+ B);
+
+ // Re-align a1 and b1 so that they start at index i * buf_stride + 2
+ a2 = _mm_alignr_epi8(a2, a1, 8);
+ b2 = _mm_alignr_epi8(b2, b1, 8);
+
+ n = _mm_set1_epi32(5 * h);
+ one_over_n = _mm_set1_epi32(one_by_x[5 * h - 1]);
+ s = _mm_set1_epi32(sgrproj_mtable[eps - 1][5 * h - 1]);
+
+ for (j = 4; j < width_extend - 4; j += 4) {
+ a1 = a2;
+ a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]);
+ b1 = b2;
+ b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]);
+ /* Loop invariant: At this point,
+ a1 = original A[i * buf_stride + j - 2 : i * buf_stride + j + 2]
+ a2 = original A[i * buf_stride + j + 2 : i * buf_stride + j + 6]
+ and similar for b1,b2 and B
+ */
+ sum_ = _mm_add_epi32(
+ _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
+ _mm_alignr_epi8(b2, b1, 8))),
+ _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2));
+ sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
+ _mm_alignr_epi8(a2, a1, 8))),
+ _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2));
+
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+ // If the width is not a multiple of 4, we need to reset j to width - 4
+ // and adjust a1, a2, b1, b2 so that the loop invariant above is maintained
+ __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]);
+ __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]);
+
+ j = width - 4;
+ switch (width % 4) {
+ case 0:
+ a1 = a2;
+ b1 = b2;
+ a2 = a3;
+ b2 = b3;
+ break;
+ case 1:
+ a1 = _mm_alignr_epi8(a2, a1, 4);
+ b1 = _mm_alignr_epi8(b2, b1, 4);
+ a2 = _mm_alignr_epi8(a3, a2, 4);
+ b2 = _mm_alignr_epi8(b3, b2, 4);
+ break;
+ case 2:
+ a1 = _mm_alignr_epi8(a2, a1, 8);
+ b1 = _mm_alignr_epi8(b2, b1, 8);
+ a2 = _mm_alignr_epi8(a3, a2, 8);
+ b2 = _mm_alignr_epi8(b3, b2, 8);
+ break;
+ case 3:
+ a1 = _mm_alignr_epi8(a2, a1, 12);
+ b1 = _mm_alignr_epi8(b2, b1, 12);
+ a2 = _mm_alignr_epi8(a3, a2, 12);
+ b2 = _mm_alignr_epi8(b3, b2, 12);
+ break;
+ }
+
+ // Zero out the data loaded from "off the edge" of the array
+ __m128i zero = _mm_setzero_si128();
+ a2 = _mm_blend_epi16(a2, zero, 0xf0);
+ b2 = _mm_blend_epi16(b2, zero, 0xf0);
+
+ sum_ = _mm_add_epi32(
+ _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
+ _mm_alignr_epi8(b2, b1, 8))),
+ _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2));
+ sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
+ _mm_alignr_epi8(a2, a1, 8))),
+ _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2));
+
+ n = _mm_set_epi32(3 * h, 4 * h, 5 * h, 5 * h);
+ one_over_n = _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[4 * h - 1],
+ one_by_x[5 * h - 1], one_by_x[5 * h - 1]);
+ s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1],
+ sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+}
+
+static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
+ int src_stride, int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ // Vertical sum over 7-pixel regions, 4 columns at a time
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, c, d, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp, tmp2;
+
+ a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j]));
+ b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j]));
+ c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
+ d = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]));
+
+ sum = _mm_cvtepi16_epi32(
+ _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d)));
+ tmp = _mm_unpacklo_epi16(a, b);
+ tmp2 = _mm_unpacklo_epi16(c, d);
+ sum_sq =
+ _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2));
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[5 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[6 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 3; i < height - 4; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)&src[(i - 3) * src_stride + j])));
+ y = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)&src[(i + 4) * src_stride + j])));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq);
+ }
+}
+
+static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width,
+ int height, int buf_stride, int eps,
+ int bit_depth) {
+ int i, j;
+ // Horizontal sum over 7-pixel regions of dst
+ int width_extend = (width + 3) & ~3;
+ for (i = 0; i < height; ++i) {
+ int h = AOMMIN(4, height - i) + AOMMIN(3, i);
+
+ __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
+ __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
+ __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
+ __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
+
+ __m128i sum_ = _mm_add_epi32(
+ _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_si128(b1, 12), _mm_slli_si128(b1, 8)),
+ _mm_add_epi32(_mm_slli_si128(b1, 4), b1)),
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
+ _mm_alignr_epi8(b2, b1, 8)),
+ _mm_alignr_epi8(b2, b1, 12)));
+ __m128i sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_si128(a1, 12), _mm_slli_si128(a1, 8)),
+ _mm_add_epi32(_mm_slli_si128(a1, 4), a1)),
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
+ _mm_alignr_epi8(a2, a1, 8)),
+ _mm_alignr_epi8(a2, a1, 12)));
+
+ __m128i n = _mm_set_epi32(7 * h, 6 * h, 5 * h, 4 * h);
+ __m128i one_over_n =
+ _mm_set_epi32(one_by_x[7 * h - 1], one_by_x[6 * h - 1],
+ one_by_x[5 * h - 1], one_by_x[4 * h - 1]);
+ __m128i s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][7 * h - 1], sgrproj_mtable[eps - 1][6 * h - 1],
+ sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A,
+ B);
+
+ // Re-align a1 and b1 so that they start at index i * buf_stride + 1
+ a2 = _mm_alignr_epi8(a2, a1, 4);
+ b2 = _mm_alignr_epi8(b2, b1, 4);
+
+ n = _mm_set1_epi32(7 * h);
+ one_over_n = _mm_set1_epi32(one_by_x[7 * h - 1]);
+ s = _mm_set1_epi32(sgrproj_mtable[eps - 1][7 * h - 1]);
+
+ for (j = 4; j < width_extend - 4; j += 4) {
+ a1 = a2;
+ a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]);
+ b1 = b2;
+ b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]);
+ __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 5]);
+ __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 5]);
+ /* Loop invariant: At this point,
+ a1 = original A[i * buf_stride + j - 3 : i * buf_stride + j + 1]
+ a2 = original A[i * buf_stride + j + 1 : i * buf_stride + j + 5]
+ a3 = original A[i * buf_stride + j + 5 : i * buf_stride + j + 9]
+ and similar for b1,b2,b3 and B
+ */
+ sum_ = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)),
+ _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8),
+ _mm_alignr_epi8(b2, b1, 12))),
+ _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(b3, b2, 4)),
+ _mm_alignr_epi8(b3, b2, 8)));
+ sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)),
+ _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8),
+ _mm_alignr_epi8(a2, a1, 12))),
+ _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(a3, a2, 4)),
+ _mm_alignr_epi8(a3, a2, 8)));
+
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+ __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]);
+ __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]);
+
+ j = width - 4;
+ switch (width % 4) {
+ case 0:
+ a1 = a2;
+ b1 = b2;
+ a2 = a3;
+ b2 = b3;
+ break;
+ case 1:
+ a1 = _mm_alignr_epi8(a2, a1, 4);
+ b1 = _mm_alignr_epi8(b2, b1, 4);
+ a2 = _mm_alignr_epi8(a3, a2, 4);
+ b2 = _mm_alignr_epi8(b3, b2, 4);
+ break;
+ case 2:
+ a1 = _mm_alignr_epi8(a2, a1, 8);
+ b1 = _mm_alignr_epi8(b2, b1, 8);
+ a2 = _mm_alignr_epi8(a3, a2, 8);
+ b2 = _mm_alignr_epi8(b3, b2, 8);
+ break;
+ case 3:
+ a1 = _mm_alignr_epi8(a2, a1, 12);
+ b1 = _mm_alignr_epi8(b2, b1, 12);
+ a2 = _mm_alignr_epi8(a3, a2, 12);
+ b2 = _mm_alignr_epi8(b3, b2, 12);
+ break;
+ }
+
+ // Zero out the data loaded from "off the edge" of the array
+ __m128i zero = _mm_setzero_si128();
+ a2 = _mm_blend_epi16(a2, zero, 0xc0);
+ b2 = _mm_blend_epi16(b2, zero, 0xc0);
+
+ sum_ = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)),
+ _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8),
+ _mm_alignr_epi8(b2, b1, 12))),
+ _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(zero, b2, 4)),
+ _mm_alignr_epi8(zero, b2, 8)));
+ sum_sq_ = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)),
+ _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8),
+ _mm_alignr_epi8(a2, a1, 12))),
+ _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(zero, a2, 4)),
+ _mm_alignr_epi8(zero, a2, 8)));
+
+ n = _mm_set_epi32(4 * h, 5 * h, 6 * h, 7 * h);
+ one_over_n = _mm_set_epi32(one_by_x[4 * h - 1], one_by_x[5 * h - 1],
+ one_by_x[6 * h - 1], one_by_x[7 * h - 1]);
+ s = _mm_set_epi32(
+ sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1],
+ sgrproj_mtable[eps - 1][6 * h - 1], sgrproj_mtable[eps - 1][7 * h - 1]);
+ calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
+ A, B);
+ }
+}
+
+void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
+ int stride, int32_t *dst, int dst_stride,
+ int r, int eps, int32_t *tmpbuf) {
+ int32_t *A = tmpbuf;
+ int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
+ int i, j;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes for efficiency.
+ int buf_stride = ((width + 3) & ~3) + 16;
+
+ // Don't filter tiles with dimensions < 5 on any axis
+ if ((width < 5) || (height < 5)) return;
+
+ if (r == 1) {
+ selfguided_restoration_1_v(dgd, width, height, stride, A, B, buf_stride);
+ selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, 8);
+ } else if (r == 2) {
+ selfguided_restoration_2_v(dgd, width, height, stride, A, B, buf_stride);
+ selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, 8);
+ } else if (r == 3) {
+ selfguided_restoration_3_v(dgd, width, height, stride, A, B, buf_stride);
+ selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, 8);
+ } else {
+ assert(0);
+ }
+
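+  // Final pass: smooth A and B with a 3x3 kernel and form the output.
+  // Corner and edge pixels use clamped windows whose weights sum to 8
+  // (nb = 3); interior pixels weight the centre and 4-neighbours by 4 and
+  // the diagonals by 3, summing to 32 (nb = 5), with the middle rows
+  // vectorized below.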
+ {
+ i = 0;
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
+ A[k + buf_stride + 1];
+ const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] +
+ B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
+ A[k + buf_stride - 1] + A[k + buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
+ B[k + buf_stride - 1] + B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
+ A[k + buf_stride - 1];
+ const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] +
+ B[k + buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+ for (i = 1; i < height - 1; ++i) {
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
+ A[k + 1] + A[k - buf_stride + 1] +
+ A[k + buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
+ B[k + 1] + B[k - buf_stride + 1] +
+ B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+
+ // Vectorize the innermost loop
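+    // a0/b0 gather the centre and its 4-neighbours, a1/b1 the diagonals;
+    // ((a0 + a1) << 2) - a1 then yields 4 * a0 + 3 * a1, matching the
+    // scalar weights in the edge-handling loop below.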
+ for (j = 1; j < width - 1; j += 4) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+
+ __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]);
+ __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]);
+ __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]);
+ __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]);
+ __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]);
+ __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]);
+
+ __m128i a0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2),
+ _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8),
+ _mm_alignr_epi8(tmp5, tmp4, 4))),
+ _mm_alignr_epi8(tmp1, tmp0, 4));
+ __m128i a1 = _mm_add_epi32(_mm_add_epi32(tmp0, tmp4),
+ _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8),
+ _mm_alignr_epi8(tmp5, tmp4, 8)));
+ __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1);
+
+ __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]);
+ __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]);
+ __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]);
+ __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]);
+ __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]);
+ __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]);
+
+ __m128i b0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8),
+ _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8),
+ _mm_alignr_epi8(tmp11, tmp10, 4))),
+ _mm_alignr_epi8(tmp7, tmp6, 4));
+ __m128i b1 =
+ _mm_add_epi32(_mm_add_epi32(tmp6, tmp10),
+ _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8),
+ _mm_alignr_epi8(tmp11, tmp10, 8)));
+ __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1);
+
+ __m128i src = _mm_cvtepu8_epi32(_mm_loadu_si128((__m128i *)&dgd[l]));
+
+ __m128i rounding = _mm_set1_epi32(
+ (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1);
+ __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
+ SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ _mm_storeu_si128((__m128i *)&dst[m], w);
+ }
+
+ // Deal with any extra pixels at the right-hand edge of the frame
+    // (typically 2 such pixels, but there may be anywhere between 0 and 3)
+ for (; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+ const int32_t a =
+ (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
+ 4 +
+ (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+ A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+ 3;
+ const int32_t b =
+ (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
+ 4 +
+ (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+ B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+ 3;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
+ A[k - 1] + A[k - buf_stride - 1] +
+ A[k + buf_stride - 1];
+ const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
+ B[k - 1] + B[k - buf_stride - 1] +
+ B[k + buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+
+ {
+ i = height - 1;
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
+ A[k - buf_stride + 1];
+ const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] +
+ B[k - buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
+ A[k - buf_stride - 1] + A[k - buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
+ B[k - buf_stride - 1] + B[k - buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
+ A[k - buf_stride - 1];
+ const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] +
+ B[k - buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+}
+
+void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride,
+ int32_t *dst, int dst_stride, int corner,
+ int edge) {
+ int i, j;
+ const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
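+  // center is chosen so that the nine taps sum to (1 << SGRPROJ_RST_BITS):
+  // center + 4 * edge + 4 * corner. The filter therefore has unit DC gain
+  // at SGRPROJ_RST_BITS bits of fractional precision.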
+
+ {
+ i = 0;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k + stride + 1] +
+ dgd[k - 1] + dgd[k + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
+ }
+ }
+ {
+ i = height - 1;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k - stride - 1] + dgd[k - stride + 1] +
+ dgd[k - 1] + dgd[k + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
+ }
+ }
+ __m128i center_ = _mm_set1_epi16(center);
+ __m128i edge_ = _mm_set1_epi16(edge);
+ __m128i corner_ = _mm_set1_epi16(corner);
+ for (i = 1; i < height - 1; ++i) {
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+    // Process 8 pixels at a time.
+ for (j = 1; j < width - 8; j += 8) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+
+ __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]);
+ __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]);
+ __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]);
+
+ __m128i tl = _mm_cvtepu8_epi16(a);
+ __m128i tr = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
+ __m128i cl = _mm_cvtepu8_epi16(b);
+ __m128i cr = _mm_cvtepu8_epi16(_mm_srli_si128(b, 8));
+ __m128i bl = _mm_cvtepu8_epi16(c);
+ __m128i br = _mm_cvtepu8_epi16(_mm_srli_si128(c, 8));
+
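+      // x holds the centre pixels, y the four edge-connected neighbours
+      // and z the four diagonals, each formed by realigning the
+      // left/right halves of the three rows loaded above.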
+ __m128i x = _mm_alignr_epi8(cr, cl, 2);
+ __m128i y = _mm_add_epi16(_mm_add_epi16(_mm_alignr_epi8(tr, tl, 2), cl),
+ _mm_add_epi16(_mm_alignr_epi8(br, bl, 2),
+ _mm_alignr_epi8(cr, cl, 4)));
+ __m128i z = _mm_add_epi16(_mm_add_epi16(tl, bl),
+ _mm_add_epi16(_mm_alignr_epi8(tr, tl, 4),
+ _mm_alignr_epi8(br, bl, 4)));
+
+ __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_),
+ _mm_add_epi16(_mm_mullo_epi16(y, edge_),
+ _mm_mullo_epi16(z, corner_)));
+
+ _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res));
+ _mm_storeu_si128((__m128i *)&dst[l + 4],
+ _mm_cvtepi16_epi32(_mm_srli_si128(res, 8)));
+ }
+ // If there are enough pixels left in this row, do another batch of 4
+ // pixels.
+ for (; j < width - 4; j += 4) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+
+ __m128i a = _mm_loadl_epi64((__m128i *)&dgd[k - stride - 1]);
+ __m128i b = _mm_loadl_epi64((__m128i *)&dgd[k - 1]);
+ __m128i c = _mm_loadl_epi64((__m128i *)&dgd[k + stride - 1]);
+
+ __m128i tl = _mm_cvtepu8_epi16(a);
+ __m128i cl = _mm_cvtepu8_epi16(b);
+ __m128i bl = _mm_cvtepu8_epi16(c);
+
+ __m128i x = _mm_srli_si128(cl, 2);
+ __m128i y = _mm_add_epi16(
+ _mm_add_epi16(_mm_srli_si128(tl, 2), cl),
+ _mm_add_epi16(_mm_srli_si128(bl, 2), _mm_srli_si128(cl, 4)));
+ __m128i z = _mm_add_epi16(
+ _mm_add_epi16(tl, bl),
+ _mm_add_epi16(_mm_srli_si128(tl, 4), _mm_srli_si128(bl, 4)));
+
+ __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_),
+ _mm_add_epi16(_mm_mullo_epi16(y, edge_),
+ _mm_mullo_epi16(z, corner_)));
+
+ _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res));
+ }
+ // Handle any leftover pixels
+ for (; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride + 1] + dgd[k + stride + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+ }
+}
+
+void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
+ int stride, int eps, int *xqd,
+ uint8_t *dst, int dst_stride,
+ int32_t *tmpbuf) {
+ int xq[2];
+ int32_t *flt1 = tmpbuf;
+ int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+ int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
+ int i, j;
+ assert(width * height <= RESTORATION_TILEPELS_MAX);
+#if USE_HIGHPASS_IN_SGRPROJ
+ av1_highpass_filter_sse4_1(dat, width, height, stride, flt1, width,
+ sgr_params[eps].corner, sgr_params[eps].edge);
+#else
+ av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
+ sgr_params[eps].r1, sgr_params[eps].e1,
+ tmpbuf2);
+#endif // USE_HIGHPASS_IN_SGRPROJ
+ av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
+ sgr_params[eps].r2, sgr_params[eps].e2,
+ tmpbuf2);
+ decode_xq(xqd, xq);
+
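+  // Per-pixel projection: with u = dat << SGRPROJ_RST_BITS, compute
+  //   v = xq[0] * (flt1 - u) + xq[1] * (flt2 - u) + (u << SGRPROJ_PRJ_BITS)
+  // and then round v down by (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS).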
+ __m128i xq0 = _mm_set1_epi32(xq[0]);
+ __m128i xq1 = _mm_set1_epi32(xq[1]);
+ for (i = 0; i < height; ++i) {
+ // Calculate output in batches of 8 pixels
+    for (j = 0; j < width - 7; j += 8) {
+ const int k = i * width + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ __m128i src =
+ _mm_slli_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&dat[l])),
+ SGRPROJ_RST_BITS);
+
+ const __m128i u_0 = _mm_cvtepu16_epi32(src);
+ const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8));
+
+ const __m128i f1_0 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0);
+ const __m128i f2_0 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0);
+ const __m128i f1_1 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1);
+ const __m128i f2_1 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1);
+
+ const __m128i v_0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)),
+ _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS));
+ const __m128i v_1 = _mm_add_epi32(
+ _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)),
+ _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS));
+
+ const __m128i rounding =
+ _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1);
+ const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ const __m128i tmp = _mm_packs_epi32(w_0, w_1);
+ const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
+ _mm_storel_epi64((__m128i *)&dst[m], res);
+ }
+ // Process leftover pixels
+ for (; j < width; ++j) {
+ const int k = i * width + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
+ const int32_t f1 = (int32_t)flt1[k] - u;
+ const int32_t f2 = (int32_t)flt2[k] - u;
+ const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+ const int16_t w =
+ (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+      dst[m] = clip_pixel(w);
+ }
+ }
+}
+
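+/* Hypothetical usage sketch (illustration only, not an aom API guarantee):
+   the tmpbuf sizing below mirrors the flt1/flt2/tmpbuf2 layout used inside
+   apply_selfguided_restoration_sse4_1 above and is an assumption.
+
+     int32_t tmpbuf[2 * RESTORATION_TILEPELS_MAX + 2 * SGRPROJ_OUTBUF_SIZE];
+     apply_selfguided_restoration_sse4_1(dat, width, height, stride, eps,
+                                         xqd, dst, dst_stride, tmpbuf);
+*/
+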
+#if CONFIG_HIGHBITDEPTH
+// Only the vertical sums need to be adjusted for highbitdepth: the
+// horizontal passes work on the 32-bit A/B column sums, which are
+// independent of the input bit depth, so they are shared with the
+// 8-bit versions above.
+
+static void highbd_selfguided_restoration_1_v(uint16_t *src, int width,
+ int height, int src_stride,
+ int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp;
+
+ a = _mm_loadl_epi64((__m128i *)&src[j]);
+ b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
+
+ sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b));
+ tmp = _mm_unpacklo_epi16(a, b);
+ sum_sq = _mm_madd_epi16(tmp, tmp);
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 1; i < height - 2; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ y = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j]));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+ }
+}
+
+static void highbd_selfguided_restoration_2_v(uint16_t *src, int width,
+ int height, int src_stride,
+ int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, c, c2, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp;
+
+ a = _mm_loadl_epi64((__m128i *)&src[j]);
+ b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
+ c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]);
+
+ sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c));
+    // Important: We need to widen *before* squaring here: c can be a
+    // 12-bit value, so c^2 may be up to ~2^24 and would overflow a
+    // 16-bit multiply. (_mm_madd_epi16 remains safe for a and b since
+    // it widens its products to 32 bits internally.)
+ c = _mm_cvtepu16_epi32(c);
+ c2 = _mm_mullo_epi32(c, c);
+ tmp = _mm_unpacklo_epi16(a, b);
+ sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2);
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 2; i < height - 3; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ y = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i + 3) * src_stride + j]));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
+ }
+}
+
+static void highbd_selfguided_restoration_3_v(uint16_t *src, int width,
+ int height, int src_stride,
+ int32_t *A, int32_t *B,
+ int buf_stride) {
+ int i, j;
+
+ int width_extend = (width + 3) & ~3;
+ for (j = 0; j < width_extend; j += 4) {
+ __m128i a, b, c, d, x, y, x2, y2;
+ __m128i sum, sum_sq, tmp, tmp2;
+
+ a = _mm_loadl_epi64((__m128i *)&src[j]);
+ b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
+ c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]);
+ d = _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]);
+
+ sum = _mm_cvtepi16_epi32(
+ _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d)));
+ tmp = _mm_unpacklo_epi16(a, b);
+ tmp2 = _mm_unpacklo_epi16(c, d);
+ sum_sq =
+ _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2));
+
+ _mm_store_si128((__m128i *)&B[j], sum);
+ _mm_store_si128((__m128i *)&A[j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[5 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[6 * src_stride + j]));
+ sum = _mm_add_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_add_epi32(sum_sq, x2);
+
+ for (i = 3; i < height - 4; ++i) {
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
+ y = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i + 4) * src_stride + j]));
+
+ sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+
+ x2 = _mm_mullo_epi32(x, x);
+ y2 = _mm_mullo_epi32(y, y);
+
+ sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ }
+ _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
+
+ x = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
+ sum = _mm_sub_epi32(sum, x);
+ x2 = _mm_mullo_epi32(x, x);
+ sum_sq = _mm_sub_epi32(sum_sq, x2);
+
+ _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum);
+ _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq);
+ }
+}
+
+void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
+ int height, int stride,
+ int32_t *dst, int dst_stride,
+ int bit_depth, int r, int eps,
+ int32_t *tmpbuf) {
+ int32_t *A = tmpbuf;
+ int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
+ int i, j;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes for efficiency.
+ int buf_stride = ((width + 3) & ~3) + 16;
+
+ // Don't filter tiles with dimensions < 5 on any axis
+ if ((width < 5) || (height < 5)) return;
+
+ if (r == 1) {
+ highbd_selfguided_restoration_1_v(dgd, width, height, stride, A, B,
+ buf_stride);
+ selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, bit_depth);
+ } else if (r == 2) {
+ highbd_selfguided_restoration_2_v(dgd, width, height, stride, A, B,
+ buf_stride);
+ selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, bit_depth);
+ } else if (r == 3) {
+ highbd_selfguided_restoration_3_v(dgd, width, height, stride, A, B,
+ buf_stride);
+ selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, bit_depth);
+ } else {
+ assert(0);
+ }
+
+ {
+ i = 0;
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
+ A[k + buf_stride + 1];
+ const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] +
+ B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
+ A[k + buf_stride - 1] + A[k + buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
+ B[k + buf_stride - 1] + B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
+ A[k + buf_stride - 1];
+ const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] +
+ B[k + buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+ for (i = 1; i < height - 1; ++i) {
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
+ A[k + 1] + A[k - buf_stride + 1] +
+ A[k + buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
+ B[k + 1] + B[k - buf_stride + 1] +
+ B[k + buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+
+ // Vectorize the innermost loop
+    for (j = 1; j < width - 4; j += 4) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+
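+      // Same cross/diagonal gathering as in the 8-bit path above; only
+      // the source load below widens from 16-bit pixels instead of 8-bit.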
+ __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]);
+ __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]);
+ __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]);
+ __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]);
+ __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]);
+ __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]);
+
+ __m128i a0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2),
+ _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8),
+ _mm_alignr_epi8(tmp5, tmp4, 4))),
+ _mm_alignr_epi8(tmp1, tmp0, 4));
+ __m128i a1 = _mm_add_epi32(_mm_add_epi32(tmp0, tmp4),
+ _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8),
+ _mm_alignr_epi8(tmp5, tmp4, 8)));
+ __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1);
+
+ __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]);
+ __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]);
+ __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]);
+ __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]);
+ __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]);
+ __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]);
+
+ __m128i b0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8),
+ _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8),
+ _mm_alignr_epi8(tmp11, tmp10, 4))),
+ _mm_alignr_epi8(tmp7, tmp6, 4));
+ __m128i b1 =
+ _mm_add_epi32(_mm_add_epi32(tmp6, tmp10),
+ _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8),
+ _mm_alignr_epi8(tmp11, tmp10, 8)));
+ __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1);
+
+ __m128i src = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)&dgd[l]));
+
+ __m128i rounding = _mm_set1_epi32(
+ (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1);
+ __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
+ SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ _mm_storeu_si128((__m128i *)&dst[m], w);
+ }
+
+    // Deal with any extra pixels at the right-hand edge of the frame
+    // (typically there are 2 such pixels, but there may be anywhere
+    // between 0 and 3).
+ for (; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+ const int32_t a =
+ (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
+ 4 +
+ (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+ A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+ 3;
+ const int32_t b =
+ (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
+ 4 +
+ (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+ B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+ 3;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
+ A[k - 1] + A[k - buf_stride - 1] +
+ A[k + buf_stride - 1];
+ const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
+ B[k - 1] + B[k - buf_stride - 1] +
+ B[k + buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+
+ {
+ i = height - 1;
+ j = 0;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
+ A[k - buf_stride + 1];
+ const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] +
+ B[k - buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
+ A[k - buf_stride - 1] + A[k - buf_stride + 1];
+ const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
+ B[k - buf_stride - 1] + B[k - buf_stride + 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ j = width - 1;
+ {
+ const int k = i * buf_stride + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 3;
+ const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
+ A[k - buf_stride - 1];
+ const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] +
+ B[k - buf_stride - 1];
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+}
+
+void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height,
+ int stride, int32_t *dst, int dst_stride,
+ int corner, int edge) {
+ int i, j;
+ const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
+
+ {
+ i = 0;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k + stride + 1] +
+ dgd[k - 1] + dgd[k + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
+ }
+ }
+ __m128i center_ = _mm_set1_epi32(center);
+ __m128i edge_ = _mm_set1_epi32(edge);
+ __m128i corner_ = _mm_set1_epi32(corner);
+ for (i = 1; i < height - 1; ++i) {
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+ // Process 4 pixels at a time
+ for (j = 1; j < width - 4; j += 4) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+
+ __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]);
+ __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]);
+ __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]);
+
+ __m128i tl = _mm_cvtepu16_epi32(a);
+ __m128i tr = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
+ __m128i cl = _mm_cvtepu16_epi32(b);
+ __m128i cr = _mm_cvtepu16_epi32(_mm_srli_si128(b, 8));
+ __m128i bl = _mm_cvtepu16_epi32(c);
+ __m128i br = _mm_cvtepu16_epi32(_mm_srli_si128(c, 8));
+
+ __m128i x = _mm_alignr_epi8(cr, cl, 4);
+ __m128i y = _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tr, tl, 4), cl),
+ _mm_add_epi32(_mm_alignr_epi8(br, bl, 4),
+ _mm_alignr_epi8(cr, cl, 8)));
+ __m128i z = _mm_add_epi32(_mm_add_epi32(tl, bl),
+ _mm_add_epi32(_mm_alignr_epi8(tr, tl, 8),
+ _mm_alignr_epi8(br, bl, 8)));
+
+ __m128i res = _mm_add_epi32(_mm_mullo_epi32(x, center_),
+ _mm_add_epi32(_mm_mullo_epi32(y, edge_),
+ _mm_mullo_epi32(z, corner_)));
+
+ _mm_storeu_si128((__m128i *)&dst[l], res);
+ }
+ // Handle any leftover pixels
+ for (; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride + 1] + dgd[k + stride + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] +
+ edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
+ corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
+ dgd[k - stride] + dgd[k + stride]);
+ }
+ }
+ {
+ i = height - 1;
+ j = 0;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
+ }
+ for (j = 1; j < width - 1; ++j) {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] = center * dgd[k] +
+ edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
+ corner * (dgd[k - stride - 1] + dgd[k - stride + 1] +
+ dgd[k - 1] + dgd[k + 1]);
+ }
+ j = width - 1;
+ {
+ const int k = i * stride + j;
+ const int l = i * dst_stride + j;
+ dst[l] =
+ center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
+ corner *
+ (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
+ }
+ }
+}
+
+void apply_selfguided_restoration_highbd_sse4_1(
+ uint16_t *dat, int width, int height, int stride, int bit_depth, int eps,
+ int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf) {
+ int xq[2];
+ int32_t *flt1 = tmpbuf;
+ int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+ int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
+ int i, j;
+ assert(width * height <= RESTORATION_TILEPELS_MAX);
+#if USE_HIGHPASS_IN_SGRPROJ
+ av1_highpass_filter_highbd_sse4_1(dat, width, height, stride, flt1, width,
+ sgr_params[eps].corner,
+ sgr_params[eps].edge);
+#else
+ av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1,
+ width, bit_depth, sgr_params[eps].r1,
+ sgr_params[eps].e1, tmpbuf2);
+#endif // USE_HIGHPASS_IN_SGRPROJ
+ av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2,
+ width, bit_depth, sgr_params[eps].r2,
+ sgr_params[eps].e2, tmpbuf2);
+ decode_xq(xqd, xq);
+
+ __m128i xq0 = _mm_set1_epi32(xq[0]);
+ __m128i xq1 = _mm_set1_epi32(xq[1]);
+ for (i = 0; i < height; ++i) {
+ // Calculate output in batches of 8 pixels
+    for (j = 0; j < width - 7; j += 8) {
+ const int k = i * width + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+      // dat comes with an arbitrary stride, so 16-byte alignment is not
+      // guaranteed; use an unaligned load.
+      __m128i src = _mm_slli_epi16(_mm_loadu_si128((__m128i *)&dat[l]),
+                                   SGRPROJ_RST_BITS);
+
+ const __m128i u_0 = _mm_cvtepu16_epi32(src);
+ const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8));
+
+ const __m128i f1_0 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0);
+ const __m128i f2_0 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0);
+ const __m128i f1_1 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1);
+ const __m128i f2_1 =
+ _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1);
+
+ const __m128i v_0 = _mm_add_epi32(
+ _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)),
+ _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS));
+ const __m128i v_1 = _mm_add_epi32(
+ _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)),
+ _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS));
+
+ const __m128i rounding =
+ _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1);
+ const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ // Pack into 16 bits and clamp to [0, 2^bit_depth)
+ const __m128i tmp = _mm_packus_epi32(w_0, w_1);
+ const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+ const __m128i res = _mm_min_epi16(tmp, max);
+
+      // As above, dst_stride need not preserve 16-byte alignment, so use
+      // an unaligned store.
+      _mm_storeu_si128((__m128i *)&dst[m], res);
+ }
+ // Process leftover pixels
+ for (; j < width; ++j) {
+ const int k = i * width + j;
+ const int l = i * stride + j;
+ const int m = i * dst_stride + j;
+ const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
+ const int32_t f1 = (int32_t)flt1[k] - u;
+ const int32_t f2 = (int32_t)flt2[k] - u;
+ const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+ const int16_t w =
+ (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
+ }
+ }
+}
+
+#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c
new file mode 100644
index 000000000..925e4650d
--- /dev/null
+++ b/third_party/aom/av1/common/x86/warp_plane_sse2.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+
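+// warped_filter is a table of 8-tap filters with eight 16-bit coefficients
+// per entry, so each entry occupies exactly one 128-bit register and can be
+// indexed directly through this pointer.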
+static const __m128i *const filter = (const __m128i *const)warped_filter;
+
+/* SSE2 version of the rotzoom/affine warp filter */
+void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
+ int stride, uint8_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int ref_frm,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+ __m128i tmp[15];
+ int i, j, k;
+
+  /* Note: For this code to work, the left/right frame borders need to be
+     extended by at least 13 pixels each. By the time we get here, other
+     code will have set up this border, but the commented-out check below
+     can be re-enabled for debugging purposes.
+  */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ // (x, y) coordinates of the center of this block in the destination
+ // image
+ int32_t dst_x = p_col + j + 4;
+ int32_t dst_y = p_row + i + 4;
+
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];
+
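+      // Split the warped source position into its integer pixel part
+      // (ix4, iy4) and its subpel fraction (sx4, sy4).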
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Horizontal filter
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+        // If the block is aligned such that, after clamping, every sample
+        // would be taken from the leftmost/rightmost column, then we can
+        // skip the expensive horizontal filter: the taps sum to
+        // (1 << WARPEDPIXEL_FILTER_BITS), so the result is just that one
+        // pixel scaled by the tap sum, pre-shifted down by
+        // HORSHEAR_REDUCE_PREC_BITS as in the general case below.
+ if (ix4 <= -7) {
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ } else if (ix4 >= width + 6) {
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ } else {
+ int sx = sx4 + alpha * (-4) + beta * k +
+ // Include rounding and offset here
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
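+          // The WARPEDPIXEL_PREC_SHIFTS term recentres sx so that
+          // (sx + n * alpha) >> WARPEDDIFF_PREC_BITS stays a non-negative
+          // index into the (3 * WARPEDPIXEL_PREC_SHIFTS + 1)-entry table.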
+
+ // Load source pixels
+ __m128i zero = _mm_setzero_si128();
+ __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+ // Filter even-index pixels
+ __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
+ __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
+ __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
+ __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
+ __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
+ __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
+ __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
+ __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
+ __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ __m128i round_const =
+ _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+
+ // Calculate filtered results
+ __m128i src_0 = _mm_unpacklo_epi8(src, zero);
+ __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
+ __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
+ __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
+ __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
+
+ // Filter odd-index pixels
+ __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
+ __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
+ __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
+ __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
+ __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
+
+          // Combine results into one register.
+          // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7,
+          // so that the vertical filter below can separate even and odd
+          // columns with a single unpacklo/unpackhi pair per row.
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k +
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ __m128i *src = tmp + (k + 4);
+ __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ // Filter even-index pixels
+ __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ // Round and pack into 8 bits
+ __m128i round_const =
+ _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+
+ __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+ __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ // Store, blending with 'pred' if needed
+ __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ if (ref_frm) {
+ const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
+ res_8bit = _mm_avg_epu8(res_8bit, orig);
+ }
+ *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
+ } else {
+ if (ref_frm) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/zigzag.h b/third_party/aom/av1/common/zigzag.h
new file mode 100644
index 000000000..c58b18b57
--- /dev/null
+++ b/third_party/aom/av1/common/zigzag.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#if !defined(_zigzag_H)
+# define _zigzag_H (1)
+
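+/* Each OD_ZIGZAGn table lists coordinate pairs for the coefficients of an
+   n x n block that lie outside its upper-left (n/2) x (n/2) quadrant
+   (hence the n*n - (n/2)*(n/2) entry counts for n >= 8); that quadrant is
+   scanned recursively with the next smaller table. The 4x4 tables are the
+   base case and list all 15 AC positions.
+*/
+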
+extern const unsigned char OD_ZIGZAG4_DCT_DCT[15][2];
+extern const unsigned char OD_ZIGZAG4_ADST_DCT[15][2];
+extern const unsigned char OD_ZIGZAG4_DCT_ADST[15][2];
+#define OD_ZIGZAG4_ADST_ADST OD_ZIGZAG4_DCT_DCT
+
+extern const unsigned char OD_ZIGZAG8_DCT_DCT[48][2];
+extern const unsigned char OD_ZIGZAG8_ADST_DCT[48][2];
+extern const unsigned char OD_ZIGZAG8_DCT_ADST[48][2];
+#define OD_ZIGZAG8_ADST_ADST OD_ZIGZAG8_DCT_DCT
+
+extern const unsigned char OD_ZIGZAG16_DCT_DCT[192][2];
+extern const unsigned char OD_ZIGZAG16_ADST_DCT[192][2];
+extern const unsigned char OD_ZIGZAG16_DCT_ADST[192][2];
+#define OD_ZIGZAG16_ADST_ADST OD_ZIGZAG16_DCT_DCT
+
+extern const unsigned char OD_ZIGZAG32_DCT_DCT[768][2];
+#endif
diff --git a/third_party/aom/av1/common/zigzag16.c b/third_party/aom/av1/common/zigzag16.c
new file mode 100644
index 000000000..6df6e3855
--- /dev/null
+++ b/third_party/aom/av1/common/zigzag16.c
@@ -0,0 +1,157 @@
+/* This file is generated by gen_zigzag16.m */
+
+/* clang-format off */
+
+#include "odintrin.h"
+OD_EXTERN const unsigned char OD_ZIGZAG16_DCT_DCT[192][2] = {
+ {8, 0}, {8, 1}, {8, 2}, {9, 0},
+ {8, 3}, {9, 1}, {9, 2}, {10, 0},
+ {9, 3}, {10, 1}, {10, 2}, {11, 0},
+ {10, 3}, {11, 1}, {11, 2}, {11, 3},
+ {12, 0}, {12, 1}, {13, 0}, {12, 2},
+ {12, 3}, {13, 1}, {13, 2}, {14, 0},
+ {13, 3}, {14, 1}, {15, 0}, {14, 2},
+ {14, 3}, {15, 1}, {15, 2}, {15, 3},
+ {0, 8}, {1, 8}, {0, 9}, {2, 8},
+ {1, 9}, {3, 8}, {0, 10}, {2, 9},
+ {1, 10}, {3, 9}, {0, 11}, {2, 10},
+ {1, 11}, {3, 10}, {0, 12}, {2, 11},
+ {1, 12}, {3, 11}, {0, 13}, {2, 12},
+ {1, 13}, {0, 14}, {3, 12}, {2, 13},
+ {1, 14}, {3, 13}, {0, 15}, {2, 14},
+ {1, 15}, {3, 14}, {2, 15}, {3, 15},
+ {4, 8}, {5, 8}, {4, 9}, {8, 4},
+ {8, 5}, {6, 8}, {5, 9}, {4, 10},
+ {9, 4}, {8, 6}, {7, 8}, {9, 5},
+ {5, 10}, {8, 7}, {6, 9}, {4, 11},
+ {10, 4}, {9, 6}, {7, 9}, {8, 8},
+ {10, 5}, {6, 10}, {5, 11}, {9, 7},
+ {8, 9}, {10, 6}, {7, 10}, {4, 12},
+ {11, 4}, {9, 8}, {6, 11}, {10, 7},
+ {11, 5}, {5, 12}, {8, 10}, {7, 11},
+ {9, 9}, {4, 13}, {10, 8}, {11, 6},
+ {11, 7}, {6, 12}, {8, 11}, {9, 10},
+ {12, 4}, {5, 13}, {10, 9}, {12, 5},
+ {7, 12}, {11, 8}, {4, 14}, {6, 13},
+ {10, 10}, {9, 11}, {12, 6}, {13, 4},
+ {11, 9}, {8, 12}, {5, 14}, {12, 7},
+ {7, 13}, {4, 15}, {13, 5}, {10, 11},
+ {11, 10}, {9, 12}, {13, 6}, {12, 8},
+ {6, 14}, {8, 13}, {5, 15}, {13, 7},
+ {14, 4}, {12, 9}, {7, 14}, {11, 11},
+ {10, 12}, {9, 13}, {14, 5}, {6, 15},
+ {13, 8}, {8, 14}, {12, 10}, {14, 6},
+ {7, 15}, {13, 9}, {15, 4}, {10, 13},
+ {11, 12}, {14, 7}, {9, 14}, {12, 11},
+ {8, 15}, {15, 5}, {13, 10}, {14, 8},
+ {11, 13}, {15, 6}, {9, 15}, {10, 14},
+ {14, 9}, {15, 7}, {13, 11}, {12, 12},
+ {10, 15}, {11, 14}, {15, 8}, {14, 10},
+ {12, 13}, {13, 12}, {15, 9}, {11, 15},
+ {14, 11}, {13, 13}, {15, 10}, {12, 14},
+ {13, 14}, {15, 11}, {14, 12}, {12, 15},
+ {14, 13}, {13, 15}, {15, 12}, {14, 14},
+ {15, 13}, {14, 15}, {15, 14}, {15, 15}
+ };
+
+OD_EXTERN const unsigned char OD_ZIGZAG16_ADST_DCT[192][2] = {
+ {8, 0}, {9, 0}, {10, 0}, {8, 1},
+ {11, 0}, {9, 1}, {8, 2}, {12, 0},
+ {10, 1}, {9, 2}, {8, 3}, {13, 0},
+ {11, 1}, {10, 2}, {9, 3}, {14, 0},
+ {12, 1}, {10, 3}, {15, 0}, {11, 2},
+ {13, 1}, {11, 3}, {12, 2}, {14, 1},
+ {12, 3}, {13, 2}, {15, 1}, {13, 3},
+ {14, 2}, {14, 3}, {15, 2}, {15, 3},
+ {0, 8}, {1, 8}, {2, 8}, {0, 9},
+ {3, 8}, {1, 9}, {2, 9}, {0, 10},
+ {3, 9}, {1, 10}, {2, 10}, {0, 11},
+ {3, 10}, {1, 11}, {2, 11}, {0, 12},
+ {3, 11}, {1, 12}, {2, 12}, {0, 13},
+ {3, 12}, {1, 13}, {0, 14}, {2, 13},
+ {0, 15}, {1, 14}, {3, 13}, {2, 14},
+ {1, 15}, {3, 14}, {2, 15}, {3, 15},
+ {8, 4}, {9, 4}, {8, 5}, {4, 8},
+ {10, 4}, {9, 5}, {5, 8}, {8, 6},
+ {4, 9}, {10, 5}, {9, 6}, {6, 8},
+ {8, 7}, {11, 4}, {7, 8}, {5, 9},
+ {9, 7}, {11, 5}, {10, 6}, {4, 10},
+ {6, 9}, {8, 8}, {5, 10}, {7, 9},
+ {12, 4}, {10, 7}, {9, 8}, {11, 6},
+ {8, 9}, {4, 11}, {6, 10}, {7, 10},
+ {12, 5}, {5, 11}, {10, 8}, {11, 7},
+ {9, 9}, {4, 12}, {13, 4}, {8, 10},
+ {6, 11}, {12, 6}, {5, 12}, {10, 9},
+ {7, 11}, {9, 10}, {11, 8}, {13, 5},
+ {8, 11}, {4, 13}, {6, 12}, {10, 10},
+ {12, 7}, {11, 9}, {7, 12}, {14, 4},
+ {5, 13}, {9, 11}, {13, 6}, {8, 12},
+ {4, 14}, {12, 8}, {6, 13}, {11, 10},
+ {10, 11}, {12, 9}, {5, 14}, {13, 7},
+ {14, 5}, {9, 12}, {4, 15}, {7, 13},
+ {8, 13}, {6, 14}, {13, 8}, {11, 11},
+ {10, 12}, {15, 4}, {12, 10}, {14, 6},
+ {13, 9}, {5, 15}, {9, 13}, {7, 14},
+ {15, 5}, {6, 15}, {8, 14}, {14, 7},
+ {11, 12}, {7, 15}, {9, 14}, {13, 10},
+ {10, 13}, {14, 8}, {15, 6}, {14, 9},
+ {12, 11}, {8, 15}, {15, 7}, {10, 14},
+ {11, 13}, {9, 15}, {13, 11}, {12, 12},
+ {15, 8}, {14, 10}, {15, 9}, {10, 15},
+ {11, 14}, {13, 12}, {12, 13}, {15, 10},
+ {14, 11}, {11, 15}, {13, 13}, {15, 11},
+ {14, 12}, {12, 14}, {15, 12}, {13, 14},
+ {12, 15}, {14, 13}, {13, 15}, {15, 13},
+ {14, 14}, {15, 14}, {14, 15}, {15, 15}
+ };
+
+OD_EXTERN const unsigned char OD_ZIGZAG16_DCT_ADST[192][2] = {
+ {8, 0}, {8, 1}, {8, 2}, {8, 3},
+ {9, 0}, {9, 1}, {9, 2}, {9, 3},
+ {10, 0}, {10, 1}, {10, 2}, {10, 3},
+ {11, 0}, {11, 1}, {11, 2}, {11, 3},
+ {12, 0}, {12, 1}, {12, 2}, {12, 3},
+ {13, 0}, {13, 1}, {13, 2}, {13, 3},
+ {14, 0}, {15, 0}, {14, 1}, {14, 2},
+ {14, 3}, {15, 1}, {15, 2}, {15, 3},
+ {0, 8}, {0, 9}, {0, 10}, {1, 8},
+ {0, 11}, {1, 9}, {2, 8}, {0, 12},
+ {1, 10}, {2, 9}, {0, 13}, {1, 11},
+ {3, 8}, {2, 10}, {0, 14}, {1, 12},
+ {3, 9}, {0, 15}, {2, 11}, {3, 10},
+ {1, 13}, {2, 12}, {3, 11}, {1, 14},
+ {2, 13}, {1, 15}, {3, 12}, {2, 14},
+ {3, 13}, {2, 15}, {3, 14}, {3, 15},
+ {4, 8}, {4, 9}, {5, 8}, {4, 10},
+ {5, 9}, {4, 11}, {6, 8}, {5, 10},
+ {8, 4}, {6, 9}, {4, 12}, {5, 11},
+ {8, 5}, {6, 10}, {7, 8}, {8, 6},
+ {4, 13}, {7, 9}, {5, 12}, {8, 7},
+ {9, 4}, {6, 11}, {8, 8}, {7, 10},
+ {5, 13}, {9, 5}, {4, 14}, {9, 6},
+ {8, 9}, {6, 12}, {9, 7}, {7, 11},
+ {4, 15}, {8, 10}, {9, 8}, {5, 14},
+ {10, 4}, {6, 13}, {10, 5}, {9, 9},
+ {7, 12}, {8, 11}, {10, 6}, {5, 15},
+ {10, 7}, {6, 14}, {9, 10}, {7, 13},
+ {8, 12}, {10, 8}, {9, 11}, {6, 15},
+ {11, 4}, {11, 5}, {10, 9}, {8, 13},
+ {7, 14}, {11, 6}, {9, 12}, {11, 7},
+ {10, 10}, {7, 15}, {8, 14}, {12, 4},
+ {11, 8}, {12, 5}, {9, 13}, {10, 11},
+ {8, 15}, {11, 9}, {12, 6}, {12, 7},
+ {10, 12}, {9, 14}, {11, 10}, {13, 4},
+ {12, 8}, {9, 15}, {13, 5}, {11, 11},
+ {12, 9}, {10, 13}, {13, 6}, {13, 7},
+ {12, 10}, {14, 4}, {11, 12}, {13, 8},
+ {10, 14}, {14, 5}, {12, 11}, {13, 9},
+ {14, 6}, {10, 15}, {11, 13}, {15, 4},
+ {14, 7}, {12, 12}, {13, 10}, {14, 8},
+ {15, 5}, {13, 11}, {15, 6}, {11, 14},
+ {14, 9}, {12, 13}, {11, 15}, {15, 7},
+ {14, 10}, {15, 8}, {13, 12}, {12, 14},
+ {15, 9}, {14, 11}, {13, 13}, {12, 15},
+ {15, 10}, {14, 12}, {13, 14}, {15, 11},
+ {13, 15}, {14, 13}, {14, 14}, {15, 12},
+ {14, 15}, {15, 13}, {15, 14}, {15, 15}
+ };
diff --git a/third_party/aom/av1/common/zigzag32.c b/third_party/aom/av1/common/zigzag32.c
new file mode 100644
index 000000000..cb3b9bc63
--- /dev/null
+++ b/third_party/aom/av1/common/zigzag32.c
@@ -0,0 +1,199 @@
+/* This file is generated by gen_zigzag32.m */
+
+/* clang-format off */
+
+#include "odintrin.h"
+OD_EXTERN const unsigned char OD_ZIGZAG32_DCT_DCT[768][2] = {
+ { 16, 0 }, { 17, 0 }, { 18, 0 }, { 19, 0 },
+ { 16, 1 }, { 17, 1 }, { 20, 0 }, { 16, 2 },
+ { 18, 1 }, { 21, 0 }, { 17, 2 }, { 16, 3 },
+ { 19, 1 }, { 22, 0 }, { 18, 2 }, { 17, 3 },
+ { 20, 1 }, { 16, 4 }, { 23, 0 }, { 19, 2 },
+ { 24, 0 }, { 16, 5 }, { 21, 1 }, { 17, 4 },
+ { 18, 3 }, { 20, 2 }, { 17, 5 }, { 16, 6 },
+ { 19, 3 }, { 18, 4 }, { 25, 0 }, { 22, 1 },
+ { 16, 7 }, { 21, 2 }, { 17, 6 }, { 20, 3 },
+ { 26, 0 }, { 18, 5 }, { 19, 4 }, { 17, 7 },
+ { 23, 1 }, { 22, 2 }, { 18, 6 }, { 27, 0 },
+ { 19, 5 }, { 24, 1 }, { 21, 3 }, { 28, 0 },
+ { 20, 4 }, { 18, 7 }, { 19, 6 }, { 23, 2 },
+ { 29, 0 }, { 25, 1 }, { 21, 4 }, { 30, 0 },
+ { 20, 5 }, { 22, 3 }, { 31, 0 }, { 19, 7 },
+ { 24, 2 }, { 26, 1 }, { 20, 6 }, { 21, 5 },
+ { 22, 4 }, { 23, 3 }, { 27, 1 }, { 25, 2 },
+ { 20, 7 }, { 28, 1 }, { 24, 3 }, { 21, 6 },
+ { 22, 5 }, { 23, 4 }, { 26, 2 }, { 21, 7 },
+ { 29, 1 }, { 25, 3 }, { 30, 1 }, { 27, 2 },
+ { 22, 6 }, { 23, 5 }, { 31, 1 }, { 24, 4 },
+ { 26, 3 }, { 28, 2 }, { 22, 7 }, { 23, 6 },
+ { 25, 4 }, { 24, 5 }, { 29, 2 }, { 30, 2 },
+ { 27, 3 }, { 23, 7 }, { 31, 2 }, { 24, 6 },
+ { 26, 4 }, { 25, 5 }, { 28, 3 }, { 24, 7 },
+ { 27, 4 }, { 29, 3 }, { 25, 6 }, { 26, 5 },
+ { 30, 3 }, { 31, 3 }, { 28, 4 }, { 27, 5 },
+ { 25, 7 }, { 29, 4 }, { 26, 6 }, { 28, 5 },
+ { 30, 4 }, { 26, 7 }, { 27, 6 }, { 31, 4 },
+ { 29, 5 }, { 27, 7 }, { 30, 5 }, { 28, 6 },
+ { 31, 5 }, { 29, 6 }, { 28, 7 }, { 30, 6 },
+ { 31, 6 }, { 29, 7 }, { 30, 7 }, { 31, 7 },
+ { 0, 16 }, { 0, 17 }, { 1, 16 }, { 0, 18 },
+ { 1, 17 }, { 0, 19 }, { 2, 16 }, { 1, 18 },
+ { 0, 20 }, { 2, 17 }, { 3, 16 }, { 1, 19 },
+ { 2, 18 }, { 0, 21 }, { 3, 17 }, { 4, 16 },
+ { 1, 20 }, { 2, 19 }, { 0, 22 }, { 3, 18 },
+ { 4, 17 }, { 5, 16 }, { 0, 23 }, { 3, 19 },
+ { 2, 20 }, { 1, 21 }, { 4, 18 }, { 6, 16 },
+ { 5, 17 }, { 3, 20 }, { 2, 21 }, { 1, 22 },
+ { 0, 24 }, { 0, 25 }, { 4, 19 }, { 7, 16 },
+ { 6, 17 }, { 5, 18 }, { 0, 26 }, { 3, 21 },
+ { 2, 22 }, { 1, 23 }, { 4, 20 }, { 5, 19 },
+ { 6, 18 }, { 1, 24 }, { 7, 17 }, { 0, 27 },
+ { 2, 23 }, { 3, 22 }, { 4, 21 }, { 1, 25 },
+ { 5, 20 }, { 7, 18 }, { 0, 28 }, { 6, 19 },
+ { 2, 24 }, { 1, 26 }, { 0, 29 }, { 4, 22 },
+ { 3, 23 }, { 2, 25 }, { 5, 21 }, { 0, 31 },
+ { 7, 19 }, { 6, 20 }, { 0, 30 }, { 1, 27 },
+ { 3, 24 }, { 2, 26 }, { 4, 23 }, { 5, 22 },
+ { 7, 20 }, { 1, 28 }, { 6, 21 }, { 3, 25 },
+ { 2, 27 }, { 1, 29 }, { 4, 24 }, { 2, 28 },
+ { 1, 30 }, { 7, 21 }, { 5, 23 }, { 3, 26 },
+ { 6, 22 }, { 1, 31 }, { 4, 25 }, { 7, 22 },
+ { 3, 27 }, { 2, 29 }, { 2, 30 }, { 5, 24 },
+ { 2, 31 }, { 6, 23 }, { 4, 26 }, { 3, 28 },
+ { 5, 25 }, { 3, 29 }, { 6, 24 }, { 7, 23 },
+ { 3, 30 }, { 4, 27 }, { 3, 31 }, { 5, 26 },
+ { 6, 25 }, { 4, 28 }, { 7, 24 }, { 4, 29 },
+ { 5, 27 }, { 4, 30 }, { 4, 31 }, { 6, 26 },
+ { 5, 28 }, { 7, 25 }, { 6, 27 }, { 5, 29 },
+ { 7, 26 }, { 5, 30 }, { 5, 31 }, { 6, 28 },
+ { 7, 27 }, { 6, 29 }, { 6, 30 }, { 7, 28 },
+ { 6, 31 }, { 7, 29 }, { 7, 30 }, { 7, 31 },
+ { 8, 16 }, { 9, 16 }, { 8, 17 }, { 10, 16 },
+ { 9, 17 }, { 16, 8 }, { 8, 18 }, { 16, 9 },
+ { 10, 17 }, { 11, 16 }, { 17, 8 }, { 9, 18 },
+ { 8, 19 }, { 16, 10 }, { 11, 17 }, { 12, 16 },
+ { 10, 18 }, { 17, 9 }, { 9, 19 }, { 16, 11 },
+ { 8, 20 }, { 18, 8 }, { 17, 10 }, { 10, 19 },
+ { 12, 17 }, { 11, 18 }, { 9, 20 }, { 16, 12 },
+ { 18, 9 }, { 8, 21 }, { 13, 16 }, { 17, 11 },
+ { 19, 8 }, { 18, 10 }, { 13, 17 }, { 16, 13 },
+ { 11, 19 }, { 12, 18 }, { 10, 20 }, { 17, 12 },
+ { 9, 21 }, { 19, 9 }, { 8, 22 }, { 14, 16 },
+ { 18, 11 }, { 11, 20 }, { 10, 21 }, { 20, 8 },
+ { 13, 18 }, { 16, 14 }, { 12, 19 }, { 17, 13 },
+ { 19, 10 }, { 14, 17 }, { 9, 22 }, { 18, 12 },
+ { 8, 23 }, { 17, 14 }, { 20, 9 }, { 15, 16 },
+ { 16, 15 }, { 13, 19 }, { 10, 22 }, { 19, 11 },
+ { 11, 21 }, { 14, 18 }, { 12, 20 }, { 18, 13 },
+ { 20, 10 }, { 21, 8 }, { 15, 17 }, { 9, 23 },
+ { 19, 12 }, { 11, 22 }, { 8, 24 }, { 21, 9 },
+ { 17, 15 }, { 16, 16 }, { 14, 19 }, { 18, 14 },
+ { 12, 21 }, { 13, 20 }, { 20, 11 }, { 10, 23 },
+ { 19, 13 }, { 15, 18 }, { 16, 17 }, { 21, 10 },
+ { 22, 8 }, { 9, 24 }, { 8, 25 }, { 20, 12 },
+ { 15, 19 }, { 11, 23 }, { 17, 16 }, { 18, 15 },
+ { 14, 20 }, { 12, 22 }, { 10, 24 }, { 22, 9 },
+ { 21, 11 }, { 19, 14 }, { 13, 21 }, { 16, 18 },
+ { 9, 25 }, { 17, 17 }, { 8, 26 }, { 20, 13 },
+ { 23, 8 }, { 12, 23 }, { 13, 22 }, { 22, 10 },
+ { 19, 15 }, { 15, 20 }, { 16, 19 }, { 21, 12 },
+ { 11, 24 }, { 14, 21 }, { 8, 27 }, { 18, 16 },
+ { 10, 25 }, { 9, 26 }, { 22, 11 }, { 20, 14 },
+ { 23, 9 }, { 18, 17 }, { 17, 18 }, { 17, 19 },
+ { 19, 16 }, { 21, 13 }, { 10, 26 }, { 12, 24 },
+ { 23, 10 }, { 24, 8 }, { 8, 28 }, { 16, 20 },
+ { 9, 27 }, { 15, 21 }, { 22, 12 }, { 14, 22 },
+ { 13, 23 }, { 20, 15 }, { 11, 25 }, { 24, 9 },
+ { 18, 18 }, { 19, 17 }, { 23, 11 }, { 10, 27 },
+ { 8, 29 }, { 12, 25 }, { 9, 28 }, { 8, 30 },
+ { 21, 14 }, { 13, 24 }, { 11, 26 }, { 25, 8 },
+ { 24, 10 }, { 20, 16 }, { 19, 18 }, { 14, 23 },
+ { 22, 13 }, { 8, 31 }, { 17, 20 }, { 9, 29 },
+ { 23, 12 }, { 15, 22 }, { 25, 9 }, { 11, 27 },
+ { 10, 28 }, { 20, 17 }, { 21, 15 }, { 18, 19 },
+ { 16, 21 }, { 24, 11 }, { 9, 30 }, { 12, 26 },
+ { 10, 29 }, { 22, 14 }, { 14, 24 }, { 9, 31 },
+ { 26, 8 }, { 13, 25 }, { 25, 10 }, { 18, 20 },
+ { 19, 19 }, { 11, 28 }, { 15, 23 }, { 20, 18 },
+ { 10, 30 }, { 12, 27 }, { 17, 21 }, { 23, 13 },
+ { 24, 12 }, { 21, 16 }, { 16, 22 }, { 26, 9 },
+ { 27, 8 }, { 13, 26 }, { 22, 15 }, { 10, 31 },
+ { 14, 25 }, { 12, 28 }, { 25, 11 }, { 21, 17 },
+ { 26, 10 }, { 20, 19 }, { 11, 29 }, { 15, 24 },
+ { 23, 14 }, { 27, 9 }, { 11, 30 }, { 13, 27 },
+ { 19, 20 }, { 24, 13 }, { 28, 8 }, { 11, 31 },
+ { 22, 16 }, { 17, 22 }, { 16, 23 }, { 25, 12 },
+ { 18, 21 }, { 12, 29 }, { 21, 18 }, { 28, 9 },
+ { 27, 10 }, { 26, 11 }, { 29, 8 }, { 14, 26 },
+ { 15, 25 }, { 13, 28 }, { 12, 30 }, { 23, 15 },
+ { 30, 8 }, { 16, 24 }, { 13, 29 }, { 25, 13 },
+ { 24, 14 }, { 20, 20 }, { 31, 8 }, { 12, 31 },
+ { 14, 27 }, { 28, 10 }, { 26, 12 }, { 22, 17 },
+ { 21, 19 }, { 17, 23 }, { 18, 22 }, { 29, 9 },
+ { 27, 11 }, { 19, 21 }, { 27, 12 }, { 30, 9 },
+ { 31, 9 }, { 13, 30 }, { 24, 15 }, { 23, 16 },
+ { 15, 26 }, { 14, 28 }, { 29, 10 }, { 28, 11 },
+ { 26, 13 }, { 17, 24 }, { 13, 31 }, { 25, 14 },
+ { 22, 18 }, { 16, 25 }, { 30, 10 }, { 14, 29 },
+ { 15, 27 }, { 19, 22 }, { 21, 20 }, { 20, 21 },
+ { 27, 13 }, { 29, 11 }, { 18, 23 }, { 23, 17 },
+ { 16, 26 }, { 31, 10 }, { 24, 16 }, { 14, 30 },
+ { 22, 19 }, { 14, 31 }, { 28, 12 }, { 26, 14 },
+ { 30, 11 }, { 15, 28 }, { 25, 15 }, { 17, 25 },
+ { 23, 18 }, { 18, 24 }, { 15, 30 }, { 29, 12 },
+ { 31, 11 }, { 16, 27 }, { 24, 17 }, { 28, 13 },
+ { 19, 23 }, { 15, 29 }, { 25, 16 }, { 17, 26 },
+ { 27, 14 }, { 22, 20 }, { 15, 31 }, { 20, 22 },
+ { 21, 21 }, { 16, 28 }, { 17, 27 }, { 30, 12 },
+ { 26, 15 }, { 19, 24 }, { 18, 25 }, { 23, 19 },
+ { 29, 13 }, { 31, 12 }, { 24, 18 }, { 26, 16 },
+ { 25, 17 }, { 16, 29 }, { 28, 14 }, { 20, 23 },
+ { 18, 26 }, { 21, 22 }, { 19, 25 }, { 22, 21 },
+ { 27, 15 }, { 17, 28 }, { 16, 30 }, { 26, 17 },
+ { 23, 20 }, { 16, 31 }, { 25, 18 }, { 27, 16 },
+ { 20, 24 }, { 24, 19 }, { 31, 13 }, { 30, 13 },
+ { 29, 14 }, { 18, 27 }, { 28, 15 }, { 17, 29 },
+ { 19, 26 }, { 17, 30 }, { 21, 23 }, { 22, 22 },
+ { 30, 14 }, { 20, 25 }, { 23, 21 }, { 17, 31 },
+ { 18, 28 }, { 25, 19 }, { 24, 20 }, { 28, 16 },
+ { 31, 14 }, { 26, 18 }, { 19, 27 }, { 29, 15 },
+ { 27, 17 }, { 30, 15 }, { 21, 24 }, { 22, 23 },
+ { 26, 19 }, { 23, 22 }, { 28, 17 }, { 29, 16 },
+ { 18, 30 }, { 24, 21 }, { 25, 20 }, { 18, 31 },
+ { 18, 29 }, { 20, 26 }, { 19, 28 }, { 27, 18 },
+ { 31, 15 }, { 20, 27 }, { 30, 16 }, { 19, 29 },
+ { 29, 17 }, { 31, 16 }, { 27, 19 }, { 21, 25 },
+ { 28, 18 }, { 26, 20 }, { 22, 24 }, { 25, 21 },
+ { 19, 30 }, { 24, 22 }, { 30, 17 }, { 21, 26 },
+ { 23, 23 }, { 19, 31 }, { 20, 28 }, { 31, 17 },
+ { 28, 19 }, { 27, 20 }, { 21, 27 }, { 29, 18 },
+ { 30, 18 }, { 25, 22 }, { 26, 21 }, { 20, 29 },
+ { 22, 25 }, { 24, 23 }, { 29, 19 }, { 23, 24 },
+ { 20, 31 }, { 20, 30 }, { 28, 20 }, { 21, 28 },
+ { 22, 26 }, { 31, 18 }, { 27, 21 }, { 30, 19 },
+ { 22, 27 }, { 29, 20 }, { 23, 25 }, { 24, 24 },
+ { 26, 22 }, { 21, 29 }, { 25, 23 }, { 31, 19 },
+ { 21, 30 }, { 23, 26 }, { 28, 21 }, { 21, 31 },
+ { 22, 28 }, { 30, 20 }, { 25, 24 }, { 27, 22 },
+ { 29, 21 }, { 26, 23 }, { 24, 25 }, { 31, 20 },
+ { 23, 27 }, { 22, 29 }, { 30, 21 }, { 28, 22 },
+ { 24, 26 }, { 25, 25 }, { 27, 23 }, { 22, 30 },
+ { 23, 28 }, { 22, 31 }, { 26, 24 }, { 31, 21 },
+ { 24, 27 }, { 29, 22 }, { 27, 24 }, { 30, 22 },
+ { 25, 26 }, { 28, 23 }, { 23, 30 }, { 23, 29 },
+ { 24, 28 }, { 25, 27 }, { 31, 22 }, { 23, 31 },
+ { 26, 25 }, { 28, 24 }, { 29, 23 }, { 24, 29 },
+ { 24, 30 }, { 27, 25 }, { 25, 28 }, { 26, 26 },
+ { 30, 23 }, { 26, 27 }, { 31, 23 }, { 28, 25 },
+ { 27, 26 }, { 25, 29 }, { 24, 31 }, { 29, 24 },
+ { 30, 24 }, { 27, 27 }, { 29, 25 }, { 26, 28 },
+ { 31, 24 }, { 25, 30 }, { 25, 31 }, { 28, 26 },
+ { 27, 28 }, { 26, 29 }, { 30, 25 }, { 29, 26 },
+ { 28, 27 }, { 26, 30 }, { 31, 25 }, { 27, 29 },
+ { 26, 31 }, { 30, 26 }, { 28, 28 }, { 31, 26 },
+ { 29, 27 }, { 27, 30 }, { 28, 29 }, { 27, 31 },
+ { 30, 27 }, { 31, 27 }, { 28, 30 }, { 29, 28 },
+ { 30, 28 }, { 29, 29 }, { 30, 29 }, { 31, 28 },
+ { 28, 31 }, { 29, 30 }, { 29, 31 }, { 31, 29 },
+ { 30, 30 }, { 30, 31 }, { 31, 30 }, { 31, 31 }
+};
diff --git a/third_party/aom/av1/common/zigzag4.c b/third_party/aom/av1/common/zigzag4.c
new file mode 100644
index 000000000..1fb5a320b
--- /dev/null
+++ b/third_party/aom/av1/common/zigzag4.c
@@ -0,0 +1,22 @@
+/* This file is generated by gen_zigzag4.m */
+
+/* clang-format off */
+
+#include "odintrin.h"
+OD_EXTERN const unsigned char OD_ZIGZAG4_DCT_DCT[15][2] = {
+ {0, 1}, {1, 0}, {1, 1}, {0, 2},
+ {2, 0}, {0, 3}, {1, 2}, {3, 0},
+ {2, 1}, {1, 3}, {2, 2}, {3, 1},
+ {2, 3}, {3, 2}, {3, 3} };
+
+OD_EXTERN const unsigned char OD_ZIGZAG4_ADST_DCT[15][2] = {
+ {1, 0}, {0, 1}, {2, 0}, {1, 1},
+ {3, 0}, {2, 1}, {0, 2}, {1, 2},
+ {3, 1}, {0, 3}, {2, 2}, {1, 3},
+ {3, 2}, {2, 3}, {3, 3} };
+
+OD_EXTERN const unsigned char OD_ZIGZAG4_DCT_ADST[15][2] = {
+ {0, 1}, {0, 2}, {1, 0}, {0, 3},
+ {1, 1}, {1, 2}, {2, 0}, {1, 3},
+ {2, 1}, {2, 2}, {3, 0}, {3, 1},
+ {2, 3}, {3, 2}, {3, 3} };
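
Each pair in these generated tables is a block coordinate: entry i of a table gives the position of the i-th AC coefficient in PVQ coding order, with the DC coefficient at (0, 0) coded separately, which is why a 4x4 block has only 15 entries. Below is a minimal sketch of derasterizing with the table above; the helper name is hypothetical, and the (column, row) reading of each pair is an assumption (the gen_zigzag4.m generator fixes the actual convention):

    /* Sketch only, not part of the import: gather the 15 AC coefficients of
       a 4x4 DCT_DCT block into coding order. */
    static void sketch_raster_to_coding4(int16_t out[15], const int16_t *in,
                                         int stride) {
      int i;
      for (i = 0; i < 15; i++) {
        const int x = OD_ZIGZAG4_DCT_DCT[i][0]; /* assumed column */
        const int y = OD_ZIGZAG4_DCT_DCT[i][1]; /* assumed row */
        out[i] = in[y * stride + x];
      }
    }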
diff --git a/third_party/aom/av1/common/zigzag8.c b/third_party/aom/av1/common/zigzag8.c
new file mode 100644
index 000000000..3f11e0c03
--- /dev/null
+++ b/third_party/aom/av1/common/zigzag8.c
@@ -0,0 +1,50 @@
+/* This file is generated by gen_zigzag8.m */
+
+/* clang-format off */
+
+#include "odintrin.h"
+
+OD_EXTERN const unsigned char OD_ZIGZAG8_DCT_DCT[48][2] = {
+ {4, 0}, {4, 1}, {5, 0}, {5, 1},
+ {6, 0}, {7, 0}, {6, 1}, {7, 1},
+ {0, 4}, {1, 4}, {0, 5}, {1, 5},
+ {0, 6}, {1, 6}, {0, 7}, {1, 7},
+ {2, 4}, {4, 2}, {3, 4}, {2, 5},
+ {4, 3}, {5, 2}, {4, 4}, {3, 5},
+ {5, 3}, {2, 6}, {4, 5}, {6, 2},
+ {5, 4}, {3, 6}, {2, 7}, {6, 3},
+ {5, 5}, {7, 2}, {4, 6}, {3, 7},
+ {6, 4}, {7, 3}, {4, 7}, {5, 6},
+ {6, 5}, {7, 4}, {5, 7}, {6, 6},
+ {7, 5}, {6, 7}, {7, 6}, {7, 7}
+ };
+
+OD_EXTERN const unsigned char OD_ZIGZAG8_ADST_DCT[48][2] = {
+ {4, 0}, {5, 0}, {4, 1}, {6, 0},
+ {5, 1}, {7, 0}, {6, 1}, {7, 1},
+ {0, 4}, {1, 4}, {0, 5}, {1, 5},
+ {0, 6}, {1, 6}, {0, 7}, {1, 7},
+ {4, 2}, {2, 4}, {5, 2}, {4, 3},
+ {3, 4}, {2, 5}, {5, 3}, {4, 4},
+ {6, 2}, {3, 5}, {5, 4}, {2, 6},
+ {4, 5}, {6, 3}, {7, 2}, {3, 6},
+ {2, 7}, {5, 5}, {6, 4}, {4, 6},
+ {7, 3}, {3, 7}, {5, 6}, {6, 5},
+ {4, 7}, {7, 4}, {5, 7}, {7, 5},
+ {6, 6}, {7, 6}, {6, 7}, {7, 7}
+ };
+
+OD_EXTERN const unsigned char OD_ZIGZAG8_DCT_ADST[48][2] = {
+ {4, 0}, {4, 1}, {5, 0}, {5, 1},
+ {6, 0}, {6, 1}, {7, 0}, {7, 1},
+ {0, 4}, {0, 5}, {1, 4}, {0, 6},
+ {1, 5}, {0, 7}, {1, 6}, {1, 7},
+ {2, 4}, {2, 5}, {3, 4}, {4, 2},
+ {2, 6}, {4, 3}, {3, 5}, {4, 4},
+ {2, 7}, {3, 6}, {5, 2}, {4, 5},
+ {5, 3}, {3, 7}, {5, 4}, {4, 6},
+ {6, 2}, {5, 5}, {4, 7}, {6, 3},
+ {6, 4}, {5, 6}, {7, 2}, {6, 5},
+ {7, 3}, {5, 7}, {7, 4}, {6, 6},
+ {7, 5}, {6, 7}, {7, 6}, {7, 7}
+ };
diff --git a/third_party/aom/av1/decoder/accounting.c b/third_party/aom/av1/decoder/accounting.c
new file mode 100644
index 000000000..ba243c9e1
--- /dev/null
+++ b/third_party/aom/av1/decoder/accounting.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "./accounting.h"
+
+static int aom_accounting_hash(const char *str) {
+ uint32_t val;
+ const unsigned char *ustr;
+ val = 0;
+ ustr = (const unsigned char *)str;
+ /* This is about the worst hash one can design, but it should be good enough
+ here. */
+ while (*ustr) val += *ustr++;
+ return val % AOM_ACCOUNTING_HASH_SIZE;
+}
+
+/* Dictionary lookup based on an open-addressing hash table. */
+int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) {
+ int hash;
+ int len;
+ AccountingDictionary *dictionary;
+ dictionary = &accounting->syms.dictionary;
+ hash = aom_accounting_hash(str);
+ while (accounting->hash_dictionary[hash] != -1) {
+ if (strcmp(dictionary->strs[accounting->hash_dictionary[hash]], str) == 0) {
+ return accounting->hash_dictionary[hash];
+ }
+ hash++;
+ if (hash == AOM_ACCOUNTING_HASH_SIZE) hash = 0;
+ }
+ /* No match found. */
+ assert(dictionary->num_strs + 1 < MAX_SYMBOL_TYPES);
+ accounting->hash_dictionary[hash] = dictionary->num_strs;
+ len = strlen(str);
+ dictionary->strs[dictionary->num_strs] = malloc(len + 1);
+ snprintf(dictionary->strs[dictionary->num_strs], len + 1, "%s", str);
+ dictionary->num_strs++;
+ return dictionary->num_strs - 1;
+}
+
+void aom_accounting_init(Accounting *accounting) {
+ int i;
+ accounting->num_syms_allocated = 1000;
+ accounting->syms.syms =
+ malloc(sizeof(AccountingSymbol) * accounting->num_syms_allocated);
+ accounting->syms.dictionary.num_strs = 0;
+ assert(AOM_ACCOUNTING_HASH_SIZE > 2 * MAX_SYMBOL_TYPES);
+ for (i = 0; i < AOM_ACCOUNTING_HASH_SIZE; i++)
+ accounting->hash_dictionary[i] = -1;
+ aom_accounting_reset(accounting);
+}
+
+void aom_accounting_reset(Accounting *accounting) {
+ accounting->syms.num_syms = 0;
+ accounting->syms.num_binary_syms = 0;
+ accounting->syms.num_multi_syms = 0;
+ accounting->context.x = -1;
+ accounting->context.y = -1;
+ accounting->last_tell_frac = 0;
+}
+
+void aom_accounting_clear(Accounting *accounting) {
+ int i;
+ AccountingDictionary *dictionary;
+ free(accounting->syms.syms);
+ dictionary = &accounting->syms.dictionary;
+ for (i = 0; i < dictionary->num_strs; i++) {
+ free(dictionary->strs[i]);
+ }
+}
+
+void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y) {
+ accounting->context.x = x;
+ accounting->context.y = y;
+}
+
+void aom_accounting_record(Accounting *accounting, const char *str,
+ uint32_t bits) {
+ AccountingSymbol sym;
+ // Reuse previous symbol if it has the same context and symbol id.
+ if (accounting->syms.num_syms) {
+ AccountingSymbol *last_sym;
+ last_sym = &accounting->syms.syms[accounting->syms.num_syms - 1];
+ if (memcmp(&last_sym->context, &accounting->context,
+ sizeof(AccountingSymbolContext)) == 0) {
+ uint32_t id;
+ id = aom_accounting_dictionary_lookup(accounting, str);
+ if (id == last_sym->id) {
+ last_sym->bits += bits;
+ last_sym->samples++;
+ return;
+ }
+ }
+ }
+ sym.context = accounting->context;
+ sym.samples = 1;
+ sym.bits = bits;
+ sym.id = aom_accounting_dictionary_lookup(accounting, str);
+ assert(sym.id <= 255);
+ if (accounting->syms.num_syms == accounting->num_syms_allocated) {
+ accounting->num_syms_allocated *= 2;
+ accounting->syms.syms =
+ realloc(accounting->syms.syms,
+ sizeof(AccountingSymbol) * accounting->num_syms_allocated);
+ assert(accounting->syms.syms != NULL);
+ }
+ accounting->syms.syms[accounting->syms.num_syms++] = sym;
+}
+
+void aom_accounting_dump(Accounting *accounting) {
+ int i;
+ AccountingSymbol *sym;
+ printf("\n----- Number of recorded syntax elements = %d -----\n",
+ accounting->syms.num_syms);
+ printf("----- Total number of symbol calls = %d (%d binary) -----\n",
+ accounting->syms.num_multi_syms + accounting->syms.num_binary_syms,
+ accounting->syms.num_binary_syms);
+ for (i = 0; i < accounting->syms.num_syms; i++) {
+ sym = &accounting->syms.syms[i];
+ printf("%s x: %d, y: %d bits: %f samples: %d\n",
+ accounting->syms.dictionary.strs[sym->id], sym->context.x,
+ sym->context.y, (float)sym->bits / 8.0, sym->samples);
+ }
+}
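
Taken together, these functions form a small bit-accounting API: a string-keyed symbol dictionary backed by an open-addressed hash table, a geometrically growing symbol array, and a dump routine that divides by 8.0 because bits are recorded in 1/8-bit units (see AOM_ACCT_BITRES in accounting.h below). A minimal usage sketch, assuming a standalone caller rather than the decoder's normal wiring; the symbol name string is arbitrary:

    Accounting acct;
    aom_accounting_init(&acct);
    /* Tag subsequent symbols with a block position. */
    aom_accounting_set_context(&acct, 0, 0);
    /* Record 3 bits (24 eighth-bits) against a named syntax element;
       consecutive records with the same context and name accumulate. */
    aom_accounting_record(&acct, "example_symbol", 3 << AOM_ACCT_BITRES);
    aom_accounting_dump(&acct); /* prints per-symbol bit totals */
    aom_accounting_clear(&acct);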
diff --git a/third_party/aom/av1/decoder/accounting.h b/third_party/aom/av1/decoder/accounting.h
new file mode 100644
index 000000000..889865b2e
--- /dev/null
+++ b/third_party/aom/av1/decoder/accounting.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_ACCOUNTING_H_
+#define AOM_ACCOUNTING_H_
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#define AOM_ACCOUNTING_HASH_SIZE (1021)
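+/* 1021 is prime; aom_accounting_init() asserts that this value exceeds
+   2 * MAX_SYMBOL_TYPES, so the open-addressed table stays under half full
+   and the linear probe in aom_accounting_dictionary_lookup() always
+   terminates. */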
+
+/* Max number of entries for symbol types in the dictionary (increase as
+ necessary). */
+#define MAX_SYMBOL_TYPES (256)
+
+/* The resolution of fractional-precision bit usage measurements, i.e.,
+   3 => 1/8th bits. */
+#define AOM_ACCT_BITRES (3)
+
+typedef struct {
+ int16_t x;
+ int16_t y;
+} AccountingSymbolContext;
+
+typedef struct {
+ AccountingSymbolContext context;
+ uint32_t id;
+ /** Number of bits in units of 1/8 bit. */
+ uint32_t bits;
+ uint32_t samples;
+} AccountingSymbol;
+
+/** Dictionary for translating strings into ids. */
+typedef struct {
+ char *(strs[MAX_SYMBOL_TYPES]);
+ int num_strs;
+} AccountingDictionary;
+
+typedef struct {
+ /** All recorded symbols decoded. */
+ AccountingSymbol *syms;
+  /** Number of syntax elements actually recorded. */
+ int num_syms;
+ /** Raw symbol decoding calls for non-binary values. */
+ int num_multi_syms;
+ /** Raw binary symbol decoding calls. */
+ int num_binary_syms;
+  /** Dictionary for translating strings into ids. */
+ AccountingDictionary dictionary;
+} AccountingSymbols;
+
+typedef struct Accounting Accounting;
+
+struct Accounting {
+ AccountingSymbols syms;
+ /** Size allocated for symbols (not all may be used). */
+ int num_syms_allocated;
+ int16_t hash_dictionary[AOM_ACCOUNTING_HASH_SIZE];
+ AccountingSymbolContext context;
+ uint32_t last_tell_frac;
+};
+
+void aom_accounting_init(Accounting *accounting);
+void aom_accounting_reset(Accounting *accounting);
+void aom_accounting_clear(Accounting *accounting);
+void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y);
+int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str);
+void aom_accounting_record(Accounting *accounting, const char *str,
+ uint32_t bits);
+void aom_accounting_dump(Accounting *accounting);
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_ACCOUNTING_H_
diff --git a/third_party/aom/av1/decoder/decint.h b/third_party/aom/av1/decoder/decint.h
new file mode 100644
index 000000000..e887ad5e0
--- /dev/null
+++ b/third_party/aom/av1/decoder/decint.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#if !defined(_decint_H)
+# define _decint_H (1)
+# include "av1/common/pvq_state.h"
+# include "aom_dsp/bitreader.h"
+# include "aom_dsp/entdec.h"
+
+typedef struct daala_dec_ctx daala_dec_ctx;
+
+typedef struct daala_dec_ctx od_dec_ctx;
+
+struct daala_dec_ctx {
+ /* Stores context-adaptive CDFs for PVQ. */
+ od_state state;
+ /* AOM entropy decoder. */
+ aom_reader *r;
+ int use_activity_masking;
+  /* Mode of quantization matrix: FLAT (0) or HVS (1) */
+ int qm;
+};
+
+#endif
diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c
new file mode 100644
index 000000000..289d38670
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodeframe.c
@@ -0,0 +1,5159 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdlib.h> // qsort()
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "./aom_scale_rtcd.h"
+#include "./av1_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/binary_codes_reader.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_util/aom_thread.h"
+
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/alloccommon.h"
+#if CONFIG_CDEF
+#include "av1/common/cdef.h"
+#include "av1/common/clpf.h"
+#endif
+#if CONFIG_INSPECTION
+#include "av1/decoder/inspection.h"
+#endif
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/idct.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decodemv.h"
+#include "av1/decoder/decoder.h"
+#if CONFIG_LV_MAP
+#include "av1/decoder/decodetxb.h"
+#endif
+#include "av1/decoder/detokenize.h"
+#include "av1/decoder/dsubexp.h"
+
+#if CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
+#include "av1/common/warped_motion.h"
+#endif // CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
+
+#define MAX_AV1_HEADER_SIZE 80
+#define ACCT_STR __func__
+
+#if CONFIG_PVQ
+#include "av1/common/partition.h"
+#include "av1/common/pvq.h"
+#include "av1/common/scan.h"
+#include "av1/decoder/decint.h"
+#include "av1/decoder/pvq_decoder.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#endif
+
+#if CONFIG_CFL
+#include "av1/common/cfl.h"
+#endif
+
+static struct aom_read_bit_buffer *init_read_bit_buffer(
+ AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+ const uint8_t *data_end, uint8_t clear_data[MAX_AV1_HEADER_SIZE]);
+static int read_compressed_header(AV1Decoder *pbi, const uint8_t *data,
+ size_t partition_size);
+static size_t read_uncompressed_header(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb);
+
+static int is_compound_reference_allowed(const AV1_COMMON *cm) {
+#if CONFIG_LOWDELAY_COMPOUND // Normative in decoder
+ return !frame_is_intra_only(cm);
+#else
+ int i;
+ if (frame_is_intra_only(cm)) return 0;
+ for (i = 1; i < INTER_REFS_PER_FRAME; ++i)
+ if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1;
+
+ return 0;
+#endif
+}
+
+static void setup_compound_reference_mode(AV1_COMMON *cm) {
+#if CONFIG_EXT_REFS
+ cm->comp_fwd_ref[0] = LAST_FRAME;
+ cm->comp_fwd_ref[1] = LAST2_FRAME;
+ cm->comp_fwd_ref[2] = LAST3_FRAME;
+ cm->comp_fwd_ref[3] = GOLDEN_FRAME;
+
+ cm->comp_bwd_ref[0] = BWDREF_FRAME;
+ cm->comp_bwd_ref[1] = ALTREF_FRAME;
+#else
+ if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+ cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
+ cm->comp_fixed_ref = ALTREF_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = GOLDEN_FRAME;
+ } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+ cm->ref_frame_sign_bias[ALTREF_FRAME]) {
+ cm->comp_fixed_ref = GOLDEN_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = ALTREF_FRAME;
+ } else {
+ cm->comp_fixed_ref = LAST_FRAME;
+ cm->comp_var_ref[0] = GOLDEN_FRAME;
+ cm->comp_var_ref[1] = ALTREF_FRAME;
+ }
+#endif // CONFIG_EXT_REFS
+}
+
+static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
+ return len != 0 && len <= (size_t)(end - start);
+}
+
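+// Reads get_unsigned_bits(max) bits and clamps the result, so a corrupt
+// bitstream cannot yield a value greater than max.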
+static int decode_unsigned_max(struct aom_read_bit_buffer *rb, int max) {
+ const int data = aom_rb_read_literal(rb, get_unsigned_bits(max));
+ return data > max ? max : data;
+}
+
+static TX_MODE read_tx_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+ struct aom_read_bit_buffer *rb) {
+ int i, all_lossless = 1;
+#if CONFIG_TX64X64
+ TX_MODE tx_mode;
+#endif
+
+ if (cm->seg.enabled) {
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ if (!xd->lossless[i]) {
+ all_lossless = 0;
+ break;
+ }
+ }
+ } else {
+ all_lossless = xd->lossless[0];
+ }
+
+ if (all_lossless) return ONLY_4X4;
+#if CONFIG_TX64X64
+ tx_mode = aom_rb_read_bit(rb) ? TX_MODE_SELECT : aom_rb_read_literal(rb, 2);
+ if (tx_mode == ALLOW_32X32) tx_mode += aom_rb_read_bit(rb);
+ return tx_mode;
+#else
+ return aom_rb_read_bit(rb) ? TX_MODE_SELECT : aom_rb_read_literal(rb, 2);
+#endif // CONFIG_TX64X64
+}
+
+#if !CONFIG_EC_ADAPT
+static void read_tx_size_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j, k;
+ for (i = 0; i < MAX_TX_DEPTH; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ for (k = 0; k < i + 1; ++k)
+ av1_diff_update_prob(r, &fc->tx_size_probs[i][j][k], ACCT_STR);
+}
+#endif
+
+#if !CONFIG_EC_ADAPT
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j;
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
+ for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
+ av1_diff_update_prob(r, &fc->switchable_interp_prob[j][i], ACCT_STR);
+ }
+}
+#endif
+
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+#if CONFIG_REF_MV
+ int i;
+ for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
+ av1_diff_update_prob(r, &fc->newmv_prob[i], ACCT_STR);
+ for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
+ av1_diff_update_prob(r, &fc->zeromv_prob[i], ACCT_STR);
+ for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
+ av1_diff_update_prob(r, &fc->refmv_prob[i], ACCT_STR);
+ for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
+ av1_diff_update_prob(r, &fc->drl_prob[i], ACCT_STR);
+#else
+#if !CONFIG_EC_ADAPT
+ int i, j;
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+ for (j = 0; j < INTER_MODES - 1; ++j)
+ av1_diff_update_prob(r, &fc->inter_mode_probs[i][j], ACCT_STR);
+ }
+#else
+ (void)fc;
+ (void)r;
+#endif
+#endif
+}
+
+#if CONFIG_EXT_INTER
+static void read_inter_compound_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j;
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (j = 0; j < INTER_MODE_CONTEXTS; ++j) {
+ for (i = 0; i < INTER_COMPOUND_MODES - 1; ++i) {
+ av1_diff_update_prob(r, &fc->inter_compound_mode_probs[j][i], ACCT_STR);
+ }
+ }
+ }
+}
+#endif // CONFIG_EXT_INTER
+#if !CONFIG_EC_ADAPT
+#if !CONFIG_EXT_TX
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j, k;
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ for (j = 0; j < TX_TYPES; ++j) {
+ for (k = 0; k < TX_TYPES - 1; ++k)
+ av1_diff_update_prob(r, &fc->intra_ext_tx_prob[i][j][k], ACCT_STR);
+ }
+ }
+ }
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ for (k = 0; k < TX_TYPES - 1; ++k)
+ av1_diff_update_prob(r, &fc->inter_ext_tx_prob[i][k], ACCT_STR);
+ }
+ }
+}
+#endif
+#endif
+
+static REFERENCE_MODE read_frame_reference_mode(
+ const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ if (is_compound_reference_allowed(cm)) {
+#if CONFIG_REF_ADAPT
+ return aom_rb_read_bit(rb) ? REFERENCE_MODE_SELECT : SINGLE_REFERENCE;
+#else
+ return aom_rb_read_bit(rb)
+ ? REFERENCE_MODE_SELECT
+ : (aom_rb_read_bit(rb) ? COMPOUND_REFERENCE : SINGLE_REFERENCE);
+#endif // CONFIG_REF_ADAPT
+ } else {
+ return SINGLE_REFERENCE;
+ }
+}
+
+static void read_frame_reference_mode_probs(AV1_COMMON *cm, aom_reader *r) {
+ FRAME_CONTEXT *const fc = cm->fc;
+ int i, j;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
+ for (i = 0; i < COMP_INTER_CONTEXTS; ++i)
+ av1_diff_update_prob(r, &fc->comp_inter_prob[i], ACCT_STR);
+
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < (SINGLE_REFS - 1); ++j) {
+ av1_diff_update_prob(r, &fc->single_ref_prob[i][j], ACCT_STR);
+ }
+ }
+ }
+
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+#if CONFIG_EXT_REFS
+ for (j = 0; j < (FWD_REFS - 1); ++j)
+ av1_diff_update_prob(r, &fc->comp_ref_prob[i][j], ACCT_STR);
+ for (j = 0; j < (BWD_REFS - 1); ++j)
+ av1_diff_update_prob(r, &fc->comp_bwdref_prob[i][j], ACCT_STR);
+#else
+ for (j = 0; j < (COMP_REFS - 1); ++j)
+ av1_diff_update_prob(r, &fc->comp_ref_prob[i][j], ACCT_STR);
+#endif // CONFIG_EXT_REFS
+ }
+ }
+}
+
+static void update_mv_probs(aom_prob *p, int n, aom_reader *r) {
+ int i;
+ for (i = 0; i < n; ++i) av1_diff_update_prob(r, &p[i], ACCT_STR);
+}
+
+static void read_mv_probs(nmv_context *ctx, int allow_hp, aom_reader *r) {
+ int i;
+
+#if !CONFIG_EC_ADAPT
+ int j;
+ update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
+
+ for (i = 0; i < 2; ++i) {
+ nmv_component *const comp_ctx = &ctx->comps[i];
+ update_mv_probs(&comp_ctx->sign, 1, r);
+ update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r);
+ update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r);
+ update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r);
+ }
+ for (i = 0; i < 2; ++i) {
+ nmv_component *const comp_ctx = &ctx->comps[i];
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ update_mv_probs(comp_ctx->class0_fp[j], MV_FP_SIZE - 1, r);
+ }
+ update_mv_probs(comp_ctx->fp, MV_FP_SIZE - 1, r);
+ }
+#endif // !CONFIG_EC_ADAPT
+
+ if (allow_hp) {
+ for (i = 0; i < 2; ++i) {
+ nmv_component *const comp_ctx = &ctx->comps[i];
+ update_mv_probs(&comp_ctx->class0_hp, 1, r);
+ update_mv_probs(&comp_ctx->hp, 1, r);
+ }
+ }
+}
+
+static void inverse_transform_block(MACROBLOCKD *xd, int plane,
+ const TX_TYPE tx_type,
+ const TX_SIZE tx_size, uint8_t *dst,
+ int stride, int16_t scan_line, int eob) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = pd->dqcoeff;
+ av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, dst, stride, eob);
+ memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
+}
+
+#if CONFIG_PVQ
+static int av1_pvq_decode_helper(MACROBLOCKD *xd, tran_low_t *ref_coeff,
+ tran_low_t *dqcoeff, int16_t *quant, int pli,
+ int bs, TX_TYPE tx_type, int xdec,
+ PVQ_SKIP_TYPE ac_dc_coded) {
+ unsigned int flags; // used for daala's stream analyzer.
+ int off;
+ const int is_keyframe = 0;
+ const int has_dc_skip = 1;
+ int coeff_shift = 3 - av1_get_tx_scale(bs);
+ int hbd_downshift = 0;
+ int rounding_mask;
+ // DC quantizer for PVQ
+ int pvq_dc_quant;
+ int lossless = (quant[0] == 0);
+ const int blk_size = tx_size_wide[bs];
+ int eob = 0;
+ int i;
+ od_dec_ctx *dec = &xd->daala_dec;
+ int use_activity_masking = dec->use_activity_masking;
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+ DECLARE_ALIGNED(16, tran_low_t, ref_coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+
+ od_coeff ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX];
+ od_coeff out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX];
+
+#if CONFIG_HIGHBITDEPTH
+ hbd_downshift = xd->bd - 8;
+#endif // CONFIG_HIGHBITDEPTH
+
+ od_raster_to_coding_order(ref_coeff_pvq, blk_size, tx_type, ref_coeff,
+ blk_size);
+
+ assert(OD_COEFF_SHIFT >= 4);
+ if (lossless)
+ pvq_dc_quant = 1;
+ else {
+ if (use_activity_masking)
+ pvq_dc_quant = OD_MAXI(
+ 1, (quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift) *
+ dec->state.pvq_qm_q4[pli][od_qm_get_index(bs, 0)] >>
+ 4);
+ else
+ pvq_dc_quant =
+ OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift);
+ }
+
+ off = od_qm_offset(bs, xdec);
+
+ // copy int16 inputs to int32
+ for (i = 0; i < blk_size * blk_size; i++) {
+ ref_int32[i] =
+ AOM_SIGNED_SHL(ref_coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >>
+ hbd_downshift;
+ }
+
+ od_pvq_decode(dec, ref_int32, out_int32,
+ OD_MAXI(1, quant[1] << (OD_COEFF_SHIFT - 3) >> hbd_downshift),
+ pli, bs, OD_PVQ_BETA[use_activity_masking][pli][bs],
+ is_keyframe, &flags, ac_dc_coded, dec->state.qm + off,
+ dec->state.qm_inv + off);
+
+ if (!has_dc_skip || out_int32[0]) {
+ out_int32[0] =
+ has_dc_skip + generic_decode(dec->r, &dec->state.adapt->model_dc[pli],
+ &dec->state.adapt->ex_dc[pli][bs][0], 2,
+ "dc:mag");
+ if (out_int32[0]) out_int32[0] *= aom_read_bit(dec->r, "dc:sign") ? -1 : 1;
+ }
+ out_int32[0] = out_int32[0] * pvq_dc_quant + ref_int32[0];
+
+ // copy int32 result back to int16
+ assert(OD_COEFF_SHIFT > coeff_shift);
+ rounding_mask = (1 << (OD_COEFF_SHIFT - coeff_shift - 1)) - 1;
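+  // Adding rounding_mask plus (out_int32[i] < 0) before the arithmetic right
+  // shift rounds to nearest, with ties broken toward zero for either sign.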
+ for (i = 0; i < blk_size * blk_size; i++) {
+ out_int32[i] = AOM_SIGNED_SHL(out_int32[i], hbd_downshift);
+ dqcoeff_pvq[i] = (out_int32[i] + (out_int32[i] < 0) + rounding_mask) >>
+ (OD_COEFF_SHIFT - coeff_shift);
+ }
+
+ od_coding_order_to_raster(dqcoeff, blk_size, tx_type, dqcoeff_pvq, blk_size);
+
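+  // PVQ codes the whole block, so report the full transform size as the
+  // end-of-block rather than a coefficient count derived from a scan.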
+ eob = blk_size * blk_size;
+
+ return eob;
+}
+
+static PVQ_SKIP_TYPE read_pvq_skip(AV1_COMMON *cm, MACROBLOCKD *const xd,
+ int plane, TX_SIZE tx_size) {
+  // Decode the AC/DC coded flag. bit0: DC coded, bit1: AC coded.
+  // NOTE: unlike the daala codebase, aom does not use 5 symbols for luma
+  // here, since block partitioning is handled by aom; only the AC/DC skip
+  // info is coded.
+ const int ac_dc_coded = aom_read_symbol(
+ xd->daala_dec.r,
+ xd->daala_dec.state.adapt->skip_cdf[2 * tx_size + (plane != 0)], 4,
+ "skip");
+ if (ac_dc_coded < 0 || ac_dc_coded > 3) {
+ aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+ "Invalid PVQ Skip Type");
+ }
+ return ac_dc_coded;
+}
+
+static int av1_pvq_decode_helper2(AV1_COMMON *cm, MACROBLOCKD *const xd,
+ MB_MODE_INFO *const mbmi, int plane, int row,
+ int col, TX_SIZE tx_size, TX_TYPE tx_type) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ // transform block size in pixels
+ int tx_blk_size = tx_size_wide[tx_size];
+ int i, j;
+ tran_low_t *pvq_ref_coeff = pd->pvq_ref_coeff;
+ const int diff_stride = tx_blk_size;
+ int16_t *pred = pd->pred;
+ tran_low_t *const dqcoeff = pd->dqcoeff;
+ uint8_t *dst;
+ int eob;
+ const PVQ_SKIP_TYPE ac_dc_coded = read_pvq_skip(cm, xd, plane, tx_size);
+
+ eob = 0;
+ dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+
+ if (ac_dc_coded) {
+ int xdec = pd->subsampling_x;
+ int seg_id = mbmi->segment_id;
+ int16_t *quant;
+ FWD_TXFM_PARAM fwd_txfm_param;
+      // TODO(yaowu): correct this with the optimal number from the decoding
+      // process.
+ const int max_scan_line = tx_size_2d[tx_size];
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ pred[diff_stride * j + i] =
+ CONVERT_TO_SHORTPTR(dst)[pd->dst.stride * j + i];
+ } else {
+#endif
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ pred[diff_stride * j + i] = dst[pd->dst.stride * j + i];
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+
+ fwd_txfm_param.tx_type = tx_type;
+ fwd_txfm_param.tx_size = tx_size;
+ fwd_txfm_param.lossless = xd->lossless[seg_id];
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ fwd_txfm_param.bd = xd->bd;
+ av1_highbd_fwd_txfm(pred, pvq_ref_coeff, diff_stride, &fwd_txfm_param);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ av1_fwd_txfm(pred, pvq_ref_coeff, diff_stride, &fwd_txfm_param);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ quant = &pd->seg_dequant[seg_id][0]; // aom's quantizer
+
+ eob = av1_pvq_decode_helper(xd, pvq_ref_coeff, dqcoeff, quant, plane,
+ tx_size, tx_type, xdec, ac_dc_coded);
+
+ inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+ max_scan_line, eob);
+ }
+
+ return eob;
+}
+#endif
+
+static int get_block_idx(const MACROBLOCKD *xd, int plane, int row, int col) {
+ const int bsize = xd->mi[0]->mbmi.sb_type;
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+#if CONFIG_CB4X4
+#if CONFIG_CHROMA_2X2
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#else
+ const BLOCK_SIZE plane_bsize =
+ AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#endif // CONFIG_CHROMA_2X2
+#else
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
+#endif
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ return row * max_blocks_wide + col * txh_unit;
+}
+
+static void predict_and_reconstruct_intra_block(
+ AV1_COMMON *cm, MACROBLOCKD *const xd, aom_reader *const r,
+ MB_MODE_INFO *const mbmi, int plane, int row, int col, TX_SIZE tx_size) {
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ const int block_idx = get_block_idx(xd, plane, row, col);
+#if CONFIG_PVQ
+ (void)r;
+#endif
+ av1_predict_intra_block_facade(xd, plane, block_idx, col, row, tx_size);
+
+ if (!mbmi->skip) {
+#if !CONFIG_PVQ
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_LV_MAP
+ int16_t max_scan_line = 0;
+ int eob;
+ av1_read_coeffs_txb_facade(cm, xd, r, row, col, block_idx, plane,
+ pd->dqcoeff, &max_scan_line, &eob);
+ // tx_type will be read out in av1_read_coeffs_txb_facade
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+#else // CONFIG_LV_MAP
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
+ int16_t max_scan_line = 0;
+ const int eob =
+ av1_decode_block_tokens(cm, xd, plane, scan_order, col, row, tx_size,
+ tx_type, &max_scan_line, r, mbmi->segment_id);
+#endif // CONFIG_LV_MAP
+ if (eob) {
+ uint8_t *dst =
+ &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
+ inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+ max_scan_line, eob);
+ }
+#else
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+ av1_pvq_decode_helper2(cm, xd, mbmi, plane, row, col, tx_size, tx_type);
+#endif
+ }
+#if CONFIG_CFL
+ if (plane == AOM_PLANE_Y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ uint8_t *dst =
+ &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
+ cfl_store(xd->cfl, dst, pd->dst.stride, row, col, tx_size);
+ }
+#endif
+}
+
+#if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE
+static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd,
+ aom_reader *r, MB_MODE_INFO *const mbmi,
+ int plane, BLOCK_SIZE plane_bsize,
+ int blk_row, int blk_col, TX_SIZE tx_size,
+ int *eob_total) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ const TX_SIZE plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+ // Scale to match transform block unit.
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size) {
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ int block_idx = get_block_idx(xd, plane, blk_row, blk_col);
+#if CONFIG_LV_MAP
+    int16_t max_scan_line = 0;
+    int eob;
+    av1_read_coeffs_txb_facade(cm, xd, r, blk_row, blk_col, block_idx, plane,
+                               pd->dqcoeff, &max_scan_line, &eob);
+ // tx_type will be read out in av1_read_coeffs_txb_facade
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, plane_tx_size);
+#else // CONFIG_LV_MAP
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, plane_tx_size);
+ const SCAN_ORDER *sc = get_scan(cm, plane_tx_size, tx_type, 1);
+ int16_t max_scan_line = 0;
+ const int eob = av1_decode_block_tokens(
+ cm, xd, plane, sc, blk_col, blk_row, plane_tx_size, tx_type,
+ &max_scan_line, r, mbmi->segment_id);
+#endif // CONFIG_LV_MAP
+ inverse_transform_block(xd, plane, tx_type, plane_tx_size,
+ &pd->dst.buf[(blk_row * pd->dst.stride + blk_col)
+ << tx_size_wide_log2[0]],
+ pd->dst.stride, max_scan_line, eob);
+ *eob_total += eob;
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ assert(bsl > 0);
+
+ for (i = 0; i < 4; ++i) {
+ const int offsetr = blk_row + (i >> 1) * bsl;
+ const int offsetc = blk_col + (i & 0x01) * bsl;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, offsetr,
+ offsetc, sub_txs, eob_total);
+ }
+ }
+}
+#endif  // CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE
+
+#if !CONFIG_VAR_TX || CONFIG_SUPERTX || CONFIG_COEF_INTERLEAVE || \
+ (!CONFIG_VAR_TX && CONFIG_EXT_TX && CONFIG_RECT_TX)
+static int reconstruct_inter_block(AV1_COMMON *cm, MACROBLOCKD *const xd,
+ aom_reader *const r, int segment_id,
+ int plane, int row, int col,
+ TX_SIZE tx_size) {
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ int block_idx = get_block_idx(xd, plane, row, col);
+#if CONFIG_PVQ
+ int eob;
+ (void)r;
+ (void)segment_id;
+#else
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+#endif
+
+#if !CONFIG_PVQ
+#if CONFIG_LV_MAP
+ (void)segment_id;
+ int16_t max_scan_line = 0;
+ int eob;
+ av1_read_coeffs_txb_facade(cm, xd, r, row, col, block_idx, plane, pd->dqcoeff,
+ &max_scan_line, &eob);
+ // tx_type will be read out in av1_read_coeffs_txb_facade
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+#else // CONFIG_LV_MAP
+ int16_t max_scan_line = 0;
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 1);
+ const int eob =
+ av1_decode_block_tokens(cm, xd, plane, scan_order, col, row, tx_size,
+ tx_type, &max_scan_line, r, segment_id);
+#endif // CONFIG_LV_MAP
+ uint8_t *dst =
+ &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
+ if (eob)
+ inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+ max_scan_line, eob);
+#else
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+ eob = av1_pvq_decode_helper2(cm, xd, &xd->mi[0]->mbmi, plane, row, col,
+ tx_size, tx_type);
+#endif
+ return eob;
+}
+#endif  // !CONFIG_VAR_TX || CONFIG_SUPERTX || CONFIG_COEF_INTERLEAVE || (!CONFIG_VAR_TX && CONFIG_EXT_TX && CONFIG_RECT_TX)
+
+static void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, int bw,
+ int bh, int x_mis, int y_mis) {
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ int x, y;
+ const TileInfo *const tile = &xd->tile;
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = &cm->mi[offset];
+ // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
+ // passing bsize from decode_partition().
+ xd->mi[0]->mbmi.sb_type = bsize;
+#if CONFIG_RD_DEBUG
+ xd->mi[0]->mbmi.mi_row = mi_row;
+ xd->mi[0]->mbmi.mi_col = mi_col;
+#endif
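+  // Fan the top-left MI pointer out to the rest of the block; x starts at
+  // !y so the (0, 0) entry assigned above is not rewritten.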
+ for (y = 0; y < y_mis; ++y)
+ for (x = !y; x < x_mis; ++x) xd->mi[y * cm->mi_stride + x] = xd->mi[0];
+
+ set_plane_n4(xd, bw, bh);
+ set_skip_context(xd, mi_row, mi_col);
+
+#if CONFIG_VAR_TX
+ xd->max_tx_size = max_txsize_lookup[bsize];
+#endif
+
+  // Distance of the macroblock to the various image edges. These are
+  // specified in 1/8th-pel units, as they are always compared to values in
+  // 1/8th-pel units.
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+}
+
+#if CONFIG_SUPERTX
+static MB_MODE_INFO *set_offsets_extend(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ BLOCK_SIZE bsize_pred, int mi_row_pred,
+ int mi_col_pred, int mi_row_ori,
+ int mi_col_ori) {
+ // Used in supertx
+ // (mi_row_ori, mi_col_ori): location for mv
+ // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+ const int bw = mi_size_wide[bsize_pred];
+ const int bh = mi_size_high[bsize_pred];
+ const int offset = mi_row_ori * cm->mi_stride + mi_col_ori;
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = cm->mi + offset;
+ set_mi_row_col(xd, tile, mi_row_pred, bh, mi_col_pred, bw,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+ xd->up_available = (mi_row_ori > tile->mi_row_start);
+ xd->left_available = (mi_col_ori > tile->mi_col_start);
+
+ set_plane_n4(xd, bw, bh);
+
+ return &xd->mi[0]->mbmi;
+}
+
+#if CONFIG_SUPERTX
+static MB_MODE_INFO *set_mb_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int bw, int bh, int x_mis, int y_mis) {
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ const TileInfo *const tile = &xd->tile;
+ int x, y;
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = cm->mi + offset;
+ xd->mi[0]->mbmi.sb_type = bsize;
+ for (y = 0; y < y_mis; ++y)
+ for (x = !y; x < x_mis; ++x) xd->mi[y * cm->mi_stride + x] = xd->mi[0];
+
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+ return &xd->mi[0]->mbmi;
+}
+#endif
+
+static void set_offsets_topblock(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ const TileInfo *const tile, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int offset = mi_row * cm->mi_stride + mi_col;
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = cm->mi + offset;
+
+ set_plane_n4(xd, bw, bh);
+
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+}
+
+static void set_param_topblock(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int txfm, int skip) {
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ int x, y;
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = cm->mi + offset;
+
+ for (y = 0; y < y_mis; ++y)
+ for (x = 0; x < x_mis; ++x) {
+ xd->mi[y * cm->mi_stride + x]->mbmi.skip = skip;
+ xd->mi[y * cm->mi_stride + x]->mbmi.tx_type = txfm;
+ }
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ set_txfm_ctxs(xd->mi[0]->mbmi.tx_size, bw, bh, skip, xd);
+#endif
+}
+
+static void set_ref(AV1_COMMON *const cm, MACROBLOCKD *const xd, int idx,
+ int mi_row, int mi_col) {
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME];
+ xd->block_refs[idx] = ref_buffer;
+ if (!av1_is_valid_scale(&ref_buffer->sf))
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Invalid scale factors");
+ av1_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
+ &ref_buffer->sf);
+ aom_merge_corrupted_flag(&xd->corrupted, ref_buffer->buf->corrupted);
+}
+
+static void dec_predict_b_extend(
+ AV1Decoder *const pbi, MACROBLOCKD *const xd, const TileInfo *const tile,
+ int block, int mi_row_ori, int mi_col_ori, int mi_row_pred, int mi_col_pred,
+ int mi_row_top, int mi_col_top, uint8_t *dst_buf[3], int dst_stride[3],
+ BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred, int b_sub8x8, int bextend) {
+ // Used in supertx
+ // (mi_row_ori, mi_col_ori): location for mv
+ // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+ // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
+ // block: sub location of sub8x8 blocks
+ // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
+ // bextend: 1: region to predict is an extension of ori; 0: not
+ int r = (mi_row_pred - mi_row_top) * MI_SIZE;
+ int c = (mi_col_pred - mi_col_top) * MI_SIZE;
+ const int mi_width_top = mi_size_wide[bsize_top];
+ const int mi_height_top = mi_size_high[bsize_top];
+ MB_MODE_INFO *mbmi;
+ AV1_COMMON *const cm = &pbi->common;
+
+ if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
+ mi_row_pred >= mi_row_top + mi_height_top ||
+ mi_col_pred >= mi_col_top + mi_width_top || mi_row_pred >= cm->mi_rows ||
+ mi_col_pred >= cm->mi_cols)
+ return;
+
+ mbmi = set_offsets_extend(cm, xd, tile, bsize_pred, mi_row_pred, mi_col_pred,
+ mi_row_ori, mi_col_ori);
+ set_ref(cm, xd, 0, mi_row_pred, mi_col_pred);
+ if (has_second_ref(&xd->mi[0]->mbmi))
+ set_ref(cm, xd, 1, mi_row_pred, mi_col_pred);
+
+ if (!bextend) mbmi->tx_size = max_txsize_lookup[bsize_top];
+
+ xd->plane[0].dst.stride = dst_stride[0];
+ xd->plane[1].dst.stride = dst_stride[1];
+ xd->plane[2].dst.stride = dst_stride[2];
+ xd->plane[0].dst.buf = dst_buf[0] +
+ (r >> xd->plane[0].subsampling_y) * dst_stride[0] +
+ (c >> xd->plane[0].subsampling_x);
+ xd->plane[1].dst.buf = dst_buf[1] +
+ (r >> xd->plane[1].subsampling_y) * dst_stride[1] +
+ (c >> xd->plane[1].subsampling_x);
+ xd->plane[2].dst.buf = dst_buf[2] +
+ (r >> xd->plane[2].subsampling_y) * dst_stride[2] +
+ (c >> xd->plane[2].subsampling_x);
+
+ if (!b_sub8x8)
+ av1_build_inter_predictors_sb_extend(xd,
+#if CONFIG_EXT_INTER
+ mi_row_ori, mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ mi_row_pred, mi_col_pred, bsize_pred);
+ else
+ av1_build_inter_predictors_sb_sub8x8_extend(xd,
+#if CONFIG_EXT_INTER
+ mi_row_ori, mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ mi_row_pred, mi_col_pred,
+ bsize_pred, block);
+}
+
+static void dec_extend_dir(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, int block,
+ BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, int mi_row,
+ int mi_col, int mi_row_top, int mi_col_top,
+ uint8_t *dst_buf[3], int dst_stride[3], int dir) {
+ // dir: 0-lower, 1-upper, 2-left, 3-right
+ // 4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ int xss = xd->plane[1].subsampling_x;
+ int yss = xd->plane[1].subsampling_y;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+ int b_sub8x8 = (bsize < BLOCK_8X8) && !unify_bsize ? 1 : 0;
+ BLOCK_SIZE extend_bsize;
+ int mi_row_pred, mi_col_pred;
+
+ int wide_unit, high_unit;
+ int i, j;
+ int ext_offset = 0;
+
+ if (dir == 0 || dir == 1) {
+ extend_bsize =
+ (mi_width == mi_size_wide[BLOCK_8X8] || bsize < BLOCK_8X8 || xss < yss)
+ ? BLOCK_8X8
+ : BLOCK_16X8;
+#if CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) {
+ extend_bsize = BLOCK_4X4;
+ ext_offset = mi_size_wide[BLOCK_8X8];
+ }
+#endif
+
+ wide_unit = mi_size_wide[extend_bsize];
+ high_unit = mi_size_high[extend_bsize];
+
+ mi_row_pred = mi_row + ((dir == 0) ? mi_height : -(mi_height + ext_offset));
+ mi_col_pred = mi_col;
+
+ for (j = 0; j < mi_height + ext_offset; j += high_unit)
+ for (i = 0; i < mi_width + ext_offset; i += wide_unit)
+ dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+ mi_row_pred + j, mi_col_pred + i, mi_row_top,
+ mi_col_top, dst_buf, dst_stride, top_bsize,
+ extend_bsize, b_sub8x8, 1);
+ } else if (dir == 2 || dir == 3) {
+ extend_bsize =
+ (mi_height == mi_size_high[BLOCK_8X8] || bsize < BLOCK_8X8 || yss < xss)
+ ? BLOCK_8X8
+ : BLOCK_8X16;
+#if CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) {
+ extend_bsize = BLOCK_4X4;
+ ext_offset = mi_size_wide[BLOCK_8X8];
+ }
+#endif
+
+ wide_unit = mi_size_wide[extend_bsize];
+ high_unit = mi_size_high[extend_bsize];
+
+ mi_row_pred = mi_row;
+ mi_col_pred = mi_col + ((dir == 3) ? mi_width : -(mi_width + ext_offset));
+
+ for (j = 0; j < mi_height + ext_offset; j += high_unit)
+ for (i = 0; i < mi_width + ext_offset; i += wide_unit)
+ dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+ mi_row_pred + j, mi_col_pred + i, mi_row_top,
+ mi_col_top, dst_buf, dst_stride, top_bsize,
+ extend_bsize, b_sub8x8, 1);
+ } else {
+ extend_bsize = BLOCK_8X8;
+#if CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) {
+ extend_bsize = BLOCK_4X4;
+ ext_offset = mi_size_wide[BLOCK_8X8];
+ }
+#endif
+ wide_unit = mi_size_wide[extend_bsize];
+ high_unit = mi_size_high[extend_bsize];
+
+ mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height
+ : -(mi_height + ext_offset));
+ mi_col_pred =
+ mi_col + ((dir == 6 || dir == 7) ? mi_width : -(mi_width + ext_offset));
+
+ for (j = 0; j < mi_height + ext_offset; j += high_unit)
+ for (i = 0; i < mi_width + ext_offset; i += wide_unit)
+ dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+ mi_row_pred + j, mi_col_pred + i, mi_row_top,
+ mi_col_top, dst_buf, dst_stride, top_bsize,
+ extend_bsize, b_sub8x8, 1);
+ }
+}
+
+static void dec_extend_all(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, int block,
+ BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, int mi_row,
+ int mi_col, int mi_row_top, int mi_col_top,
+ uint8_t *dst_buf[3], int dst_stride[3]) {
+ for (int i = 0; i < 8; ++i) {
+ dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, i);
+ }
+}
+
+static void dec_predict_sb_complex(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, int mi_row_top, int mi_col_top,
+ BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+ uint8_t *dst_buf[3], int dst_stride[3]) {
+ const AV1_COMMON *const cm = &pbi->common;
+ const int hbs = mi_size_wide[bsize] / 2;
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+ int i;
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
+ uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ int dst_stride1[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+ int dst_stride2[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+ int dst_stride3[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+ dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len);
+ dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+ dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len);
+ dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len);
+ dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
+ dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len);
+ dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len);
+ } else {
+#endif
+ dst_buf1[0] = tmp_buf1;
+ dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE;
+ dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE;
+ dst_buf2[0] = tmp_buf2;
+ dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE;
+ dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE;
+ dst_buf3[0] = tmp_buf3;
+ dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE;
+ dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ xd->mi = cm->mi_grid_visible + mi_offset;
+ xd->mi[0] = cm->mi + mi_offset;
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ }
+
+ switch (partition) {
+ case PARTITION_NONE:
+ assert(bsize < top_bsize);
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, bsize, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ break;
+ case PARTITION_HORZ:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ // For sub8x8, predict in 8x8 unit
+ // First half
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, BLOCK_8X8, 1, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+ // Second half
+ dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, 1, 1);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ // weighted average to smooth the boundary
+ xd->plane[0].dst.buf = dst_buf[0];
+ xd->plane[0].dst.stride = dst_stride[0];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ 0);
+ } else {
+ // First half
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+
+ if (mi_row + hbs < cm->mi_rows) {
+ // Second half
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col,
+ mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+ dst_buf1, dst_stride1, top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, 1);
+
+ // weighted average to smooth the boundary
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ }
+ }
+ break;
+ case PARTITION_VERT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ // First half
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, BLOCK_8X8, 1, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+ // Second half
+ dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, 1, 1);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ // Smooth
+ xd->plane[0].dst.buf = dst_buf[0];
+ xd->plane[0].dst.stride = dst_stride[0];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ 0);
+ } else {
+ // First half
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+
+ // Second half
+ if (mi_col + hbs < cm->mi_cols) {
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, 2);
+
+ // Smooth
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ }
+ }
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, BLOCK_8X8, 1, 0);
+ dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, 1, 1);
+ dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+ top_bsize, BLOCK_8X8, 1, 1);
+ dec_predict_b_extend(pbi, xd, tile, 3, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf3, dst_stride3,
+ top_bsize, BLOCK_8X8, 1, 1);
+ if (bsize < top_bsize) {
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+ dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+ dec_extend_all(pbi, xd, tile, 3, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf3, dst_stride3);
+ }
+ } else {
+ dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col, mi_row_top,
+ mi_col_top, subsize, top_bsize, dst_buf,
+ dst_stride);
+ if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+ dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, subsize, top_bsize,
+ dst_buf1, dst_stride1);
+ if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
+ dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, subsize, top_bsize,
+ dst_buf2, dst_stride2);
+ if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+ dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col + hbs,
+ mi_row_top, mi_col_top, subsize, top_bsize,
+ dst_buf3, dst_stride3);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+#if !CONFIG_CB4X4
+ if (bsize == BLOCK_8X8 && i != 0)
+ continue; // Skip <4x4 chroma smoothing
+#endif
+ if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ if (mi_row + hbs < cm->mi_rows) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf2[i], dst_stride2[i], dst_buf3[i], dst_stride3[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2, top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+ 1);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ break;
+ case PARTITION_VERT_A:
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2, top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2, 2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ break;
+ case PARTITION_HORZ_B:
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs,
+ mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+ dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf1[i];
+ xd->plane[i].dst.stride = dst_stride1[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ break;
+ case PARTITION_VERT_B:
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs,
+ mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+ dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf1[i];
+ xd->plane[i].dst.stride = dst_stride1[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+}
+
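+// A SUPERTX block shares a single segment id across all the mi units it
+// covers: 0 when segmentation is disabled, otherwise the minimum segment_id
+// found among the covered blocks, written back to every unit.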
+static void set_segment_id_supertx(const AV1_COMMON *const cm, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const struct segmentation *seg = &cm->seg;
+ const int miw = AOMMIN(mi_size_wide[bsize], cm->mi_cols - mi_col);
+ const int mih = AOMMIN(mi_size_high[bsize], cm->mi_rows - mi_row);
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
+ MODE_INFO **const mip = cm->mi_grid_visible + mi_offset;
+ int r, c;
+ int seg_id_supertx = MAX_SEGMENTS;
+
+ if (!seg->enabled) {
+ seg_id_supertx = 0;
+ } else {
+ // Find the minimum segment_id
+ for (r = 0; r < mih; r++)
+ for (c = 0; c < miw; c++)
+ seg_id_supertx =
+ AOMMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id, seg_id_supertx);
+ assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS);
+ }
+
+ // Assign the segment_id back to segment_id_supertx
+ for (r = 0; r < mih; r++)
+ for (c = 0; c < miw; c++)
+ mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx;
+}
+#endif // CONFIG_SUPERTX
+
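+// Reads the mode info (mbmi) for one block; residual decoding and
+// reconstruction are handled separately in decode_token_and_recon_block().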
+static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif // CONFIG_SUPERTX
+ int mi_row, int mi_col, aom_reader *r,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+
+#if CONFIG_ACCOUNTING
+ aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
+#endif
+#if CONFIG_SUPERTX
+ if (supertx_enabled) {
+ set_mb_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
+ } else {
+ set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
+ }
+#if CONFIG_EXT_PARTITION_TYPES
+ xd->mi[0]->mbmi.partition = partition;
+#endif
+ av1_read_mode_info(pbi, xd, supertx_enabled, mi_row, mi_col, r, x_mis, y_mis);
+#else
+ set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
+#if CONFIG_EXT_PARTITION_TYPES
+ xd->mi[0]->mbmi.partition = partition;
+#endif
+ av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+#endif // CONFIG_SUPERTX
+
+ if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
+ const BLOCK_SIZE uv_subsize =
+ ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y];
+ if (uv_subsize == BLOCK_INVALID)
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid block size.");
+ }
+
+#if CONFIG_SUPERTX
+ xd->mi[0]->mbmi.segment_id_supertx = MAX_SEGMENTS;
+#endif // CONFIG_SUPERTX
+
+ int reader_corrupted_flag = aom_reader_has_error(r);
+ aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
+}
+
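+// Reads the residual tokens for one block and reconstructs it: intra blocks
+// are predicted and reconstructed one transform unit at a time, while inter
+// blocks build the full predictor first and then add the decoded residue.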
+static void decode_token_and_recon_block(AV1Decoder *const pbi,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col, aom_reader *r,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+
+ set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+
+#if CONFIG_DELTA_Q
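+ // With per-block delta-Q, the dequantization tables depend on
+ // xd->current_qindex and must be refreshed for every block.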
+ if (cm->delta_q_present_flag) {
+ int i;
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+#if CONFIG_EXT_DELTA_Q
+ xd->plane[0].seg_dequant[i][0] =
+ av1_dc_quant(av1_get_qindex(&cm->seg, i, xd->current_qindex),
+ cm->y_dc_delta_q, cm->bit_depth);
+ xd->plane[0].seg_dequant[i][1] = av1_ac_quant(
+ av1_get_qindex(&cm->seg, i, xd->current_qindex), 0, cm->bit_depth);
+ xd->plane[1].seg_dequant[i][0] =
+ av1_dc_quant(av1_get_qindex(&cm->seg, i, xd->current_qindex),
+ cm->uv_dc_delta_q, cm->bit_depth);
+ xd->plane[1].seg_dequant[i][1] =
+ av1_ac_quant(av1_get_qindex(&cm->seg, i, xd->current_qindex),
+ cm->uv_ac_delta_q, cm->bit_depth);
+ xd->plane[2].seg_dequant[i][0] =
+ av1_dc_quant(av1_get_qindex(&cm->seg, i, xd->current_qindex),
+ cm->uv_dc_delta_q, cm->bit_depth);
+ xd->plane[2].seg_dequant[i][1] =
+ av1_ac_quant(av1_get_qindex(&cm->seg, i, xd->current_qindex),
+ cm->uv_ac_delta_q, cm->bit_depth);
+#else
+ xd->plane[0].seg_dequant[i][0] =
+ av1_dc_quant(xd->current_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ xd->plane[0].seg_dequant[i][1] =
+ av1_ac_quant(xd->current_qindex, 0, cm->bit_depth);
+ xd->plane[1].seg_dequant[i][0] =
+ av1_dc_quant(xd->current_qindex, cm->uv_dc_delta_q, cm->bit_depth);
+ xd->plane[1].seg_dequant[i][1] =
+ av1_ac_quant(xd->current_qindex, cm->uv_ac_delta_q, cm->bit_depth);
+ xd->plane[2].seg_dequant[i][0] =
+ av1_dc_quant(xd->current_qindex, cm->uv_dc_delta_q, cm->bit_depth);
+ xd->plane[2].seg_dequant[i][1] =
+ av1_ac_quant(xd->current_qindex, cm->uv_ac_delta_q, cm->bit_depth);
+#endif
+ }
+ }
+#endif
+
+#if CONFIG_CB4X4
+ if (mbmi->skip) reset_skip_context(xd, bsize);
+#else
+ if (mbmi->skip) reset_skip_context(xd, AOMMAX(BLOCK_8X8, bsize));
+#endif
+
+#if CONFIG_COEF_INTERLEAVE
+ {
+ const struct macroblockd_plane *const pd_y = &xd->plane[0];
+ const struct macroblockd_plane *const pd_c = &xd->plane[1];
+ const TX_SIZE tx_log2_y = mbmi->tx_size;
+ const TX_SIZE tx_log2_c = get_uv_tx_size(mbmi, pd_c);
+ const int tx_sz_y = (1 << tx_log2_y);
+ const int tx_sz_c = (1 << tx_log2_c);
+ const int num_4x4_w_y = pd_y->n4_w;
+ const int num_4x4_h_y = pd_y->n4_h;
+ const int num_4x4_w_c = pd_c->n4_w;
+ const int num_4x4_h_c = pd_c->n4_h;
+ const int max_4x4_w_y = get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge,
+ pd_y->subsampling_x);
+ const int max_4x4_h_y = get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge,
+ pd_y->subsampling_y);
+ const int max_4x4_w_c = get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge,
+ pd_c->subsampling_x);
+ const int max_4x4_h_c = get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge,
+ pd_c->subsampling_y);
+
+ // The max_4x4_w/h may be smaller than tx_sz in some corner cases,
+ // i.e. when the SB is split by tile boundaries.
+ const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y;
+ const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y;
+ const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c;
+ const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c;
+ const int tu_num_c = tu_num_w_c * tu_num_h_c;
+
+ if (!is_inter_block(mbmi)) {
+ int tu_idx_c = 0;
+ int row_y, col_y, row_c, col_c;
+ int plane;
+
+#if CONFIG_PALETTE
+ for (plane = 0; plane <= 1; ++plane) {
+ if (mbmi->palette_mode_info.palette_size[plane])
+ av1_decode_palette_tokens(xd, plane, r);
+ }
+#endif
+
+ for (row_y = 0; row_y < tu_num_h_y; row_y++) {
+ for (col_y = 0; col_y < tu_num_w_y; col_y++) {
+ // luma
+ predict_and_reconstruct_intra_block(
+ cm, xd, r, mbmi, 0, row_y * tx_sz_y, col_y * tx_sz_y, tx_log2_y);
+ // chroma
+ if (tu_idx_c < tu_num_c) {
+ row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+ col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+ predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 1, row_c,
+ col_c, tx_log2_c);
+ predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 2, row_c,
+ col_c, tx_log2_c);
+ tu_idx_c++;
+ }
+ }
+ }
+
+ // In the 422 case, it's possible that Chroma has more TUs than Luma
+ while (tu_idx_c < tu_num_c) {
+ row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+ col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+ predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 1, row_c, col_c,
+ tx_log2_c);
+ predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 2, row_c, col_c,
+ tx_log2_c);
+ tu_idx_c++;
+ }
+ } else {
+ // Prediction
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL,
+ AOMMAX(bsize, BLOCK_8X8));
+
+ // Reconstruction
+ if (!mbmi->skip) {
+ int eobtotal = 0;
+ int tu_idx_c = 0;
+ int row_y, col_y, row_c, col_c;
+
+ for (row_y = 0; row_y < tu_num_h_y; row_y++) {
+ for (col_y = 0; col_y < tu_num_w_y; col_y++) {
+ // luma
+ eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 0,
+ row_y * tx_sz_y,
+ col_y * tx_sz_y, tx_log2_y);
+ // chroma
+ if (tu_idx_c < tu_num_c) {
+ row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+ col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+ eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
+ 1, row_c, col_c, tx_log2_c);
+ eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
+ 2, row_c, col_c, tx_log2_c);
+ tu_idx_c++;
+ }
+ }
+ }
+
+ // In the 422 case, it's possible that Chroma has more TUs than Luma
+ while (tu_idx_c < tu_num_c) {
+ row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+ col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+ eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 1,
+ row_c, col_c, tx_log2_c);
+ eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 2,
+ row_c, col_c, tx_log2_c);
+ tu_idx_c++;
+ }
+
+ // TODO(CONFIG_COEF_INTERLEAVE owners): bring the eob == 0 corner case
+ // into line with the default configuration
+ if (bsize >= BLOCK_8X8 && eobtotal == 0) mbmi->skip = 1;
+ }
+ }
+ }
+#else // CONFIG_COEF_INTERLEAVE
+ if (!is_inter_block(mbmi)) {
+ int plane;
+#if CONFIG_PALETTE
+ for (plane = 0; plane <= 1; ++plane) {
+ if (mbmi->palette_mode_info.palette_size[plane])
+ av1_decode_palette_tokens(xd, plane, r);
+ }
+#endif // CONFIG_PALETTE
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+#if CONFIG_CB4X4
+#if CONFIG_CHROMA_2X2
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#else
+ const BLOCK_SIZE plane_bsize =
+ AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#endif // CONFIG_CHROMA_2X2
+#else
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
+#endif
+ int row, col;
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+#endif
+
+ for (row = 0; row < max_blocks_high; row += stepr)
+ for (col = 0; col < max_blocks_wide; col += stepc)
+ predict_and_reconstruct_intra_block(cm, xd, r, mbmi, plane, row, col,
+ tx_size);
+ }
+ } else {
+ int ref;
+
+ for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+ if (frame < LAST_FRAME) {
+#if CONFIG_INTRABC
+ assert(is_intrabc_block(mbmi));
+ assert(frame == INTRA_FRAME);
+ assert(ref == 0);
+#else
+ assert(0);
+#endif // CONFIG_INTRABC
+ } else {
+ RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+ xd->block_refs[ref] = ref_buf;
+ if ((!av1_is_valid_scale(&ref_buf->sf)))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
+ &ref_buf->sf);
+ }
+ }
+
+#if CONFIG_CB4X4
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize);
+#else
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL,
+ AOMMAX(bsize, BLOCK_8X8));
+#endif
+
+#if CONFIG_MOTION_VAR
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+#if CONFIG_NCOBMC
+ av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+#else
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+#endif
+ }
+#endif // CONFIG_MOTION_VAR
+
+ // Reconstruction
+ if (!mbmi->skip) {
+ int eobtotal = 0;
+ int plane;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_CB4X4
+#if CONFIG_CHROMA_2X2
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#else
+ const BLOCK_SIZE plane_bsize =
+ AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#endif // CONFIG_CHROMA_2X2
+#else
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
+#endif
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ int row, col;
+
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+#endif
+
+#if CONFIG_VAR_TX
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+ const int bh_var_tx = tx_size_high_unit[max_tx_size];
+ const int bw_var_tx = tx_size_wide_unit[max_tx_size];
+ for (row = 0; row < max_blocks_high; row += bh_var_tx)
+ for (col = 0; col < max_blocks_wide; col += bw_var_tx)
+ decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, row, col,
+ max_tx_size, &eobtotal);
+#else
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ for (row = 0; row < max_blocks_high; row += stepr)
+ for (col = 0; col < max_blocks_wide; col += stepc)
+ eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
+ plane, row, col, tx_size);
+#endif
+ }
+ }
+ }
+#endif // CONFIG_COEF_INTERLEAVE
+
+ int reader_corrupted_flag = aom_reader_has_error(r);
+ aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
+}
+
+#if CONFIG_NCOBMC && CONFIG_MOTION_VAR
+static void detoken_and_recon_sb(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+ int mi_row, int mi_col, aom_reader *r,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int hbs = mi_size_wide[bsize] >> 1;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ partition = get_partition(cm, mi_row, mi_col, bsize);
+ subsize = subsize_lookup[partition][bsize];
+
+ if (!hbs && !unify_bsize) {
+ xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
+ xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize);
+ break;
+ case PARTITION_HORZ:
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize);
+ if (has_rows)
+ decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r,
+ subsize);
+ break;
+ case PARTITION_VERT:
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize);
+ if (has_cols)
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r,
+ subsize);
+ break;
+ case PARTITION_SPLIT:
+ detoken_and_recon_sb(pbi, xd, mi_row, mi_col, r, subsize);
+ detoken_and_recon_sb(pbi, xd, mi_row, mi_col + hbs, r, subsize);
+ detoken_and_recon_sb(pbi, xd, mi_row + hbs, mi_col, r, subsize);
+ detoken_and_recon_sb(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize);
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize2);
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r, bsize2);
+ decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r, subsize);
+ break;
+ case PARTITION_HORZ_B:
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize);
+ decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r, bsize2);
+ decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col + hbs, r,
+ bsize2);
+ break;
+ case PARTITION_VERT_A:
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize2);
+ decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r, bsize2);
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r, subsize);
+ break;
+ case PARTITION_VERT_B:
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize);
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r, bsize2);
+ decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col + hbs, r,
+ bsize2);
+ break;
+#endif
+ default: assert(0 && "Invalid partition type");
+ }
+ }
+}
+#endif
+
+static void decode_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif // CONFIG_SUPERTX
+ int mi_row, int mi_col, aom_reader *r,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize) {
+ decode_mbmi_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ bsize);
+#if !(CONFIG_MOTION_VAR && CONFIG_NCOBMC)
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+ decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize);
+#endif
+}
+
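+// Reads the partition type for a block. When the block extends past the
+// right or bottom frame edge, only the partitions that fit are allowed, so
+// a single bit (split vs. the forced direction) or no bit at all is read.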
+static PARTITION_TYPE read_partition(AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, aom_reader *r,
+ int has_rows, int has_cols,
+ BLOCK_SIZE bsize) {
+#if CONFIG_UNPOISON_PARTITION_CTX
+ const int ctx =
+ partition_plane_context(xd, mi_row, mi_col, has_rows, has_cols, bsize);
+ const aom_prob *const probs =
+ ctx < PARTITION_CONTEXTS ? cm->fc->partition_prob[ctx] : NULL;
+ FRAME_COUNTS *const counts = ctx < PARTITION_CONTEXTS ? xd->counts : NULL;
+#else
+ const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ const aom_prob *const probs = cm->fc->partition_prob[ctx];
+ FRAME_COUNTS *const counts = xd->counts;
+#endif
+ PARTITION_TYPE p;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+#elif CONFIG_EC_MULTISYMBOL
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob *partition_cdf = (ctx >= 0) ? ec_ctx->partition_cdf[ctx] : NULL;
+#endif
+
+ if (has_rows && has_cols)
+#if CONFIG_EXT_PARTITION_TYPES
+ if (bsize <= BLOCK_8X8)
+#if CONFIG_EC_MULTISYMBOL
+ p = (PARTITION_TYPE)aom_read_symbol(r, partition_cdf, PARTITION_TYPES,
+ ACCT_STR);
+#else
+ p = (PARTITION_TYPE)aom_read_tree(r, av1_partition_tree, probs, ACCT_STR);
+#endif
+ else
+#if CONFIG_EC_MULTISYMBOL
+ p = (PARTITION_TYPE)aom_read_symbol(r, partition_cdf, EXT_PARTITION_TYPES,
+ ACCT_STR);
+#else
+ p = (PARTITION_TYPE)aom_read_tree(r, av1_ext_partition_tree, probs,
+ ACCT_STR);
+#endif
+#else
+#if CONFIG_EC_MULTISYMBOL
+ p = (PARTITION_TYPE)aom_read_symbol(r, partition_cdf, PARTITION_TYPES,
+ ACCT_STR);
+#else
+ p = (PARTITION_TYPE)aom_read_tree(r, av1_partition_tree, probs, ACCT_STR);
+#endif
+#endif // CONFIG_EXT_PARTITION_TYPES
+ else if (!has_rows && has_cols)
+ p = aom_read(r, probs[1], ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ;
+ else if (has_rows && !has_cols)
+ p = aom_read(r, probs[2], ACCT_STR) ? PARTITION_SPLIT : PARTITION_VERT;
+ else
+ p = PARTITION_SPLIT;
+
+ if (counts) ++counts->partition[ctx][p];
+
+ return p;
+}
+
+#if CONFIG_SUPERTX
+static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+ aom_reader *r) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int ctx = av1_get_skip_context(xd);
+ const int skip = aom_read(r, cm->fc->skip_probs[ctx], ACCT_STR);
+ FRAME_COUNTS *counts = xd->counts;
+ if (counts) ++counts->skip[ctx][skip];
+ return skip;
+ }
+}
+#endif // CONFIG_SUPERTX
+
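+// Recursively decodes one partition of the superblock: reads the partition
+// type, decodes each sub-block (recursing on PARTITION_SPLIT), then updates
+// the partition context and, when configured, SUPERTX and CDEF side info.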
+// TODO(slavarnway): eliminate bsize and subsize in future commits
+static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
+ int mi_row, int mi_col, aom_reader *r,
+ BLOCK_SIZE bsize, int n4x4_l2) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int n8x8_l2 = n4x4_l2 - 1;
+ const int num_8x8_wh = mi_size_wide[bsize];
+ const int hbs = num_8x8_wh >> 1;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+#if CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+#if CONFIG_SUPERTX
+ const int read_token = !supertx_enabled;
+ int skip = 0;
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const TileInfo *const tile = &xd->tile;
+ int txfm = DCT_DCT;
+#endif // CONFIG_SUPERTX
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
+ : read_partition(cm, xd, mi_row, mi_col, r,
+ has_rows, has_cols, bsize);
+ subsize = subsize_lookup[partition][bsize]; // get_subsize(bsize, partition);
+
+#if CONFIG_PVQ
+ assert(partition < PARTITION_TYPES);
+ assert(subsize < BLOCK_SIZES);
+#endif
+#if CONFIG_SUPERTX
+ if (!frame_is_intra_only(cm) && partition != PARTITION_NONE &&
+ bsize <= MAX_SUPERTX_BLOCK_SIZE && !supertx_enabled && !xd->lossless[0]) {
+ const int supertx_context = partition_supertx_context_lookup[partition];
+ supertx_enabled = aom_read(
+ r, cm->fc->supertx_prob[supertx_context][supertx_size], ACCT_STR);
+ if (xd->counts)
+ xd->counts->supertx[supertx_context][supertx_size][supertx_enabled]++;
+#if CONFIG_VAR_TX
+ if (supertx_enabled) xd->supertx_size = supertx_size;
+#endif
+ }
+#endif // CONFIG_SUPERTX
+ if (!hbs && !unify_bsize) {
+ // calculate bmode block dimensions (log 2)
+ xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
+ xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize);
+ break;
+ case PARTITION_HORZ:
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize);
+ if (has_rows)
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row + hbs, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize);
+ break;
+ case PARTITION_VERT:
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize);
+ if (has_cols)
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col + hbs, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize);
+ break;
+ case PARTITION_SPLIT:
+ decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r, subsize, n8x8_l2);
+ decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col + hbs, r, subsize, n8x8_l2);
+ decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row + hbs, mi_col, r, subsize, n8x8_l2);
+ decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row + hbs, mi_col + hbs, r, subsize, n8x8_l2);
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col, r, partition, bsize2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col + hbs, r, partition, bsize2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row + hbs, mi_col, r, partition, subsize);
+ break;
+ case PARTITION_HORZ_B:
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col, r, partition, subsize);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row + hbs, mi_col, r, partition, bsize2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row + hbs, mi_col + hbs, r, partition, bsize2);
+ break;
+ case PARTITION_VERT_A:
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col, r, partition, bsize2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row + hbs, mi_col, r, partition, bsize2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col + hbs, r, partition, subsize);
+ break;
+ case PARTITION_VERT_B:
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col, r, partition, subsize);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col + hbs, r, partition, bsize2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row + hbs, mi_col + hbs, r, partition, bsize2);
+ break;
+#endif
+ default: assert(0 && "Invalid partition type");
+ }
+ }
+
+#if CONFIG_SUPERTX
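+ // SUPERTX: the whole partition shares one transform. Re-derive the segment
+ // id, read the skip flag and (if not skipped) the shared transform type,
+ // build the prediction for the full block, then reconstruct the residue.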
+ if (supertx_enabled && read_token) {
+ uint8_t *dst_buf[3];
+ int dst_stride[3], i;
+ int offset = mi_row * cm->mi_stride + mi_col;
+
+ set_segment_id_supertx(cm, mi_row, mi_col, bsize);
+
+#if CONFIG_DELTA_Q
+ if (cm->delta_q_present_flag) {
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ xd->plane[0].seg_dequant[i][0] =
+ av1_dc_quant(xd->current_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ xd->plane[0].seg_dequant[i][1] =
+ av1_ac_quant(xd->current_qindex, 0, cm->bit_depth);
+ xd->plane[1].seg_dequant[i][0] =
+ av1_dc_quant(xd->current_qindex, cm->uv_dc_delta_q, cm->bit_depth);
+ xd->plane[1].seg_dequant[i][1] =
+ av1_ac_quant(xd->current_qindex, cm->uv_ac_delta_q, cm->bit_depth);
+ xd->plane[2].seg_dequant[i][0] =
+ av1_dc_quant(xd->current_qindex, cm->uv_dc_delta_q, cm->bit_depth);
+ xd->plane[2].seg_dequant[i][1] =
+ av1_ac_quant(xd->current_qindex, cm->uv_ac_delta_q, cm->bit_depth);
+ }
+ }
+#endif
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = cm->mi + offset;
+ set_mi_row_col(xd, tile, mi_row, mi_size_high[bsize], mi_col,
+ mi_size_wide[bsize],
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+ set_skip_context(xd, mi_row, mi_col);
+ skip = read_skip(cm, xd, xd->mi[0]->mbmi.segment_id_supertx, r);
+ if (skip) {
+ reset_skip_context(xd, bsize);
+ } else {
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) >
+ 1) {
+ const int eset =
+ get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used);
+ if (eset > 0) {
+ txfm = aom_read_tree(r, av1_ext_tx_inter_tree[eset],
+ cm->fc->inter_ext_tx_prob[eset][supertx_size],
+ ACCT_STR);
+ if (xd->counts) ++xd->counts->inter_ext_tx[eset][supertx_size][txfm];
+ }
+ }
+#else
+ if (supertx_size < TX_32X32) {
+ txfm = aom_read_tree(r, av1_ext_tx_tree,
+ cm->fc->inter_ext_tx_prob[supertx_size], ACCT_STR);
+ if (xd->counts) ++xd->counts->inter_ext_tx[supertx_size][txfm];
+ }
+#endif // CONFIG_EXT_TX
+ }
+
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ dst_buf[i] = xd->plane[i].dst.buf;
+ dst_stride[i] = xd->plane[i].dst.stride;
+ }
+ dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col, mi_row, mi_col, bsize,
+ bsize, dst_buf, dst_stride);
+
+ if (!skip) {
+ int eobtotal = 0;
+ MB_MODE_INFO *mbmi;
+ set_offsets_topblock(cm, xd, tile, bsize, mi_row, mi_col);
+ mbmi = &xd->mi[0]->mbmi;
+ mbmi->tx_type = txfm;
+ assert(mbmi->segment_id_supertx != MAX_SEGMENTS);
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ const struct macroblockd_plane *const pd = &xd->plane[i];
+ int row, col;
+ const TX_SIZE tx_size = get_tx_size(i, xd);
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, i);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, i);
+
+ for (row = 0; row < max_blocks_high; row += stepr)
+ for (col = 0; col < max_blocks_wide; col += stepc)
+ eobtotal += reconstruct_inter_block(
+ cm, xd, r, mbmi->segment_id_supertx, i, row, col, tx_size);
+ }
+ if ((unify_bsize || !(subsize < BLOCK_8X8)) && eobtotal == 0) skip = 1;
+ }
+ set_param_topblock(cm, xd, bsize, mi_row, mi_col, txfm, skip);
+ }
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_EXT_PARTITION_TYPES
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
+ // update partition context
+ if (bsize >= BLOCK_8X8 &&
+ (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_CDEF
+#if CONFIG_EXT_PARTITION
+ if (cm->sb_size == BLOCK_128X128 && bsize == BLOCK_128X128) {
+ if (!sb_all_skip(cm, mi_row, mi_col)) {
+ cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.cdef_strength =
+ aom_read_literal(r, cm->cdef_bits, ACCT_STR);
+ } else {
+ cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.cdef_strength =
+ 0;
+ }
+ } else if (cm->sb_size == BLOCK_64X64 && bsize == BLOCK_64X64) {
+#else
+ if (bsize == BLOCK_64X64) {
+#endif
+ if (!sb_all_skip(cm, mi_row, mi_col)) {
+ cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.cdef_strength =
+ aom_read_literal(r, cm->cdef_bits, ACCT_STR);
+ } else {
+ cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.cdef_strength =
+ -1;
+ }
+ }
+#endif // CONFIG_CDEF
+}
+
+static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end,
+ const size_t read_size,
+ struct aom_internal_error_info *error_info,
+ aom_reader *r,
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ int window_size,
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+ aom_decrypt_cb decrypt_cb, void *decrypt_state) {
+ // Validate the calculated partition length. If the buffer described by
+ // the partition can't be fully read, throw an error.
+ if (!read_is_valid(data, read_size, data_end))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ r->window_size = window_size;
+#endif
+ if (aom_reader_init(r, data, read_size, decrypt_cb, decrypt_state))
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder %d", 1);
+}
+
+#if !CONFIG_PVQ && !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) && !CONFIG_LV_MAP
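+// Coefficient probability updates: one bit signals whether any updates
+// follow; if set, each probability node (up to node_limit) may be
+// diff-updated. read_coef_probs() repeats this for every transform size.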
+static void read_coef_probs_common(av1_coeff_probs_model *coef_probs,
+ aom_reader *r) {
+ int i, j, k, l, m;
+#if CONFIG_EC_ADAPT
+ const int node_limit = UNCONSTRAINED_NODES - 1;
+#else
+ const int node_limit = UNCONSTRAINED_NODES;
+#endif
+
+ if (aom_read_bit(r, ACCT_STR))
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
+ for (m = 0; m < node_limit; ++m)
+ av1_diff_update_prob(r, &coef_probs[i][j][k][l][m], ACCT_STR);
+}
+
+static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, aom_reader *r) {
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+ for (tx_size = 0; tx_size <= max_tx_size; ++tx_size)
+ read_coef_probs_common(fc->coef_probs[tx_size], r);
+}
+#endif
+
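+// Parses the segmentation syntax: the enable bit, the optional map and
+// temporal-update flags, and per-segment feature data (with a sign bit for
+// signed features).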
+static void setup_segmentation(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *rb) {
+ struct segmentation *const seg = &cm->seg;
+ int i, j;
+
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ seg->enabled = aom_rb_read_bit(rb);
+ if (!seg->enabled) return;
+
+ // Segmentation map update
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+ seg->update_map = 1;
+ } else {
+ seg->update_map = aom_rb_read_bit(rb);
+ }
+ if (seg->update_map) {
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+ seg->temporal_update = 0;
+ } else {
+ seg->temporal_update = aom_rb_read_bit(rb);
+ }
+ }
+
+ // Segmentation data update
+ seg->update_data = aom_rb_read_bit(rb);
+ if (seg->update_data) {
+ seg->abs_delta = aom_rb_read_bit(rb);
+
+ av1_clearall_segfeatures(seg);
+
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ for (j = 0; j < SEG_LVL_MAX; j++) {
+ int data = 0;
+ const int feature_enabled = aom_rb_read_bit(rb);
+ if (feature_enabled) {
+ av1_enable_segfeature(seg, i, j);
+ data = decode_unsigned_max(rb, av1_seg_feature_data_max(j));
+ if (av1_is_segfeature_signed(j))
+ data = aom_rb_read_bit(rb) ? -data : data;
+ }
+ av1_set_segdata(seg, i, j, data);
+ }
+ }
+ }
+}
+
+#if CONFIG_LOOP_RESTORATION
+static void decode_restoration_mode(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb) {
+ int p;
+ RestorationInfo *rsi = &cm->rst_info[0];
+ if (aom_rb_read_bit(rb)) {
+ rsi->frame_restoration_type =
+ aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER;
+ } else {
+ rsi->frame_restoration_type =
+ aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE;
+ }
+ for (p = 1; p < MAX_MB_PLANE; ++p) {
+ cm->rst_info[p].frame_restoration_type =
+ aom_rb_read_bit(rb) ? RESTORE_WIENER : RESTORE_NONE;
+ }
+
+ cm->rst_info[0].restoration_tilesize = RESTORATION_TILESIZE_MAX;
+ cm->rst_info[1].restoration_tilesize = RESTORATION_TILESIZE_MAX;
+ cm->rst_info[2].restoration_tilesize = RESTORATION_TILESIZE_MAX;
+ if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+ rsi = &cm->rst_info[0];
+ rsi->restoration_tilesize >>= aom_rb_read_bit(rb);
+ if (rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX) {
+ rsi->restoration_tilesize >>= aom_rb_read_bit(rb);
+ }
+ cm->rst_info[1].restoration_tilesize = cm->rst_info[0].restoration_tilesize;
+ cm->rst_info[2].restoration_tilesize = cm->rst_info[0].restoration_tilesize;
+ }
+}
+
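+// Wiener filter taps are symmetric, so only taps 0..2 of each 7-tap
+// (vertical and horizontal) filter are coded, as subexp residuals relative
+// to the previously decoded filter; the mirrored and central taps are
+// derived.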
+static void read_wiener_filter(WienerInfo *wiener_info,
+ WienerInfo *ref_wiener_info, aom_reader *rb) {
+ wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV) +
+ WIENER_FILT_TAP0_MINV;
+ wiener_info->vfilter[1] = wiener_info->vfilter[WIENER_WIN - 2] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV) +
+ WIENER_FILT_TAP1_MINV;
+ wiener_info->vfilter[2] = wiener_info->vfilter[WIENER_WIN - 3] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV) +
+ WIENER_FILT_TAP2_MINV;
+ // The central element has an implicit +WIENER_FILT_STEP
+ wiener_info->vfilter[WIENER_HALFWIN] =
+ -2 * (wiener_info->vfilter[0] + wiener_info->vfilter[1] +
+ wiener_info->vfilter[2]);
+
+ wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV) +
+ WIENER_FILT_TAP0_MINV;
+ wiener_info->hfilter[1] = wiener_info->hfilter[WIENER_WIN - 2] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV) +
+ WIENER_FILT_TAP1_MINV;
+ wiener_info->hfilter[2] = wiener_info->hfilter[WIENER_WIN - 3] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV) +
+ WIENER_FILT_TAP2_MINV;
+ // The central element has an implicit +WIENER_FILT_STEP
+ wiener_info->hfilter[WIENER_HALFWIN] =
+ -2 * (wiener_info->hfilter[0] + wiener_info->hfilter[1] +
+ wiener_info->hfilter[2]);
+ memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
+}
+
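+// Self-guided projection filter: an index into the parameter table plus two
+// projection coefficients coded relative to the previous filter.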
+static void read_sgrproj_filter(SgrprojInfo *sgrproj_info,
+ SgrprojInfo *ref_sgrproj_info, aom_reader *rb) {
+ sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR);
+ sgrproj_info->xqd[0] =
+ aom_read_primitive_refsubexpfin(
+ rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0) +
+ SGRPROJ_PRJ_MIN0;
+ sgrproj_info->xqd[1] =
+ aom_read_primitive_refsubexpfin(
+ rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1) +
+ SGRPROJ_PRJ_MIN1;
+ memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
+}
+
+static void decode_restoration(AV1_COMMON *cm, aom_reader *rb) {
+ int i, p;
+ SgrprojInfo ref_sgrproj_info;
+ WienerInfo ref_wiener_info;
+ set_default_wiener(&ref_wiener_info);
+ set_default_sgrproj(&ref_sgrproj_info);
+ const int ntiles = av1_get_rest_ntiles(cm->width, cm->height,
+ cm->rst_info[0].restoration_tilesize,
+ NULL, NULL, NULL, NULL);
+ const int ntiles_uv = av1_get_rest_ntiles(
+ ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x),
+ ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y),
+ cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL, NULL);
+ RestorationInfo *rsi = &cm->rst_info[0];
+ if (rsi->frame_restoration_type != RESTORE_NONE) {
+ if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
+ for (i = 0; i < ntiles; ++i) {
+ rsi->restoration_type[i] =
+ aom_read_tree(rb, av1_switchable_restore_tree,
+ cm->fc->switchable_restore_prob, ACCT_STR);
+ if (rsi->restoration_type[i] == RESTORE_WIENER) {
+ read_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, rb);
+ } else if (rsi->restoration_type[i] == RESTORE_SGRPROJ) {
+ read_sgrproj_filter(&rsi->sgrproj_info[i], &ref_sgrproj_info, rb);
+ }
+ }
+ } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
+ for (i = 0; i < ntiles; ++i) {
+ if (aom_read(rb, RESTORE_NONE_WIENER_PROB, ACCT_STR)) {
+ rsi->restoration_type[i] = RESTORE_WIENER;
+ read_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, rb);
+ } else {
+ rsi->restoration_type[i] = RESTORE_NONE;
+ }
+ }
+ } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
+ for (i = 0; i < ntiles; ++i) {
+ if (aom_read(rb, RESTORE_NONE_SGRPROJ_PROB, ACCT_STR)) {
+ rsi->restoration_type[i] = RESTORE_SGRPROJ;
+ read_sgrproj_filter(&rsi->sgrproj_info[i], &ref_sgrproj_info, rb);
+ } else {
+ rsi->restoration_type[i] = RESTORE_NONE;
+ }
+ }
+ }
+ }
+ for (p = 1; p < MAX_MB_PLANE; ++p) {
+ set_default_wiener(&ref_wiener_info);
+ rsi = &cm->rst_info[p];
+ if (rsi->frame_restoration_type == RESTORE_WIENER) {
+ for (i = 0; i < ntiles_uv; ++i) {
+ if (ntiles_uv > 1)
+ rsi->restoration_type[i] =
+ aom_read(rb, RESTORE_NONE_WIENER_PROB, ACCT_STR) ? RESTORE_WIENER
+ : RESTORE_NONE;
+ else
+ rsi->restoration_type[i] = RESTORE_WIENER;
+ if (rsi->restoration_type[i] == RESTORE_WIENER) {
+ read_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, rb);
+ }
+ }
+ }
+ }
+}
+#endif // CONFIG_LOOP_RESTORATION
+
+static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ struct loopfilter *lf = &cm->lf;
+ lf->filter_level = aom_rb_read_literal(rb, 6);
+ lf->sharpness_level = aom_rb_read_literal(rb, 3);
+
+ // Read in loop filter deltas applied at the MB level based on mode or ref
+ // frame.
+ lf->mode_ref_delta_update = 0;
+
+ lf->mode_ref_delta_enabled = aom_rb_read_bit(rb);
+ if (lf->mode_ref_delta_enabled) {
+ lf->mode_ref_delta_update = aom_rb_read_bit(rb);
+ if (lf->mode_ref_delta_update) {
+ int i;
+
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; i++)
+ if (aom_rb_read_bit(rb))
+ lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
+
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+ if (aom_rb_read_bit(rb))
+ lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
+ }
+ }
+}
+
+#if CONFIG_CDEF
+static void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ int i;
+ cm->cdef_dering_damping = aom_rb_read_literal(rb, 1) + 5;
+ cm->cdef_clpf_damping = aom_rb_read_literal(rb, 2) + 3;
+ cm->cdef_bits = aom_rb_read_literal(rb, 2);
+ cm->nb_cdef_strengths = 1 << cm->cdef_bits;
+ for (i = 0; i < cm->nb_cdef_strengths; i++) {
+ cm->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
+ cm->cdef_uv_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
+ }
+}
+#endif // CONFIG_CDEF
+
+static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) {
+ return aom_rb_read_bit(rb) ? aom_rb_read_inv_signed_literal(rb, 6) : 0;
+}
+
+static void setup_quantization(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *rb) {
+ cm->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
+ cm->y_dc_delta_q = read_delta_q(rb);
+ cm->uv_dc_delta_q = read_delta_q(rb);
+ cm->uv_ac_delta_q = read_delta_q(rb);
+ cm->dequant_bit_depth = cm->bit_depth;
+#if CONFIG_AOM_QM
+ cm->using_qmatrix = aom_rb_read_bit(rb);
+ if (cm->using_qmatrix) {
+ cm->min_qmlevel = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+ cm->max_qmlevel = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+ } else {
+ cm->min_qmlevel = 0;
+ cm->max_qmlevel = 0;
+ }
+#endif
+}
+
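+// Precomputes the per-segment dequantization tables from the base qindex and
+// the delta-q values read above; with segmentation disabled only entry 0 is
+// filled in.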
+static void setup_segmentation_dequant(AV1_COMMON *const cm) {
+ // Build y/uv dequant values based on segmentation.
+ int i = 0;
+#if CONFIG_AOM_QM
+ int lossless;
+ int j = 0;
+ int qmlevel;
+ int using_qm = cm->using_qmatrix;
+ int minqm = cm->min_qmlevel;
+ int maxqm = cm->max_qmlevel;
+#endif
+#if CONFIG_NEW_QUANT
+ int b;
+ int dq;
+#endif // CONFIG_NEW_QUANT
+ if (cm->seg.enabled) {
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = av1_get_qindex(&cm->seg, i, cm->base_qindex);
+ cm->y_dequant[i][0] =
+ av1_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth);
+ cm->y_dequant[i][1] = av1_ac_quant(qindex, 0, cm->bit_depth);
+ cm->uv_dequant[i][0] =
+ av1_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth);
+ cm->uv_dequant[i][1] =
+ av1_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth);
+#if CONFIG_AOM_QM
+ lossless = qindex == 0 && cm->y_dc_delta_q == 0 &&
+ cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+ // NB: depends on base index so there is only 1 set per frame
+ // No quant weighting when lossless or signalled not using QM
+ qmlevel = (lossless || using_qm == 0)
+ ? NUM_QM_LEVELS - 1
+ : aom_get_qmlevel(cm->base_qindex, minqm, maxqm);
+ for (j = 0; j < TX_SIZES; ++j) {
+ cm->y_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 0, j, 1);
+ cm->y_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 0, j, 0);
+ cm->uv_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 1, j, 1);
+ cm->uv_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 1, j, 0);
+ }
+#endif // CONFIG_AOM_QM
+#if CONFIG_NEW_QUANT
+ for (dq = 0; dq < QUANT_PROFILES; dq++) {
+ for (b = 0; b < COEF_BANDS; ++b) {
+ av1_get_dequant_val_nuq(cm->y_dequant[i][b != 0], b,
+ cm->y_dequant_nuq[i][dq][b], NULL, dq);
+ av1_get_dequant_val_nuq(cm->uv_dequant[i][b != 0], b,
+ cm->uv_dequant_nuq[i][dq][b], NULL, dq);
+ }
+ }
+#endif // CONFIG_NEW_QUANT
+ }
+ } else {
+ const int qindex = cm->base_qindex;
+ // When segmentation is disabled, only the first value is used. The
+ // remaining entries are don't-cares.
+ cm->y_dequant[0][0] = av1_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth);
+ cm->y_dequant[0][1] = av1_ac_quant(qindex, 0, cm->bit_depth);
+ cm->uv_dequant[0][0] =
+ av1_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth);
+ cm->uv_dequant[0][1] =
+ av1_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth);
+#if CONFIG_AOM_QM
+ lossless = qindex == 0 && cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 &&
+ cm->uv_ac_delta_q == 0;
+ // No quant weighting when lossless or signalled not using QM
+ qmlevel = (lossless || using_qm == 0)
+ ? NUM_QM_LEVELS - 1
+ : aom_get_qmlevel(cm->base_qindex, minqm, maxqm);
+ for (j = 0; j < TX_SIZES; ++j) {
+ cm->y_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 0, j, 1);
+ cm->y_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 0, j, 0);
+ cm->uv_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 1, j, 1);
+ cm->uv_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 1, j, 0);
+ }
+#endif
+#if CONFIG_NEW_QUANT
+ for (dq = 0; dq < QUANT_PROFILES; dq++) {
+ for (b = 0; b < COEF_BANDS; ++b) {
+ av1_get_dequant_val_nuq(cm->y_dequant[0][b != 0], b,
+ cm->y_dequant_nuq[0][dq][b], NULL, dq);
+ av1_get_dequant_val_nuq(cm->uv_dequant[0][b != 0], b,
+ cm->uv_dequant_nuq[0][dq][b], NULL, dq);
+ }
+ }
+#endif // CONFIG_NEW_QUANT
+ }
+}
+
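+// One bit selects SWITCHABLE; otherwise the filter index is coded literally.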
+static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) {
+ return aom_rb_read_bit(rb) ? SWITCHABLE
+ : aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS);
+}
+
+static void setup_render_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ cm->render_width = cm->width;
+ cm->render_height = cm->height;
+ if (aom_rb_read_bit(rb))
+ av1_read_frame_size(rb, &cm->render_width, &cm->render_height);
+}
+
+#if CONFIG_FRAME_SUPERRES
+// TODO(afergs): make "struct aom_read_bit_buffer *const rb"?
+static void setup_superres_size(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *rb, int *width,
+ int *height) {
+ // TODO(afergs): Test this behaviour
+ // Frame superres is probably incompatible with this render resolution
+ assert(cm->width == cm->render_width && cm->height == cm->render_height);
+
+ cm->superres_width = cm->width;
+ cm->superres_height = cm->height;
+ if (aom_rb_read_bit(rb)) {
+ cm->superres_scale_numerator =
+ (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS);
+ cm->superres_scale_numerator += SUPERRES_SCALE_NUMERATOR_MIN;
+ // Don't edit cm->width or cm->height directly, or the buffers won't get
+ // resized correctly
+ // TODO(afergs): Should the render resolution not be modified? It's the same
+ // by default (i.e. when it isn't sent)...
+ // resize_context_buffers() will change cm->width to equal cm->render_width,
+ // then they'll be the same again
+ *width = cm->render_width =
+ cm->width * cm->superres_scale_numerator / SUPERRES_SCALE_DENOMINATOR;
+ *height = cm->render_height =
+ cm->height * cm->superres_scale_numerator / SUPERRES_SCALE_DENOMINATOR;
+ } else {
+ // 1:1 scaling - i.e. no scaling, scale not provided
+ cm->superres_scale_numerator = SUPERRES_SCALE_DENOMINATOR;
+ }
+}
+#endif // CONFIG_FRAME_SUPERRES
+
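+// Reallocates the per-frame MV buffer to match the current mi grid; called
+// when the frame grows beyond the previously allocated dimensions.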
+static void resize_mv_buffer(AV1_COMMON *cm) {
+ aom_free(cm->cur_frame->mvs);
+ cm->cur_frame->mi_rows = cm->mi_rows;
+ cm->cur_frame->mi_cols = cm->mi_cols;
+ CHECK_MEM_ERROR(cm, cm->cur_frame->mvs,
+ (MV_REF *)aom_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(*cm->cur_frame->mvs)));
+}
+
+static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
+#if CONFIG_SIZE_LIMIT
+ if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Dimensions of %dx%d beyond allowed size of %dx%d.",
+ width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
+#endif
+ if (cm->width != width || cm->height != height) {
+ const int new_mi_rows =
+ ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+ const int new_mi_cols =
+ ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
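+    // e.g. assuming MI_SIZE_LOG2 == 3 (an assumption for illustration):
+    // height == 1082 aligns up to 1088, giving 1088 >> 3 == 136 MI rows.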
+
+ // Allocations in av1_alloc_context_buffers() depend on individual
+ // dimensions as well as the overall size.
+ if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) {
+ if (av1_alloc_context_buffers(cm, width, height))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ } else {
+ av1_set_mb_mi(cm, width, height);
+ }
+ av1_init_context_buffers(cm);
+ cm->width = width;
+ cm->height = height;
+ }
+ if (cm->cur_frame->mvs == NULL || cm->mi_rows > cm->cur_frame->mi_rows ||
+ cm->mi_cols > cm->cur_frame->mi_cols) {
+ resize_mv_buffer(cm);
+ }
+}
+
+static void setup_frame_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ int width, height;
+ BufferPool *const pool = cm->buffer_pool;
+ av1_read_frame_size(rb, &width, &height);
+ setup_render_size(cm, rb);
+#if CONFIG_FRAME_SUPERRES
+ setup_superres_size(cm, rb, &width, &height);
+#endif // CONFIG_FRAME_SUPERRES
+ resize_context_buffers(cm, width, height);
+
+ lock_buffer_pool(pool);
+ if (aom_realloc_frame_buffer(
+ get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
+ cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+ &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+ pool->cb_priv)) {
+ unlock_buffer_pool(pool);
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ unlock_buffer_pool(pool);
+
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+ pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
+ pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
+ pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
+}
+
+static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth,
+ int ref_xss, int ref_yss,
+ aom_bit_depth_t this_bit_depth,
+ int this_xss, int this_yss) {
+ return ref_bit_depth == this_bit_depth && ref_xss == this_xss &&
+ ref_yss == this_yss;
+}
+
+static void setup_frame_size_with_refs(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb) {
+ int width, height;
+ int found = 0, i;
+ int has_valid_ref_frame = 0;
+ BufferPool *const pool = cm->buffer_pool;
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ if (aom_rb_read_bit(rb)) {
+ YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
+ width = buf->y_crop_width;
+ height = buf->y_crop_height;
+ cm->render_width = buf->render_width;
+ cm->render_height = buf->render_height;
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ av1_read_frame_size(rb, &width, &height);
+ setup_render_size(cm, rb);
+ }
+
+ if (width <= 0 || height <= 0)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid frame size");
+
+ // Check to make sure at least one of frames that this frame references
+ // has valid dimensions.
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefBuffer *const ref_frame = &cm->frame_refs[i];
+ has_valid_ref_frame |=
+ valid_ref_frame_size(ref_frame->buf->y_crop_width,
+ ref_frame->buf->y_crop_height, width, height);
+ }
+ if (!has_valid_ref_frame)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Referenced frame has invalid size");
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefBuffer *const ref_frame = &cm->frame_refs[i];
+ if (!valid_ref_frame_img_fmt(ref_frame->buf->bit_depth,
+ ref_frame->buf->subsampling_x,
+ ref_frame->buf->subsampling_y, cm->bit_depth,
+ cm->subsampling_x, cm->subsampling_y))
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Referenced frame has incompatible color format");
+ }
+
+ resize_context_buffers(cm, width, height);
+
+ lock_buffer_pool(pool);
+ if (aom_realloc_frame_buffer(
+ get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
+ cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+ &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+ pool->cb_priv)) {
+ unlock_buffer_pool(pool);
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ unlock_buffer_pool(pool);
+
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+ pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
+ pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
+ pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
+}
+
+static void read_tile_info(AV1Decoder *const pbi,
+ struct aom_read_bit_buffer *const rb) {
+ AV1_COMMON *const cm = &pbi->common;
+#if CONFIG_EXT_TILE
+ cm->tile_encoding_mode = aom_rb_read_literal(rb, 1);
+// Read the tile width/height
+#if CONFIG_EXT_PARTITION
+ if (cm->sb_size == BLOCK_128X128) {
+ cm->tile_width = aom_rb_read_literal(rb, 5) + 1;
+ cm->tile_height = aom_rb_read_literal(rb, 5) + 1;
+ } else
+#endif // CONFIG_EXT_PARTITION
+ {
+ cm->tile_width = aom_rb_read_literal(rb, 6) + 1;
+ cm->tile_height = aom_rb_read_literal(rb, 6) + 1;
+ }
+
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ cm->loop_filter_across_tiles_enabled = aom_rb_read_bit(rb);
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+
+ cm->tile_width <<= cm->mib_size_log2;
+ cm->tile_height <<= cm->mib_size_log2;
+
+ cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
+ cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
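+  // e.g. if mib_size_log2 == 4 (an assumption; it depends on the
+  // superblock/MI configuration), a coded width of 2 superblocks becomes
+  // 2 << 4 == 32 MI units above, then is clamped to mi_cols here.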
+
+ // Get the number of tiles
+ cm->tile_cols = 1;
+ while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols;
+
+ cm->tile_rows = 1;
+ while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows;
+
+ if (cm->tile_cols * cm->tile_rows > 1) {
+ // Read the number of bytes used to store tile size
+ pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+ pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+ }
+
+#if CONFIG_DEPENDENT_HORZTILES
+ if (cm->tile_rows <= 1)
+ cm->dependent_horz_tiles = aom_rb_read_bit(rb);
+ else
+ cm->dependent_horz_tiles = 0;
+#endif
+#else
+ int min_log2_tile_cols, max_log2_tile_cols, max_ones;
+ av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+
+ // columns
+ max_ones = max_log2_tile_cols - min_log2_tile_cols;
+ cm->log2_tile_cols = min_log2_tile_cols;
+ while (max_ones-- && aom_rb_read_bit(rb)) cm->log2_tile_cols++;
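+  // This is a unary code: e.g. with min_log2_tile_cols == 0 and
+  // max_log2_tile_cols == 2, the bit strings "0", "10" and "11" decode to
+  // log2_tile_cols values of 0, 1 and 2 respectively.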
+
+ if (cm->log2_tile_cols > 6)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid number of tile columns");
+
+ // rows
+ cm->log2_tile_rows = aom_rb_read_bit(rb);
+ if (cm->log2_tile_rows) cm->log2_tile_rows += aom_rb_read_bit(rb);
+#if CONFIG_DEPENDENT_HORZTILES
+ if (cm->log2_tile_rows != 0)
+ cm->dependent_horz_tiles = aom_rb_read_bit(rb);
+ else
+ cm->dependent_horz_tiles = 0;
+#endif
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ cm->loop_filter_across_tiles_enabled = aom_rb_read_bit(rb);
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+
+ cm->tile_cols = 1 << cm->log2_tile_cols;
+ cm->tile_rows = 1 << cm->log2_tile_rows;
+
+ cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+ cm->tile_width >>= cm->log2_tile_cols;
+ cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+ cm->tile_height >>= cm->log2_tile_rows;
+
+ // round to integer multiples of superblock size
+ cm->tile_width = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2);
+ cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2);
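+  // e.g. (hypothetical values): mi_cols == 240 with MAX_MIB_SIZE_LOG2 == 5
+  // and log2_tile_cols == 1 gives ((240 aligned up to 256) >> 1) == 128 MI
+  // units, already a multiple of 32, so tile_width stays 128.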
+
+// tile size magnitude
+#if !CONFIG_TILE_GROUPS
+ if (cm->tile_rows > 1 || cm->tile_cols > 1)
+#endif
+ pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_TILE_GROUPS
+ // Store an index to the location of the tile group information
+ pbi->tg_size_bit_offset = rb->bit_offset;
+ pbi->tg_size = 1 << (cm->log2_tile_rows + cm->log2_tile_cols);
+ if (cm->log2_tile_rows + cm->log2_tile_cols > 0) {
+ pbi->tg_start =
+ aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+ pbi->tg_size =
+ 1 + aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+ }
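+  // e.g. for a 2x2 tile layout the two literals use 2 bits each; tg_start ==
+  // 2 with a coded size field of 1 (tg_size == 2) describes a tile group
+  // holding tiles 2 and 3.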
+#endif
+}
+
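+// Usage example: mem_get_varsize(p, 2) with p[0] == 0x34 and p[1] == 0x12
+// returns 0x1234; the tile sizes are stored little-endian.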
+static int mem_get_varsize(const uint8_t *src, int sz) {
+ switch (sz) {
+ case 1: return src[0];
+ case 2: return mem_get_le16(src);
+ case 3: return mem_get_le24(src);
+ case 4: return mem_get_le32(src);
+ default: assert("Invalid size" && 0); return -1;
+ }
+}
+
+#if CONFIG_EXT_TILE
+// Reads the size of the next tile, sets up its buffer (resolving VR copy mode
+// against earlier tiles in the same column), and advances '*data' past it.
+static void get_tile_buffer(const uint8_t *const data_end,
+ struct aom_internal_error_info *error_info,
+ const uint8_t **data, aom_decrypt_cb decrypt_cb,
+ void *decrypt_state,
+ TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
+ int tile_size_bytes, int col, int row,
+ unsigned int tile_encoding_mode) {
+ size_t size;
+
+ size_t copy_size = 0;
+ const uint8_t *copy_data = NULL;
+
+ if (!read_is_valid(*data, tile_size_bytes, data_end))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+ if (decrypt_cb) {
+ uint8_t be_data[4];
+ decrypt_cb(decrypt_state, *data, be_data, tile_size_bytes);
+
+    // Only read the number of bytes given by cm->tile_size_bytes.
+ size = mem_get_varsize(be_data, tile_size_bytes);
+ } else {
+ size = mem_get_varsize(*data, tile_size_bytes);
+ }
+
+  // If cm->tile_encoding_mode == 1 (i.e. TILE_VR), then the top bit of the
+  // tile header indicates copy mode.
+ if (tile_encoding_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) {
+ // The remaining bits in the top byte signal the row offset
+ int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f;
+
+    // Currently, a tile may only reference tiles in the same column.
+ copy_data = tile_buffers[row - offset][col].data;
+ copy_size = tile_buffers[row - offset][col].size;
+ size = 0;
+ }
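+  // e.g. with tile_size_bytes == 2 a decoded size field of 0x8300 has the
+  // copy bit (bit 15) set and a row offset of (0x83 & 0x7f) == 3, so the
+  // tile reuses the buffer of the tile 3 rows above in the same column.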
+
+ *data += tile_size_bytes;
+
+ if (size > (size_t)(data_end - *data))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile size");
+
+ if (size > 0) {
+ tile_buffers[row][col].data = *data;
+ tile_buffers[row][col].size = size;
+ } else {
+ tile_buffers[row][col].data = copy_data;
+ tile_buffers[row][col].size = copy_size;
+ }
+
+ *data += size;
+
+ tile_buffers[row][col].raw_data_end = *data;
+}
+
+static void get_tile_buffers(
+ AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
+ TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int have_tiles = tile_cols * tile_rows > 1;
+
+ if (!have_tiles) {
+ const size_t tile_size = data_end - data;
+ tile_buffers[0][0].data = data;
+ tile_buffers[0][0].size = tile_size;
+ tile_buffers[0][0].raw_data_end = NULL;
+ } else {
+ // We locate only the tile buffers that are required, which are the ones
+ // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always
+ // need the last (bottom right) tile buffer, as we need to know where the
+ // end of the compressed frame buffer is for proper superframe decoding.
+
+ const uint8_t *tile_col_data_end[MAX_TILE_COLS];
+ const uint8_t *const data_start = data;
+
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int tile_rows_start = single_row ? dec_tile_row : 0;
+ const int tile_rows_end = single_row ? tile_rows_start + 1 : tile_rows;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ const int tile_cols_start = single_col ? dec_tile_col : 0;
+ const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+
+ const int tile_col_size_bytes = pbi->tile_col_size_bytes;
+ const int tile_size_bytes = pbi->tile_size_bytes;
+
+ size_t tile_col_size;
+ int r, c;
+
+ // Read tile column sizes for all columns (we need the last tile buffer)
+ for (c = 0; c < tile_cols; ++c) {
+ const int is_last = c == tile_cols - 1;
+ if (!is_last) {
+ tile_col_size = mem_get_varsize(data, tile_col_size_bytes);
+ data += tile_col_size_bytes;
+ tile_col_data_end[c] = data + tile_col_size;
+ } else {
+ tile_col_size = data_end - data;
+ tile_col_data_end[c] = data_end;
+ }
+ data += tile_col_size;
+ }
+
+ data = data_start;
+
+ // Read the required tile sizes.
+ for (c = tile_cols_start; c < tile_cols_end; ++c) {
+ const int is_last = c == tile_cols - 1;
+
+ if (c > 0) data = tile_col_data_end[c - 1];
+
+ if (!is_last) data += tile_col_size_bytes;
+
+ // Get the whole of the last column, otherwise stop at the required tile.
+ for (r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) {
+ tile_buffers[r][c].col = c;
+
+ get_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+ pbi->decrypt_cb, pbi->decrypt_state, tile_buffers,
+ tile_size_bytes, c, r, cm->tile_encoding_mode);
+ }
+ }
+
+ // If we have not read the last column, then read it to get the last tile.
+ if (tile_cols_end != tile_cols) {
+ c = tile_cols - 1;
+
+ data = tile_col_data_end[c - 1];
+
+ for (r = 0; r < tile_rows; ++r) {
+ tile_buffers[r][c].col = c;
+
+ get_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+ pbi->decrypt_cb, pbi->decrypt_state, tile_buffers,
+ tile_size_bytes, c, r, cm->tile_encoding_mode);
+ }
+ }
+ }
+}
+#else
+// Reads the next tile, returning its size and advancing '*data' accordingly,
+// depending on 'is_last'.
+static void get_tile_buffer(const uint8_t *const data_end,
+ const int tile_size_bytes, int is_last,
+ struct aom_internal_error_info *error_info,
+ const uint8_t **data, aom_decrypt_cb decrypt_cb,
+ void *decrypt_state, TileBufferDec *const buf) {
+ size_t size;
+
+ if (!is_last) {
+ if (!read_is_valid(*data, tile_size_bytes, data_end))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+
+ if (decrypt_cb) {
+ uint8_t be_data[4];
+ decrypt_cb(decrypt_state, *data, be_data, tile_size_bytes);
+ size = mem_get_varsize(be_data, tile_size_bytes);
+ } else {
+ size = mem_get_varsize(*data, tile_size_bytes);
+ }
+ *data += tile_size_bytes;
+
+ if (size > (size_t)(data_end - *data))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile size");
+ } else {
+ size = data_end - *data;
+ }
+
+ buf->data = *data;
+ buf->size = size;
+
+ *data += size;
+}
+
+static void get_tile_buffers(
+ AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
+ TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+ AV1_COMMON *const cm = &pbi->common;
+#if CONFIG_TILE_GROUPS
+ int r, c;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int tc = 0;
+ int first_tile_in_tg = 0;
+ struct aom_read_bit_buffer rb_tg_hdr;
+ uint8_t clear_data[MAX_AV1_HEADER_SIZE];
+ const int num_tiles = tile_rows * tile_cols;
+ const int num_bits = OD_ILOG(num_tiles) - 1;
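+  // Assuming OD_ILOG(x) returns floor(log2(x)) + 1 for x > 0: num_tiles == 4
+  // gives num_bits == 2, enough to code tile indices 0..3 in a tile group
+  // header.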
+ const size_t hdr_size = pbi->uncomp_hdr_size + pbi->first_partition_size;
+ const int tg_size_bit_offset = pbi->tg_size_bit_offset;
+#if CONFIG_DEPENDENT_HORZTILES
+ int tile_group_start_col = 0;
+ int tile_group_start_row = 0;
+#endif
+
+ for (r = 0; r < tile_rows; ++r) {
+ for (c = 0; c < tile_cols; ++c, ++tc) {
+ TileBufferDec *const buf = &tile_buffers[r][c];
+ const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1);
+ const size_t hdr_offset = (tc && tc == first_tile_in_tg) ? hdr_size : 0;
+
+ buf->col = c;
+ if (hdr_offset) {
+ init_read_bit_buffer(pbi, &rb_tg_hdr, data, data_end, clear_data);
+ rb_tg_hdr.bit_offset = tg_size_bit_offset;
+ if (num_tiles) {
+ pbi->tg_start = aom_rb_read_literal(&rb_tg_hdr, num_bits);
+ pbi->tg_size = 1 + aom_rb_read_literal(&rb_tg_hdr, num_bits);
+#if CONFIG_DEPENDENT_HORZTILES
+ tile_group_start_row = r;
+ tile_group_start_col = c;
+#endif
+ }
+ }
+ first_tile_in_tg += tc == first_tile_in_tg ? pbi->tg_size : 0;
+ data += hdr_offset;
+ get_tile_buffer(data_end, pbi->tile_size_bytes, is_last,
+ &pbi->common.error, &data, pbi->decrypt_cb,
+ pbi->decrypt_state, buf);
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->tile_group_start_row[r][c] = tile_group_start_row;
+ cm->tile_group_start_col[r][c] = tile_group_start_col;
+#endif
+ }
+ }
+#else
+ int r, c;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+
+ for (r = 0; r < tile_rows; ++r) {
+ for (c = 0; c < tile_cols; ++c) {
+ const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1);
+ TileBufferDec *const buf = &tile_buffers[r][c];
+ buf->col = c;
+ get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, &cm->error,
+ &data, pbi->decrypt_cb, pbi->decrypt_state, buf);
+ }
+ }
+#endif
+}
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_PVQ
+static void daala_dec_init(AV1_COMMON *const cm, daala_dec_ctx *daala_dec,
+ aom_reader *r) {
+ daala_dec->r = r;
+
+  // TODO(yushin): activity masking info needs to be signaled in the bitstream
+ daala_dec->use_activity_masking = AV1_PVQ_ENABLE_ACTIVITY_MASKING;
+
+#if !CONFIG_DAALA_DIST
+ daala_dec->use_activity_masking = 0;
+#endif
+
+ if (daala_dec->use_activity_masking)
+ daala_dec->qm = OD_HVS_QM;
+ else
+ daala_dec->qm = OD_FLAT_QM;
+
+ od_init_qm(daala_dec->state.qm, daala_dec->state.qm_inv,
+ daala_dec->qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
+
+ if (daala_dec->use_activity_masking) {
+ int pli;
+ int use_masking = daala_dec->use_activity_masking;
+ int segment_id = 0;
+ int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+
+ for (pli = 0; pli < MAX_MB_PLANE; pli++) {
+ int i;
+ int q;
+
+ q = qindex;
+ if (q <= OD_DEFAULT_QMS[use_masking][0][pli].interp_q << OD_COEFF_SHIFT) {
+ od_interp_qm(&daala_dec->state.pvq_qm_q4[pli][0], q,
+ &OD_DEFAULT_QMS[use_masking][0][pli], NULL);
+ } else {
+ i = 0;
+ while (OD_DEFAULT_QMS[use_masking][i + 1][pli].qm_q4 != NULL &&
+ q > OD_DEFAULT_QMS[use_masking][i + 1][pli].interp_q
+ << OD_COEFF_SHIFT) {
+ i++;
+ }
+ od_interp_qm(&daala_dec->state.pvq_qm_q4[pli][0], q,
+ &OD_DEFAULT_QMS[use_masking][i][pli],
+ &OD_DEFAULT_QMS[use_masking][i + 1][pli]);
+ }
+ }
+ }
+}
+#endif // CONFIG_PVQ
+
+static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end) {
+ AV1_COMMON *const cm = &pbi->common;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int n_tiles = tile_cols * tile_rows;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+#if CONFIG_EXT_TILE
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int tile_rows_start = single_row ? dec_tile_row : 0;
+ const int tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ const int tile_cols_start = single_col ? dec_tile_col : 0;
+ const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+ const int inv_col_order = pbi->inv_tile_order && !single_col;
+ const int inv_row_order = pbi->inv_tile_order && !single_row;
+#else
+ const int tile_rows_start = 0;
+ const int tile_rows_end = tile_rows;
+ const int tile_cols_start = 0;
+ const int tile_cols_end = tile_cols;
+ const int inv_col_order = pbi->inv_tile_order;
+ const int inv_row_order = pbi->inv_tile_order;
+#endif // CONFIG_EXT_TILE
+ int tile_row, tile_col;
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ cm->do_subframe_update = n_tiles == 1;
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+ if (cm->lf.filter_level && !cm->skip_loop_filter &&
+ pbi->lf_worker.data1 == NULL) {
+ CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
+ aom_memalign(32, sizeof(LFWorkerData)));
+ pbi->lf_worker.hook = (AVxWorkerHook)av1_loop_filter_worker;
+ if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Loop filter thread creation failed");
+ }
+ }
+
+ if (cm->lf.filter_level && !cm->skip_loop_filter) {
+ LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
+ // Be sure to sync as we might be resuming after a failed frame decode.
+ winterface->sync(&pbi->lf_worker);
+ av1_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
+ pbi->mb.plane);
+ }
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+
+ get_tile_buffers(pbi, data, data_end, tile_buffers);
+
+ if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+ aom_free(pbi->tile_data);
+ CHECK_MEM_ERROR(cm, pbi->tile_data,
+ aom_memalign(32, n_tiles * (sizeof(*pbi->tile_data))));
+ pbi->allocated_tiles = n_tiles;
+ }
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ aom_accounting_reset(&pbi->accounting);
+ }
+#endif
+ // Load all tile information into tile_data.
+ for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+ for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
+ const TileBufferDec *const buf = &tile_buffers[tile_row][tile_col];
+ TileData *const td = pbi->tile_data + tile_cols * tile_row + tile_col;
+
+ td->cm = cm;
+ td->xd = pbi->mb;
+ td->xd.corrupted = 0;
+ td->xd.counts =
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD
+ ? &cm->counts
+ : NULL;
+ av1_zero(td->dqcoeff);
+#if CONFIG_PVQ
+ av1_zero(td->pvq_ref_coeff);
+#endif
+ av1_tile_init(&td->xd.tile, td->cm, tile_row, tile_col);
+ setup_bool_decoder(buf->data, data_end, buf->size, &cm->error,
+ &td->bit_reader,
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ 1 << cm->ans_window_size_log2,
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+ pbi->decrypt_cb, pbi->decrypt_state);
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ td->bit_reader.accounting = &pbi->accounting;
+ } else {
+ td->bit_reader.accounting = NULL;
+ }
+#endif
+ av1_init_macroblockd(cm, &td->xd,
+#if CONFIG_PVQ
+ td->pvq_ref_coeff,
+#endif
+#if CONFIG_CFL
+ &td->cfl,
+#endif
+ td->dqcoeff);
+
+#if CONFIG_EC_ADAPT
+ // Initialise the tile context from the frame context
+ td->tctx = *cm->fc;
+ td->xd.tile_ctx = &td->tctx;
+#endif
+
+#if CONFIG_PVQ
+ daala_dec_init(cm, &td->xd.daala_dec, &td->bit_reader);
+ td->xd.daala_dec.state.adapt = &td->tctx.pvq_context;
+#endif
+
+#if CONFIG_PALETTE
+ td->xd.plane[0].color_index_map = td->color_index_map[0];
+ td->xd.plane[1].color_index_map = td->color_index_map[1];
+#endif // CONFIG_PALETTE
+ }
+ }
+
+ for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+ const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
+ int mi_row = 0;
+ TileInfo tile_info;
+
+ av1_tile_set_row(&tile_info, cm, row);
+
+ for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
+ const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
+ TileData *const td = pbi->tile_data + tile_cols * row + col;
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ td->bit_reader.accounting->last_tell_frac =
+ aom_reader_tell_frac(&td->bit_reader);
+ }
+#endif
+
+ av1_tile_set_col(&tile_info, cm, col);
+
+#if CONFIG_DEPENDENT_HORZTILES
+#if CONFIG_TILE_GROUPS
+ av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col);
+ if (!cm->dependent_horz_tiles || tile_row == 0 ||
+ tile_info.tg_horz_boundary) {
+#else
+ if (!cm->dependent_horz_tiles || tile_row == 0) {
+#endif
+ av1_zero_above_context(cm, tile_info.mi_col_start,
+ tile_info.mi_col_end);
+ }
+#else
+ av1_zero_above_context(cm, tile_info.mi_col_start, tile_info.mi_col_end);
+#endif
+
+ for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+ mi_row += cm->mib_size) {
+ int mi_col;
+
+ av1_zero_left_context(&td->xd);
+
+ for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->mib_size) {
+ av1_update_boundary_info(cm, &tile_info, mi_row, mi_col);
+ decode_partition(pbi, &td->xd,
+#if CONFIG_SUPERTX
+ 0,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, &td->bit_reader, cm->sb_size,
+ b_width_log2_lookup[cm->sb_size]);
+#if CONFIG_NCOBMC && CONFIG_MOTION_VAR
+ detoken_and_recon_sb(pbi, &td->xd, mi_row, mi_col, &td->bit_reader,
+ cm->sb_size);
+#endif
+ }
+ aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted);
+ if (pbi->mb.corrupted)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ const int mi_rows_per_update =
+ MI_SIZE * AOMMAX(cm->mi_rows / MI_SIZE / COEF_PROBS_BUFS, 1);
+ if ((mi_row + MI_SIZE) % mi_rows_per_update == 0 &&
+ mi_row + MI_SIZE < cm->mi_rows &&
+ cm->coef_probs_update_idx < COEF_PROBS_BUFS - 1) {
+ av1_partial_adapt_probs(cm, mi_row, mi_col);
+ ++cm->coef_probs_update_idx;
+ }
+ }
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+ }
+ }
+
+ assert(mi_row > 0);
+
+// When parallel deblocking is enabled, deblocking should not be interleaved
+// with decoding; instead, it should be done after the entire frame is
+// decoded.
+#if !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING && !CONFIG_CB4X4
+    // Loopfilter one tile row.
+    // Note: If out-of-order tile decoding is used (for example,
+    // inv_row_order == 1), the loop filtering has to be done after all tile
+    // rows are decoded.
+ if (!inv_row_order && cm->lf.filter_level && !cm->skip_loop_filter) {
+ LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
+ const int lf_start = AOMMAX(0, tile_info.mi_row_start - cm->mib_size);
+ const int lf_end = tile_info.mi_row_end - cm->mib_size;
+
+ // Delay the loopfilter if the first tile row is only
+ // a single superblock high.
+ if (lf_end <= 0) continue;
+
+      // If this tile row reaches the bottom of the frame, the remaining rows
+      // are loopfiltered after the tile decode loop instead.
+ if (tile_info.mi_row_end >= cm->mi_rows) continue;
+
+ winterface->sync(&pbi->lf_worker);
+ lf_data->start = lf_start;
+ lf_data->stop = lf_end;
+ if (pbi->max_threads > 1) {
+ winterface->launch(&pbi->lf_worker);
+ } else {
+ winterface->execute(&pbi->lf_worker);
+ }
+ }
+#endif // !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING && !CONFIG_CB4X4
+
+    // After loopfiltering, the last 7 rows of pixels in each superblock row
+    // may still be changed by the longest loopfilter of the next superblock
+    // row.
+ if (cm->frame_parallel_decode)
+ av1_frameworker_broadcast(pbi->cur_buf, mi_row << cm->mib_size_log2);
+ }
+
+#if CONFIG_VAR_TX || CONFIG_CB4X4
+ // Loopfilter the whole frame.
+ av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
+ cm->lf.filter_level, 0, 0);
+#else
+#if CONFIG_PARALLEL_DEBLOCKING
+  // Loopfilter all rows in the frame.
+ if (cm->lf.filter_level && !cm->skip_loop_filter) {
+ LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
+ winterface->sync(&pbi->lf_worker);
+ lf_data->start = 0;
+ lf_data->stop = cm->mi_rows;
+ winterface->execute(&pbi->lf_worker);
+ }
+#else
+ // Loopfilter remaining rows in the frame.
+ if (cm->lf.filter_level && !cm->skip_loop_filter) {
+ LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
+ winterface->sync(&pbi->lf_worker);
+ lf_data->start = lf_data->stop;
+ lf_data->stop = cm->mi_rows;
+ winterface->execute(&pbi->lf_worker);
+ }
+#endif // CONFIG_PARALLEL_DEBLOCKING
+#endif // CONFIG_VAR_TX || CONFIG_CB4X4
+ if (cm->frame_parallel_decode)
+ av1_frameworker_broadcast(pbi->cur_buf, INT_MAX);
+
+#if CONFIG_EXT_TILE
+ if (n_tiles == 1) {
+#if CONFIG_ANS
+ return data_end;
+#else
+ // Find the end of the single tile buffer
+ return aom_reader_find_end(&pbi->tile_data->bit_reader);
+#endif // CONFIG_ANS
+ } else {
+ // Return the end of the last tile buffer
+ return tile_buffers[tile_rows - 1][tile_cols - 1].raw_data_end;
+ }
+#else
+#if CONFIG_ANS
+ return data_end;
+#else
+ {
+ // Get last tile data.
+ TileData *const td = pbi->tile_data + tile_cols * tile_rows - 1;
+ return aom_reader_find_end(&td->bit_reader);
+ }
+#endif // CONFIG_ANS
+#endif // CONFIG_EXT_TILE
+}
+
+static int tile_worker_hook(TileWorkerData *const tile_data,
+ const TileInfo *const tile) {
+ AV1Decoder *const pbi = tile_data->pbi;
+ const AV1_COMMON *const cm = &pbi->common;
+ int mi_row, mi_col;
+
+ if (setjmp(tile_data->error_info.jmp)) {
+ tile_data->error_info.setjmp = 0;
+ aom_merge_corrupted_flag(&tile_data->xd.corrupted, 1);
+ return 0;
+ }
+
+ tile_data->error_info.setjmp = 1;
+ tile_data->xd.error_info = &tile_data->error_info;
+#if CONFIG_DEPENDENT_HORZTILES
+#if CONFIG_TILE_GROUPS
+ if (!cm->dependent_horz_tiles || tile->tg_horz_boundary) {
+#else
+ if (!cm->dependent_horz_tiles) {
+#endif
+ av1_zero_above_context(&pbi->common, tile->mi_col_start, tile->mi_col_end);
+ }
+#else
+ av1_zero_above_context(&pbi->common, tile->mi_col_start, tile->mi_col_end);
+#endif
+
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+ mi_row += cm->mib_size) {
+ av1_zero_left_context(&tile_data->xd);
+
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+ mi_col += cm->mib_size) {
+ decode_partition(pbi, &tile_data->xd,
+#if CONFIG_SUPERTX
+ 0,
+#endif
+ mi_row, mi_col, &tile_data->bit_reader, cm->sb_size,
+ b_width_log2_lookup[cm->sb_size]);
+#if CONFIG_NCOBMC && CONFIG_MOTION_VAR
+ detoken_and_recon_sb(pbi, &tile_data->xd, mi_row, mi_col,
+ &tile_data->bit_reader, cm->sb_size);
+#endif
+ }
+ }
+ return !tile_data->xd.corrupted;
+}
+
+// Sorts in descending order of buffer size (used as a qsort comparator).
+static int compare_tile_buffers(const void *a, const void *b) {
+ const TileBufferDec *const buf1 = (const TileBufferDec *)a;
+ const TileBufferDec *const buf2 = (const TileBufferDec *)b;
+ return (int)(buf2->size - buf1->size);
+}
+
+static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end) {
+ AV1_COMMON *const cm = &pbi->common;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int num_workers = AOMMIN(pbi->max_threads & ~1, tile_cols);
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+#if CONFIG_EXT_TILE
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int tile_rows_start = single_row ? dec_tile_row : 0;
+ const int tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ const int tile_cols_start = single_col ? dec_tile_col : 0;
+ const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+#else
+ const int tile_rows_start = 0;
+ const int tile_rows_end = tile_rows;
+ const int tile_cols_start = 0;
+ const int tile_cols_end = tile_cols;
+#endif // CONFIG_EXT_TILE
+ int tile_row, tile_col;
+ int i;
+
+#if !(CONFIG_ANS || CONFIG_EXT_TILE)
+ int final_worker = -1;
+#endif // !(CONFIG_ANS || CONFIG_EXT_TILE)
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+
+ assert(tile_cols * tile_rows > 1);
+
+ // TODO(jzern): See if we can remove the restriction of passing in max
+ // threads to the decoder.
+ if (pbi->num_tile_workers == 0) {
+ const int num_threads = pbi->max_threads & ~1;
+ CHECK_MEM_ERROR(cm, pbi->tile_workers,
+ aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
+ // Ensure tile data offsets will be properly aligned. This may fail on
+ // platforms without DECLARE_ALIGNED().
+ assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
+ CHECK_MEM_ERROR(
+ cm, pbi->tile_worker_data,
+ aom_memalign(32, num_threads * sizeof(*pbi->tile_worker_data)));
+ CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
+ aom_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
+ for (i = 0; i < num_threads; ++i) {
+ AVxWorker *const worker = &pbi->tile_workers[i];
+ ++pbi->num_tile_workers;
+
+ winterface->init(worker);
+ if (i < num_threads - 1 && !winterface->reset(worker)) {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile decoder thread creation failed");
+ }
+ }
+ }
+
+ // Reset tile decoding hook
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &pbi->tile_workers[i];
+ winterface->sync(worker);
+ worker->hook = (AVxWorkerHook)tile_worker_hook;
+ worker->data1 = &pbi->tile_worker_data[i];
+ worker->data2 = &pbi->tile_worker_info[i];
+ }
+
+ // Initialize thread frame counts.
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ for (i = 0; i < num_workers; ++i) {
+ TileWorkerData *const twd = (TileWorkerData *)pbi->tile_workers[i].data1;
+ av1_zero(twd->counts);
+ }
+ }
+
+ // Load tile data into tile_buffers
+ get_tile_buffers(pbi, data, data_end, tile_buffers);
+
+ for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+ // Sort the buffers in this tile row based on size in descending order.
+ qsort(&tile_buffers[tile_row][tile_cols_start],
+ tile_cols_end - tile_cols_start, sizeof(tile_buffers[0][0]),
+ compare_tile_buffers);
+
+    // Rearrange the tile buffers in this tile row so that, within each group
+    // of num_workers tiles, the largest (and presumably the most difficult)
+    // tile is decoded in the main thread. This should help minimize the
+    // number of instances where the main thread is waiting for a worker to
+    // complete.
+ {
+ int group_start;
+ for (group_start = tile_cols_start; group_start < tile_cols_end;
+ group_start += num_workers) {
+ const int group_end = AOMMIN(group_start + num_workers, tile_cols);
+ const TileBufferDec largest = tile_buffers[tile_row][group_start];
+ memmove(&tile_buffers[tile_row][group_start],
+ &tile_buffers[tile_row][group_start + 1],
+ (group_end - group_start - 1) * sizeof(tile_buffers[0][0]));
+ tile_buffers[tile_row][group_end - 1] = largest;
+ }
+ }
+
+ for (tile_col = tile_cols_start; tile_col < tile_cols_end;) {
+ // Launch workers for individual columns
+ for (i = 0; i < num_workers && tile_col < tile_cols_end;
+ ++i, ++tile_col) {
+ TileBufferDec *const buf = &tile_buffers[tile_row][tile_col];
+ AVxWorker *const worker = &pbi->tile_workers[i];
+ TileWorkerData *const twd = (TileWorkerData *)worker->data1;
+ TileInfo *const tile_info = (TileInfo *)worker->data2;
+
+ twd->pbi = pbi;
+ twd->xd = pbi->mb;
+ twd->xd.corrupted = 0;
+ twd->xd.counts =
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD
+ ? &twd->counts
+ : NULL;
+ av1_zero(twd->dqcoeff);
+ av1_tile_init(tile_info, cm, tile_row, buf->col);
+ av1_tile_init(&twd->xd.tile, cm, tile_row, buf->col);
+ setup_bool_decoder(buf->data, data_end, buf->size, &cm->error,
+ &twd->bit_reader,
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ 1 << cm->ans_window_size_log2,
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+ pbi->decrypt_cb, pbi->decrypt_state);
+ av1_init_macroblockd(cm, &twd->xd,
+#if CONFIG_PVQ
+ twd->pvq_ref_coeff,
+#endif
+#if CONFIG_CFL
+ &twd->cfl,
+#endif
+ twd->dqcoeff);
+#if CONFIG_PVQ
+ daala_dec_init(cm, &twd->xd.daala_dec, &twd->bit_reader);
+ twd->xd.daala_dec.state.adapt = &twd->tctx.pvq_context;
+#endif
+#if CONFIG_EC_ADAPT
+ // Initialise the tile context from the frame context
+ twd->tctx = *cm->fc;
+ twd->xd.tile_ctx = &twd->tctx;
+#endif
+#if CONFIG_PALETTE
+ twd->xd.plane[0].color_index_map = twd->color_index_map[0];
+ twd->xd.plane[1].color_index_map = twd->color_index_map[1];
+#endif // CONFIG_PALETTE
+
+ worker->had_error = 0;
+ if (i == num_workers - 1 || tile_col == tile_cols_end - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+
+#if !(CONFIG_ANS || CONFIG_EXT_TILE)
+ if (tile_row == tile_rows - 1 && buf->col == tile_cols - 1) {
+ final_worker = i;
+ }
+#endif // !(CONFIG_ANS || CONFIG_EXT_TILE)
+ }
+
+ // Sync all workers
+ for (; i > 0; --i) {
+ AVxWorker *const worker = &pbi->tile_workers[i - 1];
+ // TODO(jzern): The tile may have specific error data associated with
+ // its aom_internal_error_info which could be propagated to the main
+ // info in cm. Additionally once the threads have been synced and an
+ // error is detected, there's no point in continuing to decode tiles.
+ pbi->mb.corrupted |= !winterface->sync(worker);
+ }
+ }
+ }
+
+ // Accumulate thread frame counts.
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ for (i = 0; i < num_workers; ++i) {
+ TileWorkerData *const twd = (TileWorkerData *)pbi->tile_workers[i].data1;
+ av1_accumulate_frame_counts(&cm->counts, &twd->counts);
+ }
+ }
+
+#if CONFIG_EXT_TILE
+ // Return the end of the last tile buffer
+ return tile_buffers[tile_rows - 1][tile_cols - 1].raw_data_end;
+#else
+#if CONFIG_ANS
+ return data_end;
+#else
+ assert(final_worker != -1);
+ {
+ TileWorkerData *const twd =
+ (TileWorkerData *)pbi->tile_workers[final_worker].data1;
+ return aom_reader_find_end(&twd->bit_reader);
+ }
+#endif // CONFIG_ANS
+#endif // CONFIG_EXT_TILE
+}
+
+static void error_handler(void *data) {
+ AV1_COMMON *const cm = (AV1_COMMON *)data;
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
+}
+
+static void read_bitdepth_colorspace_sampling(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb) {
+ if (cm->profile >= PROFILE_2) {
+ cm->bit_depth = aom_rb_read_bit(rb) ? AOM_BITS_12 : AOM_BITS_10;
+ } else {
+ cm->bit_depth = AOM_BITS_8;
+ }
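+  // i.e. profiles 0 and 1 are always 8-bit; for profiles 2 and 3 a single
+  // bit selects 10-bit (0) or 12-bit (1).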
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->bit_depth > AOM_BITS_8) {
+ cm->use_highbitdepth = 1;
+ } else {
+#if CONFIG_LOWBITDEPTH
+ cm->use_highbitdepth = 0;
+#else
+ cm->use_highbitdepth = 1;
+#endif
+ }
+#endif
+
+ cm->color_space = aom_rb_read_literal(rb, 3);
+ if (cm->color_space != AOM_CS_SRGB) {
+ // [16,235] (including xvycc) vs [0,255] range
+ cm->color_range = aom_rb_read_bit(rb);
+ if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
+ cm->subsampling_x = aom_rb_read_bit(rb);
+ cm->subsampling_y = aom_rb_read_bit(rb);
+ if (cm->subsampling_x == 1 && cm->subsampling_y == 1)
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "4:2:0 color not supported in profile 1 or 3");
+ if (aom_rb_read_bit(rb))
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reserved bit set");
+ } else {
+ cm->subsampling_y = cm->subsampling_x = 1;
+ }
+ } else {
+ if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
+      // Note: if the colorspace is SRGB, 4:4:4 chroma sampling is assumed;
+      // 4:2:2 or 4:4:0 chroma sampling is not allowed.
+ cm->subsampling_y = cm->subsampling_x = 0;
+ if (aom_rb_read_bit(rb))
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reserved bit set");
+ } else {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "4:4:4 color not supported in profile 0 or 2");
+ }
+ }
+}
+
+#if CONFIG_REFERENCE_BUFFER
+void read_sequence_header(SequenceHeader *seq_params) {
+ /* Placeholder for actually reading from the bitstream */
+ seq_params->frame_id_numbers_present_flag = FRAME_ID_NUMBERS_PRESENT_FLAG;
+ seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7;
+ seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2;
+}
+#endif
+
+static size_t read_uncompressed_header(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb) {
+ AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = pool->frame_bufs;
+ int i, mask, ref_index = 0;
+ size_t sz;
+
+#if CONFIG_REFERENCE_BUFFER
+ /* TODO: Move outside frame loop or inside key-frame branch */
+ read_sequence_header(&pbi->seq_params);
+#endif
+
+ cm->last_frame_type = cm->frame_type;
+ cm->last_intra_only = cm->intra_only;
+
+#if CONFIG_EXT_REFS
+  // NOTE: By default, all coded frames are to be used as reference frames.
+ cm->is_reference_frame = 1;
+#endif // CONFIG_EXT_REFS
+
+ if (aom_rb_read_literal(rb, 2) != AOM_FRAME_MARKER)
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Invalid frame marker");
+
+ cm->profile = av1_read_profile(rb);
+
+ const BITSTREAM_PROFILE MAX_SUPPORTED_PROFILE =
+ CONFIG_HIGHBITDEPTH ? MAX_PROFILES : PROFILE_2;
+
+ if (cm->profile >= MAX_SUPPORTED_PROFILE)
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Unsupported bitstream profile");
+
+ cm->show_existing_frame = aom_rb_read_bit(rb);
+
+ if (cm->show_existing_frame) {
+ // Show an existing frame directly.
+ const int existing_frame_idx = aom_rb_read_literal(rb, 3);
+ const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
+#if CONFIG_REFERENCE_BUFFER
+ if (pbi->seq_params.frame_id_numbers_present_flag) {
+ int frame_id_length = pbi->seq_params.frame_id_length_minus7 + 7;
+ int display_frame_id = aom_rb_read_literal(rb, frame_id_length);
+ /* Compare display_frame_id with ref_frame_id and check valid for
+ * referencing */
+ if (display_frame_id != cm->ref_frame_id[existing_frame_idx] ||
+ cm->valid_for_referencing[existing_frame_idx] == 0)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference buffer frame ID mismatch");
+ }
+#endif
+ lock_buffer_pool(pool);
+ if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+ unlock_buffer_pool(pool);
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer %d does not contain a decoded frame",
+ frame_to_show);
+ }
+ ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+ unlock_buffer_pool(pool);
+
+ cm->lf.filter_level = 0;
+ cm->show_frame = 1;
+ pbi->refresh_frame_flags = 0;
+
+ if (cm->frame_parallel_decode) {
+ for (i = 0; i < REF_FRAMES; ++i)
+ cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
+ }
+
+ return 0;
+ }
+
+ cm->frame_type = (FRAME_TYPE)aom_rb_read_bit(rb);
+ cm->show_frame = aom_rb_read_bit(rb);
+ cm->error_resilient_mode = aom_rb_read_bit(rb);
+#if CONFIG_REFERENCE_BUFFER
+ if (pbi->seq_params.frame_id_numbers_present_flag) {
+ int frame_id_length = pbi->seq_params.frame_id_length_minus7 + 7;
+ int diff_len = pbi->seq_params.delta_frame_id_length_minus2 + 2;
+ int prev_frame_id = 0;
+ if (cm->frame_type != KEY_FRAME) {
+ prev_frame_id = cm->current_frame_id;
+ }
+ cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length);
+
+ if (cm->frame_type != KEY_FRAME) {
+ int diff_frame_id;
+ if (cm->current_frame_id > prev_frame_id) {
+ diff_frame_id = cm->current_frame_id - prev_frame_id;
+ } else {
+ diff_frame_id =
+ (1 << frame_id_length) + cm->current_frame_id - prev_frame_id;
+ }
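+      // e.g. with frame_id_length == 15 (a hypothetical value),
+      // prev_frame_id == 32765 and current_frame_id == 2 give
+      // diff_frame_id == 32768 + 2 - 32765 == 5, i.e. the IDs wrapped.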
+ /* Check current_frame_id for conformance */
+ if (prev_frame_id == cm->current_frame_id ||
+ diff_frame_id >= (1 << (frame_id_length - 1))) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid value of current_frame_id");
+ }
+ }
+ /* Check if some frames need to be marked as not valid for referencing */
+ for (i = 0; i < REF_FRAMES; i++) {
+ if (cm->frame_type == KEY_FRAME) {
+ cm->valid_for_referencing[i] = 0;
+ } else if (cm->current_frame_id - (1 << diff_len) > 0) {
+ if (cm->ref_frame_id[i] > cm->current_frame_id ||
+ cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len))
+ cm->valid_for_referencing[i] = 0;
+ } else {
+ if (cm->ref_frame_id[i] > cm->current_frame_id &&
+ cm->ref_frame_id[i] <
+ (1 << frame_id_length) + cm->current_frame_id - (1 << diff_len))
+ cm->valid_for_referencing[i] = 0;
+ }
+ }
+ }
+#endif
+ if (cm->frame_type == KEY_FRAME) {
+ if (!av1_read_sync_code(rb))
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Invalid frame sync code");
+
+ read_bitdepth_colorspace_sampling(cm, rb);
+ pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ cm->frame_refs[i].idx = INVALID_IDX;
+ cm->frame_refs[i].buf = NULL;
+ }
+
+ setup_frame_size(cm, rb);
+ if (pbi->need_resync) {
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ pbi->need_resync = 0;
+ }
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ cm->ans_window_size_log2 = aom_rb_read_literal(rb, 4) + 8;
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+#if CONFIG_PALETTE
+ cm->allow_screen_content_tools = aom_rb_read_bit(rb);
+#endif // CONFIG_PALETTE
+ } else {
+ cm->intra_only = cm->show_frame ? 0 : aom_rb_read_bit(rb);
+#if CONFIG_PALETTE
+ if (cm->intra_only) cm->allow_screen_content_tools = aom_rb_read_bit(rb);
+#endif // CONFIG_PALETTE
+ if (cm->error_resilient_mode) {
+ cm->reset_frame_context = RESET_FRAME_CONTEXT_ALL;
+ } else {
+ if (cm->intra_only) {
+ cm->reset_frame_context = aom_rb_read_bit(rb)
+ ? RESET_FRAME_CONTEXT_ALL
+ : RESET_FRAME_CONTEXT_CURRENT;
+ } else {
+ cm->reset_frame_context = aom_rb_read_bit(rb)
+ ? RESET_FRAME_CONTEXT_CURRENT
+ : RESET_FRAME_CONTEXT_NONE;
+ if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT)
+ cm->reset_frame_context = aom_rb_read_bit(rb)
+ ? RESET_FRAME_CONTEXT_ALL
+ : RESET_FRAME_CONTEXT_CURRENT;
+ }
+ }
+
+ if (cm->intra_only) {
+ if (!av1_read_sync_code(rb))
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Invalid frame sync code");
+
+ read_bitdepth_colorspace_sampling(cm, rb);
+
+ pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+ setup_frame_size(cm, rb);
+ if (pbi->need_resync) {
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ pbi->need_resync = 0;
+ }
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ cm->ans_window_size_log2 = aom_rb_read_literal(rb, 4) + 8;
+#endif
+ } else if (pbi->need_resync != 1) { /* Skip if need resync */
+ pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+
+#if CONFIG_EXT_REFS
+ if (!pbi->refresh_frame_flags) {
+ // NOTE: "pbi->refresh_frame_flags == 0" indicates that the coded frame
+ // will not be used as a reference
+ cm->is_reference_frame = 0;
+ }
+#endif // CONFIG_EXT_REFS
+
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ const int ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+ const int idx = cm->ref_frame_map[ref];
+ RefBuffer *const ref_frame = &cm->frame_refs[i];
+ ref_frame->idx = idx;
+ ref_frame->buf = &frame_bufs[idx].buf;
+ cm->ref_frame_sign_bias[LAST_FRAME + i] = aom_rb_read_bit(rb);
+#if CONFIG_REFERENCE_BUFFER
+ if (pbi->seq_params.frame_id_numbers_present_flag) {
+ int frame_id_length = pbi->seq_params.frame_id_length_minus7 + 7;
+ int diff_len = pbi->seq_params.delta_frame_id_length_minus2 + 2;
+ int delta_frame_id_minus1 = aom_rb_read_literal(rb, diff_len);
+ int ref_frame_id =
+ ((cm->current_frame_id - (delta_frame_id_minus1 + 1) +
+ (1 << frame_id_length)) %
+ (1 << frame_id_length));
+ /* Compare values derived from delta_frame_id_minus1 and
+ * refresh_frame_flags. Also, check valid for referencing */
+ if (ref_frame_id != cm->ref_frame_id[ref] ||
+ cm->valid_for_referencing[ref] == 0)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference buffer frame ID mismatch");
+ }
+#endif
+ }
+
+#if CONFIG_FRAME_SIZE
+ if (cm->error_resilient_mode == 0) {
+ setup_frame_size_with_refs(cm, rb);
+ } else {
+ setup_frame_size(cm, rb);
+ }
+#else
+ setup_frame_size_with_refs(cm, rb);
+#endif
+
+ cm->allow_high_precision_mv = aom_rb_read_bit(rb);
+ cm->interp_filter = read_frame_interp_filter(rb);
+#if CONFIG_TEMPMV_SIGNALING
+ if (!cm->error_resilient_mode) {
+ cm->use_prev_frame_mvs = aom_rb_read_bit(rb);
+ }
+#endif
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefBuffer *const ref_buf = &cm->frame_refs[i];
+#if CONFIG_HIGHBITDEPTH
+ av1_setup_scale_factors_for_frame(
+ &ref_buf->sf, ref_buf->buf->y_crop_width,
+ ref_buf->buf->y_crop_height, cm->width, cm->height,
+ cm->use_highbitdepth);
+#else
+ av1_setup_scale_factors_for_frame(
+ &ref_buf->sf, ref_buf->buf->y_crop_width,
+ ref_buf->buf->y_crop_height, cm->width, cm->height);
+#endif
+ }
+ }
+ }
+#if CONFIG_TEMPMV_SIGNALING
+ cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only;
+#endif
+
+#if CONFIG_REFERENCE_BUFFER
+ if (pbi->seq_params.frame_id_numbers_present_flag) {
+ /* If bitmask is set, update reference frame id values and
+ mark frames as valid for reference */
+ int refresh_frame_flags =
+ cm->frame_type == KEY_FRAME ? 0xFF : pbi->refresh_frame_flags;
+ for (i = 0; i < REF_FRAMES; i++) {
+ if ((refresh_frame_flags >> i) & 1) {
+ cm->ref_frame_id[i] = cm->current_frame_id;
+ cm->valid_for_referencing[i] = 1;
+ }
+ }
+ }
+#endif
+
+#if CONFIG_HIGHBITDEPTH
+ get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
+#endif
+ get_frame_new_buffer(cm)->color_space = cm->color_space;
+ get_frame_new_buffer(cm)->color_range = cm->color_range;
+ get_frame_new_buffer(cm)->render_width = cm->render_width;
+ get_frame_new_buffer(cm)->render_height = cm->render_height;
+
+ if (pbi->need_resync) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Keyframe / intra-only frame required to reset decoder"
+ " state");
+ }
+
+ if (!cm->error_resilient_mode) {
+ cm->refresh_frame_context = aom_rb_read_bit(rb)
+ ? REFRESH_FRAME_CONTEXT_FORWARD
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ } else {
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD;
+ }
+
+ // This flag will be overridden by the call to av1_setup_past_independence
+ // below, forcing the use of context 0 for those frame types.
+ cm->frame_context_idx = aom_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
+
+ // Generate next_ref_frame_map.
+ lock_buffer_pool(pool);
+ for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ if (mask & 1) {
+ cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+ ++frame_bufs[cm->new_fb_idx].ref_count;
+ } else {
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+ }
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ ++ref_index;
+ }
+
+ for (; ref_index < REF_FRAMES; ++ref_index) {
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ }
+ unlock_buffer_pool(pool);
+ pbi->hold_ref_buf = 1;
+
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode)
+ av1_setup_past_independence(cm);
+
+#if CONFIG_EXT_PARTITION
+ set_sb_size(cm, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
+#else
+ set_sb_size(cm, BLOCK_64X64);
+#endif // CONFIG_EXT_PARTITION
+
+ setup_loopfilter(cm, rb);
+#if CONFIG_CDEF
+ setup_cdef(cm, rb);
+#endif
+#if CONFIG_LOOP_RESTORATION
+ decode_restoration_mode(cm, rb);
+#endif // CONFIG_LOOP_RESTORATION
+ setup_quantization(cm, rb);
+#if CONFIG_HIGHBITDEPTH
+ xd->bd = (int)cm->bit_depth;
+#endif
+
+#if CONFIG_Q_ADAPT_PROBS
+ av1_default_coef_probs(cm);
+ if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
+ cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
+ for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
+ } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+ }
+#endif // CONFIG_Q_ADAPT_PROBS
+
+ setup_segmentation(cm, rb);
+
+#if CONFIG_DELTA_Q
+ {
+ struct segmentation *const seg = &cm->seg;
+ int segment_quantizer_active = 0;
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) {
+ segment_quantizer_active = 1;
+ }
+ }
+
+ cm->delta_q_res = 1;
+#if CONFIG_EXT_DELTA_Q
+ cm->delta_lf_res = 1;
+#endif
+ if (segment_quantizer_active == 0 && cm->base_qindex > 0) {
+ cm->delta_q_present_flag = aom_rb_read_bit(rb);
+ } else {
+ cm->delta_q_present_flag = 0;
+ }
+ if (cm->delta_q_present_flag) {
+ xd->prev_qindex = cm->base_qindex;
+ cm->delta_q_res = 1 << aom_rb_read_literal(rb, 2);
+#if CONFIG_EXT_DELTA_Q
+ if (segment_quantizer_active) {
+ assert(seg->abs_delta == SEGMENT_DELTADATA);
+ }
+ cm->delta_lf_present_flag = aom_rb_read_bit(rb);
+ if (cm->delta_lf_present_flag) {
+ xd->prev_delta_lf_from_base = 0;
+ cm->delta_lf_res = 1 << aom_rb_read_literal(rb, 2);
+ } else {
+ cm->delta_lf_present_flag = 0;
+ }
+#endif // CONFIG_EXT_DELTA_Q
+ }
+ }
+#endif
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = cm->seg.enabled
+ ? av1_get_qindex(&cm->seg, i, cm->base_qindex)
+ : cm->base_qindex;
+ xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
+ cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+ xd->qindex[i] = qindex;
+ }
+
+ setup_segmentation_dequant(cm);
+ cm->tx_mode = read_tx_mode(cm, xd, rb);
+ cm->reference_mode = read_frame_reference_mode(cm, rb);
+
+#if CONFIG_EXT_TX
+ cm->reduced_tx_set_used = aom_rb_read_bit(rb);
+#endif // CONFIG_EXT_TX
+
+ read_tile_info(pbi, rb);
+ sz = aom_rb_read_literal(rb, 16);
+
+ if (sz == 0)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid header size");
+ return sz;
+}
+
+#if CONFIG_EXT_TX
+#if !CONFIG_EC_ADAPT
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j, k;
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+ for (j = 0; j < num_ext_tx_set[ext_tx_set_type_inter[s]] - 1; ++j)
+ av1_diff_update_prob(r, &fc->inter_ext_tx_prob[s][i][j], ACCT_STR);
+ }
+ }
+ }
+
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+ for (j = 0; j < INTRA_MODES; ++j)
+ for (k = 0; k < num_ext_tx_set[ext_tx_set_type_intra[s]] - 1; ++k)
+ av1_diff_update_prob(r, &fc->intra_ext_tx_prob[s][i][j][k],
+ ACCT_STR);
+ }
+ }
+ }
+}
+#endif // !CONFIG_EC_ADAPT
+#else
+
+#endif // CONFIG_EXT_TX
+#if CONFIG_SUPERTX
+static void read_supertx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j;
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+ for (j = TX_8X8; j < TX_SIZES; ++j) {
+ av1_diff_update_prob(r, &fc->supertx_prob[i][j], ACCT_STR);
+ }
+ }
+ }
+}
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_GLOBAL_MOTION
+static void read_global_motion_params(WarpedMotionParams *params,
+ WarpedMotionParams *ref_params,
+ aom_prob *probs, aom_reader *r,
+ int allow_hp) {
+ TransformationType type =
+ aom_read_tree(r, av1_global_motion_types_tree, probs, ACCT_STR);
+ int trans_bits;
+ int trans_dec_factor;
+ int trans_prec_diff;
+ set_default_warp_params(params);
+ params->wmtype = type;
+ switch (type) {
+ case HOMOGRAPHY:
+ case HORTRAPEZOID:
+ case VERTRAPEZOID:
+ if (type != HORTRAPEZOID)
+ params->wmmat[6] =
+ aom_read_signed_primitive_refsubexpfin(
+ r, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF)) *
+ GM_ROW3HOMO_DECODE_FACTOR;
+ if (type != VERTRAPEZOID)
+ params->wmmat[7] =
+ aom_read_signed_primitive_refsubexpfin(
+ r, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF)) *
+ GM_ROW3HOMO_DECODE_FACTOR;
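+      // fallthrough intended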
+ case AFFINE:
+ case ROTZOOM:
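+      // Note: (1 << WARPEDMODEL_PREC_BITS) below is fixed-point 1.0, so the
+      // diagonal terms are coded as offsets from unity.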
+ params->wmmat[2] = aom_read_signed_primitive_refsubexpfin(
+ r, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS)) *
+ GM_ALPHA_DECODE_FACTOR +
+ (1 << WARPEDMODEL_PREC_BITS);
+ if (type != VERTRAPEZOID)
+ params->wmmat[3] = aom_read_signed_primitive_refsubexpfin(
+ r, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) *
+ GM_ALPHA_DECODE_FACTOR;
+ if (type >= AFFINE) {
+ if (type != HORTRAPEZOID)
+ params->wmmat[4] = aom_read_signed_primitive_refsubexpfin(
+ r, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) *
+ GM_ALPHA_DECODE_FACTOR;
+ params->wmmat[5] = aom_read_signed_primitive_refsubexpfin(
+ r, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS)) *
+ GM_ALPHA_DECODE_FACTOR +
+ (1 << WARPEDMODEL_PREC_BITS);
+ } else {
+ params->wmmat[4] = -params->wmmat[3];
+ params->wmmat[5] = params->wmmat[2];
+ }
+ // fallthrough intended
+ case TRANSLATION:
+ trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ trans_dec_factor = (type == TRANSLATION)
+ ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp)
+ : GM_TRANS_DECODE_FACTOR;
+ trans_prec_diff = (type == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ params->wmmat[0] = aom_read_signed_primitive_refsubexpfin(
+ r, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[0] >> trans_prec_diff)) *
+ trans_dec_factor;
+ params->wmmat[1] = aom_read_signed_primitive_refsubexpfin(
+ r, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[1] >> trans_prec_diff)) *
+ trans_dec_factor;
+ case IDENTITY: break;
+ default: assert(0);
+ }
+ if (params->wmtype <= AFFINE)
+ if (!get_shear_params(params)) assert(0);
+}
+
+static void read_global_motion(AV1_COMMON *cm, aom_reader *r) {
+ int frame;
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ read_global_motion_params(
+ &cm->global_motion[frame], &cm->prev_frame->global_motion[frame],
+ cm->fc->global_motion_types_prob, r, cm->allow_high_precision_mv);
+ /*
+ printf("Dec Ref %d [%d/%d]: %d %d %d %d\n",
+ frame, cm->current_video_frame, cm->show_frame,
+ cm->global_motion[frame].wmmat[0],
+ cm->global_motion[frame].wmmat[1],
+ cm->global_motion[frame].wmmat[2],
+ cm->global_motion[frame].wmmat[3]);
+ */
+ }
+ memcpy(cm->cur_frame->global_motion, cm->global_motion,
+ TOTAL_REFS_PER_FRAME * sizeof(WarpedMotionParams));
+}
+#endif // CONFIG_GLOBAL_MOTION
+
+static int read_compressed_header(AV1Decoder *pbi, const uint8_t *data,
+ size_t partition_size) {
+ AV1_COMMON *const cm = &pbi->common;
+#if CONFIG_SUPERTX
+ MACROBLOCKD *const xd = &pbi->mb;
+#endif
+ FRAME_CONTEXT *const fc = cm->fc;
+ aom_reader r;
+ int k, i;
+#if !CONFIG_EC_ADAPT || \
+ (CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION || CONFIG_EXT_INTER)
+ int j;
+#endif
+
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ r.window_size = 1 << cm->ans_window_size_log2;
+#endif
+ if (aom_reader_init(&r, data, partition_size, pbi->decrypt_cb,
+ pbi->decrypt_state))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder 0");
+
+#if CONFIG_LOOP_RESTORATION
+ if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+ av1_alloc_restoration_buffers(cm);
+ decode_restoration(cm, &r);
+ }
+#endif
+
+#if !CONFIG_EC_ADAPT
+ if (cm->tx_mode == TX_MODE_SELECT) read_tx_size_probs(fc, &r);
+#endif
+
+#if CONFIG_LV_MAP
+ av1_read_txb_probs(fc, cm->tx_mode, &r);
+#else // CONFIG_LV_MAP
+#if !CONFIG_PVQ
+#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+ read_coef_probs(fc, cm->tx_mode, &r);
+#endif // !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+#endif // !CONFIG_PVQ
+#endif // CONFIG_LV_MAP
+
+#if CONFIG_VAR_TX
+ for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
+ av1_diff_update_prob(&r, &fc->txfm_partition_prob[k], ACCT_STR);
+#endif // CONFIG_VAR_TX
+ for (k = 0; k < SKIP_CONTEXTS; ++k)
+ av1_diff_update_prob(&r, &fc->skip_probs[k], ACCT_STR);
+
+#if CONFIG_DELTA_Q && !CONFIG_EC_ADAPT
+#if CONFIG_EXT_DELTA_Q
+ if (cm->delta_q_present_flag) {
+ for (k = 0; k < DELTA_Q_PROBS; ++k)
+ av1_diff_update_prob(&r, &fc->delta_q_prob[k], ACCT_STR);
+ }
+ if (cm->delta_lf_present_flag) {
+ for (k = 0; k < DELTA_LF_PROBS; ++k)
+ av1_diff_update_prob(&r, &fc->delta_lf_prob[k], ACCT_STR);
+ }
+#else
+ for (k = 0; k < DELTA_Q_PROBS; ++k)
+ av1_diff_update_prob(&r, &fc->delta_q_prob[k], ACCT_STR);
+#endif
+#endif
+
+#if !CONFIG_EC_ADAPT
+ if (cm->seg.enabled && cm->seg.update_map) {
+ if (cm->seg.temporal_update) {
+ for (k = 0; k < PREDICTION_PROBS; k++)
+ av1_diff_update_prob(&r, &cm->fc->seg.pred_probs[k], ACCT_STR);
+ }
+ for (k = 0; k < MAX_SEGMENTS - 1; k++)
+ av1_diff_update_prob(&r, &cm->fc->seg.tree_probs[k], ACCT_STR);
+ }
+
+ for (j = 0; j < INTRA_MODES; j++) {
+ for (i = 0; i < INTRA_MODES - 1; ++i)
+ av1_diff_update_prob(&r, &fc->uv_mode_prob[j][i], ACCT_STR);
+ }
+
+#if CONFIG_EXT_PARTITION_TYPES
+ for (j = 0; j < PARTITION_PLOFFSET; ++j)
+ for (i = 0; i < PARTITION_TYPES - 1; ++i)
+ av1_diff_update_prob(&r, &fc->partition_prob[j][i], ACCT_STR);
+ for (; j < PARTITION_CONTEXTS_PRIMARY; ++j)
+ for (i = 0; i < EXT_PARTITION_TYPES - 1; ++i)
+ av1_diff_update_prob(&r, &fc->partition_prob[j][i], ACCT_STR);
+#else
+ for (j = 0; j < PARTITION_CONTEXTS_PRIMARY; ++j)
+ for (i = 0; i < PARTITION_TYPES - 1; ++i)
+ av1_diff_update_prob(&r, &fc->partition_prob[j][i], ACCT_STR);
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_UNPOISON_PARTITION_CTX
+ for (; j < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++j)
+ av1_diff_update_prob(&r, &fc->partition_prob[j][PARTITION_VERT], ACCT_STR);
+ for (; j < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++j)
+ av1_diff_update_prob(&r, &fc->partition_prob[j][PARTITION_HORZ], ACCT_STR);
+#endif // CONFIG_UNPOISON_PARTITION_CTX
+
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ for (i = 0; i < INTRA_FILTERS + 1; ++i)
+ for (j = 0; j < INTRA_FILTERS - 1; ++j)
+ av1_diff_update_prob(&r, &fc->intra_filter_probs[i][j], ACCT_STR);
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+#endif // !CONFIG_EC_ADAPT
+
+ if (frame_is_intra_only(cm)) {
+ av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob);
+#if CONFIG_EC_MULTISYMBOL
+ av1_copy(cm->fc->kf_y_cdf, av1_kf_y_mode_cdf);
+#endif
+#if !CONFIG_EC_ADAPT
+ for (k = 0; k < INTRA_MODES; k++)
+ for (j = 0; j < INTRA_MODES; j++)
+ for (i = 0; i < INTRA_MODES - 1; ++i)
+ av1_diff_update_prob(&r, &cm->kf_y_prob[k][j][i], ACCT_STR);
+#endif
+ } else {
+#if !CONFIG_REF_MV
+ nmv_context *const nmvc = &fc->nmvc;
+#endif
+ read_inter_mode_probs(fc, &r);
+
+#if CONFIG_EXT_INTER
+ read_inter_compound_mode_probs(fc, &r);
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ if (is_interintra_allowed_bsize_group(i)) {
+ av1_diff_update_prob(&r, &fc->interintra_prob[i], ACCT_STR);
+ }
+ }
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ for (j = 0; j < INTERINTRA_MODES - 1; j++)
+ av1_diff_update_prob(&r, &fc->interintra_mode_prob[i][j], ACCT_STR);
+ }
+ for (i = 0; i < BLOCK_SIZES; i++) {
+ if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) {
+ av1_diff_update_prob(&r, &fc->wedge_interintra_prob[i], ACCT_STR);
+ }
+ }
+ }
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ for (i = 0; i < BLOCK_SIZES; i++) {
+ for (j = 0; j < COMPOUND_TYPES - 1; j++) {
+ av1_diff_update_prob(&r, &fc->compound_type_prob[i][j], ACCT_STR);
+ }
+ }
+ }
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i) {
+ for (j = 0; j < MOTION_MODES - 1; ++j)
+ av1_diff_update_prob(&r, &fc->motion_mode_prob[i][j], ACCT_STR);
+ }
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if !CONFIG_EC_ADAPT
+ if (cm->interp_filter == SWITCHABLE) read_switchable_interp_probs(fc, &r);
+#endif
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+ av1_diff_update_prob(&r, &fc->intra_inter_prob[i], ACCT_STR);
+
+ if (cm->reference_mode != SINGLE_REFERENCE)
+ setup_compound_reference_mode(cm);
+ read_frame_reference_mode_probs(cm, &r);
+
+#if !CONFIG_EC_ADAPT
+ for (j = 0; j < BLOCK_SIZE_GROUPS; j++) {
+ for (i = 0; i < INTRA_MODES - 1; ++i)
+ av1_diff_update_prob(&r, &fc->y_mode_prob[j][i], ACCT_STR);
+ }
+#endif
+
+#if CONFIG_REF_MV
+ for (i = 0; i < NMV_CONTEXTS; ++i)
+ read_mv_probs(&fc->nmvc[i], cm->allow_high_precision_mv, &r);
+#else
+ read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
+#endif
+#if !CONFIG_EC_ADAPT
+ read_ext_tx_probs(fc, &r);
+#endif // !CONFIG_EC_ADAPT
+#if CONFIG_SUPERTX
+ if (!xd->lossless[0]) read_supertx_probs(fc, &r);
+#endif
+#if CONFIG_GLOBAL_MOTION
+ read_global_motion(cm, &r);
+#endif // CONFIG_GLOBAL_MOTION
+ }
+#if CONFIG_EC_MULTISYMBOL && !CONFIG_EC_ADAPT
+#if CONFIG_NEW_TOKENSET
+ av1_coef_head_cdfs(fc);
+#endif
+ /* Make tail distribution from head */
+ av1_coef_pareto_cdfs(fc);
+#if CONFIG_REF_MV
+ for (i = 0; i < NMV_CONTEXTS; ++i) av1_set_mv_cdfs(&fc->nmvc[i]);
+#else
+ av1_set_mv_cdfs(&fc->nmvc);
+#endif
+ av1_set_mode_cdfs(cm);
+#endif // CONFIG_EC_MULTISYMBOL && !CONFIG_EC_ADAPT
+
+ return aom_reader_has_error(&r);
+}
+
+#ifdef NDEBUG
+#define debug_check_frame_counts(cm) (void)0
+#else // !NDEBUG
+// Counts should only be incremented when frame_parallel_decoding_mode and
+// error_resilient_mode are disabled.
+static void debug_check_frame_counts(const AV1_COMMON *const cm) {
+ FRAME_COUNTS zero_counts;
+ av1_zero(zero_counts);
+ assert(cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD ||
+ cm->error_resilient_mode);
+ assert(!memcmp(cm->counts.y_mode, zero_counts.y_mode,
+ sizeof(cm->counts.y_mode)));
+ assert(!memcmp(cm->counts.uv_mode, zero_counts.uv_mode,
+ sizeof(cm->counts.uv_mode)));
+ assert(!memcmp(cm->counts.partition, zero_counts.partition,
+ sizeof(cm->counts.partition)));
+ assert(!memcmp(cm->counts.coef, zero_counts.coef, sizeof(cm->counts.coef)));
+ assert(!memcmp(cm->counts.eob_branch, zero_counts.eob_branch,
+ sizeof(cm->counts.eob_branch)));
+#if CONFIG_EC_MULTISYMBOL
+ assert(!memcmp(cm->counts.blockz_count, zero_counts.blockz_count,
+ sizeof(cm->counts.blockz_count)));
+#endif
+ assert(!memcmp(cm->counts.switchable_interp, zero_counts.switchable_interp,
+ sizeof(cm->counts.switchable_interp)));
+ assert(!memcmp(cm->counts.inter_mode, zero_counts.inter_mode,
+ sizeof(cm->counts.inter_mode)));
+#if CONFIG_EXT_INTER
+ assert(!memcmp(cm->counts.inter_compound_mode,
+ zero_counts.inter_compound_mode,
+ sizeof(cm->counts.inter_compound_mode)));
+ assert(!memcmp(cm->counts.interintra, zero_counts.interintra,
+ sizeof(cm->counts.interintra)));
+ assert(!memcmp(cm->counts.wedge_interintra, zero_counts.wedge_interintra,
+ sizeof(cm->counts.wedge_interintra)));
+ assert(!memcmp(cm->counts.compound_interinter,
+ zero_counts.compound_interinter,
+ sizeof(cm->counts.compound_interinter)));
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ assert(!memcmp(cm->counts.motion_mode, zero_counts.motion_mode,
+ sizeof(cm->counts.motion_mode)));
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter,
+ sizeof(cm->counts.intra_inter)));
+ assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter,
+ sizeof(cm->counts.comp_inter)));
+ assert(!memcmp(cm->counts.single_ref, zero_counts.single_ref,
+ sizeof(cm->counts.single_ref)));
+ assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref,
+ sizeof(cm->counts.comp_ref)));
+#if CONFIG_EXT_REFS
+ assert(!memcmp(cm->counts.comp_bwdref, zero_counts.comp_bwdref,
+ sizeof(cm->counts.comp_bwdref)));
+#endif // CONFIG_EXT_REFS
+ assert(!memcmp(&cm->counts.tx_size, &zero_counts.tx_size,
+ sizeof(cm->counts.tx_size)));
+ assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
+#if CONFIG_REF_MV
+ assert(
+ !memcmp(&cm->counts.mv[0], &zero_counts.mv[0], sizeof(cm->counts.mv[0])));
+ assert(
+ !memcmp(&cm->counts.mv[1], &zero_counts.mv[1], sizeof(cm->counts.mv[0])));
+#else
+ assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv)));
+#endif
+ assert(!memcmp(cm->counts.inter_ext_tx, zero_counts.inter_ext_tx,
+ sizeof(cm->counts.inter_ext_tx)));
+ assert(!memcmp(cm->counts.intra_ext_tx, zero_counts.intra_ext_tx,
+ sizeof(cm->counts.intra_ext_tx)));
+}
+#endif // NDEBUG
+
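+// Sets up a bit reader over the uncompressed frame header. When a decrypt
+// callback is installed, up to MAX_AV1_HEADER_SIZE bytes are first decrypted
+// into the caller-provided clear_data buffer and the reader is pointed at
+// that buffer instead of the raw input.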
+static struct aom_read_bit_buffer *init_read_bit_buffer(
+ AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+ const uint8_t *data_end, uint8_t clear_data[MAX_AV1_HEADER_SIZE]) {
+ rb->bit_offset = 0;
+ rb->error_handler = error_handler;
+ rb->error_handler_data = &pbi->common;
+ if (pbi->decrypt_cb) {
+ const int n = (int)AOMMIN(MAX_AV1_HEADER_SIZE, data_end - data);
+ pbi->decrypt_cb(pbi->decrypt_state, data, clear_data, n);
+ rb->bit_buffer = clear_data;
+ rb->bit_buffer_end = clear_data + n;
+ } else {
+ rb->bit_buffer = data;
+ rb->bit_buffer_end = data_end;
+ }
+ return rb;
+}
+
+//------------------------------------------------------------------------------
+
+int av1_read_sync_code(struct aom_read_bit_buffer *const rb) {
+ return aom_rb_read_literal(rb, 8) == AV1_SYNC_CODE_0 &&
+ aom_rb_read_literal(rb, 8) == AV1_SYNC_CODE_1 &&
+ aom_rb_read_literal(rb, 8) == AV1_SYNC_CODE_2;
+}
+
+void av1_read_frame_size(struct aom_read_bit_buffer *rb, int *width,
+ int *height) {
+ *width = aom_rb_read_literal(rb, 16) + 1;
+ *height = aom_rb_read_literal(rb, 16) + 1;
+}
+
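+// The profile is coded as two raw bits, low bit first; if those bits read as
+// 3, one further bit extends the range to profiles 3 and 4.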
+BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) {
+ int profile = aom_rb_read_bit(rb);
+ profile |= aom_rb_read_bit(rb) << 1;
+ if (profile > 2) profile += aom_rb_read_bit(rb);
+ return (BITSTREAM_PROFILE)profile;
+}
+
+#if CONFIG_EC_ADAPT
+static void make_update_tile_list_dec(AV1Decoder *pbi, int tile_rows,
+ int tile_cols, FRAME_CONTEXT *ec_ctxs[]) {
+ int i;
+ for (i = 0; i < tile_rows * tile_cols; ++i)
+ ec_ctxs[i] = &pbi->tile_data[i].tctx;
+}
+#endif
+
+void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end, const uint8_t **p_data_end) {
+ AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ struct aom_read_bit_buffer rb;
+ int context_updated = 0;
+ uint8_t clear_data[MAX_AV1_HEADER_SIZE];
+ size_t first_partition_size;
+ YV12_BUFFER_CONFIG *new_fb;
+
+#if CONFIG_ADAPT_SCAN
+ av1_deliver_eob_threshold(cm, xd);
+#endif
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_set_frame_read(cm->current_video_frame * 2 + cm->show_frame);
+#endif
+
+ first_partition_size = read_uncompressed_header(
+ pbi, init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
+
+#if CONFIG_EXT_TILE
+ // If cm->tile_encoding_mode == TILE_NORMAL, the independent decoding of a
+ // single tile or a section of a frame is not allowed.
+ if (!cm->tile_encoding_mode &&
+ (pbi->dec_tile_row >= 0 || pbi->dec_tile_col >= 0)) {
+ pbi->dec_tile_row = -1;
+ pbi->dec_tile_col = -1;
+ }
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_TILE_GROUPS
+ pbi->first_partition_size = first_partition_size;
+ pbi->uncomp_hdr_size = aom_rb_bytes_read(&rb);
+#endif
+ new_fb = get_frame_new_buffer(cm);
+ xd->cur_buf = new_fb;
+#if CONFIG_GLOBAL_MOTION
+ int i;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ set_default_warp_params(&cm->global_motion[i]);
+ set_default_warp_params(&cm->cur_frame->global_motion[i]);
+ }
+ xd->global_motion = cm->global_motion;
+#endif // CONFIG_GLOBAL_MOTION
+
+ if (!first_partition_size) {
+ // showing a frame directly
+ *p_data_end = data + aom_rb_bytes_read(&rb);
+ return;
+ }
+
+ data += aom_rb_bytes_read(&rb);
+ if (!read_is_valid(data, first_partition_size, data_end))
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt header length");
+
+#if CONFIG_REF_MV
+ cm->setup_mi(cm);
+#endif
+
+#if CONFIG_TEMPMV_SIGNALING
+ if (cm->use_prev_frame_mvs) {
+ RefBuffer *last_fb_ref_buf = &cm->frame_refs[LAST_FRAME - LAST_FRAME];
+ cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_ref_buf->idx];
+ assert(!cm->error_resilient_mode &&
+ cm->width == last_fb_ref_buf->buf->y_width &&
+ cm->height == last_fb_ref_buf->buf->y_height &&
+ !cm->prev_frame->intra_only);
+ }
+#else
+ cm->use_prev_frame_mvs =
+ !cm->error_resilient_mode && cm->width == cm->last_width &&
+ cm->height == cm->last_height && !cm->last_intra_only &&
+ cm->last_show_frame && (cm->last_frame_type != KEY_FRAME);
+#endif
+#if CONFIG_EXT_REFS
+  // NOTE(zoeliu): Since cm->prev_frame can neither be a frame with
+  // show_existing_frame=1 nor a frame that is not used as a reference, it
+  // is possible that, by the time it is referred to, the frame buffer it
+  // originally pointed to has already expired and been reassigned to the
+  // current newly coded frame. Hence, we need to check whether this is the
+  // case, and if so, we have 2 choices:
+  // (1) Simply disable the use of previous frame mvs; or
+  // (2) Have cm->prev_frame point to one reference frame buffer,
+  //     e.g. LAST_FRAME.
+ if (cm->use_prev_frame_mvs && !dec_is_ref_frame_buf(pbi, cm->prev_frame)) {
+ // Reassign the LAST_FRAME buffer to cm->prev_frame.
+ RefBuffer *last_fb_ref_buf = &cm->frame_refs[LAST_FRAME - LAST_FRAME];
+ cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_ref_buf->idx];
+ }
+#endif // CONFIG_EXT_REFS
+
+ av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
+
+ *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ if (!cm->fc->initialized)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Uninitialized entropy context.");
+
+ av1_zero(cm->counts);
+
+ xd->corrupted = 0;
+ new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
+ if (new_fb->corrupted)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data header is corrupted.");
+
+ if (cm->lf.filter_level && !cm->skip_loop_filter) {
+ av1_loop_filter_frame_init(cm, cm->lf.filter_level);
+ }
+
+ // If encoded in frame parallel mode, frame context is ready after decoding
+ // the frame header.
+ if (cm->frame_parallel_decode &&
+ cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD) {
+ AVxWorker *const worker = pbi->frame_worker_owner;
+ FrameWorkerData *const frame_worker_data = worker->data1;
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD) {
+ context_updated = 1;
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+ }
+ av1_frameworker_lock_stats(worker);
+ pbi->cur_buf->row = -1;
+ pbi->cur_buf->col = -1;
+ frame_worker_data->frame_context_ready = 1;
+ // Signal the main thread that context is ready.
+ av1_frameworker_signal_stats(worker);
+ av1_frameworker_unlock_stats(worker);
+ }
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ av1_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+ cm->coef_probs_update_idx = 0;
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+ if (pbi->max_threads > 1 && !CONFIG_CB4X4 &&
+#if CONFIG_EXT_TILE
+ pbi->dec_tile_col < 0 && // Decoding all columns
+#endif // CONFIG_EXT_TILE
+ cm->tile_cols > 1) {
+ // Multi-threaded tile decoder
+ *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
+ if (!xd->corrupted) {
+ if (!cm->skip_loop_filter) {
+ // If multiple threads are used to decode tiles, then we use those
+ // threads to do parallel loopfiltering.
+ av1_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level,
+ 0, 0, pbi->tile_workers, pbi->num_tile_workers,
+ &pbi->lf_row_sync);
+ }
+ } else {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data is corrupted.");
+ }
+ } else {
+ *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
+ }
+
+#if CONFIG_CDEF
+ if (!cm->skip_loop_filter) {
+ av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
+ }
+#endif // CONFIG_CDEF
+
+#if CONFIG_LOOP_RESTORATION
+ if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+ av1_loop_restoration_frame(new_fb, cm, cm->rst_info, 7, 0, NULL);
+ }
+#endif // CONFIG_LOOP_RESTORATION
+
+ if (!xd->corrupted) {
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT **tile_ctxs = aom_malloc(cm->tile_rows * cm->tile_cols *
+ sizeof(&pbi->tile_data[0].tctx));
+ aom_cdf_prob **cdf_ptrs =
+ aom_malloc(cm->tile_rows * cm->tile_cols *
+ sizeof(&pbi->tile_data[0].tctx.partition_cdf[0][0]));
+ make_update_tile_list_dec(pbi, cm->tile_rows, cm->tile_cols, tile_ctxs);
+#endif
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ cm->partial_prob_update = 0;
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+ av1_adapt_coef_probs(cm);
+ av1_adapt_intra_frame_probs(cm);
+#if CONFIG_EC_ADAPT
+ av1_average_tile_coef_cdfs(pbi->common.fc, tile_ctxs, cdf_ptrs,
+ cm->tile_rows * cm->tile_cols);
+ av1_average_tile_intra_cdfs(pbi->common.fc, tile_ctxs, cdf_ptrs,
+ cm->tile_rows * cm->tile_cols);
+#if CONFIG_PVQ
+ av1_average_tile_pvq_cdfs(pbi->common.fc, tile_ctxs,
+ cm->tile_rows * cm->tile_cols);
+#endif // CONFIG_PVQ
+#endif // CONFIG_EC_ADAPT
+#if CONFIG_ADAPT_SCAN
+ av1_adapt_scan_order(cm);
+#endif // CONFIG_ADAPT_SCAN
+
+ if (!frame_is_intra_only(cm)) {
+ av1_adapt_inter_frame_probs(cm);
+ av1_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+#if CONFIG_EC_ADAPT
+ av1_average_tile_inter_cdfs(&pbi->common, pbi->common.fc, tile_ctxs,
+ cdf_ptrs, cm->tile_rows * cm->tile_cols);
+ av1_average_tile_mv_cdfs(pbi->common.fc, tile_ctxs, cdf_ptrs,
+ cm->tile_rows * cm->tile_cols);
+#endif
+ }
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+ } else {
+ debug_check_frame_counts(cm);
+ }
+ } else {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data is corrupted.");
+ }
+
+#if CONFIG_INSPECTION
+ if (pbi->inspect_cb != NULL) {
+ (*pbi->inspect_cb)(pbi, pbi->inspect_ctx);
+ }
+#endif
+
+ // Non frame parallel update frame context here.
+ if (!cm->error_resilient_mode && !context_updated)
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+}
diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h
new file mode 100644
index 000000000..a904658b0
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodeframe.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_DECODER_DECODEFRAME_H_
+#define AV1_DECODER_DECODEFRAME_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Decoder;
+struct aom_read_bit_buffer;
+
+#if CONFIG_REFERENCE_BUFFER
+/* Placeholder for now */
+void read_sequence_header(SequenceHeader *seq_params);
+#endif
+
+int av1_read_sync_code(struct aom_read_bit_buffer *const rb);
+void av1_read_frame_size(struct aom_read_bit_buffer *rb, int *width,
+ int *height);
+BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb);
+
+void av1_decode_frame(struct AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end, const uint8_t **p_data_end);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_DECODER_DECODEFRAME_H_
diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c
new file mode 100644
index 000000000..ec0f87751
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodemv.c
@@ -0,0 +1,2405 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#if CONFIG_EXT_INTRA
+#include "av1/common/reconintra.h"
+#endif // CONFIG_EXT_INTRA
+#include "av1/common/seg_common.h"
+#if CONFIG_WARPED_MOTION
+#include "av1/common/warped_motion.h"
+#endif // CONFIG_WARPED_MOTION
+
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decodemv.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define ACCT_STR __func__
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
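+// Reads a value in [0, n) with a quasi-uniform code: for
+// l = get_unsigned_bits(n) and m = (1 << l) - n, the first m values take
+// l - 1 bits and the remaining n - m values take l bits. For example,
+// assuming get_unsigned_bits(n) is floor(log2(n)) + 1, then for n = 5
+// (so l = 3, m = 3) the values 0..2 cost two bits and 3..4 cost three.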
+static INLINE int read_uniform(aom_reader *r, int n) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ const int v = aom_read_literal(r, l - 1, ACCT_STR);
+ assert(l != 0);
+ if (v < m)
+ return v;
+ else
+ return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR);
+}
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+
+#if CONFIG_EC_MULTISYMBOL
+static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
+ return (PREDICTION_MODE)
+ av1_intra_mode_inv[aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR)];
+}
+#else
+static PREDICTION_MODE read_intra_mode(aom_reader *r, const aom_prob *p) {
+ return (PREDICTION_MODE)aom_read_tree(r, av1_intra_mode_tree, p, ACCT_STR);
+}
+#endif
+
+#if CONFIG_DELTA_Q
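+// Decodes the delta-q signalled on the first block of a superblock. The
+// magnitude is read as a symbol (or a unary bit string) up to DELTA_Q_SMALL;
+// larger magnitudes escape to a 3-bit length field followed by that many raw
+// bits, and a sign bit follows any nonzero magnitude.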
+static int read_delta_qindex(AV1_COMMON *cm, MACROBLOCKD *xd, aom_reader *r,
+ MB_MODE_INFO *const mbmi, int mi_col, int mi_row) {
+ FRAME_COUNTS *counts = xd->counts;
+ int sign, abs, reduced_delta_qindex = 0;
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ const int b_col = mi_col & MAX_MIB_MASK;
+ const int b_row = mi_row & MAX_MIB_MASK;
+ const int read_delta_q_flag = (b_col == 0 && b_row == 0);
+ int rem_bits, thr;
+ int i, smallval;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ if ((bsize != BLOCK_LARGEST || mbmi->skip == 0) && read_delta_q_flag) {
+#if !CONFIG_EC_MULTISYMBOL
+ int bit = 1;
+ abs = 0;
+ while (abs < DELTA_Q_SMALL && bit) {
+ bit = aom_read(r, ec_ctx->delta_q_prob[abs], ACCT_STR);
+ abs += bit;
+ }
+#else
+ abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR);
+#endif
+ smallval = (abs < DELTA_Q_SMALL);
+ if (counts) {
+ for (i = 0; i < abs; ++i) counts->delta_q[i][1]++;
+ if (smallval) counts->delta_q[abs][0]++;
+ }
+
+ if (!smallval) {
+ rem_bits = aom_read_literal(r, 3, ACCT_STR);
+ thr = (1 << rem_bits) + 1;
+ abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
+ }
+
+ if (abs) {
+ sign = aom_read_bit(r, ACCT_STR);
+ } else {
+ sign = 1;
+ }
+
+ reduced_delta_qindex = sign ? -abs : abs;
+ }
+ return reduced_delta_qindex;
+}
+#if CONFIG_EXT_DELTA_Q
+static int read_delta_lflevel(AV1_COMMON *cm, MACROBLOCKD *xd, aom_reader *r,
+ MB_MODE_INFO *const mbmi, int mi_col,
+ int mi_row) {
+ FRAME_COUNTS *counts = xd->counts;
+ int sign, abs, reduced_delta_lflevel = 0;
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ const int b_col = mi_col & MAX_MIB_MASK;
+ const int b_row = mi_row & MAX_MIB_MASK;
+ const int read_delta_lf_flag = (b_col == 0 && b_row == 0);
+ int rem_bits, thr;
+ int i, smallval;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ if ((bsize != BLOCK_64X64 || mbmi->skip == 0) && read_delta_lf_flag) {
+#if !CONFIG_EC_MULTISYMBOL
+ int bit = 1;
+ abs = 0;
+ while (abs < DELTA_LF_SMALL && bit) {
+ bit = aom_read(r, ec_ctx->delta_lf_prob[abs], ACCT_STR);
+ abs += bit;
+ }
+#else
+ abs =
+ aom_read_symbol(r, ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1, ACCT_STR);
+#endif
+ smallval = (abs < DELTA_LF_SMALL);
+ if (counts) {
+ for (i = 0; i < abs; ++i) counts->delta_lf[i][1]++;
+ if (smallval) counts->delta_lf[abs][0]++;
+ }
+ if (!smallval) {
+ rem_bits = aom_read_literal(r, 3, ACCT_STR);
+ thr = (1 << rem_bits) + 1;
+ abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
+ }
+
+ if (abs) {
+ sign = aom_read_bit(r, ACCT_STR);
+ } else {
+ sign = 1;
+ }
+
+ reduced_delta_lflevel = sign ? -abs : abs;
+ }
+ return reduced_delta_lflevel;
+}
+#endif
+#endif
+
+static PREDICTION_MODE read_intra_mode_y(AV1_COMMON *cm, MACROBLOCKD *xd,
+ aom_reader *r, int size_group) {
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#elif CONFIG_EC_MULTISYMBOL
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ const PREDICTION_MODE y_mode =
+#if CONFIG_EC_MULTISYMBOL
+ read_intra_mode(r, ec_ctx->y_mode_cdf[size_group]);
+#else
+ read_intra_mode(r, cm->fc->y_mode_prob[size_group]);
+#endif
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_EC_ADAPT
+ (void)cm;
+#endif
+ if (counts) ++counts->y_mode[size_group][y_mode];
+ return y_mode;
+}
+
+static PREDICTION_MODE read_intra_mode_uv(AV1_COMMON *cm, MACROBLOCKD *xd,
+ aom_reader *r,
+ PREDICTION_MODE y_mode) {
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#elif CONFIG_EC_MULTISYMBOL
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ const PREDICTION_MODE uv_mode =
+#if CONFIG_EC_MULTISYMBOL
+ read_intra_mode(r, ec_ctx->uv_mode_cdf[y_mode]);
+#else
+ read_intra_mode(r, cm->fc->uv_mode_prob[y_mode]);
+#endif
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_EC_ADAPT
+ (void)cm;
+#endif
+ if (counts) ++counts->uv_mode[y_mode][uv_mode];
+ return uv_mode;
+}
+
+#if CONFIG_EXT_INTER
+static INTERINTRA_MODE read_interintra_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+ aom_reader *r, int size_group) {
+ const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_tree(
+ r, av1_interintra_mode_tree, cm->fc->interintra_mode_prob[size_group],
+ ACCT_STR);
+ FRAME_COUNTS *counts = xd->counts;
+ if (counts) ++counts->interintra_mode[size_group][ii_mode];
+ return ii_mode;
+}
+#endif // CONFIG_EXT_INTER
+
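+// With CONFIG_REF_MV the inter mode is decoded as a cascade of binary
+// decisions: first NEWMV versus the rest, then ZEROMV versus the rest, then
+// NEARESTMV versus NEARMV. Flag bits packed into ctx can short-circuit the
+// ZEROMV branch or remap the refmv context along the way.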
+static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd,
+ aom_reader *r, int16_t ctx) {
+#if CONFIG_REF_MV
+ FRAME_COUNTS *counts = xd->counts;
+ int16_t mode_ctx = ctx & NEWMV_CTX_MASK;
+ aom_prob mode_prob = ec_ctx->newmv_prob[mode_ctx];
+
+ if (aom_read(r, mode_prob, ACCT_STR) == 0) {
+ if (counts) ++counts->newmv_mode[mode_ctx][0];
+ return NEWMV;
+ }
+ if (counts) ++counts->newmv_mode[mode_ctx][1];
+
+ if (ctx & (1 << ALL_ZERO_FLAG_OFFSET)) return ZEROMV;
+
+ mode_ctx = (ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+
+ mode_prob = ec_ctx->zeromv_prob[mode_ctx];
+ if (aom_read(r, mode_prob, ACCT_STR) == 0) {
+ if (counts) ++counts->zeromv_mode[mode_ctx][0];
+ return ZEROMV;
+ }
+ if (counts) ++counts->zeromv_mode[mode_ctx][1];
+
+ mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+ if (ctx & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
+ if (ctx & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
+ if (ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
+
+ mode_prob = ec_ctx->refmv_prob[mode_ctx];
+
+ if (aom_read(r, mode_prob, ACCT_STR) == 0) {
+ if (counts) ++counts->refmv_mode[mode_ctx][0];
+
+ return NEARESTMV;
+ } else {
+ if (counts) ++counts->refmv_mode[mode_ctx][1];
+ return NEARMV;
+ }
+
+ // Invalid prediction mode.
+ assert(0);
+#else
+#if CONFIG_EC_MULTISYMBOL
+ const int mode = av1_inter_mode_inv[aom_read_symbol(
+ r, ec_ctx->inter_mode_cdf[ctx], INTER_MODES, ACCT_STR)];
+#else
+ const int mode = aom_read_tree(r, av1_inter_mode_tree,
+ ec_ctx->inter_mode_probs[ctx], ACCT_STR);
+#endif
+ FRAME_COUNTS *counts = xd->counts;
+ if (counts) ++counts->inter_mode[ctx][mode];
+
+ return NEARESTMV + mode;
+#endif
+}
+
+#if CONFIG_REF_MV
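+// Decodes the index into the dynamic reference MV list: while more
+// candidates remain on the ref MV stack, a per-position flag selects either
+// the current entry or advancing to the next one. NEAR-style modes start one
+// position later, past the entry consumed by NEARESTMV.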
+static void read_drl_idx(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi, aom_reader *r) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ mbmi->ref_mv_idx = 0;
+
+#if CONFIG_EXT_INTER
+ if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+#else
+ if (mbmi->mode == NEWMV) {
+#endif
+ int idx;
+ for (idx = 0; idx < 2; ++idx) {
+ if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
+ aom_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+ if (!aom_read(r, drl_prob, ACCT_STR)) {
+ mbmi->ref_mv_idx = idx;
+ if (xd->counts) ++xd->counts->drl_mode[drl_ctx][0];
+ return;
+ }
+ mbmi->ref_mv_idx = idx + 1;
+ if (xd->counts) ++xd->counts->drl_mode[drl_ctx][1];
+ }
+ }
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ int idx;
+ // Offset the NEARESTMV mode.
+ // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV
+ // mode is factored in.
+ for (idx = 1; idx < 3; ++idx) {
+ if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
+ aom_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+ if (!aom_read(r, drl_prob, ACCT_STR)) {
+ mbmi->ref_mv_idx = idx - 1;
+ if (xd->counts) ++xd->counts->drl_mode[drl_ctx][0];
+ return;
+ }
+ mbmi->ref_mv_idx = idx;
+ if (xd->counts) ++xd->counts->drl_mode[drl_ctx][1];
+ }
+ }
+ }
+}
+#endif
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
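+// Chooses among SIMPLE_TRANSLATION, OBMC_CAUSAL and (when both tools are
+// compiled in) WARPED_CAUSAL. If OBMC is the only alternative allowed for
+// this block, the choice is a single binary flag; otherwise a tree over all
+// allowed motion modes is read.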
+static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+ MODE_INFO *mi, aom_reader *r) {
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ mi);
+ int motion_mode;
+ FRAME_COUNTS *counts = xd->counts;
+
+ if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION;
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ if (last_motion_mode_allowed == OBMC_CAUSAL) {
+ motion_mode = aom_read(r, cm->fc->obmc_prob[mbmi->sb_type], ACCT_STR);
+ if (counts) ++counts->obmc[mbmi->sb_type][motion_mode];
+ return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
+ } else {
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ motion_mode =
+ aom_read_tree(r, av1_motion_mode_tree,
+ cm->fc->motion_mode_prob[mbmi->sb_type], ACCT_STR);
+ if (counts) ++counts->motion_mode[mbmi->sb_type][motion_mode];
+ return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ }
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+}
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+static PREDICTION_MODE read_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+ aom_reader *r, int16_t ctx) {
+ const int mode =
+ aom_read_tree(r, av1_inter_compound_mode_tree,
+ cm->fc->inter_compound_mode_probs[ctx], ACCT_STR);
+ FRAME_COUNTS *counts = xd->counts;
+
+ if (counts) ++counts->inter_compound_mode[ctx][mode];
+
+ assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode));
+ return NEAREST_NEARESTMV + mode;
+}
+#endif // CONFIG_EXT_INTER
+
+static int read_segment_id(aom_reader *r, struct segmentation_probs *segp) {
+#if CONFIG_EC_MULTISYMBOL
+ return aom_read_symbol(r, segp->tree_cdf, MAX_SEGMENTS, ACCT_STR);
+#else
+ return aom_read_tree(r, av1_segment_tree, segp->tree_probs, ACCT_STR);
+#endif
+}
+
+#if CONFIG_VAR_TX
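+// Recursively reads the transform partition tree of a variable-tx inter
+// block: at each node a split flag chooses between coding tx_size for the
+// whole region and recursing into four quadrants of the next smaller size,
+// bottoming out at MAX_VARTX_DEPTH or once TX_8X8 is split.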
+static void read_tx_size_vartx(AV1_COMMON *cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi, FRAME_COUNTS *counts,
+ TX_SIZE tx_size, int depth, int blk_row,
+ int blk_col, aom_reader *r) {
+ int is_split = 0;
+ const int tx_row = blk_row >> 1;
+ const int tx_col = blk_col >> 1;
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+ int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row,
+ mbmi->sb_type, tx_size);
+  TX_SIZE(*const inter_tx_size)[MAX_MIB_SIZE] =
+      (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (depth == MAX_VARTX_DEPTH) {
+ int idx, idy;
+ inter_tx_size[0][0] = tx_size;
+ for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+ for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
+ inter_tx_size[idy][idx] = tx_size;
+ mbmi->tx_size = tx_size;
+ mbmi->min_tx_size = AOMMIN(mbmi->min_tx_size, get_min_tx_size(tx_size));
+ if (counts) ++counts->txfm_partition[ctx][0];
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size, tx_size);
+ return;
+ }
+
+ is_split = aom_read(r, cm->fc->txfm_partition_prob[ctx], ACCT_STR);
+
+ if (is_split) {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ if (counts) ++counts->txfm_partition[ctx][1];
+
+ if (tx_size == TX_8X8) {
+ int idx, idy;
+ inter_tx_size[0][0] = sub_txs;
+ for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+ for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
+ inter_tx_size[idy][idx] = inter_tx_size[0][0];
+ mbmi->tx_size = sub_txs;
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, sub_txs, tx_size);
+ return;
+ }
+
+ assert(bsl > 0);
+ for (i = 0; i < 4; ++i) {
+ int offsetr = blk_row + (i >> 1) * bsl;
+ int offsetc = blk_col + (i & 0x01) * bsl;
+ read_tx_size_vartx(cm, xd, mbmi, counts, sub_txs, depth + 1, offsetr,
+ offsetc, r);
+ }
+ } else {
+ int idx, idy;
+ inter_tx_size[0][0] = tx_size;
+ for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+ for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
+ inter_tx_size[idy][idx] = tx_size;
+ mbmi->tx_size = tx_size;
+ mbmi->min_tx_size = AOMMIN(mbmi->min_tx_size, get_min_tx_size(tx_size));
+ if (counts) ++counts->txfm_partition[ctx][0];
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size, tx_size);
+ }
+}
+#endif
+
+static TX_SIZE read_selected_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd,
+ int tx_size_cat, aom_reader *r) {
+ FRAME_COUNTS *counts = xd->counts;
+ const int ctx = get_tx_size_context(xd);
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ const int depth =
+#if CONFIG_EC_MULTISYMBOL
+ aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx], tx_size_cat + 2,
+ ACCT_STR);
+#else
+ aom_read_tree(r, av1_tx_size_tree[tx_size_cat],
+ ec_ctx->tx_size_probs[tx_size_cat][ctx], ACCT_STR);
+#endif
+ const TX_SIZE tx_size = depth_to_tx_size(depth);
+#if CONFIG_RECT_TX
+ assert(!is_rect_tx(tx_size));
+#endif // CONFIG_RECT_TX
+ if (counts) ++counts->tx_size[tx_size_cat][ctx][depth];
+ return tx_size;
+}
+
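+// Returns the transform size of the current block. Lossless blocks always
+// use TX_4X4; under TX_MODE_SELECT a coded depth is mapped to a (possibly
+// rectangular) transform size; otherwise the size is implied by the
+// frame-level tx_mode and the block size.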
+static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter,
+ int allow_select_inter, aom_reader *r) {
+ const TX_MODE tx_mode = cm->tx_mode;
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return TX_4X4;
+#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
+ if (bsize > BLOCK_4X4) {
+#else
+ if (bsize >= BLOCK_8X8) {
+#endif // CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
+ if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) {
+ const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+ : intra_tx_size_cat_lookup[bsize];
+ const TX_SIZE coded_tx_size =
+ read_selected_tx_size(cm, xd, tx_size_cat, r);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (coded_tx_size > max_txsize_lookup[bsize]) {
+ assert(coded_tx_size == max_txsize_lookup[bsize] + 1);
+ return max_txsize_rect_lookup[bsize];
+ }
+#else
+ assert(coded_tx_size <= max_txsize_lookup[bsize]);
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ return coded_tx_size;
+ } else {
+ return tx_size_from_tx_mode(bsize, tx_mode, is_inter);
+ }
+ } else {
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4));
+ return max_txsize_rect_lookup[bsize];
+#else
+ return TX_4X4;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ }
+}
+
+static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids,
+ int mi_offset, int x_mis, int y_mis) {
+ int x, y, segment_id = INT_MAX;
+
+ for (y = 0; y < y_mis; y++)
+ for (x = 0; x < x_mis; x++)
+ segment_id =
+ AOMMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
+
+ assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+ return segment_id;
+}
+
+static void set_segment_id(AV1_COMMON *cm, int mi_offset, int x_mis, int y_mis,
+ int segment_id) {
+ int x, y;
+
+ assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+
+ for (y = 0; y < y_mis; y++)
+ for (x = 0; x < x_mis; x++)
+ cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
+}
+
+static int read_intra_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ int mi_offset, int x_mis, int y_mis,
+ aom_reader *r) {
+ struct segmentation *const seg = &cm->seg;
+ FRAME_COUNTS *counts = xd->counts;
+ struct segmentation_probs *const segp = &cm->fc->seg;
+ int segment_id;
+
+ if (!seg->enabled) return 0; // Default for disabled segmentation
+
+ assert(seg->update_map && !seg->temporal_update);
+
+ segment_id = read_segment_id(r, segp);
+ if (counts) ++counts->seg.tree_total[segment_id];
+ set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+ return segment_id;
+}
+
+static void copy_segment_id(const AV1_COMMON *cm,
+ const uint8_t *last_segment_ids,
+ uint8_t *current_segment_ids, int mi_offset,
+ int x_mis, int y_mis) {
+ int x, y;
+
+ for (y = 0; y < y_mis; y++)
+ for (x = 0; x < x_mis; x++)
+ current_segment_ids[mi_offset + y * cm->mi_cols + x] =
+ last_segment_ids ? last_segment_ids[mi_offset + y * cm->mi_cols + x]
+ : 0;
+}
+
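+// Reads the segment id of an inter block. If the map is not updated this
+// frame, the id is copied from the previous frame's map. With temporal
+// updates, a predicted flag signals whether the previous frame's id is
+// reused; an explicit id is decoded only on a mispredict.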
+static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ int mi_row, int mi_col, aom_reader *r) {
+ struct segmentation *const seg = &cm->seg;
+ FRAME_COUNTS *counts = xd->counts;
+ struct segmentation_probs *const segp = &cm->fc->seg;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int predicted_segment_id, segment_id;
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = mi_size_wide[mbmi->sb_type];
+ const int bh = mi_size_high[mbmi->sb_type];
+
+ // TODO(slavarnway): move x_mis, y_mis into xd ?????
+ const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int y_mis = AOMMIN(cm->mi_rows - mi_row, bh);
+
+ if (!seg->enabled) return 0; // Default for disabled segmentation
+
+ predicted_segment_id = cm->last_frame_seg_map
+ ? dec_get_segment_id(cm, cm->last_frame_seg_map,
+ mi_offset, x_mis, y_mis)
+ : 0;
+
+ if (!seg->update_map) {
+ copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+ mi_offset, x_mis, y_mis);
+ return predicted_segment_id;
+ }
+
+ if (seg->temporal_update) {
+ const int ctx = av1_get_pred_context_seg_id(xd);
+ const aom_prob pred_prob = segp->pred_probs[ctx];
+ mbmi->seg_id_predicted = aom_read(r, pred_prob, ACCT_STR);
+ if (counts) ++counts->seg.pred[ctx][mbmi->seg_id_predicted];
+ if (mbmi->seg_id_predicted) {
+ segment_id = predicted_segment_id;
+ } else {
+ segment_id = read_segment_id(r, segp);
+ if (counts) ++counts->seg.tree_mispred[segment_id];
+ }
+ } else {
+ segment_id = read_segment_id(r, segp);
+ if (counts) ++counts->seg.tree_total[segment_id];
+ }
+ set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+ return segment_id;
+}
+
+static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+ aom_reader *r) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int ctx = av1_get_skip_context(xd);
+ const int skip = aom_read(r, cm->fc->skip_probs[ctx], ACCT_STR);
+ FRAME_COUNTS *counts = xd->counts;
+ if (counts) ++counts->skip[ctx][skip];
+ return skip;
+ }
+}
+
+#if CONFIG_PALETTE
+static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *r) {
+ MODE_INFO *const mi = xd->mi[0];
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ int i, n;
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+
+ if (mbmi->mode == DC_PRED) {
+ int palette_y_mode_ctx = 0;
+ if (above_mi)
+ palette_y_mode_ctx +=
+ (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+ if (left_mi)
+ palette_y_mode_ctx +=
+ (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+ if (aom_read(r, av1_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+ [palette_y_mode_ctx],
+ ACCT_STR)) {
+ pmi->palette_size[0] =
+ aom_read_tree(r, av1_palette_size_tree,
+ av1_default_palette_y_size_prob[bsize - BLOCK_8X8],
+ ACCT_STR) +
+ 2;
+ n = pmi->palette_size[0];
+#if CONFIG_PALETTE_DELTA_ENCODING
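+      // Delta encoding: the base color is sent raw; each subsequent color of
+      // the sorted, strictly increasing palette is sent as a positive delta
+      // whose bit width starts at min_bits plus a 2-bit offset and can only
+      // shrink as the remaining value range narrows.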
+ const int min_bits = cm->bit_depth - 3;
+ int bits = min_bits + aom_read_literal(r, 2, ACCT_STR);
+ pmi->palette_colors[0] = aom_read_literal(r, cm->bit_depth, ACCT_STR);
+ for (i = 1; i < n; ++i) {
+ pmi->palette_colors[i] = pmi->palette_colors[i - 1] +
+ aom_read_literal(r, bits, ACCT_STR) + 1;
+ bits = AOMMIN(
+ bits, av1_ceil_log2((1 << cm->bit_depth) - pmi->palette_colors[i]));
+ }
+#else
+ for (i = 0; i < n; ++i)
+ pmi->palette_colors[i] = aom_read_literal(r, cm->bit_depth, ACCT_STR);
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+ xd->plane[0].color_index_map[0] = read_uniform(r, n);
+ assert(xd->plane[0].color_index_map[0] < n);
+ }
+ }
+
+ if (mbmi->uv_mode == DC_PRED) {
+ const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+ if (aom_read(r, av1_default_palette_uv_mode_prob[palette_uv_mode_ctx],
+ ACCT_STR)) {
+ pmi->palette_size[1] =
+ aom_read_tree(r, av1_palette_size_tree,
+ av1_default_palette_uv_size_prob[bsize - BLOCK_8X8],
+ ACCT_STR) +
+ 2;
+ n = pmi->palette_size[1];
+#if CONFIG_PALETTE_DELTA_ENCODING
+ // U channel colors.
+ const int min_bits_u = cm->bit_depth - 3;
+ int bits = min_bits_u + aom_read_literal(r, 2, ACCT_STR);
+ pmi->palette_colors[PALETTE_MAX_SIZE] =
+ aom_read_literal(r, cm->bit_depth, ACCT_STR);
+ for (i = 1; i < n; ++i) {
+ pmi->palette_colors[PALETTE_MAX_SIZE + i] =
+ pmi->palette_colors[PALETTE_MAX_SIZE + i - 1] +
+ aom_read_literal(r, bits, ACCT_STR);
+ bits = AOMMIN(bits,
+ av1_ceil_log2(1 + (1 << cm->bit_depth) -
+ pmi->palette_colors[PALETTE_MAX_SIZE + i]));
+ }
+ // V channel colors.
+ if (aom_read_bit(r, ACCT_STR)) { // Delta encoding.
+ const int min_bits_v = cm->bit_depth - 4;
+ const int max_val = 1 << cm->bit_depth;
+ bits = min_bits_v + aom_read_literal(r, 2, ACCT_STR);
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE] =
+ aom_read_literal(r, cm->bit_depth, ACCT_STR);
+ for (i = 1; i < n; ++i) {
+ int delta = aom_read_literal(r, bits, ACCT_STR);
+ if (delta && aom_read_bit(r, ACCT_STR)) delta = -delta;
+ int val =
+ (int)pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1] + delta;
+ if (val < 0) val += max_val;
+ if (val >= max_val) val -= max_val;
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = val;
+ }
+ } else {
+ for (i = 0; i < n; ++i) {
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
+ aom_read_literal(r, cm->bit_depth, ACCT_STR);
+ }
+ }
+#else
+ for (i = 0; i < n; ++i) {
+ pmi->palette_colors[PALETTE_MAX_SIZE + i] =
+ aom_read_literal(r, cm->bit_depth, ACCT_STR);
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
+ aom_read_literal(r, cm->bit_depth, ACCT_STR);
+ }
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+ xd->plane[1].color_index_map[0] = read_uniform(r, n);
+ assert(xd->plane[1].color_index_map[0] < n);
+ }
+ }
+}
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+static void read_filter_intra_mode_info(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, aom_reader *r) {
+ MODE_INFO *const mi = xd->mi[0];
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
+ FRAME_COUNTS *counts = xd->counts;
+ FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
+ &mbmi->filter_intra_mode_info;
+
+ if (mbmi->mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[0] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ filter_intra_mode_info->use_filter_intra_mode[0] =
+ aom_read(r, cm->fc->filter_intra_probs[0], ACCT_STR);
+ if (filter_intra_mode_info->use_filter_intra_mode[0]) {
+ filter_intra_mode_info->filter_intra_mode[0] =
+ read_uniform(r, FILTER_INTRA_MODES);
+ }
+ if (counts) {
+ ++counts
+ ->filter_intra[0][filter_intra_mode_info->use_filter_intra_mode[0]];
+ }
+ }
+ if (mbmi->uv_mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[1] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ filter_intra_mode_info->use_filter_intra_mode[1] =
+ aom_read(r, cm->fc->filter_intra_probs[1], ACCT_STR);
+ if (filter_intra_mode_info->use_filter_intra_mode[1]) {
+ filter_intra_mode_info->filter_intra_mode[1] =
+ read_uniform(r, FILTER_INTRA_MODES);
+ }
+ if (counts) {
+ ++counts
+ ->filter_intra[1][filter_intra_mode_info->use_filter_intra_mode[1]];
+ }
+ }
+}
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+static void read_intra_angle_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *r) {
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_INTRA_INTERP
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *const ec_ctx = cm->fc;
+#endif // CONFIG_EC_ADAPT
+ const int ctx = av1_get_pred_context_intra_interp(xd);
+ int p_angle;
+#endif // CONFIG_INTRA_INTERP
+
+ (void)cm;
+ if (bsize < BLOCK_8X8) return;
+
+ if (av1_is_directional_mode(mbmi->mode, bsize)) {
+ mbmi->angle_delta[0] =
+ read_uniform(r, 2 * MAX_ANGLE_DELTA + 1) - MAX_ANGLE_DELTA;
+#if CONFIG_INTRA_INTERP
+ p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle)) {
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_EC_MULTISYMBOL
+ mbmi->intra_filter = aom_read_symbol(r, ec_ctx->intra_filter_cdf[ctx],
+ INTRA_FILTERS, ACCT_STR);
+#else
+ mbmi->intra_filter = aom_read_tree(
+ r, av1_intra_filter_tree, ec_ctx->intra_filter_probs[ctx], ACCT_STR);
+#endif // CONFIG_EC_MULTISYMBOL
+ if (counts) ++counts->intra_filter[ctx][mbmi->intra_filter];
+ } else {
+ mbmi->intra_filter = INTRA_FILTER_LINEAR;
+ }
+#endif // CONFIG_INTRA_INTERP
+ }
+
+ if (av1_is_directional_mode(mbmi->uv_mode, bsize)) {
+ mbmi->angle_delta[1] =
+ read_uniform(r, 2 * MAX_ANGLE_DELTA + 1) - MAX_ANGLE_DELTA;
+ }
+}
+#endif // CONFIG_EXT_INTRA
+
+void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
+#if CONFIG_TXK_SEL
+ int block, int plane,
+#endif
+ aom_reader *r) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int inter_block = is_inter_block(mbmi);
+#if CONFIG_VAR_TX
+ const TX_SIZE tx_size = inter_block ? mbmi->min_tx_size : mbmi->tx_size;
+#else
+ const TX_SIZE tx_size = mbmi->tx_size;
+#endif
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+#if !CONFIG_TXK_SEL
+ TX_TYPE *tx_type = &mbmi->tx_type;
+#else
+  // Only the y plane's tx_type is transmitted.
+ if (plane > 0) return;
+ TX_TYPE *tx_type = &mbmi->txk_type[block];
+#endif
+
+ if (!FIXED_TX_TYPE) {
+#if CONFIG_EXT_TX
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+ if (get_ext_tx_types(tx_size, mbmi->sb_type, inter_block,
+ cm->reduced_tx_set_used) > 1 &&
+ ((!cm->seg.enabled && cm->base_qindex > 0) ||
+ (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+ !mbmi->skip &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int eset = get_ext_tx_set(tx_size, mbmi->sb_type, inter_block,
+ cm->reduced_tx_set_used);
+ FRAME_COUNTS *counts = xd->counts;
+
+ if (inter_block) {
+ if (eset > 0) {
+#if CONFIG_EC_MULTISYMBOL
+ *tx_type = av1_ext_tx_inter_inv[eset][aom_read_symbol(
+ r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+ ext_tx_cnt_inter[eset], ACCT_STR)];
+#else
+ *tx_type = aom_read_tree(
+ r, av1_ext_tx_inter_tree[eset],
+ ec_ctx->inter_ext_tx_prob[eset][square_tx_size], ACCT_STR);
+#endif
+ if (counts) ++counts->inter_ext_tx[eset][square_tx_size][*tx_type];
+ }
+ } else if (ALLOW_INTRA_EXT_TX) {
+ if (eset > 0) {
+#if CONFIG_EC_MULTISYMBOL
+ *tx_type = av1_ext_tx_intra_inv[eset][aom_read_symbol(
+ r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
+ ext_tx_cnt_intra[eset], ACCT_STR)];
+#else
+ *tx_type = aom_read_tree(
+ r, av1_ext_tx_intra_tree[eset],
+ ec_ctx->intra_ext_tx_prob[eset][square_tx_size][mbmi->mode],
+ ACCT_STR);
+#endif
+ if (counts)
+ ++counts->intra_ext_tx[eset][square_tx_size][mbmi->mode][*tx_type];
+ }
+ }
+ } else {
+ *tx_type = DCT_DCT;
+ }
+#else
+
+ if (tx_size < TX_32X32 &&
+ ((!cm->seg.enabled && cm->base_qindex > 0) ||
+ (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+ !mbmi->skip &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ FRAME_COUNTS *counts = xd->counts;
+
+ if (inter_block) {
+#if CONFIG_EC_MULTISYMBOL
+ *tx_type = av1_ext_tx_inv[aom_read_symbol(
+ r, ec_ctx->inter_ext_tx_cdf[tx_size], TX_TYPES, ACCT_STR)];
+#else
+ *tx_type = aom_read_tree(r, av1_ext_tx_tree,
+ ec_ctx->inter_ext_tx_prob[tx_size], ACCT_STR);
+#endif
+ if (counts) ++counts->inter_ext_tx[tx_size][*tx_type];
+ } else {
+ const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
+#if CONFIG_EC_MULTISYMBOL
+ *tx_type = av1_ext_tx_inv[aom_read_symbol(
+ r, ec_ctx->intra_ext_tx_cdf[tx_size][tx_type_nom], TX_TYPES,
+ ACCT_STR)];
+#else
+ *tx_type = aom_read_tree(
+ r, av1_ext_tx_tree, ec_ctx->intra_ext_tx_prob[tx_size][tx_type_nom],
+ ACCT_STR);
+#endif
+ if (counts) ++counts->intra_ext_tx[tx_size][tx_type_nom][*tx_type];
+ }
+ } else {
+ *tx_type = DCT_DCT;
+ }
+#endif // CONFIG_EXT_TX
+ }
+}
+
+#if CONFIG_INTRABC
+static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
+ nmv_context *ctx, nmv_context_counts *counts,
+ int allow_hp);
+
+static INLINE int is_mv_valid(const MV *mv);
+
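+// Reads an intra block-copy displacement vector relative to dv_ref using the
+// dedicated ndvc context at integer precision (allow_hp = 0), and reports
+// whether the resulting DV is valid for this tile position.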
+static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv,
+ const int_mv *ref_mv, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, aom_reader *r) {
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+ FRAME_COUNTS *counts = xd->counts;
+ nmv_context_counts *const dv_counts = counts ? &counts->dv : NULL;
+ read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, dv_counts, 0);
+ int valid = is_mv_valid(&mv->as_mv) &&
+ is_dv_valid(mv->as_mv, &xd->tile, mi_row, mi_col, bsize);
+ // TODO(aconverse@google.com): additional validation
+ return valid;
+}
+#endif // CONFIG_INTRABC
+
+static void read_intra_frame_mode_info(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col, aom_reader *r) {
+ MODE_INFO *const mi = xd->mi[0];
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ int i;
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+
+ // TODO(slavarnway): move x_mis, y_mis into xd ?????
+ const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int y_mis = AOMMIN(cm->mi_rows - mi_row, bh);
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#elif CONFIG_EC_MULTISYMBOL
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ mbmi->segment_id = read_intra_segment_id(cm, xd, mi_offset, x_mis, y_mis, r);
+ mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+
+#if CONFIG_DELTA_Q
+ if (cm->delta_q_present_flag) {
+ xd->current_qindex =
+ xd->prev_qindex +
+ read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
+ /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
+ xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
+ xd->prev_qindex = xd->current_qindex;
+#if CONFIG_EXT_DELTA_Q
+ if (cm->delta_lf_present_flag) {
+ mbmi->current_delta_lf_from_base = xd->current_delta_lf_from_base =
+ xd->prev_delta_lf_from_base +
+ read_delta_lflevel(cm, xd, r, mbmi, mi_col, mi_row) *
+ cm->delta_lf_res;
+ xd->prev_delta_lf_from_base = xd->current_delta_lf_from_base;
+ }
+#endif
+ }
+#endif
+
+ mbmi->tx_size = read_tx_size(cm, xd, 0, 1, r);
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+
+#if CONFIG_INTRABC
+ if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools) {
+ mbmi->use_intrabc = aom_read(r, INTRABC_PROB, ACCT_STR);
+ if (mbmi->use_intrabc) {
+ int_mv dv_ref;
+ mbmi->mode = mbmi->uv_mode = DC_PRED;
+#if CONFIG_DUAL_FILTER
+ for (int idx = 0; idx < 4; ++idx) mbmi->interp_filter[idx] = BILINEAR;
+#else
+ mbmi->interp_filter = BILINEAR;
+#endif
+ av1_find_ref_dv(&dv_ref, mi_row, mi_col);
+ xd->corrupted |=
+ !assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, mi_row, mi_col, bsize, r);
+ return;
+ }
+ }
+#endif // CONFIG_INTRABC
+
+#if CONFIG_CB4X4
+ (void)i;
+ mbmi->mode =
+#if CONFIG_EC_MULTISYMBOL
+ read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 0));
+#else
+ read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#endif
+#else
+ switch (bsize) {
+ case BLOCK_4X4:
+ for (i = 0; i < 4; ++i)
+ mi->bmi[i].as_mode =
+#if CONFIG_EC_MULTISYMBOL
+ read_intra_mode(r,
+ get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, i));
+#else
+ read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, i));
+#endif
+ mbmi->mode = mi->bmi[3].as_mode;
+ break;
+ case BLOCK_4X8:
+ mi->bmi[0].as_mode = mi->bmi[2].as_mode =
+#if CONFIG_EC_MULTISYMBOL
+ read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 0));
+#else
+ read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#endif
+ mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+#if CONFIG_EC_MULTISYMBOL
+ read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 1));
+#else
+ read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 1));
+#endif
+ break;
+ case BLOCK_8X4:
+ mi->bmi[0].as_mode = mi->bmi[1].as_mode =
+#if CONFIG_EC_MULTISYMBOL
+ read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 0));
+#else
+ read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#endif
+ mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+#if CONFIG_EC_MULTISYMBOL
+ read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 2));
+#else
+ read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 2));
+#endif
+ break;
+ default:
+ mbmi->mode =
+#if CONFIG_EC_MULTISYMBOL
+ read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 0));
+#else
+ read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#endif
+ }
+#endif
+
+#if CONFIG_CB4X4
+ if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y))
+ mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+#else
+ mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+#endif
+
+#if CONFIG_EXT_INTRA
+ read_intra_angle_info(cm, xd, r);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+ if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+ read_palette_mode_info(cm, xd, r);
+#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+ if (bsize >= BLOCK_8X8 || CONFIG_CB4X4)
+ read_filter_intra_mode_info(cm, xd, r);
+#endif // CONFIG_FILTER_INTRA
+
+#if !CONFIG_TXK_SEL
+ av1_read_tx_type(cm, xd,
+#if CONFIG_SUPERTX
+ 0,
+#endif
+ r);
+#endif // !CONFIG_TXK_SEL
+}
+
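+// Decodes one MV component as a sign plus magnitude. The magnitude combines
+// a class, integer offset bits, a 1/4-pel fraction and an optional 1/8-pel
+// high-precision bit: mag = ((d << 3) | (fr << 1) | hp) + 1, in 1/8-pel
+// units.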
+static int read_mv_component(aom_reader *r, nmv_component *mvcomp, int usehp) {
+ int mag, d, fr, hp;
+ const int sign = aom_read(r, mvcomp->sign, ACCT_STR);
+ const int mv_class =
+#if CONFIG_EC_MULTISYMBOL
+ aom_read_symbol(r, mvcomp->class_cdf, MV_CLASSES, ACCT_STR);
+#else
+ aom_read_tree(r, av1_mv_class_tree, mvcomp->classes, ACCT_STR);
+#endif
+ const int class0 = mv_class == MV_CLASS_0;
+
+ // Integer part
+ if (class0) {
+ d = aom_read(r, mvcomp->class0[0], ACCT_STR);
+ mag = 0;
+ } else {
+ int i;
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+
+ d = 0;
+ for (i = 0; i < n; ++i) d |= aom_read(r, mvcomp->bits[i], ACCT_STR) << i;
+ mag = CLASS0_SIZE << (mv_class + 2);
+ }
+
+// Fractional part
+#if CONFIG_EC_MULTISYMBOL
+ fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+ MV_FP_SIZE, ACCT_STR);
+#else
+ fr = aom_read_tree(r, av1_mv_fp_tree,
+ class0 ? mvcomp->class0_fp[d] : mvcomp->fp, ACCT_STR);
+#endif
+
+  // High-precision part (defaults to 1 when high precision is not in use)
+ hp = usehp ? aom_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp, ACCT_STR)
+ : 1;
+
+  // Result: combine integer, fractional and high-precision parts (1/8 pel)
+ mag += ((d << 3) | (fr << 1) | hp) + 1;
+ return sign ? -mag : mag;
+}
+
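+// Decodes an MV difference: the joint type signals which components are
+// nonzero, each flagged component is then decoded and added to the reference
+// vector.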
+static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
+ nmv_context *ctx, nmv_context_counts *counts,
+ int allow_hp) {
+ MV_JOINT_TYPE joint_type;
+ MV diff = { 0, 0 };
+ joint_type =
+#if CONFIG_EC_MULTISYMBOL
+ (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joint_cdf, MV_JOINTS, ACCT_STR);
+#else
+ (MV_JOINT_TYPE)aom_read_tree(r, av1_mv_joint_tree, ctx->joints, ACCT_STR);
+#endif
+
+ if (mv_joint_vertical(joint_type))
+ diff.row = read_mv_component(r, &ctx->comps[0], allow_hp);
+
+ if (mv_joint_horizontal(joint_type))
+ diff.col = read_mv_component(r, &ctx->comps[1], allow_hp);
+
+ av1_inc_mv(&diff, counts, allow_hp);
+
+ mv->row = ref->row + diff.row;
+ mv->col = ref->col + diff.col;
+}
+
+static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm,
+ const MACROBLOCKD *xd,
+ aom_reader *r) {
+#if !SUB8X8_COMP_REF
+ if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) return SINGLE_REFERENCE;
+#endif
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ const int ctx = av1_get_reference_mode_context(cm, xd);
+ const REFERENCE_MODE mode =
+ (REFERENCE_MODE)aom_read(r, cm->fc->comp_inter_prob[ctx], ACCT_STR);
+ FRAME_COUNTS *counts = xd->counts;
+ if (counts) ++counts->comp_inter[ctx][mode];
+ return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE
+ } else {
+ return cm->reference_mode;
+ }
+}
+
+// Read the reference frame(s)
+static void read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *r, int segment_id,
+ MV_REFERENCE_FRAME ref_frame[2]) {
+ FRAME_CONTEXT *const fc = cm->fc;
+ FRAME_COUNTS *counts = xd->counts;
+
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
+ SEG_LVL_REF_FRAME);
+ ref_frame[1] = NONE_FRAME;
+ } else {
+ const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
+ // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
+ if (mode == COMPOUND_REFERENCE) {
+#if CONFIG_LOWDELAY_COMPOUND // Normative in decoder (for low delay)
+ const int idx = 1;
+#else
+#if CONFIG_EXT_REFS
+ const int idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+#else
+ const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+#endif // CONFIG_EXT_REFS
+#endif
+ const int ctx = av1_get_pred_context_comp_ref_p(cm, xd);
+
+ const int bit = aom_read(r, fc->comp_ref_prob[ctx][0], ACCT_STR);
+ if (counts) ++counts->comp_ref[ctx][0][bit];
+
+#if CONFIG_EXT_REFS
+ // Decode forward references.
+ if (!bit) {
+ const int ctx1 = av1_get_pred_context_comp_ref_p1(cm, xd);
+ const int bit1 = aom_read(r, fc->comp_ref_prob[ctx1][1], ACCT_STR);
+ if (counts) ++counts->comp_ref[ctx1][1][bit1];
+ ref_frame[!idx] = cm->comp_fwd_ref[bit1 ? 0 : 1];
+ } else {
+ const int ctx2 = av1_get_pred_context_comp_ref_p2(cm, xd);
+ const int bit2 = aom_read(r, fc->comp_ref_prob[ctx2][2], ACCT_STR);
+ if (counts) ++counts->comp_ref[ctx2][2][bit2];
+ ref_frame[!idx] = cm->comp_fwd_ref[bit2 ? 3 : 2];
+ }
+
+ // Decode backward references.
+ {
+ const int ctx_bwd = av1_get_pred_context_comp_bwdref_p(cm, xd);
+ const int bit_bwd =
+ aom_read(r, fc->comp_bwdref_prob[ctx_bwd][0], ACCT_STR);
+ if (counts) ++counts->comp_bwdref[ctx_bwd][0][bit_bwd];
+ ref_frame[idx] = cm->comp_bwd_ref[bit_bwd];
+ }
+#else
+ ref_frame[!idx] = cm->comp_var_ref[bit];
+ ref_frame[idx] = cm->comp_fixed_ref;
+#endif // CONFIG_EXT_REFS
+ } else if (mode == SINGLE_REFERENCE) {
+#if CONFIG_EXT_REFS
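+      // Single-reference tree: bit0 separates the forward references
+      // {LAST, LAST2, LAST3, GOLDEN} from the backward references
+      // {BWDREF, ALTREF}; the remaining bits select within each subset.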
+ const int ctx0 = av1_get_pred_context_single_ref_p1(xd);
+ const int bit0 = aom_read(r, fc->single_ref_prob[ctx0][0], ACCT_STR);
+ if (counts) ++counts->single_ref[ctx0][0][bit0];
+
+ if (bit0) {
+ const int ctx1 = av1_get_pred_context_single_ref_p2(xd);
+ const int bit1 = aom_read(r, fc->single_ref_prob[ctx1][1], ACCT_STR);
+ if (counts) ++counts->single_ref[ctx1][1][bit1];
+ ref_frame[0] = bit1 ? ALTREF_FRAME : BWDREF_FRAME;
+ } else {
+ const int ctx2 = av1_get_pred_context_single_ref_p3(xd);
+ const int bit2 = aom_read(r, fc->single_ref_prob[ctx2][2], ACCT_STR);
+ if (counts) ++counts->single_ref[ctx2][2][bit2];
+ if (bit2) {
+ const int ctx4 = av1_get_pred_context_single_ref_p5(xd);
+ const int bit4 = aom_read(r, fc->single_ref_prob[ctx4][4], ACCT_STR);
+ if (counts) ++counts->single_ref[ctx4][4][bit4];
+ ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME;
+ } else {
+ const int ctx3 = av1_get_pred_context_single_ref_p4(xd);
+ const int bit3 = aom_read(r, fc->single_ref_prob[ctx3][3], ACCT_STR);
+ if (counts) ++counts->single_ref[ctx3][3][bit3];
+ ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
+ }
+ }
+#else
+ const int ctx0 = av1_get_pred_context_single_ref_p1(xd);
+ const int bit0 = aom_read(r, fc->single_ref_prob[ctx0][0], ACCT_STR);
+ if (counts) ++counts->single_ref[ctx0][0][bit0];
+
+ if (bit0) {
+ const int ctx1 = av1_get_pred_context_single_ref_p2(xd);
+ const int bit1 = aom_read(r, fc->single_ref_prob[ctx1][1], ACCT_STR);
+ if (counts) ++counts->single_ref[ctx1][1][bit1];
+ ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
+ } else {
+ ref_frame[0] = LAST_FRAME;
+ }
+#endif // CONFIG_EXT_REFS
+
+ ref_frame[1] = NONE_FRAME;
+ } else {
+ assert(0 && "Invalid prediction mode.");
+ }
+ }
+}
+
+static INLINE void read_mb_interp_filter(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ MB_MODE_INFO *const mbmi,
+ aom_reader *r) {
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ if (!av1_is_interp_needed(xd)) {
+ set_default_interp_filters(mbmi, cm->interp_filter);
+ return;
+ }
+
+#if CONFIG_DUAL_FILTER
+ if (cm->interp_filter != SWITCHABLE) {
+ int dir;
+
+ for (dir = 0; dir < 4; ++dir) mbmi->interp_filter[dir] = cm->interp_filter;
+ } else {
+ int dir;
+
+ for (dir = 0; dir < 2; ++dir) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ mbmi->interp_filter[dir] = EIGHTTAP_REGULAR;
+
+ if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+ (mbmi->ref_frame[1] > INTRA_FRAME &&
+ has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+#if CONFIG_EC_MULTISYMBOL
+ mbmi->interp_filter[dir] =
+ (InterpFilter)av1_switchable_interp_inv[aom_read_symbol(
+ r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS,
+ ACCT_STR)];
+#else
+ mbmi->interp_filter[dir] = (InterpFilter)aom_read_tree(
+ r, av1_switchable_interp_tree, ec_ctx->switchable_interp_prob[ctx],
+ ACCT_STR);
+#endif
+ if (counts) ++counts->switchable_interp[ctx][mbmi->interp_filter[dir]];
+ }
+ }
+ // The index system works as:
+ // (0, 1) -> (vertical, horizontal) filter types for the first ref frame.
+ // (2, 3) -> (vertical, horizontal) filter types for the second ref frame.
+ mbmi->interp_filter[2] = mbmi->interp_filter[0];
+ mbmi->interp_filter[3] = mbmi->interp_filter[1];
+ }
+#else // CONFIG_DUAL_FILTER
+ if (cm->interp_filter != SWITCHABLE) {
+ mbmi->interp_filter = cm->interp_filter;
+ } else {
+ const int ctx = av1_get_pred_context_switchable_interp(xd);
+#if CONFIG_EC_MULTISYMBOL
+ mbmi->interp_filter =
+ (InterpFilter)av1_switchable_interp_inv[aom_read_symbol(
+ r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS,
+ ACCT_STR)];
+#else
+ mbmi->interp_filter = (InterpFilter)aom_read_tree(
+ r, av1_switchable_interp_tree, ec_ctx->switchable_interp_prob[ctx],
+ ACCT_STR);
+#endif
+ if (counts) ++counts->switchable_interp[ctx][mbmi->interp_filter];
+ }
+#endif // CONFIG_DUAL_FILTER
+}
+
+static void read_intra_block_mode_info(AV1_COMMON *const cm, const int mi_row,
+ const int mi_col, MACROBLOCKD *const xd,
+ MODE_INFO *mi, aom_reader *r) {
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE bsize = mi->mbmi.sb_type;
+ int i;
+
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+
+#if CONFIG_CB4X4
+ (void)i;
+ mbmi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]);
+#else
+ switch (bsize) {
+ case BLOCK_4X4:
+ for (i = 0; i < 4; ++i)
+ mi->bmi[i].as_mode = read_intra_mode_y(cm, xd, r, 0);
+ mbmi->mode = mi->bmi[3].as_mode;
+ break;
+ case BLOCK_4X8:
+ mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, xd, r, 0);
+ mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+ read_intra_mode_y(cm, xd, r, 0);
+ break;
+ case BLOCK_8X4:
+ mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, xd, r, 0);
+ mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+ read_intra_mode_y(cm, xd, r, 0);
+ break;
+ default:
+ mbmi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]);
+ }
+#endif
+
+#if CONFIG_CB4X4
+ if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y))
+ mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+#else
+ mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+ (void)mi_row;
+ (void)mi_col;
+#endif
+
+#if CONFIG_EXT_INTRA
+ read_intra_angle_info(cm, xd, r);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+ if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+ read_palette_mode_info(cm, xd, r);
+#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+ if (bsize >= BLOCK_8X8 || CONFIG_CB4X4)
+ read_filter_intra_mode_info(cm, xd, r);
+#endif // CONFIG_FILTER_INTRA
+}
+
+static INLINE int is_mv_valid(const MV *mv) {
+ return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW &&
+ mv->col < MV_UPP;
+}
+
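+// Reads or derives the MV(s) for the given prediction mode. Returns 0 for an
+// unhandled mode and 1 only if every decoded MV lies in the valid range.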
+static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd,
+ PREDICTION_MODE mode,
+ MV_REFERENCE_FRAME ref_frame[2], int block,
+ int_mv mv[2], int_mv ref_mv[2],
+ int_mv nearest_mv[2], int_mv near_mv[2], int mi_row,
+ int mi_col, int is_compound, int allow_hp,
+ aom_reader *r) {
+ int i;
+ int ret = 1;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+#if CONFIG_REF_MV
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_CB4X4
+ int_mv *pred_mv = mbmi->pred_mv;
+ (void)block;
+#else
+ int_mv *pred_mv =
+ (bsize >= BLOCK_8X8) ? mbmi->pred_mv : xd->mi[0]->bmi[block].pred_mv;
+#endif // CONFIG_CB4X4
+#else
+ (void)block;
+#endif // CONFIG_REF_MV
+ (void)ref_frame;
+ (void)cm;
+ (void)mi_row;
+ (void)mi_col;
+ (void)bsize;
+
+ switch (mode) {
+ case NEWMV: {
+ FRAME_COUNTS *counts = xd->counts;
+#if !CONFIG_REF_MV
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+#endif
+ for (i = 0; i < 1 + is_compound; ++i) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(xd->ref_mv_count[rf_type], xd->ref_mv_stack[rf_type], i,
+ mbmi->ref_mv_idx);
+ nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
+ nmv_context_counts *const mv_counts =
+ counts ? &counts->mv[nmv_ctx] : NULL;
+#endif
+ read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, mv_counts, allow_hp);
+ ret = ret && is_mv_valid(&mv[i].as_mv);
+
+#if CONFIG_REF_MV
+ pred_mv[i].as_int = ref_mv[i].as_int;
+#endif
+ }
+ break;
+ }
+ case NEARESTMV: {
+ mv[0].as_int = nearest_mv[0].as_int;
+ if (is_compound) mv[1].as_int = nearest_mv[1].as_int;
+
+#if CONFIG_REF_MV
+ pred_mv[0].as_int = nearest_mv[0].as_int;
+ if (is_compound) pred_mv[1].as_int = nearest_mv[1].as_int;
+#endif
+ break;
+ }
+ case NEARMV: {
+ mv[0].as_int = near_mv[0].as_int;
+ if (is_compound) mv[1].as_int = near_mv[1].as_int;
+
+#if CONFIG_REF_MV
+ pred_mv[0].as_int = near_mv[0].as_int;
+ if (is_compound) pred_mv[1].as_int = near_mv[1].as_int;
+#endif
+ break;
+ }
+ case ZEROMV: {
+#if CONFIG_GLOBAL_MOTION
+ mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
+ cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, block)
+ .as_int;
+ if (is_compound)
+ mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]],
+ cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, block)
+ .as_int;
+#else
+ mv[0].as_int = 0;
+ if (is_compound) mv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+
+#if CONFIG_REF_MV
+ pred_mv[0].as_int = mv[0].as_int;
+ if (is_compound) pred_mv[1].as_int = mv[1].as_int;
+#endif
+ break;
+ }
+#if CONFIG_EXT_INTER
+ case NEW_NEWMV: {
+ FRAME_COUNTS *counts = xd->counts;
+#if !CONFIG_REF_MV
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+#endif
+ assert(is_compound);
+ for (i = 0; i < 2; ++i) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(xd->ref_mv_count[rf_type], xd->ref_mv_stack[rf_type], i,
+ mbmi->ref_mv_idx);
+ nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
+ nmv_context_counts *const mv_counts =
+ counts ? &counts->mv[nmv_ctx] : NULL;
+#endif
+ read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, mv_counts, allow_hp);
+ ret = ret && is_mv_valid(&mv[i].as_mv);
+ }
+ break;
+ }
+ case NEAREST_NEARESTMV: {
+ assert(is_compound);
+ mv[0].as_int = nearest_mv[0].as_int;
+ mv[1].as_int = nearest_mv[1].as_int;
+ break;
+ }
+ case NEAREST_NEARMV: {
+ assert(is_compound);
+ mv[0].as_int = nearest_mv[0].as_int;
+ mv[1].as_int = near_mv[1].as_int;
+ break;
+ }
+ case NEAR_NEARESTMV: {
+ assert(is_compound);
+ mv[0].as_int = near_mv[0].as_int;
+ mv[1].as_int = nearest_mv[1].as_int;
+ break;
+ }
+ case NEAR_NEARMV: {
+ assert(is_compound);
+ mv[0].as_int = near_mv[0].as_int;
+ mv[1].as_int = near_mv[1].as_int;
+ break;
+ }
+ case NEW_NEARESTMV: {
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
+ xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
+ nmv_context_counts *const mv_counts =
+ counts ? &counts->mv[nmv_ctx] : NULL;
+#else
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+#endif
+ read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp);
+ assert(is_compound);
+ ret = ret && is_mv_valid(&mv[0].as_mv);
+ mv[1].as_int = nearest_mv[1].as_int;
+ break;
+ }
+ case NEAREST_NEWMV: {
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
+ xd->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context_counts *const mv_counts =
+ counts ? &counts->mv[nmv_ctx] : NULL;
+ nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
+#else
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+#endif
+ mv[0].as_int = nearest_mv[0].as_int;
+ read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, mv_counts, allow_hp);
+ assert(is_compound);
+ ret = ret && is_mv_valid(&mv[1].as_mv);
+ break;
+ }
+ case NEAR_NEWMV: {
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
+ xd->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
+ nmv_context_counts *const mv_counts =
+ counts ? &counts->mv[nmv_ctx] : NULL;
+#else
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+#endif
+ mv[0].as_int = near_mv[0].as_int;
+ read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, mv_counts, allow_hp);
+ assert(is_compound);
+
+ ret = ret && is_mv_valid(&mv[1].as_mv);
+ break;
+ }
+ case NEW_NEARMV: {
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
+ xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
+ nmv_context_counts *const mv_counts =
+ counts ? &counts->mv[nmv_ctx] : NULL;
+#else
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+#endif
+ read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp);
+ assert(is_compound);
+ ret = ret && is_mv_valid(&mv[0].as_mv);
+ mv[1].as_int = near_mv[1].as_int;
+ break;
+ }
+ case ZERO_ZEROMV: {
+ assert(is_compound);
+#if CONFIG_GLOBAL_MOTION
+ mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
+ cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, block)
+ .as_int;
+ mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]],
+ cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, block)
+ .as_int;
+#else
+ mv[0].as_int = 0;
+ mv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ break;
+ }
+#endif // CONFIG_EXT_INTER
+ default: { return 0; }
+ }
+ return ret;
+}
+
+static int read_is_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ int segment_id, aom_reader *r) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
+ } else {
+ const int ctx = av1_get_intra_inter_context(xd);
+ const int is_inter = aom_read(r, cm->fc->intra_inter_prob[ctx], ACCT_STR);
+ FRAME_COUNTS *counts = xd->counts;
+ if (counts) ++counts->intra_inter[ctx][is_inter];
+ return is_inter;
+ }
+}
+
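+// Frame-parallel decode: wait until the previous frame has been decoded up
+// to the superblock row required for temporal MV prediction.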
+static void fpm_sync(void *const data, int mi_row) {
+ AV1Decoder *const pbi = (AV1Decoder *)data;
+ av1_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
+ mi_row << pbi->common.mib_size_log2);
+}
+
+static void read_inter_block_mode_info(AV1Decoder *const pbi,
+ MACROBLOCKD *const xd,
+ MODE_INFO *const mi,
+#if (CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION || CONFIG_EXT_INTER) && \
+ CONFIG_SUPERTX
+ int mi_row, int mi_col, aom_reader *r,
+ int supertx_enabled) {
+#else
+ int mi_row, int mi_col, aom_reader *r) {
+#endif // (CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION || CONFIG_EXT_INTER) && CONFIG_SUPERTX
+ AV1_COMMON *const cm = &pbi->common;
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int allow_hp = cm->allow_high_precision_mv;
+ const int unify_bsize = CONFIG_CB4X4;
+ int_mv nearestmv[2], nearmv[2];
+ int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+ int ref, is_compound;
+ int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ int16_t compound_inter_mode_ctx[MODE_CTX_REF_FRAMES];
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ int16_t mode_ctx = 0;
+#if CONFIG_WARPED_MOTION
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+#endif // CONFIG_WARPED_MOTION
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+
+ memset(ref_mvs, 0, sizeof(ref_mvs));
+
+ read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
+ is_compound = has_second_ref(mbmi);
+
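+  // Collect the candidate MV list for each individual reference frame.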
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+
+ av1_find_mv_refs(cm, xd, mi, frame,
+#if CONFIG_REF_MV
+ &xd->ref_mv_count[frame], xd->ref_mv_stack[frame],
+#if CONFIG_EXT_INTER
+ compound_inter_mode_ctx,
+#endif // CONFIG_EXT_INTER
+#endif
+ ref_mvs[frame], mi_row, mi_col, fpm_sync, (void *)pbi,
+ inter_mode_ctx);
+ }
+
+#if CONFIG_REF_MV
+ if (is_compound) {
+ MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
+ av1_find_mv_refs(cm, xd, mi, ref_frame, &xd->ref_mv_count[ref_frame],
+ xd->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+ compound_inter_mode_ctx,
+#endif // CONFIG_EXT_INTER
+ ref_mvs[ref_frame], mi_row, mi_col, fpm_sync, (void *)pbi,
+ inter_mode_ctx);
+
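+    // With fewer than two candidates in the stack, compare the reference MVs
+    // against the (global-motion) zero vectors and clear the all-zero
+    // mode-context flag if any candidate differs.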
+ if (xd->ref_mv_count[ref_frame] < 2) {
+ MV_REFERENCE_FRAME rf[2];
+ int_mv zeromv[2];
+ av1_set_ref_frame(rf, ref_frame);
+#if CONFIG_GLOBAL_MOTION
+ zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[rf[0]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int;
+ zeromv[1].as_int = (rf[1] != NONE_FRAME)
+ ? gm_get_motion_vector(&cm->global_motion[rf[1]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int
+ : 0;
+#else
+ zeromv[0].as_int = zeromv[1].as_int = 0;
+#endif
+ for (ref = 0; ref < 2; ++ref) {
+ if (rf[ref] == NONE_FRAME) continue;
+ lower_mv_precision(&ref_mvs[rf[ref]][0].as_mv, allow_hp);
+ lower_mv_precision(&ref_mvs[rf[ref]][1].as_mv, allow_hp);
+ if (ref_mvs[rf[ref]][0].as_int != zeromv[ref].as_int ||
+ ref_mvs[rf[ref]][1].as_int != zeromv[ref].as_int)
+ inter_mode_ctx[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET);
+ }
+ }
+ }
+
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ mode_ctx = compound_inter_mode_ctx[mbmi->ref_frame[0]];
+ else
+#endif // CONFIG_EXT_INTER
+ mode_ctx =
+ av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame, bsize, -1);
+ mbmi->ref_mv_idx = 0;
+#else
+ mode_ctx = inter_mode_ctx[mbmi->ref_frame[0]];
+#endif
+
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ mbmi->mode = ZEROMV;
+ if (bsize < BLOCK_8X8 && !unify_bsize) {
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Invalid usage of segment feature on small blocks");
+ return;
+ }
+ } else {
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ mbmi->mode = read_inter_compound_mode(cm, xd, r, mode_ctx);
+ else
+#endif // CONFIG_EXT_INTER
+ mbmi->mode = read_inter_mode(ec_ctx, xd, r, mode_ctx);
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(mbmi->mode))
+#else
+ if (mbmi->mode == NEARMV || mbmi->mode == NEWMV)
+#endif
+ read_drl_idx(cm, xd, mbmi, r);
+#endif
+ }
+ }
+
+#if CONFIG_EXT_INTER
+ if ((bsize < BLOCK_8X8 && unify_bsize) ||
+ (mbmi->mode != ZEROMV && mbmi->mode != ZERO_ZEROMV)) {
+#else
+ if ((bsize < BLOCK_8X8 && !unify_bsize) || mbmi->mode != ZEROMV) {
+#endif // CONFIG_EXT_INTER
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[ref]],
+ &nearestmv[ref], &nearmv[ref]);
+ }
+ }
+
+#if CONFIG_REF_MV
+ if (mbmi->ref_mv_idx > 0) {
+ int_mv cur_mv =
+ xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv;
+ nearmv[0] = cur_mv;
+ }
+
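+  // For compound modes, override the NEAREST/NEAR predictors with entries
+  // from the combined reference-MV stack.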
+#if CONFIG_EXT_INTER
+ if (is_compound && (bsize >= BLOCK_8X8 || unify_bsize) &&
+ mbmi->mode != ZERO_ZEROMV) {
+#else
+ if (is_compound && (bsize >= BLOCK_8X8 || unify_bsize) &&
+ mbmi->mode != NEWMV && mbmi->mode != ZEROMV) {
+#endif // CONFIG_EXT_INTER
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+
+#if CONFIG_EXT_INTER
+ if (xd->ref_mv_count[ref_frame_type] > 0) {
+#else
+ if (xd->ref_mv_count[ref_frame_type] == 1 && mbmi->mode == NEARESTMV) {
+#endif // CONFIG_EXT_INTER
+#if CONFIG_EXT_INTER
+ if (mbmi->mode == NEAREST_NEARESTMV) {
+#endif // CONFIG_EXT_INTER
+ nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
+ nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
+ lower_mv_precision(&nearestmv[0].as_mv, allow_hp);
+ lower_mv_precision(&nearestmv[1].as_mv, allow_hp);
+#if CONFIG_EXT_INTER
+ } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAREST_NEARMV) {
+ nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
+ lower_mv_precision(&nearestmv[0].as_mv, allow_hp);
+ } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEAR_NEARESTMV) {
+ nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
+ lower_mv_precision(&nearestmv[1].as_mv, allow_hp);
+ }
+#endif // CONFIG_EXT_INTER
+ }
+
+#if CONFIG_EXT_INTER
+ if (xd->ref_mv_count[ref_frame_type] > 1) {
+ int ref_mv_idx = 1 + mbmi->ref_mv_idx;
+ if (compound_ref0_mode(mbmi->mode) == NEARMV) {
+ nearmv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+ lower_mv_precision(&nearmv[0].as_mv, allow_hp);
+ }
+
+ if (compound_ref1_mode(mbmi->mode) == NEARMV) {
+ nearmv[1] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+ lower_mv_precision(&nearmv[1].as_mv, allow_hp);
+ }
+ }
+#else
+ if (xd->ref_mv_count[ref_frame_type] > 1) {
+ int ref_mv_idx = 1 + mbmi->ref_mv_idx;
+ nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
+ nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
+ nearmv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+ nearmv[1] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+ }
+#endif // CONFIG_EXT_INTER
+ }
+#endif
+
+#if !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION
+ read_mb_interp_filter(cm, xd, mbmi, r);
+#endif // !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION
+
+ if (bsize < BLOCK_8X8 && !unify_bsize) {
+ const int num_4x4_w = 1 << xd->bmode_blocks_wl;
+ const int num_4x4_h = 1 << xd->bmode_blocks_hl;
+ int idx, idy;
+ PREDICTION_MODE b_mode;
+ int_mv nearest_sub8x8[2], near_sub8x8[2];
+#if CONFIG_EXT_INTER
+ int_mv ref_mv[2][2];
+#endif // CONFIG_EXT_INTER
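+    // Sub-8x8: decode a mode and MV(s) per 4x4 sub-block, replicating the
+    // result across neighbors when the block spans two sub-blocks in either
+    // dimension.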
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ int_mv block[2];
+ const int j = idy * 2 + idx;
+ int_mv ref_mv_s8[2];
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (!is_compound)
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame,
+ bsize, j);
+#endif
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ b_mode = read_inter_compound_mode(cm, xd, r, mode_ctx);
+ else
+#endif // CONFIG_EXT_INTER
+ b_mode = read_inter_mode(ec_ctx, xd, r, mode_ctx);
+
+#if CONFIG_EXT_INTER
+ if (b_mode != ZEROMV && b_mode != ZERO_ZEROMV) {
+#else
+ if (b_mode != ZEROMV) {
+#endif // CONFIG_EXT_INTER
+#if CONFIG_REF_MV
+ CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE];
+ uint8_t ref_mv_count[2];
+#endif
+ for (ref = 0; ref < 1 + is_compound; ++ref)
+#if CONFIG_EXT_INTER
+ {
+ int_mv mv_ref_list[MAX_MV_REF_CANDIDATES];
+ av1_update_mv_context(cm, xd, mi, mbmi->ref_frame[ref], mv_ref_list,
+ j, mi_row, mi_col, NULL);
+#endif // CONFIG_EXT_INTER
+ av1_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col,
+#if CONFIG_REF_MV
+ ref_mv_stack[ref], &ref_mv_count[ref],
+#endif
+#if CONFIG_EXT_INTER
+ mv_ref_list,
+#endif // CONFIG_EXT_INTER
+ &nearest_sub8x8[ref],
+ &near_sub8x8[ref]);
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(b_mode)) {
+ mv_ref_list[0].as_int = nearest_sub8x8[ref].as_int;
+ mv_ref_list[1].as_int = near_sub8x8[ref].as_int;
+ av1_find_best_ref_mvs(allow_hp, mv_ref_list, &ref_mv[0][ref],
+ &ref_mv[1][ref]);
+ }
+ }
+#endif // CONFIG_EXT_INTER
+ }
+
+ for (ref = 0; ref < 1 + is_compound && b_mode != ZEROMV; ++ref) {
+#if CONFIG_REF_MV
+ ref_mv_s8[ref] = nearest_sub8x8[ref];
+ lower_mv_precision(&ref_mv_s8[ref].as_mv, allow_hp);
+#else
+ ref_mv_s8[ref] = nearestmv[ref];
+#endif
+ }
+#if CONFIG_EXT_INTER
+ (void)ref_mv_s8;
+#endif
+
+ if (!assign_mv(cm, xd, b_mode, mbmi->ref_frame, j, block,
+#if CONFIG_EXT_INTER
+ ref_mv[0],
+#else // !CONFIG_EXT_INTER
+ ref_mv_s8,
+#endif // CONFIG_EXT_INTER
+ nearest_sub8x8, near_sub8x8, mi_row, mi_col, is_compound,
+ allow_hp, r)) {
+ aom_merge_corrupted_flag(&xd->corrupted, 1);
+ break;
+        }
+
+ mi->bmi[j].as_mv[0].as_int = block[0].as_int;
+ mi->bmi[j].as_mode = b_mode;
+ if (is_compound) mi->bmi[j].as_mv[1].as_int = block[1].as_int;
+
+ if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j];
+ if (num_4x4_w == 2) mi->bmi[j + 1] = mi->bmi[j];
+ }
+ }
+
+#if CONFIG_REF_MV
+ mbmi->pred_mv[0].as_int = mi->bmi[3].pred_mv[0].as_int;
+ mbmi->pred_mv[1].as_int = mi->bmi[3].pred_mv[1].as_int;
+#endif
+ mi->mbmi.mode = b_mode;
+
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ } else {
+ int_mv ref_mv[2];
+ ref_mv[0] = nearestmv[0];
+ ref_mv[1] = nearestmv[1];
+
+#if CONFIG_EXT_INTER
+ if (is_compound) {
+#if CONFIG_REF_MV
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ // Special case: NEAR_NEWMV and NEW_NEARMV modes use
+ // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
+ // mbmi->ref_mv_idx (like NEWMV)
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
+ ref_mv_idx = 1 + mbmi->ref_mv_idx;
+#endif
+
+ if (compound_ref0_mode(mbmi->mode) == NEWMV) {
+#if CONFIG_REF_MV
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (xd->ref_mv_count[ref_frame_type] > 1) {
+ ref_mv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+ clamp_mv_ref(&ref_mv[0].as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ }
+#endif
+ nearestmv[0] = ref_mv[0];
+ }
+ if (compound_ref1_mode(mbmi->mode) == NEWMV) {
+#if CONFIG_REF_MV
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (xd->ref_mv_count[ref_frame_type] > 1) {
+ ref_mv[1] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+ clamp_mv_ref(&ref_mv[1].as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ }
+#endif
+ nearestmv[1] = ref_mv[1];
+ }
+ } else {
+#endif // CONFIG_EXT_INTER
+ if (mbmi->mode == NEWMV) {
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+#if CONFIG_REF_MV
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (xd->ref_mv_count[ref_frame_type] > 1) {
+ ref_mv[ref] =
+ (ref == 0)
+ ? xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx].this_mv
+ : xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+ .comp_mv;
+ clamp_mv_ref(&ref_mv[ref].as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ }
+#endif
+ nearestmv[ref] = ref_mv[ref];
+ }
+ }
+#if CONFIG_EXT_INTER
+ }
+#endif // CONFIG_EXT_INTER
+
+ int mv_corrupted_flag =
+ !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, 0, mbmi->mv, ref_mv,
+ nearestmv, nearmv, mi_row, mi_col, is_compound, allow_hp, r);
+ aom_merge_corrupted_flag(&xd->corrupted, mv_corrupted_flag);
+ }
+
+#if CONFIG_EXT_INTER
+ mbmi->use_wedge_interintra = 0;
+ if (cm->reference_mode != COMPOUND_REFERENCE &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif
+ is_interintra_allowed(mbmi)) {
+ const int bsize_group = size_group_lookup[bsize];
+ const int interintra =
+ aom_read(r, cm->fc->interintra_prob[bsize_group], ACCT_STR);
+ if (xd->counts) xd->counts->interintra[bsize_group][interintra]++;
+ assert(mbmi->ref_frame[1] == NONE_FRAME);
+ if (interintra) {
+ const INTERINTRA_MODE interintra_mode =
+ read_interintra_mode(cm, xd, r, bsize_group);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ mbmi->interintra_mode = interintra_mode;
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[0] = 0;
+ mbmi->angle_delta[1] = 0;
+#if CONFIG_INTRA_INTERP
+ mbmi->intra_filter = INTRA_FILTER_LINEAR;
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ if (is_interintra_wedge_used(bsize)) {
+ mbmi->use_wedge_interintra =
+ aom_read(r, cm->fc->wedge_interintra_prob[bsize], ACCT_STR);
+ if (xd->counts)
+ xd->counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+ if (mbmi->use_wedge_interintra) {
+ mbmi->interintra_wedge_index =
+ aom_read_literal(r, get_wedge_bits_lookup(bsize), ACCT_STR);
+ mbmi->interintra_wedge_sign = 0;
+ }
+ }
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+#if CONFIG_WARPED_MOTION
+ if (mbmi->sb_type >= BLOCK_8X8 && !has_second_ref(mbmi))
+ mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+#endif // CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR
+ av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+#endif
+
+#if CONFIG_SUPERTX
+ if (!supertx_enabled) {
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_INTER
+ if (mbmi->ref_frame[1] != INTRA_FRAME)
+#endif // CONFIG_EXT_INTER
+ mbmi->motion_mode = read_motion_mode(cm, xd, mi, r);
+#if CONFIG_WARPED_MOTION
+ if (mbmi->motion_mode == WARPED_CAUSAL) {
+ mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
+ if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
+ mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+ &mbmi->wm_params[0], mi_row, mi_col)) {
+ assert(0 && "Invalid Warped Model.");
+ }
+ }
+#endif // CONFIG_WARPED_MOTION
+#if CONFIG_SUPERTX
+ }
+#endif // CONFIG_SUPERTX
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+ mbmi->interinter_compound_type = COMPOUND_AVERAGE;
+ if (cm->reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode)
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ && mbmi->motion_mode == SIMPLE_TRANSLATION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ ) {
+ if (is_any_masked_compound_used(bsize)) {
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+ mbmi->interinter_compound_type =
+ aom_read_tree(r, av1_compound_type_tree,
+ cm->fc->compound_type_prob[bsize], ACCT_STR);
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#if CONFIG_WEDGE
+ if (mbmi->interinter_compound_type == COMPOUND_WEDGE) {
+ mbmi->wedge_index =
+ aom_read_literal(r, get_wedge_bits_lookup(bsize), ACCT_STR);
+ mbmi->wedge_sign = aom_read_bit(r, ACCT_STR);
+ }
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ if (mbmi->interinter_compound_type == COMPOUND_SEG) {
+ mbmi->mask_type = aom_read_literal(r, MAX_SEG_MASK_BITS, ACCT_STR);
+ }
+#endif // CONFIG_COMPOUND_SEGMENT
+ } else {
+ mbmi->interinter_compound_type = COMPOUND_AVERAGE;
+ }
+ if (xd->counts)
+ xd->counts->compound_interinter[bsize][mbmi->interinter_compound_type]++;
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
+ read_mb_interp_filter(cm, xd, mbmi, r);
+#endif // CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
+}
+
+static void read_inter_frame_mode_info(AV1Decoder *const pbi,
+ MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif // CONFIG_SUPERTX
+ int mi_row, int mi_col, aom_reader *r) {
+ AV1_COMMON *const cm = &pbi->common;
+ MODE_INFO *const mi = xd->mi[0];
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
+ int inter_block = 1;
+#if CONFIG_VAR_TX
+ BLOCK_SIZE bsize = mbmi->sb_type;
+#endif // CONFIG_VAR_TX
+
+ mbmi->mv[0].as_int = 0;
+ mbmi->mv[1].as_int = 0;
+ mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+ mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+
+#if CONFIG_DELTA_Q
+ if (cm->delta_q_present_flag) {
+ xd->current_qindex =
+ xd->prev_qindex +
+ read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
+ /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
+ xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
+ xd->prev_qindex = xd->current_qindex;
+#if CONFIG_EXT_DELTA_Q
+ if (cm->delta_lf_present_flag) {
+ mbmi->current_delta_lf_from_base = xd->current_delta_lf_from_base =
+ xd->prev_delta_lf_from_base +
+ read_delta_lflevel(cm, xd, r, mbmi, mi_col, mi_row) *
+ cm->delta_lf_res;
+ xd->prev_delta_lf_from_base = xd->current_delta_lf_from_base;
+ }
+#endif
+ }
+#endif
+
+#if CONFIG_SUPERTX
+ if (!supertx_enabled) {
+#endif // CONFIG_SUPERTX
+ inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
+
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ if (cm->tx_mode == TX_MODE_SELECT &&
+#if CONFIG_CB4X4
+ bsize > BLOCK_4X4 &&
+#else
+ bsize >= BLOCK_8X8 &&
+#endif
+ !mbmi->skip && inter_block) {
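+      // TX_MODE_SELECT: transform sizes are coded as a partition tree; walk
+      // the block in max_tx_size units and read one tree per unit.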
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+      const int height = block_size_high[bsize] >> tx_size_high_log2[0];
+ int idx, idy;
+
+ mbmi->min_tx_size = TX_SIZES_ALL;
+ for (idy = 0; idy < height; idy += bh)
+ for (idx = 0; idx < width; idx += bw)
+ read_tx_size_vartx(cm, xd, mbmi, xd->counts, max_tx_size,
+ height != width, idy, idx, r);
+ } else {
+ mbmi->tx_size = read_tx_size(cm, xd, inter_block, !mbmi->skip, r);
+
+ if (inter_block) {
+ const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int height = block_size_high[bsize] >> tx_size_high_log2[0];
+ int idx, idy;
+ for (idy = 0; idy < height; ++idy)
+ for (idx = 0; idx < width; ++idx)
+ mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size;
+ }
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+ set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, mbmi->skip, xd);
+ }
+#else
+ mbmi->tx_size = read_tx_size(cm, xd, inter_block, !mbmi->skip, r);
+#endif // CONFIG_VAR_TX
+#if CONFIG_SUPERTX
+ }
+#if CONFIG_VAR_TX
+ else if (inter_block) {
+ const int width = num_4x4_blocks_wide_lookup[bsize];
+ const int height = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ xd->mi[0]->mbmi.tx_size = xd->supertx_size;
+ for (idy = 0; idy < height; ++idy)
+ for (idx = 0; idx < width; ++idx)
+ xd->mi[0]->mbmi.inter_tx_size[idy >> 1][idx >> 1] = xd->supertx_size;
+ }
+#endif // CONFIG_VAR_TX
+#endif // CONFIG_SUPERTX
+
+ if (inter_block)
+ read_inter_block_mode_info(pbi, xd,
+#if (CONFIG_MOTION_VAR || CONFIG_EXT_INTER || CONFIG_WARPED_MOTION) && \
+ CONFIG_SUPERTX
+ mi, mi_row, mi_col, r, supertx_enabled);
+#else
+ mi, mi_row, mi_col, r);
+#endif // (CONFIG_MOTION_VAR || CONFIG_EXT_INTER || CONFIG_WARPED_MOTION) && CONFIG_SUPERTX
+ else
+ read_intra_block_mode_info(cm, mi_row, mi_col, xd, mi, r);
+
+#if !CONFIG_TXK_SEL
+ av1_read_tx_type(cm, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ r);
+#endif // !CONFIG_TXK_SEL
+}
+
+void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif // CONFIG_SUPERTX
+ int mi_row, int mi_col, aom_reader *r, int x_mis,
+ int y_mis) {
+ AV1_COMMON *const cm = &pbi->common;
+ MODE_INFO *const mi = xd->mi[0];
+ MV_REF *frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
+#if CONFIG_INTRABC
+ mi->mbmi.use_intrabc = 0;
+#endif // CONFIG_INTRABC
+
+ if (frame_is_intra_only(cm)) {
+ read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+#if CONFIG_REF_MV
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = NONE_FRAME;
+ mv->ref_frame[1] = NONE_FRAME;
+ }
+ }
+#endif
+ } else {
+ read_inter_frame_mode_info(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r);
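+    // Store the decoded reference frames and MVs so later frames can use
+    // them as temporal MV predictors.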
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+#if CONFIG_REF_MV
+ mv->pred_mv[0].as_int = mi->mbmi.pred_mv[0].as_int;
+ mv->pred_mv[1].as_int = mi->mbmi.pred_mv[1].as_int;
+#endif
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/decoder/decodemv.h b/third_party/aom/av1/decoder/decodemv.h
new file mode 100644
index 000000000..ceaee1d6b
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodemv.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_DECODER_DECODEMV_H_
+#define AV1_DECODER_DECODEMV_H_
+
+#include "aom_dsp/bitreader.h"
+
+#include "av1/decoder/decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
+ int mi_row, int mi_col, aom_reader *r, int x_mis,
+ int y_mis);
+
+void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+#if CONFIG_SUPERTX
+                      int supertx_enabled,
+#endif
+#if CONFIG_TXK_SEL
+                      int block, int plane,
+#endif
+                      aom_reader *r);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_DECODER_DECODEMV_H_
diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c
new file mode 100644
index 000000000..1bd91086e
--- /dev/null
+++ b/third_party/aom/av1/decoder/decoder.c
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+#include "./aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_ports/aom_once.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decoder.h"
+
+#if !CONFIG_PVQ
+#include "av1/decoder/detokenize.h"
+#endif
+
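+// One-time global initialization: RTCD dispatch tables, intra predictors,
+// wedge masks and (for multisymbol coding) symbol-index tables derived from
+// the coding trees.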
+static void initialize_dec(void) {
+ static volatile int init_done = 0;
+
+ if (!init_done) {
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+#if CONFIG_EXT_INTER
+ av1_init_wedge_masks();
+#endif // CONFIG_EXT_INTER
+ init_done = 1;
+#if CONFIG_EC_MULTISYMBOL
+ av1_indices_from_tree(av1_intra_mode_ind, av1_intra_mode_inv,
+ av1_intra_mode_tree);
+ av1_indices_from_tree(av1_switchable_interp_ind, av1_switchable_interp_inv,
+ av1_switchable_interp_tree);
+#if CONFIG_EXT_TX
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s)
+ av1_indices_from_tree(av1_ext_tx_intra_ind[s], av1_ext_tx_intra_inv[s],
+ av1_ext_tx_intra_tree[s]);
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s)
+ av1_indices_from_tree(av1_ext_tx_inter_ind[s], av1_ext_tx_inter_inv[s],
+ av1_ext_tx_inter_tree[s]);
+#else
+ av1_indices_from_tree(av1_ext_tx_ind, av1_ext_tx_inv, av1_ext_tx_tree);
+#endif
+ av1_indices_from_tree(av1_inter_mode_ind, av1_inter_mode_inv,
+ av1_inter_mode_tree);
+#endif
+ }
+}
+
+static void av1_dec_setup_mi(AV1_COMMON *cm) {
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+ memset(cm->mi_grid_base, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+static int av1_dec_alloc_mi(AV1_COMMON *cm, int mi_size) {
+ cm->mip = aom_calloc(mi_size, sizeof(*cm->mip));
+ if (!cm->mip) return 1;
+ cm->mi_alloc_size = mi_size;
+ cm->mi_grid_base = (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *));
+ if (!cm->mi_grid_base) return 1;
+ return 0;
+}
+
+static void av1_dec_free_mi(AV1_COMMON *cm) {
+ aom_free(cm->mip);
+ cm->mip = NULL;
+ aom_free(cm->mi_grid_base);
+ cm->mi_grid_base = NULL;
+}
+
+AV1Decoder *av1_decoder_create(BufferPool *const pool) {
+ AV1Decoder *volatile const pbi = aom_memalign(32, sizeof(*pbi));
+ AV1_COMMON *volatile const cm = pbi ? &pbi->common : NULL;
+
+ if (!cm) return NULL;
+
+ av1_zero(*pbi);
+
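+  // Allocation failures below longjmp back here; tear down the partially
+  // initialized decoder and report failure.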
+ if (setjmp(cm->error.jmp)) {
+ cm->error.setjmp = 0;
+ av1_decoder_remove(pbi);
+ return NULL;
+ }
+
+ cm->error.setjmp = 1;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(cm, cm->frame_contexts,
+ (FRAME_CONTEXT *)aom_memalign(
+ 32, FRAME_CONTEXTS * sizeof(*cm->frame_contexts)));
+ memset(cm->fc, 0, sizeof(*cm->fc));
+ memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts));
+
+ pbi->need_resync = 1;
+ once(initialize_dec);
+
+ // Initialize the references to not point to any frame buffers.
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+
+ cm->current_video_frame = 0;
+ pbi->ready_for_new_data = 1;
+ pbi->common.buffer_pool = pool;
+
+ cm->bit_depth = AOM_BITS_8;
+ cm->dequant_bit_depth = AOM_BITS_8;
+
+ cm->alloc_mi = av1_dec_alloc_mi;
+ cm->free_mi = av1_dec_free_mi;
+ cm->setup_mi = av1_dec_setup_mi;
+
+ av1_loop_filter_init(cm);
+
+#if CONFIG_AOM_QM
+ aom_qm_init(cm);
+#endif
+#if CONFIG_LOOP_RESTORATION
+ av1_loop_restoration_precal();
+#endif // CONFIG_LOOP_RESTORATION
+#if CONFIG_ACCOUNTING
+ pbi->acct_enabled = 1;
+ aom_accounting_init(&pbi->accounting);
+#endif
+
+ cm->error.setjmp = 0;
+
+ aom_get_worker_interface()->init(&pbi->lf_worker);
+
+ return pbi;
+}
+
+void av1_decoder_remove(AV1Decoder *pbi) {
+ int i;
+
+ if (!pbi) return;
+
+ aom_get_worker_interface()->end(&pbi->lf_worker);
+ aom_free(pbi->lf_worker.data1);
+ aom_free(pbi->tile_data);
+ for (i = 0; i < pbi->num_tile_workers; ++i) {
+ AVxWorker *const worker = &pbi->tile_workers[i];
+ aom_get_worker_interface()->end(worker);
+ }
+ aom_free(pbi->tile_worker_data);
+ aom_free(pbi->tile_worker_info);
+ aom_free(pbi->tile_workers);
+
+ if (pbi->num_tile_workers > 0) {
+ av1_loop_filter_dealloc(&pbi->lf_row_sync);
+ }
+
+#if CONFIG_ACCOUNTING
+ aom_accounting_clear(&pbi->accounting);
+#endif
+
+ aom_free(pbi);
+}
+
+static int equal_dimensions(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ return a->y_height == b->y_height && a->y_width == b->y_width &&
+ a->uv_height == b->uv_height && a->uv_width == b->uv_width;
+}
+
+aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi,
+ AOM_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *cm = &pbi->common;
+
+ /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
+ * encoder is using the frame buffers for. This is just a stub to keep the
+ * aomenc --test-decode functionality working, and will be replaced in a
+ * later commit that adds AV1-specific controls for this functionality.
+ */
+ if (ref_frame_flag == AOM_LAST_FLAG) {
+ const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, 0);
+ if (cfg == NULL) {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "No 'last' reference frame");
+ return AOM_CODEC_ERROR;
+ }
+ if (!equal_dimensions(cfg, sd))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ else
+ aom_yv12_copy_frame(cfg, sd);
+ } else {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR, "Invalid reference frame");
+ }
+
+ return cm->error.error_code;
+}
+
+aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm,
+ AOM_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd) {
+ int idx;
+ YV12_BUFFER_CONFIG *ref_buf = NULL;
+
+ // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
+ // encoder is using the frame buffers for. This is just a stub to keep the
+ // aomenc --test-decode functionality working, and will be replaced in a
+ // later commit that adds AV1-specific controls for this functionality.
+
+  // (Yunqing) The set_reference control depends on the following settings in
+  // the encoder.
+ // cpi->lst_fb_idx = 0;
+ // #if CONFIG_EXT_REFS
+ // cpi->lst2_fb_idx = 1;
+ // cpi->lst3_fb_idx = 2;
+ // cpi->gld_fb_idx = 3;
+ // cpi->bwd_fb_idx = 4;
+ // cpi->alt_fb_idx = 5;
+ // #else // CONFIG_EXT_REFS
+ // cpi->gld_fb_idx = 1;
+ // cpi->alt_fb_idx = 2;
+ // #endif // CONFIG_EXT_REFS
+
+  // TODO(zoeliu): Revisit the following code and reconsider what assumptions
+  // we may make about the reference frame buffer virtual indexes.
+ if (ref_frame_flag == AOM_LAST_FLAG) {
+ idx = cm->ref_frame_map[0];
+#if CONFIG_EXT_REFS
+ } else if (ref_frame_flag == AOM_LAST2_FLAG) {
+ idx = cm->ref_frame_map[1];
+ } else if (ref_frame_flag == AOM_LAST3_FLAG) {
+ idx = cm->ref_frame_map[2];
+ } else if (ref_frame_flag == AOM_GOLD_FLAG) {
+ idx = cm->ref_frame_map[3];
+ } else if (ref_frame_flag == AOM_BWD_FLAG) {
+ idx = cm->ref_frame_map[4];
+ } else if (ref_frame_flag == AOM_ALT_FLAG) {
+ idx = cm->ref_frame_map[5];
+#else
+ } else if (ref_frame_flag == AOM_GOLD_FLAG) {
+ idx = cm->ref_frame_map[1];
+ } else if (ref_frame_flag == AOM_ALT_FLAG) {
+ idx = cm->ref_frame_map[2];
+#endif // CONFIG_EXT_REFS
+ } else {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR, "Invalid reference frame");
+ return cm->error.error_code;
+ }
+
+ if (idx < 0 || idx >= FRAME_BUFFERS) {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Invalid reference frame map");
+ return cm->error.error_code;
+ }
+
+ // Get the destination reference buffer.
+ ref_buf = &cm->buffer_pool->frame_bufs[idx].buf;
+
+ if (!equal_dimensions(ref_buf, sd)) {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ } else {
+ // Overwrite the reference frame buffer.
+ aom_yv12_copy_frame(sd, ref_buf);
+ }
+
+ return cm->error.error_code;
+}
+
+/* If any buffer updating is signaled it should be done here. */
+static void swap_frame_buffers(AV1Decoder *pbi) {
+ int ref_index = 0, mask;
+ AV1_COMMON *const cm = &pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+
+ lock_buffer_pool(pool);
+ for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+    // The current thread releases its hold on the reference frame.
+ decrease_ref_count(old_idx, frame_bufs, pool);
+
+    // If this slot is being refreshed, also release the hold taken by the
+    // reference map so the slot can be updated for the next frame's decode.
+ if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool);
+ cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+ ++ref_index;
+ }
+
+  // The current thread releases its hold on the reference frame.
+ for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+ decrease_ref_count(old_idx, frame_bufs, pool);
+ cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+ }
+
+ unlock_buffer_pool(pool);
+ pbi->hold_ref_buf = 0;
+ cm->frame_to_show = get_frame_new_buffer(cm);
+
+  // TODO(zoeliu): To fix the ref frame buffer update for the scenario of
+  //               cm->frame_parallel_decode == 1
+ if (!cm->frame_parallel_decode || !cm->show_frame) {
+ lock_buffer_pool(pool);
+ --frame_bufs[cm->new_fb_idx].ref_count;
+ unlock_buffer_pool(pool);
+ }
+
+ // Invalidate these references until the next frame starts.
+ for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
+ cm->frame_refs[ref_index].idx = INVALID_IDX;
+ cm->frame_refs[ref_index].buf = NULL;
+ }
+}
+
+int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
+ const uint8_t **psource) {
+ AV1_COMMON *volatile const cm = &pbi->common;
+ BufferPool *volatile const pool = cm->buffer_pool;
+ RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs;
+ const uint8_t *source = *psource;
+ int retcode = 0;
+ cm->error.error_code = AOM_CODEC_OK;
+
+ if (size == 0) {
+ // This is used to signal that we are missing frames.
+ // We do not know if the missing frame(s) was supposed to update
+    // We do not know if the missing frame(s) were supposed to update
+    // any of the reference buffers, but we act conservatively and
+ //
+ // TODO(jkoleszar): Error concealment is undefined and non-normative
+ // at this point, but if it becomes so, [0] may not always be the correct
+ // thing to do here.
+ if (cm->frame_refs[0].idx > 0) {
+ assert(cm->frame_refs[0].buf != NULL);
+ cm->frame_refs[0].buf->corrupted = 1;
+ }
+ }
+
+ pbi->ready_for_new_data = 0;
+
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+
+ // Check if the previous frame was a frame without any references to it.
+ // Release frame buffer if not decoding in frame parallel mode.
+ if (!cm->frame_parallel_decode && cm->new_fb_idx >= 0 &&
+ frame_bufs[cm->new_fb_idx].ref_count == 0)
+ pool->release_fb_cb(pool->cb_priv,
+ &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+
+  // Find a free frame buffer. Return an error if none can be found.
+ cm->new_fb_idx = get_free_fb(cm);
+ if (cm->new_fb_idx == INVALID_IDX) return AOM_CODEC_MEM_ERROR;
+
+  // Assign an MV array to the frame buffer.
+ cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
+ pbi->hold_ref_buf = 0;
+ if (cm->frame_parallel_decode) {
+ AVxWorker *const worker = pbi->frame_worker_owner;
+ av1_frameworker_lock_stats(worker);
+ frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
+ // Reset decoding progress.
+ pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+ pbi->cur_buf->row = -1;
+ pbi->cur_buf->col = -1;
+ av1_frameworker_unlock_stats(worker);
+ } else {
+ pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+ }
+
+ if (setjmp(cm->error.jmp)) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int i;
+
+ cm->error.setjmp = 0;
+ pbi->ready_for_new_data = 1;
+
+ // Synchronize all threads immediately as a subsequent decode call may
+ // cause a resize invalidating some allocations.
+ winterface->sync(&pbi->lf_worker);
+ for (i = 0; i < pbi->num_tile_workers; ++i) {
+ winterface->sync(&pbi->tile_workers[i]);
+ }
+
+ lock_buffer_pool(pool);
+ // Release all the reference buffers if worker thread is holding them.
+ if (pbi->hold_ref_buf == 1) {
+ int ref_index = 0, mask;
+ for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+        // The current thread releases its hold on the reference frame.
+ decrease_ref_count(old_idx, frame_bufs, pool);
+
+        // If this slot is being refreshed, also release the hold taken by
+        // the reference map so the slot can be updated for the next frame.
+ if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool);
+ ++ref_index;
+ }
+
+      // The current thread releases its hold on the reference frame.
+ for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+ decrease_ref_count(old_idx, frame_bufs, pool);
+ }
+ pbi->hold_ref_buf = 0;
+ }
+ // Release current frame.
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+
+ aom_clear_system_state();
+ return -1;
+ }
+
+ cm->error.setjmp = 1;
+ av1_decode_frame(pbi, source, source + size, psource);
+
+ swap_frame_buffers(pbi);
+
+#if CONFIG_EXT_TILE
+  // For now, we only extend the frame borders when the whole frame is decoded.
+  // Later, if needed, extend the border only for the decoded tiles on the
+  // frame border.
+ if (pbi->dec_tile_row == -1 && pbi->dec_tile_col == -1)
+#endif // CONFIG_EXT_TILE
+ aom_extend_frame_inner_borders(cm->frame_to_show);
+
+ aom_clear_system_state();
+
+ if (!cm->show_existing_frame) {
+ cm->last_show_frame = cm->show_frame;
+
+#if CONFIG_EXT_REFS
+    // NOTE: We are not supposed to reference any frame that is not used as
+    // a reference.
+ if (cm->is_reference_frame)
+#endif // CONFIG_EXT_REFS
+ cm->prev_frame = cm->cur_frame;
+
+ if (cm->seg.enabled && !cm->frame_parallel_decode)
+ av1_swap_current_and_last_seg_map(cm);
+ }
+
+ // Update progress in frame parallel decode.
+ if (cm->frame_parallel_decode) {
+ // Need to lock the mutex here as another thread may
+ // be accessing this buffer.
+ AVxWorker *const worker = pbi->frame_worker_owner;
+ FrameWorkerData *const frame_worker_data = worker->data1;
+ av1_frameworker_lock_stats(worker);
+
+ if (cm->show_frame) {
+ cm->current_video_frame++;
+ }
+ frame_worker_data->frame_decoded = 1;
+ frame_worker_data->frame_context_ready = 1;
+ av1_frameworker_signal_stats(worker);
+ av1_frameworker_unlock_stats(worker);
+ } else {
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+ if (cm->show_frame) {
+ cm->current_video_frame++;
+ }
+ }
+
+ cm->error.setjmp = 0;
+ return retcode;
+}
+
+int av1_get_raw_frame(AV1Decoder *pbi, YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *const cm = &pbi->common;
+ int ret = -1;
+ if (pbi->ready_for_new_data == 1) return ret;
+
+ pbi->ready_for_new_data = 1;
+
+  /* No raw frame to show. */
+  if (!cm->show_frame) return ret;
+
+ *sd = *cm->frame_to_show;
+ ret = 0;
+ aom_clear_system_state();
+ return ret;
+}
+
+int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) {
+ AV1_COMMON *const cm = &pbi->common;
+
+ if (!cm->show_frame || !cm->frame_to_show) return -1;
+
+ *frame = *cm->frame_to_show;
+ return 0;
+}
+
+aom_codec_err_t av1_parse_superframe_index(const uint8_t *data, size_t data_sz,
+ uint32_t sizes[8], int *count,
+ aom_decrypt_cb decrypt_cb,
+ void *decrypt_state) {
+  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+  // it is a superframe index. If the last byte of real video compression
+  // data is 0xc0, the encoder must add a trailing 0 byte. If we have the
+  // marker but not the matching marker byte at the front of the index, we
+  // have an invalid bitstream and need to return an error.
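+  //
+  // As an illustrative sketch of this layout (made-up sizes, not from any
+  // real stream): a two-frame superframe whose first frame is 100 bytes,
+  // coded with mag = 1, ends with the 3-byte index { 0xc1, 0x63, 0xc1 }:
+  // marker 0b11000001 (frames - 1 = 1, mag - 1 = 0), the stored size
+  // 100 - 1 = 0x63, then the marker repeated.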
+
+ uint8_t marker;
+ size_t frame_sz_sum = 0;
+
+ assert(data_sz);
+ marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
+ *count = 0;
+
+ if ((marker & 0xe0) == 0xc0) {
+ const uint32_t frames = (marker & 0x7) + 1;
+ const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+ const size_t index_sz = 2 + mag * (frames - 1);
+
+ // This chunk is marked as having a superframe index but doesn't have
+ // enough data for it, thus it's an invalid superframe index.
+ if (data_sz < index_sz) return AOM_CODEC_CORRUPT_FRAME;
+
+ {
+ const uint8_t marker2 =
+ read_marker(decrypt_cb, decrypt_state, data + data_sz - index_sz);
+
+ // This chunk is marked as having a superframe index but doesn't have
+ // the matching marker byte at the front of the index therefore it's an
+ // invalid chunk.
+ if (marker != marker2) return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ {
+ // Found a valid superframe index.
+ uint32_t i, j;
+ const uint8_t *x = &data[data_sz - index_sz + 1];
+
+      // frames is at most 8 and mag at most 4, so (frames - 1) * mag <= 28.
+ uint8_t clear_buffer[28];
+ assert(sizeof(clear_buffer) >= (frames - 1) * mag);
+ if (decrypt_cb) {
+ decrypt_cb(decrypt_state, x, clear_buffer, (frames - 1) * mag);
+ x = clear_buffer;
+ }
+
+ for (i = 0; i < frames - 1; ++i) {
+ uint32_t this_sz = 0;
+
+ for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8);
+ this_sz += 1;
+ sizes[i] = this_sz;
+ frame_sz_sum += this_sz;
+ }
+ sizes[i] = (uint32_t)(data_sz - index_sz - frame_sz_sum);
+ *count = frames;
+ }
+ }
+ return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h
new file mode 100644
index 000000000..4a90b4ad5
--- /dev/null
+++ b/third_party/aom/av1/decoder/decoder.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_DECODER_DECODER_H_
+#define AV1_DECODER_DECODER_H_
+
+#include "./aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_scale/yv12config.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/thread_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/decoder/dthread.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+#if CONFIG_INSPECTION
+#include "av1/decoder/inspection.h"
+#endif
+
+#if CONFIG_PVQ
+#include "aom_dsp/entdec.h"
+#include "av1/decoder/decint.h"
+#include "av1/encoder/encodemb.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(hkuang): combine this with TileWorkerData.
+typedef struct TileData {
+ AV1_COMMON *cm;
+ aom_reader bit_reader;
+ DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  /* dqcoeff is shared by all planes, so planes must be decoded serially. */
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
+#if CONFIG_PVQ
+ /* forward transformed predicted image, a reference for PVQ */
+ DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+#endif
+#if CONFIG_CFL
+ CFL_CTX cfl;
+#endif
+#if CONFIG_EC_ADAPT
+ DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+#endif
+#if CONFIG_PALETTE
+ DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
+#endif // CONFIG_PALETTE
+} TileData;
+
+typedef struct TileWorkerData {
+ struct AV1Decoder *pbi;
+ aom_reader bit_reader;
+ FRAME_COUNTS counts;
+ DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  /* dqcoeff is shared by all planes, so planes must be decoded serially. */
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
+#if CONFIG_PVQ
+ /* forward transformed predicted image, a reference for PVQ */
+ DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+#endif
+#if CONFIG_CFL
+ CFL_CTX cfl;
+#endif
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT tctx;
+#endif
+#if CONFIG_PALETTE
+ DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
+#endif // CONFIG_PALETTE
+ struct aom_internal_error_info error_info;
+} TileWorkerData;
+
+typedef struct TileBufferDec {
+ const uint8_t *data;
+ size_t size;
+ const uint8_t *raw_data_end; // The end of the raw tile buffer in the
+ // bit stream.
+ int col; // only used with multi-threaded decoding
+} TileBufferDec;
+
+typedef struct AV1Decoder {
+ DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+
+ DECLARE_ALIGNED(16, AV1_COMMON, common);
+
+ int ready_for_new_data;
+
+ int refresh_frame_flags;
+
+ // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
+ // the same.
+ RefCntBuffer *cur_buf; // Current decoding frame buffer.
+
+ AVxWorker *frame_worker_owner; // frame_worker that owns this pbi.
+ AVxWorker lf_worker;
+ AVxWorker *tile_workers;
+ TileWorkerData *tile_worker_data;
+ TileInfo *tile_worker_info;
+ int num_tile_workers;
+
+ TileData *tile_data;
+ int allocated_tiles;
+
+ TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+ AV1LfSync lf_row_sync;
+
+ aom_decrypt_cb decrypt_cb;
+ void *decrypt_state;
+
+ int max_threads;
+ int inv_tile_order;
+ int need_resync; // wait for key/intra-only frame.
+ int hold_ref_buf; // hold the reference buffer.
+
+ int tile_size_bytes;
+#if CONFIG_EXT_TILE
+ int tile_col_size_bytes;
+ int dec_tile_row, dec_tile_col;
+#endif // CONFIG_EXT_TILE
+#if CONFIG_ACCOUNTING
+ int acct_enabled;
+ Accounting accounting;
+#endif
+ size_t uncomp_hdr_size; // Size of the uncompressed header
+ size_t first_partition_size; // Size of the compressed header
+#if CONFIG_TILE_GROUPS
+ int tg_size; // Number of tiles in the current tilegroup
+ int tg_start; // First tile in the current tilegroup
+ int tg_size_bit_offset;
+#endif
+#if CONFIG_REFERENCE_BUFFER
+ SequenceHeader seq_params;
+#endif
+#if CONFIG_INSPECTION
+ aom_inspect_cb inspect_cb;
+ void *inspect_ctx;
+#endif
+} AV1Decoder;
+
+int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size,
+ const uint8_t **dest);
+
+int av1_get_raw_frame(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *sd);
+
+int av1_get_frame_to_show(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame);
+
+aom_codec_err_t av1_copy_reference_dec(struct AV1Decoder *pbi,
+ AOM_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm,
+ AOM_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+static INLINE uint8_t read_marker(aom_decrypt_cb decrypt_cb,
+ void *decrypt_state, const uint8_t *data) {
+ if (decrypt_cb) {
+ uint8_t marker;
+ decrypt_cb(decrypt_state, data, &marker, 1);
+ return marker;
+ }
+ return *data;
+}
+
+// This function is exposed for use in tests, as is the inlined function
+// read_marker() above.
+aom_codec_err_t av1_parse_superframe_index(const uint8_t *data, size_t data_sz,
+ uint32_t sizes[8], int *count,
+ aom_decrypt_cb decrypt_cb,
+ void *decrypt_state);
+
+struct AV1Decoder *av1_decoder_create(BufferPool *const pool);
+
+void av1_decoder_remove(struct AV1Decoder *pbi);
+
+static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+ BufferPool *const pool) {
+ if (idx >= 0) {
+ --frame_bufs[idx].ref_count;
+    // A worker may only get a free framebuffer index when calling
+    // get_free_fb(), but the private buffer is not set up until the frame
+    // header has finished decoding. So if an error happens while decoding
+    // the header, the frame buffer will not have a valid priv buffer.
+ if (frame_bufs[idx].ref_count == 0 &&
+ frame_bufs[idx].raw_frame_buffer.priv) {
+ pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
+ }
+ }
+}
+
+#if CONFIG_EXT_REFS
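+// Returns 1 if frame_buf is currently mapped as one of the active inter
+// references of the frame being decoded, and 0 otherwise.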
+static INLINE int dec_is_ref_frame_buf(AV1Decoder *const pbi,
+ RefCntBuffer *frame_buf) {
+ AV1_COMMON *const cm = &pbi->common;
+ int i;
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefBuffer *const ref_frame = &cm->frame_refs[i];
+ if (ref_frame->idx == INVALID_IDX) continue;
+ if (frame_buf == &cm->buffer_pool->frame_bufs[ref_frame->idx]) break;
+ }
+ return (i < INTER_REFS_PER_FRAME);
+}
+#endif // CONFIG_EXT_REFS
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_DECODER_DECODER_H_
diff --git a/third_party/aom/av1/decoder/decodetxb.c b/third_party/aom/av1/decoder/decodetxb.c
new file mode 100644
index 000000000..e1db09775
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodetxb.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/scan.h"
+#include "av1/common/idct.h"
+#include "av1/common/txb_common.h"
+#include "av1/decoder/decodemv.h"
+#include "av1/decoder/decodetxb.h"
+#include "av1/decoder/dsubexp.h"
+
+#define ACCT_STR __func__
+
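+// Reads an unsigned Exp-Golomb-style code: a run of zero bits terminated by a
+// one gives the length, length - 1 suffix bits complete the magnitude, and the
+// result is returned minus one. For example, the bit strings {1}, {0,1,0} and
+// {0,1,1} decode to 0, 1 and 2 respectively.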
+static int read_golomb(MACROBLOCKD *xd, aom_reader *r) {
+ int x = 1;
+ int length = 0;
+ int i = 0;
+
+ while (!i) {
+ i = aom_read_bit(r, ACCT_STR);
+ ++length;
+ if (length >= 32) {
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid length in read_golomb");
+ break;
+ }
+ }
+
+ for (i = 0; i < length - 1; ++i) {
+ x <<= 1;
+ x += aom_read_bit(r, ACCT_STR);
+ }
+
+ return x - 1;
+}
+
+uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ aom_reader *r, int block, int plane,
+ tran_low_t *tcoeffs, TXB_CTX *txb_ctx,
+ int16_t *max_scan_line, int *eob) {
+ FRAME_COUNTS *counts = xd->counts;
+ TX_SIZE tx_size = get_tx_size(plane, xd);
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ aom_prob *nz_map = cm->fc->nz_map[tx_size][plane_type];
+ aom_prob *eob_flag = cm->fc->eob_flag[tx_size][plane_type];
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int seg_eob = tx_size_2d[tx_size];
+ int c = 0;
+ int update_eob = -1;
+ const int16_t *const dequant = xd->plane[plane].seg_dequant[mbmi->segment_id];
+ const int shift = av1_get_tx_scale(tx_size);
+ const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+ int cul_level = 0;
+ unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2];
+ uint8_t txb_mask[32 * 32] = { 0 };
+
+ nz_map_count = (counts) ? &counts->nz_map[tx_size][plane_type] : NULL;
+
+ memset(tcoeffs, 0, sizeof(*tcoeffs) * seg_eob);
+
+ int all_zero =
+ aom_read(r, cm->fc->txb_skip[tx_size][txb_ctx->txb_skip_ctx], ACCT_STR);
+ if (xd->counts)
+ ++xd->counts->txb_skip[tx_size][txb_ctx->txb_skip_ctx][all_zero];
+
+ *eob = 0;
+ if (all_zero) {
+ *max_scan_line = 0;
+ return 0;
+ }
+
+#if CONFIG_TXK_SEL
+ av1_read_tx_type(cm, xd, block, plane, r);
+#endif
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ const int16_t *scan = scan_order->scan;
+
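+  // Decoding proceeds in passes over the scan order: first the non-zero map
+  // and the EOB position, then the base levels (magnitudes up to
+  // NUM_BASE_LEVELS), then larger magnitudes via coeff_lps plus an Exp-Golomb
+  // tail, and finally in-place dequantization.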
+ for (c = 0; c < seg_eob; ++c) {
+ int is_nz;
+ int coeff_ctx = get_nz_map_ctx(tcoeffs, txb_mask, scan[c], bwl);
+ int eob_ctx = get_eob_ctx(tcoeffs, scan[c], bwl);
+
+ if (c < seg_eob - 1)
+ is_nz = aom_read(r, nz_map[coeff_ctx], tx_size);
+ else
+ is_nz = 1;
+
+ // set non-zero coefficient map.
+ tcoeffs[scan[c]] = is_nz;
+
+ if (c == seg_eob - 1) {
+ ++c;
+ break;
+ }
+
+ if (counts) ++(*nz_map_count)[coeff_ctx][is_nz];
+
+ if (is_nz) {
+ int is_eob = aom_read(r, eob_flag[eob_ctx], tx_size);
+ if (counts) ++counts->eob_flag[tx_size][plane_type][eob_ctx][is_eob];
+ if (is_eob) break;
+ }
+ txb_mask[scan[c]] = 1;
+ }
+
+ *eob = AOMMIN(seg_eob, c + 1);
+ *max_scan_line = *eob;
+
+ int i;
+ for (i = 0; i < NUM_BASE_LEVELS; ++i) {
+ aom_prob *coeff_base = cm->fc->coeff_base[tx_size][plane_type][i];
+
+ update_eob = 0;
+ for (c = *eob - 1; c >= 0; --c) {
+ tran_low_t *v = &tcoeffs[scan[c]];
+ int sign;
+ int ctx;
+
+ if (*v <= i) continue;
+
+ ctx = get_base_ctx(tcoeffs, scan[c], bwl, i + 1);
+
+ if (aom_read(r, coeff_base[ctx], tx_size)) {
+ *v = i + 1;
+ cul_level += i + 1;
+
+ if (counts) ++counts->coeff_base[tx_size][plane_type][i][ctx][1];
+
+ if (c == 0) {
+ int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ sign = aom_read(r, cm->fc->dc_sign[plane_type][dc_sign_ctx], tx_size);
+ if (counts) ++counts->dc_sign[plane_type][dc_sign_ctx][sign];
+ } else {
+ sign = aom_read_bit(r, ACCT_STR);
+ }
+ if (sign) *v = -(*v);
+ continue;
+ }
+ *v = i + 2;
+ if (counts) ++counts->coeff_base[tx_size][plane_type][i][ctx][0];
+
+      // Track the last position whose magnitude exceeds the base levels, so
+      // the larger-magnitude pass below knows where to start.
+ update_eob = AOMMAX(update_eob, c);
+ }
+ }
+
+ for (c = update_eob; c >= 0; --c) {
+ tran_low_t *v = &tcoeffs[scan[c]];
+ int sign;
+ int idx;
+ int ctx;
+
+ if (*v <= NUM_BASE_LEVELS) continue;
+
+ if (c == 0) {
+ int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ sign = aom_read(r, cm->fc->dc_sign[plane_type][dc_sign_ctx], tx_size);
+ if (counts) ++counts->dc_sign[plane_type][dc_sign_ctx][sign];
+ } else {
+ sign = aom_read_bit(r, ACCT_STR);
+ }
+
+ ctx = get_level_ctx(tcoeffs, scan[c], bwl);
+
+ if (cm->fc->coeff_lps[tx_size][plane_type][ctx] == 0) exit(0);
+
+ for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
+ if (aom_read(r, cm->fc->coeff_lps[tx_size][plane_type][ctx], tx_size)) {
+ *v = (idx + 1 + NUM_BASE_LEVELS);
+ if (sign) *v = -(*v);
+ cul_level += abs(*v);
+
+ if (counts) ++counts->coeff_lps[tx_size][plane_type][ctx][1];
+ break;
+ }
+ if (counts) ++counts->coeff_lps[tx_size][plane_type][ctx][0];
+ }
+ if (idx < COEFF_BASE_RANGE) continue;
+
+      // Decode the remainder with a zeroth-order Exp-Golomb code.
+ *v = read_golomb(xd, r) + COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS;
+ if (sign) *v = -(*v);
+ cul_level += abs(*v);
+ }
+
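+  // Dequantize in place: the DC coefficient uses dequant[0], all AC
+  // coefficients use dequant[1], and the product is shifted down by
+  // av1_get_tx_scale(tx_size) for the larger transform sizes.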
+ for (c = 0; c < *eob; ++c) {
+ int16_t dqv = (c == 0) ? dequant[0] : dequant[1];
+ tran_low_t *v = &tcoeffs[scan[c]];
+ int sign = (*v) < 0;
+ *v = (abs(*v) * dqv) >> shift;
+ if (sign) *v = -(*v);
+ }
+
+ cul_level = AOMMIN(63, cul_level);
+
+ // DC value
+ set_dc_sign(&cul_level, tcoeffs[0]);
+
+ return cul_level;
+}
+
+uint8_t av1_read_coeffs_txb_facade(AV1_COMMON *cm, MACROBLOCKD *xd,
+ aom_reader *r, int row, int col, int block,
+ int plane, tran_low_t *tcoeffs,
+ int16_t *max_scan_line, int *eob) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct macroblockd_plane *pd = &xd->plane[plane];
+
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_CB4X4
+#if CONFIG_CHROMA_2X2
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#else
+ const BLOCK_SIZE plane_bsize =
+ AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#endif // CONFIG_CHROMA_2X2
+#else // CONFIG_CB4X4
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
+#endif // CONFIG_CB4X4
+
+ TX_SIZE tx_size = get_tx_size(plane, xd);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + col,
+ pd->left_context + row, &txb_ctx);
+ uint8_t cul_level = av1_read_coeffs_txb(cm, xd, r, block, plane, tcoeffs,
+ &txb_ctx, max_scan_line, eob);
+#if CONFIG_ADAPT_SCAN
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ if (xd->counts && *eob > 0)
+ av1_update_scan_count_facade(cm, xd->counts, tx_size, tx_type, pd->dqcoeff,
+ *eob);
+#endif
+ av1_set_contexts(xd, pd, plane, tx_size, cul_level, col, row);
+ return cul_level;
+}
+
+static void read_txb_probs(FRAME_CONTEXT *fc, const TX_SIZE tx_size,
+ aom_reader *r) {
+ int plane, ctx, level;
+
+ if (aom_read_bit(r, ACCT_STR) == 0) return;
+
+ for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
+ av1_diff_update_prob(r, &fc->txb_skip[tx_size][ctx], ACCT_STR);
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane)
+ for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx)
+ av1_diff_update_prob(r, &fc->nz_map[tx_size][plane][ctx], ACCT_STR);
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane)
+ for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
+ av1_diff_update_prob(r, &fc->eob_flag[tx_size][plane][ctx], ACCT_STR);
+
+ for (level = 0; level < NUM_BASE_LEVELS; ++level)
+ for (plane = 0; plane < PLANE_TYPES; ++plane)
+ for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx)
+ av1_diff_update_prob(r, &fc->coeff_base[tx_size][plane][level][ctx],
+ ACCT_STR);
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane)
+ for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx)
+ av1_diff_update_prob(r, &fc->coeff_lps[tx_size][plane][ctx], ACCT_STR);
+}
+
+void av1_read_txb_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, aom_reader *r) {
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+ int ctx, plane;
+ for (plane = 0; plane < PLANE_TYPES; ++plane)
+ for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
+ av1_diff_update_prob(r, &fc->dc_sign[plane][ctx], ACCT_STR);
+
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ read_txb_probs(fc, tx_size, r);
+}
diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h
new file mode 100644
index 000000000..ee1bf6a3d
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodetxb.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef DECODETXB_H_
+#define DECODETXB_H_
+
+#include "./aom_config.h"
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/bitreader.h"
+
+uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ aom_reader *r, int block, int plane,
+ tran_low_t *tcoeffs, TXB_CTX *txb_ctx,
+ int16_t *max_scan_line, int *eob);
+
+uint8_t av1_read_coeffs_txb_facade(AV1_COMMON *cm, MACROBLOCKD *xd,
+ aom_reader *r, int row, int col, int block,
+ int plane, tran_low_t *tcoeffs,
+ int16_t *max_scan_line, int *eob);
+void av1_read_txb_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, aom_reader *r);
+#endif // DECODETXB_H_
diff --git a/third_party/aom/av1/decoder/detokenize.c b/third_party/aom/av1/decoder/detokenize.c
new file mode 100644
index 000000000..494f1681f
--- /dev/null
+++ b/third_party/aom/av1/decoder/detokenize.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#if !CONFIG_PVQ
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#endif // !CONFIG_PVQ
+
+#include "av1/common/blockd.h"
+
+#define ACCT_STR __func__
+
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/idct.h"
+#include "av1/decoder/detokenize.h"
+
+#define EOB_CONTEXT_NODE 0
+#define ZERO_CONTEXT_NODE 1
+#define ONE_CONTEXT_NODE 2
+#define LOW_VAL_CONTEXT_NODE 0
+#define TWO_CONTEXT_NODE 1
+#define THREE_CONTEXT_NODE 2
+#define HIGH_LOW_CONTEXT_NODE 3
+#define CAT_ONE_CONTEXT_NODE 4
+#define CAT_THREEFOUR_CONTEXT_NODE 5
+#define CAT_THREE_CONTEXT_NODE 6
+#define CAT_FIVE_CONTEXT_NODE 7
+
+#define INCREMENT_COUNT(token) \
+ do { \
+ if (counts) ++coef_counts[band][ctx][token]; \
+ } while (0)
+
+#if CONFIG_NEW_MULTISYMBOL
+#define READ_COEFF(prob_name, cdf_name, num, r) read_coeff(cdf_name, num, r);
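+// Reads an n-bit extra-bits value with the multisymbol coder: little-endian
+// chunks of up to 4 bits, each chunk coded as one symbol against its own CDF.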
+static INLINE int read_coeff(const aom_cdf_prob *const *cdf, int n,
+ aom_reader *r) {
+ int val = 0;
+ int i = 0;
+ int count = 0;
+ while (count < n) {
+ const int size = AOMMIN(n - count, 4);
+ val |= aom_read_cdf(r, cdf[i++], 1 << size, ACCT_STR) << count;
+ count += size;
+ }
+ return val;
+}
+#else
+#define READ_COEFF(prob_name, cdf_name, num, r) read_coeff(prob_name, num, r);
+static INLINE int read_coeff(const aom_prob *probs, int n, aom_reader *r) {
+ int i, val = 0;
+ for (i = 0; i < n; ++i) val = (val << 1) | aom_read(r, probs[i], ACCT_STR);
+ return val;
+}
+
+#endif
+
+static int token_to_value(aom_reader *const r, int token, TX_SIZE tx_size,
+ int bit_depth) {
+#if !CONFIG_HIGHBITDEPTH
+ assert(bit_depth == 8);
+#endif // !CONFIG_HIGHBITDEPTH
+
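+  // Tokens ZERO..FOUR carry their value directly; each CATEGORYn token
+  // selects a value range starting at CATn_MIN_VAL, with the offset inside
+  // the range coded by the extra bits read via READ_COEFF.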
+ switch (token) {
+ case ZERO_TOKEN:
+ case ONE_TOKEN:
+ case TWO_TOKEN:
+ case THREE_TOKEN:
+ case FOUR_TOKEN: return token;
+ case CATEGORY1_TOKEN:
+ return CAT1_MIN_VAL + READ_COEFF(av1_cat1_prob, av1_cat1_cdf, 1, r);
+ case CATEGORY2_TOKEN:
+ return CAT2_MIN_VAL + READ_COEFF(av1_cat2_prob, av1_cat2_cdf, 2, r);
+ case CATEGORY3_TOKEN:
+ return CAT3_MIN_VAL + READ_COEFF(av1_cat3_prob, av1_cat3_cdf, 3, r);
+ case CATEGORY4_TOKEN:
+ return CAT4_MIN_VAL + READ_COEFF(av1_cat4_prob, av1_cat4_cdf, 4, r);
+ case CATEGORY5_TOKEN:
+ return CAT5_MIN_VAL + READ_COEFF(av1_cat5_prob, av1_cat5_cdf, 5, r);
+ case CATEGORY6_TOKEN: {
+ const int skip_bits = (int)sizeof(av1_cat6_prob) -
+ av1_get_cat6_extrabits_size(tx_size, bit_depth);
+ return CAT6_MIN_VAL + READ_COEFF(av1_cat6_prob + skip_bits, av1_cat6_cdf,
+ 18 - skip_bits, r);
+ }
+ default:
+ assert(0); // Invalid token.
+ return -1;
+ }
+}
+
+static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff,
+ TX_SIZE tx_size, TX_TYPE tx_type, const int16_t *dq,
+#if CONFIG_NEW_QUANT
+ dequant_val_type_nuq *dq_val,
+#endif // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+ const qm_val_t *iqm[2][TX_SIZES],
+#endif // CONFIG_AOM_QM
+ int ctx, const int16_t *scan, const int16_t *nb,
+ int16_t *max_scan_line, aom_reader *r) {
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *const ec_ctx = xd->fc;
+#endif
+ const int max_eob = tx_size_2d[tx_size];
+ const int ref = is_inter_block(&xd->mi[0]->mbmi);
+#if CONFIG_AOM_QM
+ const qm_val_t *iqmatrix = iqm[!ref][tx_size];
+#endif // CONFIG_AOM_QM
+ int band, c = 0;
+ const int tx_size_ctx = txsize_sqr_map[tx_size];
+#if CONFIG_NEW_TOKENSET
+ aom_cdf_prob(*coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
+ ec_ctx->coef_head_cdfs[tx_size_ctx][type][ref];
+ aom_cdf_prob(*coef_tail_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
+ ec_ctx->coef_tail_cdfs[tx_size_ctx][type][ref];
+ int val = 0;
+
+#if !CONFIG_EC_ADAPT
+ unsigned int *blockz_count;
+ unsigned int(*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1] = NULL;
+ unsigned int(*eob_branch_count)[COEFF_CONTEXTS] = NULL;
+#endif
+#else
+ aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+ ec_ctx->coef_probs[tx_size_ctx][type][ref];
+ const aom_prob *prob;
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob(*coef_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
+ ec_ctx->coef_cdfs[tx_size_ctx][type][ref];
+ aom_cdf_prob(*cdf)[CDF_SIZE(ENTROPY_TOKENS)];
+#endif // CONFIG_EC_MULTISYMBOL
+ unsigned int(*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1] = NULL;
+ unsigned int(*eob_branch_count)[COEFF_CONTEXTS] = NULL;
+#endif // CONFIG_NEW_TOKENSET
+ uint8_t token_cache[MAX_TX_SQUARE];
+ const uint8_t *band_translate = get_band_translate(tx_size);
+ int dq_shift;
+ int v, token;
+ int16_t dqv = dq[0];
+#if CONFIG_NEW_QUANT
+ const tran_low_t *dqv_val = &dq_val[0][0];
+#endif // CONFIG_NEW_QUANT
+ (void)tx_type;
+#if CONFIG_AOM_QM
+ (void)iqmatrix;
+#endif // CONFIG_AOM_QM
+
+ if (counts) {
+#if !CONFIG_NEW_TOKENSET || !CONFIG_EC_ADAPT
+ coef_counts = counts->coef[tx_size_ctx][type][ref];
+ eob_branch_count = counts->eob_branch[tx_size_ctx][type][ref];
+#endif
+#if CONFIG_NEW_TOKENSET && !CONFIG_EC_ADAPT
+ blockz_count = counts->blockz_count[tx_size_ctx][type][ref][ctx];
+#endif
+ }
+
+ dq_shift = av1_get_tx_scale(tx_size);
+
+#if CONFIG_NEW_TOKENSET
+ band = *band_translate++;
+
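+  // Each coefficient is coded as a "head" symbol that folds the
+  // zero/one/larger decision together with an end-of-block bit (the LSB of
+  // comb_token), followed by a "tail" symbol plus extra bits for magnitudes
+  // above one.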
+ int more_data = 1;
+ while (more_data) {
+ int comb_token;
+ int last_pos = (c + 1 == max_eob);
+ int first_pos = (c == 0);
+
+#if CONFIG_NEW_QUANT
+ dqv_val = &dq_val[band][0];
+#endif // CONFIG_NEW_QUANT
+
+ comb_token = last_pos ? 2 * aom_read_bit(r, ACCT_STR) + 2
+ : aom_read_symbol(r, coef_head_cdfs[band][ctx],
+ HEAD_TOKENS + first_pos, ACCT_STR) +
+ !first_pos;
+ if (first_pos) {
+#if !CONFIG_EC_ADAPT
+ if (counts) ++blockz_count[comb_token != 0];
+#endif
+ if (comb_token == 0) return 0;
+ }
+ token = comb_token >> 1;
+
+ while (!token) {
+ *max_scan_line = AOMMAX(*max_scan_line, scan[c]);
+ token_cache[scan[c]] = 0;
+#if !CONFIG_EC_ADAPT
+ if (counts && !last_pos) {
+ ++coef_counts[band][ctx][ZERO_TOKEN];
+ }
+#endif
+ ++c;
+ dqv = dq[1];
+ ctx = get_coef_context(nb, token_cache, c);
+ band = *band_translate++;
+
+ last_pos = (c + 1 == max_eob);
+
+ comb_token = last_pos ? 2 * aom_read_bit(r, ACCT_STR) + 2
+ : aom_read_symbol(r, coef_head_cdfs[band][ctx],
+ HEAD_TOKENS, ACCT_STR) +
+ 1;
+ token = comb_token >> 1;
+ }
+
+ more_data = comb_token & 1;
+#if !CONFIG_EC_ADAPT
+ if (counts && !last_pos) {
+ ++coef_counts[band][ctx][token];
+ ++eob_branch_count[band][ctx];
+ if (!more_data) ++coef_counts[band][ctx][EOB_MODEL_TOKEN];
+ }
+#endif
+
+ if (token > ONE_TOKEN)
+ token +=
+ aom_read_symbol(r, coef_tail_cdfs[band][ctx], TAIL_TOKENS, ACCT_STR);
+#if CONFIG_NEW_QUANT
+ dqv_val = &dq_val[band][0];
+#endif // CONFIG_NEW_QUANT
+
+ *max_scan_line = AOMMAX(*max_scan_line, scan[c]);
+ token_cache[scan[c]] = av1_pt_energy_class[token];
+
+ val = token_to_value(r, token, tx_size,
+#if CONFIG_HIGHBITDEPTH
+ xd->bd);
+#else
+ 8);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_NEW_QUANT
+ v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val);
+ v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
+#else
+#if CONFIG_AOM_QM
+ dqv = ((iqmatrix[scan[c]] * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+#endif
+ v = (val * dqv) >> dq_shift;
+#endif
+
+ v = aom_read_bit(r, ACCT_STR) ? -v : v;
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_HIGHBITDEPTH
+ check_range(v, xd->bd);
+#else
+ check_range(v, 8);
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+
+ dqcoeff[scan[c]] = v;
+
+ ++c;
+ more_data &= (c < max_eob);
+ if (!more_data) break;
+ dqv = dq[1];
+ ctx = get_coef_context(nb, token_cache, c);
+ band = *band_translate++;
+
+#else // CONFIG_NEW_TOKENSET
+ while (c < max_eob) {
+ int val = -1;
+ band = *band_translate++;
+ prob = coef_probs[band][ctx];
+ if (counts) ++eob_branch_count[band][ctx];
+ if (!aom_read(r, prob[EOB_CONTEXT_NODE], ACCT_STR)) {
+ INCREMENT_COUNT(EOB_MODEL_TOKEN);
+ break;
+ }
+
+#if CONFIG_NEW_QUANT
+ dqv_val = &dq_val[band][0];
+#endif // CONFIG_NEW_QUANT
+
+ while (!aom_read(r, prob[ZERO_CONTEXT_NODE], ACCT_STR)) {
+ INCREMENT_COUNT(ZERO_TOKEN);
+ dqv = dq[1];
+ token_cache[scan[c]] = 0;
+ ++c;
+ if (c >= max_eob) return c; // zero tokens at the end (no eob token)
+ ctx = get_coef_context(nb, token_cache, c);
+ band = *band_translate++;
+ prob = coef_probs[band][ctx];
+#if CONFIG_NEW_QUANT
+ dqv_val = &dq_val[band][0];
+#endif // CONFIG_NEW_QUANT
+ }
+
+ *max_scan_line = AOMMAX(*max_scan_line, scan[c]);
+
+#if CONFIG_EC_MULTISYMBOL
+ cdf = &coef_cdfs[band][ctx];
+ token = ONE_TOKEN +
+ aom_read_symbol(r, *cdf, CATEGORY6_TOKEN - ONE_TOKEN + 1, ACCT_STR);
+ INCREMENT_COUNT(ONE_TOKEN + (token > ONE_TOKEN));
+ assert(token != ZERO_TOKEN);
+ val = token_to_value(r, token, tx_size,
+#if CONFIG_HIGHBITDEPTH
+ xd->bd);
+#else
+ 8);
+#endif // CONFIG_HIGHBITDEPTH
+#else // CONFIG_EC_MULTISYMBOL
+ if (!aom_read(r, prob[ONE_CONTEXT_NODE], ACCT_STR)) {
+ INCREMENT_COUNT(ONE_TOKEN);
+ token = ONE_TOKEN;
+ val = 1;
+ } else {
+ INCREMENT_COUNT(TWO_TOKEN);
+ token = aom_read_tree(r, av1_coef_con_tree,
+ av1_pareto8_full[prob[PIVOT_NODE] - 1], ACCT_STR);
+ assert(token != ZERO_TOKEN && token != ONE_TOKEN);
+ val = token_to_value(r, token, tx_size,
+#if CONFIG_HIGHBITDEPTH
+ xd->bd);
+#else
+ 8);
+#endif // CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_EC_MULTISYMBOL
+#if CONFIG_NEW_QUANT
+ v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val);
+ v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
+#else
+#if CONFIG_AOM_QM
+ dqv = ((iqmatrix[scan[c]] * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+#endif
+ v = (val * dqv) >> dq_shift;
+#endif // CONFIG_NEW_QUANT
+
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_HIGHBITDEPTH
+ dqcoeff[scan[c]] =
+ highbd_check_range((aom_read_bit(r, ACCT_STR) ? -v : v), xd->bd);
+#else
+ dqcoeff[scan[c]] = check_range(aom_read_bit(r, ACCT_STR) ? -v : v, 8);
+#endif // CONFIG_HIGHBITDEPTH
+#else
+ dqcoeff[scan[c]] = aom_read_bit(r, ACCT_STR) ? -v : v;
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ token_cache[scan[c]] = av1_pt_energy_class[token];
+ ++c;
+ ctx = get_coef_context(nb, token_cache, c);
+ dqv = dq[1];
+#endif // CONFIG_NEW_TOKENSET
+ }
+
+ return c;
+}
+#endif // !CONFIG_PVQ
+
+#if CONFIG_PALETTE
+void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
+ aom_reader *r) {
+ const MODE_INFO *const mi = xd->mi[0];
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ uint8_t color_order[PALETTE_MAX_SIZE];
+ const int n = mbmi->palette_mode_info.palette_size[plane];
+ int i, j;
+ uint8_t *const color_map = xd->plane[plane].color_index_map;
+ const aom_prob(
+ *const prob)[PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS - 1] =
+ plane ? av1_default_palette_uv_color_index_prob
+ : av1_default_palette_y_color_index_prob;
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(mbmi->sb_type, plane, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+ assert(plane == 0 || plane == 1);
+
+#if CONFIG_PALETTE_THROUGHPUT
+ // Run wavefront on the palette map index decoding.
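+  // Each iteration of the outer loop decodes one anti-diagonal (constant
+  // i = row + col), so the context neighbors of an index (above and to the
+  // left) always lie on earlier, already decoded diagonals.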
+ for (i = 1; i < rows + cols - 1; ++i) {
+ for (j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) {
+ const int color_ctx = av1_get_palette_color_index_context(
+ color_map, plane_block_width, (i - j), j, n, color_order, NULL);
+ const int color_idx =
+ aom_read_tree(r, av1_palette_color_index_tree[n - 2],
+ prob[n - 2][color_ctx], ACCT_STR);
+ assert(color_idx >= 0 && color_idx < n);
+ color_map[(i - j) * plane_block_width + j] = color_order[color_idx];
+ }
+ }
+ // Copy last column to extra columns.
+ if (cols < plane_block_width) {
+ for (i = 0; i < plane_block_height; ++i) {
+ memset(color_map + i * plane_block_width + cols,
+ color_map[i * plane_block_width + cols - 1],
+ (plane_block_width - cols));
+ }
+ }
+#else
+ for (i = 0; i < rows; ++i) {
+ for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+ const int color_ctx = av1_get_palette_color_index_context(
+ color_map, plane_block_width, i, j, n, color_order, NULL);
+ const int color_idx =
+ aom_read_tree(r, av1_palette_color_index_tree[n - PALETTE_MIN_SIZE],
+ prob[n - PALETTE_MIN_SIZE][color_ctx], ACCT_STR);
+ assert(color_idx >= 0 && color_idx < n);
+ color_map[i * plane_block_width + j] = color_order[color_idx];
+ }
+ memset(color_map + i * plane_block_width + cols,
+ color_map[i * plane_block_width + cols - 1],
+ (plane_block_width - cols)); // Copy last column to extra columns.
+ }
+#endif // CONFIG_PALETTE_THROUGHPUT
+ // Copy last row to extra rows.
+ for (i = rows; i < plane_block_height; ++i) {
+ memcpy(color_map + i * plane_block_width,
+ color_map + (rows - 1) * plane_block_width, plane_block_width);
+ }
+}
+#endif // CONFIG_PALETTE
+
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+int av1_decode_block_tokens(AV1_COMMON *cm, MACROBLOCKD *const xd, int plane,
+ const SCAN_ORDER *sc, int x, int y, TX_SIZE tx_size,
+ TX_TYPE tx_type, int16_t *max_scan_line,
+ aom_reader *r, int seg_id) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int16_t *const dequant = pd->seg_dequant[seg_id];
+ const int ctx =
+ get_entropy_context(tx_size, pd->above_context + x, pd->left_context + y);
+#if CONFIG_NEW_QUANT
+ const int ref = is_inter_block(&xd->mi[0]->mbmi);
+ int dq =
+ get_dq_profile_from_ctx(xd->qindex[seg_id], ctx, ref, pd->plane_type);
+#endif // CONFIG_NEW_QUANT
+
+ const int eob =
+ decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, tx_type, dequant,
+#if CONFIG_NEW_QUANT
+ pd->seg_dequant_nuq[seg_id][dq],
+#endif // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+ pd->seg_iqmatrix[seg_id],
+#endif // CONFIG_AOM_QM
+ ctx, sc->scan, sc->neighbors, max_scan_line, r);
+ av1_set_contexts(xd, pd, plane, tx_size, eob > 0, x, y);
+#if CONFIG_ADAPT_SCAN
+ if (xd->counts)
+ av1_update_scan_count_facade(cm, xd->counts, tx_size, tx_type, pd->dqcoeff,
+ eob);
+#else
+ (void)cm;
+#endif
+ return eob;
+}
+#endif // !CONFIG_PVQ
diff --git a/third_party/aom/av1/decoder/detokenize.h b/third_party/aom/av1/decoder/detokenize.h
new file mode 100644
index 000000000..ba4066603
--- /dev/null
+++ b/third_party/aom/av1/decoder/detokenize.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_DECODER_DETOKENIZE_H_
+#define AV1_DECODER_DETOKENIZE_H_
+
+#include "./aom_config.h"
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+#include "av1/decoder/decoder.h"
+#include "av1/common/scan.h"
+#endif // !CONFIG_PVQ
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_PALETTE
+void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r);
+#endif // CONFIG_PALETTE
+
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+int av1_decode_block_tokens(AV1_COMMON *cm, MACROBLOCKD *const xd, int plane,
+ const SCAN_ORDER *sc, int x, int y, TX_SIZE tx_size,
+ TX_TYPE tx_type, int16_t *max_scan_line,
+ aom_reader *r, int seg_id);
+#endif // !CONFIG_PVQ
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AV1_DECODER_DETOKENIZE_H_
diff --git a/third_party/aom/av1/decoder/dsubexp.c b/third_party/aom/av1/decoder/dsubexp.c
new file mode 100644
index 000000000..5171f1144
--- /dev/null
+++ b/third_party/aom/av1/decoder/dsubexp.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/entropy.h"
+
+#include "av1/decoder/dsubexp.h"
+
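+// Inverse of the recentering map used for probability remapping: codes
+// v <= 2 * m alternate around the prediction m (1 -> m - 1, 2 -> m + 1,
+// 3 -> m - 2, ...), while larger codes map to themselves.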
+static int inv_recenter_nonneg(int v, int m) {
+ if (v > 2 * m) return v;
+
+ return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
+}
+
+#define decode_uniform(r, ACCT_STR_NAME) \
+ decode_uniform_(r ACCT_STR_ARG(ACCT_STR_NAME))
+#define decode_term_subexp(r, ACCT_STR_NAME) \
+ decode_term_subexp_(r ACCT_STR_ARG(ACCT_STR_NAME))
+
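+// Reads a quasi-uniform code for values in [0, 190): the 66 smallest values
+// use 7 bits, while the rest use 8 (one extra bit disambiguates the two
+// values that share each 7-bit prefix).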
+static int decode_uniform_(aom_reader *r ACCT_STR_PARAM) {
+ const int l = 8;
+ const int m = (1 << l) - 190;
+ const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME);
+ return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
+}
+
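+// Maps a decoded subexponential index back to a probability delta. The first
+// 20 entries of the table are the evenly spaced values 7 + 13 * k, so these
+// coarse deltas get the shortest codes; the remaining deltas follow in
+// ascending order.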
+static int inv_remap_prob(int v, int m) {
+ /* clang-format off */
+ static uint8_t inv_map_table[MAX_PROB - 1] = {
+ 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, 189,
+ 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60,
+ 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76,
+ 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92,
+ 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
+ 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125,
+ 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141,
+ 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157,
+ 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
+ 174, 175, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190,
+ 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
+ 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222,
+ 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
+ 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253
+ }; /* clang-format on */
+ assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0])));
+ v = inv_map_table[v];
+ m--;
+ if ((m << 1) <= MAX_PROB) {
+ return 1 + inv_recenter_nonneg(v, m);
+ } else {
+ return MAX_PROB - inv_recenter_nonneg(v, MAX_PROB - 1 - m);
+ }
+}
+
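+// Decodes the terminated subexponential code used for probability updates:
+// successive flag bits select the ranges [0, 16), [16, 32) and [32, 64),
+// with a quasi-uniform tail (decode_uniform) starting at 64.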
+static int decode_term_subexp_(aom_reader *r ACCT_STR_PARAM) {
+ if (!aom_read_bit(r, ACCT_STR_NAME))
+ return aom_read_literal(r, 4, ACCT_STR_NAME);
+ if (!aom_read_bit(r, ACCT_STR_NAME))
+ return aom_read_literal(r, 4, ACCT_STR_NAME) + 16;
+ if (!aom_read_bit(r, ACCT_STR_NAME))
+ return aom_read_literal(r, 5, ACCT_STR_NAME) + 32;
+ return decode_uniform(r, ACCT_STR_NAME) + 64;
+}
+
+void av1_diff_update_prob_(aom_reader *r, aom_prob *p ACCT_STR_PARAM) {
+ if (aom_read(r, DIFF_UPDATE_PROB, ACCT_STR_NAME)) {
+ const int delp = decode_term_subexp(r, ACCT_STR_NAME);
+ *p = (aom_prob)inv_remap_prob(delp, *p);
+ }
+}
diff --git a/third_party/aom/av1/decoder/dsubexp.h b/third_party/aom/av1/decoder/dsubexp.h
new file mode 100644
index 000000000..4bc38578c
--- /dev/null
+++ b/third_party/aom/av1/decoder/dsubexp.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_DECODER_DSUBEXP_H_
+#define AV1_DECODER_DSUBEXP_H_
+
+#include "aom_dsp/bitreader.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_ACCOUNTING
+#define av1_diff_update_prob(r, p, str) av1_diff_update_prob_(r, p, str)
+#else
+#define av1_diff_update_prob(r, p, str) av1_diff_update_prob_(r, p)
+#endif
+
+void av1_diff_update_prob_(aom_reader *r, aom_prob *p ACCT_STR_PARAM);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AV1_DECODER_DSUBEXP_H_
diff --git a/third_party/aom/av1/decoder/dthread.c b/third_party/aom/av1/decoder/dthread.c
new file mode 100644
index 000000000..50f8ed192
--- /dev/null
+++ b/third_party/aom/av1/decoder/dthread.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/reconinter.h"
+#include "av1/decoder/dthread.h"
+#include "av1/decoder/decoder.h"
+
+// #define DEBUG_THREAD
+
+// TODO(hkuang): Clean up all the #ifdef in this file.
+void av1_frameworker_lock_stats(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ FrameWorkerData *const worker_data = worker->data1;
+ pthread_mutex_lock(&worker_data->stats_mutex);
+#else
+ (void)worker;
+#endif
+}
+
+void av1_frameworker_unlock_stats(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ FrameWorkerData *const worker_data = worker->data1;
+ pthread_mutex_unlock(&worker_data->stats_mutex);
+#else
+ (void)worker;
+#endif
+}
+
+void av1_frameworker_signal_stats(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ FrameWorkerData *const worker_data = worker->data1;
+
+// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+ pthread_cond_signal(&worker_data->stats_cond);
+#else
+ pthread_cond_broadcast(&worker_data->stats_cond);
+#endif
+
+#else
+ (void)worker;
+#endif
+}
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define BUILDING_WITH_TSAN
+#endif
+#endif
+
+// TODO(hkuang): Remove worker parameter as it is only used in debug code.
+void av1_frameworker_wait(AVxWorker *const worker, RefCntBuffer *const ref_buf,
+ int row) {
+#if CONFIG_MULTITHREAD
+ if (!ref_buf) return;
+
+#ifndef BUILDING_WITH_TSAN
+  // The following line of code will trigger a harmless tsan error, but it is
+  // key to getting the best performance.
+#endif
+
+ {
+ // Find the worker thread that owns the reference frame. If the reference
+    // frame has been fully decoded, it may not have an owner.
+ AVxWorker *const ref_worker = ref_buf->frame_worker_owner;
+ FrameWorkerData *const ref_worker_data =
+ (FrameWorkerData *)ref_worker->data1;
+ const AV1Decoder *const pbi = ref_worker_data->pbi;
+
+#ifdef DEBUG_THREAD
+ {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n",
+ worker_data->worker_id, worker, ref_worker_data->worker_id,
+ ref_buf->frame_worker_owner, row, ref_buf->row);
+ }
+#endif
+
+ av1_frameworker_lock_stats(ref_worker);
+ while (ref_buf->row < row && pbi->cur_buf == ref_buf &&
+ ref_buf->buf.corrupted != 1) {
+ pthread_cond_wait(&ref_worker_data->stats_cond,
+ &ref_worker_data->stats_mutex);
+ }
+
+ if (ref_buf->buf.corrupted == 1) {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ av1_frameworker_unlock_stats(ref_worker);
+ aom_internal_error(&worker_data->pbi->common.error,
+ AOM_CODEC_CORRUPT_FRAME,
+ "Worker %p failed to decode frame", worker);
+ }
+ av1_frameworker_unlock_stats(ref_worker);
+ }
+#else
+ (void)worker;
+ (void)ref_buf;
+ (void)row;
+#endif // CONFIG_MULTITHREAD
+}
+
+void av1_frameworker_broadcast(RefCntBuffer *const buf, int row) {
+#if CONFIG_MULTITHREAD
+ AVxWorker *worker = buf->frame_worker_owner;
+
+#ifdef DEBUG_THREAD
+ {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id,
+ buf->frame_worker_owner, row);
+ }
+#endif
+
+ av1_frameworker_lock_stats(worker);
+ buf->row = row;
+ av1_frameworker_signal_stats(worker);
+ av1_frameworker_unlock_stats(worker);
+#else
+ (void)buf;
+ (void)row;
+#endif // CONFIG_MULTITHREAD
+}
+
+void av1_frameworker_copy_context(AVxWorker *const dst_worker,
+ AVxWorker *const src_worker) {
+#if CONFIG_MULTITHREAD
+ FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
+ FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
+ AV1_COMMON *const src_cm = &src_worker_data->pbi->common;
+ AV1_COMMON *const dst_cm = &dst_worker_data->pbi->common;
+ int i;
+
+ // Wait until source frame's context is ready.
+ av1_frameworker_lock_stats(src_worker);
+ while (!src_worker_data->frame_context_ready) {
+ pthread_cond_wait(&src_worker_data->stats_cond,
+ &src_worker_data->stats_mutex);
+ }
+
+ dst_cm->last_frame_seg_map = src_cm->seg.enabled
+ ? src_cm->current_frame_seg_map
+ : src_cm->last_frame_seg_map;
+ dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
+ av1_frameworker_unlock_stats(src_worker);
+
+ dst_cm->bit_depth = src_cm->bit_depth;
+#if CONFIG_HIGHBITDEPTH
+ dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
+#endif
+#if CONFIG_EXT_REFS
+// TODO(zoeliu): To handle parallel decoding
+#endif // CONFIG_EXT_REFS
+ dst_cm->prev_frame =
+ src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame;
+ dst_cm->last_width =
+ !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width;
+ dst_cm->last_height =
+ !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height;
+ dst_cm->subsampling_x = src_cm->subsampling_x;
+ dst_cm->subsampling_y = src_cm->subsampling_y;
+ dst_cm->frame_type = src_cm->frame_type;
+ dst_cm->last_show_frame = !src_cm->show_existing_frame
+ ? src_cm->show_frame
+ : src_cm->last_show_frame;
+ for (i = 0; i < REF_FRAMES; ++i)
+ dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
+
+ memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
+ (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
+ dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
+ dst_cm->lf.filter_level = src_cm->lf.filter_level;
+ memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, TOTAL_REFS_PER_FRAME);
+ memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+ dst_cm->seg = src_cm->seg;
+ memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
+ FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
+#else
+ (void)dst_worker;
+ (void)src_worker;
+#endif // CONFIG_MULTITHREAD
+}
diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h
new file mode 100644
index 000000000..c17053d9c
--- /dev/null
+++ b/third_party/aom/av1/decoder/dthread.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_DECODER_DTHREAD_H_
+#define AV1_DECODER_DTHREAD_H_
+
+#include "./aom_config.h"
+#include "aom_util/aom_thread.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct AV1Decoder;
+
+// WorkerData for the FrameWorker thread. It contains all the information of
+// the worker and decode structures for decoding a frame.
+typedef struct FrameWorkerData {
+ struct AV1Decoder *pbi;
+ const uint8_t *data;
+ const uint8_t *data_end;
+ size_t data_size;
+ void *user_priv;
+ int result;
+ int worker_id;
+ int received_frame;
+
+ // scratch_buffer is used in frame parallel mode only.
+ // It is used to make a copy of the compressed data.
+ uint8_t *scratch_buffer;
+ size_t scratch_buffer_size;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t stats_mutex;
+ pthread_cond_t stats_cond;
+#endif
+
+ int frame_context_ready; // Current frame's context is ready to read.
+ int frame_decoded; // Finished decoding current frame.
+} FrameWorkerData;
+
+void av1_frameworker_lock_stats(AVxWorker *const worker);
+void av1_frameworker_unlock_stats(AVxWorker *const worker);
+void av1_frameworker_signal_stats(AVxWorker *const worker);
+
+// Wait until ref_buf has been decoded to row, in units of real pixels.
+// Note: the worker may have already finished decoding ref_buf and released it
+// in order to start decoding the next frame, so we need to check whether the
+// worker is still decoding ref_buf.
+void av1_frameworker_wait(AVxWorker *const worker, RefCntBuffer *const ref_buf,
+ int row);
+
+// FrameWorker broadcasts its decoding progress so other workers that are
+// waiting on it can resume decoding.
+void av1_frameworker_broadcast(RefCntBuffer *const buf, int row);
+
+// Copy necessary decoding context from src worker to dst worker.
+void av1_frameworker_copy_context(AVxWorker *const dst_worker,
+ AVxWorker *const src_worker);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_DECODER_DTHREAD_H_
diff --git a/third_party/aom/av1/decoder/generic_decoder.c b/third_party/aom/av1/decoder/generic_decoder.c
new file mode 100644
index 000000000..0c7d71b9f
--- /dev/null
+++ b/third_party/aom/av1/decoder/generic_decoder.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+
+#include "aom_dsp/bitreader.h"
+#include "av1/common/generic_code.h"
+#include "av1/common/odintrin.h"
+#include "pvq_decoder.h"
+
+/** Decodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts
+ * the cdf accordingly.
+ *
+ * @param [in,out] r multi-symbol entropy decoder
+ * @param [in,out] cdf CDF of the variable (Q15)
+ * @param [in] n number of values possible
+ * @param [in,out] count number of symbols encoded with that cdf so far
+ * @param [in] rate adaptation rate shift (smaller is faster)
+ * @return decoded variable
+ */
+int aom_decode_cdf_adapt_q15_(aom_reader *r, uint16_t *cdf, int n,
+ int *count, int rate ACCT_STR_PARAM) {
+ int val;
+ int i;
+ if (*count == 0) {
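+    /* On first use, renormalize the initial cdf so that it sums to 32768
+       (Q15) before handing it to the entropy decoder. */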
+ int ft;
+ ft = cdf[n - 1];
+ for (i = 0; i < n; i++) {
+ cdf[i] = AOM_ICDF(cdf[i]*32768/ft);
+ }
+ }
+ val = aom_read_cdf(r, cdf, n, ACCT_STR_NAME);
+ aom_cdf_adapt_q15(val, cdf, n, count, rate);
+ return val;
+}
+
+/** Decodes a random variable using a "generic" model, assuming that the
+ * distribution is one-sided (zero and up), has a single mode, and decays
+ * exponentially past the mode.
+ *
+ * @param [in,out] r multi-symbol entropy decoder
+ * @param [in,out] model generic probability model
+ * @param [in,out] ex_q16 expectation of x (adapted)
+ * @param [in] integration integration period of ex_q16 (leaky average over
+ * 1<<integration samples)
+ *
+ * @retval decoded variable x
+ */
+int generic_decode_(aom_reader *r, generic_encoder *model,
+ int *ex_q16, int integration ACCT_STR_PARAM) {
+ int lg_q1;
+ int shift;
+ int id;
+ uint16_t *cdf;
+ int xs;
+ int lsb;
+ int x;
+ lsb = 0;
+ lg_q1 = log_ex(*ex_q16);
+ /* If expectation is too large, shift x to ensure that
+ all we have past xs=15 is the exponentially decaying tail
+ of the distribution. */
+ shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
+ /* Choose the cdf to use: we have two per "octave" of ExQ16. */
+ id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
+ cdf = model->cdf[id];
+ xs = aom_read_symbol_pvq(r, cdf, 16, ACCT_STR_NAME);
+ if (xs == 15) {
+ int e;
+ unsigned decay;
+ /* Estimate decay based on the assumption that the distribution is close
+ to Laplacian for large values. We should probably have an adaptive
+ estimate instead. Note: The 2* is a kludge that's not fully understood
+ yet. */
+ OD_ASSERT(*ex_q16 < INT_MAX >> 1);
+    e = ((2 * *ex_q16 >> 8) + (1 << shift >> 1)) >> shift;
+ decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256)));
+ xs += aom_laplace_decode_special(r, decay, ACCT_STR_NAME);
+ }
+ if (shift != 0) {
+ int special;
+ /* Because of the rounding, there's only half the number of possibilities
+ for xs=0 */
+ special = xs == 0;
+ if (shift - special > 0) {
+ lsb = aom_read_literal(r, shift - special, ACCT_STR_NAME);
+ }
+ lsb -= !special << (shift - 1);
+ }
+ x = (xs << shift) + lsb;
+ generic_model_update(ex_q16, x, integration);
+ OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG,
+ "dec: %d %d %d %d %d %x", *ex_q16, x, shift, id, xs, dec->rng));
+ return x;
+}
diff --git a/third_party/aom/av1/decoder/inspection.c b/third_party/aom/av1/decoder/inspection.c
new file mode 100644
index 000000000..2e8a61087
--- /dev/null
+++ b/third_party/aom/av1/decoder/inspection.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/inspection.h"
+#include "av1/common/enums.h"
+#if CONFIG_CDEF
+#include "av1/common/cdef.h"
+#endif
+
+void ifd_init(insp_frame_data *fd, int frame_width, int frame_height) {
+ fd->mi_cols = ALIGN_POWER_OF_TWO(frame_width, 3) >> MI_SIZE_LOG2;
+ fd->mi_rows = ALIGN_POWER_OF_TWO(frame_height, 3) >> MI_SIZE_LOG2;
+ fd->mi_grid = (insp_mi_data *)aom_malloc(sizeof(insp_mi_data) * fd->mi_rows *
+ fd->mi_cols);
+}
+
+void ifd_clear(insp_frame_data *fd) {
+ aom_free(fd->mi_grid);
+ fd->mi_grid = NULL;
+}
+
+/* TODO(negge) This function may be called by more than one thread when using
+ a multi-threaded decoder and this may cause a data race. */
+int ifd_inspect(insp_frame_data *fd, void *decoder) {
+ struct AV1Decoder *pbi = (struct AV1Decoder *)decoder;
+ AV1_COMMON *const cm = &pbi->common;
+ // TODO(negge): Should this function just call ifd_clear() and ifd_init()?
+ if (fd->mi_rows != cm->mi_rows || fd->mi_cols != cm->mi_cols) {
+ return 0;
+ }
+ fd->show_frame = cm->show_frame;
+ fd->frame_type = cm->frame_type;
+ fd->base_qindex = cm->base_qindex;
+ fd->tile_mi_cols = cm->tile_width;
+ fd->tile_mi_rows = cm->tile_height;
+#if CONFIG_ACCOUNTING
+ fd->accounting = &pbi->accounting;
+#endif
+#if CONFIG_CDEF
+// TODO(negge): copy per frame CDEF data
+#endif
+ int i, j;
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ for (j = 0; j < 2; j++) {
+ fd->y_dequant[i][j] = cm->y_dequant[i][j];
+ fd->uv_dequant[i][j] = cm->uv_dequant[i][j];
+ }
+ }
+ for (j = 0; j < cm->mi_rows; j++) {
+ for (i = 0; i < cm->mi_cols; i++) {
+ const MB_MODE_INFO *mbmi =
+ &cm->mi_grid_visible[j * cm->mi_stride + i]->mbmi;
+ insp_mi_data *mi = &fd->mi_grid[j * cm->mi_cols + i];
+ // Segment
+ mi->segment_id = mbmi->segment_id;
+ // Motion Vectors
+ mi->mv[0].row = mbmi->mv[0].as_mv.row;
+ mi->mv[0].col = mbmi->mv[0].as_mv.col;
+ mi->mv[1].row = mbmi->mv[1].as_mv.row;
+ mi->mv[1].col = mbmi->mv[1].as_mv.col;
+ // Reference Frames
+ mi->ref_frame[0] = mbmi->ref_frame[0];
+ mi->ref_frame[1] = mbmi->ref_frame[1];
+ // Prediction Mode
+ mi->mode = mbmi->mode;
+ // Prediction Mode for Chromatic planes
+ if (mi->mode < INTRA_MODES) {
+ mi->uv_mode = mbmi->uv_mode;
+ } else {
+ mi->uv_mode = INTRA_INVALID;
+ }
+ // Block Size
+ mi->sb_type = mbmi->sb_type;
+ // Skip Flag
+ mi->skip = mbmi->skip;
+#if CONFIG_DUAL_FILTER
+ mi->filter[0] = mbmi->interp_filter[0];
+ mi->filter[1] = mbmi->interp_filter[1];
+#else
+ mi->filter = mbmi->interp_filter;
+#endif
+ // Transform
+ mi->tx_type = mbmi->tx_type;
+ mi->tx_size = mbmi->tx_size;
+
+#if CONFIG_CDEF
+ mi->cdef_level = cm->cdef_strengths[mbmi->cdef_strength] / CLPF_STRENGTHS;
+ mi->cdef_strength =
+ cm->cdef_strengths[mbmi->cdef_strength] % CLPF_STRENGTHS;
+ mi->cdef_strength += mi->cdef_strength == 3;
+#endif
+ }
+ }
+ return 1;
+}
diff --git a/third_party/aom/av1/decoder/inspection.h b/third_party/aom/av1/decoder/inspection.h
new file mode 100644
index 000000000..d6cf4319a
--- /dev/null
+++ b/third_party/aom/av1/decoder/inspection.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_INSPECTION_H_
+#define AOM_INSPECTION_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+
+typedef void (*aom_inspect_cb)(void *decoder, void *data);
+
+typedef struct insp_mv insp_mv;
+
+struct insp_mv {
+ int16_t row;
+ int16_t col;
+};
+
+typedef struct insp_mi_data insp_mi_data;
+
+struct insp_mi_data {
+ insp_mv mv[2];
+ int8_t ref_frame[2];
+ int8_t mode;
+ int8_t uv_mode;
+ int8_t sb_type;
+ int8_t skip;
+ int8_t segment_id;
+#if CONFIG_DUAL_FILTER
+ int8_t filter[2];
+#else
+ int8_t filter;
+#endif
+ int8_t tx_type;
+ int8_t tx_size;
+#if CONFIG_CDEF
+ int8_t cdef_level;
+ int8_t cdef_strength;
+#endif
+};
+
+typedef struct insp_frame_data insp_frame_data;
+
+struct insp_frame_data {
+#if CONFIG_ACCOUNTING
+ Accounting *accounting;
+#endif
+ insp_mi_data *mi_grid;
+ int show_frame;
+ int frame_type;
+ int base_qindex;
+ int mi_rows;
+ int mi_cols;
+ int tile_mi_rows;
+ int tile_mi_cols;
+ int16_t y_dequant[MAX_SEGMENTS][2];
+ int16_t uv_dequant[MAX_SEGMENTS][2];
+#if CONFIG_CDEF
+// TODO(negge): add per frame CDEF data
+#endif
+};
+
+void ifd_init(insp_frame_data *fd, int frame_width, int frame_height);
+void ifd_clear(insp_frame_data *fd);
+int ifd_inspect(insp_frame_data *fd, void *decoder);
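+
+/* Typical usage (a sketch; how the AV1Decoder pointer is obtained by the
+   application is outside the scope of this header):
+
+     insp_frame_data fd;
+     ifd_init(&fd, frame_width, frame_height);
+     // ... decode a frame ...
+     if (ifd_inspect(&fd, decoder)) {
+       // read fd.mi_grid[row * fd.mi_cols + col], fd.base_qindex, ...
+     }
+     ifd_clear(&fd);
+*/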
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_INSPECTION_H_
diff --git a/third_party/aom/av1/decoder/laplace_decoder.c b/third_party/aom/av1/decoder/laplace_decoder.c
new file mode 100644
index 000000000..b6cf50bc7
--- /dev/null
+++ b/third_party/aom/av1/decoder/laplace_decoder.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+
+#include "aom_dsp/bitreader.h"
+#include "av1/common/pvq.h"
+#include "pvq_decoder.h"
+
+#define aom_decode_pvq_split(r, adapt, sum, ctx, ACCT_STR_NAME) \
+ aom_decode_pvq_split_(r, adapt, sum, ctx ACCT_STR_ARG(ACCT_STR_NAME))
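+
+/* The wrapper macro forwards an accounting label to the trailing-underscore
+   implementation; ACCT_STR_PARAM/ACCT_STR_ARG are presumably empty when
+   bitstream accounting is compiled out, so the label then costs nothing. */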
+
+static int aom_decode_pvq_split_(aom_reader *r, od_pvq_codeword_ctx *adapt,
+ int sum, int ctx ACCT_STR_PARAM) {
+ int shift;
+ int count;
+ int msbs;
+ int fctx;
+ count = 0;
+ if (sum == 0) return 0;
+ shift = OD_MAXI(0, OD_ILOG(sum) - 3);
+ fctx = 7*ctx + (sum >> shift) - 1;
+ msbs = aom_read_symbol_pvq(r, adapt->pvq_split_cdf[fctx], (sum >> shift) + 1,
+ ACCT_STR_NAME);
+ if (shift) count = aom_read_literal(r, shift, ACCT_STR_NAME);
+ count += msbs << shift;
+ if (count > sum) {
+ count = sum;
+#if CONFIG_DAALA_EC
+ r->ec.error = 1;
+#else
+# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ }
+ return count;
+}
+
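+/** Decodes the pulse vector y for one band by recursively splitting the
+ *  pulse count k between the two halves of the band; the n == 1, k == 0
+ *  and (k == 1, n <= 16) cases terminate the recursion. */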
+void aom_decode_band_pvq_splits(aom_reader *r, od_pvq_codeword_ctx *adapt,
+ od_coeff *y, int n, int k, int level) {
+ int mid;
+ int count_right;
+ if (n == 1) {
+ y[0] = k;
+ }
+ else if (k == 0) {
+ OD_CLEAR(y, n);
+ }
+ else if (k == 1 && n <= 16) {
+ int cdf_id;
+ int pos;
+ cdf_id = od_pvq_k1_ctx(n, level == 0);
+ OD_CLEAR(y, n);
+ pos = aom_read_symbol_pvq(r, adapt->pvq_k1_cdf[cdf_id], n, "pvq:k1");
+ y[pos] = 1;
+ }
+ else {
+ mid = n >> 1;
+ count_right = aom_decode_pvq_split(r, adapt, k, od_pvq_size_ctx(n),
+ "pvq:split");
+ aom_decode_band_pvq_splits(r, adapt, y, mid, k - count_right, level + 1);
+ aom_decode_band_pvq_splits(r, adapt, y + mid, n - mid, count_right,
+ level + 1);
+ }
+}
+
+/** Decodes the tail of a Laplace-distributed variable, i.e. it doesn't
+ * do anything special for the zero case.
+ *
+ * @param [in] r range decoder
+ * @param [in] decay decay factor of the distribution, i.e. pdf ~= decay^x
+ *
+ * @retval decoded variable x
+ */
+int aom_laplace_decode_special_(aom_reader *r, unsigned decay ACCT_STR_PARAM) {
+ int pos;
+ int shift;
+ int xs;
+ int sym;
+ const uint16_t *cdf;
+ shift = 0;
+ /* We don't want a large decay value because that would require too many
+ symbols. */
+ while (decay > 235) {
+ decay = (decay*decay + 128) >> 8;
+ shift++;
+ }
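+ /* E.g. decay = 240 renormalizes once: (240*240 + 128) >> 8 = 225 with
+    shift = 1, so each decoded symbol step then covers two values. */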
+ decay = OD_MINI(decay, 254);
+ decay = OD_MAXI(decay, 2);
+ cdf = EXP_CDF_TABLE[(decay + 1) >> 1];
+ OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d\n", decay));
+ xs = 0;
+ do {
+ sym = OD_MINI(xs, 15);
+ {
+ int i;
+ OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d", xs, shift, sym));
+ for (i = 0; i < 16; i++) {
+ OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i]));
+ }
+ OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "\n"));
+ }
+ sym = aom_read_cdf(r, cdf, 16, ACCT_STR_NAME);
+ xs += sym;
+ } while (sym >= 15);
+ if (shift) pos = (xs << shift) + aom_read_literal(r, shift, ACCT_STR_NAME);
+ else pos = xs;
+ return pos;
+}
diff --git a/third_party/aom/av1/decoder/pvq_decoder.c b/third_party/aom/av1/decoder/pvq_decoder.c
new file mode 100644
index 000000000..d9a8e8056
--- /dev/null
+++ b/third_party/aom/av1/decoder/pvq_decoder.c
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "./aom_config.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/entcode.h"
+#include "aom_dsp/entdec.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/partition.h"
+#include "av1/common/pvq_state.h"
+#include "av1/decoder/decint.h"
+#include "av1/decoder/pvq_decoder.h"
+#include "aom_ports/system_state.h"
+
+int aom_read_symbol_pvq_(aom_reader *r, aom_cdf_prob *cdf, int nsymbs
+ ACCT_STR_PARAM) {
+ if (cdf[0] == 0)
+ aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs));
+ return aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME);
+}
+
+static void aom_decode_pvq_codeword(aom_reader *r, od_pvq_codeword_ctx *ctx,
+ od_coeff *y, int n, int k) {
+ int i;
+ aom_decode_band_pvq_splits(r, ctx, y, n, k, 0);
+ for (i = 0; i < n; i++) {
+ if (y[i] && aom_read_bit(r, "pvq:sign")) y[i] = -y[i];
+ }
+}
+
+/** Inverse of neg_interleave; decodes the interleaved gain.
+ *
+ * @param [in] x quantized/interleaved gain to decode
+ * @param [in] ref quantized gain of the reference
+ * @return original quantized gain value
+ */
+static int neg_deinterleave(int x, int ref) {
+ if (x < 2*ref-1) {
+ if (x & 1) return ref - 1 - (x >> 1);
+ else return ref + (x >> 1);
+ }
+ else return x+1;
+}
+
+/** Synthesizes one partition of coefficient values from a PVQ-encoded
+ * vector.
+ *
+ * @param [out] xcoeff output coefficient partition (x in math doc)
+ * @param [in] ypulse PVQ-encoded values (y in math doc); in the noref
+ * case, this vector has n entries, in the
+ * reference case it contains n-1 entries
+ * (the m-th entry is not included)
+ * @param [in] r16 reference vector (prediction)
+ * @param [in] n number of elements in this partition
+ * @param [in] gr gain of the reference vector (prediction)
+ * @param [in] noref indicates presence or lack of prediction
+ * @param [in] g decoded quantized vector gain
+ * @param [in] theta decoded theta (prediction error)
+ * @param [in] qm_inv Inverse of QM with magnitude compensation
+ * @param [in] shift scaling shift that was applied to r16
+ */
+static void pvq_synthesis(od_coeff *xcoeff, od_coeff *ypulse, od_val16 *r16,
+ int n, od_val32 gr, int noref, od_val32 g, od_val32 theta, const int16_t *qm_inv,
+ int shift) {
+ int s;
+ int m;
+ /* Sign of the Householder reflection vector */
+ s = 0;
+ /* Direction of the Householder reflection vector */
+ m = noref ? 0 : od_compute_householder(r16, n, gr, &s, shift);
+ od_pvq_synthesis_partial(xcoeff, ypulse, r16, n, noref, g, theta, m, s,
+ qm_inv);
+}
+
+typedef struct {
+ od_coeff *ref;
+ int nb_coeffs;
+ int allow_flip;
+} cfl_ctx;
+
+/** Decodes a single vector of integers (eg, a partition within a
+ * coefficient block) encoded using PVQ
+ *
+ * @param [in,out] r range decoder
+ * @param [in] q0 scale/quantizer
+ * @param [in] n number of coefficients in partition
+ * @param [in,out] model entropy decoder state
+ * @param [in,out] adapt adaptation context
+ * @param [in,out] exg ExQ16 expectation of decoded gain value
+ * @param [in,out] ext ExQ16 expectation of decoded theta value
+ * @param [in] ref 'reference' (prediction) vector
+ * @param [out] out decoded partition
+ * @param [out] noref boolean indicating absence of reference
+ * @param [in] beta per-band activity masking beta param
+ * @param [in] is_keyframe whether we're decoding a keyframe
+ * @param [in] pli plane index
+ * @param [in] cdf_ctx selects which cdf context to use
+ * @param [in,out] cfl chroma-from-luma context (decodes the flip bit)
+ * @param [in] has_skip whether this band can also code skipping of the
+ *             remaining bands
+ * @param [in,out] skip_rest whether to skip further bands in each direction
+ * @param [in] band index of the band being decoded
+ * @param [out] skip skip flag with range [0,1]
+ * @param [in] qm QM with magnitude compensation
+ * @param [in] qm_inv Inverse of QM with magnitude compensation
+ */
+static void pvq_decode_partition(aom_reader *r,
+ int q0,
+ int n,
+ generic_encoder model[3],
+ od_adapt_ctx *adapt,
+ int *exg,
+ int *ext,
+ od_coeff *ref,
+ od_coeff *out,
+ int *noref,
+ od_val16 beta,
+ int is_keyframe,
+ int pli,
+ int cdf_ctx,
+ cfl_ctx *cfl,
+ int has_skip,
+ int *skip_rest,
+ int band,
+ int *skip,
+ const int16_t *qm,
+ const int16_t *qm_inv) {
+ int k;
+ od_val32 qcg;
+ int itheta;
+ od_val32 theta;
+ od_val32 gr;
+ od_val32 gain_offset;
+ od_coeff y[MAXN];
+ int qg;
+ int id;
+ int i;
+ od_val16 ref16[MAXN];
+ int rshift;
+ theta = 0;
+ gr = 0;
+ gain_offset = 0;
+ /* Skip is per-direction. For band=0, we can use any of the flags. */
+ if (skip_rest[(band + 2) % 3]) {
+ qg = 0;
+ if (is_keyframe) {
+ itheta = -1;
+ *noref = 1;
+ }
+ else {
+ itheta = 0;
+ *noref = 0;
+ }
+ }
+ else {
+ /* Jointly decode gain, itheta and noref for small values. Then we handle
+ larger gain. */
+ id = aom_read_symbol_pvq(r, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
+ 8 + 7*has_skip, "pvq:gaintheta");
+ if (!is_keyframe && id >= 10) id++;
+ if (is_keyframe && id >= 8) id++;
+ if (id >= 8) {
+ id -= 8;
+ skip_rest[0] = skip_rest[1] = skip_rest[2] = 1;
+ }
+ qg = id & 1;
+ itheta = (id >> 1) - 1;
+ *noref = (itheta == -1);
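+    /* id & 1 is the low bit of the gain and (id >> 1) - 1 is itheta, so
+       ids 0 and 1 code the noref case (itheta == -1); ids that were >= 8
+       before the subtraction above also skip the remaining bands. */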
+ }
+ /* The CfL flip bit is only decoded on the first band that has noref=0. */
+ if (cfl->allow_flip && !*noref) {
+ int flip;
+ flip = aom_read_bit(r, "cfl:flip");
+ if (flip) {
+ for (i = 0; i < cfl->nb_coeffs; i++) cfl->ref[i] = -cfl->ref[i];
+ }
+ cfl->allow_flip = 0;
+ }
+ if (qg > 0) {
+ int tmp;
+ tmp = *exg;
+ qg = 1 + generic_decode(r, &model[!*noref], &tmp, 2, "pvq:gain");
+ OD_IIR_DIADIC(*exg, qg << 16, 2);
+ }
+ *skip = 0;
+#if defined(OD_FLOAT_PVQ)
+ rshift = 0;
+#else
+ /* Shift needed to make the reference fit in 15 bits, so that the Householder
+ vector can fit in 16 bits. */
+ rshift = OD_MAXI(0, od_vector_log_mag(ref, n) - 14);
+#endif
+ for (i = 0; i < n; i++) {
+#if defined(OD_FLOAT_PVQ)
+ ref16[i] = ref[i]*(double)qm[i]*OD_QM_SCALE_1;
+#else
+ ref16[i] = OD_SHR_ROUND(ref[i]*qm[i], OD_QM_SHIFT + rshift);
+#endif
+ }
+ if (!*noref) {
+ /* we have a reference; compute its gain */
+ od_val32 cgr;
+ int icgr;
+ int cfl_enabled;
+ cfl_enabled = pli != 0 && is_keyframe && !OD_DISABLE_CFL;
+ cgr = od_pvq_compute_gain(ref16, n, q0, &gr, beta, rshift);
+ if (cfl_enabled) cgr = OD_CGAIN_SCALE;
+#if defined(OD_FLOAT_PVQ)
+ icgr = (int)floor(.5 + cgr);
+#else
+ icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
+#endif
+ /* quantized gain is interleave encoded when there's a reference;
+ deinterleave it now */
+ if (is_keyframe) qg = neg_deinterleave(qg, icgr);
+ else {
+ qg = neg_deinterleave(qg, icgr + 1) - 1;
+ if (qg == 0) *skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
+ }
+ if (qg == icgr && itheta == 0 && !cfl_enabled) *skip = OD_PVQ_SKIP_COPY;
+ gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
+ qcg = OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset;
+ /* read and decode first-stage PVQ error theta */
+ if (itheta > 1) {
+ int tmp;
+ tmp = *ext;
+ itheta = 2 + generic_decode(r, &model[2], &tmp, 2, "pvq:theta");
+ OD_IIR_DIADIC(*ext, itheta << 16, 2);
+ }
+ theta = od_pvq_compute_theta(itheta, od_pvq_compute_max_theta(qcg, beta));
+ }
+ else {
+ itheta = 0;
+ if (!is_keyframe) qg++;
+ qcg = OD_SHL(qg, OD_CGAIN_SHIFT);
+ if (qg == 0) *skip = OD_PVQ_SKIP_ZERO;
+ }
+
+ k = od_pvq_compute_k(qcg, itheta, *noref, n, beta);
+ if (k != 0) {
+ /* when noref==0, y is actually size n-1 */
+ aom_decode_pvq_codeword(r, &adapt->pvq.pvq_codeword_ctx, y,
+ n - !*noref, k);
+ }
+ else {
+ OD_CLEAR(y, n);
+ }
+ if (*skip) {
+ if (*skip == OD_PVQ_SKIP_COPY) OD_COPY(out, ref, n);
+ else OD_CLEAR(out, n);
+ }
+ else {
+ od_val32 g;
+ g = od_gain_expand(qcg, q0, beta);
+ pvq_synthesis(out, y, ref16, n, gr, *noref, g, theta, qm_inv, rshift);
+ }
+ /* If OD_PVQ_SKIP_ZERO or OD_PVQ_SKIP_COPY, set skip to 1 for visualization */
+ if (*skip) *skip = 1;
+}
+
+/** Decodes a coefficient block (except for DC) encoded using PVQ
+ *
+ * @param [in,out] dec daala decoder context
+ * @param [in] ref 'reference' (prediction) vector
+ * @param [out] out decoded partition
+ * @param [in] q0 quantizer
+ * @param [in] pli plane index
+ * @param [in] bs log of the block size minus two
+ * @param [in] beta per-band activity masking beta param
+ * @param [in] is_keyframe whether we're decoding a keyframe
+ * @param [out] flags bitmask of the per band skip and noref flags
+ * @param [in] ac_dc_coded skip flag for the block (range 0-3)
+ * @param [in] qm QM with magnitude compensation
+ * @param [in] qm_inv Inverse of QM with magnitude compensation
+ */
+void od_pvq_decode(daala_dec_ctx *dec,
+ od_coeff *ref,
+ od_coeff *out,
+ int q0,
+ int pli,
+ int bs,
+ const od_val16 *beta,
+ int is_keyframe,
+ unsigned int *flags,
+ PVQ_SKIP_TYPE ac_dc_coded,
+ const int16_t *qm,
+ const int16_t *qm_inv){
+
+ int noref[PVQ_MAX_PARTITIONS];
+ int skip[PVQ_MAX_PARTITIONS];
+ int *exg;
+ int *ext;
+ int nb_bands;
+ int i;
+ const int *off;
+ int size[PVQ_MAX_PARTITIONS];
+ generic_encoder *model;
+ int skip_rest[3] = {0};
+ cfl_ctx cfl;
+ const unsigned char *pvq_qm;
+ int use_masking;
+
+ aom_clear_system_state();
+
+ /*Default to skip=1 and noref=0 for all bands.*/
+ for (i = 0; i < PVQ_MAX_PARTITIONS; i++) {
+ noref[i] = 0;
+ skip[i] = 1;
+ }
+
+ use_masking = dec->use_activity_masking;
+
+ if (use_masking)
+ pvq_qm = &dec->state.pvq_qm_q4[pli][0];
+ else
+ pvq_qm = 0;
+
+ exg = &dec->state.adapt->pvq.pvq_exg[pli][bs][0];
+ ext = dec->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS;
+ model = dec->state.adapt->pvq.pvq_param_model;
+ nb_bands = OD_BAND_OFFSETS[bs][0];
+ off = &OD_BAND_OFFSETS[bs][1];
+ out[0] = ac_dc_coded & DC_CODED;
+ if (ac_dc_coded < AC_CODED) {
+ if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;
+ else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i];
+ }
+ else {
+ for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i];
+ cfl.ref = ref;
+ cfl.nb_coeffs = off[nb_bands];
+ cfl.allow_flip = pli != 0 && is_keyframe;
+ for (i = 0; i < nb_bands; i++) {
+ int q;
+
+ if (use_masking)
+ q = OD_MAXI(1, q0 * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4);
+ else
+ q = OD_MAXI(1, q0);
+
+ pvq_decode_partition(dec->r, q, size[i],
+ model, dec->state.adapt, exg + i, ext + i, ref + off[i], out + off[i],
+ &noref[i], beta[i], is_keyframe, pli,
+ (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i,
+ &cfl, i == 0 && (i < nb_bands - 1), skip_rest, i, &skip[i],
+ qm + off[i], qm_inv + off[i]);
+ if (i == 0 && !skip_rest[0] && bs > 0) {
+ int skip_dir;
+ int j;
+ skip_dir = aom_read_symbol(dec->r,
+ &dec->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7,
+ "pvq:skiprest");
+ for (j = 0; j < 3; j++) skip_rest[j] = !!(skip_dir & (1 << j));
+ }
+ }
+ }
+ *flags = 0;
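+ /* Pack the per-band flags LSB-first: bit 2*i is skip[i] and bit
+    2*i + 1 is noref[i], so band 0 ends up in the two lowest bits. */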
+ for (i = nb_bands - 1; i >= 0; i--) {
+ *flags <<= 1;
+ *flags |= noref[i]&1;
+ *flags <<= 1;
+ *flags |= skip[i]&1;
+ }
+}
diff --git a/third_party/aom/av1/decoder/pvq_decoder.h b/third_party/aom/av1/decoder/pvq_decoder.h
new file mode 100644
index 000000000..98970663b
--- /dev/null
+++ b/third_party/aom/av1/decoder/pvq_decoder.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#if !defined(_pvq_decoder_H)
+# define _pvq_decoder_H (1)
+# include "aom_dsp/bitreader.h"
+# include "aom_dsp/entdec.h"
+# include "av1/common/pvq.h"
+# include "av1/decoder/decint.h"
+
+#define aom_read_symbol_pvq(r, cdf, nsymbs, ACCT_STR_NAME) \
+ aom_read_symbol_pvq_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+
+int aom_read_symbol_pvq_(aom_reader *r, aom_cdf_prob *cdf, int nsymbs
+ ACCT_STR_PARAM);
+
+void aom_decode_band_pvq_splits(aom_reader *r, od_pvq_codeword_ctx *adapt,
+ od_coeff *y, int n, int k, int level);
+
+#define aom_laplace_decode_special(r, decay, ACCT_STR_NAME) \
+ aom_laplace_decode_special_(r, decay ACCT_STR_ARG(ACCT_STR_NAME))
+
+int aom_laplace_decode_special_(aom_reader *r, unsigned decay ACCT_STR_PARAM);
+
+void od_pvq_decode(daala_dec_ctx *dec, od_coeff *ref, od_coeff *out, int q0,
+ int pli, int bs, const od_val16 *beta, int is_keyframe,
+ unsigned int *flags, PVQ_SKIP_TYPE ac_dc_coded, const int16_t *qm,
+ const int16_t *qm_inv);
+
+#endif
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
new file mode 100644
index 000000000..054b0e062
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/system_state.h"
+
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3 // Neutral Q segment
+#define AQ_C_STRENGTHS 3
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 1.75, 1.25, 1.05, 1.00, 0.90 },
+ { 2.00, 1.50, 1.15, 1.00, 0.85 },
+ { 2.50, 1.75, 1.25, 1.00, 0.80 }
+};
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 0.15, 0.30, 0.55, 2.00, 100.0 },
+ { 0.20, 0.40, 0.65, 2.00, 100.0 },
+ { 0.25, 0.50, 0.75, 2.00, 100.0 }
+};
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { -4.0, -3.0, -2.0, 100.00, 100.0 },
+ { -3.5, -2.5, -1.5, 100.00, 100.0 },
+ { -3.0, -2.0, -1.0, 100.00, 100.0 }
+};
+
+#define DEFAULT_COMPLEXITY 64
+
+static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
+ // Approximate base quantizer (truncated to int)
+ const int base_quant = av1_ac_quant(q_index, 0, bit_depth) / 4;
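+ // E.g. av1_ac_quant() == 44 gives base_quant 11 and strength 1;
+ // av1_ac_quant() >= 104 gives base_quant >= 26 and strength 2.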
+ return (base_quant > 10) + (base_quant > 25);
+}
+
+void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct segmentation *const seg = &cm->seg;
+
+ // Make SURE use of floating point in this function is safe.
+ aom_clear_system_state();
+
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ int segment;
+ const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+
+ // Clear down the segment map.
+ memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
+
+ av1_clearall_segfeatures(seg);
+
+ // Segmentation only makes sense if the target bits per SB is above a
+ // threshold. Below this the overheads will usually outweigh any benefit.
+ if (cpi->rc.sb64_target_rate < 256) {
+ av1_disable_segmentation(seg);
+ return;
+ }
+
+ av1_enable_segmentation(seg);
+
+ // Select delta coding method.
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
+
+ // Use some of the segments for in frame Q adjustment.
+ for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+ int qindex_delta;
+
+ if (segment == DEFAULT_AQ2_SEG) continue;
+
+ qindex_delta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, cm->base_qindex,
+ aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth);
+
+ // For AQ complexity mode, we don't allow Q0 in a segment if the base
+ // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
+ }
+ if ((cm->base_qindex + qindex_delta) > 0) {
+ av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+ av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+ }
+ }
+ }
+}
+
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+#define VAR_STRENGTH_STEP 0.25
+// Select a segment for the current block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average and its spatial complexity.
+void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int xmis = AOMMIN(cm->mi_cols - mi_col, mi_size_wide[bs]);
+ const int ymis = AOMMIN(cm->mi_rows - mi_row, mi_size_high[bs]);
+ int x, y;
+ int i;
+ unsigned char segment;
+
+ if (0) {
+ segment = DEFAULT_AQ2_SEG;
+ } else {
+ // Rate depends on the fraction of an SB64 in the frame: (xmis * ymis) / (bw * bh).
+ // It is converted to bits * 256 units.
+ const int64_t num = (int64_t)cpi->rc.sb64_target_rate * xmis * ymis * 256;
+ const int denom = cm->mib_size * cm->mib_size;
+ const int target_rate = (int)(num / denom);
+ double logvar;
+ double low_var_thresh;
+ const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+
+ aom_clear_system_state();
+ low_var_thresh = (cpi->oxcf.pass == 2) ? AOMMAX(cpi->twopass.mb_av_energy,
+ MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
+
+ av1_setup_src_planes(mb, cpi->source, mi_row, mi_col);
+ logvar = av1_log_block_var(cpi, mb, bs);
+
+ segment = AQ_C_SEGMENTS - 1; // Just in case no break out below.
+ for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+ // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q.
+ if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
+ (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+ segment = i;
+ break;
+ }
+ }
+ }
+
+ // Fill in the entries in the segment map corresponding to this SB64.
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h
new file mode 100644
index 000000000..af525b36d
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_AQ_COMPLEXITY_H_
+#define AV1_ENCODER_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+struct macroblock;
+
+// Select a segment for the current Block.
+void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *,
+ BLOCK_SIZE bs, int mi_row, int mi_col,
+ int projected_rate);
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_AQ_COMPLEXITY_H_
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
new file mode 100644
index 000000000..e41c608b6
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -0,0 +1,566 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/common/seg_common.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/system_state.h"
+
+struct CYCLIC_REFRESH {
+ // Percentage of blocks per frame that are targeted as candidates
+ // for cyclic refresh.
+ int percent_refresh;
+ // Maximum q-delta as percentage of base q.
+ int max_qdelta_perc;
+ // Superblock starting index for cycling through the frame.
+ int sb_index;
+ // Controls how long block will need to wait to be refreshed again, in
+ // excess of the cycle time, i.e., in the case of all zero motion, block
+ // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+ int time_for_refresh;
+ // Target number of (8x8) blocks that are set for delta-q.
+ int target_num_seg_blocks;
+ // Actual number of (8x8) blocks that were applied delta-q.
+ int actual_num_seg1_blocks;
+ int actual_num_seg2_blocks;
+ // RD mult. parameters for segment 1.
+ int rdmult;
+ // Cyclic refresh map.
+ signed char *map;
+ // Map of the last q a block was coded at.
+ uint8_t *last_coded_q_map;
+ // Thresholds applied to the projected rate/distortion of the coding block,
+ // when deciding whether block should be refreshed.
+ int64_t thresh_rate_sb;
+ int64_t thresh_dist_sb;
+ // Threshold applied to the motion vector (in units of 1/8 pel) of the
+ // coding block, when deciding whether block should be refreshed.
+ int16_t motion_thresh;
+ // Rate target ratio to set q delta.
+ double rate_ratio_qdelta;
+ // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+ int rate_boost_fac;
+ double low_content_avg;
+ int qindex_delta[3];
+};
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+ size_t last_coded_q_map_size;
+ CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
+ if (cr == NULL) return NULL;
+
+ cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+ if (cr->map == NULL) {
+ av1_cyclic_refresh_free(cr);
+ return NULL;
+ }
+ last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
+ cr->last_coded_q_map = aom_malloc(last_coded_q_map_size);
+ if (cr->last_coded_q_map == NULL) {
+ av1_cyclic_refresh_free(cr);
+ return NULL;
+ }
+ assert(MAXQ <= 255);
+ memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
+
+ return cr;
+}
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+ aom_free(cr->map);
+ aom_free(cr->last_coded_q_map);
+ aom_free(cr);
+}
+
+// Check if we should turn off cyclic refresh based on bitrate condition.
+static int apply_cyclic_refresh_bitrate(const AV1_COMMON *cm,
+ const RATE_CONTROL *rc) {
+ // Turn off cyclic refresh if bits available per frame is not sufficiently
+ // larger than bit cost of segmentation. Segment map bit cost should scale
+ // with number of seg blocks, so compare available bits to number of blocks.
+ // Average bits available per frame = avg_frame_bandwidth
+ // Number of (8x8) blocks in frame = mi_rows * mi_cols;
+ const float factor = 0.25;
+ const int number_blocks = cm->mi_rows * cm->mi_cols;
+ // The condition below corresponds to turning off at target bitrates:
+ // (at 30fps), ~12kbps for CIF, 36kbps for VGA, 100kbps for HD/720p.
+ // Also turn off at very small frame sizes, to avoid too large a fraction of
+ // superblocks being refreshed per frame. Threshold below is less than QCIF.
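+ // E.g. a 640x480 clip has 80 * 60 = 4800 8x8 blocks, so refresh stays on
+ // while avg_frame_bandwidth is at least 0.25 * 4800 = 1200 bits per frame.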
+ if (rc->avg_frame_bandwidth < factor * number_blocks ||
+ number_blocks / 64 < 5)
+ return 0;
+ else
+ return 1;
+}
+
+// Check if this coding block, of size bsize, should be considered for refresh
+// (lower-qp coding). Decision can be based on various factors, such as
+// size of the coding block (i.e., below min_block size rejected), coding
+// mode, and rate/distortion.
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+ const MB_MODE_INFO *mbmi, int64_t rate,
+ int64_t dist, int bsize) {
+ MV mv = mbmi->mv[0].as_mv;
+ // Reject the block for lower-qp coding if projected distortion
+ // is above the threshold, and any of the following is true:
+ // 1) mode uses large mv
+ // 2) mode is an intra-mode
+ // Otherwise accept for refresh.
+ if (dist > cr->thresh_dist_sb &&
+ (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+ mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+ !is_inter_block(mbmi)))
+ return CR_SEGMENT_ID_BASE;
+ else if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
+ is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 &&
+ cr->rate_boost_fac > 10)
+ // More aggressive delta-q for bigger blocks with zero motion.
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute delta-q for the segment.
+static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int deltaq = av1_compute_qdelta_by_rate(rc, cpi->common.frame_type, q,
+ rate_factor, cpi->common.bit_depth);
+ if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
+ deltaq = -cr->max_qdelta_perc * q / 100;
+ }
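+ // E.g. with q == 100 and max_qdelta_perc == 50, a computed delta of -80
+ // is clamped to -50.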
+ return deltaq;
+}
+
+// For the just encoded frame, estimate the bits, incorporating the delta-q
+// from non-base segment. For now ignore effect of multiple segments
+// (with different delta-q). Note this function is called in the postencode
+// (called from rc_update_rate_correction_factors()).
+int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int estimated_bits;
+ int mbs = cm->MBs;
+ int num8x8bl = mbs << 2;
+ // Weight for non-base segments: use actual number of blocks refreshed in
+ // previous/just encoded frame. Note number of blocks here is in 8x8 units.
+ double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
+ double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
+ // Take segment weighted average for estimated bits.
+ estimated_bits =
+ (int)((1.0 - weight_segment1 - weight_segment2) *
+ av1_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
+ correction_factor, cm->bit_depth) +
+ weight_segment1 *
+ av1_estimate_bits_at_q(cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[1],
+ mbs, correction_factor, cm->bit_depth) +
+ weight_segment2 *
+ av1_estimate_bits_at_q(cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[2],
+ mbs, correction_factor, cm->bit_depth));
+ return estimated_bits;
+}
+
+// Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+// a corresponding delta-q (for segment 1). This function is called in the
+// rc_regulate_q() to set the base qp index.
+// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
+// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
+int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int bits_per_mb;
+ int num8x8bl = cm->MBs << 2;
+ // Weight for segment prior to encoding: take the average of the target
+ // number for the frame to be encoded and the actual from the previous frame.
+ double weight_segment =
+ (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks +
+ cr->actual_num_seg2_blocks) >>
+ 1) /
+ num8x8bl;
+ // Compute delta-q corresponding to qindex i.
+ int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+ // Take segment weighted average for bits per mb.
+ bits_per_mb = (int)((1.0 - weight_segment) *
+ av1_rc_bits_per_mb(cm->frame_type, i,
+ correction_factor, cm->bit_depth) +
+ weight_segment *
+ av1_rc_bits_per_mb(cm->frame_type, i + deltaq,
+ correction_factor, cm->bit_depth));
+ return bits_per_mb;
+}
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi,
+ MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ const int refresh_this_block =
+ candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
+ // Default is to not update the refresh map.
+ int new_map_value = cr->map[block_index];
+ int x = 0;
+ int y = 0;
+
+ // If this block is labeled for refresh, check if we should reset the
+ // segment_id.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ mbmi->segment_id = refresh_this_block;
+ // Reset segment_id if will be skipped.
+ if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE;
+ }
+
+ // Update the cyclic refresh map, to be used for setting segmentation map
+ // for the next frame. If the block will be refreshed this frame, mark it
+ // as clean. The magnitude of the negative value influences how long it
+ // will be before we consider the block for refresh again.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ new_map_value = -cr->time_for_refresh;
+ } else if (refresh_this_block) {
+ // Else if it is accepted as candidate for refresh, and has not already
+ // been refreshed (marked as 1) then mark it as a candidate for cleanup
+ // for future time (marked as 0), otherwise don't update it.
+ if (cr->map[block_index] == 1) new_map_value = 0;
+ } else {
+ // Leave it marked as block that is not candidate for refresh.
+ new_map_value = 1;
+ }
+
+ // Update entries in the cyclic refresh map with new_map_value, and
+ // copy mbmi->segment_id into global segmentation map.
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ int map_offset = block_index + y * cm->mi_cols + x;
+ cr->map[map_offset] = new_map_value;
+ cpi->segmentation_map[map_offset] = mbmi->segment_id;
+ // Inter skip blocks were clearly not coded at the current qindex, so
+ // don't update the map for them. For cases where motion is non-zero or
+ // the reference frame isn't the previous frame, the previous value in
+ // the map for this spatial location is not entirely correct.
+ if ((!is_inter_block(mbmi) || !skip) &&
+ mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+ cr->last_coded_q_map[map_offset] = clamp(
+ cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
+ } else if (is_inter_block(mbmi) && skip &&
+ mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+ cr->last_coded_q_map[map_offset] =
+ AOMMIN(clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id],
+ 0, MAXQ),
+ cr->last_coded_q_map[map_offset]);
+ }
+ }
+}
+
+// Update the actual number of blocks that were applied the segment delta q.
+void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int mi_row, mi_col;
+ cr->actual_num_seg1_blocks = 0;
+ cr->actual_num_seg2_blocks = 0;
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++)
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ if (cyclic_refresh_segment_id(seg_map[mi_row * cm->mi_cols + mi_col]) ==
+ CR_SEGMENT_ID_BOOST1)
+ cr->actual_num_seg1_blocks++;
+ else if (cyclic_refresh_segment_id(
+ seg_map[mi_row * cm->mi_cols + mi_col]) ==
+ CR_SEGMENT_ID_BOOST2)
+ cr->actual_num_seg2_blocks++;
+ }
+}
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ // Set minimum gf_interval for GF update to a multiple (== 4) of refresh
+ // period. Depending on past encoding stats, GF flag may be reset and update
+ // may not occur until next baseline_gf_interval.
+ if (cr->percent_refresh > 0)
+ rc->baseline_gf_interval = 4 * (100 / cr->percent_refresh);
+ else
+ rc->baseline_gf_interval = 40;
+}
+
+// Update some encoding stats (from the just encoded frame). If this frame's
+// background has high motion, refresh the golden frame. Otherwise, if the
+// golden reference is to be updated check if we should NOT update the golden
+// ref.
+void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int mi_row, mi_col;
+ double fraction_low = 0.0;
+ int low_content_frame = 0;
+
+ MODE_INFO **mi;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int rows = cm->mi_rows, cols = cm->mi_cols;
+ int cnt1 = 0, cnt2 = 0;
+ int force_gf_refresh = 0;
+
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0
+ ? mi[0]->mbmi.mv[0].as_mv.row
+ : -1 * mi[0]->mbmi.mv[0].as_mv.row;
+ int16_t abs_mvc = mi[0]->mbmi.mv[0].as_mv.col >= 0
+ ? mi[0]->mbmi.mv[0].as_mv.col
+ : -1 * mi[0]->mbmi.mv[0].as_mv.col;
+
+ // Calculate the motion of the background.
+ if (abs_mvr <= 16 && abs_mvc <= 16) {
+ cnt1++;
+ if (abs_mvr == 0 && abs_mvc == 0) cnt2++;
+ }
+ mi++;
+
+ // Accumulate low_content_frame.
+ if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++;
+ }
+ }
+
+ // For video conference clips, if the background has high motion in current
+ // frame because of the camera movement, set this frame as the golden frame.
+ // Use 70% and 5% as the thresholds for golden frame refreshing.
+ // Also, force this frame as a golden update frame if this frame will change
+ // the resolution (resize_pending != 0).
+ if (cpi->resize_pending != 0 ||
+ (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
+ av1_cyclic_refresh_set_golden_update(cpi);
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ force_gf_refresh = 1;
+ }
+
+ fraction_low = (double)low_content_frame / (rows * cols);
+ // Update average.
+ cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
+ if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
+ // Don't update golden reference if the amount of low_content for the
+ // current encoded frame is small, or if the recursive average of the
+ // low_content over the update interval window falls below threshold.
+ if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
+ cpi->refresh_golden_frame = 0;
+ // Reset for next interval.
+ cr->low_content_avg = fraction_low;
+ }
+}
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+ int xmis, ymis, x, y;
+ memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
+ sb_cols = (cm->mi_cols + cm->mib_size - 1) / cm->mib_size;
+ sb_rows = (cm->mi_rows + cm->mib_size - 1) / cm->mib_size;
+ sbs_in_frame = sb_cols * sb_rows;
+ // Number of target blocks to get the q delta (segment 1).
+ block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+ // Set the segmentation map: cycle through the superblocks, starting at
+ // cr->sb_index, and stopping when either block_count blocks have been found
+ // to be refreshed, or we have passed through the whole frame.
+ assert(cr->sb_index < sbs_in_frame);
+ i = cr->sb_index;
+ cr->target_num_seg_blocks = 0;
+ do {
+ int sum_map = 0;
+ // Get the mi_row/mi_col corresponding to superblock index i.
+ int sb_row_index = (i / sb_cols);
+ int sb_col_index = i - sb_row_index * sb_cols;
+ int mi_row = sb_row_index * cm->mib_size;
+ int mi_col = sb_col_index * cm->mib_size;
+ int qindex_thresh =
+ cpi->oxcf.content == AOM_CONTENT_SCREEN
+ ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
+ : 0;
+ assert(mi_row >= 0 && mi_row < cm->mi_rows);
+ assert(mi_col >= 0 && mi_col < cm->mi_cols);
+ bl_index = mi_row * cm->mi_cols + mi_col;
+ // Loop through all MI blocks in superblock and update map.
+ xmis = AOMMIN(cm->mi_cols - mi_col, cm->mib_size);
+ ymis = AOMMIN(cm->mi_rows - mi_row, cm->mib_size);
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ const int bl_index2 = bl_index + y * cm->mi_cols + x;
+        // If the block is a candidate for clean up then mark it
+ // for possible boost/refresh (segment 1). The segment id may get
+ // reset to 0 later if block gets coded anything other than ZEROMV.
+ if (cr->map[bl_index2] == 0) {
+ if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++;
+ } else if (cr->map[bl_index2] < 0) {
+ cr->map[bl_index2]++;
+ }
+ }
+ }
+ // Enforce constant segment over superblock.
+ // If segment is at least half of superblock, set to 1.
+ if (sum_map >= xmis * ymis / 2) {
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+ }
+ cr->target_num_seg_blocks += xmis * ymis;
+ }
+ i++;
+ if (i == sbs_in_frame) {
+ i = 0;
+ }
+ } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+ cr->sb_index = i;
+}
+
+// Set cyclic refresh parameters.
+void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ cr->percent_refresh = 10;
+ cr->max_qdelta_perc = 50;
+ cr->time_for_refresh = 0;
+ // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
+ // periods of the refresh cycle, after a key frame.
+ if (rc->frames_since_key < 4 * cr->percent_refresh)
+ cr->rate_ratio_qdelta = 3.0;
+ else
+ cr->rate_ratio_qdelta = 2.0;
+ // Adjust some parameters for low resolutions at low bitrates.
+ if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) {
+ cr->motion_thresh = 4;
+ cr->rate_boost_fac = 10;
+ } else {
+ cr->motion_thresh = 32;
+ cr->rate_boost_fac = 17;
+ }
+}
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ struct segmentation *const seg = &cm->seg;
+ const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
+ if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
+ // Don't apply refresh on key frame or enhancement layer frames.
+ if (!apply_cyclic_refresh || cm->frame_type == KEY_FRAME) {
+ // Set segmentation map to 0 and disable.
+ unsigned char *const seg_map = cpi->segmentation_map;
+ memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+ av1_disable_segmentation(&cm->seg);
+ if (cm->frame_type == KEY_FRAME) {
+ memset(cr->last_coded_q_map, MAXQ,
+ cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+ cr->sb_index = 0;
+ }
+ return;
+ } else {
+ int qindex_delta = 0;
+ int qindex2;
+ const double q = av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
+ aom_clear_system_state();
+ // Set rate threshold to some multiple (set to 2 for now) of the target
+ // rate (target is given by sb64_target_rate and scaled by 256).
+ cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
+ // Distortion threshold, quadratic in Q, scale factor to be adjusted.
+ // q will not exceed 457, so (q * q) is within 32bit; see:
+ // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[].
+ cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+
+ // Set up segmentation.
+ // Clear down the segment map.
+ av1_enable_segmentation(&cm->seg);
+ av1_clearall_segfeatures(seg);
+ // Select delta coding method.
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ // Note: setting temporal_update has no effect, as the seg-map coding method
+ // (temporal or spatial) is determined in
+ // av1_choose_segmap_coding_method(),
+    // based on the coding cost of each method. In error_resilient mode the
+    // last_frame_seg_map is set to 0, so if temporal coding is used, it is
+    // relative to an all-zero previous map.
+ // seg->temporal_update = 0;
+
+ // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+ // Use segment BOOST1 for in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+ // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+ // Set the q delta for segment BOOST1.
+ qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
+ cr->qindex_delta[1] = qindex_delta;
+
+ // Compute rd-mult for segment BOOST1.
+ qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+
+ cr->rdmult = av1_compute_rd_mult(cpi, qindex2);
+
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Set a more aggressive (higher) q delta for segment BOOST2.
+ qindex_delta = compute_deltaq(
+ cpi, cm->base_qindex,
+ AOMMIN(CR_MAX_RATE_TARGET_RATIO,
+ 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+ cr->qindex_delta[2] = qindex_delta;
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Update the segmentation and refresh map.
+ cyclic_refresh_update_map(cpi);
+ }
+}
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+ return cr->rdmult;
+}
+
+void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
+ cr->sb_index = 0;
+ cpi->refresh_golden_frame = 1;
+}
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
new file mode 100644
index 000000000..459ab80b8
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#define AV1_ENCODER_AQ_CYCLICREFRESH_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE 0
+#define CR_SEGMENT_ID_BOOST1 1
+#define CR_SEGMENT_ID_BOOST2 2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+struct AV1_COMP;
+
+struct CYCLIC_REFRESH;
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+// Estimate the bits, incorporating the delta-q from segment 1, after encoding
+// the frame.
+int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi,
+ double correction_factor);
+
+// Estimate the bits per mb, for a given q = i and a corresponding delta-q
+// (for segment 1), prior to encoding the frame.
+int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
+ double correction_factor);
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
+ MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip);
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+void av1_cyclic_refresh_update_map(struct AV1_COMP *const cpi);
+
+// Update the actual number of blocks that were applied the segment delta q.
+void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi);
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
+
+// Check if we should not update golden reference, based on past refresh stats.
+void av1_cyclic_refresh_check_golden_update(struct AV1_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi);
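+
+/* Typical per-frame call order (a sketch): av1_cyclic_refresh_update_parameters()
+   and av1_cyclic_refresh_setup() before encoding the frame,
+   av1_cyclic_refresh_update_segment() for each coded block, and
+   av1_cyclic_refresh_postencode() once the frame is encoded. */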
+
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+ return segment_id == CR_SEGMENT_ID_BOOST1 ||
+ segment_id == CR_SEGMENT_ID_BOOST2;
+}
+
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+ if (segment_id == CR_SEGMENT_ID_BOOST1)
+ return CR_SEGMENT_ID_BOOST1;
+ else if (segment_id == CR_SEGMENT_ID_BOOST2)
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BASE;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
new file mode 100644
index 000000000..ab9b3790b
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/encoder/aq_variance.h"
+
+#include "av1/common/seg_common.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_ports/system_state.h"
+
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+#define ENERGY_IN_BOUNDS(energy) \
+ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+static const double rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
+ 0.75, 1.0, 1.0, 1.0 };
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
+
+DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+#if CONFIG_HIGHBITDEPTH
+DECLARE_ALIGNED(16, static const uint16_t,
+ av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+#endif
+
+unsigned int av1_vaq_segment_id(int energy) {
+ ENERGY_IN_BOUNDS(energy);
+ return SEGMENT_ID(energy);
+}
+
+void av1_vaq_frame_setup(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ struct segmentation *seg = &cm->seg;
+ int i;
+
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ cpi->vaq_refresh = 1;
+
+ av1_enable_segmentation(seg);
+ av1_clearall_segfeatures(seg);
+
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ aom_clear_system_state();
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ int qindex_delta =
+ av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+ rate_ratio[i], cm->bit_depth);
+
+ // We don't allow qindex 0 in a segment if the base value is not 0.
+ // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
+ }
+
+ // No need to enable SEG_LVL_ALT_Q for this segment.
+ if (rate_ratio[i] == 1.0) {
+ continue;
+ }
+
+ av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+ av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ }
+ }
+}
+
+/* TODO(agrange, paulwilkins): block_variance() calls the unoptimized versions
+ * of variance() and highbd_8_variance(); it should not.
+ */
+static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, unsigned int *sse,
+ int *sum) {
+ int i, j;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
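+// High-bit-depth path: the sum and sum of squares can exceed 32 bits, so
+// both accumulators are 64-bit; the 8-bit wrapper below narrows the totals
+// back to the 32-bit interface once the risk of overflow has passed.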
+static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint64_t *sse, uint64_t *sum) {
+ int i, j;
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+static void aq_highbd_8_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ unsigned int *sse, int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (unsigned int)sse_long;
+ *sum = (int)sum_long;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
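+// Variance of the source block against an all-zero reference, normalized to
+// a per-256-pixel scale. For blocks straddling the right or bottom frame
+// edge, only the visible bw x bh region is measured.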
+static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int var, sse;
+ int right_overflow =
+ (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+ int bottom_overflow =
+ (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+ if (right_overflow || bottom_overflow) {
+ const int bw = 8 * mi_size_wide[bs] - right_overflow;
+ const int bh = 8 * mi_size_high[bs] - bottom_overflow;
+ int avg;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh,
+ &sse, &avg);
+ sse >>= 2 * (xd->bd - 8);
+ avg >>= (xd->bd - 8);
+ } else {
+ aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0,
+ bw, bh, &sse, &avg);
+ }
+#else
+ aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0,
+ bw, bh, &sse, &avg);
+#endif // CONFIG_HIGHBITDEPTH
+ var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh));
+ return (unsigned int)((uint64_t)var * 256) / (bw * bh);
+ } else {
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ var =
+ cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse);
+ } else {
+ var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ av1_all_zeros, 0, &sse);
+ }
+#else
+ var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ av1_all_zeros, 0, &sse);
+#endif // CONFIG_HIGHBITDEPTH
+ return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
+ }
+}
+
+double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ unsigned int var = block_variance(cpi, x, bs);
+ aom_clear_system_state();
+ return log(var + 1.0);
+}
+
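+// Block energy is the log variance re-centered on a midpoint: the measured
+// average macroblock energy in two-pass mode, a fixed default otherwise.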
+#define DEFAULT_E_MIDPOINT 10.0
+int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ double energy;
+ double energy_midpoint;
+ aom_clear_system_state();
+ energy_midpoint =
+ (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT;
+ energy = av1_log_block_var(cpi, x, bs) - energy_midpoint;
+ return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
+}
diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h
new file mode 100644
index 000000000..05725c5de
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_AQ_VARIANCE_H_
+#define AV1_ENCODER_AQ_VARIANCE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int av1_vaq_segment_id(int energy);
+void av1_vaq_frame_setup(AV1_COMP *cpi);
+
+int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_AQ_VARIANCE_H_
diff --git a/third_party/aom/av1/encoder/arm/neon/dct_neon.c b/third_party/aom/av1/encoder/arm/neon/dct_neon.c
new file mode 100644
index 000000000..f6ce24a3d
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/dct_neon.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "av1/common/blockd.h"
+#include "aom_dsp/txfm_common.h"
+
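+// Fused forward 8x8 DCT + fp quantization using the NEON kernels: the
+// transform output is staged in temp_buffer and coeff_ptr is unused.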
+void av1_fdct8x8_quant_neon(const int16_t *input, int stride,
+ int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ int16_t temp_buffer[64];
+ (void)coeff_ptr;
+
+ aom_fdct8x8_neon(input, temp_buffer, stride);
+ av1_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/error_neon.c b/third_party/aom/av1/encoder/arm/neon/error_neon.c
new file mode 100644
index 000000000..fe5233f89
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/error_neon.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./av1_rtcd.h"
+
+int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
+ int block_size) {
+ int64x2_t error = vdupq_n_s64(0);
+
+ assert(block_size >= 8);
+ assert((block_size % 8) == 0);
+
+ do {
+ const int16x8_t c = vld1q_s16(coeff);
+ const int16x8_t d = vld1q_s16(dqcoeff);
+ const int16x8_t diff = vsubq_s16(c, d);
+ const int16x4_t diff_lo = vget_low_s16(diff);
+ const int16x4_t diff_hi = vget_high_s16(diff);
+      // diff is 15 bits and each square 30 bits, so two squares fit in 31
+      // bits before being accumulated into the 64-bit total.
+ const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
+ const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
+ const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
+ error = vaddq_s64(error, err2);
+ coeff += 8;
+ dqcoeff += 8;
+ block_size -= 8;
+ } while (block_size != 0);
+
+ return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
new file mode 100644
index 000000000..36e7d3370
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+void av1_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+  // TODO(jingning): Decide whether these arguments are still needed once the
+  // quantization process is finalized.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ if (!skip_block) {
+    // Quantization pass: all coefficients are quantized; the end-of-block
+    // marker tracks the last scan position with a nonzero quantized value.
+ int i;
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t v_one = vdupq_n_s16(1);
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
+ int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+ int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ // adjust for dc
+ v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+ v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+ v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+ // process dc and the first seven ac coeffs
+ {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+ vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
+ v_round = vmovq_n_s16(round_ptr[1]);
+ v_quant = vmovq_n_s16(quant_ptr[1]);
+ v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ }
+ // now process the rest of the ac coeffs
+ for (i = 8; i < count; i += 8) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+ vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+ }
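+    // Horizontal max across the eight per-lane eob maxima: fold 8 lanes to
+    // 4, then use 64-bit shifts to compare lanes pairwise until lane 0
+    // holds the overall maximum.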
+ {
+ const int16x4_t v_eobmax_3210 = vmax_s16(
+ vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+ *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+ }
+ } else {
+ memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+ memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ *eob_ptr = 0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
new file mode 100644
index 000000000..6cffac264
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -0,0 +1,1790 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+#if CONFIG_NEW_QUANT
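+// Non-uniform quantization (NUQ): the first NUQ_KNOTS bins are delimited by
+// the cumulative boundaries in cuml_bins_ptr; magnitudes beyond the last
+// knot fall back to the regular uniform quantizer step.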
+static INLINE int quantize_coeff_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+ const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < cuml_bins_ptr[i]) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ tmp -= cuml_bins_ptr[NUQ_KNOTS - 1];
+ q = NUQ_KNOTS + (((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16);
+ }
+ if (q) {
+ *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
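+// Large-transform variant: bin thresholds and the dequantized output are
+// rounded down by logsizeby16 bits (via ROUND_POWER_OF_TWO) to match the
+// reduced precision used for 32x32 and larger transforms.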
+static INLINE int quantize_coeff_bigtx_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+ const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int logsizeby16) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16);
+ q = NUQ_KNOTS +
+ (((((tmp * quant) >> 16) + tmp) * quant_shift) >> (16 - logsizeby16));
+ }
+ if (q) {
+ *dqcoeff_ptr = ROUND_POWER_OF_TWO(
+ av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+static INLINE int quantize_coeff_fp_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < cuml_bins_ptr[i]) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ q = NUQ_KNOTS +
+ ((((int64_t)tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16);
+ }
+ if (q) {
+ *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+static INLINE int quantize_coeff_bigtx_fp_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ q = NUQ_KNOTS +
+ ((((int64_t)tmp -
+ ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) *
+ quant) >>
+ (16 - logsizeby16));
+ }
+ if (q) {
+ *dqcoeff_ptr = ROUND_POWER_OF_TWO(
+ av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+void quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr, dqcoeff_ptr))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant, cuml_bins_ptr,
+ dequant_val, qcoeff_ptr, dqcoeff_ptr))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr,
+ dqcoeff_ptr, av1_get_tx_scale(TX_32X32)))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr,
+ dqcoeff_ptr, av1_get_tx_scale(TX_32X32)))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_TX64X64
+void quantize_dc_64x64_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr,
+ dqcoeff_ptr, av1_get_tx_scale(TX_64X64)))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_dc_64x64_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr,
+ dqcoeff_ptr, av1_get_tx_scale(TX_64X64)))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_TX64X64
+
+void quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (quantize_coeff_nuq(coeff_ptr[rc], quant_ptr[rc != 0],
+ quant_shift_ptr[rc != 0], dequant_ptr[rc != 0],
+ cuml_bins_ptr[band[i]], dequant_val[band[i]],
+ &qcoeff_ptr[rc], &dqcoeff_ptr[rc]))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant_ptr[rc != 0],
+ dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+ dequant_val[band[i]], &qcoeff_ptr[rc],
+ &dqcoeff_ptr[rc]))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (quantize_coeff_bigtx_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
+ dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+ dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+ av1_get_tx_scale(TX_32X32)))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (quantize_coeff_bigtx_fp_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
+ cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
+ &dqcoeff_ptr[rc], av1_get_tx_scale(TX_32X32)))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_TX64X64
+void quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (quantize_coeff_bigtx_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
+ dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+ dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+ av1_get_tx_scale(TX_64X64)))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (quantize_coeff_bigtx_fp_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
+ cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
+ &dqcoeff_ptr[rc], av1_get_tx_scale(TX_64X64)))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_NEW_QUANT
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ *eob_ptr = 0;
+}
+
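+// Shared "fp" quantizer: log_scale is 0 for transforms up to 16x16, 1 for
+// 32x32 and 2 for 64x64; it scales the rounding term and the skip threshold
+// and rescales the dequantized output accordingly.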
+static void quantize_fp_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+#endif
+ int log_scale) {
+ int i, eob = -1;
+  // TODO(jingning): Decide whether these arguments are still needed once the
+  // quantization process is finalized.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+    // Quantization pass: all coefficients are quantized; the end-of-block
+    // marker tracks the last scan position with a nonzero quantized value.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+ const qm_val_t wt = qm_ptr[rc];
+ const qm_val_t iwt = iqm_ptr[rc];
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+#endif
+ const int coeff_sign = (coeff >> 31);
+ int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32 = 0;
+#if CONFIG_AOM_QM
+ if (abs_coeff * wt >=
+ (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+#else
+ if (abs_coeff >= (dequant_ptr[rc != 0] >> (1 + log_scale))) {
+#endif
+ abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+#if CONFIG_AOM_QM
+ tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
+ ((16 - log_scale) + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale);
+#else
+ tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] =
+ qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (1 << log_scale);
+#endif
+ }
+
+ if (tmp32) eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ 0);
+}
+
+void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ 1);
+}
+
+#if CONFIG_TX64X64
+void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ 2);
+}
+#endif // CONFIG_TX64X64
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif // CONFIG_AOM_QM
+
+ switch (qparam->log_scale) {
+ case 0:
+ if (n_coeffs < 16) {
+ // TODO(jingning): Need SIMD implementation for smaller block size
+ // quantization.
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ qparam->log_scale);
+ } else {
+ av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ }
+ break;
+ case 1:
+ av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ av1_quantize_fp_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif // CONFIG_AOM_QM
+
+ switch (qparam->log_scale) {
+ case 0:
+ aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+ case 1:
+ aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif // CONFIG_AOM_QM
+
+ (void)sc;
+
+ switch (qparam->log_scale) {
+ case 0:
+ aom_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+ p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
+ eob_ptr
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+ case 1:
+ aom_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#if CONFIG_TX64X64
+    case 2:
+      aom_quantize_dc_64x64(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+                            qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr
+#if CONFIG_AOM_QM
+                            ,
+                            qm_ptr, iqm_ptr
+#endif
+                            );
+      break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+
+#if CONFIG_NEW_QUANT
+void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+ const uint8_t *band = get_band_translate(qparam->tx_size);
+ int dq = qparam->dq;
+
+ switch (qparam->log_scale) {
+ case 0:
+ quantize_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift,
+ pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+ qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+ case 1:
+ quantize_32x32_nuq(coeff_ptr, n_coeffs, skip_block, p->quant,
+ p->quant_shift, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+ qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ quantize_64x64_nuq(coeff_ptr, n_coeffs, skip_block, p->quant,
+ p->quant_shift, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+ qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+
+void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+ const uint8_t *band = get_band_translate(qparam->tx_size);
+ int dq = qparam->dq;
+
+ switch (qparam->log_scale) {
+ case 0:
+ quantize_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+ qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+ case 1:
+ quantize_32x32_fp_nuq(
+ coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ quantize_64x64_fp_nuq(
+ coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+
+void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+ int dq = qparam->dq;
+ (void)sc;
+
+ switch (qparam->log_scale) {
+ case 0:
+ quantize_dc_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0],
+ pd->dequant[0], p->cuml_bins_nuq[dq][0],
+ pd->dequant_val_nuq[dq][0], qcoeff_ptr, dqcoeff_ptr,
+ eob_ptr);
+ break;
+ case 1:
+ quantize_dc_32x32_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0],
+ pd->dequant[0], p->cuml_bins_nuq[dq][0],
+ pd->dequant_val_nuq[dq][0], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr);
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ quantize_dc_64x64_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0],
+ pd->dequant[0], p->cuml_bins_nuq[dq][0],
+ pd->dequant_val_nuq[dq][0], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr);
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+#endif // CONFIG_NEW_QUANT
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif // CONFIG_AOM_QM
+
+ if (n_coeffs < 16) {
+ // TODO(jingning): Need SIMD implementation for smaller block size
+ // quantization.
+ av1_highbd_quantize_fp_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ qparam->log_scale);
+ return;
+ }
+
+ av1_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ qparam->log_scale);
+}
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif // CONFIG_AOM_QM
+
+ switch (qparam->log_scale) {
+ case 0:
+ aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+ case 1:
+ aom_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round, p->quant, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ aom_highbd_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round, p->quant, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
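+// DC-only quantization for the high-bit-depth path: every other output is
+// zeroed up front, and the 64-bit intermediate keeps round + multiply from
+// overflowing at larger bit depths.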
+static INLINE void highbd_quantize_dc(
+ const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+ const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+#endif
+ const int log_scale) {
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+#if CONFIG_AOM_QM
+ (void)qm_ptr;
+ (void)iqm_ptr;
+#endif
+ if (!skip_block) {
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + round_ptr[0];
+ const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> (16 - log_scale));
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / (1 << log_scale);
+ if (abs_qcoeff) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif // CONFIG_AOM_QM
+
+ (void)sc;
+
+ highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+ p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
+ eob_ptr,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ qparam->log_scale);
+}
+
+#if CONFIG_NEW_QUANT
+static INLINE int highbd_quantize_coeff_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+ const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < cuml_bins_ptr[i]) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ tmp -= cuml_bins_ptr[NUQ_KNOTS - 1];
+ q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16);
+ }
+ if (q) {
+ *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+static INLINE int highbd_quantize_coeff_fp_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < cuml_bins_ptr[i]) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ q = NUQ_KNOTS + (int)(((tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16);
+ }
+ if (q) {
+ *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+static INLINE int highbd_quantize_coeff_bigtx_fp_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ q = NUQ_KNOTS +
+ (int)(((tmp -
+ ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) *
+ quant) >>
+ (16 - logsizeby16));
+ }
+ if (q) {
+ *dqcoeff_ptr = ROUND_POWER_OF_TWO(
+ av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+static INLINE int highbd_quantize_coeff_bigtx_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+ const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int logsizeby16) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16);
+ q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >>
+ (16 - logsizeby16));
+ }
+ if (q) {
+ *dqcoeff_ptr = ROUND_POWER_OF_TWO(
+ av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+void highbd_quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (highbd_quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr,
+ dqcoeff_ptr))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (highbd_quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr,
+ dqcoeff_ptr))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (highbd_quantize_coeff_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
+ dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+ dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc]))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (highbd_quantize_coeff_bigtx_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
+ dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+ dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+ av1_get_tx_scale(TX_32X32)))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, int skip_block,
+ const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (highbd_quantize_coeff_bigtx_fp_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
+ cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
+ &dqcoeff_ptr[rc], av1_get_tx_scale(TX_32X32)))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_TX64X64
+void highbd_quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (highbd_quantize_coeff_bigtx_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
+ dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+ dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+ av1_get_tx_scale(TX_64X64)))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, int skip_block,
+ const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (highbd_quantize_coeff_bigtx_fp_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
+ cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
+ &dqcoeff_ptr[rc], av1_get_tx_scale(TX_64X64)))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_TX64X64
+
+void highbd_quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (highbd_quantize_coeff_fp_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
+ cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
+ &dqcoeff_ptr[rc]))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_dc_32x32_nuq(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t quant, const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (highbd_quantize_coeff_bigtx_nuq(
+ coeff_ptr[rc], quant, quant_shift, dequant, cuml_bins_ptr,
+ dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_32X32)))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_dc_32x32_fp_nuq(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (highbd_quantize_coeff_bigtx_fp_nuq(
+ coeff_ptr[rc], quant, dequant, cuml_bins_ptr, dequant_val,
+ qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_32X32)))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_TX64X64
+void highbd_quantize_dc_64x64_nuq(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t quant, const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (highbd_quantize_coeff_bigtx_nuq(
+ coeff_ptr[rc], quant, quant_shift, dequant, cuml_bins_ptr,
+ dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_64X64)))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_dc_64x64_fp_nuq(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (highbd_quantize_coeff_bigtx_fp_nuq(
+ coeff_ptr[rc], quant, dequant, cuml_bins_ptr, dequant_val,
+ qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_64X64)))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_TX64X64
+
+void av1_highbd_quantize_b_nuq_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // skip_block is obsolete; it is always zero here.
+ const int skip_block = 0;
+ const uint8_t *band = get_band_translate(qparam->tx_size);
+ const int dq = qparam->dq;
+
+ switch (qparam->log_scale) {
+ case 0:
+ highbd_quantize_nuq(coeff_ptr, n_coeffs, skip_block, p->quant,
+ p->quant_shift, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+ qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+ case 1:
+ highbd_quantize_32x32_nuq(
+ coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift,
+ pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ highbd_quantize_64x64_nuq(
+ coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift,
+ pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+
+void av1_highbd_quantize_fp_nuq_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // skip_block is obsolete; it is always zero here.
+ const int skip_block = 0;
+ const uint8_t *band = get_band_translate(qparam->tx_size);
+ const int dq = qparam->dq;
+
+ switch (qparam->log_scale) {
+ case 0:
+ highbd_quantize_fp_nuq(
+ coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+ case 1:
+ highbd_quantize_32x32_fp_nuq(
+ coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ highbd_quantize_64x64_fp_nuq(
+ coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr, sc->scan, band);
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+
+void av1_highbd_quantize_dc_nuq_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // skip_block is obsolete; it is always zero here.
+ const int skip_block = 0;
+ const int dq = qparam->dq;
+ (void)sc;
+
+ switch (qparam->log_scale) {
+ case 0:
+ highbd_quantize_dc_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0],
+ pd->dequant[0], p->cuml_bins_nuq[dq][0],
+ pd->dequant_val_nuq[dq][0], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr);
+ break;
+ case 1:
+ highbd_quantize_dc_32x32_fp_nuq(
+ coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], pd->dequant[0],
+ p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr);
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ highbd_quantize_dc_64x64_fp_nuq(
+ coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], pd->dequant[0],
+ p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0], qcoeff_ptr,
+ dqcoeff_ptr, eob_ptr);
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+#endif // CONFIG_NEW_QUANT
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+#endif
+ int log_scale) {
+ int i;
+ int eob = -1;
+ const int scale = 1 << log_scale;
+ const int shift = 16 - log_scale;
+ // TODO(jingning): Decide whether these arguments are still needed once the
+ // quantization rework is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Quantization pass: quantize the coefficients in scan order and track the
+ // last nonzero position so that eob can be derived below.
+ for (i = 0; i < count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+ const qm_val_t wt = qm_ptr[rc];
+ const qm_val_t iwt = iqm_ptr[rc];
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+#endif
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + round_ptr[rc != 0];
+#if CONFIG_AOM_QM
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / scale;
+#else
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp * quant_ptr[rc != 0]) >> shift);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
+#endif
+ if (abs_qcoeff) eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+#endif // CONFIG_HIGHBITDEPTH
+
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {
+ uint32_t t;
+ int l, m;
+ t = d;
+ for (l = 0; t > 1; l++) t >>= 1;
+ m = 1 + (1 << (16 + l)) / d;
+ *quant = (int16_t)(m - (1 << 16));
+ *shift = 1 << (16 - l);
+}
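+
+// Worked example (illustrative only): for d = 100, l = floor(log2(100)) = 6,
+// m = 1 + (1 << 22) / 100 = 41944, so *quant = 41944 - 65536 = -23592 and
+// *shift = 1 << 10 = 1024. A quantize_b-style kernel (not in this hunk) can
+// then divide by d with two multiplies: for tmp = 1000,
+//   ((tmp * -23592) >> 16) + tmp = -360 + 1000 = 640, and
+//   (640 * 1024) >> 16 = 10 == 1000 / 100.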
+
+static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) {
+ const int quant = av1_dc_quant(q, 0, bit_depth);
+#if CONFIG_HIGHBITDEPTH
+ switch (bit_depth) {
+ case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+ case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80);
+ case AOM_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+#else
+ (void)bit_depth;
+ return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+#endif
+}
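+
+// The returned factor is a Q7 fraction of the quantizer step: below,
+// av1_init_quantizer() computes zbin = ROUND_POWER_OF_TWO(qzbin_factor *
+// quant, 7), so 84 gives a zero bin of 84/128 ~= 0.66 of a step and 80 gives
+// 0.625. The 148/592/2368 thresholds scale by 4x per two extra bits of depth.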
+
+void av1_init_quantizer(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ QUANTS *const quants = &cpi->quants;
+ int i, q, quant;
+#if CONFIG_NEW_QUANT
+ int dq;
+#endif
+
+ for (q = 0; q < QINDEX_RANGE; q++) {
+ const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
+ const int qrounding_factor = q == 0 ? 64 : 48;
+
+ for (i = 0; i < 2; ++i) {
+ int qrounding_factor_fp = 64;
+ // y
+ quant = i == 0 ? av1_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
+ : av1_ac_quant(q, 0, cm->bit_depth);
+ invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
+ quants->y_quant_fp[q][i] = (1 << 16) / quant;
+ quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
+ quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+ quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
+ cpi->y_dequant[q][i] = quant;
+
+ // uv
+ quant = i == 0 ? av1_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth)
+ : av1_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth);
+ invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i],
+ quant);
+ quants->uv_quant_fp[q][i] = (1 << 16) / quant;
+ quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
+ quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+ quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
+ cpi->uv_dequant[q][i] = quant;
+ }
+
+#if CONFIG_NEW_QUANT
+ for (dq = 0; dq < QUANT_PROFILES; dq++) {
+ for (i = 0; i < COEF_BANDS; i++) {
+ const int y_quant = cpi->y_dequant[q][i != 0];
+ const int uvquant = cpi->uv_dequant[q][i != 0];
+ av1_get_dequant_val_nuq(y_quant, i, cpi->y_dequant_val_nuq[dq][q][i],
+ quants->y_cuml_bins_nuq[dq][q][i], dq);
+ av1_get_dequant_val_nuq(uvquant, i, cpi->uv_dequant_val_nuq[dq][q][i],
+ quants->uv_cuml_bins_nuq[dq][q][i], dq);
+ }
+ }
+#endif // CONFIG_NEW_QUANT
+
+ for (i = 2; i < 8; i++) { // 8: SIMD width
+ quants->y_quant[q][i] = quants->y_quant[q][1];
+ quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+ quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+ quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+ quants->y_zbin[q][i] = quants->y_zbin[q][1];
+ quants->y_round[q][i] = quants->y_round[q][1];
+ cpi->y_dequant[q][i] = cpi->y_dequant[q][1];
+
+ quants->uv_quant[q][i] = quants->uv_quant[q][1];
+ quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
+ quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
+ quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
+ quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
+ quants->uv_round[q][i] = quants->uv_round[q][1];
+ cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1];
+ }
+ }
+}
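+
+// Illustrative check of the fp tables built above: quant_fp = (1 << 16) /
+// quant, so assuming the usual reciprocal use (qcoeff ~= (tmp * quant_fp) >>
+// 16), quant = 36 gives quant_fp = 1820, and a rounded coefficient tmp = 400
+// yields (400 * 1820) >> 16 = 11, matching 400 / 36 = 11 in integer
+// arithmetic.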
+
+void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const QUANTS *const quants = &cpi->quants;
+
+#if CONFIG_DELTA_Q
+#if CONFIG_EXT_DELTA_Q
+ int current_q_index = AOMMAX(
+ 0, AOMMIN(QINDEX_RANGE - 1, cpi->oxcf.deltaq_mode != NO_DELTA_Q
+ ? cm->base_qindex + xd->delta_qindex
+ : cm->base_qindex));
+#else
+ int current_q_index = AOMMAX(
+ 0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_present_flag
+ ? cm->base_qindex + xd->delta_qindex
+ : cm->base_qindex));
+#endif
+ const int qindex = av1_get_qindex(&cm->seg, segment_id, current_q_index);
+#else
+ const int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+#endif
+ const int rdmult = av1_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
+ int i;
+#if CONFIG_AOM_QM
+ int minqm = cm->min_qmlevel;
+ int maxqm = cm->max_qmlevel;
+ // Quant matrix only depends on the base QP so there is only one set per frame
+ int qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0)
+ ? NUM_QM_LEVELS - 1
+ : aom_get_qmlevel(cm->base_qindex, minqm, maxqm);
+#endif
+#if CONFIG_NEW_QUANT
+ int dq;
+#endif
+
+ // Y
+ x->plane[0].quant = quants->y_quant[qindex];
+ x->plane[0].quant_fp = quants->y_quant_fp[qindex];
+ x->plane[0].round_fp = quants->y_round_fp[qindex];
+ x->plane[0].quant_shift = quants->y_quant_shift[qindex];
+ x->plane[0].zbin = quants->y_zbin[qindex];
+ x->plane[0].round = quants->y_round[qindex];
+#if CONFIG_AOM_QM
+ memcpy(&xd->plane[0].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][0],
+ sizeof(cm->gqmatrix[qmlevel][0]));
+ memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0],
+ sizeof(cm->giqmatrix[qmlevel][0]));
+#endif
+ xd->plane[0].dequant = cpi->y_dequant[qindex];
+#if CONFIG_NEW_QUANT
+ for (dq = 0; dq < QUANT_PROFILES; dq++) {
+ x->plane[0].cuml_bins_nuq[dq] = quants->y_cuml_bins_nuq[dq][qindex];
+ xd->plane[0].dequant_val_nuq[dq] = cpi->y_dequant_val_nuq[dq][qindex];
+ }
+#endif // CONFIG_NEW_QUANT
+
+ // UV
+ for (i = 1; i < 3; i++) {
+ x->plane[i].quant = quants->uv_quant[qindex];
+ x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
+ x->plane[i].round_fp = quants->uv_round_fp[qindex];
+ x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
+ x->plane[i].zbin = quants->uv_zbin[qindex];
+ x->plane[i].round = quants->uv_round[qindex];
+#if CONFIG_AOM_QM
+ memcpy(&xd->plane[i].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1],
+ sizeof(cm->gqmatrix[qmlevel][1]));
+ memcpy(&xd->plane[i].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1],
+ sizeof(cm->giqmatrix[qmlevel][1]));
+#endif
+ xd->plane[i].dequant = cpi->uv_dequant[qindex];
+#if CONFIG_NEW_QUANT
+ for (dq = 0; dq < QUANT_PROFILES; dq++) {
+ x->plane[i].cuml_bins_nuq[dq] = quants->uv_cuml_bins_nuq[dq][qindex];
+ xd->plane[i].dequant_val_nuq[dq] = cpi->uv_dequant_val_nuq[dq][qindex];
+ }
+#endif // CONFIG_NEW_QUANT
+ }
+
+ x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+ x->qindex = qindex;
+
+ set_error_per_bit(x, rdmult);
+
+ av1_initialize_me_consts(cpi, x, qindex);
+}
+
+void av1_frame_init_quantizer(AV1_COMP *cpi) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
+}
+
+void av1_set_quantizer(AV1_COMMON *cm, int q) {
+ // quantizer has to be reinitialized with av1_init_quantizer() if any
+ // delta_q changes.
+ cm->base_qindex = q;
+ cm->y_dc_delta_q = 0;
+ cm->uv_dc_delta_q = 0;
+ cm->uv_ac_delta_q = 0;
+}
+
+// Table that converts the 0-63 Q-range values passed in from outside to the
+// qindex range used internally.
+static const int quantizer_to_qindex[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48,
+ 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100,
+ 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152,
+ 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204,
+ 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255,
+};
+
+int av1_quantizer_to_qindex(int quantizer) {
+ return quantizer_to_qindex[quantizer];
+}
+
+int av1_qindex_to_quantizer(int qindex) {
+ int quantizer;
+
+ for (quantizer = 0; quantizer < 64; ++quantizer)
+ if (quantizer_to_qindex[quantizer] >= qindex) return quantizer;
+
+ return 63;
+}
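+
+// The table above is 4 * quantizer for entries 0..61, with the last two
+// entries bumped to 249 and 255 so that the full internal qindex range is
+// reachable. Since the table is strictly increasing, the round trip
+// av1_qindex_to_quantizer(av1_quantizer_to_qindex(q)) == q holds for every
+// q in [0, 63].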
diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h
new file mode 100644
index 000000000..c87b6b7dc
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_QUANTIZE_H_
+#define AV1_ENCODER_QUANTIZE_H_
+
+#include "./aom_config.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct QUANT_PARAM {
+ int log_scale;
+#if CONFIG_NEW_QUANT
+ TX_SIZE tx_size;
+ int dq;
+#endif // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+ const qm_val_t *qmatrix;
+ const qm_val_t *iqmatrix;
+#endif // CONFIG_AOM_QM
+} QUANT_PARAM;
+
+typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+typedef struct {
+#if CONFIG_NEW_QUANT
+ DECLARE_ALIGNED(
+ 16, tran_low_t,
+ y_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]);
+ DECLARE_ALIGNED(
+ 16, tran_low_t,
+ uv_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]);
+#endif // CONFIG_NEW_QUANT
+ // 0: dc 1: ac 2-7: ac repeated to SIMD width
+ DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+
+ // TODO(jingning): Re-working of the quantization is in progress; decide
+ // whether to deprecate the current use of y_quant.
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);
+
+ DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
+} QUANTS;
+
+struct AV1_COMP;
+struct AV1Common;
+
+void av1_frame_init_quantizer(struct AV1_COMP *cpi);
+
+void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id);
+
+void av1_init_quantizer(struct AV1_COMP *cpi);
+
+void av1_set_quantizer(struct AV1Common *cm, int q);
+
+int av1_quantizer_to_qindex(int quantizer);
+
+int av1_qindex_to_quantizer(int qindex);
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+#if CONFIG_NEW_QUANT
+void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+#endif // CONFIG_NEW_QUANT
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+#if CONFIG_NEW_QUANT
+void av1_highbd_quantize_fp_nuq_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_b_nuq_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_dc_nuq_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+#endif // CONFIG_NEW_QUANT
+#endif // CONFIG_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_QUANTIZE_H_
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
new file mode 100644
index 000000000..7cc6179ea
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -0,0 +1,5399 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/system_state.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_CDEF
+#include "av1/common/cdef.h"
+#include "av1/common/clpf.h"
+#endif // CONFIG_CDEF
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#if CONFIG_EXT_INTRA
+#include "av1/common/reconintra.h"
+#endif // CONFIG_EXT_INTRA
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#if CONFIG_ANS
+#include "aom_dsp/buf_ans.h"
+#endif // CONFIG_ANS
+#if CONFIG_LV_MAP
+#include "av1/encoder/encodetxb.h"
+#endif // CONFIG_LV_MAP
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/mcomp.h"
+#if CONFIG_PALETTE && CONFIG_PALETTE_DELTA_ENCODING
+#include "av1/encoder/palette.h"
+#endif // CONFIG_PALETTE && CONFIG_PALETTE_DELTA_ENCODING
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/subexp.h"
+#include "av1/encoder/tokenize.h"
+#if CONFIG_PVQ
+#include "av1/encoder/pvq_encoder.h"
+#endif
+
+static struct av1_token intra_mode_encodings[INTRA_MODES];
+static struct av1_token switchable_interp_encodings[SWITCHABLE_FILTERS];
+#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EC_MULTISYMBOL
+static const struct av1_token ext_partition_encodings[EXT_PARTITION_TYPES] = {
+ { 0, 1 }, { 4, 3 }, { 12, 4 }, { 7, 3 },
+ { 10, 4 }, { 11, 4 }, { 26, 5 }, { 27, 5 }
+};
+#endif
+static struct av1_token partition_encodings[PARTITION_TYPES];
+#if !CONFIG_REF_MV
+static struct av1_token inter_mode_encodings[INTER_MODES];
+#endif
+#if CONFIG_EXT_INTER
+static const struct av1_token
+ inter_compound_mode_encodings[INTER_COMPOUND_MODES] = {
+ { 2, 2 }, { 50, 6 }, { 51, 6 }, { 24, 5 }, { 52, 6 },
+ { 53, 6 }, { 54, 6 }, { 55, 6 }, { 0, 1 }, { 7, 3 }
+ };
+#endif // CONFIG_EXT_INTER
+#if CONFIG_PALETTE
+static struct av1_token palette_size_encodings[PALETTE_SIZES];
+static struct av1_token palette_color_index_encodings[PALETTE_SIZES]
+ [PALETTE_COLORS];
+#endif // CONFIG_PALETTE
+#if !CONFIG_EC_MULTISYMBOL
+static const struct av1_token tx_size_encodings[MAX_TX_DEPTH][TX_SIZES] = {
+ { { 0, 1 }, { 1, 1 } }, // Max tx_size is 8X8
+ { { 0, 1 }, { 2, 2 }, { 3, 2 } }, // Max tx_size is 16X16
+ { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 } }, // Max tx_size is 32X32
+#if CONFIG_TX64X64
+ { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 14, 4 }, { 15, 4 } }, // Max tx_size 64X64
+#endif // CONFIG_TX64X64
+};
+#endif
+
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+static INLINE void write_uniform(aom_writer *w, int n, int v) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (l == 0) return;
+ if (v < m) {
+ aom_write_literal(w, v, l - 1);
+ } else {
+ aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+ aom_write_literal(w, (v - m) & 1, 1);
+ }
+}
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
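+
+// write_uniform() above is a truncated binary code over n symbols, with
+// l = get_unsigned_bits(n) and m = (1 << l) - n. Worked example for n = 5
+// (l = 3, m = 3): v = 0, 1, 2 cost two bits (00, 01, 10), while v = 3 and
+// v = 4 cost three bits (110 and 111).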
+
+#if CONFIG_EXT_TX
+static struct av1_token ext_tx_inter_encodings[EXT_TX_SETS_INTER][TX_TYPES];
+static struct av1_token ext_tx_intra_encodings[EXT_TX_SETS_INTRA][TX_TYPES];
+#else
+static struct av1_token ext_tx_encodings[TX_TYPES];
+#endif // CONFIG_EXT_TX
+#if CONFIG_GLOBAL_MOTION
+static struct av1_token global_motion_types_encodings[GLOBAL_TRANS_TYPES];
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+static struct av1_token intra_filter_encodings[INTRA_FILTERS];
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_EXT_INTER
+static struct av1_token interintra_mode_encodings[INTERINTRA_MODES];
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+static struct av1_token compound_type_encodings[COMPOUND_TYPES];
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+static struct av1_token motion_mode_encodings[MOTION_MODES];
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_LOOP_RESTORATION
+static struct av1_token switchable_restore_encodings[RESTORE_SWITCHABLE_TYPES];
+#endif // CONFIG_LOOP_RESTORATION
+static void write_uncompressed_header(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb);
+static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data);
+static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
+ const uint32_t data_size, const uint32_t max_tile_size,
+ const uint32_t max_tile_col_size,
+ int *const tile_size_bytes,
+ int *const tile_col_size_bytes);
+
+void av1_encode_token_init(void) {
+#if CONFIG_EXT_TX || CONFIG_PALETTE
+ int s;
+#endif // CONFIG_EXT_TX || CONFIG_PALETTE
+#if CONFIG_EXT_TX
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ av1_tokens_from_tree(ext_tx_inter_encodings[s], av1_ext_tx_inter_tree[s]);
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ av1_tokens_from_tree(ext_tx_intra_encodings[s], av1_ext_tx_intra_tree[s]);
+ }
+#else
+ av1_tokens_from_tree(ext_tx_encodings, av1_ext_tx_tree);
+#endif // CONFIG_EXT_TX
+ av1_tokens_from_tree(intra_mode_encodings, av1_intra_mode_tree);
+ av1_tokens_from_tree(switchable_interp_encodings, av1_switchable_interp_tree);
+ av1_tokens_from_tree(partition_encodings, av1_partition_tree);
+#if !CONFIG_REF_MV
+ av1_tokens_from_tree(inter_mode_encodings, av1_inter_mode_tree);
+#endif
+
+#if CONFIG_PALETTE
+ av1_tokens_from_tree(palette_size_encodings, av1_palette_size_tree);
+ for (s = 0; s < PALETTE_SIZES; ++s) {
+ av1_tokens_from_tree(palette_color_index_encodings[s],
+ av1_palette_color_index_tree[s]);
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ av1_tokens_from_tree(intra_filter_encodings, av1_intra_filter_tree);
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+#if CONFIG_EXT_INTER
+ av1_tokens_from_tree(interintra_mode_encodings, av1_interintra_mode_tree);
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+ av1_tokens_from_tree(compound_type_encodings, av1_compound_type_tree);
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ av1_tokens_from_tree(motion_mode_encodings, av1_motion_mode_tree);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_GLOBAL_MOTION
+ av1_tokens_from_tree(global_motion_types_encodings,
+ av1_global_motion_types_tree);
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_LOOP_RESTORATION
+ av1_tokens_from_tree(switchable_restore_encodings,
+ av1_switchable_restore_tree);
+#endif // CONFIG_LOOP_RESTORATION
+
+#if CONFIG_EC_MULTISYMBOL
+ /* This hack is necessary when CONFIG_DUAL_FILTER is enabled because an
+ in-order traversal of the av1_switchable_interp_tree structure does not
+ visit the five SWITCHABLE_FILTERS in consecutive index order (0-4). */
+ av1_indices_from_tree(av1_switchable_interp_ind, av1_switchable_interp_inv,
+ av1_switchable_interp_tree);
+/* This hack is necessary because an in-order traversal of the av1_ext_tx_tree
+ structure does not visit the four TX_TYPES in consecutive index order
+ (0-3). */
+#if CONFIG_EXT_TX
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s)
+ av1_indices_from_tree(av1_ext_tx_intra_ind[s], av1_ext_tx_intra_inv[s],
+ av1_ext_tx_intra_tree[s]);
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s)
+ av1_indices_from_tree(av1_ext_tx_inter_ind[s], av1_ext_tx_inter_inv[s],
+ av1_ext_tx_inter_tree[s]);
+#else
+ av1_indices_from_tree(av1_ext_tx_ind, av1_ext_tx_inv, av1_ext_tx_tree);
+#endif
+ av1_indices_from_tree(av1_intra_mode_ind, av1_intra_mode_inv,
+ av1_intra_mode_tree);
+ av1_indices_from_tree(av1_inter_mode_ind, av1_inter_mode_inv,
+ av1_inter_mode_tree);
+#endif
+}
+
+static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx,
+ const MODE_INFO *mi, const MODE_INFO *above_mi,
+ const MODE_INFO *left_mi, int block,
+ PREDICTION_MODE mode, aom_writer *w) {
+#if CONFIG_INTRABC
+ assert(!is_intrabc_block(&mi->mbmi));
+#endif // CONFIG_INTRABC
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_intra_mode_ind[mode],
+ get_y_mode_cdf(frame_ctx, mi, above_mi, left_mi, block),
+ INTRA_MODES);
+ (void)cm;
+#else
+ av1_write_token(w, av1_intra_mode_tree,
+ get_y_mode_probs(cm, mi, above_mi, left_mi, block),
+ &intra_mode_encodings[mode]);
+ (void)frame_ctx;
+#endif
+}
+
+#if CONFIG_EXT_INTER
+static void write_interintra_mode(aom_writer *w, INTERINTRA_MODE mode,
+ const aom_prob *probs) {
+ av1_write_token(w, av1_interintra_mode_tree, probs,
+ &interintra_mode_encodings[mode]);
+}
+#endif // CONFIG_EXT_INTER
+
+static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
+ FRAME_CONTEXT *ec_ctx, const int16_t mode_ctx) {
+#if CONFIG_REF_MV
+ const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+ const aom_prob newmv_prob = ec_ctx->newmv_prob[newmv_ctx];
+
+#define IS_NEWMV_MODE(mode) ((mode) == NEWMV)
+ aom_write(w, !IS_NEWMV_MODE(mode), newmv_prob);
+
+ if (!IS_NEWMV_MODE(mode)) {
+ const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+ const aom_prob zeromv_prob = ec_ctx->zeromv_prob[zeromv_ctx];
+
+ if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
+ assert(mode == ZEROMV);
+ return;
+ }
+
+ aom_write(w, mode != ZEROMV, zeromv_prob);
+
+ if (mode != ZEROMV) {
+ int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ aom_prob refmv_prob;
+
+ if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6;
+ if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7;
+ if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8;
+
+ refmv_prob = ec_ctx->refmv_prob[refmv_ctx];
+ aom_write(w, mode != NEARESTMV, refmv_prob);
+ }
+ }
+
+#undef IS_NEWMV_MODE
+
+#else // !CONFIG_REF_MV
+ assert(is_inter_mode(mode));
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_inter_mode_ind[INTER_OFFSET(mode)],
+ ec_ctx->inter_mode_cdf[mode_ctx], INTER_MODES);
+#else
+ {
+ const aom_prob *const inter_probs = ec_ctx->inter_mode_probs[mode_ctx];
+ av1_write_token(w, av1_inter_mode_tree, inter_probs,
+ &inter_mode_encodings[INTER_OFFSET(mode)]);
+ }
+#endif
+#endif
+}
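+
+// Under CONFIG_REF_MV the inter mode is a cascade of binary decisions:
+// NEWMV codes as a single 0 bit; otherwise a 1 is followed by a ZEROMV flag
+// (omitted when the ALL_ZERO context flag already forces ZEROMV) and, for
+// the remaining modes, a final flag separating NEARESTMV (0) from NEARMV (1).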
+
+#if CONFIG_REF_MV
+static void write_drl_idx(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
+ const MB_MODE_INFO_EXT *mbmi_ext, aom_writer *w) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+
+ assert(mbmi->ref_mv_idx < 3);
+
+#if CONFIG_EXT_INTER
+ if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+#else
+ if (mbmi->mode == NEWMV) {
+#endif
+ int idx;
+ for (idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ aom_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+
+ aom_write(w, mbmi->ref_mv_idx != idx, drl_prob);
+ if (mbmi->ref_mv_idx == idx) return;
+ }
+ }
+ return;
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ int idx;
+ // TODO(jingning): Temporary solution to compensate for the NEARESTMV offset.
+ for (idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ aom_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+
+ aom_write(w, mbmi->ref_mv_idx != (idx - 1), drl_prob);
+ if (mbmi->ref_mv_idx == (idx - 1)) return;
+ }
+ }
+ return;
+ }
+}
+#endif
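+
+// write_drl_idx() above codes the dynamic-reference-list index as a short
+// unary sequence: one "not this slot" flag per candidate, stopping once the
+// chosen slot is reached or ref_mv_count runs out of candidates. For near-mv
+// modes the loop starts at 1 because slot 0 is taken by NEARESTMV itself.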
+
+#if CONFIG_EXT_INTER
+static void write_inter_compound_mode(AV1_COMMON *cm, aom_writer *w,
+ PREDICTION_MODE mode,
+ const int16_t mode_ctx) {
+ const aom_prob *const inter_compound_probs =
+ cm->fc->inter_compound_mode_probs[mode_ctx];
+
+ assert(is_inter_compound_mode(mode));
+ av1_write_token(w, av1_inter_compound_mode_tree, inter_compound_probs,
+ &inter_compound_mode_encodings[INTER_COMPOUND_OFFSET(mode)]);
+}
+#endif // CONFIG_EXT_INTER
+
+static void encode_unsigned_max(struct aom_write_bit_buffer *wb, int data,
+ int max) {
+ aom_wb_write_literal(wb, data, get_unsigned_bits(max));
+}
+
+#if !CONFIG_EC_ADAPT || \
+ (CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION || CONFIG_EXT_INTER)
+static void prob_diff_update(const aom_tree_index *tree,
+ aom_prob probs[/*n - 1*/],
+ const unsigned int counts[/*n - 1*/], int n,
+ int probwt, aom_writer *w) {
+ int i;
+ unsigned int branch_ct[32][2];
+
+ // Assuming max number of probabilities <= 32
+ assert(n <= 32);
+
+ av1_tree_probs_from_distribution(tree, branch_ct, counts);
+ for (i = 0; i < n - 1; ++i)
+ av1_cond_prob_diff_update(w, &probs[i], branch_ct[i], probwt);
+}
+#endif
+
+#if CONFIG_EXT_INTER || !CONFIG_EC_ADAPT
+static int prob_diff_update_savings(const aom_tree_index *tree,
+ aom_prob probs[/*n - 1*/],
+ const unsigned int counts[/*n - 1*/], int n,
+ int probwt) {
+ int i;
+ unsigned int branch_ct[32][2];
+ int savings = 0;
+
+ // Assuming max number of probabilities <= 32
+ assert(n <= 32);
+ av1_tree_probs_from_distribution(tree, branch_ct, counts);
+ for (i = 0; i < n - 1; ++i) {
+ savings +=
+ av1_cond_prob_diff_update_savings(&probs[i], branch_ct[i], probwt);
+ }
+ return savings;
+}
+#endif // CONFIG_EXT_INTER || !CONFIG_EC_ADAPT
+
+#if CONFIG_VAR_TX
+static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi, TX_SIZE tx_size,
+ int depth, int blk_row, int blk_col,
+ aom_writer *w) {
+ const int tx_row = blk_row >> 1;
+ const int tx_col = blk_col >> 1;
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+
+ int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row,
+ mbmi->sb_type, tx_size);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (depth == MAX_VARTX_DEPTH) {
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size, tx_size);
+ return;
+ }
+
+ if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) {
+ aom_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size, tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ aom_write(w, 1, cm->fc->txfm_partition_prob[ctx]);
+
+ if (tx_size == TX_8X8) {
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, sub_txs, tx_size);
+ return;
+ }
+
+ assert(bsl > 0);
+ for (i = 0; i < 4; ++i) {
+ int offsetr = blk_row + (i >> 1) * bsl;
+ int offsetc = blk_col + (i & 0x01) * bsl;
+ write_tx_size_vartx(cm, xd, mbmi, sub_txs, depth + 1, offsetr, offsetc,
+ w);
+ }
+ }
+}
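+
+// write_tx_size_vartx() walks the transform-size quadtree: each visited node
+// writes one bit against txfm_partition_prob[ctx] (0 keeps the node's
+// tx_size, 1 splits it into four sub_txs quadrants), and the recursion stops
+// at MAX_VARTX_DEPTH or once a TX_8X8 node splits down to TX_4X4.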
+
+static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w,
+ FRAME_COUNTS *counts, int probwt) {
+ int k;
+ for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
+ av1_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[k],
+ counts->txfm_partition[k], probwt);
+}
+#endif
+
+static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+// For sub8x8 blocks the tx_size symbol does not need to be sent
+#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
+ if (bsize > BLOCK_4X4) {
+#else
+ if (bsize >= BLOCK_8X8) {
+#endif
+ const TX_SIZE tx_size = mbmi->tx_size;
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+ : intra_tx_size_cat_lookup[bsize];
+ const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
+ assert(
+ IMPLIES(is_rect_tx(tx_size), tx_size == max_txsize_rect_lookup[bsize]));
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+ tx_size_cat + 2);
+#else
+ av1_write_token(w, av1_tx_size_tree[tx_size_cat],
+ ec_ctx->tx_size_probs[tx_size_cat][tx_size_ctx],
+ &tx_size_encodings[tx_size_cat][depth]);
+#endif
+ }
+}
+
+#if CONFIG_REF_MV
+static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w,
+ FRAME_COUNTS *counts) {
+ int i;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+ for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
+ av1_cond_prob_diff_update(w, &cm->fc->newmv_prob[i], counts->newmv_mode[i],
+ probwt);
+ for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
+ av1_cond_prob_diff_update(w, &cm->fc->zeromv_prob[i],
+ counts->zeromv_mode[i], probwt);
+ for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
+ av1_cond_prob_diff_update(w, &cm->fc->refmv_prob[i], counts->refmv_mode[i],
+ probwt);
+ for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
+ av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i],
+ probwt);
+}
+#endif
+
+#if CONFIG_EXT_INTER
+static void update_inter_compound_mode_probs(AV1_COMMON *cm, int probwt,
+ aom_writer *w) {
+ const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
+ av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
+ int i;
+ int savings = 0;
+ int do_update = 0;
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+ savings += prob_diff_update_savings(
+ av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i],
+ cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+ prob_diff_update(
+ av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i],
+ cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt, w);
+ }
+ }
+}
+#endif // CONFIG_EXT_INTER
+
+static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int segment_id, const MODE_INFO *mi, aom_writer *w) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int skip = mi->mbmi.skip;
+ aom_write(w, skip, av1_get_skip_prob(cm, xd));
+ return skip;
+ }
+}
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+static void write_motion_mode(const AV1_COMMON *cm, const MODE_INFO *mi,
+ aom_writer *w) {
+ const MB_MODE_INFO *mbmi = &mi->mbmi;
+ MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ 0, cm->global_motion,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ mi);
+
+ if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return;
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ if (last_motion_mode_allowed == OBMC_CAUSAL) {
+ aom_write(w, mbmi->motion_mode == OBMC_CAUSAL,
+ cm->fc->obmc_prob[mbmi->sb_type]);
+ } else {
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ av1_write_token(w, av1_motion_mode_tree,
+ cm->fc->motion_mode_prob[mbmi->sb_type],
+ &motion_mode_encodings[mbmi->motion_mode]);
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ }
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+}
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_DELTA_Q
+static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int delta_qindex, aom_writer *w) {
+ int sign = delta_qindex < 0;
+ int abs = sign ? -delta_qindex : delta_qindex;
+ int rem_bits, thr;
+ int smallval = abs < DELTA_Q_SMALL ? 1 : 0;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+ (void)xd;
+#endif
+
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf,
+ DELTA_Q_PROBS + 1);
+#else
+ int i = 0;
+ while (i < DELTA_Q_SMALL && i <= abs) {
+ int bit = (i < abs);
+ aom_write(w, bit, ec_ctx->delta_q_prob[i]);
+ i++;
+ }
+#endif
+
+ if (!smallval) {
+ rem_bits = OD_ILOG_NZ(abs - 1) - 1;
+ thr = (1 << rem_bits) + 1;
+ aom_write_literal(w, rem_bits, 3);
+ aom_write_literal(w, abs - thr, rem_bits);
+ }
+ if (abs > 0) {
+ aom_write_bit(w, sign);
+ }
+}
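+
+// For |delta_qindex| >= DELTA_Q_SMALL the tail above sends a 3-bit length
+// field followed by that many literal bits. Illustrative trace for abs = 10:
+// rem_bits = OD_ILOG_NZ(9) - 1 = 3 and thr = 9, so 3 is written as the
+// length, abs - thr = 1 as a 3-bit literal, then the sign bit; a decoder
+// rebuilds abs = (1 << rem_bits) + 1 + literal.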
+
+#if !CONFIG_EC_ADAPT
+static void update_delta_q_probs(AV1_COMMON *cm, aom_writer *w,
+ FRAME_COUNTS *counts) {
+ int k;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+#if CONFIG_EXT_DELTA_Q
+ if (!cm->delta_q_present_flag) return;
+#endif // CONFIG_EXT_DELTA_Q
+ for (k = 0; k < DELTA_Q_PROBS; ++k) {
+ av1_cond_prob_diff_update(w, &cm->fc->delta_q_prob[k], counts->delta_q[k],
+ probwt);
+ }
+}
+#endif // !CONFIG_EC_ADAPT
+
+#if CONFIG_EXT_DELTA_Q
+static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int delta_lflevel, aom_writer *w) {
+ int sign = delta_lflevel < 0;
+ int abs = sign ? -delta_lflevel : delta_lflevel;
+ int rem_bits, thr;
+ int smallval = abs < DELTA_LF_SMALL ? 1 : 0;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+ (void)xd;
+#endif
+
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf,
+ DELTA_LF_PROBS + 1);
+#else
+ int i = 0;
+ while (i < DELTA_LF_SMALL && i <= abs) {
+ int bit = (i < abs);
+ aom_write(w, bit, ec_ctx->delta_lf_prob[i]);
+ i++;
+ }
+#endif // CONFIG_EC_MULTISYMBOL
+
+ if (!smallval) {
+ rem_bits = OD_ILOG_NZ(abs - 1) - 1;
+ thr = (1 << rem_bits) + 1;
+ aom_write_literal(w, rem_bits, 3);
+ aom_write_literal(w, abs - thr, rem_bits);
+ }
+ if (abs > 0) {
+ aom_write_bit(w, sign);
+ }
+}
+
+#if !CONFIG_EC_ADAPT
+static void update_delta_lf_probs(AV1_COMMON *cm, aom_writer *w,
+ FRAME_COUNTS *counts) {
+ int k;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+ if (!cm->delta_lf_present_flag) return;
+ for (k = 0; k < DELTA_LF_PROBS; ++k) {
+ av1_cond_prob_diff_update(w, &cm->fc->delta_lf_prob[k], counts->delta_lf[k],
+ probwt);
+ }
+}
+#endif // !CONFIG_EC_ADAPT
+#endif // CONFIG_EXT_DELTA_Q
+#endif // CONFIG_DELTA_Q
+
+static void update_skip_probs(AV1_COMMON *cm, aom_writer *w,
+ FRAME_COUNTS *counts) {
+ int k;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+ for (k = 0; k < SKIP_CONTEXTS; ++k) {
+ av1_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k],
+ probwt);
+ }
+}
+
+#if !CONFIG_EC_ADAPT
+static void update_switchable_interp_probs(AV1_COMMON *cm, aom_writer *w,
+ FRAME_COUNTS *counts) {
+ int j;
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+ prob_diff_update(
+ av1_switchable_interp_tree, cm->fc->switchable_interp_prob[j],
+ counts->switchable_interp[j], SWITCHABLE_FILTERS, probwt, w);
+ }
+}
+#endif
+
+#if !CONFIG_EC_ADAPT
+#if CONFIG_EXT_TX
+static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) {
+ const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
+ av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
+ int i, j;
+ int s;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ int savings = 0;
+ int do_update = 0;
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+ savings += prob_diff_update_savings(
+ av1_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][i],
+ cm->counts.inter_ext_tx[s][i],
+ num_ext_tx_set[ext_tx_set_type_inter[s]], probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+ prob_diff_update(av1_ext_tx_inter_tree[s],
+ cm->fc->inter_ext_tx_prob[s][i],
+ cm->counts.inter_ext_tx[s][i],
+ num_ext_tx_set[ext_tx_set_type_inter[s]], probwt, w);
+ }
+ }
+ }
+
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ int savings = 0;
+ int do_update = 0;
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+ for (j = 0; j < INTRA_MODES; ++j)
+ savings += prob_diff_update_savings(
+ av1_ext_tx_intra_tree[s], cm->fc->intra_ext_tx_prob[s][i][j],
+ cm->counts.intra_ext_tx[s][i][j],
+ num_ext_tx_set[ext_tx_set_type_intra[s]], probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+ for (j = 0; j < INTRA_MODES; ++j)
+ prob_diff_update(av1_ext_tx_intra_tree[s],
+ cm->fc->intra_ext_tx_prob[s][i][j],
+ cm->counts.intra_ext_tx[s][i][j],
+ num_ext_tx_set[ext_tx_set_type_intra[s]], probwt, w);
+ }
+ }
+ }
+}
+
+#else
+static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) {
+ const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
+ av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
+ int i, j;
+
+ int savings = 0;
+ int do_update = 0;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ for (j = 0; j < TX_TYPES; ++j)
+ savings += prob_diff_update_savings(
+ av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j],
+ cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ for (j = 0; j < TX_TYPES; ++j) {
+ prob_diff_update(av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j],
+ cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt, w);
+ }
+ }
+ }
+
+ savings = 0;
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ savings +=
+ prob_diff_update_savings(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i],
+ cm->counts.inter_ext_tx[i], TX_TYPES, probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ prob_diff_update(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i],
+ cm->counts.inter_ext_tx[i], TX_TYPES, probwt, w);
+ }
+ }
+}
+#endif // CONFIG_EXT_TX
+#endif // !CONFIG_EC_ADAPT
+#if CONFIG_PALETTE
+static void pack_palette_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,
+ int num) {
+ int i;
+ const TOKENEXTRA *p = *tp;
+
+ for (i = 0; i < num; ++i) {
+ av1_write_token(
+ w, av1_palette_color_index_tree[n - PALETTE_MIN_SIZE], p->context_tree,
+ &palette_color_index_encodings[n - PALETTE_MIN_SIZE][p->token]);
+ ++p;
+ }
+
+ *tp = p;
+}
+#endif // CONFIG_PALETTE
+
+#if !CONFIG_PVQ
+#if CONFIG_SUPERTX
+static void update_supertx_probs(AV1_COMMON *cm, int probwt, aom_writer *w) {
+ const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
+ av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
+ int i, j;
+ int savings = 0;
+ int do_update = 0;
+ for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+ for (j = TX_8X8; j < TX_SIZES; ++j) {
+ savings += av1_cond_prob_diff_update_savings(
+ &cm->fc->supertx_prob[i][j], cm->counts.supertx[i][j], probwt);
+ }
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+ for (j = TX_8X8; j < TX_SIZES; ++j) {
+ av1_cond_prob_diff_update(w, &cm->fc->supertx_prob[i][j],
+ cm->counts.supertx[i][j], probwt);
+ }
+ }
+ }
+}
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_NEW_MULTISYMBOL
+static INLINE void write_coeff_extra(const aom_cdf_prob *const *cdf, int val,
+ int n, aom_writer *w) {
+ // Code the extra bits from LSB to MSB in groups of 4
+ int i = 0;
+ int count = 0;
+ while (count < n) {
+ const int size = AOMMIN(n - count, 4);
+ const int mask = (1 << size) - 1;
+ aom_write_cdf(w, val & mask, cdf[i++], 1 << size);
+ val >>= size;
+ count += size;
+ }
+}
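+
+// Illustrative trace: with n = 10 extra bits, the loop above writes three
+// CDF-coded symbols of 4, 4, and 2 bits against cdf[0], cdf[1], and cdf[2],
+// consuming val from its least significant end.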
+#else
+static INLINE void write_coeff_extra(const aom_prob *pb, int value,
+ int num_bits, int skip_bits, aom_writer *w,
+ TOKEN_STATS *token_stats) {
+ // Code the extra bits from MSB to LSB 1 bit at a time
+ int index;
+ for (index = skip_bits; index < num_bits; ++index) {
+ const int shift = num_bits - index - 1;
+ const int bb = (value >> shift) & 1;
+ aom_write_record(w, bb, pb[index], token_stats);
+ }
+}
+#endif
+
+#if CONFIG_NEW_TOKENSET && !CONFIG_LV_MAP
+static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
+ const TOKENEXTRA *const stop,
+ aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
+ TOKEN_STATS *token_stats) {
+ const TOKENEXTRA *p = *tp;
+#if CONFIG_VAR_TX
+ int count = 0;
+ const int seg_eob = tx_size_2d[tx_size];
+#endif
+
+ while (p < stop && p->token != EOSB_TOKEN) {
+ const int token = p->token;
+ if (token == BLOCK_Z_TOKEN) {
+ aom_write_symbol(w, 0, *p->head_cdf, HEAD_TOKENS + 1);
+ p++;
+ continue;
+ }
+
+ const av1_extra_bit *const extra_bits = &av1_extra_bits[token];
+ if (p->eob_val == LAST_EOB) {
+ // Just code a flag indicating whether the value is 1 or greater than 1.
+ aom_write_bit(w, token != ONE_TOKEN);
+ } else {
+ int comb_symb = 2 * AOMMIN(token, TWO_TOKEN) - p->eob_val + p->first_val;
+ aom_write_symbol(w, comb_symb, *p->head_cdf, HEAD_TOKENS + p->first_val);
+ }
+ if (token > ONE_TOKEN) {
+ aom_write_symbol(w, token - TWO_TOKEN, *p->tail_cdf, TAIL_TOKENS);
+ }
+
+ if (extra_bits->base_val) {
+ const int bit_string = p->extra;
+ // Length of extra bits to be written excluding the sign bit.
+ const int bit_string_length = extra_bits->len;
+ const int is_cat6 = (extra_bits->base_val == CAT6_MIN_VAL);
+ int skip_bits = is_cat6
+ ? (int)sizeof(av1_cat6_prob) -
+ av1_get_cat6_extrabits_size(tx_size, bit_depth)
+ : 0;
+
+ assert(!(bit_string >> (bit_string_length - skip_bits + 1)));
+ if (bit_string_length > 0)
+#if CONFIG_NEW_MULTISYMBOL
+ write_coeff_extra(extra_bits->cdf, bit_string >> 1,
+ bit_string_length - skip_bits, w);
+#else
+ write_coeff_extra(extra_bits->prob, bit_string >> 1, bit_string_length,
+ skip_bits, w, token_stats);
+#endif
+
+ aom_write_bit_record(w, bit_string & 1, token_stats);
+ }
+ ++p;
+
+#if CONFIG_VAR_TX
+ ++count;
+ if (token == EOB_TOKEN || count == seg_eob) break;
+#endif
+ }
+
+ *tp = p;
+}
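+
+// In this NEW_TOKENSET packer each coefficient is split into a "head" symbol
+// (folding together the EOB flag, first-coefficient flag, and the
+// zero/one/more decision) and, for magnitudes above ONE_TOKEN, a "tail"
+// symbol plus the category extra bits and sign written above.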
+#else // CONFIG_NEW_TOKENSET
+#if !CONFIG_LV_MAP
+static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
+ const TOKENEXTRA *const stop,
+ aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
+ TOKEN_STATS *token_stats) {
+ const TOKENEXTRA *p = *tp;
+#if CONFIG_VAR_TX
+ int count = 0;
+ const int seg_eob = tx_size_2d[tx_size];
+#endif
+
+ while (p < stop && p->token != EOSB_TOKEN) {
+ const int token = p->token;
+#if !CONFIG_EC_MULTISYMBOL
+ const struct av1_token *const coef_encoding = &av1_coef_encodings[token];
+ int coef_value = coef_encoding->value;
+ int coef_length = coef_encoding->len;
+#endif // !CONFIG_EC_MULTISYMBOL
+ const av1_extra_bit *const extra_bits = &av1_extra_bits[token];
+
+#if CONFIG_EC_MULTISYMBOL
+ /* skip one or two nodes */
+ if (!p->skip_eob_node)
+ aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats);
+ if (token != EOB_TOKEN) {
+ aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats);
+ if (token != ZERO_TOKEN) {
+ aom_write_symbol(w, token - ONE_TOKEN, *p->token_cdf,
+ CATEGORY6_TOKEN - ONE_TOKEN + 1);
+ }
+ }
+#else
+ /* skip one or two nodes */
+ if (p->skip_eob_node)
+ coef_length -= p->skip_eob_node;
+ else
+ aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats);
+
+ if (token != EOB_TOKEN) {
+ aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats);
+
+ if (token != ZERO_TOKEN) {
+ aom_write_record(w, token != ONE_TOKEN, p->context_tree[2],
+ token_stats);
+
+ if (token != ONE_TOKEN) {
+ const int unconstrained_len = UNCONSTRAINED_NODES - p->skip_eob_node;
+ aom_write_tree_record(
+ w, av1_coef_con_tree,
+ av1_pareto8_full[p->context_tree[PIVOT_NODE] - 1], coef_value,
+ coef_length - unconstrained_len, 0, token_stats);
+ }
+ }
+ }
+#endif // CONFIG_EC_MULTISYMBOL
+
+ if (extra_bits->base_val) {
+ const int bit_string = p->extra;
+ const int bit_string_length = extra_bits->len; // Length of extra bits to
+ // be written excluding
+ // the sign bit.
+ int skip_bits = (extra_bits->base_val == CAT6_MIN_VAL)
+ ? (int)sizeof(av1_cat6_prob) -
+ av1_get_cat6_extrabits_size(tx_size, bit_depth)
+ : 0;
+
+ assert(!(bit_string >> (bit_string_length - skip_bits + 1)));
+ if (bit_string_length > 0) {
+#if CONFIG_NEW_MULTISYMBOL
+ skip_bits &= ~3;
+ write_coeff_extra(extra_bits->cdf, bit_string >> 1,
+ bit_string_length - skip_bits, w);
+#else
+ write_coeff_extra(extra_bits->prob, bit_string >> 1, bit_string_length,
+ skip_bits, w, token_stats);
+#endif
+ }
+ aom_write_bit_record(w, bit_string & 1, token_stats);
+ }
+ ++p;
+
+#if CONFIG_VAR_TX
+ ++count;
+ if (token == EOB_TOKEN || count == seg_eob) break;
+#endif
+ }
+
+ *tp = p;
+}
+#endif // !CONFIG_LV_MAP
+#endif // CONFIG_NEW_TOKENSET
+#else // !CONFIG_PVQ
+static PVQ_INFO *get_pvq_block(PVQ_QUEUE *pvq_q) {
+ PVQ_INFO *pvq;
+
+ assert(pvq_q->curr_pos <= pvq_q->last_pos);
+ assert(pvq_q->curr_pos < pvq_q->buf_len);
+
+ pvq = pvq_q->buf + pvq_q->curr_pos;
+ ++pvq_q->curr_pos;
+
+ return pvq;
+}
+
+static void pack_pvq_tokens(aom_writer *w, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, int plane, BLOCK_SIZE bsize,
+ const TX_SIZE tx_size) {
+ PVQ_INFO *pvq;
+ int idx, idy;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ od_adapt_ctx *adapt;
+ int max_blocks_wide;
+ int max_blocks_high;
+ int step = (1 << tx_size);
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
+
+ adapt = x->daala_enc.state.adapt;
+
+ max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+ for (idy = 0; idy < max_blocks_high; idy += step) {
+ for (idx = 0; idx < max_blocks_wide; idx += step) {
+ const int is_keyframe = 0;
+ const int encode_flip = 0;
+ const int flip = 0;
+ int i;
+ const int has_dc_skip = 1;
+ int *exg = &adapt->pvq.pvq_exg[plane][tx_size][0];
+ int *ext = adapt->pvq.pvq_ext + tx_size * PVQ_MAX_PARTITIONS;
+ generic_encoder *model = adapt->pvq.pvq_param_model;
+
+ pvq = get_pvq_block(x->pvq_q);
+
+ // encode block skip info
+ aom_write_symbol(w, pvq->ac_dc_coded,
+ adapt->skip_cdf[2 * tx_size + (plane != 0)], 4);
+
+ // AC coeffs coded?
+ if (pvq->ac_dc_coded & AC_CODED) {
+ assert(pvq->bs == tx_size);
+ for (i = 0; i < pvq->nb_bands; i++) {
+ if (i == 0 ||
+ (!pvq->skip_rest && !(pvq->skip_dir & (1 << ((i - 1) % 3))))) {
+ pvq_encode_partition(
+ w, pvq->qg[i], pvq->theta[i], pvq->y + pvq->off[i],
+ pvq->size[i], pvq->k[i], model, adapt, exg + i, ext + i,
+ (plane != 0) * OD_TXSIZES * PVQ_MAX_PARTITIONS +
+ pvq->bs * PVQ_MAX_PARTITIONS + i,
+ is_keyframe, i == 0 && (i < pvq->nb_bands - 1), pvq->skip_rest,
+ encode_flip, flip);
+ }
+ if (i == 0 && !pvq->skip_rest && pvq->bs > 0) {
+ aom_write_symbol(
+ w, pvq->skip_dir,
+ &adapt->pvq
+ .pvq_skip_dir_cdf[(plane != 0) + 2 * (pvq->bs - 1)][0],
+ 7);
+ }
+ }
+ }
+      // Encode residue of DC coeff, if it exists.
+ if (!has_dc_skip || (pvq->ac_dc_coded & DC_CODED)) {
+ generic_encode(w, &adapt->model_dc[plane],
+ abs(pvq->dq_dc_residue) - has_dc_skip,
+ &adapt->ex_dc[plane][pvq->bs][0], 2);
+ }
+ if ((pvq->ac_dc_coded & DC_CODED)) {
+ aom_write_bit(w, pvq->dq_dc_residue < 0);
+ }
+ }
+ } // for (idy = 0;
+}
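+
+// pvq->ac_dc_coded above packs the DC_CODED and AC_CODED flags into a single
+// 4-ary symbol (a 2-bit mask), so one aom_write_symbol() call signals the
+// skip state of both the DC coefficient and the AC bands, with the CDF
+// selected by transform size and plane.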
+#endif // !CONFIG_PVQ
+
+#if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE
+static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp,
+ const TOKENEXTRA *const tok_end,
+#if CONFIG_PVQ
+ MACROBLOCK *const x,
+#endif
+ MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane,
+ BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth,
+ int block, int blk_row, int blk_col,
+ TX_SIZE tx_size, TOKEN_STATS *token_stats) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ TX_SIZE plane_tx_size;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (tx_size == plane_tx_size) {
+ TOKEN_STATS tmp_token_stats;
+ init_token_stats(&tmp_token_stats);
+#if !CONFIG_PVQ
+ pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size, &tmp_token_stats);
+#else
+ pack_pvq_tokens(w, x, xd, plane, bsize, tx_size);
+#endif
+#if CONFIG_RD_DEBUG
+ token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost;
+ token_stats->cost += tmp_token_stats.cost;
+#endif
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ assert(bsl > 0);
+
+ for (i = 0; i < 4; ++i) {
+ const int offsetr = blk_row + (i >> 1) * bsl;
+ const int offsetc = blk_col + (i & 0x01) * bsl;
+ const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ pack_txb_tokens(w, tp, tok_end,
+#if CONFIG_PVQ
+ x,
+#endif
+ xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr,
+ offsetc, sub_txs, token_stats);
+ block += step;
+ }
+ }
+}
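+
+// The recursion above descends the transform-size tree: when the visited
+// size matches the size actually coded for this block, its tokens are
+// packed; otherwise the block splits into four quadrants visited in raster
+// order (i >> 1 selects the row half, i & 1 the column half) at the next
+// smaller transform size, skipping quadrants outside the visible image.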
+#endif
+
+static void write_segment_id(aom_writer *w, const struct segmentation *seg,
+ struct segmentation_probs *segp, int segment_id) {
+ if (seg->enabled && seg->update_map) {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, segment_id, segp->tree_cdf, MAX_SEGMENTS);
+#else
+ aom_write_tree(w, av1_segment_tree, segp->tree_probs, segment_id, 3, 0);
+#endif
+ }
+}
+
+// This function encodes the reference frame
+static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int is_compound = has_second_ref(mbmi);
+ const int segment_id = mbmi->segment_id;
+
+ // If segment level coding of this signal is disabled...
+ // or the segment allows multiple reference frame options
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] ==
+ get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+ } else {
+ // does the feature use compound prediction or not
+ // (if not specified at the frame/segment level)
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+#if SUB8X8_COMP_REF
+ aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd));
+#else
+ if (mbmi->sb_type >= BLOCK_8X8)
+ aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd));
+#endif
+ } else {
+ assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE));
+ }
+
+ if (is_compound) {
+#if CONFIG_EXT_REFS
+ const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME);
+ const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
+#else // CONFIG_EXT_REFS
+ const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME;
+#endif // CONFIG_EXT_REFS
+
+ aom_write(w, bit, av1_get_pred_prob_comp_ref_p(cm, xd));
+
+#if CONFIG_EXT_REFS
+ if (!bit) {
+ const int bit1 = mbmi->ref_frame[0] == LAST_FRAME;
+ aom_write(w, bit1, av1_get_pred_prob_comp_ref_p1(cm, xd));
+ } else {
+ const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
+ aom_write(w, bit2, av1_get_pred_prob_comp_ref_p2(cm, xd));
+ }
+ aom_write(w, bit_bwd, av1_get_pred_prob_comp_bwdref_p(cm, xd));
+#endif // CONFIG_EXT_REFS
+ } else {
+#if CONFIG_EXT_REFS
+ const int bit0 = (mbmi->ref_frame[0] == ALTREF_FRAME ||
+ mbmi->ref_frame[0] == BWDREF_FRAME);
+ aom_write(w, bit0, av1_get_pred_prob_single_ref_p1(cm, xd));
+
+ if (bit0) {
+ const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
+ aom_write(w, bit1, av1_get_pred_prob_single_ref_p2(cm, xd));
+ } else {
+ const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[0] == GOLDEN_FRAME);
+ aom_write(w, bit2, av1_get_pred_prob_single_ref_p3(cm, xd));
+
+ if (!bit2) {
+ const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
+ aom_write(w, bit3, av1_get_pred_prob_single_ref_p4(cm, xd));
+ } else {
+ const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;
+ aom_write(w, bit4, av1_get_pred_prob_single_ref_p5(cm, xd));
+ }
+ }
+#else // CONFIG_EXT_REFS
+ const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
+ aom_write(w, bit0, av1_get_pred_prob_single_ref_p1(cm, xd));
+
+ if (bit0) {
+ const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
+ aom_write(w, bit1, av1_get_pred_prob_single_ref_p2(cm, xd));
+ }
+#endif // CONFIG_EXT_REFS
+ }
+ }
+}
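+
+// With CONFIG_EXT_REFS, the single-reference signalling above forms a small
+// binary tree: bit0 = 1 selects {BWDREF, ALTREF}, resolved by bit1; bit0 = 0
+// selects {LAST, LAST2, LAST3, GOLDEN}, where bit2 splits {LAST3, GOLDEN}
+// (resolved by bit4) from {LAST, LAST2} (resolved by bit3). Each bit is
+// coded with its own context-derived probability.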
+
+#if CONFIG_FILTER_INTRA
+static void write_filter_intra_mode_info(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mbmi,
+ aom_writer *w) {
+ if (mbmi->mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[0] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[0],
+ cm->fc->filter_intra_probs[0]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
+ const FILTER_INTRA_MODE mode =
+ mbmi->filter_intra_mode_info.filter_intra_mode[0];
+ write_uniform(w, FILTER_INTRA_MODES, mode);
+ }
+ }
+
+ if (mbmi->uv_mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[1] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[1],
+ cm->fc->filter_intra_probs[1]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) {
+ const FILTER_INTRA_MODE mode =
+ mbmi->filter_intra_mode_info.filter_intra_mode[1];
+ write_uniform(w, FILTER_INTRA_MODES, mode);
+ }
+ }
+}
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+static void write_intra_angle_info(const MACROBLOCKD *xd,
+ FRAME_CONTEXT *const ec_ctx, aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_INTRA_INTERP
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ int p_angle;
+#endif // CONFIG_INTRA_INTERP
+
+ (void)ec_ctx;
+ if (bsize < BLOCK_8X8) return;
+
+ if (av1_is_directional_mode(mbmi->mode, bsize)) {
+ write_uniform(w, 2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
+#if CONFIG_INTRA_INTERP
+ p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle)) {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, mbmi->intra_filter,
+ ec_ctx->intra_filter_cdf[intra_filter_ctx],
+ INTRA_FILTERS);
+#else
+ av1_write_token(w, av1_intra_filter_tree,
+ ec_ctx->intra_filter_probs[intra_filter_ctx],
+ &intra_filter_encodings[mbmi->intra_filter]);
+#endif // CONFIG_EC_MULTISYMBOL
+ }
+#endif // CONFIG_INTRA_INTERP
+ }
+
+ if (av1_is_directional_mode(mbmi->uv_mode, bsize)) {
+ write_uniform(w, 2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
+ }
+}
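+
+// The angle deltas above are mapped to non-negative indices for
+// write_uniform(): a delta in [-MAX_ANGLE_DELTA, MAX_ANGLE_DELTA] becomes an
+// index in [0, 2 * MAX_ANGLE_DELTA]. Illustratively, with
+// MAX_ANGLE_DELTA == 3 there are seven symbols and delta -1 maps to index 2.
+// The final prediction angle is the mode's base angle plus
+// delta * ANGLE_STEP.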
+#endif // CONFIG_EXT_INTRA
+
+static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ if (!av1_is_interp_needed(xd)) {
+#if CONFIG_DUAL_FILTER
+ for (int i = 0; i < 4; ++i)
+ assert(mbmi->interp_filter[i] == (cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter));
+#else
+ assert(mbmi->interp_filter == (cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter));
+#endif // CONFIG_DUAL_FILTER
+ return;
+ }
+ if (cm->interp_filter == SWITCHABLE) {
+#if CONFIG_DUAL_FILTER
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+ (mbmi->ref_frame[1] > INTRA_FRAME &&
+ has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter[dir]],
+ ec_ctx->switchable_interp_cdf[ctx],
+ SWITCHABLE_FILTERS);
+#else
+ av1_write_token(w, av1_switchable_interp_tree,
+ ec_ctx->switchable_interp_prob[ctx],
+ &switchable_interp_encodings[mbmi->interp_filter[dir]]);
+#endif
+ ++cpi->interp_filter_selected[0][mbmi->interp_filter[dir]];
+ } else {
+ assert(mbmi->interp_filter[dir] == EIGHTTAP_REGULAR);
+ }
+ }
+#else
+ {
+ const int ctx = av1_get_pred_context_switchable_interp(xd);
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter],
+ ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS);
+#else
+ av1_write_token(w, av1_switchable_interp_tree,
+ ec_ctx->switchable_interp_prob[ctx],
+ &switchable_interp_encodings[mbmi->interp_filter]);
+#endif
+ ++cpi->interp_filter_selected[0][mbmi->interp_filter];
+ }
+#endif // CONFIG_DUAL_FILTER
+ }
+}
+
+#if CONFIG_PALETTE
+#if CONFIG_PALETTE_DELTA_ENCODING
+// Write luma palette color values with delta encoding. Write the first value
+// as a literal, then the delta between each value and the previous one. The
+// luma palette is sorted, so each delta is greater than zero.
+static void write_palette_colors_y(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, aom_writer *w) {
+ const int n = pmi->palette_size[0];
+ int min_bits, i;
+ int bits = av1_get_palette_delta_bits_y(pmi, bit_depth, &min_bits);
+ aom_write_literal(w, bits - min_bits, 2);
+ aom_write_literal(w, pmi->palette_colors[0], bit_depth);
+ for (i = 1; i < n; ++i) {
+ aom_write_literal(
+ w, pmi->palette_colors[i] - pmi->palette_colors[i - 1] - 1, bits);
+ bits =
+ AOMMIN(bits, av1_ceil_log2((1 << bit_depth) - pmi->palette_colors[i]));
+ }
+}
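+
+// Decoder-side sketch of the inverse of the mapping above (illustrative
+// only: read_literal() is a hypothetical stand-in for the matching
+// bitreader call, and min_bits is assumed to be recomputed on the decoder
+// side the same way as in av1_get_palette_delta_bits_y()).
+#if 0
+static void read_palette_colors_y_sketch(int n, int bit_depth, int min_bits,
+                                         uint16_t *palette) {
+  int i;
+  int bits = min_bits + read_literal(2);
+  palette[0] = read_literal(bit_depth);
+  for (i = 1; i < n; ++i) {
+    // Each delta was written minus 1: the sorted palette is strictly
+    // increasing, so every true delta is at least 1.
+    palette[i] = palette[i - 1] + read_literal(bits) + 1;
+    bits = AOMMIN(bits, av1_ceil_log2((1 << bit_depth) - palette[i]));
+  }
+}
+#endif  // 0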
+
+// Write chroma palette color values. Use delta encoding for the U channel,
+// as its palette is sorted. For the V channel, either use delta encoding or
+// transmit raw values directly, whichever costs less.
+static void write_palette_colors_uv(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, aom_writer *w) {
+ int i;
+ const int n = pmi->palette_size[1];
+#if CONFIG_HIGHBITDEPTH
+ const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE;
+ const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;
+#else
+ const uint8_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE;
+ const uint8_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;
+#endif // CONFIG_HIGHBITDEPTH
+ // U channel colors.
+ int min_bits_u = 0;
+ int bits_u = av1_get_palette_delta_bits_u(pmi, bit_depth, &min_bits_u);
+ aom_write_literal(w, bits_u - min_bits_u, 2);
+ aom_write_literal(w, colors_u[0], bit_depth);
+ for (i = 1; i < n; ++i) {
+ aom_write_literal(w, colors_u[i] - colors_u[i - 1], bits_u);
+ bits_u = AOMMIN(bits_u, av1_ceil_log2(1 + (1 << bit_depth) - colors_u[i]));
+ }
+ // V channel colors.
+ const int max_val = 1 << bit_depth;
+ int zero_count = 0, min_bits_v = 0;
+ int bits_v =
+ av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
+ const int rate_using_delta =
+ 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
+ const int rate_using_raw = bit_depth * n;
+ if (rate_using_delta < rate_using_raw) { // delta encoding
+ aom_write_bit(w, 1);
+ aom_write_literal(w, bits_v - min_bits_v, 2);
+ aom_write_literal(w, colors_v[0], bit_depth);
+ for (i = 1; i < n; ++i) {
+ if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit.
+ aom_write_literal(w, 0, bits_v);
+ continue;
+ }
+ const int delta = abs((int)colors_v[i] - colors_v[i - 1]);
+ const int sign_bit = colors_v[i] < colors_v[i - 1];
+ if (delta <= max_val - delta) {
+ aom_write_literal(w, delta, bits_v);
+ aom_write_bit(w, sign_bit);
+ } else {
+ aom_write_literal(w, max_val - delta, bits_v);
+ aom_write_bit(w, !sign_bit);
+ }
+ }
+ } else { // Transmit raw values.
+ aom_write_bit(w, 0);
+ for (i = 0; i < n; ++i) aom_write_literal(w, colors_v[i], bit_depth);
+ }
+}
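+
+// Decoder-side sketch of the wrap-around V-channel deltas above
+// (illustrative only: read_literal() and read_bit() are hypothetical
+// stand-ins for the matching bitreader calls). Worked example with
+// bit_depth = 8, prev = 250, cur = 3: delta = 247 and sign_bit = 1, so the
+// encoder writes 256 - 247 = 9 with the sign flipped, and the decoder wraps
+// modulo 256 to recover 3 at a fraction of the raw cost.
+#if 0
+static int read_color_v_sketch(int prev, int bits_v, int bit_depth) {
+  const int max_val = 1 << bit_depth;
+  const int delta = read_literal(bits_v);
+  if (delta == 0) return prev;  // A zero delta carries no sign bit.
+  if (read_bit()) {
+    int val = prev - delta;
+    if (val < 0) val += max_val;  // Wrap below the valid range.
+    return val;
+  } else {
+    int val = prev + delta;
+    if (val >= max_val) val -= max_val;  // Wrap above the valid range.
+    return val;
+  }
+}
+#endif  // 0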
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+
+static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ const MODE_INFO *const mi, aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+
+ if (mbmi->mode == DC_PRED) {
+ const int n = pmi->palette_size[0];
+ int palette_y_mode_ctx = 0;
+ if (above_mi)
+ palette_y_mode_ctx +=
+ (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+ if (left_mi)
+ palette_y_mode_ctx +=
+ (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+ aom_write(
+ w, n > 0,
+ av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_y_mode_ctx]);
+ if (n > 0) {
+ av1_write_token(w, av1_palette_size_tree,
+ av1_default_palette_y_size_prob[bsize - BLOCK_8X8],
+ &palette_size_encodings[n - PALETTE_MIN_SIZE]);
+#if CONFIG_PALETTE_DELTA_ENCODING
+ write_palette_colors_y(pmi, cm->bit_depth, w);
+#else
+ int i;
+ for (i = 0; i < n; ++i)
+ aom_write_literal(w, pmi->palette_colors[i], cm->bit_depth);
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+ write_uniform(w, n, pmi->palette_first_color_idx[0]);
+ }
+ }
+
+ if (mbmi->uv_mode == DC_PRED) {
+ const int n = pmi->palette_size[1];
+ const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+ aom_write(w, n > 0, av1_default_palette_uv_mode_prob[palette_uv_mode_ctx]);
+ if (n > 0) {
+ av1_write_token(w, av1_palette_size_tree,
+ av1_default_palette_uv_size_prob[bsize - BLOCK_8X8],
+ &palette_size_encodings[n - PALETTE_MIN_SIZE]);
+#if CONFIG_PALETTE_DELTA_ENCODING
+ write_palette_colors_uv(pmi, cm->bit_depth, w);
+#else
+ int i;
+ for (i = 0; i < n; ++i) {
+ aom_write_literal(w, pmi->palette_colors[PALETTE_MAX_SIZE + i],
+ cm->bit_depth);
+ aom_write_literal(w, pmi->palette_colors[2 * PALETTE_MAX_SIZE + i],
+ cm->bit_depth);
+ }
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+ write_uniform(w, n, pmi->palette_first_color_idx[1]);
+ }
+ }
+}
+#endif // CONFIG_PALETTE
+
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+#if CONFIG_SUPERTX
+ const int supertx_enabled,
+#endif
+#if CONFIG_TXK_SEL
+ int block, int plane,
+#endif
+ aom_writer *w) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int is_inter = is_inter_block(mbmi);
+#if CONFIG_VAR_TX
+ const TX_SIZE tx_size = is_inter ? mbmi->min_tx_size : mbmi->tx_size;
+#else
+ const TX_SIZE tx_size = mbmi->tx_size;
+#endif // CONFIG_VAR_TX
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+#if !CONFIG_TXK_SEL
+ TX_TYPE tx_type = mbmi->tx_type;
+#else
+  // Only the Y plane's tx_type is transmitted.
+ if (plane > 0) return;
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+#endif
+
+ if (!FIXED_TX_TYPE) {
+#if CONFIG_EXT_TX
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) >
+ 1 &&
+ ((!cm->seg.enabled && cm->base_qindex > 0) ||
+ (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+ !mbmi->skip &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int eset =
+ get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
+ if (is_inter) {
+ assert(ext_tx_used_inter[eset][tx_type]);
+ if (eset > 0) {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_ext_tx_inter_ind[eset][tx_type],
+ ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+ ext_tx_cnt_inter[eset]);
+#else
+ av1_write_token(w, av1_ext_tx_inter_tree[eset],
+ ec_ctx->inter_ext_tx_prob[eset][square_tx_size],
+ &ext_tx_inter_encodings[eset][tx_type]);
+#endif
+ }
+ } else if (ALLOW_INTRA_EXT_TX) {
+ assert(ext_tx_used_intra[eset][tx_type]);
+ if (eset > 0) {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(
+ w, av1_ext_tx_intra_ind[eset][tx_type],
+ ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
+ ext_tx_cnt_intra[eset]);
+#else
+ av1_write_token(
+ w, av1_ext_tx_intra_tree[eset],
+ ec_ctx->intra_ext_tx_prob[eset][square_tx_size][mbmi->mode],
+ &ext_tx_intra_encodings[eset][tx_type]);
+#endif
+ }
+ }
+ }
+#else
+ if (tx_size < TX_32X32 &&
+ ((!cm->seg.enabled && cm->base_qindex > 0) ||
+ (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+ !mbmi->skip &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ if (is_inter) {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_ext_tx_ind[tx_type],
+ ec_ctx->inter_ext_tx_cdf[tx_size], TX_TYPES);
+#else
+ av1_write_token(w, av1_ext_tx_tree, ec_ctx->inter_ext_tx_prob[tx_size],
+ &ext_tx_encodings[tx_type]);
+#endif
+ } else {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(
+ w, av1_ext_tx_ind[tx_type],
+ ec_ctx->intra_ext_tx_cdf[tx_size]
+ [intra_mode_to_tx_type_context[mbmi->mode]],
+ TX_TYPES);
+#else
+ av1_write_token(
+ w, av1_ext_tx_tree,
+ ec_ctx
+ ->intra_ext_tx_prob[tx_size]
+ [intra_mode_to_tx_type_context[mbmi->mode]],
+ &ext_tx_encodings[tx_type]);
+#endif
+ }
+ }
+#endif // CONFIG_EXT_TX
+ }
+}
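+
+// Transform-type signalling above is conditional: nothing is coded when the
+// block is skipped, uses segment-level skip, or is lossless (q-index 0), or
+// when the (tx_size, bsize) pair admits only one transform type. Otherwise
+// the index of tx_type within the selected extended-tx set (eset) is
+// written, with separate contexts for inter blocks and for intra blocks
+// (the latter keyed by prediction mode).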
+
+static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
+ PREDICTION_MODE mode, aom_writer *w) {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_intra_mode_ind[mode],
+ frame_ctx->y_mode_cdf[size_group_lookup[bsize]],
+ INTRA_MODES);
+#else
+ av1_write_token(w, av1_intra_mode_tree,
+ frame_ctx->y_mode_prob[size_group_lookup[bsize]],
+ &intra_mode_encodings[mode]);
+#endif
+}
+
+static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx,
+ PREDICTION_MODE uv_mode, PREDICTION_MODE y_mode,
+ aom_writer *w) {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_intra_mode_ind[uv_mode],
+ frame_ctx->uv_mode_cdf[y_mode], INTRA_MODES);
+#else
+ av1_write_token(w, av1_intra_mode_tree, frame_ctx->uv_mode_prob[y_mode],
+ &intra_mode_encodings[uv_mode]);
+#endif
+}
+
+static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
+ const int mi_col,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_DELTA_Q || CONFIG_EC_ADAPT
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+#else
+ const MACROBLOCK *x = &cpi->td.mb;
+ const MACROBLOCKD *xd = &x->e_mbd;
+#endif
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+#if !CONFIG_REF_MV
+ nmv_context *nmvc = &ec_ctx->nmvc;
+#endif
+ const MODE_INFO *mi = xd->mi[0];
+
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &cm->fc->seg;
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const PREDICTION_MODE mode = mbmi->mode;
+ const int segment_id = mbmi->segment_id;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int allow_hp = cm->allow_high_precision_mv;
+ const int is_inter = is_inter_block(mbmi);
+ const int is_compound = has_second_ref(mbmi);
+ int skip, ref;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+ (void)mi_row;
+ (void)mi_col;
+
+ if (seg->update_map) {
+ if (seg->temporal_update) {
+ const int pred_flag = mbmi->seg_id_predicted;
+ aom_prob pred_prob = av1_get_pred_prob_seg_id(segp, xd);
+ aom_write(w, pred_flag, pred_prob);
+ if (!pred_flag) write_segment_id(w, seg, segp, segment_id);
+ } else {
+ write_segment_id(w, seg, segp, segment_id);
+ }
+ }
+
+#if CONFIG_SUPERTX
+ if (supertx_enabled)
+ skip = mbmi->skip;
+ else
+ skip = write_skip(cm, xd, segment_id, mi, w);
+#else
+ skip = write_skip(cm, xd, segment_id, mi, w);
+#endif // CONFIG_SUPERTX
+#if CONFIG_DELTA_Q
+ if (cm->delta_q_present_flag) {
+ int super_block_upper_left =
+ ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0);
+ if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) {
+ assert(mbmi->current_q_index > 0);
+ int reduced_delta_qindex =
+ (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res;
+ write_delta_qindex(cm, xd, reduced_delta_qindex, w);
+ xd->prev_qindex = mbmi->current_q_index;
+#if CONFIG_EXT_DELTA_Q
+ if (cm->delta_lf_present_flag) {
+ int reduced_delta_lflevel =
+ (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+ cm->delta_lf_res;
+ write_delta_lflevel(cm, xd, reduced_delta_lflevel, w);
+ xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
+ }
+#endif // CONFIG_EXT_DELTA_Q
+ }
+ }
+#endif
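+  // Worked example for the delta-q signalling above (illustrative numbers):
+  // with cm->delta_q_res == 4, xd->prev_qindex == 60 and
+  // mbmi->current_q_index == 72, reduced_delta_qindex == 3 is written and
+  // the decoder recovers 60 + 3 * 4 == 72; only q-index changes that are
+  // multiples of delta_q_res are representable.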
+
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+ aom_write(w, is_inter, av1_get_intra_inter_prob(cm, xd));
+
+ if (cm->tx_mode == TX_MODE_SELECT &&
+#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_RECT_TX)
+#if CONFIG_RECT_TX
+ bsize > BLOCK_4X4 &&
+#else
+ (bsize >= BLOCK_8X8 || (bsize > BLOCK_4X4 && is_inter)) &&
+#endif // CONFIG_RECT_TX
+#else
+ bsize >= BLOCK_8X8 &&
+#endif
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !(is_inter && skip) && !xd->lossless[segment_id]) {
+#if CONFIG_VAR_TX
+ if (is_inter) { // This implies skip flag is 0.
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, bsize);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int height = block_size_high[bsize] >> tx_size_wide_log2[0];
+ int idx, idy;
+ for (idy = 0; idy < height; idy += bh)
+ for (idx = 0; idx < width; idx += bw)
+ write_tx_size_vartx(cm, xd, mbmi, max_tx_size, height != width, idy,
+ idx, w);
+ } else {
+ set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd);
+ write_selected_tx_size(cm, xd, w);
+ }
+ } else {
+ set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd);
+#else
+ write_selected_tx_size(cm, xd, w);
+#endif
+ }
+
+ if (!is_inter) {
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+ write_intra_mode(ec_ctx, bsize, mode, w);
+ } else {
+ int idx, idy;
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
+ write_intra_mode(ec_ctx, bsize, b_mode, w);
+ }
+ }
+ }
+#if CONFIG_CB4X4
+ if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y))
+ write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w);
+#else // !CONFIG_CB4X4
+ write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w);
+#endif // CONFIG_CB4X4
+
+#if CONFIG_EXT_INTRA
+ write_intra_angle_info(xd, ec_ctx, w);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+ write_palette_mode_info(cm, xd, mi, w);
+#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ if (bsize >= BLOCK_8X8 || unify_bsize)
+ write_filter_intra_mode_info(cm, mbmi, w);
+#endif // CONFIG_FILTER_INTRA
+ } else {
+ int16_t mode_ctx;
+ write_ref_frames(cm, xd, w);
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ else
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, -1);
+#else // CONFIG_REF_MV
+ mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+#endif // CONFIG_REF_MV
+
+    // If segment skip is not enabled, code the mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(mode))
+ write_inter_compound_mode(cm, w, mode, mode_ctx);
+ else if (is_inter_singleref_mode(mode))
+#endif // CONFIG_EXT_INTER
+ write_inter_mode(w, mode, ec_ctx, mode_ctx);
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (mode == NEWMV || mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(mode))
+#else
+ if (mode == NEARMV || mode == NEWMV)
+#endif
+ write_drl_idx(cm, mbmi, mbmi_ext, w);
+ else
+ assert(mbmi->ref_mv_idx == 0);
+#endif
+ }
+ }
+
+#if !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION
+ write_mb_interp_filter(cpi, xd, w);
+#endif // !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION
+
+ if (bsize < BLOCK_8X8 && !unify_bsize) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (!is_compound)
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, j);
+#endif
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(b_mode))
+ write_inter_compound_mode(cm, w, b_mode, mode_ctx);
+ else if (is_inter_singleref_mode(b_mode))
+#endif // CONFIG_EXT_INTER
+ write_inter_mode(w, b_mode, ec_ctx, mode_ctx);
+
+#if CONFIG_EXT_INTER
+ if (b_mode == NEWMV || b_mode == NEW_NEWMV) {
+#else
+ if (b_mode == NEWMV) {
+#endif // CONFIG_EXT_INTER
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], ref,
+ mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
+#if CONFIG_EXT_INTER
+ &mi->bmi[j].ref_mv[ref].as_mv,
+#else
+#if CONFIG_REF_MV
+ &mi->bmi[j].pred_mv[ref].as_mv,
+#else
+ &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv,
+#endif // CONFIG_REF_MV
+#endif // CONFIG_EXT_INTER
+ nmvc, allow_hp);
+ }
+ }
+#if CONFIG_EXT_INTER
+ else if (b_mode == NEAREST_NEWMV || b_mode == NEAR_NEWMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1,
+ mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[1].as_mv,
+ &mi->bmi[j].ref_mv[1].as_mv, nmvc, allow_hp);
+ } else if (b_mode == NEW_NEARESTMV || b_mode == NEW_NEARMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0,
+ mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv,
+ &mi->bmi[j].ref_mv[0].as_mv, nmvc, allow_hp);
+ }
+#endif // CONFIG_EXT_INTER
+ }
+ }
+ } else {
+#if CONFIG_EXT_INTER
+ if (mode == NEWMV || mode == NEW_NEWMV) {
+#else
+ if (mode == NEWMV) {
+#endif // CONFIG_EXT_INTER
+ int_mv ref_mv;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], ref,
+ mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0];
+ av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ }
+#if CONFIG_EXT_INTER
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv,
+ &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv, nmvc,
+ allow_hp);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv,
+ &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv, nmvc,
+ allow_hp);
+#endif // CONFIG_EXT_INTER
+ }
+ }
+
+#if CONFIG_EXT_INTER
+ if (cpi->common.reference_mode != COMPOUND_REFERENCE &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ is_interintra_allowed(mbmi)) {
+ const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
+ const int bsize_group = size_group_lookup[bsize];
+ aom_write(w, interintra, cm->fc->interintra_prob[bsize_group]);
+ if (interintra) {
+ write_interintra_mode(w, mbmi->interintra_mode,
+ cm->fc->interintra_mode_prob[bsize_group]);
+ if (is_interintra_wedge_used(bsize)) {
+ aom_write(w, mbmi->use_wedge_interintra,
+ cm->fc->wedge_interintra_prob[bsize]);
+ if (mbmi->use_wedge_interintra) {
+ aom_write_literal(w, mbmi->interintra_wedge_index,
+ get_wedge_bits_lookup(bsize));
+ assert(mbmi->interintra_wedge_sign == 0);
+ }
+ }
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_INTER
+ if (mbmi->ref_frame[1] != INTRA_FRAME)
+#endif // CONFIG_EXT_INTER
+ write_motion_mode(cm, mi, w);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+ if (cpi->common.reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode)
+#if CONFIG_MOTION_VAR
+ && mbmi->motion_mode == SIMPLE_TRANSLATION
+#endif // CONFIG_MOTION_VAR
+ && is_any_masked_compound_used(bsize)) {
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+ av1_write_token(w, av1_compound_type_tree,
+ cm->fc->compound_type_prob[bsize],
+ &compound_type_encodings[mbmi->interinter_compound_type]);
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#if CONFIG_WEDGE
+ if (mbmi->interinter_compound_type == COMPOUND_WEDGE) {
+ aom_write_literal(w, mbmi->wedge_index, get_wedge_bits_lookup(bsize));
+ aom_write_bit(w, mbmi->wedge_sign);
+ }
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ if (mbmi->interinter_compound_type == COMPOUND_SEG) {
+ aom_write_literal(w, mbmi->mask_type, MAX_SEG_MASK_BITS);
+ }
+#endif // CONFIG_COMPOUND_SEGMENT
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
+ write_mb_interp_filter(cpi, xd, w);
+#endif // CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
+ }
+
+#if !CONFIG_TXK_SEL
+ av1_write_tx_type(cm, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ w);
+#endif // !CONFIG_TXK_SEL
+}
+
+#if CONFIG_DELTA_Q
+static void write_mb_modes_kf(AV1_COMMON *cm, MACROBLOCKD *xd, const int mi_row,
+ const int mi_col, aom_writer *w) {
+ int skip;
+#else
+static void write_mb_modes_kf(AV1_COMMON *cm, const MACROBLOCKD *xd,
+ const int mi_row, const int mi_col,
+ aom_writer *w) {
+#endif
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &cm->fc->seg;
+ const MODE_INFO *const mi = xd->mi[0];
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+ (void)mi_row;
+ (void)mi_col;
+
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ if (seg->update_map) write_segment_id(w, seg, segp, mbmi->segment_id);
+
+#if CONFIG_DELTA_Q
+ skip = write_skip(cm, xd, mbmi->segment_id, mi, w);
+ if (cm->delta_q_present_flag) {
+ int super_block_upper_left =
+ ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0);
+ if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) {
+ assert(mbmi->current_q_index > 0);
+ int reduced_delta_qindex =
+ (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res;
+ write_delta_qindex(cm, xd, reduced_delta_qindex, w);
+ xd->prev_qindex = mbmi->current_q_index;
+#if CONFIG_EXT_DELTA_Q
+ if (cm->delta_lf_present_flag) {
+ int reduced_delta_lflevel =
+ (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+ cm->delta_lf_res;
+ write_delta_lflevel(cm, xd, reduced_delta_lflevel, w);
+ xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
+ }
+#endif // CONFIG_EXT_DELTA_Q
+ }
+ }
+#else
+ write_skip(cm, xd, mbmi->segment_id, mi, w);
+#endif
+
+ if (cm->tx_mode == TX_MODE_SELECT &&
+#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_RECT_TX)
+#if CONFIG_RECT_TX
+ bsize > BLOCK_4X4 &&
+#else
+ bsize >= BLOCK_8X8 &&
+#endif // CONFIG_RECT_TX
+#else
+ bsize >= BLOCK_8X8 &&
+#endif
+ !xd->lossless[mbmi->segment_id])
+ write_selected_tx_size(cm, xd, w);
+
+#if CONFIG_INTRABC
+ if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools) {
+ int use_intrabc = is_intrabc_block(mbmi);
+ aom_write(w, use_intrabc, INTRABC_PROB);
+ if (use_intrabc) {
+ assert(mbmi->mode == DC_PRED);
+ assert(mbmi->uv_mode == DC_PRED);
+ int_mv dv_ref;
+ av1_find_ref_dv(&dv_ref, mi_row, mi_col);
+ av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc);
+ return;
+ }
+ }
+#endif // CONFIG_INTRABC
+
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+ write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, 0, mbmi->mode, w);
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int block = idy * 2 + idx;
+ write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, block,
+ mi->bmi[block].as_mode, w);
+ }
+ }
+ }
+
+#if CONFIG_CB4X4
+ if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y))
+ write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w);
+#else // !CONFIG_CB4X4
+ write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w);
+#endif // CONFIG_CB4X4
+
+#if CONFIG_EXT_INTRA
+ write_intra_angle_info(xd, ec_ctx, w);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+ write_palette_mode_info(cm, xd, mi, w);
+#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ if (bsize >= BLOCK_8X8 || unify_bsize)
+ write_filter_intra_mode_info(cm, mbmi, w);
+#endif // CONFIG_FILTER_INTRA
+
+#if !CONFIG_TXK_SEL
+ av1_write_tx_type(cm, xd,
+#if CONFIG_SUPERTX
+ 0,
+#endif
+ w);
+#endif // !CONFIG_TXK_SEL
+}
+
+#if CONFIG_SUPERTX
+#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col) \
+ write_modes_b(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col)
+#else
+#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col) \
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col)
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_RD_DEBUG
+static void dump_mode_info(MODE_INFO *mi) {
+ printf("\nmi->mbmi.mi_row == %d\n", mi->mbmi.mi_row);
+ printf("&& mi->mbmi.mi_col == %d\n", mi->mbmi.mi_col);
+ printf("&& mi->mbmi.sb_type == %d\n", mi->mbmi.sb_type);
+ printf("&& mi->mbmi.tx_size == %d\n", mi->mbmi.tx_size);
+ if (mi->mbmi.sb_type >= BLOCK_8X8) {
+ printf("&& mi->mbmi.mode == %d\n", mi->mbmi.mode);
+ } else {
+ printf("&& mi->bmi[0].as_mode == %d\n", mi->bmi[0].as_mode);
+ }
+}
+static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
+ int plane) {
+ if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) {
+#if CONFIG_VAR_TX
+ int r, c;
+#endif
+ printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n",
+ plane, rd_stats->txb_coeff_cost[plane], token_stats->cost);
+#if CONFIG_VAR_TX
+ printf("rd txb_coeff_cost_map\n");
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+ printf("%d ", rd_stats->txb_coeff_cost_map[plane][r][c]);
+ }
+ printf("\n");
+ }
+
+ printf("pack txb_coeff_cost_map\n");
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+ printf("%d ", token_stats->txb_coeff_cost_map[r][c]);
+ }
+ printf("\n");
+ }
+#endif
+ return 1;
+ }
+ return 0;
+}
+#endif
+
+static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile,
+ aom_writer *w,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ MODE_INFO *m;
+ int bh, bw;
+ xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+ m = xd->mi[0];
+
+ assert(m->mbmi.sb_type <= cm->sb_size);
+
+ bh = mi_size_high[m->mbmi.sb_type];
+ bw = mi_size_wide[m->mbmi.sb_type];
+
+ cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+ if (frame_is_intra_only(cm)) {
+ write_mb_modes_kf(cm, xd, mi_row, mi_col, w);
+ } else {
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
+#if CONFIG_DUAL_FILTER
+    // has_subpel_mv_component needs the ref frame buffers set up to look
+    // up if they are scaled. has_subpel_mv_component is in turn needed by
+    // write_mb_interp_filter, which is called by pack_inter_mode_mvs.
+ set_ref_ptrs(cm, xd, m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
+#endif // CONFIG_DUAL_FILTER
+#if 0
+ // NOTE(zoeliu): For debug
+ if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
+ const PREDICTION_MODE mode = m->mbmi.mode;
+ const int segment_id = m->mbmi.segment_id;
+ const BLOCK_SIZE bsize = m->mbmi.sb_type;
+
+ // For sub8x8, simply dump out the first sub8x8 block info
+ const PREDICTION_MODE b_mode =
+ (bsize < BLOCK_8X8) ? m->bmi[0].as_mode : -1;
+ const int mv_x = (bsize < BLOCK_8X8) ?
+ m->bmi[0].as_mv[0].as_mv.row : m->mbmi.mv[0].as_mv.row;
+ const int mv_y = (bsize < BLOCK_8X8) ?
+ m->bmi[0].as_mv[0].as_mv.col : m->mbmi.mv[0].as_mv.col;
+
+ printf("Before pack_inter_mode_mvs(): "
+ "Frame=%d, (mi_row,mi_col)=(%d,%d), "
+ "mode=%d, segment_id=%d, bsize=%d, b_mode=%d, "
+ "mv[0]=(%d, %d), ref[0]=%d, ref[1]=%d\n",
+ cm->current_video_frame, mi_row, mi_col,
+ mode, segment_id, bsize, b_mode, mv_x, mv_y,
+ m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
+ }
+#endif // 0
+ pack_inter_mode_mvs(cpi, mi_row, mi_col,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ w);
+ }
+}
+
+static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
+ aom_writer *w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ MODE_INFO *const m = xd->mi[0];
+ MB_MODE_INFO *const mbmi = &m->mbmi;
+ int plane;
+ int bh, bw;
+#if CONFIG_PVQ || CONFIG_LV_MAP
+ MACROBLOCK *const x = &cpi->td.mb;
+ (void)tok;
+ (void)tok_end;
+#endif
+ xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+
+ assert(mbmi->sb_type <= cm->sb_size);
+
+ bh = mi_size_high[mbmi->sb_type];
+ bw = mi_size_wide[mbmi->sb_type];
+ cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+#if CONFIG_PALETTE
+ for (plane = 0; plane <= 1; ++plane) {
+ const uint8_t palette_size_plane =
+ mbmi->palette_mode_info.palette_size[plane];
+ if (palette_size_plane > 0) {
+#if CONFIG_INTRABC
+ assert(mbmi->use_intrabc == 0);
+#endif
+ int rows, cols;
+ assert(mbmi->sb_type >= BLOCK_8X8);
+ av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows,
+ &cols);
+ assert(*tok < tok_end);
+ pack_palette_tokens(w, tok, palette_size_plane, rows * cols - 1);
+ assert(*tok < tok_end + mbmi->skip);
+ }
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_COEF_INTERLEAVE
+ if (!mbmi->skip) {
+ const struct macroblockd_plane *const pd_y = &xd->plane[0];
+ const struct macroblockd_plane *const pd_c = &xd->plane[1];
+ const TX_SIZE tx_log2_y = mbmi->tx_size;
+ const TX_SIZE tx_log2_c = get_uv_tx_size(mbmi, pd_c);
+ const int tx_sz_y = (1 << tx_log2_y);
+ const int tx_sz_c = (1 << tx_log2_c);
+
+ const BLOCK_SIZE plane_bsize_y =
+ get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_y);
+ const BLOCK_SIZE plane_bsize_c =
+ get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_c);
+
+ const int num_4x4_w_y = num_4x4_blocks_wide_lookup[plane_bsize_y];
+ const int num_4x4_w_c = num_4x4_blocks_wide_lookup[plane_bsize_c];
+ const int num_4x4_h_y = num_4x4_blocks_high_lookup[plane_bsize_y];
+ const int num_4x4_h_c = num_4x4_blocks_high_lookup[plane_bsize_c];
+
+ const int max_4x4_w_y = get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge,
+ pd_y->subsampling_x);
+ const int max_4x4_h_y = get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge,
+ pd_y->subsampling_y);
+ const int max_4x4_w_c = get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge,
+ pd_c->subsampling_x);
+ const int max_4x4_h_c = get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge,
+ pd_c->subsampling_y);
+
+    // The max_4x4_w/h may be smaller than tx_sz in some corner cases,
+    // e.g. when the SB is split by tile boundaries.
+ const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y;
+ const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y;
+ const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c;
+ const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c;
+ const int tu_num_y = tu_num_w_y * tu_num_h_y;
+ const int tu_num_c = tu_num_w_c * tu_num_h_c;
+
+ int tu_idx_y = 0, tu_idx_c = 0;
+ TOKEN_STATS token_stats;
+ init_token_stats(&token_stats);
+
+ assert(*tok < tok_end);
+
+ while (tu_idx_y < tu_num_y) {
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_y, &token_stats);
+ assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+ (*tok)++;
+ tu_idx_y++;
+
+ if (tu_idx_c < tu_num_c) {
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+ assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+ (*tok)++;
+
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+ assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+ (*tok)++;
+
+ tu_idx_c++;
+ }
+ }
+
+    // In the 4:2:2 case, it's possible that chroma has more TUs than luma.
+ while (tu_idx_c < tu_num_c) {
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+ assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+ (*tok)++;
+
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+ assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+ (*tok)++;
+
+ tu_idx_c++;
+ }
+ }
+#else // CONFIG_COEF_INTERLEAVE
+ if (!mbmi->skip) {
+#if !CONFIG_PVQ && !CONFIG_LV_MAP
+ assert(*tok < tok_end);
+#endif
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y)) {
+ (*tok)++;
+ continue;
+ }
+#endif
+#if CONFIG_VAR_TX
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_CB4X4
+#if CONFIG_CHROMA_2X2
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#else
+ const BLOCK_SIZE plane_bsize =
+ AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#endif
+#else
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
+#endif
+
+ const int num_4x4_w =
+ block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_h =
+ block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+ int row, col;
+ TOKEN_STATS token_stats;
+ init_token_stats(&token_stats);
+
+ if (is_inter_block(mbmi)) {
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+ int block = 0;
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ const int bkw = tx_size_wide_unit[max_tx_size];
+ const int bkh = tx_size_high_unit[max_tx_size];
+ for (row = 0; row < num_4x4_h; row += bkh) {
+ for (col = 0; col < num_4x4_w; col += bkw) {
+ pack_txb_tokens(w, tok, tok_end,
+#if CONFIG_PVQ
+ x,
+#endif
+ xd, mbmi, plane, plane_bsize, cm->bit_depth, block,
+ row, col, max_tx_size, &token_stats);
+ block += step;
+ }
+ }
+#if CONFIG_RD_DEBUG
+ if (mbmi->sb_type >= BLOCK_8X8 &&
+ rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
+ dump_mode_info(m);
+ assert(0);
+ }
+#endif // CONFIG_RD_DEBUG
+ } else {
+ TX_SIZE tx = get_tx_size(plane, xd);
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+ tx = AOMMAX(TX_4X4, tx);
+#endif
+ const int bkw = tx_size_wide_unit[tx];
+ const int bkh = tx_size_high_unit[tx];
+ for (row = 0; row < num_4x4_h; row += bkh) {
+ for (col = 0; col < num_4x4_w; col += bkw) {
+#if !CONFIG_PVQ
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+#else
+ pack_pvq_tokens(w, x, xd, plane, bsize, tx);
+#endif
+ }
+ }
+ }
+#else
+ TX_SIZE tx = get_tx_size(plane, xd);
+ TOKEN_STATS token_stats;
+#if !CONFIG_PVQ
+ init_token_stats(&token_stats);
+#if CONFIG_LV_MAP
+ (void)tx;
+ av1_write_coeffs_mb(cm, x, w, plane);
+#else // CONFIG_LV_MAP
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+#endif // CONFIG_LV_MAP
+
+#else
+ (void)token_stats;
+ pack_pvq_tokens(w, x, xd, plane, mbmi->sb_type, tx);
+#endif
+#if CONFIG_RD_DEBUG
+ if (is_inter_block(mbmi) && mbmi->sb_type >= BLOCK_8X8 &&
+ rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
+ dump_mode_info(m);
+ assert(0);
+ }
+#endif // CONFIG_RD_DEBUG
+#endif // CONFIG_VAR_TX
+
+#if !CONFIG_PVQ && !CONFIG_LV_MAP
+ assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+ (*tok)++;
+#endif
+ }
+ }
+#endif // CONFIG_COEF_INTERLEAVE
+}
+
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+static void write_tokens_sb(AV1_COMP *cpi, const TileInfo *const tile,
+ aom_writer *w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int hbs = mi_size_wide[bsize] / 2;
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ partition = get_partition(cm, mi_row, mi_col, bsize);
+ subsize = get_subsize(bsize, partition);
+
+ if (subsize < BLOCK_8X8 && !unify_bsize) {
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_row + hbs < cm->mi_rows)
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_col + hbs < cm->mi_cols)
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT:
+ write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+ write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs,
+ subsize);
+ write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col,
+ subsize);
+ write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
+ subsize);
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+ }
+}
+#endif
+
+static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
+ aom_writer *w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
+ int mi_row, int mi_col) {
+ write_mbmi_b(cpi, tile, w,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col);
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+ (void)tok;
+ (void)tok_end;
+#else
+#if !CONFIG_PVQ && CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+#endif
+}
+
+static void write_partition(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd, int hbs, int mi_row,
+ int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize,
+ aom_writer *w) {
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+ const int is_partition_point = bsize >= BLOCK_8X8;
+ const int ctx = is_partition_point
+ ? partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ has_rows, has_cols,
+#endif
+ bsize)
+ : 0;
+#if CONFIG_UNPOISON_PARTITION_CTX
+ const aom_prob *const probs =
+ ctx < PARTITION_CONTEXTS ? cm->fc->partition_prob[ctx] : NULL;
+#else
+ const aom_prob *const probs = cm->fc->partition_prob[ctx];
+#endif
+
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+#elif CONFIG_EC_MULTISYMBOL
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ if (!is_partition_point) return;
+
+ if (has_rows && has_cols) {
+#if CONFIG_EXT_PARTITION_TYPES
+ if (bsize <= BLOCK_8X8)
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], PARTITION_TYPES);
+#else
+ av1_write_token(w, av1_partition_tree, probs, &partition_encodings[p]);
+#endif
+ else
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], EXT_PARTITION_TYPES);
+#else
+ av1_write_token(w, av1_ext_partition_tree, probs,
+ &ext_partition_encodings[p]);
+#endif // CONFIG_EC_MULTISYMBOL
+#else
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], PARTITION_TYPES);
+#else
+ av1_write_token(w, av1_partition_tree, probs, &partition_encodings[p]);
+#endif
+#endif // CONFIG_EXT_PARTITION_TYPES
+ } else if (!has_rows && has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+ aom_write(w, p == PARTITION_SPLIT, probs[1]);
+ } else if (has_rows && !has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+ aom_write(w, p == PARTITION_SPLIT, probs[2]);
+ } else {
+ assert(p == PARTITION_SPLIT);
+ }
+}
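+
+// Note on the tails of write_partition() above: at a frame edge, only the
+// partitions that keep every coded block inside the picture are legal. When
+// the bottom half is missing (!has_rows) the choice reduces to HORZ vs
+// SPLIT, so a single bit suffices; likewise VERT vs SPLIT when the right
+// half is missing (!has_cols). With both halves missing, SPLIT is implied
+// and nothing is written.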
+
+#if CONFIG_SUPERTX
+#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col, bsize) \
+ write_modes_sb(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col, \
+ bsize)
+#else
+#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col, bsize) \
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, bsize)
+#endif // CONFIG_SUPERTX
+
+static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
+ aom_writer *const w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const int hbs = mi_size_wide[bsize] / 2;
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+
+#if CONFIG_SUPERTX
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
+ MB_MODE_INFO *mbmi;
+ const int pack_token = !supertx_enabled;
+ TX_SIZE supertx_size;
+ int plane;
+#endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
+#if CONFIG_SUPERTX
+ mbmi = &cm->mi_grid_visible[mi_offset]->mbmi;
+ xd->mi = cm->mi_grid_visible + mi_offset;
+ set_mi_row_col(xd, tile, mi_row, mi_size_high[bsize], mi_col,
+ mi_size_wide[bsize],
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+ if (!supertx_enabled && !frame_is_intra_only(cm) &&
+ partition != PARTITION_NONE && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+ !xd->lossless[0]) {
+ aom_prob prob;
+ supertx_size = max_txsize_lookup[bsize];
+ prob = cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+ [supertx_size];
+ supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size);
+ aom_write(w, supertx_enabled, prob);
+ }
+#endif // CONFIG_SUPERTX
+ if (subsize < BLOCK_8X8 && !unify_bsize) {
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row,
+ mi_col);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ if (mi_row + hbs < cm->mi_rows)
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ if (mi_col + hbs < cm->mi_cols)
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT:
+ write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col, subsize);
+ write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col + hbs, subsize);
+ write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col, subsize);
+ write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col + hbs, subsize);
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col + hbs);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col + hbs);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col + hbs);
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+ }
+#if CONFIG_SUPERTX
+ if (partition != PARTITION_NONE && supertx_enabled && pack_token) {
+ int skip;
+ const int bsw = mi_size_wide[bsize];
+ const int bsh = mi_size_high[bsize];
+
+ xd->mi = cm->mi_grid_visible + mi_offset;
+ supertx_size = mbmi->tx_size;
+ set_mi_row_col(xd, tile, mi_row, bsh, mi_col, bsw,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+ assert(IMPLIES(!cm->seg.enabled, mbmi->segment_id_supertx == 0));
+ assert(mbmi->segment_id_supertx < MAX_SEGMENTS);
+
+ skip = write_skip(cm, xd, mbmi->segment_id_supertx, xd->mi[0], w);
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) > 1 &&
+ !skip) {
+ const int eset =
+ get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used);
+ if (eset > 0) {
+#if CONFIG_EC_MULTISYMBOL
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+ aom_write_symbol(w, av1_ext_tx_inter_ind[eset][mbmi->tx_type],
+ ec_ctx->inter_ext_tx_cdf[eset][supertx_size],
+ ext_tx_cnt_inter[eset]);
+#else
+ av1_write_token(w, av1_ext_tx_inter_tree[eset],
+ cm->fc->inter_ext_tx_prob[eset][supertx_size],
+ &ext_tx_inter_encodings[eset][mbmi->tx_type]);
+#endif
+ }
+ }
+#else
+ if (supertx_size < TX_32X32 && !skip) {
+ av1_write_token(w, av1_ext_tx_tree,
+ cm->fc->inter_ext_tx_prob[supertx_size],
+ &ext_tx_encodings[mbmi->tx_type]);
+ }
+#endif // CONFIG_EXT_TX
+
+ if (!skip) {
+ assert(*tok < tok_end);
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi_txb_size, pd);
+
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+ int row, col;
+ TX_SIZE tx = get_tx_size(plane, xd);
+ BLOCK_SIZE txb_size = txsize_to_bsize[tx];
+
+ const int stepr = tx_size_high_unit[txb_size];
+ const int stepc = tx_size_wide_unit[txb_size];
+
+ TOKEN_STATS token_stats;
+ token_stats.cost = 0;
+ for (row = 0; row < max_blocks_high; row += stepr)
+ for (col = 0; col < max_blocks_wide; col += stepc)
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+ assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+ (*tok)++;
+ }
+ }
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ set_txfm_ctxs(xd->mi[0]->mbmi.tx_size, bsw, bsh, skip, xd);
+#endif
+ }
+#endif // CONFIG_SUPERTX
+
+// update partition context
+#if CONFIG_EXT_PARTITION_TYPES
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
+ if (bsize >= BLOCK_8X8 &&
+ (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_CDEF
+#if CONFIG_EXT_PARTITION
+ if (cm->sb_size == BLOCK_128X128 && bsize == BLOCK_128X128 &&
+ !sb_all_skip(cm, mi_row, mi_col)) {
+ aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]
+ ->mbmi.cdef_strength,
+ cm->cdef_bits);
+ } else if (cm->sb_size == BLOCK_64X64 && bsize == BLOCK_64X64 &&
+#else
+ if (bsize == BLOCK_64X64 &&
+#endif // CONFIG_EXT_PARTITION
+ !sb_all_skip(cm, mi_row, mi_col)) {
+ if (cm->cdef_bits != 0)
+ aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]
+ ->mbmi.cdef_strength,
+ cm->cdef_bits);
+ }
+#endif
+}
+
+static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
+ aom_writer *const w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const int mi_row_start = tile->mi_row_start;
+ const int mi_row_end = tile->mi_row_end;
+ const int mi_col_start = tile->mi_col_start;
+ const int mi_col_end = tile->mi_col_end;
+ int mi_row, mi_col;
+
+#if CONFIG_DEPENDENT_HORZTILES
+#if CONFIG_TILE_GROUPS
+ if (!cm->dependent_horz_tiles || mi_row_start == 0 ||
+ tile->tg_horz_boundary) {
+#else
+ if (!cm->dependent_horz_tiles || mi_row_start == 0) {
+#endif
+ av1_zero_above_context(cm, mi_col_start, mi_col_end);
+ }
+#else
+ av1_zero_above_context(cm, mi_col_start, mi_col_end);
+#endif
+#if CONFIG_PVQ
+ assert(cpi->td.mb.pvq_q->curr_pos == 0);
+#endif
+#if CONFIG_DELTA_Q
+ if (cpi->common.delta_q_present_flag) {
+ xd->prev_qindex = cpi->common.base_qindex;
+#if CONFIG_EXT_DELTA_Q
+ if (cpi->common.delta_lf_present_flag) {
+ xd->prev_delta_lf_from_base = 0;
+ }
+#endif // CONFIG_EXT_DELTA_Q
+ }
+#endif
+
+ for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->mib_size) {
+ av1_zero_left_context(xd);
+
+ for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->mib_size) {
+ write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, 0, mi_row, mi_col,
+ cm->sb_size);
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+ write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, cm->sb_size);
+#endif
+ }
+ }
+#if CONFIG_PVQ
+  // Check that the numbers of PVQ blocks encoded and written to the
+  // bitstream match
+ assert(cpi->td.mb.pvq_q->curr_pos == cpi->td.mb.pvq_q->last_pos);
+ // Reset curr_pos in case we repack the bitstream
+ cpi->td.mb.pvq_q->curr_pos = 0;
+#endif
+}
+
+#if !CONFIG_LV_MAP
+#if !CONFIG_PVQ && !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+static void build_tree_distribution(AV1_COMP *cpi, TX_SIZE tx_size,
+ av1_coeff_stats *coef_branch_ct,
+ av1_coeff_probs_model *coef_probs) {
+ av1_coeff_count *coef_counts = cpi->td.rd_counts.coef_counts[tx_size];
+ unsigned int(*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
+ cpi->common.counts.eob_branch[tx_size];
+ int i, j, k, l, m;
+#if CONFIG_RECT_TX
+ assert(!is_rect_tx(tx_size));
+#endif // CONFIG_RECT_TX
+
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ av1_tree_probs_from_distribution(av1_coef_tree,
+ coef_branch_ct[i][j][k][l],
+ coef_counts[i][j][k][l]);
+ coef_branch_ct[i][j][k][l][0][1] =
+ eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0];
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ coef_probs[i][j][k][l][m] =
+ get_binary_prob(coef_branch_ct[i][j][k][l][m][0],
+ coef_branch_ct[i][j][k][l][m][1]);
+ }
+ }
+ }
+ }
+}
+
+#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+static void update_coef_probs_common(aom_writer *const bc, AV1_COMP *cpi,
+ TX_SIZE tx_size,
+ av1_coeff_stats *frame_branch_ct,
+ av1_coeff_probs_model *new_coef_probs) {
+ av1_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
+ const aom_prob upd = DIFF_UPDATE_PROB;
+#if CONFIG_EC_ADAPT
+ const int entropy_nodes_update = UNCONSTRAINED_NODES - 1;
+#else
+ const int entropy_nodes_update = UNCONSTRAINED_NODES;
+#endif
+ int i, j, k, l, t;
+ int stepsize = cpi->sf.coeff_prob_appx_step;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cpi->common.num_tg;
+#else
+ const int probwt = 1;
+#endif
+#if CONFIG_RECT_TX
+ assert(!is_rect_tx(tx_size));
+#endif // CONFIG_RECT_TX
+
+ switch (cpi->sf.use_fast_coef_updates) {
+ case TWO_LOOP: {
+      /* Dry run to see if any update is needed at all */
+ int savings = 0;
+ int update[2] = { 0, 0 };
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ const aom_prob oldp = old_coef_probs[i][j][k][l][t];
+ int s;
+ int u = 0;
+ if (t == PIVOT_NODE)
+ s = av1_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0], oldp, &newp, upd,
+ stepsize, probwt);
+ else
+ s = av1_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t], oldp, &newp, upd, probwt);
+
+ if (s > 0 && newp != oldp) u = 1;
+ if (u)
+ savings += s - (int)(av1_cost_zero(upd));
+ else
+ savings -= (int)(av1_cost_zero(upd));
+ update[u]++;
+ }
+ }
+ }
+ }
+ }
+
+      /* Are any coef probs updated at all? */
+ if (update[1] == 0 || savings < 0) {
+ aom_write_bit(bc, 0);
+ return;
+ }
+ aom_write_bit(bc, 1);
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ // calc probs and branch cts for this frame only
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+ if (t == PIVOT_NODE)
+ s = av1_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd,
+ stepsize, probwt);
+ else
+ s = av1_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd,
+ probwt);
+ if (s > 0 && newp != *oldp) u = 1;
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ return;
+ }
+
+ case ONE_LOOP_REDUCED: {
+ int updates = 0;
+ int noupdates_before_first = 0;
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ // calc probs and branch cts for this frame only
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+ if (t == PIVOT_NODE) {
+ s = av1_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd,
+ stepsize, probwt);
+ } else {
+ s = av1_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd,
+ probwt);
+ }
+
+ if (s > 0 && newp != *oldp) u = 1;
+ updates += u;
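+              // Defer the per-node "no update" flags: if no update ever
+              // occurs, the single 0 bit written at the end replaces them
+              // all.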
+ if (u == 0 && updates == 0) {
+ noupdates_before_first++;
+ continue;
+ }
+ if (u == 1 && updates == 1) {
+ int v;
+ // first update
+ aom_write_bit(bc, 1);
+ for (v = 0; v < noupdates_before_first; ++v)
+ aom_write(bc, 0, upd);
+ }
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (updates == 0) {
+ aom_write_bit(bc, 0); // no updates
+ }
+ return;
+ }
+ default: assert(0);
+ }
+}
+#endif
+#if CONFIG_SUBFRAME_PROB_UPDATE
+// Calculate the token counts between successive subframe updates.
+static void get_coef_counts_diff(
+ AV1_COMP *cpi, int index,
+ av1_coeff_count coef_counts[TX_SIZES][PLANE_TYPES],
+ unsigned int eob_counts[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS]
+ [COEFF_CONTEXTS]) {
+ int i, j, k, l, m, tx_size, val;
+ const int max_idx = cpi->common.coef_probs_update_idx;
+ const TX_MODE tx_mode = cpi->common.tx_mode;
+ const int max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ const SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+
+ assert(max_idx < COEF_PROBS_BUFS);
+
+ for (tx_size = 0; tx_size <= max_tx_size; ++tx_size)
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ if (index == max_idx) {
+ val =
+ cpi->common.counts.eob_branch[tx_size][i][j][k][l] -
+ subframe_stats->eob_counts_buf[max_idx][tx_size][i][j][k][l];
+ } else {
+ val = subframe_stats
+ ->eob_counts_buf[index + 1][tx_size][i][j][k][l] -
+ subframe_stats->eob_counts_buf[index][tx_size][i][j][k][l];
+ }
+ assert(val >= 0);
+ eob_counts[tx_size][i][j][k][l] = val;
+
+ for (m = 0; m < ENTROPY_TOKENS; ++m) {
+ if (index == max_idx) {
+ val = cpi->td.rd_counts.coef_counts[tx_size][i][j][k][l][m] -
+ subframe_stats
+ ->coef_counts_buf[max_idx][tx_size][i][j][k][l][m];
+ } else {
+ val = subframe_stats
+ ->coef_counts_buf[index + 1][tx_size][i][j][k][l][m] -
+ subframe_stats
+ ->coef_counts_buf[index][tx_size][i][j][k][l][m];
+ }
+ assert(val >= 0);
+ coef_counts[tx_size][i][j][k][l][m] = val;
+ }
+ }
+}
+
+static void update_coef_probs_subframe(
+ aom_writer *const bc, AV1_COMP *cpi, TX_SIZE tx_size,
+ av1_coeff_stats branch_ct[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES],
+ av1_coeff_probs_model *new_coef_probs) {
+ av1_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
+ const aom_prob upd = DIFF_UPDATE_PROB;
+ const int entropy_nodes_update = UNCONSTRAINED_NODES;
+ int i, j, k, l, t;
+ int stepsize = cpi->sf.coeff_prob_appx_step;
+ const int max_idx = cpi->common.coef_probs_update_idx;
+ int idx;
+ unsigned int this_branch_ct[ENTROPY_NODES][COEF_PROBS_BUFS][2];
+
+ switch (cpi->sf.use_fast_coef_updates) {
+ case TWO_LOOP: {
+      /* Dry run to see if any update is needed at all */
+ int savings = 0;
+ int update[2] = { 0, 0 };
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ for (idx = 0; idx <= max_idx; ++idx) {
+ memcpy(this_branch_ct[t][idx],
+ branch_ct[idx][tx_size][i][j][k][l][t],
+ 2 * sizeof(this_branch_ct[t][idx][0]));
+ }
+ }
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ const aom_prob oldp = old_coef_probs[i][j][k][l][t];
+ int s, u = 0;
+
+ if (t == PIVOT_NODE)
+ s = av1_prob_update_search_model_subframe(
+ this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd,
+ stepsize, max_idx);
+ else
+ s = av1_prob_update_search_subframe(this_branch_ct[t], oldp,
+ &newp, upd, max_idx);
+ if (s > 0 && newp != oldp) u = 1;
+ if (u)
+ savings += s - (int)(av1_cost_zero(upd));
+ else
+ savings -= (int)(av1_cost_zero(upd));
+ update[u]++;
+ }
+ }
+ }
+ }
+ }
+
+      /* Are any coef probs updated at all? */
+ if (update[1] == 0 || savings < 0) {
+ aom_write_bit(bc, 0);
+ return;
+ }
+ aom_write_bit(bc, 1);
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ for (idx = 0; idx <= max_idx; ++idx) {
+ memcpy(this_branch_ct[t][idx],
+ branch_ct[idx][tx_size][i][j][k][l][t],
+ 2 * sizeof(this_branch_ct[t][idx][0]));
+ }
+ }
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+
+ if (t == PIVOT_NODE)
+ s = av1_prob_update_search_model_subframe(
+ this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd,
+ stepsize, max_idx);
+ else
+ s = av1_prob_update_search_subframe(this_branch_ct[t], *oldp,
+ &newp, upd, max_idx);
+ if (s > 0 && newp != *oldp) u = 1;
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ return;
+ }
+
+ case ONE_LOOP_REDUCED: {
+ int updates = 0;
+ int noupdates_before_first = 0;
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ for (idx = 0; idx <= max_idx; ++idx) {
+ memcpy(this_branch_ct[t][idx],
+ branch_ct[idx][tx_size][i][j][k][l][t],
+ 2 * sizeof(this_branch_ct[t][idx][0]));
+ }
+ }
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+
+ if (t == PIVOT_NODE)
+ s = av1_prob_update_search_model_subframe(
+ this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd,
+ stepsize, max_idx);
+ else
+ s = av1_prob_update_search_subframe(this_branch_ct[t], *oldp,
+ &newp, upd, max_idx);
+ if (s > 0 && newp != *oldp) u = 1;
+ updates += u;
+ if (u == 0 && updates == 0) {
+ noupdates_before_first++;
+ continue;
+ }
+ if (u == 1 && updates == 1) {
+ int v;
+ // first update
+ aom_write_bit(bc, 1);
+ for (v = 0; v < noupdates_before_first; ++v)
+ aom_write(bc, 0, upd);
+ }
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (updates == 0) {
+ aom_write_bit(bc, 0); // no updates
+ }
+ return;
+ }
+ default: assert(0);
+ }
+}
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+static void update_coef_probs(AV1_COMP *cpi, aom_writer *w) {
+ const TX_MODE tx_mode = cpi->common.tx_mode;
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ AV1_COMMON *cm = &cpi->common;
+ SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+ int i;
+ av1_coeff_probs_model dummy_frame_coef_probs[PLANE_TYPES];
+
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ av1_copy(cpi->common.fc->coef_probs,
+ subframe_stats->enc_starting_coef_probs);
+ for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) {
+ get_coef_counts_diff(cpi, i, cpi->wholeframe_stats.coef_counts_buf[i],
+ cpi->wholeframe_stats.eob_counts_buf[i]);
+ }
+ }
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+ for (tx_size = 0; tx_size <= max_tx_size; ++tx_size) {
+ av1_coeff_stats frame_branch_ct[PLANE_TYPES];
+ av1_coeff_probs_model frame_coef_probs[PLANE_TYPES];
+ if (cpi->td.counts->tx_size_totals[tx_size] <= 20 || CONFIG_RD_DEBUG ||
+ (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
+ aom_write_bit(w, 0);
+ } else {
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ unsigned int this_eob_counts_copy[PLANE_TYPES][REF_TYPES][COEF_BANDS]
+ [COEFF_CONTEXTS];
+ av1_coeff_count coef_counts_copy[PLANE_TYPES];
+ av1_copy(this_eob_counts_copy, cpi->common.counts.eob_branch[tx_size]);
+ av1_copy(coef_counts_copy, cpi->td.rd_counts.coef_counts[tx_size]);
+ build_tree_distribution(cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) {
+ av1_copy(cpi->common.counts.eob_branch[tx_size],
+ cpi->wholeframe_stats.eob_counts_buf[i][tx_size]);
+ av1_copy(cpi->td.rd_counts.coef_counts[tx_size],
+ cpi->wholeframe_stats.coef_counts_buf[i][tx_size]);
+ build_tree_distribution(cpi, tx_size, cpi->branch_ct_buf[i][tx_size],
+ dummy_frame_coef_probs);
+ }
+ av1_copy(cpi->common.counts.eob_branch[tx_size], this_eob_counts_copy);
+ av1_copy(cpi->td.rd_counts.coef_counts[tx_size], coef_counts_copy);
+
+ update_coef_probs_subframe(w, cpi, tx_size, cpi->branch_ct_buf,
+ frame_coef_probs);
+ } else {
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+ build_tree_distribution(cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ }
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+ }
+ }
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ av1_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+ av1_copy(subframe_stats->coef_probs_buf[0], cm->fc->coef_probs);
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ unsigned int eob_counts_copy[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS]
+ [COEFF_CONTEXTS];
+ av1_copy(eob_counts_copy, cm->counts.eob_branch);
+ for (i = 1; i <= cpi->common.coef_probs_update_idx; ++i) {
+ for (tx_size = 0; tx_size <= max_tx_size; ++tx_size)
+ av1_full_to_model_counts(cm->counts.coef[tx_size],
+ subframe_stats->coef_counts_buf[i][tx_size]);
+ av1_copy(cm->counts.eob_branch, subframe_stats->eob_counts_buf[i]);
+ av1_partial_adapt_probs(cm, 0, 0);
+ av1_copy(subframe_stats->coef_probs_buf[i], cm->fc->coef_probs);
+ }
+ av1_copy(cm->fc->coef_probs, subframe_stats->coef_probs_buf[0]);
+ av1_copy(cm->counts.eob_branch, eob_counts_copy);
+ }
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+}
+#endif // !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+#endif // !CONFIG_EC_ADAPT
+#endif // !CONFIG_LV_MAP
+
+#if CONFIG_LOOP_RESTORATION
+static void encode_restoration_mode(AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ int p;
+ RestorationInfo *rsi = &cm->rst_info[0];
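+  // The two bits below form a code for plane 0's frame_restoration_type:
+  //   RESTORE_NONE -> 00, RESTORE_WIENER -> 10, RESTORE_SGRPROJ -> 11,
+  //   RESTORE_SWITCHABLE -> 01.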
+ switch (rsi->frame_restoration_type) {
+ case RESTORE_NONE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_WIENER:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_SGRPROJ:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 1);
+ break;
+ case RESTORE_SWITCHABLE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 1);
+ break;
+ default: assert(0);
+ }
+ for (p = 1; p < MAX_MB_PLANE; ++p) {
+ rsi = &cm->rst_info[p];
+ switch (rsi->frame_restoration_type) {
+ case RESTORE_NONE: aom_wb_write_bit(wb, 0); break;
+ case RESTORE_WIENER: aom_wb_write_bit(wb, 1); break;
+ default: assert(0);
+ }
+ }
+ if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+ rsi = &cm->rst_info[0];
+ aom_wb_write_bit(wb, rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX);
+ if (rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX) {
+ aom_wb_write_bit(
+ wb, rsi->restoration_tilesize != (RESTORATION_TILESIZE_MAX >> 1));
+ }
+ }
+}
+
+static void write_wiener_filter(WienerInfo *wiener_info,
+ WienerInfo *ref_wiener_info, aom_writer *wb) {
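+  // Each tap is coded with a finite subexponential code referenced to the
+  // corresponding tap of the previously coded filter; the trailing memcpy
+  // then makes this filter the reference for the next one.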
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+ memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
+}
+
+static void write_sgrproj_filter(SgrprojInfo *sgrproj_info,
+ SgrprojInfo *ref_sgrproj_info,
+ aom_writer *wb) {
+ aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS);
+ aom_write_primitive_refsubexpfin(wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1,
+ SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+ sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+ aom_write_primitive_refsubexpfin(wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1,
+ SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+ sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+ memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
+}
+
+static void encode_restoration(AV1_COMMON *cm, aom_writer *wb) {
+ int i, p;
+ const int ntiles = av1_get_rest_ntiles(cm->width, cm->height,
+ cm->rst_info[0].restoration_tilesize,
+ NULL, NULL, NULL, NULL);
+ WienerInfo ref_wiener_info;
+ SgrprojInfo ref_sgrproj_info;
+ set_default_wiener(&ref_wiener_info);
+ set_default_sgrproj(&ref_sgrproj_info);
+ const int ntiles_uv = av1_get_rest_ntiles(
+ ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x),
+ ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y),
+ cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL, NULL);
+ RestorationInfo *rsi = &cm->rst_info[0];
+ if (rsi->frame_restoration_type != RESTORE_NONE) {
+ if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
+ // RESTORE_SWITCHABLE
+ for (i = 0; i < ntiles; ++i) {
+ av1_write_token(
+ wb, av1_switchable_restore_tree, cm->fc->switchable_restore_prob,
+ &switchable_restore_encodings[rsi->restoration_type[i]]);
+ if (rsi->restoration_type[i] == RESTORE_WIENER) {
+ write_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, wb);
+ } else if (rsi->restoration_type[i] == RESTORE_SGRPROJ) {
+ write_sgrproj_filter(&rsi->sgrproj_info[i], &ref_sgrproj_info, wb);
+ }
+ }
+ } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
+ for (i = 0; i < ntiles; ++i) {
+ aom_write(wb, rsi->restoration_type[i] != RESTORE_NONE,
+ RESTORE_NONE_WIENER_PROB);
+ if (rsi->restoration_type[i] != RESTORE_NONE) {
+ write_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, wb);
+ }
+ }
+ } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
+ for (i = 0; i < ntiles; ++i) {
+ aom_write(wb, rsi->restoration_type[i] != RESTORE_NONE,
+ RESTORE_NONE_SGRPROJ_PROB);
+ if (rsi->restoration_type[i] != RESTORE_NONE) {
+ write_sgrproj_filter(&rsi->sgrproj_info[i], &ref_sgrproj_info, wb);
+ }
+ }
+ }
+ }
+ for (p = 1; p < MAX_MB_PLANE; ++p) {
+ set_default_wiener(&ref_wiener_info);
+ rsi = &cm->rst_info[p];
+ if (rsi->frame_restoration_type == RESTORE_WIENER) {
+ for (i = 0; i < ntiles_uv; ++i) {
+ if (ntiles_uv > 1)
+ aom_write(wb, rsi->restoration_type[i] != RESTORE_NONE,
+ RESTORE_NONE_WIENER_PROB);
+ if (rsi->restoration_type[i] != RESTORE_NONE) {
+ write_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, wb);
+ }
+ }
+ } else if (rsi->frame_restoration_type != RESTORE_NONE) {
+ assert(0);
+ }
+ }
+}
+#endif // CONFIG_LOOP_RESTORATION
+
+static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+ int i;
+ struct loopfilter *lf = &cm->lf;
+
+ // Encode the loop filter level and type
+ aom_wb_write_literal(wb, lf->filter_level, 6);
+ aom_wb_write_literal(wb, lf->sharpness_level, 3);
+
+ // Write out loop filter deltas applied at the MB level based on mode or
+ // ref frame (if they are enabled).
+ aom_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+
+ if (lf->mode_ref_delta_enabled) {
+ aom_wb_write_bit(wb, lf->mode_ref_delta_update);
+ if (lf->mode_ref_delta_update) {
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; i++) {
+ const int delta = lf->ref_deltas[i];
+ const int changed = delta != lf->last_ref_deltas[i];
+ aom_wb_write_bit(wb, changed);
+ if (changed) {
+ lf->last_ref_deltas[i] = delta;
+ aom_wb_write_inv_signed_literal(wb, delta, 6);
+ }
+ }
+
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+ const int delta = lf->mode_deltas[i];
+ const int changed = delta != lf->last_mode_deltas[i];
+ aom_wb_write_bit(wb, changed);
+ if (changed) {
+ lf->last_mode_deltas[i] = delta;
+ aom_wb_write_inv_signed_literal(wb, delta, 6);
+ }
+ }
+ }
+ }
+}
+
+#if CONFIG_CDEF
+static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+ int i;
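+  // Damping values are coded as offsets from their minima, which constrains
+  // cdef_dering_damping to [5, 6] (1 bit) and cdef_clpf_damping to [3, 6]
+  // (2 bits).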
+ aom_wb_write_literal(wb, cm->cdef_dering_damping - 5, 1);
+ aom_wb_write_literal(wb, cm->cdef_clpf_damping - 3, 2);
+ aom_wb_write_literal(wb, cm->cdef_bits, 2);
+ for (i = 0; i < cm->nb_cdef_strengths; i++) {
+ aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS);
+ aom_wb_write_literal(wb, cm->cdef_uv_strengths[i], CDEF_STRENGTH_BITS);
+ }
+}
+#endif
+
+static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) {
+ if (delta_q != 0) {
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_inv_signed_literal(wb, delta_q, 6);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ }
+}
+
+static void encode_quantization(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+ write_delta_q(wb, cm->y_dc_delta_q);
+ write_delta_q(wb, cm->uv_dc_delta_q);
+ write_delta_q(wb, cm->uv_ac_delta_q);
+#if CONFIG_AOM_QM
+ aom_wb_write_bit(wb, cm->using_qmatrix);
+ if (cm->using_qmatrix) {
+ aom_wb_write_literal(wb, cm->min_qmlevel, QM_LEVEL_BITS);
+ aom_wb_write_literal(wb, cm->max_qmlevel, QM_LEVEL_BITS);
+ }
+#endif
+}
+
+static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
+ struct aom_write_bit_buffer *wb) {
+ int i, j;
+ const struct segmentation *seg = &cm->seg;
+
+ aom_wb_write_bit(wb, seg->enabled);
+ if (!seg->enabled) return;
+
+ // Segmentation map
+ if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
+ aom_wb_write_bit(wb, seg->update_map);
+ } else {
+ assert(seg->update_map == 1);
+ }
+ if (seg->update_map) {
+ // Select the coding strategy (temporal or spatial)
+ av1_choose_segmap_coding_method(cm, xd);
+
+ // Write out the chosen coding method.
+ if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
+ aom_wb_write_bit(wb, seg->temporal_update);
+ } else {
+ assert(seg->temporal_update == 0);
+ }
+ }
+
+ // Segmentation data
+ aom_wb_write_bit(wb, seg->update_data);
+ if (seg->update_data) {
+ aom_wb_write_bit(wb, seg->abs_delta);
+
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ for (j = 0; j < SEG_LVL_MAX; j++) {
+ const int active = segfeature_active(seg, i, j);
+ aom_wb_write_bit(wb, active);
+ if (active) {
+ const int data = get_segdata(seg, i, j);
+ const int data_max = av1_seg_feature_data_max(j);
+
+ if (av1_is_segfeature_signed(j)) {
+ encode_unsigned_max(wb, abs(data), data_max);
+ aom_wb_write_bit(wb, data < 0);
+ } else {
+ encode_unsigned_max(wb, data, data_max);
+ }
+ }
+ }
+ }
+ }
+}
+
+#if !CONFIG_EC_ADAPT
+static void update_seg_probs(AV1_COMP *cpi, aom_writer *w) {
+ AV1_COMMON *cm = &cpi->common;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+
+ if (!cm->seg.enabled || !cm->seg.update_map) return;
+
+ if (cm->seg.temporal_update) {
+ int i;
+
+ for (i = 0; i < PREDICTION_PROBS; i++)
+ av1_cond_prob_diff_update(w, &cm->fc->seg.pred_probs[i],
+ cm->counts.seg.pred[i], probwt);
+
+ prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs,
+ cm->counts.seg.tree_mispred, MAX_SEGMENTS, probwt, w);
+ } else {
+ prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs,
+ cm->counts.seg.tree_total, MAX_SEGMENTS, probwt, w);
+ }
+}
+#endif
+
+static void write_tx_mode(AV1_COMMON *cm, MACROBLOCKD *xd, TX_MODE *mode,
+ struct aom_write_bit_buffer *wb) {
+ int i, all_lossless = 1;
+
+ if (cm->seg.enabled) {
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ if (!xd->lossless[i]) {
+ all_lossless = 0;
+ break;
+ }
+ }
+ } else {
+ all_lossless = xd->lossless[0];
+ }
+ if (all_lossless) {
+ *mode = ONLY_4X4;
+ return;
+ }
+#if CONFIG_TX64X64
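+  // One bit signals TX_MODE_SELECT; otherwise the mode is written capped at
+  // ALLOW_32X32 in a 2-bit literal, with one more bit distinguishing
+  // ALLOW_64X64 when the cap applies.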
+ aom_wb_write_bit(wb, *mode == TX_MODE_SELECT);
+ if (*mode != TX_MODE_SELECT) {
+ aom_wb_write_literal(wb, AOMMIN(*mode, ALLOW_32X32), 2);
+ if (*mode >= ALLOW_32X32) aom_wb_write_bit(wb, *mode == ALLOW_64X64);
+ }
+#else
+ aom_wb_write_bit(wb, *mode == TX_MODE_SELECT);
+ if (*mode != TX_MODE_SELECT) aom_wb_write_literal(wb, *mode, 2);
+#endif // CONFIG_TX64X64
+}
+
+#if !CONFIG_EC_ADAPT
+static void update_txfm_probs(AV1_COMMON *cm, aom_writer *w,
+ FRAME_COUNTS *counts) {
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+ if (cm->tx_mode == TX_MODE_SELECT) {
+ int i, j;
+ for (i = 0; i < MAX_TX_DEPTH; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ prob_diff_update(av1_tx_size_tree[i], cm->fc->tx_size_probs[i][j],
+ counts->tx_size[i][j], i + 2, probwt, w);
+ }
+}
+#endif
+
+static void write_frame_interp_filter(InterpFilter filter,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_bit(wb, filter == SWITCHABLE);
+ if (filter != SWITCHABLE)
+ aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
+}
+
+static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) {
+ if (cm->interp_filter == SWITCHABLE) {
+ // Check to see if only one of the filters is actually used
+ int count[SWITCHABLE_FILTERS];
+ int i, j, c = 0;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ count[i] = 0;
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ count[i] += counts->switchable_interp[j][i];
+ c += (count[i] > 0);
+ }
+ if (c == 1) {
+ // Only one filter is used. So set the filter at frame level
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ if (count[i]) {
+#if CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION)
+#if CONFIG_WARPED_MOTION
+ if (i == EIGHTTAP_REGULAR || WARP_WM_NEIGHBORS_WITH_OBMC)
+#else
+ if (i == EIGHTTAP_REGULAR || WARP_GM_NEIGHBORS_WITH_OBMC)
+#endif // CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION)
+ cm->interp_filter = i;
+ break;
+ }
+ }
+ }
+ }
+}
+
+static void write_tile_info(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+#if CONFIG_EXT_TILE
+ const int tile_width =
+ ALIGN_POWER_OF_TWO(cm->tile_width, cm->mib_size_log2) >>
+ cm->mib_size_log2;
+ const int tile_height =
+ ALIGN_POWER_OF_TWO(cm->tile_height, cm->mib_size_log2) >>
+ cm->mib_size_log2;
+
+ assert(tile_width > 0);
+ assert(tile_height > 0);
+
+ aom_wb_write_literal(wb, cm->tile_encoding_mode, 1);
+
+// Write the tile sizes
+#if CONFIG_EXT_PARTITION
+ if (cm->sb_size == BLOCK_128X128) {
+ assert(tile_width <= 32);
+ assert(tile_height <= 32);
+ aom_wb_write_literal(wb, tile_width - 1, 5);
+ aom_wb_write_literal(wb, tile_height - 1, 5);
+ } else
+#endif // CONFIG_EXT_PARTITION
+ {
+ assert(tile_width <= 64);
+ assert(tile_height <= 64);
+ aom_wb_write_literal(wb, tile_width - 1, 6);
+ aom_wb_write_literal(wb, tile_height - 1, 6);
+ }
+#else
+ int min_log2_tile_cols, max_log2_tile_cols, ones;
+ av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+
+ // columns
+ ones = cm->log2_tile_cols - min_log2_tile_cols;
+ while (ones--) aom_wb_write_bit(wb, 1);
+
+ if (cm->log2_tile_cols < max_log2_tile_cols) aom_wb_write_bit(wb, 0);
+
+ // rows
+ aom_wb_write_bit(wb, cm->log2_tile_rows != 0);
+ if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->log2_tile_rows != 1);
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_DEPENDENT_HORZTILES
+ if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->dependent_horz_tiles);
+#endif
+
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ aom_wb_write_bit(wb, cm->loop_filter_across_tiles_enabled);
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+}
+
+static int get_refresh_mask(AV1_COMP *cpi) {
+ int refresh_mask = 0;
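+  // refresh_mask is a bitmask over the reference slots: bit i set means
+  // slot i is overwritten with the newly coded frame.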
+
+#if CONFIG_EXT_REFS
+  // NOTE(zoeliu): When LAST_FRAME is to be refreshed, the decoder is
+  // notified to refresh LAST3_FRAME instead, and the virtual indexes for all
+  // three LAST reference frames are then updated accordingly, i.e.:
+  // (1) The original virtual index for LAST3_FRAME becomes the new virtual
+  //     index for LAST_FRAME; and
+  // (2) The original virtual indexes for LAST_FRAME and LAST2_FRAME are
+  //     shifted to become the new virtual indexes for LAST2_FRAME and
+  //     LAST3_FRAME.
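+  // For example, with virtual indexes (LAST, LAST2, LAST3) = (0, 1, 2), the
+  // refreshed frame is written into slot 2, which then serves as the new
+  // LAST, giving the mapping (2, 0, 1).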
+ refresh_mask |=
+ (cpi->refresh_last_frame << cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]);
+ if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) {
+ // We have swapped the virtual indices
+ refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->arf_map[0]);
+ } else {
+ refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->bwd_fb_idx);
+ }
+#else
+ refresh_mask |= (cpi->refresh_last_frame << cpi->lst_fb_idx);
+#endif // CONFIG_EXT_REFS
+
+ if (av1_preserve_existing_gf(cpi)) {
+ // We have decided to preserve the previously existing golden frame as our
+ // new ARF frame. However, in the short term we leave it in the GF slot and,
+ // if we're updating the GF with the current decoded frame, we save it
+ // instead to the ARF slot.
+ // Later, in the function av1_encoder.c:av1_update_reference_frames() we
+ // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it
+ // there so that it can be done outside of the recode loop.
+ // Note: This is highly specific to the use of ARF as a forward reference,
+ // and this needs to be generalized as other uses are implemented
+ // (like RTC/temporal scalability).
+ return refresh_mask | (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+ } else {
+#if CONFIG_EXT_REFS
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int arf_idx = cpi->arf_map[gf_group->arf_update_idx[gf_group->index]];
+#else
+ int arf_idx = cpi->alt_fb_idx;
+ if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ arf_idx = gf_group->arf_update_idx[gf_group->index];
+ }
+#endif // CONFIG_EXT_REFS
+ return refresh_mask | (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
+ (cpi->refresh_alt_ref_frame << arf_idx);
+ }
+}
+
+#if CONFIG_EXT_TILE
+static INLINE int find_identical_tile(
+ const int tile_row, const int tile_col,
+ TileBufferEnc (*const tile_buffers)[1024]) {
+ const MV32 candidate_offset[1] = { { 1, 0 } };
+ const uint8_t *const cur_tile_data =
+ tile_buffers[tile_row][tile_col].data + 4;
+ const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size;
+
+ int i;
+
+ if (tile_row == 0) return 0;
+
+  // TODO(yunqingwang): For now, only the above tile is checked and used.
+  // More candidates, such as the left tile, can be added later.
+ for (i = 0; i < 1; i++) {
+ int row_offset = candidate_offset[0].row;
+ int col_offset = candidate_offset[0].col;
+ int row = tile_row - row_offset;
+ int col = tile_col - col_offset;
+ uint8_t tile_hdr;
+ const uint8_t *tile_data;
+ TileBufferEnc *candidate;
+
+ if (row < 0 || col < 0) continue;
+
+ tile_hdr = *(tile_buffers[row][col].data);
+
+    // Read out the tcm bit, which marks the candidate as a copy tile
+ if ((tile_hdr >> 7) == 1) {
+ // The candidate is a copy tile itself
+ row_offset += tile_hdr & 0x7f;
+ row = tile_row - row_offset;
+ }
+
+ candidate = &tile_buffers[row][col];
+
+ if (row_offset >= 128 || candidate->size != cur_tile_size) continue;
+
+ tile_data = candidate->data + 4;
+
+ if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0) continue;
+
+ // Identical tile found
+ assert(row_offset > 0);
+ return row_offset;
+ }
+
+ // No identical tile found
+ return 0;
+}
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_TILE_GROUPS
+static uint32_t write_tiles(AV1_COMP *const cpi,
+ struct aom_write_bit_buffer *wb,
+ unsigned int *max_tile_size,
+ unsigned int *max_tile_col_size) {
+#else
+static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
+ unsigned int *max_tile_size,
+ unsigned int *max_tile_col_size) {
+#endif
+ const AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_ANS
+ struct BufAnsCoder *buf_ans = &cpi->buf_ans;
+#else
+ aom_writer mode_bc;
+#endif // CONFIG_ANS
+ int tile_row, tile_col;
+ TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
+ TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
+ uint32_t total_size = 0;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ unsigned int tile_size = 0;
+#if CONFIG_TILE_GROUPS
+ const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
+ const int have_tiles = n_log2_tiles > 0;
+ uint32_t comp_hdr_size;
+ // Fixed size tile groups for the moment
+ const int num_tg_hdrs = cm->num_tg;
+ const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
+ int tile_count = 0;
+ int tg_count = 1;
+ int tile_size_bytes = 4;
+ int tile_col_size_bytes;
+ uint32_t uncompressed_hdr_size = 0;
+ uint8_t *dst = NULL;
+ struct aom_write_bit_buffer comp_hdr_len_wb;
+ struct aom_write_bit_buffer tg_params_wb;
+ struct aom_write_bit_buffer tile_size_bytes_wb;
+ uint32_t saved_offset;
+ int mtu_size = cpi->oxcf.mtu;
+ int curr_tg_data_size = 0;
+ int hdr_size;
+#endif
+#if CONFIG_EXT_TILE
+ const int have_tiles = tile_cols * tile_rows > 1;
+#endif // CONFIG_EXT_TILE
+
+ *max_tile_size = 0;
+ *max_tile_col_size = 0;
+
+// All tile size fields are written as 4 bytes. A call to remux_tiles will
+// later compact the data if smaller headers are adequate.
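+// For example, if every tile size fits in one byte, remux_tiles can shrink
+// each size field to a single byte, with the chosen field width signalled in
+// the header via tile_size_bytes.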
+
+#if CONFIG_EXT_TILE
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileInfo tile_info;
+ const int is_last_col = (tile_col == tile_cols - 1);
+ const uint32_t col_offset = total_size;
+
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+ // The last column does not have a column header
+ if (!is_last_col) total_size += 4;
+
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+ const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
+ const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
+ const int data_offset = have_tiles ? 4 : 0;
+#if CONFIG_EC_ADAPT
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+#endif
+ av1_tile_set_row(&tile_info, cm, tile_row);
+
+ buf->data = dst + total_size;
+
+      // If CONFIG_EXT_TILE is 1, every tile in the row has a header, even
+      // the last one, unless no tiling is used at all.
+ total_size += data_offset;
+#if CONFIG_EC_ADAPT
+ // Initialise tile context from the frame context
+ this_tile->tctx = *cm->fc;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+#endif
+#if CONFIG_PVQ
+ cpi->td.mb.pvq_q = &this_tile->pvq_q;
+ cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
+#endif // CONFIG_PVQ
+#if !CONFIG_ANS
+ aom_start_encode(&mode_bc, buf->data + data_offset);
+ write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+ assert(tok == tok_end);
+ aom_stop_encode(&mode_bc);
+ tile_size = mode_bc.pos;
+#else
+ buf_ans_write_init(buf_ans, buf->data + data_offset);
+ write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
+ assert(tok == tok_end);
+ aom_buf_ans_flush(buf_ans);
+ tile_size = buf_ans_write_end(buf_ans);
+#endif // !CONFIG_ANS
+#if CONFIG_PVQ
+ cpi->td.mb.pvq_q = NULL;
+#endif
+ buf->size = tile_size;
+
+ // Record the maximum tile size we see, so we can compact headers later.
+ *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+
+ if (have_tiles) {
+ // tile header: size of this tile, or copy offset
+ uint32_t tile_header = tile_size;
+
+        // If tile_encoding_mode is 1 (i.e. TILE_VR), check whether this
+        // tile is a copy tile.
+        // Copy tiles are very unlikely on key frames, so skip the search
+        // there to avoid unnecessary work.
+ if (cm->frame_type != KEY_FRAME && cm->tile_encoding_mode) {
+          const int identical_tile_offset =
+              find_identical_tile(tile_row, tile_col, tile_buffers);
+
+          if (identical_tile_offset > 0) {
+            tile_size = 0;
+            tile_header = identical_tile_offset | 0x80;
+ tile_header <<= 24;
+ }
+ }
+
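+        // For a copy tile, the top bit of the 32-bit header is set and the
+        // row offset to the identical tile occupies the adjacent 7 bits; a
+        // normal tile stores its size instead.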
+ mem_put_le32(buf->data, tile_header);
+ }
+
+ total_size += tile_size;
+ }
+
+ if (!is_last_col) {
+ uint32_t col_size = total_size - col_offset - 4;
+ mem_put_le32(dst + col_offset, col_size);
+
+      // If this is not the final packing, record the maximum tile column
+      // size we see; otherwise, check whether the tile size is out of range.
+ *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
+ }
+ }
+#else
+#if CONFIG_TILE_GROUPS
+ write_uncompressed_header(cpi, wb);
+
+#if CONFIG_EXT_REFS
+ if (cm->show_existing_frame) {
+ total_size = aom_wb_bytes_written(wb);
+ return (uint32_t)total_size;
+ }
+#endif // CONFIG_EXT_REFS
+
+ // Write the tile length code
+ tile_size_bytes_wb = *wb;
+ aom_wb_write_literal(wb, 3, 2);
+
+ /* Write a placeholder for the number of tiles in each tile group */
+ tg_params_wb = *wb;
+ saved_offset = wb->bit_offset;
+ if (have_tiles) {
+ aom_wb_overwrite_literal(wb, 3, n_log2_tiles);
+ aom_wb_overwrite_literal(wb, (1 << n_log2_tiles) - 1, n_log2_tiles);
+ }
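+  // tg_params_wb remembers this bit position so the per-group tile counts
+  // can later be patched in with aom_wb_overwrite_literal.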
+
+ /* Write a placeholder for the compressed header length */
+ comp_hdr_len_wb = *wb;
+ aom_wb_write_literal(wb, 0, 16);
+
+ uncompressed_hdr_size = aom_wb_bytes_written(wb);
+ dst = wb->bit_buffer;
+ comp_hdr_size = write_compressed_header(cpi, dst + uncompressed_hdr_size);
+ aom_wb_overwrite_literal(&comp_hdr_len_wb, (int)(comp_hdr_size), 16);
+ hdr_size = uncompressed_hdr_size + comp_hdr_size;
+ total_size += hdr_size;
+#endif
+
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileInfo tile_info;
+ const int is_last_row = (tile_row == tile_rows - 1);
+ av1_tile_set_row(&tile_info, cm, tile_row);
+
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+#if CONFIG_PVQ || CONFIG_EC_ADAPT
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+#endif
+ const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
+ const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
+ const int is_last_col = (tile_col == tile_cols - 1);
+ const int is_last_tile = is_last_col && is_last_row;
+#if !CONFIG_TILE_GROUPS
+ (void)tile_idx;
+#else
+
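+      // Start a new tile group when the fixed group size is exceeded (no
+      // MTU configured) or when the accumulated payload has reached the MTU.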
+ if ((!mtu_size && tile_count > tg_size) ||
+ (mtu_size && tile_count && curr_tg_data_size >= mtu_size)) {
+ // New tile group
+ tg_count++;
+ // We've exceeded the packet size
+ if (tile_count > 1) {
+ /* The last tile exceeded the packet size. The tile group size
+ should therefore be tile_count-1.
+ Move the last tile and insert headers before it
+ */
+ uint32_t old_total_size = total_size - tile_size - 4;
+ memmove(dst + old_total_size + hdr_size, dst + old_total_size,
+ (tile_size + 4) * sizeof(uint8_t));
+ // Copy uncompressed header
+ memmove(dst + old_total_size, dst,
+ uncompressed_hdr_size * sizeof(uint8_t));
+ // Write the number of tiles in the group into the last uncompressed
+ // header before the one we've just inserted
+ aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count,
+ n_log2_tiles);
+ aom_wb_overwrite_literal(&tg_params_wb, tile_count - 2, n_log2_tiles);
+ // Update the pointer to the last TG params
+ tg_params_wb.bit_offset = saved_offset + 8 * old_total_size;
+ // Copy compressed header
+ memmove(dst + old_total_size + uncompressed_hdr_size,
+ dst + uncompressed_hdr_size, comp_hdr_size * sizeof(uint8_t));
+ total_size += hdr_size;
+ tile_count = 1;
+ curr_tg_data_size = hdr_size + tile_size + 4;
+
+ } else {
+ // We exceeded the packet size in just one tile
+ // Copy uncompressed header
+ memmove(dst + total_size, dst,
+ uncompressed_hdr_size * sizeof(uint8_t));
+ // Write the number of tiles in the group into the last uncompressed
+ // header
+ aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count,
+ n_log2_tiles);
+ aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
+ tg_params_wb.bit_offset = saved_offset + 8 * total_size;
+ // Copy compressed header
+ memmove(dst + total_size + uncompressed_hdr_size,
+ dst + uncompressed_hdr_size, comp_hdr_size * sizeof(uint8_t));
+ total_size += hdr_size;
+ tile_count = 0;
+ curr_tg_data_size = hdr_size;
+ }
+ }
+ tile_count++;
+#endif
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+#if CONFIG_DEPENDENT_HORZTILES && CONFIG_TILE_GROUPS
+ av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col);
+#endif
+ buf->data = dst + total_size;
+
+ // The last tile does not have a header.
+ if (!is_last_tile) total_size += 4;
+
+#if CONFIG_EC_ADAPT
+ // Initialise tile context from the frame context
+ this_tile->tctx = *cm->fc;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+#endif
+#if CONFIG_PVQ
+ cpi->td.mb.pvq_q = &this_tile->pvq_q;
+ cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
+#endif // CONFIG_PVQ
+#if CONFIG_ANS
+ buf_ans_write_init(buf_ans, dst + total_size);
+ write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
+ assert(tok == tok_end);
+ aom_buf_ans_flush(buf_ans);
+ tile_size = buf_ans_write_end(buf_ans);
+#else
+ aom_start_encode(&mode_bc, dst + total_size);
+ write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+#if !CONFIG_LV_MAP
+ assert(tok == tok_end);
+#endif // !CONFIG_LV_MAP
+ aom_stop_encode(&mode_bc);
+ tile_size = mode_bc.pos;
+#endif // CONFIG_ANS
+#if CONFIG_PVQ
+ cpi->td.mb.pvq_q = NULL;
+#endif
+
+ assert(tile_size > 0);
+
+#if CONFIG_TILE_GROUPS
+ curr_tg_data_size += tile_size + 4;
+#endif
+ buf->size = tile_size;
+
+ if (!is_last_tile) {
+ *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+ // size of this tile
+ mem_put_le32(buf->data, tile_size);
+ }
+
+ total_size += tile_size;
+ }
+ }
+#if CONFIG_TILE_GROUPS
+ // Write the final tile group size
+ if (n_log2_tiles) {
+ aom_wb_overwrite_literal(&tg_params_wb, (1 << n_log2_tiles) - tile_count,
+ n_log2_tiles);
+ aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
+ }
+  // Remux if possible. TODO(Thomas Davies): do this for more than one
+  // tile group.
+ if (have_tiles && tg_count == 1) {
+ int data_size = total_size - (uncompressed_hdr_size + comp_hdr_size);
+ data_size = remux_tiles(cm, dst + uncompressed_hdr_size + comp_hdr_size,
+ data_size, *max_tile_size, *max_tile_col_size,
+ &tile_size_bytes, &tile_col_size_bytes);
+ total_size = data_size + uncompressed_hdr_size + comp_hdr_size;
+ aom_wb_overwrite_literal(&tile_size_bytes_wb, tile_size_bytes - 1, 2);
+ }
+
+#endif
+#endif // CONFIG_EXT_TILE
+ return (uint32_t)total_size;
+}
+
+static void write_render_size(const AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ const int scaling_active =
+ cm->width != cm->render_width || cm->height != cm->render_height;
+ aom_wb_write_bit(wb, scaling_active);
+ if (scaling_active) {
+ aom_wb_write_literal(wb, cm->render_width - 1, 16);
+ aom_wb_write_literal(wb, cm->render_height - 1, 16);
+ }
+}
+
+#if CONFIG_FRAME_SUPERRES
+static void write_superres_scale(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ // This scaling and frame superres are probably incompatible
+ assert(cm->width == cm->render_width && cm->height == cm->render_height);
+
+  // The first bit signals whether to scale or not
+ if (cm->superres_scale_numerator == SUPERRES_SCALE_DENOMINATOR) {
+ aom_wb_write_bit(wb, 0); // no scaling
+ } else {
+ aom_wb_write_bit(wb, 1); // scaling, write scale factor
+ // TODO(afergs): write factor to the compressed header instead
+ aom_wb_write_literal(
+ wb, cm->superres_scale_numerator - SUPERRES_SCALE_NUMERATOR_MIN,
+ SUPERRES_SCALE_BITS);
+ }
+}
+#endif // CONFIG_FRAME_SUPERRES
+
+static void write_frame_size(const AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+#if CONFIG_FRAME_SUPERRES
+ // If SUPERRES scaling is happening, write the full resolution instead of the
+ // downscaled resolution. The decoder will reduce this resolution itself.
+ if (cm->superres_scale_numerator != SUPERRES_SCALE_DENOMINATOR) {
+ aom_wb_write_literal(wb, cm->superres_width - 1, 16);
+ aom_wb_write_literal(wb, cm->superres_height - 1, 16);
+ } else {
+#endif // CONFIG_FRAME_SUPERRES
+ aom_wb_write_literal(wb, cm->width - 1, 16);
+ aom_wb_write_literal(wb, cm->height - 1, 16);
+#if CONFIG_FRAME_SUPERRES
+ }
+#endif // CONFIG_FRAME_SUPERRES
+
+ // TODO(afergs): Also write something different to render_size?
+ // When superres scales, they'll be almost guaranteed to be
+ // different on the other side.
+ write_render_size(cm, wb);
+#if CONFIG_FRAME_SUPERRES
+ write_superres_scale(cm, wb);
+#endif // CONFIG_FRAME_SUPERRES
+}
+
+static void write_frame_size_with_refs(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ int found = 0;
+
+ MV_REFERENCE_FRAME ref_frame;
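+  // One bit per reference, in order: a 1 means the current frame reuses
+  // that reference's frame and render size, and no explicit size follows.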
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
+
+ if (cfg != NULL) {
+ found =
+ cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height;
+ found &= cm->render_width == cfg->render_width &&
+ cm->render_height == cfg->render_height;
+ }
+ aom_wb_write_bit(wb, found);
+ if (found) {
+ break;
+ }
+ }
+
+ if (!found) {
+ write_frame_size(cm, wb);
+ }
+}
+
+static void write_sync_code(struct aom_write_bit_buffer *wb) {
+ aom_wb_write_literal(wb, AV1_SYNC_CODE_0, 8);
+ aom_wb_write_literal(wb, AV1_SYNC_CODE_1, 8);
+ aom_wb_write_literal(wb, AV1_SYNC_CODE_2, 8);
+}
+
+static void write_profile(BITSTREAM_PROFILE profile,
+ struct aom_write_bit_buffer *wb) {
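+  // The literals 0, 2, 1 and 6 give the prefix code 00, 10, 01, 110 (bits
+  // go out MSB first), letting the decoder tell the 2-bit profiles from
+  // PROFILE_3.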
+ switch (profile) {
+ case PROFILE_0: aom_wb_write_literal(wb, 0, 2); break;
+ case PROFILE_1: aom_wb_write_literal(wb, 2, 2); break;
+ case PROFILE_2: aom_wb_write_literal(wb, 1, 2); break;
+ case PROFILE_3: aom_wb_write_literal(wb, 6, 3); break;
+ default: assert(0);
+ }
+}
+
+static void write_bitdepth_colorspace_sampling(
+ AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+ if (cm->profile >= PROFILE_2) {
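+    // Profiles 2 and 3 carry high bit depth: a single bit distinguishes
+    // 10-bit (0) from deeper (in practice 12-bit) content (1).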
+ assert(cm->bit_depth > AOM_BITS_8);
+ aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_10 ? 0 : 1);
+ }
+ aom_wb_write_literal(wb, cm->color_space, 3);
+ if (cm->color_space != AOM_CS_SRGB) {
+ // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+ aom_wb_write_bit(wb, cm->color_range);
+ if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
+ assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
+ aom_wb_write_bit(wb, cm->subsampling_x);
+ aom_wb_write_bit(wb, cm->subsampling_y);
+ aom_wb_write_bit(wb, 0); // unused
+ } else {
+ assert(cm->subsampling_x == 1 && cm->subsampling_y == 1);
+ }
+ } else {
+ assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3);
+ aom_wb_write_bit(wb, 0); // unused
+ }
+}
+
+#if CONFIG_REFERENCE_BUFFER
+void write_sequence_header(SequenceHeader *seq_params) {
+ /* Placeholder for actually writing to the bitstream */
+ seq_params->frame_id_numbers_present_flag = FRAME_ID_NUMBERS_PRESENT_FLAG;
+ seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7;
+ seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2;
+}
+#endif
+
+static void write_uncompressed_header(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+#if CONFIG_REFERENCE_BUFFER
+ /* TODO: Move outside frame loop or inside key-frame branch */
+ write_sequence_header(&cpi->seq_params);
+#endif
+
+ aom_wb_write_literal(wb, AOM_FRAME_MARKER, 2);
+
+ write_profile(cm->profile, wb);
+
+#if CONFIG_EXT_REFS
+  // NOTE: By default, all coded frames are to be used as a reference.
+ cm->is_reference_frame = 1;
+
+ if (cm->show_existing_frame) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+ if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer %d does not contain a reconstructed frame",
+ frame_to_show);
+ }
+ ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+
+ aom_wb_write_bit(wb, 1); // show_existing_frame
+ aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+
+#if CONFIG_REFERENCE_BUFFER
+ if (cpi->seq_params.frame_id_numbers_present_flag) {
+ int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7;
+ int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+ aom_wb_write_literal(wb, display_frame_id, frame_id_len);
+      /* Add a zero byte to prevent emulation of the superframe marker */
+      /* Same logic as when terminating the entropy coder */
+      /* Consider having this logic in only one place */
+ aom_wb_write_literal(wb, 0, 8);
+ }
+#endif
+
+ return;
+ } else {
+#endif // CONFIG_EXT_REFS
+ aom_wb_write_bit(wb, 0); // show_existing_frame
+#if CONFIG_EXT_REFS
+ }
+#endif // CONFIG_EXT_REFS
+
+ aom_wb_write_bit(wb, cm->frame_type);
+ aom_wb_write_bit(wb, cm->show_frame);
+ aom_wb_write_bit(wb, cm->error_resilient_mode);
+
+#if CONFIG_REFERENCE_BUFFER
+ cm->invalid_delta_frame_id_minus1 = 0;
+ if (cpi->seq_params.frame_id_numbers_present_flag) {
+ int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7;
+ aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
+ }
+#endif
+
+#if CONFIG_FRAME_SUPERRES
+ // TODO(afergs): Remove - this is just to stop superres from breaking
+ cm->superres_scale_numerator = SUPERRES_SCALE_DENOMINATOR;
+#endif // CONFIG_FRAME_SUPERRES
+
+ if (cm->frame_type == KEY_FRAME) {
+ write_sync_code(wb);
+ write_bitdepth_colorspace_sampling(cm, wb);
+ write_frame_size(cm, wb);
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ assert(cpi->common.ans_window_size_log2 >= 8);
+ assert(cpi->common.ans_window_size_log2 < 24);
+ aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4);
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+#if CONFIG_PALETTE
+ aom_wb_write_bit(wb, cm->allow_screen_content_tools);
+#endif // CONFIG_PALETTE
+ } else {
+ if (!cm->show_frame) aom_wb_write_bit(wb, cm->intra_only);
+#if CONFIG_PALETTE
+ if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools);
+#endif // CONFIG_PALETTE
+ if (!cm->error_resilient_mode) {
+ if (cm->intra_only) {
+ aom_wb_write_bit(wb,
+ cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
+ } else {
+ aom_wb_write_bit(wb,
+ cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE);
+ if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE)
+ aom_wb_write_bit(wb,
+ cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
+ }
+ }
+
+#if CONFIG_EXT_REFS
+ cpi->refresh_frame_mask = get_refresh_mask(cpi);
+#endif // CONFIG_EXT_REFS
+
+ if (cm->intra_only) {
+ write_sync_code(wb);
+ write_bitdepth_colorspace_sampling(cm, wb);
+
+#if CONFIG_EXT_REFS
+ aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+#else
+ aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+#endif // CONFIG_EXT_REFS
+ write_frame_size(cm, wb);
+
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ assert(cpi->common.ans_window_size_log2 >= 8);
+ assert(cpi->common.ans_window_size_log2 < 24);
+ aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4);
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+ } else {
+ MV_REFERENCE_FRAME ref_frame;
+
+#if CONFIG_EXT_REFS
+ aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+#else
+ aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ if (!cpi->refresh_frame_mask) {
+ // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
+ // will not be used as a reference
+ cm->is_reference_frame = 0;
+ }
+#endif // CONFIG_EXT_REFS
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+ aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+ REF_FRAMES_LOG2);
+ aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
+#if CONFIG_REFERENCE_BUFFER
+ if (cpi->seq_params.frame_id_numbers_present_flag) {
+ int i = get_ref_frame_map_idx(cpi, ref_frame);
+ int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7;
+ int diff_len = cpi->seq_params.delta_frame_id_length_minus2 + 2;
+ int delta_frame_id_minus1 =
+ ((cm->current_frame_id - cm->ref_frame_id[i] +
+ (1 << frame_id_len)) %
+ (1 << frame_id_len)) -
+ 1;
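+        // delta_frame_id_minus1 is the distance from the reference frame's
+        // id to the current frame's id, modulo 2^frame_id_len. E.g.
+        // (hypothetical ids) with frame_id_len == 15, ref_frame_id == 32766
+        // and current_frame_id == 3, the delta wraps to 5 and the coded
+        // value is 4.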
+ if (delta_frame_id_minus1 < 0 ||
+ delta_frame_id_minus1 >= (1 << diff_len))
+ cm->invalid_delta_frame_id_minus1 = 1;
+ aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len);
+ }
+#endif
+ }
+
+#if CONFIG_FRAME_SIZE
+ if (cm->error_resilient_mode == 0) {
+ write_frame_size_with_refs(cpi, wb);
+ } else {
+ write_frame_size(cm, wb);
+ }
+#else
+ write_frame_size_with_refs(cpi, wb);
+#endif
+
+ aom_wb_write_bit(wb, cm->allow_high_precision_mv);
+
+ fix_interp_filter(cm, cpi->td.counts);
+ write_frame_interp_filter(cm->interp_filter, wb);
+#if CONFIG_TEMPMV_SIGNALING
+ if (!cm->error_resilient_mode) {
+ aom_wb_write_bit(wb, cm->use_prev_frame_mvs);
+ }
+#endif
+ }
+ }
+
+#if CONFIG_REFERENCE_BUFFER
+ cm->refresh_mask = cm->frame_type == KEY_FRAME ? 0xFF : get_refresh_mask(cpi);
+#endif
+
+ if (!cm->error_resilient_mode) {
+ aom_wb_write_bit(
+ wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD);
+ }
+
+ aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
+
+ assert(cm->mib_size == mi_size_wide[cm->sb_size]);
+ assert(cm->mib_size == 1 << cm->mib_size_log2);
+#if CONFIG_EXT_PARTITION
+ assert(cm->sb_size == BLOCK_128X128 || cm->sb_size == BLOCK_64X64);
+ aom_wb_write_bit(wb, cm->sb_size == BLOCK_128X128 ? 1 : 0);
+#else
+ assert(cm->sb_size == BLOCK_64X64);
+#endif // CONFIG_EXT_PARTITION
+
+ encode_loopfilter(cm, wb);
+#if CONFIG_CDEF
+ encode_cdef(cm, wb);
+#endif
+#if CONFIG_LOOP_RESTORATION
+ encode_restoration_mode(cm, wb);
+#endif // CONFIG_LOOP_RESTORATION
+ encode_quantization(cm, wb);
+ encode_segmentation(cm, xd, wb);
+#if CONFIG_DELTA_Q
+ {
+ int i;
+ struct segmentation *const seg = &cm->seg;
+ int segment_quantizer_active = 0;
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) {
+ segment_quantizer_active = 1;
+ }
+ }
+
+ if (cm->delta_q_present_flag)
+ assert(segment_quantizer_active == 0 && cm->base_qindex > 0);
+ if (segment_quantizer_active == 0 && cm->base_qindex > 0) {
+ aom_wb_write_bit(wb, cm->delta_q_present_flag);
+ if (cm->delta_q_present_flag) {
+ aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2);
+ xd->prev_qindex = cm->base_qindex;
+#if CONFIG_EXT_DELTA_Q
+ assert(seg->abs_delta == SEGMENT_DELTADATA);
+ aom_wb_write_bit(wb, cm->delta_lf_present_flag);
+ if (cm->delta_lf_present_flag) {
+ aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2);
+ xd->prev_delta_lf_from_base = 0;
+ }
+#endif // CONFIG_EXT_DELTA_Q
+ }
+ }
+ }
+#endif
+
+ write_tx_mode(cm, xd, &cm->tx_mode, wb);
+
+ if (cpi->allow_comp_inter_inter) {
+ const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
+#if !CONFIG_REF_ADAPT
+ const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
+#endif // !CONFIG_REF_ADAPT
+
+ aom_wb_write_bit(wb, use_hybrid_pred);
+#if !CONFIG_REF_ADAPT
+ if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred);
+#endif // !CONFIG_REF_ADAPT
+ }
+
+#if CONFIG_EXT_TX
+ aom_wb_write_bit(wb, cm->reduced_tx_set_used);
+#endif // CONFIG_EXT_TX
+
+ write_tile_info(cm, wb);
+}
+
+#if CONFIG_GLOBAL_MOTION
+static void write_global_motion_params(WarpedMotionParams *params,
+ WarpedMotionParams *ref_params,
+ aom_prob *probs, aom_writer *w,
+ int allow_hp) {
+ TransformationType type = params->wmtype;
+ int trans_bits;
+ int trans_prec_diff;
+ av1_write_token(w, av1_global_motion_types_tree, probs,
+ &global_motion_types_encodings[type]);
+ switch (type) {
+ case HOMOGRAPHY:
+ case HORTRAPEZOID:
+ case VERTRAPEZOID:
+ if (type != HORTRAPEZOID)
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF),
+ (params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF));
+ if (type != VERTRAPEZOID)
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF),
+ (params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF));
+ // fallthrough intended
+ case AFFINE:
+ case ROTZOOM:
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ if (type != VERTRAPEZOID)
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ if (type >= AFFINE) {
+ if (type != HORTRAPEZOID)
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS));
+ }
+ // fallthrough intended
+ case TRANSLATION:
+ trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ trans_prec_diff = (type == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ aom_write_signed_primitive_refsubexpfin(
+ w, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[0] >> trans_prec_diff),
+ (params->wmmat[0] >> trans_prec_diff));
+ aom_write_signed_primitive_refsubexpfin(
+ w, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[1] >> trans_prec_diff),
+ (params->wmmat[1] >> trans_prec_diff));
+ break;
+ case IDENTITY: break;
+ default: assert(0);
+ }
+}
+
+static void write_global_motion(AV1_COMP *cpi, aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ int frame;
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+#if !CONFIG_REF_MV
+ // With ref-mv, clearing unused global motion models here is
+ // unsafe, and we need to rely on the recode loop to do it
+ // instead. See av1_find_mv_refs for details.
+ if (!cpi->td.rd_counts.global_motion_used[frame]) {
+ set_default_warp_params(&cm->global_motion[frame]);
+ }
+#endif
+ write_global_motion_params(
+ &cm->global_motion[frame], &cm->prev_frame->global_motion[frame],
+ cm->fc->global_motion_types_prob, w, cm->allow_high_precision_mv);
+ /*
+ printf("Frame %d/%d: Enc Ref %d (used %d): %d %d %d %d\n",
+ cm->current_video_frame, cm->show_frame, frame,
+ cpi->global_motion_used[frame], cm->global_motion[frame].wmmat[0],
+ cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2],
+ cm->global_motion[frame].wmmat[3]);
+ */
+ }
+}
+#endif
+
+static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) {
+ AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_SUPERTX
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+#endif // CONFIG_SUPERTX
+ FRAME_CONTEXT *const fc = cm->fc;
+ FRAME_COUNTS *counts = cpi->td.counts;
+ aom_writer *header_bc;
+ int i, j;
+
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+
+#if CONFIG_ANS
+ int header_size;
+ header_bc = &cpi->buf_ans;
+ buf_ans_write_init(header_bc, data);
+#else
+ aom_writer real_header_bc;
+ header_bc = &real_header_bc;
+ aom_start_encode(header_bc, data);
+#endif
+
+#if CONFIG_LOOP_RESTORATION
+ encode_restoration(cm, header_bc);
+#endif // CONFIG_LOOP_RESTORATION
+#if !CONFIG_EC_ADAPT
+ update_txfm_probs(cm, header_bc, counts);
+#endif
+#if CONFIG_LV_MAP
+ av1_write_txb_probs(cpi, header_bc);
+#else
+#if !CONFIG_PVQ
+#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+ update_coef_probs(cpi, header_bc);
+#endif // !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+#endif // CONFIG_PVQ
+#endif // CONFIG_LV_MAP
+
+#if CONFIG_VAR_TX
+ update_txfm_partition_probs(cm, header_bc, counts, probwt);
+#endif
+
+ update_skip_probs(cm, header_bc, counts);
+#if !CONFIG_EC_ADAPT && CONFIG_DELTA_Q
+ update_delta_q_probs(cm, header_bc, counts);
+#if CONFIG_EXT_DELTA_Q
+ update_delta_lf_probs(cm, header_bc, counts);
+#endif
+#endif
+#if !CONFIG_EC_ADAPT
+ update_seg_probs(cpi, header_bc);
+
+ for (i = 0; i < INTRA_MODES; ++i) {
+ prob_diff_update(av1_intra_mode_tree, fc->uv_mode_prob[i],
+ counts->uv_mode[i], INTRA_MODES, probwt, header_bc);
+ }
+
+#if CONFIG_EXT_PARTITION_TYPES
+ for (i = 0; i < PARTITION_PLOFFSET; ++i)
+ prob_diff_update(av1_partition_tree, fc->partition_prob[i],
+ counts->partition[i], PARTITION_TYPES, probwt, header_bc);
+ for (; i < PARTITION_CONTEXTS_PRIMARY; ++i)
+ prob_diff_update(av1_ext_partition_tree, fc->partition_prob[i],
+ counts->partition[i], EXT_PARTITION_TYPES, probwt,
+ header_bc);
+#else
+ for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i)
+ prob_diff_update(av1_partition_tree, fc->partition_prob[i],
+ counts->partition[i], PARTITION_TYPES, probwt, header_bc);
+#endif // CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_UNPOISON_PARTITION_CTX
+ for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) {
+ unsigned int ct[2] = { counts->partition[i][PARTITION_VERT],
+ counts->partition[i][PARTITION_SPLIT] };
+ assert(counts->partition[i][PARTITION_NONE] == 0);
+ assert(counts->partition[i][PARTITION_HORZ] == 0);
+ assert(fc->partition_prob[i][PARTITION_NONE] == 0);
+ assert(fc->partition_prob[i][PARTITION_HORZ] == 0);
+ av1_cond_prob_diff_update(header_bc, &fc->partition_prob[i][PARTITION_VERT],
+ ct, probwt);
+ }
+ for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) {
+ unsigned int ct[2] = { counts->partition[i][PARTITION_HORZ],
+ counts->partition[i][PARTITION_SPLIT] };
+ assert(counts->partition[i][PARTITION_NONE] == 0);
+ assert(counts->partition[i][PARTITION_VERT] == 0);
+ assert(fc->partition_prob[i][PARTITION_NONE] == 0);
+ assert(fc->partition_prob[i][PARTITION_VERT] == 0);
+ av1_cond_prob_diff_update(header_bc, &fc->partition_prob[i][PARTITION_HORZ],
+ ct, probwt);
+ }
+#endif
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ for (i = 0; i < INTRA_FILTERS + 1; ++i)
+ prob_diff_update(av1_intra_filter_tree, fc->intra_filter_probs[i],
+ counts->intra_filter[i], INTRA_FILTERS, probwt, header_bc);
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+#endif // !CONFIG_EC_ADAPT
+
+ if (frame_is_intra_only(cm)) {
+ av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob);
+#if CONFIG_EC_MULTISYMBOL
+ av1_copy(cm->fc->kf_y_cdf, av1_kf_y_mode_cdf);
+#endif
+
+#if !CONFIG_EC_ADAPT
+ for (i = 0; i < INTRA_MODES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ prob_diff_update(av1_intra_mode_tree, cm->kf_y_prob[i][j],
+ counts->kf_y_mode[i][j], INTRA_MODES, probwt,
+ header_bc);
+#endif  // !CONFIG_EC_ADAPT
+ } else {
+#if CONFIG_REF_MV
+ update_inter_mode_probs(cm, header_bc, counts);
+#else
+#if !CONFIG_EC_ADAPT
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+ prob_diff_update(av1_inter_mode_tree, cm->fc->inter_mode_probs[i],
+ counts->inter_mode[i], INTER_MODES, probwt, header_bc);
+ }
+#endif
+#endif
+#if CONFIG_EXT_INTER
+ update_inter_compound_mode_probs(cm, probwt, header_bc);
+
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ if (is_interintra_allowed_bsize_group(i)) {
+ av1_cond_prob_diff_update(header_bc, &fc->interintra_prob[i],
+ cm->counts.interintra[i], probwt);
+ }
+ }
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ prob_diff_update(
+ av1_interintra_mode_tree, cm->fc->interintra_mode_prob[i],
+ counts->interintra_mode[i], INTERINTRA_MODES, probwt, header_bc);
+ }
+ for (i = 0; i < BLOCK_SIZES; i++) {
+ if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
+ av1_cond_prob_diff_update(header_bc, &fc->wedge_interintra_prob[i],
+ cm->counts.wedge_interintra[i], probwt);
+ }
+ }
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ for (i = 0; i < BLOCK_SIZES; i++)
+ prob_diff_update(av1_compound_type_tree, fc->compound_type_prob[i],
+ cm->counts.compound_interinter[i], COMPOUND_TYPES,
+ probwt, header_bc);
+ }
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i)
+ prob_diff_update(av1_motion_mode_tree, fc->motion_mode_prob[i],
+ counts->motion_mode[i], MOTION_MODES, probwt, header_bc);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if !CONFIG_EC_ADAPT
+ if (cm->interp_filter == SWITCHABLE)
+ update_switchable_interp_probs(cm, header_bc, counts);
+#endif
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+ av1_cond_prob_diff_update(header_bc, &fc->intra_inter_prob[i],
+ counts->intra_inter[i], probwt);
+
+ if (cpi->allow_comp_inter_inter) {
+ const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
+ if (use_hybrid_pred)
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+ av1_cond_prob_diff_update(header_bc, &fc->comp_inter_prob[i],
+ counts->comp_inter[i], probwt);
+ }
+
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ for (i = 0; i < REF_CONTEXTS; i++) {
+ for (j = 0; j < (SINGLE_REFS - 1); j++) {
+ av1_cond_prob_diff_update(header_bc, &fc->single_ref_prob[i][j],
+ counts->single_ref[i][j], probwt);
+ }
+ }
+ }
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ for (i = 0; i < REF_CONTEXTS; i++) {
+#if CONFIG_EXT_REFS
+ for (j = 0; j < (FWD_REFS - 1); j++) {
+ av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
+ counts->comp_ref[i][j], probwt);
+ }
+ for (j = 0; j < (BWD_REFS - 1); j++) {
+ av1_cond_prob_diff_update(header_bc, &fc->comp_bwdref_prob[i][j],
+ counts->comp_bwdref[i][j], probwt);
+ }
+#else
+ for (j = 0; j < (COMP_REFS - 1); j++) {
+ av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
+ counts->comp_ref[i][j], probwt);
+ }
+#endif // CONFIG_EXT_REFS
+ }
+ }
+
+#if !CONFIG_EC_ADAPT
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+ prob_diff_update(av1_intra_mode_tree, cm->fc->y_mode_prob[i],
+ counts->y_mode[i], INTRA_MODES, probwt, header_bc);
+ }
+#endif
+
+ av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc,
+#if CONFIG_REF_MV
+ counts->mv);
+#else
+ &counts->mv);
+#endif
+#if !CONFIG_EC_ADAPT
+ update_ext_tx_probs(cm, header_bc);
+#endif
+#if CONFIG_SUPERTX
+ if (!xd->lossless[0]) update_supertx_probs(cm, probwt, header_bc);
+#endif // CONFIG_SUPERTX
+#if CONFIG_GLOBAL_MOTION
+ write_global_motion(cpi, header_bc);
+#endif // CONFIG_GLOBAL_MOTION
+ }
+#if CONFIG_EC_MULTISYMBOL
+#if !CONFIG_EC_ADAPT
+#if CONFIG_NEW_TOKENSET
+ av1_coef_head_cdfs(fc);
+#endif
+ av1_coef_pareto_cdfs(fc);
+#if CONFIG_REF_MV
+ for (i = 0; i < NMV_CONTEXTS; ++i) av1_set_mv_cdfs(&fc->nmvc[i]);
+#else
+ av1_set_mv_cdfs(&fc->nmvc);
+#endif
+#if CONFIG_EC_MULTISYMBOL
+ av1_set_mode_cdfs(cm);
+#endif
+#endif // !CONFIG_EC_ADAPT
+#endif
+#if CONFIG_ANS
+ aom_buf_ans_flush(header_bc);
+ header_size = buf_ans_write_end(header_bc);
+ assert(header_size <= 0xffff);
+ return header_size;
+#else
+ aom_stop_encode(header_bc);
+ assert(header_bc->pos <= 0xffff);
+ return header_bc->pos;
+#endif // CONFIG_ANS
+}
+
+static int choose_size_bytes(uint32_t size, int spare_msbs) {
+ // Choose the number of bytes required to represent size, without
+ // using the 'spare_msbs' number of most significant bits.
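+  //
+  // A sketch of the mapping with spare_msbs == 0: sizes up to 0xff take
+  // 1 byte, up to 0xffff 2 bytes, up to 0xffffff 3 bytes, and 4 bytes
+  // otherwise. With spare_msbs == 1, a size that needs the top bit cannot
+  // be represented at all and the function returns -1.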
+
+  // Make sure we will fit in 4 bytes to start with.
+ if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1;
+
+ // Normalise to 32 bits
+ size <<= spare_msbs;
+
+ if (size >> 24 != 0)
+ return 4;
+ else if (size >> 16 != 0)
+ return 3;
+ else if (size >> 8 != 0)
+ return 2;
+ else
+ return 1;
+}
+
+static void mem_put_varsize(uint8_t *const dst, const int sz, const int val) {
+ switch (sz) {
+ case 1: dst[0] = (uint8_t)(val & 0xff); break;
+ case 2: mem_put_le16(dst, val); break;
+ case 3: mem_put_le24(dst, val); break;
+ case 4: mem_put_le32(dst, val); break;
+ default: assert(0 && "Invalid size"); break;
+ }
+}
+static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
+ const uint32_t data_size, const uint32_t max_tile_size,
+ const uint32_t max_tile_col_size,
+ int *const tile_size_bytes,
+ int *const tile_col_size_bytes) {
+// Choose the tile size bytes (tsb) and tile column size bytes (tcsb)
+#if CONFIG_EXT_TILE
+ // The top bit in the tile size field indicates tile copy mode, so we
+ // have 1 less bit to code the tile size
+ const int tsb = choose_size_bytes(max_tile_size, 1);
+ const int tcsb = choose_size_bytes(max_tile_col_size, 0);
+#else
+ const int tsb = choose_size_bytes(max_tile_size, 0);
+ const int tcsb = 4; // This is ignored
+ (void)max_tile_col_size;
+#endif // CONFIG_EXT_TILE
+
+ assert(tsb > 0);
+ assert(tcsb > 0);
+
+ *tile_size_bytes = tsb;
+ *tile_col_size_bytes = tcsb;
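+  // For example (hypothetical sizes): if the largest tile is 0x1234 bytes,
+  // tsb == 2, and each 4-byte size field emitted by write_tiles() is
+  // rewritten below as 2 bytes while the tile payloads are memmoved down
+  // to close the gaps.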
+
+ if (tsb == 4 && tcsb == 4) {
+ return data_size;
+ } else {
+ uint32_t wpos = 0;
+ uint32_t rpos = 0;
+
+#if CONFIG_EXT_TILE
+ int tile_row;
+ int tile_col;
+
+ for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+      // All columns but the last have a column header
+ if (tile_col < cm->tile_cols - 1) {
+ uint32_t tile_col_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // Adjust the tile column size by the number of bytes removed
+ // from the tile size fields.
+ tile_col_size -= (4 - tsb) * cm->tile_rows;
+
+ mem_put_varsize(dst + wpos, tcsb, tile_col_size);
+ wpos += tcsb;
+ }
+
+ for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+        // All rows, including the last, have a header
+ uint32_t tile_header = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // If this is a copy tile, we need to shift the MSB to the
+ // top bit of the new width, and there is no data to copy.
+ if (tile_header >> 31 != 0) {
+ if (tsb < 4) tile_header >>= 32 - 8 * tsb;
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+ } else {
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+
+ memmove(dst + wpos, dst + rpos, tile_header);
+ rpos += tile_header;
+ wpos += tile_header;
+ }
+ }
+ }
+#else
+ const int n_tiles = cm->tile_cols * cm->tile_rows;
+ int n;
+
+ for (n = 0; n < n_tiles; n++) {
+ int tile_size;
+
+ if (n == n_tiles - 1) {
+ tile_size = data_size - rpos;
+ } else {
+ tile_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+ mem_put_varsize(dst + wpos, tsb, tile_size);
+ wpos += tsb;
+ }
+
+ memmove(dst + wpos, dst + rpos, tile_size);
+
+ rpos += tile_size;
+ wpos += tile_size;
+ }
+#endif // CONFIG_EXT_TILE
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+ }
+}
+
+void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
+ uint8_t *data = dst;
+#if !CONFIG_TILE_GROUPS
+ uint32_t compressed_header_size;
+ uint32_t uncompressed_header_size;
+ struct aom_write_bit_buffer saved_wb;
+#endif
+ uint32_t data_size;
+ struct aom_write_bit_buffer wb = { data, 0 };
+
+ unsigned int max_tile_size;
+ unsigned int max_tile_col_size;
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_reset_write();
+#endif
+
+#if !CONFIG_TILE_GROUPS
+ int tile_size_bytes;
+ int tile_col_size_bytes;
+ AV1_COMMON *const cm = &cpi->common;
+ const int have_tiles = cm->tile_cols * cm->tile_rows > 1;
+
+ // Write the uncompressed header
+ write_uncompressed_header(cpi, &wb);
+
+#if CONFIG_EXT_REFS
+ if (cm->show_existing_frame) {
+ *size = aom_wb_bytes_written(&wb);
+ return;
+ }
+#endif // CONFIG_EXT_REFS
+
+  // We do not know these in advance. Output placeholder bits.
+ saved_wb = wb;
+ // Write tile size magnitudes
+ if (have_tiles) {
+// Note that the last item in the uncompressed header is the data
+// describing tile configuration.
+#if CONFIG_EXT_TILE
+ // Number of bytes in tile column size - 1
+ aom_wb_write_literal(&wb, 0, 2);
+#endif // CONFIG_EXT_TILE
+ // Number of bytes in tile size - 1
+ aom_wb_write_literal(&wb, 0, 2);
+ }
+ // Size of compressed header
+ aom_wb_write_literal(&wb, 0, 16);
+
+ uncompressed_header_size = (uint32_t)aom_wb_bytes_written(&wb);
+ data += uncompressed_header_size;
+
+ aom_clear_system_state();
+
+ // Write the compressed header
+ compressed_header_size = write_compressed_header(cpi, data);
+ data += compressed_header_size;
+
+ // Write the encoded tile data
+ data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size);
+#else
+ data_size = write_tiles(cpi, &wb, &max_tile_size, &max_tile_col_size);
+#endif
+#if !CONFIG_TILE_GROUPS
+ if (have_tiles) {
+ data_size =
+ remux_tiles(cm, data, data_size, max_tile_size, max_tile_col_size,
+ &tile_size_bytes, &tile_col_size_bytes);
+ }
+
+ data += data_size;
+
+ // Now fill in the gaps in the uncompressed header.
+ if (have_tiles) {
+#if CONFIG_EXT_TILE
+ assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
+ aom_wb_write_literal(&saved_wb, tile_col_size_bytes - 1, 2);
+#endif // CONFIG_EXT_TILE
+ assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+ aom_wb_write_literal(&saved_wb, tile_size_bytes - 1, 2);
+ }
+ // TODO(jbb): Figure out what to do if compressed_header_size > 16 bits.
+ assert(compressed_header_size <= 0xffff);
+ aom_wb_write_literal(&saved_wb, compressed_header_size, 16);
+#else
+ data += data_size;
+#endif
+#if CONFIG_ANS && ANS_REVERSE
+ // Avoid aliasing the superframe index
+ *data++ = 0;
+#endif
+ *size = data - dst;
+}
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
new file mode 100644
index 000000000..c75d80891
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_BITSTREAM_H_
+#define AV1_ENCODER_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+#if CONFIG_REFERENCE_BUFFER
+void write_sequence_header(SequenceHeader *seq_params);
+#endif
+
+void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
+
+void av1_encode_token_init(void);
+
+static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) {
+#if CONFIG_EXT_REFS
+ // Do not swap gf and arf indices for internal overlay frames
+ return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref &&
+ !cpi->rc.is_src_frame_ext_arf;
+#else
+ return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
+ cpi->rc.is_src_frame_alt_ref;
+#endif // CONFIG_EXT_REFS
+}
+
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+#if CONFIG_SUPERTX
+ const int supertx_enabled,
+#endif
+#if CONFIG_TXK_SEL
+ int block, int plane,
+#endif
+ aom_writer *w);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_BITSTREAM_H_
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
new file mode 100644
index 000000000..39e08d5b4
--- /dev/null
+++ b/third_party/aom/av1/encoder/block.h
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_BLOCK_H_
+#define AV1_ENCODER_BLOCK_H_
+
+#include "av1/common/entropymv.h"
+#include "av1/common/entropy.h"
+#if CONFIG_PVQ
+#include "av1/encoder/encint.h"
+#endif
+#if CONFIG_REF_MV
+#include "av1/common/mvref_common.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_PVQ
+// Maximum possible number of tx blocks in the luma plane, which is
+// currently 256, since there can be a 16x16 grid of 4x4 transforms.
+#define MAX_PVQ_BLOCKS_IN_SB (MAX_SB_SQUARE >> 2 * OD_LOG_BSIZE0)
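+// E.g. assuming OD_LOG_BSIZE0 == 2 (4x4 blocks) and a 64x64 superblock
+// (MAX_SB_SQUARE == 4096), this evaluates to 4096 >> 4 == 256.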
+#endif
+
+typedef struct {
+ unsigned int sse;
+ int sum;
+ unsigned int var;
+} DIFF;
+
+typedef struct macroblock_plane {
+ DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]);
+#if CONFIG_PVQ
+ DECLARE_ALIGNED(16, int16_t, src_int16[MAX_SB_SQUARE]);
+#endif
+ tran_low_t *qcoeff;
+ tran_low_t *coeff;
+ uint16_t *eobs;
+#if CONFIG_LV_MAP
+ uint8_t *txb_entropy_ctx;
+#endif
+ struct buf_2d src;
+
+  // Quantizer settings
+ const int16_t *quant_fp;
+ const int16_t *round_fp;
+ const int16_t *quant;
+ const int16_t *quant_shift;
+ const int16_t *zbin;
+ const int16_t *round;
+#if CONFIG_NEW_QUANT
+ const cuml_bins_type_nuq *cuml_bins_nuq[QUANT_PROFILES];
+#endif // CONFIG_NEW_QUANT
+} MACROBLOCK_PLANE;
+
+/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
+ * coefficient in this block was zero) or not. */
+typedef unsigned int av1_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2]
+ [COEFF_CONTEXTS][ENTROPY_TOKENS];
+
+typedef struct {
+ int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+ int16_t mode_context[MODE_CTX_REF_FRAMES];
+#if CONFIG_LV_MAP
+ // TODO(angiebird): Reduce the buffer size according to sb_type
+ tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE];
+ uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+ uint8_t txb_skip_ctx[MAX_MB_PLANE]
+ [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+ int dc_sign_ctx[MAX_MB_PLANE]
+ [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+#endif
+#if CONFIG_REF_MV
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+#if CONFIG_EXT_INTER
+ int16_t compound_mode_context[MODE_CTX_REF_FRAMES];
+#endif // CONFIG_EXT_INTER
+#endif
+} MB_MODE_INFO_EXT;
+
+typedef struct {
+ int col_min;
+ int col_max;
+ int row_min;
+ int row_max;
+} MvLimits;
+
+#if CONFIG_PALETTE
+typedef struct {
+ uint8_t best_palette_color_map[MAX_SB_SQUARE];
+ float kmeans_data_buf[2 * MAX_SB_SQUARE];
+} PALETTE_BUFFER;
+#endif // CONFIG_PALETTE
+
+typedef struct macroblock MACROBLOCK;
+struct macroblock {
+ struct macroblock_plane plane[MAX_MB_PLANE];
+
+ MACROBLOCKD e_mbd;
+ MB_MODE_INFO_EXT *mbmi_ext;
+ int skip_block;
+ int qindex;
+
+ // The equivalent error at the current rdmult of one whole bit (not one
+ // bitcost unit).
+ int errorperbit;
+  // The equivalent SAD error of one (whole) bit at the current quantizer
+ // for large blocks.
+ int sadperbit16;
+  // The equivalent SAD error of one (whole) bit at the current quantizer
+ // for sub-8x8 blocks.
+ int sadperbit4;
+ int rddiv;
+ int rdmult;
+ int mb_energy;
+ int *m_search_count_ptr;
+ int *ex_search_count_ptr;
+
+#if CONFIG_VAR_TX
+ unsigned int txb_split_count;
+#endif
+
+ // These are set to their default values at the beginning, and then adjusted
+ // further in the encoding process.
+ BLOCK_SIZE min_partition_size;
+ BLOCK_SIZE max_partition_size;
+
+ int mv_best_ref_index[TOTAL_REFS_PER_FRAME];
+ unsigned int max_mv_context[TOTAL_REFS_PER_FRAME];
+ unsigned int source_variance;
+ unsigned int pred_sse[TOTAL_REFS_PER_FRAME];
+ int pred_mv_sad[TOTAL_REFS_PER_FRAME];
+
+#if CONFIG_REF_MV
+ int *nmvjointcost;
+ int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS];
+ int *nmvcost[NMV_CONTEXTS][2];
+ int *nmvcost_hp[NMV_CONTEXTS][2];
+ int **mv_cost_stack[NMV_CONTEXTS];
+ int *nmvjointsadcost;
+#else
+ int nmvjointcost[MV_JOINTS];
+ int *nmvcost[2];
+ int *nmvcost_hp[2];
+ int nmvjointsadcost[MV_JOINTS];
+#endif
+
+ int **mvcost;
+ int *nmvsadcost[2];
+ int *nmvsadcost_hp[2];
+ int **mvsadcost;
+#if CONFIG_MOTION_VAR
+ int32_t *wsrc_buf;
+ int32_t *mask_buf;
+#endif // CONFIG_MOTION_VAR
+
+#if CONFIG_PALETTE
+ PALETTE_BUFFER *palette_buffer;
+#endif // CONFIG_PALETTE
+
+ // These define limits to motion vector components to prevent them
+ // from extending outside the UMV borders
+ MvLimits mv_limits;
+
+#if CONFIG_VAR_TX
+ uint8_t blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
+#if CONFIG_REF_MV
+ uint8_t blk_skip_drl[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
+#endif
+#endif
+
+ int skip;
+
+#if CONFIG_CB4X4
+ int skip_chroma_rd;
+#endif
+
+  // Note that token_costs is the cost when the EOB node is skipped.
+ av1_coeff_cost token_costs[TX_SIZES];
+
+ int optimize;
+
+ // Used to store sub partition's choices.
+ MV pred_mv[TOTAL_REFS_PER_FRAME];
+
+ // Store the best motion vector during motion search
+ int_mv best_mv;
+ // Store the second best motion vector during full-pixel motion search
+ int_mv second_best_mv;
+
+ // use default transform and skip transform type search for intra modes
+ int use_default_intra_tx_type;
+ // use default transform and skip transform type search for inter modes
+ int use_default_inter_tx_type;
+#if CONFIG_PVQ
+ int rate;
+ // 1 if neither AC nor DC is coded. Only used during RDO.
+ int pvq_skip[MAX_MB_PLANE];
+ PVQ_QUEUE *pvq_q;
+
+ // Storage for PVQ tx block encodings in a superblock.
+  // There can be at most a 16x16 grid of 4x4 blocks (per YUV plane)
+  // encoded by PVQ. 256 is the max number of 4x4 blocks in a SB (64x64)
+  // because:
+  // 1) PVQ is applied to each transformed block
+  // 2) 4x4 is the smallest tx size in AV1
+  // 3) AV1 allows using a smaller tx size than the block (i.e. partition)
+  //    size
+  // TODO(yushin): The memory usage could be improved a lot, since this has
+  // storage for 10 bands and 128 coefficients for every 4x4 block.
+ PVQ_INFO pvq[MAX_PVQ_BLOCKS_IN_SB][MAX_MB_PLANE];
+ daala_enc_ctx daala_enc;
+ int pvq_speed;
+  int pvq_coded;  // Indicates whether pvq_info needs to be stored to tokenize
+#endif
+#if CONFIG_DAALA_DIST
+ // Keep rate of each 4x4 block in the current macroblock during RDO
+ // This is needed when using the 8x8 Daala distortion metric during RDO,
+ // because it evaluates distortion in a different order than the underlying
+ // 4x4 blocks are coded.
+ int rate_4x4[256];
+#endif
+#if CONFIG_CFL
+ // Whether luma needs to be stored during RDO.
+ int cfl_store_y;
+#endif
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_BLOCK_H_
diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c
new file mode 100644
index 000000000..113ceb29d
--- /dev/null
+++ b/third_party/aom/av1/encoder/blockiness.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+static int horizontal_filter(const uint8_t *s) {
+ return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
+}
+
+static int vertical_filter(const uint8_t *s, int p) {
+ return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6;
+}
+
+static int variance(int sum, int sum_squared, int size) {
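+  // The integer divisions truncate: e.g. for pixels {1, 2, 3, 4},
+  // sum == 10, sum_squared == 30, size == 4, giving 30 / 4 - (10 / 4)^2 ==
+  // 7 - 4 == 3 rather than the exact 1.25. A coarse but cheap estimate,
+  // used below only to scale the blockiness metric.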
+ return sum_squared / size - (sum / size) * (sum / size);
+}
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a new blockiness metric that's defined as
+
+// p0 p1 p2 p3
+// q0 q1 q2 q3
+// block edge ->
+// r0 r1 r2 r3
+// s0 s1 s2 s3
+
+// blockiness = p0*-2+q0*6+r0*-6+s0*2 +
+// p1*-2+q1*6+r1*-6+s1*2 +
+// p2*-2+q2*6+r2*-6+s2*2 +
+// p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = max(blockiness from reconstructed buffer -
+//                                blockiness from source buffer, 0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calculation
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (q0^2+q1^2+q2^2+q3^2) - ((q0 + q1 + q2 + q3) / 4 )^2
+// var_1 = (r0^2+r1^2+r2^2+r3^2) - ((r0 + r1 + r2 + r3) / 4 )^2
+// The returned blockiness is the scaled value
+// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ;
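+//
+// A worked example with hypothetical pixel values: if the source is flat
+// across the edge (source blockiness 0) and each of the 4 reconstructed
+// rows steps from 10 to 20 at the edge, each row contributes
+// (20 - 10) * 2 + (10 - 20) * 6 = -40, so |blockiness| = 160. Both pixel
+// columns adjacent to the edge are constant, so var_0 = var_1 = 0 and the
+// function below returns (160 - 0) / (1 + 0 + 0) = 160.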
+static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, s += sp, r += rp) {
+ s_blockiness += horizontal_filter(s);
+ r_blockiness += horizontal_filter(r);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-1];
+ sum_sq_1 += s[-1] * s[-1];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// Calculate a blockiness level for a horizontal block edge
+// same as above.
+static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, ++s, ++r) {
+ s_blockiness += vertical_filter(s, sp);
+ r_blockiness += vertical_filter(r, rp);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-sp];
+ sum_sq_1 += s[-sp] * s[-sp];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// This function returns the blockiness for the entire frame, currently
+// computed by looking at all block borders in steps of 4.
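+// The accumulated value is normalized by the number of 4x4 blocks in the
+// frame (width * height / 16), so the result is a per-block average.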
+double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch, int width,
+ int height) {
+ double blockiness = 0;
+ int i, j;
+ aom_clear_system_state();
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4) {
+ if (i > 0 && i < height && j > 0 && j < width) {
+ blockiness +=
+ blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4);
+ blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j,
+ img2_pitch, 4);
+ }
+ }
+ }
+ blockiness /= width * height / 16;
+ return blockiness;
+}
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
new file mode 100644
index 000000000..4c7d6ff00
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+
+static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
+#if CONFIG_CB4X4
+ BLOCK_4X4,
+#endif
+ BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+#if CONFIG_EXT_PARTITION
+ BLOCK_128X128,
+#endif // CONFIG_EXT_PARTITION
+};
+
+static void alloc_mode_context(AV1_COMMON *cm, int num_4x4_blk,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif
+ PICK_MODE_CONTEXT *ctx) {
+ const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk);
+ const int num_pix = num_blk * tx_size_2d[0];
+ int i;
+#if CONFIG_CB4X4 && CONFIG_VAR_TX
+ ctx->num_4x4_blk = num_blk / 4;
+#else
+ ctx->num_4x4_blk = num_blk;
+#endif
+
+#if CONFIG_EXT_PARTITION_TYPES
+ ctx->partition = partition;
+#endif
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VAR_TX
+ CHECK_MEM_ERROR(cm, ctx->blk_skip[i], aom_calloc(num_blk, sizeof(uint8_t)));
+#endif
+ CHECK_MEM_ERROR(cm, ctx->coeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->coeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->qcoeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->qcoeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->dqcoeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->eobs[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
+#if CONFIG_LV_MAP
+ CHECK_MEM_ERROR(
+ cm, ctx->txb_entropy_ctx[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
+#endif
+
+#if CONFIG_PVQ
+ CHECK_MEM_ERROR(cm, ctx->pvq_ref_coeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->pvq_ref_coeff[i])));
+#endif
+ }
+
+#if CONFIG_PALETTE
+ if (cm->allow_screen_content_tools) {
+ for (i = 0; i < 2; ++i) {
+ CHECK_MEM_ERROR(
+ cm, ctx->color_index_map[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+ }
+ }
+#endif // CONFIG_PALETTE
+}
+
+static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VAR_TX
+ aom_free(ctx->blk_skip[i]);
+ ctx->blk_skip[i] = 0;
+#endif
+ aom_free(ctx->coeff[i]);
+ ctx->coeff[i] = 0;
+ aom_free(ctx->qcoeff[i]);
+ ctx->qcoeff[i] = 0;
+ aom_free(ctx->dqcoeff[i]);
+ ctx->dqcoeff[i] = 0;
+#if CONFIG_PVQ
+ aom_free(ctx->pvq_ref_coeff[i]);
+ ctx->pvq_ref_coeff[i] = 0;
+#endif
+ aom_free(ctx->eobs[i]);
+ ctx->eobs[i] = 0;
+#if CONFIG_LV_MAP
+ aom_free(ctx->txb_entropy_ctx[i]);
+ ctx->txb_entropy_ctx[i] = 0;
+#endif
+ }
+
+#if CONFIG_PALETTE
+ for (i = 0; i < 2; ++i) {
+ aom_free(ctx->color_index_map[i]);
+ ctx->color_index_map[i] = 0;
+ }
+#endif // CONFIG_PALETTE
+}
+
+static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree,
+ int num_4x4_blk) {
+#if CONFIG_EXT_PARTITION_TYPES
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_NONE, &tree->none);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ, &tree->horizontal[0]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->vertical[0]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->horizontal[1]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->vertical[1]);
+
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_A,
+ &tree->horizontala[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_A,
+ &tree->horizontala[1]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ_A,
+ &tree->horizontala[2]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ_B,
+ &tree->horizontalb[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_B,
+ &tree->horizontalb[1]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_B,
+ &tree->horizontalb[2]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_A,
+ &tree->verticala[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_A,
+ &tree->verticala[1]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT_A,
+ &tree->verticala[2]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT_B,
+ &tree->verticalb[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_B,
+ &tree->verticalb[1]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_B,
+ &tree->verticalb[2]);
+#ifdef CONFIG_SUPERTX
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ,
+ &tree->horizontal_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT, &tree->vertical_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_SPLIT, &tree->split_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_A,
+ &tree->horizontala_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_B,
+ &tree->horizontalb_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_A,
+ &tree->verticala_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_B,
+ &tree->verticalb_supertx);
+#endif // CONFIG_SUPERTX
+#else
+ alloc_mode_context(cm, num_4x4_blk, &tree->none);
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[0]);
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[0]);
+#ifdef CONFIG_SUPERTX
+ alloc_mode_context(cm, num_4x4_blk, &tree->horizontal_supertx);
+ alloc_mode_context(cm, num_4x4_blk, &tree->vertical_supertx);
+ alloc_mode_context(cm, num_4x4_blk, &tree->split_supertx);
+#endif
+
+ if (num_4x4_blk > 4) {
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[1]);
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[1]);
+ } else {
+ memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1]));
+ memset(&tree->vertical[1], 0, sizeof(tree->vertical[1]));
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
+}
+
+static void free_tree_contexts(PC_TREE *tree) {
+#if CONFIG_EXT_PARTITION_TYPES
+ int i;
+ for (i = 0; i < 3; i++) {
+ free_mode_context(&tree->horizontala[i]);
+ free_mode_context(&tree->horizontalb[i]);
+ free_mode_context(&tree->verticala[i]);
+ free_mode_context(&tree->verticalb[i]);
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
+ free_mode_context(&tree->none);
+ free_mode_context(&tree->horizontal[0]);
+ free_mode_context(&tree->horizontal[1]);
+ free_mode_context(&tree->vertical[0]);
+ free_mode_context(&tree->vertical[1]);
+#ifdef CONFIG_SUPERTX
+ free_mode_context(&tree->horizontal_supertx);
+ free_mode_context(&tree->vertical_supertx);
+ free_mode_context(&tree->split_supertx);
+#if CONFIG_EXT_PARTITION_TYPES
+ free_mode_context(&tree->horizontala_supertx);
+ free_mode_context(&tree->horizontalb_supertx);
+ free_mode_context(&tree->verticala_supertx);
+ free_mode_context(&tree->verticalb_supertx);
+#endif // CONFIG_EXT_PARTITION_TYPES
+#endif // CONFIG_SUPERTX
+}
+
+// This function sets up a tree of contexts so that at each square
+// partition level there are contexts for none, horizontal, vertical, and
+// split. Each node also carries a block_size value and a selected
+// block_size, which together represent the state of the search.
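+//
+// For example, with 64x64 superblocks and neither CONFIG_EXT_PARTITION nor
+// CONFIG_CB4X4 enabled, there are 64 leaf contexts (one per 8x8 block) and
+// 64 + 16 + 4 + 1 = 85 tree nodes covering the 8x8, 16x16, 32x32 and 64x64
+// square levels.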
+void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
+ int i, j;
+// TODO(jingning): The pc_tree allocation is redundant. We can take out all
+// the leaf nodes after cb4x4 mode is enabled.
+#if CONFIG_CB4X4
+#if CONFIG_EXT_PARTITION
+ const int tree_nodes_inc = 1024;
+#else
+ const int tree_nodes_inc = 256;
+#endif // CONFIG_EXT_PARTITION
+ const int leaf_factor = 4;
+#else
+ const int tree_nodes_inc = 0;
+ const int leaf_factor = 1;
+#endif
+#if CONFIG_EXT_PARTITION
+ const int leaf_nodes = 256 * leaf_factor;
+ const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+#else
+ const int leaf_nodes = 64 * leaf_factor;
+ const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1;
+#endif // CONFIG_EXT_PARTITION
+ int pc_tree_index = 0;
+ PC_TREE *this_pc;
+ PICK_MODE_CONTEXT *this_leaf;
+ int square_index = 1;
+ int nodes;
+
+ aom_free(td->leaf_tree);
+ CHECK_MEM_ERROR(cm, td->leaf_tree,
+ aom_calloc(leaf_nodes, sizeof(*td->leaf_tree)));
+ aom_free(td->pc_tree);
+ CHECK_MEM_ERROR(cm, td->pc_tree,
+ aom_calloc(tree_nodes, sizeof(*td->pc_tree)));
+
+ this_pc = &td->pc_tree[0];
+ this_leaf = &td->leaf_tree[0];
+
+  // 4x4 blocks that lie within the same 8x8 block share the same context,
+  // so we only need to allocate one context for each 8x8 block.
+ for (i = 0; i < leaf_nodes; ++i) {
+#if CONFIG_EXT_PARTITION_TYPES
+ alloc_mode_context(cm, 4, PARTITION_NONE, &td->leaf_tree[i]);
+#else
+ alloc_mode_context(cm, 16, &td->leaf_tree[i]);
+#endif
+ }
+
+ // Sets up all the leaf nodes in the tree.
+ for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+ tree->block_size = square[0];
+#if CONFIG_CB4X4
+ alloc_tree_contexts(cm, tree, 16);
+#else
+ alloc_tree_contexts(cm, tree, 4);
+#endif
+ tree->leaf_split[0] = this_leaf++;
+ for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0];
+ }
+
+  // Each node has 4 children; fill in each block_size level of the tree
+  // from the leaves to the root.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+ for (i = 0; i < nodes; ++i) {
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+#if CONFIG_CB4X4
+ alloc_tree_contexts(cm, tree, 16 << (2 * square_index));
+#else
+ alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
+#endif
+ tree->block_size = square[square_index];
+ for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+ ++pc_tree_index;
+ }
+ ++square_index;
+ }
+
+ // Set up the root node for the largest superblock size
+ i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+ td->pc_root[i] = &td->pc_tree[tree_nodes - 1];
+ td->pc_root[i]->none.best_mode_index = 2;
+ // Set up the root nodes for the rest of the possible superblock sizes
+ while (--i >= 0) {
+ td->pc_root[i] = td->pc_root[i + 1]->split[0];
+ td->pc_root[i]->none.best_mode_index = 2;
+ }
+}
+
+void av1_free_pc_tree(ThreadData *td) {
+#if CONFIG_CB4X4
+#if CONFIG_EXT_PARTITION
+ const int tree_nodes_inc = 1024;
+#else
+ const int tree_nodes_inc = 256;
+#endif // CONFIG_EXT_PARTITION
+ const int leaf_factor = 4;
+#else
+ const int tree_nodes_inc = 0;
+ const int leaf_factor = 1;
+#endif
+
+#if CONFIG_EXT_PARTITION
+ const int leaf_nodes = 256 * leaf_factor;
+ const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+#else
+ const int leaf_nodes = 64 * leaf_factor;
+ const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1;
+#endif // CONFIG_EXT_PARTITION
+ int i;
+
+  // Free all the leaf mode contexts
+ for (i = 0; i < leaf_nodes; ++i) free_mode_context(&td->leaf_tree[i]);
+
+  // Free the contexts of every node in the tree.
+ for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]);
+
+ aom_free(td->pc_tree);
+ td->pc_tree = NULL;
+ aom_free(td->leaf_tree);
+ td->leaf_tree = NULL;
+}
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
new file mode 100644
index 000000000..67954126c
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_CONTEXT_TREE_H_
+#define AV1_ENCODER_CONTEXT_TREE_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct AV1Common;
+struct ThreadData;
+
+// Structure to hold snapshot of coding context during the mode picking process
+typedef struct {
+ MODE_INFO mic;
+ MB_MODE_INFO_EXT mbmi_ext;
+#if CONFIG_PALETTE
+ uint8_t *color_index_map[2];
+#endif // CONFIG_PALETTE
+#if CONFIG_VAR_TX
+ uint8_t *blk_skip[MAX_MB_PLANE];
+#endif
+
+  // Dual buffer pointers: index 0 is in use, index 1 holds the best found
+ tran_low_t *coeff[MAX_MB_PLANE];
+ tran_low_t *qcoeff[MAX_MB_PLANE];
+ tran_low_t *dqcoeff[MAX_MB_PLANE];
+#if CONFIG_PVQ
+ tran_low_t *pvq_ref_coeff[MAX_MB_PLANE];
+#endif
+ uint16_t *eobs[MAX_MB_PLANE];
+#if CONFIG_LV_MAP
+ uint8_t *txb_entropy_ctx[MAX_MB_PLANE];
+#endif
+
+ int num_4x4_blk;
+ int skip;
+ int pred_pixel_ready;
+  // For the current partition, skippable is set to 1 only if all Y, U, and
+  // V transform blocks' coefficients are quantized to 0.
+ int skippable;
+ int best_mode_index;
+ int hybrid_pred_diff;
+ int comp_pred_diff;
+ int single_pred_diff;
+
+  // TODO(jingning) Use RD_COST struct here instead. This involves a broader
+ // scope of refactoring.
+ int rate;
+ int64_t dist;
+
+ // motion vector cache for adaptive motion search control in partition
+ // search loop
+ MV pred_mv[TOTAL_REFS_PER_FRAME];
+ InterpFilter pred_interp_filter;
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition;
+#endif
+} PICK_MODE_CONTEXT;
+
+typedef struct PC_TREE {
+ int index;
+ PARTITION_TYPE partitioning;
+ BLOCK_SIZE block_size;
+ PICK_MODE_CONTEXT none;
+ PICK_MODE_CONTEXT horizontal[2];
+ PICK_MODE_CONTEXT vertical[2];
+#if CONFIG_EXT_PARTITION_TYPES
+ PICK_MODE_CONTEXT horizontala[3];
+ PICK_MODE_CONTEXT horizontalb[3];
+ PICK_MODE_CONTEXT verticala[3];
+ PICK_MODE_CONTEXT verticalb[3];
+#endif
+ union {
+ struct PC_TREE *split[4];
+ PICK_MODE_CONTEXT *leaf_split[4];
+ };
+#ifdef CONFIG_SUPERTX
+ PICK_MODE_CONTEXT horizontal_supertx;
+ PICK_MODE_CONTEXT vertical_supertx;
+ PICK_MODE_CONTEXT split_supertx;
+#if CONFIG_EXT_PARTITION_TYPES
+ PICK_MODE_CONTEXT horizontala_supertx;
+ PICK_MODE_CONTEXT horizontalb_supertx;
+ PICK_MODE_CONTEXT verticala_supertx;
+ PICK_MODE_CONTEXT verticalb_supertx;
+#endif
+#endif
+} PC_TREE;
+
+void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td);
+void av1_free_pc_tree(struct ThreadData *td);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* AV1_ENCODER_CONTEXT_TREE_H_ */
diff --git a/third_party/aom/av1/encoder/corner_detect.c b/third_party/aom/av1/encoder/corner_detect.c
new file mode 100644
index 000000000..e4c59dd9c
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_detect.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "third_party/fastfeat/fast.h"
+
+#include "av1/encoder/corner_detect.h"
+
+// Fast_9 wrapper
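+// FAST_BARRIER is the FAST-9 threshold: a candidate pixel is reported as a
+// corner only if at least 9 contiguous pixels on the surrounding 16-pixel
+// Bresenham circle are all brighter or all darker than the centre by more
+// than this value.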
+#define FAST_BARRIER 18
+int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
+ int *points, int max_points) {
+ int num_points;
+ xy *const frm_corners_xy = fast9_detect_nonmax(buf, width, height, stride,
+ FAST_BARRIER, &num_points);
+ num_points = (num_points <= max_points ? num_points : max_points);
+ if (num_points > 0 && frm_corners_xy) {
+ memcpy(points, frm_corners_xy, sizeof(*frm_corners_xy) * num_points);
+ free(frm_corners_xy);
+ return num_points;
+ }
+ free(frm_corners_xy);
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/corner_detect.h b/third_party/aom/av1/encoder/corner_detect.h
new file mode 100644
index 000000000..0317db5b3
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_detect.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_CORNER_DETECT_H_
+#define AV1_ENCODER_CORNER_DETECT_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
+ int *points, int max_points);
+
+#endif // AV1_ENCODER_CORNER_DETECT_H_
diff --git a/third_party/aom/av1/encoder/corner_match.c b/third_party/aom/av1/encoder/corner_match.c
new file mode 100644
index 000000000..64ee0c5ae
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_match.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+
+#include "av1/encoder/corner_match.h"
+
+#define MATCH_SZ 13
+#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
+#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
+#define SEARCH_SZ 9
+#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2)
+
+#define THRESHOLD_NCC 0.75
+
+/* Compute var(im) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of im,
+ centered at (x, y).
+*/
+static double compute_variance(unsigned char *im, int stride, int x, int y) {
+  int sum = 0;
+  int sumsq = 0;
+ int var;
+ int i, j;
+ for (i = 0; i < MATCH_SZ; ++i)
+ for (j = 0; j < MATCH_SZ; ++j) {
+ sum += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)];
+ sumsq += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] *
+ im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)];
+ }
+ var = sumsq * MATCH_SZ_SQ - sum * sum;
+ return (double)var;
+}
+
+/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the
+ correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+ of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
+static double compute_cross_correlation(unsigned char *im1, int stride1, int x1,
+ int y1, unsigned char *im2, int stride2,
+ int x2, int y2) {
+ int v1, v2;
+ int sum1 = 0;
+ int sum2 = 0;
+ int sumsq2 = 0;
+ int cross = 0;
+ int var2, cov;
+ int i, j;
+ for (i = 0; i < MATCH_SZ; ++i)
+ for (j = 0; j < MATCH_SZ; ++j) {
+ v1 = im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)];
+ v2 = im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)];
+ sum1 += v1;
+ sum2 += v2;
+ sumsq2 += v2 * v2;
+ cross += v1 * v2;
+ }
+ var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
+ cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+ return cov / sqrt((double)var2);
+}
+
+static int is_eligible_point(int pointx, int pointy, int width, int height) {
+ return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 &&
+ pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height);
+}
+
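+// Reject pairs more than 1/16 of the larger frame dimension apart
+// (compared as squared distances to avoid a sqrt).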
+static int is_eligible_distance(int point1x, int point1y, int point2x,
+ int point2y, int width, int height) {
+ const int thresh = (width < height ? height : width) >> 4;
+ return ((point1x - point2x) * (point1x - point2x) +
+ (point1y - point2y) * (point1y - point2y)) <= thresh * thresh;
+}
+
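+// Refine each correspondence with two local searches over a SEARCH_SZ x
+// SEARCH_SZ neighborhood: first move the reference-side point to maximize
+// NCC against the fixed source point, then move the source-side point
+// against the updated reference point.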
+static void improve_correspondence(unsigned char *frm, unsigned char *ref,
+ int width, int height, int frm_stride,
+ int ref_stride,
+ Correspondence *correspondences,
+ int num_correspondences) {
+ int i;
+ for (i = 0; i < num_correspondences; ++i) {
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) {
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ if (!is_eligible_point(correspondences[i].rx + x,
+ correspondences[i].ry + y, width, height))
+ continue;
+ if (!is_eligible_distance(correspondences[i].x, correspondences[i].y,
+ correspondences[i].rx + x,
+ correspondences[i].ry + y, width, height))
+ continue;
+ match_ncc = compute_cross_correlation(
+ frm, frm_stride, correspondences[i].x, correspondences[i].y, ref,
+ ref_stride, correspondences[i].rx + x, correspondences[i].ry + y);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ }
+ correspondences[i].rx += best_x;
+ correspondences[i].ry += best_y;
+ }
+ for (i = 0; i < num_correspondences; ++i) {
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y)
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ if (!is_eligible_point(correspondences[i].x + x,
+ correspondences[i].y + y, width, height))
+ continue;
+ if (!is_eligible_distance(
+ correspondences[i].x + x, correspondences[i].y + y,
+ correspondences[i].rx, correspondences[i].ry, width, height))
+ continue;
+ match_ncc = compute_cross_correlation(
+ ref, ref_stride, correspondences[i].rx, correspondences[i].ry, frm,
+ frm_stride, correspondences[i].x + x, correspondences[i].y + y);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ correspondences[i].x += best_x;
+ correspondences[i].y += best_y;
+ }
+}
+
+int determine_correspondence(unsigned char *frm, int *frm_corners,
+ int num_frm_corners, unsigned char *ref,
+ int *ref_corners, int num_ref_corners, int width,
+ int height, int frm_stride, int ref_stride,
+ int *correspondence_pts) {
+ // TODO(sarahparker) Improve this to include 2-way match
+ int i, j;
+ Correspondence *correspondences = (Correspondence *)correspondence_pts;
+ int num_correspondences = 0;
+ for (i = 0; i < num_frm_corners; ++i) {
+ double best_match_ncc = 0.0;
+ double template_norm;
+ int best_match_j = -1;
+ if (!is_eligible_point(frm_corners[2 * i], frm_corners[2 * i + 1], width,
+ height))
+ continue;
+ for (j = 0; j < num_ref_corners; ++j) {
+ double match_ncc;
+ if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width,
+ height))
+ continue;
+ if (!is_eligible_distance(frm_corners[2 * i], frm_corners[2 * i + 1],
+ ref_corners[2 * j], ref_corners[2 * j + 1],
+ width, height))
+ continue;
+ match_ncc = compute_cross_correlation(
+ frm, frm_stride, frm_corners[2 * i], frm_corners[2 * i + 1], ref,
+ ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_match_j = j;
+ }
+ }
+ // Note: We want to test if the best correlation is >= THRESHOLD_NCC,
+ // but need to account for the normalization in compute_cross_correlation
+ // (see the worked check following this function).
+ template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i],
+ frm_corners[2 * i + 1]);
+ if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) {
+ correspondences[num_correspondences].x = frm_corners[2 * i];
+ correspondences[num_correspondences].y = frm_corners[2 * i + 1];
+ correspondences[num_correspondences].rx = ref_corners[2 * best_match_j];
+ correspondences[num_correspondences].ry =
+ ref_corners[2 * best_match_j + 1];
+ num_correspondences++;
+ }
+ }
+ improve_correspondence(frm, ref, width, height, frm_stride, ref_stride,
+ correspondences, num_correspondences);
+ return num_correspondences;
+}
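A worked check of the normalization note above (illustrative annotation, not
from this patch). With N = MATCH_SZ_SQ pixels per window, \rho the true
normalized cross-correlation, and \sigma_1 the standard deviation of the
frame-side window:

    \text{compute\_variance} = N\sum im^2 - \Big(\sum im\Big)^2 = N^2\sigma_1^2

    \text{compute\_cross\_correlation}
      = \frac{N\sum im_1 im_2 - \sum im_1 \sum im_2}
             {\sqrt{N\sum im_2^2 - (\sum im_2)^2}} = N\,\rho\,\sigma_1

So the acceptance test best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)
reduces to N\rho\sigma_1 > THRESHOLD_NCC \cdot N\sigma_1, i.e.
\rho > THRESHOLD_NCC, independent of window contrast.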
diff --git a/third_party/aom/av1/encoder/corner_match.h b/third_party/aom/av1/encoder/corner_match.h
new file mode 100644
index 000000000..c0458642c
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_match.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_ENCODER_CORNER_MATCH_H_
+#define AV1_ENCODER_CORNER_MATCH_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+typedef struct {
+ int x, y;
+ int rx, ry;
+} Correspondence;
+
+int determine_correspondence(unsigned char *frm, int *frm_corners,
+ int num_frm_corners, unsigned char *ref,
+ int *ref_corners, int num_ref_corners, int width,
+ int height, int frm_stride, int ref_stride,
+ int *correspondence_pts);
+
+#endif // AV1_ENCODER_CORNER_MATCH_H_
diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c
new file mode 100644
index 000000000..e3151a597
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/common/entropy.h"
+
+/* round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT))
+ Begins with a bogus entry for simpler addressing. */
+const uint16_t av1_prob_cost[256] = {
+ 4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325, 2260,
+ 2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780, 1748, 1718,
+ 1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470, 1449, 1429, 1409,
+ 1390, 1371, 1353, 1335, 1318, 1301, 1284, 1268, 1252, 1236, 1221, 1206, 1192,
+ 1177, 1163, 1149, 1136, 1123, 1110, 1097, 1084, 1072, 1059, 1047, 1036, 1024,
+ 1013, 1001, 990, 979, 968, 958, 947, 937, 927, 917, 907, 897, 887,
+ 878, 868, 859, 850, 841, 832, 823, 814, 806, 797, 789, 780, 772,
+ 764, 756, 748, 740, 732, 724, 717, 709, 702, 694, 687, 680, 673,
+ 665, 658, 651, 644, 637, 631, 624, 617, 611, 604, 598, 591, 585,
+ 578, 572, 566, 560, 554, 547, 541, 535, 530, 524, 518, 512, 506,
+ 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+ 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371,
+ 366, 361, 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311,
+ 307, 302, 298, 294, 289, 285, 281, 277, 273, 268, 264, 260, 256,
+ 252, 248, 244, 240, 236, 232, 228, 224, 220, 216, 212, 209, 205,
+ 201, 197, 194, 190, 186, 182, 179, 175, 171, 168, 164, 161, 157,
+ 153, 150, 146, 143, 139, 136, 132, 129, 125, 122, 119, 115, 112,
+ 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, 70,
+ 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29,
+ 26, 23, 20, 18, 15, 12, 9, 6, 3
+};
+
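+// Recursively walks the token tree, accumulating per-branch bit costs;
+// nonpositive tree indices mark leaves, and -index selects the token whose
+// total cost is written.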
+static void cost(int *costs, aom_tree tree, const aom_prob *probs, int i,
+ int c) {
+ const aom_prob prob = probs[i / 2];
+ int b;
+
+ assert(prob != 0);
+ for (b = 0; b <= 1; ++b) {
+ const int cc = c + av1_cost_bit(prob, b);
+ const aom_tree_index ii = tree[i + b];
+
+ if (ii <= 0)
+ costs[-ii] = cc;
+ else
+ cost(costs, tree, probs, ii, cc);
+ }
+}
+
+void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree) {
+ cost(costs, tree, probs, 0, 0);
+}
+
+void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree) {
+ assert(tree[0] <= 0 && tree[1] > 0);
+
+ costs[-tree[0]] = av1_cost_bit(probs[0], 0);
+ cost(costs, tree, probs, 2, 0);
+}
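As a sanity check (illustrative, not from this patch), the table can be
regenerated from the formula in its comment; entry 0 is the "bogus" padding
entry, duplicating entry 1:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      printf("4096, ");  // entry 0: padding, same value as entry 1
      // Cost of a symbol of probability p/256, in 1/512-bit units.
      for (int p = 1; p < 256; ++p)
        printf("%d, ", (int)round(-log2(p / 256.0) * (1 << 9)));  // 9 == AV1_PROB_COST_SHIFT
      return 0;
    }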
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
new file mode 100644
index 000000000..d8fb357e6
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_COST_H_
+#define AV1_ENCODER_COST_H_
+
+#include "aom_dsp/prob.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint16_t av1_prob_cost[256];
+
+// The factor to scale from cost in bits to cost in av1_prob_cost units.
+#define AV1_PROB_COST_SHIFT 9
+
+#define av1_cost_zero(prob) (av1_prob_cost[prob])
+
+#define av1_cost_one(prob) av1_cost_zero(256 - (prob))
+
+#define av1_cost_bit(prob, bit) av1_cost_zero((bit) ? 256 - (prob) : (prob))
+
+// Cost of coding an n bit literal, using 128 (i.e. 50%) probability
+// for each bit.
+#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT))
+
+static INLINE unsigned int cost_branch256(const unsigned int ct[2],
+ aom_prob p) {
+ return ct[0] * av1_cost_zero(p) + ct[1] * av1_cost_one(p);
+}
+
+static INLINE int treed_cost(aom_tree tree, const aom_prob *probs, int bits,
+ int len) {
+ int cost = 0;
+ aom_tree_index i = 0;
+
+ do {
+ const int bit = (bits >> --len) & 1;
+ cost += av1_cost_bit(probs[i >> 1], bit);
+ i = tree[i + bit];
+ } while (len);
+
+ return cost;
+}
+
+void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree);
+void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_COST_H_
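A short worked example of the macros above, using values from the table in
cost.c (probabilities are 8-bit, so 128 means 1/2):

    // av1_cost_bit(p, b): cost in 1/512-bit units of coding bit b when the
    // probability of a zero is p/256.
    //   av1_cost_bit(128, 0) == av1_prob_cost[128] == 512   (exactly 1 bit)
    //   av1_cost_bit(240, 0) == av1_prob_cost[240] == 48    (~0.09 bits, likely bit)
    //   av1_cost_bit(240, 1) == av1_prob_cost[16]  == 2048  (4 bits, unlikely bit)
    // av1_cost_literal(n) charges n full bits: n << AV1_PROB_COST_SHIFT.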
diff --git a/third_party/aom/av1/encoder/daala_compat_enc.c b/third_party/aom/av1/encoder/daala_compat_enc.c
new file mode 100644
index 000000000..3df424cac
--- /dev/null
+++ b/third_party/aom/av1/encoder/daala_compat_enc.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "encint.h"
+
+void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf) {
+#if CONFIG_DAALA_EC
+ od_ec_enc_checkpoint(&rbuf->ec, &enc->w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ OD_COPY(&rbuf->adapt, enc->state.adapt, 1);
+}
+
+void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf) {
+#if CONFIG_DAALA_EC
+ od_ec_enc_rollback(&enc->w.ec, &rbuf->ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ OD_COPY(enc->state.adapt, &rbuf->adapt, 1);
+}
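These helpers support trial coding (e.g. PVQ rate-distortion search): snapshot
the entropy coder and adaptation state, code a candidate, and rewind if it
loses. A sketch of the usual calling pattern, in comments because a full
daala_enc_ctx cannot be constructed standalone here; names not appearing above
are hypothetical:

    // od_rollback_buffer pre_buf;
    // od_encode_checkpoint(enc, &pre_buf);  // save ec state + adaptation ctx
    // ...code one candidate and measure its rate/distortion...
    // if (candidate_loses)
    //   od_encode_rollback(enc, &pre_buf);  // discard trial bits and updates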
diff --git a/third_party/aom/av1/encoder/dct.c b/third_party/aom/av1/encoder/dct.c
new file mode 100644
index 000000000..09e1b0563
--- /dev/null
+++ b/third_party/aom/av1/encoder/dct.c
@@ -0,0 +1,2228 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+#include "aom_dsp/fwd_txfm.h"
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/av1_fwd_txfm1d.h"
+#include "av1/common/av1_fwd_txfm2d_cfg.h"
+#include "av1/common/idct.h"
+
+static INLINE void range_check(const tran_low_t *input, const int size,
+ const int bit) {
+#if 0 // CONFIG_COEFFICIENT_RANGE_CHECKING
+// TODO(angiebird): the range_check is not used because the bit range
+// in fdct# is not correct. Since we are going to merge in a new version
+// of fdct# from nextgenv2, we won't fix the incorrect bit range now.
+ int i;
+ for (i = 0; i < size; ++i) {
+ assert(abs(input[i]) < (1 << bit));
+ }
+#else
+ (void)input;
+ (void)size;
+ (void)bit;
+#endif
+}
+
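+// Scaling note: with cospi_k_64 ~= cos(k * pi / 64) * 2^14 and
+// fdct_round_shift() dividing by 2^14, fdct4 computes sqrt(2) times the
+// orthonormal DCT-II (the same sqrt(2) the fadst4 comment notes below).
+// E.g. the DC term is (x0 + x1 + x2 + x3) * cos(pi/4)
+// = sqrt(2) * (1/2) * sum(x), and (1/2) * sum(x) is the orthonormal DC
+// coefficient for N = 4.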
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t temp;
+ tran_low_t step[4];
+
+ // stage 0
+ range_check(input, 4, 14);
+
+ // stage 1
+ output[0] = input[0] + input[3];
+ output[1] = input[1] + input[2];
+ output[2] = input[1] - input[2];
+ output[3] = input[0] - input[3];
+
+ range_check(output, 4, 15);
+
+ // stage 2
+ temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
+ step[0] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
+ step[1] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
+ step[2] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
+ step[3] = (tran_low_t)fdct_round_shift(temp);
+
+ range_check(step, 4, 16);
+
+ // stage 3
+ output[0] = step[0];
+ output[1] = step[2];
+ output[2] = step[1];
+ output[3] = step[3];
+
+ range_check(output, 4, 16);
+}
+
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t temp;
+ tran_low_t step[8];
+
+ // stage 0
+ range_check(input, 8, 13);
+
+ // stage 1
+ output[0] = input[0] + input[7];
+ output[1] = input[1] + input[6];
+ output[2] = input[2] + input[5];
+ output[3] = input[3] + input[4];
+ output[4] = input[3] - input[4];
+ output[5] = input[2] - input[5];
+ output[6] = input[1] - input[6];
+ output[7] = input[0] - input[7];
+
+ range_check(output, 8, 14);
+
+ // stage 2
+ step[0] = output[0] + output[3];
+ step[1] = output[1] + output[2];
+ step[2] = output[1] - output[2];
+ step[3] = output[0] - output[3];
+ step[4] = output[4];
+ temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
+ step[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
+ step[6] = (tran_low_t)fdct_round_shift(temp);
+ step[7] = output[7];
+
+ range_check(step, 8, 15);
+
+ // stage 3
+ temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
+ output[0] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
+ output[1] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+ output[2] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
+ output[3] = (tran_low_t)fdct_round_shift(temp);
+ output[4] = step[4] + step[5];
+ output[5] = step[4] - step[5];
+ output[6] = step[7] - step[6];
+ output[7] = step[7] + step[6];
+
+ range_check(output, 8, 16);
+
+ // stage 4
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
+ step[4] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
+ step[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
+ step[6] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
+ step[7] = (tran_low_t)fdct_round_shift(temp);
+
+ range_check(step, 8, 16);
+
+ // stage 5
+ output[0] = step[0];
+ output[1] = step[4];
+ output[2] = step[2];
+ output[3] = step[6];
+ output[4] = step[1];
+ output[5] = step[5];
+ output[6] = step[3];
+ output[7] = step[7];
+
+ range_check(output, 8, 16);
+}
+
+static void fdct16(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t temp;
+ tran_low_t step[16];
+
+ // stage 0
+ range_check(input, 16, 13);
+
+ // stage 1
+ output[0] = input[0] + input[15];
+ output[1] = input[1] + input[14];
+ output[2] = input[2] + input[13];
+ output[3] = input[3] + input[12];
+ output[4] = input[4] + input[11];
+ output[5] = input[5] + input[10];
+ output[6] = input[6] + input[9];
+ output[7] = input[7] + input[8];
+ output[8] = input[7] - input[8];
+ output[9] = input[6] - input[9];
+ output[10] = input[5] - input[10];
+ output[11] = input[4] - input[11];
+ output[12] = input[3] - input[12];
+ output[13] = input[2] - input[13];
+ output[14] = input[1] - input[14];
+ output[15] = input[0] - input[15];
+
+ range_check(output, 16, 14);
+
+ // stage 2
+ step[0] = output[0] + output[7];
+ step[1] = output[1] + output[6];
+ step[2] = output[2] + output[5];
+ step[3] = output[3] + output[4];
+ step[4] = output[3] - output[4];
+ step[5] = output[2] - output[5];
+ step[6] = output[1] - output[6];
+ step[7] = output[0] - output[7];
+ step[8] = output[8];
+ step[9] = output[9];
+ temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64;
+ step[10] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64;
+ step[11] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[12] * cospi_16_64 + output[11] * cospi_16_64;
+ step[12] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[13] * cospi_16_64 + output[10] * cospi_16_64;
+ step[13] = (tran_low_t)fdct_round_shift(temp);
+ step[14] = output[14];
+ step[15] = output[15];
+
+ range_check(step, 16, 15);
+
+ // stage 3
+ output[0] = step[0] + step[3];
+ output[1] = step[1] + step[2];
+ output[2] = step[1] - step[2];
+ output[3] = step[0] - step[3];
+ output[4] = step[4];
+ temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64;
+ output[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[6] * cospi_16_64 + step[5] * cospi_16_64;
+ output[6] = (tran_low_t)fdct_round_shift(temp);
+ output[7] = step[7];
+ output[8] = step[8] + step[11];
+ output[9] = step[9] + step[10];
+ output[10] = step[9] - step[10];
+ output[11] = step[8] - step[11];
+ output[12] = step[15] - step[12];
+ output[13] = step[14] - step[13];
+ output[14] = step[14] + step[13];
+ output[15] = step[15] + step[12];
+
+ range_check(output, 16, 16);
+
+ // stage 4
+ temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
+ step[0] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
+ step[1] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
+ step[2] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
+ step[3] = (tran_low_t)fdct_round_shift(temp);
+ step[4] = output[4] + output[5];
+ step[5] = output[4] - output[5];
+ step[6] = output[7] - output[6];
+ step[7] = output[7] + output[6];
+ step[8] = output[8];
+ temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64;
+ step[9] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64;
+ step[10] = (tran_low_t)fdct_round_shift(temp);
+ step[11] = output[11];
+ step[12] = output[12];
+ temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64;
+ step[13] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[14] * cospi_8_64 + output[9] * cospi_24_64;
+ step[14] = (tran_low_t)fdct_round_shift(temp);
+ step[15] = output[15];
+
+ range_check(step, 16, 16);
+
+ // stage 5
+ output[0] = step[0];
+ output[1] = step[1];
+ output[2] = step[2];
+ output[3] = step[3];
+ temp = step[4] * cospi_28_64 + step[7] * cospi_4_64;
+ output[4] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[5] * cospi_12_64 + step[6] * cospi_20_64;
+ output[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64;
+ output[6] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64;
+ output[7] = (tran_low_t)fdct_round_shift(temp);
+ output[8] = step[8] + step[9];
+ output[9] = step[8] - step[9];
+ output[10] = step[11] - step[10];
+ output[11] = step[11] + step[10];
+ output[12] = step[12] + step[13];
+ output[13] = step[12] - step[13];
+ output[14] = step[15] - step[14];
+ output[15] = step[15] + step[14];
+
+ range_check(output, 16, 16);
+
+ // stage 6
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ step[4] = output[4];
+ step[5] = output[5];
+ step[6] = output[6];
+ step[7] = output[7];
+ temp = output[8] * cospi_30_64 + output[15] * cospi_2_64;
+ step[8] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[9] * cospi_14_64 + output[14] * cospi_18_64;
+ step[9] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[10] * cospi_22_64 + output[13] * cospi_10_64;
+ step[10] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[11] * cospi_6_64 + output[12] * cospi_26_64;
+ step[11] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64;
+ step[12] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64;
+ step[13] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64;
+ step[14] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64;
+ step[15] = (tran_low_t)fdct_round_shift(temp);
+
+ range_check(step, 16, 16);
+
+ // stage 7
+ output[0] = step[0];
+ output[1] = step[8];
+ output[2] = step[4];
+ output[3] = step[12];
+ output[4] = step[2];
+ output[5] = step[10];
+ output[6] = step[6];
+ output[7] = step[14];
+ output[8] = step[1];
+ output[9] = step[9];
+ output[10] = step[5];
+ output[11] = step[13];
+ output[12] = step[3];
+ output[13] = step[11];
+ output[14] = step[7];
+ output[15] = step[15];
+
+ range_check(output, 16, 16);
+}
+
+static void fdct32(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t temp;
+ tran_low_t step[32];
+
+ // stage 0
+ range_check(input, 32, 14);
+
+ // stage 1
+ output[0] = input[0] + input[31];
+ output[1] = input[1] + input[30];
+ output[2] = input[2] + input[29];
+ output[3] = input[3] + input[28];
+ output[4] = input[4] + input[27];
+ output[5] = input[5] + input[26];
+ output[6] = input[6] + input[25];
+ output[7] = input[7] + input[24];
+ output[8] = input[8] + input[23];
+ output[9] = input[9] + input[22];
+ output[10] = input[10] + input[21];
+ output[11] = input[11] + input[20];
+ output[12] = input[12] + input[19];
+ output[13] = input[13] + input[18];
+ output[14] = input[14] + input[17];
+ output[15] = input[15] + input[16];
+ output[16] = input[15] - input[16];
+ output[17] = input[14] - input[17];
+ output[18] = input[13] - input[18];
+ output[19] = input[12] - input[19];
+ output[20] = input[11] - input[20];
+ output[21] = input[10] - input[21];
+ output[22] = input[9] - input[22];
+ output[23] = input[8] - input[23];
+ output[24] = input[7] - input[24];
+ output[25] = input[6] - input[25];
+ output[26] = input[5] - input[26];
+ output[27] = input[4] - input[27];
+ output[28] = input[3] - input[28];
+ output[29] = input[2] - input[29];
+ output[30] = input[1] - input[30];
+ output[31] = input[0] - input[31];
+
+ range_check(output, 32, 15);
+
+ // stage 2
+ step[0] = output[0] + output[15];
+ step[1] = output[1] + output[14];
+ step[2] = output[2] + output[13];
+ step[3] = output[3] + output[12];
+ step[4] = output[4] + output[11];
+ step[5] = output[5] + output[10];
+ step[6] = output[6] + output[9];
+ step[7] = output[7] + output[8];
+ step[8] = output[7] - output[8];
+ step[9] = output[6] - output[9];
+ step[10] = output[5] - output[10];
+ step[11] = output[4] - output[11];
+ step[12] = output[3] - output[12];
+ step[13] = output[2] - output[13];
+ step[14] = output[1] - output[14];
+ step[15] = output[0] - output[15];
+ step[16] = output[16];
+ step[17] = output[17];
+ step[18] = output[18];
+ step[19] = output[19];
+ temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64;
+ step[20] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64;
+ step[21] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64;
+ step[22] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64;
+ step[23] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[24] * cospi_16_64 + output[23] * cospi_16_64;
+ step[24] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[25] * cospi_16_64 + output[22] * cospi_16_64;
+ step[25] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[26] * cospi_16_64 + output[21] * cospi_16_64;
+ step[26] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[27] * cospi_16_64 + output[20] * cospi_16_64;
+ step[27] = (tran_low_t)fdct_round_shift(temp);
+ step[28] = output[28];
+ step[29] = output[29];
+ step[30] = output[30];
+ step[31] = output[31];
+
+ range_check(step, 32, 16);
+
+ // stage 3
+ output[0] = step[0] + step[7];
+ output[1] = step[1] + step[6];
+ output[2] = step[2] + step[5];
+ output[3] = step[3] + step[4];
+ output[4] = step[3] - step[4];
+ output[5] = step[2] - step[5];
+ output[6] = step[1] - step[6];
+ output[7] = step[0] - step[7];
+ output[8] = step[8];
+ output[9] = step[9];
+ temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64;
+ output[10] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64;
+ output[11] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[12] * cospi_16_64 + step[11] * cospi_16_64;
+ output[12] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[13] * cospi_16_64 + step[10] * cospi_16_64;
+ output[13] = (tran_low_t)fdct_round_shift(temp);
+ output[14] = step[14];
+ output[15] = step[15];
+ output[16] = step[16] + step[23];
+ output[17] = step[17] + step[22];
+ output[18] = step[18] + step[21];
+ output[19] = step[19] + step[20];
+ output[20] = step[19] - step[20];
+ output[21] = step[18] - step[21];
+ output[22] = step[17] - step[22];
+ output[23] = step[16] - step[23];
+ output[24] = step[31] - step[24];
+ output[25] = step[30] - step[25];
+ output[26] = step[29] - step[26];
+ output[27] = step[28] - step[27];
+ output[28] = step[28] + step[27];
+ output[29] = step[29] + step[26];
+ output[30] = step[30] + step[25];
+ output[31] = step[31] + step[24];
+
+ range_check(output, 32, 17);
+
+ // stage 4
+ step[0] = output[0] + output[3];
+ step[1] = output[1] + output[2];
+ step[2] = output[1] - output[2];
+ step[3] = output[0] - output[3];
+ step[4] = output[4];
+ temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
+ step[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
+ step[6] = (tran_low_t)fdct_round_shift(temp);
+ step[7] = output[7];
+ step[8] = output[8] + output[11];
+ step[9] = output[9] + output[10];
+ step[10] = output[9] - output[10];
+ step[11] = output[8] - output[11];
+ step[12] = output[15] - output[12];
+ step[13] = output[14] - output[13];
+ step[14] = output[14] + output[13];
+ step[15] = output[15] + output[12];
+ step[16] = output[16];
+ step[17] = output[17];
+ temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64;
+ step[18] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64;
+ step[19] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64;
+ step[20] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64;
+ step[21] = (tran_low_t)fdct_round_shift(temp);
+ step[22] = output[22];
+ step[23] = output[23];
+ step[24] = output[24];
+ step[25] = output[25];
+ temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64;
+ step[26] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64;
+ step[27] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[28] * cospi_8_64 + output[19] * cospi_24_64;
+ step[28] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[29] * cospi_8_64 + output[18] * cospi_24_64;
+ step[29] = (tran_low_t)fdct_round_shift(temp);
+ step[30] = output[30];
+ step[31] = output[31];
+
+ range_check(step, 32, 18);
+
+ // stage 5
+ temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
+ output[0] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
+ output[1] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+ output[2] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
+ output[3] = (tran_low_t)fdct_round_shift(temp);
+ output[4] = step[4] + step[5];
+ output[5] = step[4] - step[5];
+ output[6] = step[7] - step[6];
+ output[7] = step[7] + step[6];
+ output[8] = step[8];
+ temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64;
+ output[9] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64;
+ output[10] = (tran_low_t)fdct_round_shift(temp);
+ output[11] = step[11];
+ output[12] = step[12];
+ temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64;
+ output[13] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[14] * cospi_8_64 + step[9] * cospi_24_64;
+ output[14] = (tran_low_t)fdct_round_shift(temp);
+ output[15] = step[15];
+ output[16] = step[16] + step[19];
+ output[17] = step[17] + step[18];
+ output[18] = step[17] - step[18];
+ output[19] = step[16] - step[19];
+ output[20] = step[23] - step[20];
+ output[21] = step[22] - step[21];
+ output[22] = step[22] + step[21];
+ output[23] = step[23] + step[20];
+ output[24] = step[24] + step[27];
+ output[25] = step[25] + step[26];
+ output[26] = step[25] - step[26];
+ output[27] = step[24] - step[27];
+ output[28] = step[31] - step[28];
+ output[29] = step[30] - step[29];
+ output[30] = step[30] + step[29];
+ output[31] = step[31] + step[28];
+
+ range_check(output, 32, 18);
+
+ // stage 6
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
+ step[4] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
+ step[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
+ step[6] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
+ step[7] = (tran_low_t)fdct_round_shift(temp);
+ step[8] = output[8] + output[9];
+ step[9] = output[8] - output[9];
+ step[10] = output[11] - output[10];
+ step[11] = output[11] + output[10];
+ step[12] = output[12] + output[13];
+ step[13] = output[12] - output[13];
+ step[14] = output[15] - output[14];
+ step[15] = output[15] + output[14];
+ step[16] = output[16];
+ temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64;
+ step[17] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64;
+ step[18] = (tran_low_t)fdct_round_shift(temp);
+ step[19] = output[19];
+ step[20] = output[20];
+ temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64;
+ step[21] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64;
+ step[22] = (tran_low_t)fdct_round_shift(temp);
+ step[23] = output[23];
+ step[24] = output[24];
+ temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64;
+ step[25] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[26] * cospi_20_64 + output[21] * cospi_12_64;
+ step[26] = (tran_low_t)fdct_round_shift(temp);
+ step[27] = output[27];
+ step[28] = output[28];
+ temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64;
+ step[29] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[30] * cospi_4_64 + output[17] * cospi_28_64;
+ step[30] = (tran_low_t)fdct_round_shift(temp);
+ step[31] = output[31];
+
+ range_check(step, 32, 18);
+
+ // stage 7
+ output[0] = step[0];
+ output[1] = step[1];
+ output[2] = step[2];
+ output[3] = step[3];
+ output[4] = step[4];
+ output[5] = step[5];
+ output[6] = step[6];
+ output[7] = step[7];
+ temp = step[8] * cospi_30_64 + step[15] * cospi_2_64;
+ output[8] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[9] * cospi_14_64 + step[14] * cospi_18_64;
+ output[9] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[10] * cospi_22_64 + step[13] * cospi_10_64;
+ output[10] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[11] * cospi_6_64 + step[12] * cospi_26_64;
+ output[11] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64;
+ output[12] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64;
+ output[13] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64;
+ output[14] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64;
+ output[15] = (tran_low_t)fdct_round_shift(temp);
+ output[16] = step[16] + step[17];
+ output[17] = step[16] - step[17];
+ output[18] = step[19] - step[18];
+ output[19] = step[19] + step[18];
+ output[20] = step[20] + step[21];
+ output[21] = step[20] - step[21];
+ output[22] = step[23] - step[22];
+ output[23] = step[23] + step[22];
+ output[24] = step[24] + step[25];
+ output[25] = step[24] - step[25];
+ output[26] = step[27] - step[26];
+ output[27] = step[27] + step[26];
+ output[28] = step[28] + step[29];
+ output[29] = step[28] - step[29];
+ output[30] = step[31] - step[30];
+ output[31] = step[31] + step[30];
+
+ range_check(output, 32, 18);
+
+ // stage 8
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ step[4] = output[4];
+ step[5] = output[5];
+ step[6] = output[6];
+ step[7] = output[7];
+ step[8] = output[8];
+ step[9] = output[9];
+ step[10] = output[10];
+ step[11] = output[11];
+ step[12] = output[12];
+ step[13] = output[13];
+ step[14] = output[14];
+ step[15] = output[15];
+ temp = output[16] * cospi_31_64 + output[31] * cospi_1_64;
+ step[16] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[17] * cospi_15_64 + output[30] * cospi_17_64;
+ step[17] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[18] * cospi_23_64 + output[29] * cospi_9_64;
+ step[18] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[19] * cospi_7_64 + output[28] * cospi_25_64;
+ step[19] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[20] * cospi_27_64 + output[27] * cospi_5_64;
+ step[20] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[21] * cospi_11_64 + output[26] * cospi_21_64;
+ step[21] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[22] * cospi_19_64 + output[25] * cospi_13_64;
+ step[22] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[23] * cospi_3_64 + output[24] * cospi_29_64;
+ step[23] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64;
+ step[24] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64;
+ step[25] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64;
+ step[26] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64;
+ step[27] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64;
+ step[28] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[29] * cospi_23_64 + output[18] * -cospi_9_64;
+ step[29] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64;
+ step[30] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64;
+ step[31] = (tran_low_t)fdct_round_shift(temp);
+
+ range_check(step, 32, 18);
+
+ // stage 9
+ output[0] = step[0];
+ output[1] = step[16];
+ output[2] = step[8];
+ output[3] = step[24];
+ output[4] = step[4];
+ output[5] = step[20];
+ output[6] = step[12];
+ output[7] = step[28];
+ output[8] = step[2];
+ output[9] = step[18];
+ output[10] = step[10];
+ output[11] = step[26];
+ output[12] = step[6];
+ output[13] = step[22];
+ output[14] = step[14];
+ output[15] = step[30];
+ output[16] = step[1];
+ output[17] = step[17];
+ output[18] = step[9];
+ output[19] = step[25];
+ output[20] = step[5];
+ output[21] = step[21];
+ output[22] = step[13];
+ output[23] = step[29];
+ output[24] = step[3];
+ output[25] = step[19];
+ output[26] = step[11];
+ output[27] = step[27];
+ output[28] = step[7];
+ output[29] = step[23];
+ output[30] = step[15];
+ output[31] = step[31];
+
+ range_check(output, 32, 18);
+}
+
+#ifndef AV1_DCT_GTEST
+
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t x0, x1, x2, x3;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_4_9 * x0;
+ s2 = sinpi_2_9 * x1;
+ s3 = sinpi_1_9 * x1;
+ s4 = sinpi_3_9 * x2;
+ s5 = sinpi_4_9 * x3;
+ s6 = sinpi_2_9 * x3;
+ s7 = x0 + x1 - x3;
+
+ x0 = s0 + s2 + s5;
+ x1 = sinpi_3_9 * s7;
+ x2 = s1 - s3 + s6;
+ x3 = s4;
+
+ s0 = x0 + x3;
+ s1 = x1;
+ s2 = x2 - x3;
+ s3 = x2 - x0 + x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ output[0] = (tran_low_t)fdct_round_shift(s0);
+ output[1] = (tran_low_t)fdct_round_shift(s1);
+ output[2] = (tran_low_t)fdct_round_shift(s2);
+ output[3] = (tran_low_t)fdct_round_shift(s3);
+}
+
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = s0 + s4;
+ x1 = s1 + s5;
+ x2 = s2 + s6;
+ x3 = s3 + s7;
+ x4 = fdct_round_shift(s0 - s4);
+ x5 = fdct_round_shift(s1 - s5);
+ x6 = fdct_round_shift(s2 - s6);
+ x7 = fdct_round_shift(s3 - s7);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = fdct_round_shift(s0 + s2);
+ x1 = fdct_round_shift(s1 + s3);
+ x2 = fdct_round_shift(s0 - s2);
+ x3 = fdct_round_shift(s1 - s3);
+ x4 = fdct_round_shift(s4 + s6);
+ x5 = fdct_round_shift(s5 + s7);
+ x6 = fdct_round_shift(s4 - s6);
+ x7 = fdct_round_shift(s5 - s7);
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = fdct_round_shift(s2);
+ x3 = fdct_round_shift(s3);
+ x6 = fdct_round_shift(s6);
+ x7 = fdct_round_shift(s7);
+
+ output[0] = (tran_low_t)x0;
+ output[1] = (tran_low_t)-x4;
+ output[2] = (tran_low_t)x6;
+ output[3] = (tran_low_t)-x2;
+ output[4] = (tran_low_t)x3;
+ output[5] = (tran_low_t)-x7;
+ output[6] = (tran_low_t)x5;
+ output[7] = (tran_low_t)-x1;
+}
+
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = s0 + s8;
+ x1 = s1 + s9;
+ x2 = s2 + s10;
+ x3 = s3 + s11;
+ x4 = s4 + s12;
+ x5 = s5 + s13;
+ x6 = s6 + s14;
+ x7 = s7 + s15;
+
+ x8 = fdct_round_shift(s0 - s8);
+ x9 = fdct_round_shift(s1 - s9);
+ x10 = fdct_round_shift(s2 - s10);
+ x11 = fdct_round_shift(s3 - s11);
+ x12 = fdct_round_shift(s4 - s12);
+ x13 = fdct_round_shift(s5 - s13);
+ x14 = fdct_round_shift(s6 - s14);
+ x15 = fdct_round_shift(s7 - s15);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = s0 + s4;
+ x1 = s1 + s5;
+ x2 = s2 + s6;
+ x3 = s3 + s7;
+ x4 = fdct_round_shift(s0 - s4);
+ x5 = fdct_round_shift(s1 - s5);
+ x6 = fdct_round_shift(s2 - s6);
+ x7 = fdct_round_shift(s3 - s7);
+
+ x8 = s8 + s12;
+ x9 = s9 + s13;
+ x10 = s10 + s14;
+ x11 = s11 + s15;
+ x12 = fdct_round_shift(s8 - s12);
+ x13 = fdct_round_shift(s9 - s13);
+ x14 = fdct_round_shift(s10 - s14);
+ x15 = fdct_round_shift(s11 - s15);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = fdct_round_shift(s0 + s2);
+ x1 = fdct_round_shift(s1 + s3);
+ x2 = fdct_round_shift(s0 - s2);
+ x3 = fdct_round_shift(s1 - s3);
+
+ x4 = fdct_round_shift(s4 + s6);
+ x5 = fdct_round_shift(s5 + s7);
+ x6 = fdct_round_shift(s4 - s6);
+ x7 = fdct_round_shift(s5 - s7);
+
+ x8 = fdct_round_shift(s8 + s10);
+ x9 = fdct_round_shift(s9 + s11);
+ x10 = fdct_round_shift(s8 - s10);
+ x11 = fdct_round_shift(s9 - s11);
+
+ x12 = fdct_round_shift(s12 + s14);
+ x13 = fdct_round_shift(s13 + s15);
+ x14 = fdct_round_shift(s12 - s14);
+ x15 = fdct_round_shift(s13 - s15);
+
+ // stage 4
+ s2 = (-cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (-cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = fdct_round_shift(s2);
+ x3 = fdct_round_shift(s3);
+ x6 = fdct_round_shift(s6);
+ x7 = fdct_round_shift(s7);
+ x10 = fdct_round_shift(s10);
+ x11 = fdct_round_shift(s11);
+ x14 = fdct_round_shift(s14);
+ x15 = fdct_round_shift(s15);
+
+ output[0] = (tran_low_t)x0;
+ output[1] = (tran_low_t)-x8;
+ output[2] = (tran_low_t)x12;
+ output[3] = (tran_low_t)-x4;
+ output[4] = (tran_low_t)x6;
+ output[5] = (tran_low_t)x14;
+ output[6] = (tran_low_t)x10;
+ output[7] = (tran_low_t)x2;
+ output[8] = (tran_low_t)x3;
+ output[9] = (tran_low_t)x11;
+ output[10] = (tran_low_t)x15;
+ output[11] = (tran_low_t)x7;
+ output[12] = (tran_low_t)x5;
+ output[13] = (tran_low_t)-x13;
+ output[14] = (tran_low_t)x9;
+ output[15] = (tran_low_t)-x1;
+}
+
+// For use in lieu of ADST
+static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[16];
+ for (i = 0; i < 16; ++i) {
+ output[16 + i] = input[i] * 4;
+ }
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
+ }
+ fdct16(inputhalf, output);
+ // Note overall scaling factor is 4 times orthogonal
+}
+
+#if CONFIG_EXT_TX
+static void fidtx4(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 4; ++i)
+ output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2);
+}
+
+static void fidtx8(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 8; ++i) output[i] = input[i] * 2;
+}
+
+static void fidtx16(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 16; ++i)
+ output[i] = (tran_low_t)fdct_round_shift(input[i] * 2 * Sqrt2);
+}
+
+static void fidtx32(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
+}
+
+static void copy_block(const int16_t *src, int src_stride, int l, int w,
+ int16_t *dest, int dest_stride) {
+ int i;
+ for (i = 0; i < l; ++i) {
+ memcpy(dest + dest_stride * i, src + src_stride * i, w * sizeof(int16_t));
+ }
+}
+
+static void fliplr(int16_t *dest, int stride, int l, int w) {
+ int i, j;
+ for (i = 0; i < l; ++i) {
+ for (j = 0; j < w / 2; ++j) {
+ const int16_t tmp = dest[i * stride + j];
+ dest[i * stride + j] = dest[i * stride + w - 1 - j];
+ dest[i * stride + w - 1 - j] = tmp;
+ }
+ }
+}
+
+static void flipud(int16_t *dest, int stride, int l, int w) {
+ int i, j;
+ for (j = 0; j < w; ++j) {
+ for (i = 0; i < l / 2; ++i) {
+ const int16_t tmp = dest[i * stride + j];
+ dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
+ dest[(l - 1 - i) * stride + j] = tmp;
+ }
+ }
+}
+
+static void fliplrud(int16_t *dest, int stride, int l, int w) {
+ int i, j;
+ for (i = 0; i < l / 2; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int16_t tmp = dest[i * stride + j];
+ dest[i * stride + j] = dest[(l - 1 - i) * stride + w - 1 - j];
+ dest[(l - 1 - i) * stride + w - 1 - j] = tmp;
+ }
+ }
+}
+
+static void copy_fliplr(const int16_t *src, int src_stride, int l, int w,
+ int16_t *dest, int dest_stride) {
+ copy_block(src, src_stride, l, w, dest, dest_stride);
+ fliplr(dest, dest_stride, l, w);
+}
+
+static void copy_flipud(const int16_t *src, int src_stride, int l, int w,
+ int16_t *dest, int dest_stride) {
+ copy_block(src, src_stride, l, w, dest, dest_stride);
+ flipud(dest, dest_stride, l, w);
+}
+
+static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w,
+ int16_t *dest, int dest_stride) {
+ copy_block(src, src_stride, l, w, dest, dest_stride);
+ fliplrud(dest, dest_stride, l, w);
+}
+
+static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w,
+ int16_t *buff, int tx_type) {
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case IDTX:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST: break;
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST:
+ copy_flipud(*src, *src_stride, l, w, buff, w);
+ *src = buff;
+ *src_stride = w;
+ break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ copy_fliplr(*src, *src_stride, l, w, buff, w);
+ *src = buff;
+ *src_stride = w;
+ break;
+ case FLIPADST_FLIPADST:
+ copy_fliplrud(*src, *src_stride, l, w, buff, w);
+ *src = buff;
+ *src_stride = w;
+ break;
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_EXT_TX
+
+void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ if (tx_type == DCT_DCT) {
+ aom_fdct4x4_c(input, output, stride);
+ } else {
+ static const transform_2d FHT[] = {
+ { fdct4, fdct4 }, // DCT_DCT
+ { fadst4, fdct4 }, // ADST_DCT
+ { fdct4, fadst4 }, // DCT_ADST
+ { fadst4, fadst4 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst4, fdct4 }, // FLIPADST_DCT
+ { fdct4, fadst4 }, // DCT_FLIPADST
+ { fadst4, fadst4 }, // FLIPADST_FLIPADST
+ { fadst4, fadst4 }, // ADST_FLIPADST
+ { fadst4, fadst4 }, // FLIPADST_ADST
+ { fidtx4, fidtx4 }, // IDTX
+ { fdct4, fidtx4 }, // V_DCT
+ { fidtx4, fdct4 }, // H_DCT
+ { fadst4, fidtx4 }, // V_ADST
+ { fidtx4, fadst4 }, // H_ADST
+ { fadst4, fidtx4 }, // V_FLIPADST
+ { fidtx4, fadst4 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[4 * 4];
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+
+#if CONFIG_EXT_TX
+ int16_t flipped_input[4 * 4];
+ maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
+ if (i == 0 && temp_in[0]) temp_in[0] += 1;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
+ }
+ }
+}
+
+void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct8, fdct4 }, // DCT_DCT
+ { fadst8, fdct4 }, // ADST_DCT
+ { fdct8, fadst4 }, // DCT_ADST
+ { fadst8, fadst4 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst8, fdct4 }, // FLIPADST_DCT
+ { fdct8, fadst4 }, // DCT_FLIPADST
+ { fadst8, fadst4 }, // FLIPADST_FLIPADST
+ { fadst8, fadst4 }, // ADST_FLIPADST
+ { fadst8, fadst4 }, // FLIPADST_ADST
+ { fidtx8, fidtx4 }, // IDTX
+ { fdct8, fidtx4 }, // V_DCT
+ { fidtx8, fdct4 }, // H_DCT
+ { fadst8, fidtx4 }, // V_ADST
+ { fidtx8, fadst4 }, // H_ADST
+ { fadst8, fidtx4 }, // V_FLIPADST
+ { fidtx8, fadst4 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 4;
+ const int n2 = 8;
+ tran_low_t out[8 * 4];
+ tran_low_t temp_in[8], temp_out[8];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[8 * 4];
+ maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
+
+ // Rows
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
+ }
+
+ // Columns
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n2; ++j)
+ output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+
+void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct4, fdct8 }, // DCT_DCT
+ { fadst4, fdct8 }, // ADST_DCT
+ { fdct4, fadst8 }, // DCT_ADST
+ { fadst4, fadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst4, fdct8 }, // FLIPADST_DCT
+ { fdct4, fadst8 }, // DCT_FLIPADST
+ { fadst4, fadst8 }, // FLIPADST_FLIPADST
+ { fadst4, fadst8 }, // ADST_FLIPADST
+ { fadst4, fadst8 }, // FLIPADST_ADST
+ { fidtx4, fidtx8 }, // IDTX
+ { fdct4, fidtx8 }, // V_DCT
+ { fidtx4, fdct8 }, // H_DCT
+ { fadst4, fidtx8 }, // V_ADST
+ { fidtx4, fadst8 }, // H_ADST
+ { fadst4, fidtx8 }, // V_FLIPADST
+ { fidtx4, fadst8 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 4;
+ const int n2 = 8;
+ tran_low_t out[8 * 4];
+ tran_low_t temp_in[8], temp_out[8];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[8 * 4];
+ maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n2; ++j)
+ output[j + i * n2] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+
+void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct16, fdct4 }, // DCT_DCT
+ { fadst16, fdct4 }, // ADST_DCT
+ { fdct16, fadst4 }, // DCT_ADST
+ { fadst16, fadst4 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst16, fdct4 }, // FLIPADST_DCT
+ { fdct16, fadst4 }, // DCT_FLIPADST
+ { fadst16, fadst4 }, // FLIPADST_FLIPADST
+ { fadst16, fadst4 }, // ADST_FLIPADST
+ { fadst16, fadst4 }, // FLIPADST_ADST
+ { fidtx16, fidtx4 }, // IDTX
+ { fdct16, fidtx4 }, // V_DCT
+ { fidtx16, fdct4 }, // H_DCT
+ { fadst16, fidtx4 }, // V_ADST
+ { fidtx16, fadst4 }, // H_ADST
+ { fadst16, fidtx4 }, // V_FLIPADST
+ { fidtx16, fadst4 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 4;
+ const int n4 = 16;
+ tran_low_t out[16 * 4];
+ tran_low_t temp_in[16], temp_out[16];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 4];
+ maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
+#endif
+
+ // Rows
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
+ }
+
+ // Columns
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n4; ++j)
+ output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+
+void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct4, fdct16 }, // DCT_DCT
+ { fadst4, fdct16 }, // ADST_DCT
+ { fdct4, fadst16 }, // DCT_ADST
+ { fadst4, fadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst4, fdct16 }, // FLIPADST_DCT
+ { fdct4, fadst16 }, // DCT_FLIPADST
+ { fadst4, fadst16 }, // FLIPADST_FLIPADST
+ { fadst4, fadst16 }, // ADST_FLIPADST
+ { fadst4, fadst16 }, // FLIPADST_ADST
+ { fidtx4, fidtx16 }, // IDTX
+ { fdct4, fidtx16 }, // V_DCT
+ { fidtx4, fdct16 }, // H_DCT
+ { fadst4, fidtx16 }, // V_ADST
+ { fidtx4, fadst16 }, // H_ADST
+ { fadst4, fidtx16 }, // V_FLIPADST
+ { fidtx4, fadst16 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 4;
+ const int n4 = 16;
+ tran_low_t out[16 * 4];
+ tran_low_t temp_in[16], temp_out[16];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 4];
+ maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n4; ++j)
+ output[j + i * n4] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+
+void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct16, fdct8 }, // DCT_DCT
+ { fadst16, fdct8 }, // ADST_DCT
+ { fdct16, fadst8 }, // DCT_ADST
+ { fadst16, fadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst16, fdct8 }, // FLIPADST_DCT
+ { fdct16, fadst8 }, // DCT_FLIPADST
+ { fadst16, fadst8 }, // FLIPADST_FLIPADST
+ { fadst16, fadst8 }, // ADST_FLIPADST
+ { fadst16, fadst8 }, // FLIPADST_ADST
+ { fidtx16, fidtx8 }, // IDTX
+ { fdct16, fidtx8 }, // V_DCT
+ { fidtx16, fdct8 }, // H_DCT
+ { fadst16, fidtx8 }, // V_ADST
+ { fidtx16, fadst8 }, // H_ADST
+ { fadst16, fidtx8 }, // V_FLIPADST
+ { fidtx16, fadst8 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 8;
+ const int n2 = 16;
+ tran_low_t out[16 * 8];
+ tran_low_t temp_in[16], temp_out[16];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 8];
+ maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
+
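+ // The first pass drops 2 bits with signed rounding so that the intermediate
+ // coefficients stay within tran_low_t range for the 16-point second pass.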
+ // Rows
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j)
+ out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ }
+
+ // Columns
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+
+void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct8, fdct16 }, // DCT_DCT
+ { fadst8, fdct16 }, // ADST_DCT
+ { fdct8, fadst16 }, // DCT_ADST
+ { fadst8, fadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst8, fdct16 }, // FLIPADST_DCT
+ { fdct8, fadst16 }, // DCT_FLIPADST
+ { fadst8, fadst16 }, // FLIPADST_FLIPADST
+ { fadst8, fadst16 }, // ADST_FLIPADST
+ { fadst8, fadst16 }, // FLIPADST_ADST
+ { fidtx8, fidtx16 }, // IDTX
+ { fdct8, fidtx16 }, // V_DCT
+ { fidtx8, fdct16 }, // H_DCT
+ { fadst8, fidtx16 }, // V_ADST
+ { fidtx8, fadst16 }, // H_ADST
+ { fadst8, fidtx16 }, // V_FLIPADST
+ { fidtx8, fadst16 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 8;
+ const int n2 = 16;
+ tran_low_t out[16 * 8];
+ tran_low_t temp_in[16], temp_out[16];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 8];
+ maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j)
+ out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ }
+
+ // Rows
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+
+void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct32, fdct8 }, // DCT_DCT
+ { fhalfright32, fdct8 }, // ADST_DCT
+ { fdct32, fadst8 }, // DCT_ADST
+ { fhalfright32, fadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fhalfright32, fdct8 }, // FLIPADST_DCT
+ { fdct32, fadst8 }, // DCT_FLIPADST
+ { fhalfright32, fadst8 }, // FLIPADST_FLIPADST
+ { fhalfright32, fadst8 }, // ADST_FLIPADST
+ { fhalfright32, fadst8 }, // FLIPADST_ADST
+ { fidtx32, fidtx8 }, // IDTX
+ { fdct32, fidtx8 }, // V_DCT
+ { fidtx32, fdct8 }, // H_DCT
+ { fhalfright32, fidtx8 }, // V_ADST
+ { fidtx32, fadst8 }, // H_ADST
+ { fhalfright32, fidtx8 }, // V_FLIPADST
+ { fidtx32, fadst8 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 8;
+ const int n4 = 32;
+ tran_low_t out[32 * 8];
+ tran_low_t temp_in[32], temp_out[32];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 8];
+ maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
+#endif
+
+ // Rows
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
+ }
+
+ // Columns
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n4; ++j)
+ output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ }
+ // Note: overall scale factor of transform is 4 times unitary
+}
+
+void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct8, fdct32 }, // DCT_DCT
+ { fadst8, fdct32 }, // ADST_DCT
+ { fdct8, fhalfright32 }, // DCT_ADST
+ { fadst8, fhalfright32 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst8, fdct32 }, // FLIPADST_DCT
+ { fdct8, fhalfright32 }, // DCT_FLIPADST
+ { fadst8, fhalfright32 }, // FLIPADST_FLIPADST
+ { fadst8, fhalfright32 }, // ADST_FLIPADST
+ { fadst8, fhalfright32 }, // FLIPADST_ADST
+ { fidtx8, fidtx32 }, // IDTX
+ { fdct8, fidtx32 }, // V_DCT
+ { fidtx8, fdct32 }, // H_DCT
+ { fadst8, fidtx32 }, // V_ADST
+ { fidtx8, fhalfright32 }, // H_ADST
+ { fadst8, fidtx32 }, // V_FLIPADST
+ { fidtx8, fhalfright32 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 8;
+ const int n4 = 32;
+ tran_low_t out[32 * 8];
+ tran_low_t temp_in[32], temp_out[32];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 8];
+ maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n4; ++j)
+ output[j + i * n4] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ }
+ // Note: overall scale factor of transform is 4 times unitary
+}
+
+void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct32, fdct16 }, // DCT_DCT
+ { fhalfright32, fdct16 }, // ADST_DCT
+ { fdct32, fadst16 }, // DCT_ADST
+ { fhalfright32, fadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fhalfright32, fdct16 }, // FLIPADST_DCT
+ { fdct32, fadst16 }, // DCT_FLIPADST
+ { fhalfright32, fadst16 }, // FLIPADST_FLIPADST
+ { fhalfright32, fadst16 }, // ADST_FLIPADST
+ { fhalfright32, fadst16 }, // FLIPADST_ADST
+ { fidtx32, fidtx16 }, // IDTX
+ { fdct32, fidtx16 }, // V_DCT
+ { fidtx32, fdct16 }, // H_DCT
+ { fhalfright32, fidtx16 }, // V_ADST
+ { fidtx32, fadst16 }, // H_ADST
+ { fhalfright32, fidtx16 }, // V_FLIPADST
+ { fidtx32, fadst16 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 16;
+ const int n2 = 32;
+ tran_low_t out[32 * 16];
+ tran_low_t temp_in[32], temp_out[32];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 16];
+ maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
+
+ // Rows
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j)
+ out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
+ }
+
+ // Columns
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
+ }
+ // Note: overall scale factor of transform is 4 times unitary
+}
+
+void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct16, fdct32 }, // DCT_DCT
+ { fadst16, fdct32 }, // ADST_DCT
+ { fdct16, fhalfright32 }, // DCT_ADST
+ { fadst16, fhalfright32 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst16, fdct32 }, // FLIPADST_DCT
+ { fdct16, fhalfright32 }, // DCT_FLIPADST
+ { fadst16, fhalfright32 }, // FLIPADST_FLIPADST
+ { fadst16, fhalfright32 }, // ADST_FLIPADST
+ { fadst16, fhalfright32 }, // FLIPADST_ADST
+ { fidtx16, fidtx32 }, // IDTX
+ { fdct16, fidtx32 }, // V_DCT
+ { fidtx16, fdct32 }, // H_DCT
+ { fadst16, fidtx32 }, // V_ADST
+ { fidtx16, fhalfright32 }, // H_ADST
+ { fadst16, fidtx32 }, // V_FLIPADST
+ { fidtx16, fhalfright32 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 16;
+ const int n2 = 32;
+ tran_low_t out[32 * 16];
+ tran_low_t temp_in[32], temp_out[32];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 16];
+ maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j)
+ out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
+ }
+
+ // Rows
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
+ }
+ // Note: overall scale factor of transform is 4 times unitary
+}
+
+void av1_fdct8x8_quant_c(const int16_t *input, int stride,
+ tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
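+ // Fused 8x8 forward DCT and quantization: the column transform is computed
+ // inline below, the row transform reuses fdct8(), and the coefficients are
+ // then scalar-quantized in scan order, all in one call.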
+ int eob = -1;
+
+ int i, j;
+ tran_low_t intermediate[64];
+
+ // Transform columns
+ {
+ tran_low_t *output = intermediate;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ for (i = 0; i < 8; i++) {
+ // stage 1
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
+
+ // stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
+ input++;
+ output++;
+ }
+ }
+
+ // Rows
+ for (i = 0; i < 8; ++i) {
+ fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
+ for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2;
+ }
+
+ // TODO(jingning): Decide whether these arguments are still needed once the
+ // quantization process is finalized.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Quantization pass: quantize every coefficient in scan order, tracking
+ // the index of the last nonzero quantized value in eob.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+ const qm_val_t wt = qm_ptr[rc];
+ const qm_val_t iwt = iqm_ptr[rc];
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+#endif
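+ // coeff_sign is 0 for non-negative and -1 for negative coefficients
+ // (arithmetic shift of the sign bit); (x ^ sign) - sign is then a
+ // branchless two's-complement abs().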
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ int tmp32;
+#if CONFIG_AOM_QM
+ tmp32 = (int)((tmp * quant_ptr[rc != 0] * wt) >> (16 + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+#else
+ tmp32 = (int)((tmp * quant_ptr[rc != 0]) >> 16);
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+#endif
+
+ if (tmp32) eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ if (tx_type == DCT_DCT) {
+ aom_fdct8x8_c(input, output, stride);
+ } else {
+ static const transform_2d FHT[] = {
+ { fdct8, fdct8 }, // DCT_DCT
+ { fadst8, fdct8 }, // ADST_DCT
+ { fdct8, fadst8 }, // DCT_ADST
+ { fadst8, fadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst8, fdct8 }, // FLIPADST_DCT
+ { fdct8, fadst8 }, // DCT_FLIPADST
+ { fadst8, fadst8 }, // FLIPADST_FLIPADST
+ { fadst8, fadst8 }, // ADST_FLIPADST
+ { fadst8, fadst8 }, // FLIPADST_ADST
+ { fidtx8, fidtx8 }, // IDTX
+ { fdct8, fidtx8 }, // V_DCT
+ { fidtx8, fdct8 }, // H_DCT
+ { fadst8, fidtx8 }, // V_ADST
+ { fidtx8, fadst8 }, // H_ADST
+ { fadst8, fidtx8 }, // V_FLIPADST
+ { fidtx8, fadst8 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[64];
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+
+#if CONFIG_EXT_TX
+ int16_t flipped_input[8 * 8];
+ maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 8; ++j)
+ output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ }
+}
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+ pixel. */
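+/* The transform is exactly invertible in integer arithmetic, which is why a
+ WHT is used instead of a DCT for 4x4 blocks when coding losslessly. */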
+void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ int i;
+ tran_high_t a1, b1, c1, d1, e1;
+ const int16_t *ip_pass0 = input;
+ const tran_low_t *ip = NULL;
+ tran_low_t *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip_pass0[0 * stride];
+ b1 = ip_pass0[1 * stride];
+ c1 = ip_pass0[2 * stride];
+ d1 = ip_pass0[3 * stride];
+
+ a1 += b1;
+ d1 = d1 - c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[0] = (tran_low_t)a1;
+ op[4] = (tran_low_t)c1;
+ op[8] = (tran_low_t)d1;
+ op[12] = (tran_low_t)b1;
+
+ ip_pass0++;
+ op++;
+ }
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0];
+ b1 = ip[1];
+ c1 = ip[2];
+ d1 = ip[3];
+
+ a1 += b1;
+ d1 -= c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+ op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+ op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+ op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+ ip += 4;
+ op += 4;
+ }
+}
+
+void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct16, fdct16 }, // DCT_DCT
+ { fadst16, fdct16 }, // ADST_DCT
+ { fdct16, fadst16 }, // DCT_ADST
+ { fadst16, fadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst16, fdct16 }, // FLIPADST_DCT
+ { fdct16, fadst16 }, // DCT_FLIPADST
+ { fadst16, fadst16 }, // FLIPADST_FLIPADST
+ { fadst16, fadst16 }, // ADST_FLIPADST
+ { fadst16, fadst16 }, // FLIPADST_ADST
+ { fidtx16, fidtx16 }, // IDTX
+ { fdct16, fidtx16 }, // V_DCT
+ { fidtx16, fdct16 }, // H_DCT
+ { fadst16, fidtx16 }, // V_ADST
+ { fidtx16, fadst16 }, // H_ADST
+ { fadst16, fidtx16 }, // V_FLIPADST
+ { fidtx16, fadst16 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[256];
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 16];
+ maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
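+// The high-bitdepth C variants below are thin wrappers: the plain C
+// transforms already take 16-bit input and produce tran_low_t output, so the
+// same code serves all bit depths.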
+void av1_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht4x4_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht4x8_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht8x4_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht8x16_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht16x8_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht16x32_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht32x16_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht4x16_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht16x4_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht8x32_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht32x8_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht8x8_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ av1_fwht4x4_c(input, output, stride);
+}
+
+void av1_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht16x16_c(input, output, stride, tx_type);
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct32, fdct32 }, // DCT_DCT
+#if CONFIG_EXT_TX
+ { fhalfright32, fdct32 }, // ADST_DCT
+ { fdct32, fhalfright32 }, // DCT_ADST
+ { fhalfright32, fhalfright32 }, // ADST_ADST
+ { fhalfright32, fdct32 }, // FLIPADST_DCT
+ { fdct32, fhalfright32 }, // DCT_FLIPADST
+ { fhalfright32, fhalfright32 }, // FLIPADST_FLIPADST
+ { fhalfright32, fhalfright32 }, // ADST_FLIPADST
+ { fhalfright32, fhalfright32 }, // FLIPADST_ADST
+ { fidtx32, fidtx32 }, // IDTX
+ { fdct32, fidtx32 }, // V_DCT
+ { fidtx32, fdct32 }, // H_DCT
+ { fhalfright32, fidtx32 }, // V_ADST
+ { fidtx32, fhalfright32 }, // H_ADST
+ { fhalfright32, fidtx32 }, // V_FLIPADST
+ { fidtx32, fhalfright32 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[1024];
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 32];
+ maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
+ }
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j];
+ }
+}
+
+#if CONFIG_TX64X64
+#if CONFIG_EXT_TX
+static void fidtx64(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 64; ++i)
+ output[i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
+}
+
+// For use in lieu of ADST
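+// The first 32 inputs are scaled by roughly sqrt(2) into the upper half of
+// the output, and a 32-point DCT of the remaining 32 inputs fills the lower
+// half, serving as a cheaper stand-in for a full 64-point ADST.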
+static void fhalfright64(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[32];
+ for (i = 0; i < 32; ++i) {
+ output[32 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
+ }
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 32; ++i) {
+ inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 32] * Sqrt2);
+ }
+ fdct32(inputhalf, output);
+ // Note: overall scale factor of transform is 2 times unitary
+}
+#endif // CONFIG_EXT_TX
+
+static void fdct64_col(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_fdct64_new(in, out, fwd_cos_bit_col_dct_dct_64,
+ fwd_stage_range_col_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void fdct64_row(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_fdct64_new(in, out, fwd_cos_bit_row_dct_dct_64,
+ fwd_stage_range_row_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct64_col, fdct64_row }, // DCT_DCT
+#if CONFIG_EXT_TX
+ { fhalfright64, fdct64_row }, // ADST_DCT
+ { fdct64_col, fhalfright64 }, // DCT_ADST
+ { fhalfright64, fhalfright64 }, // ADST_ADST
+ { fhalfright64, fdct64_row }, // FLIPADST_DCT
+ { fdct64_col, fhalfright64 }, // DCT_FLIPADST
+ { fhalfright64, fhalfright64 }, // FLIPADST_FLIPADST
+ { fhalfright64, fhalfright64 }, // ADST_FLIPADST
+ { fhalfright64, fhalfright64 }, // FLIPADST_ADST
+ { fidtx64, fidtx64 }, // IDTX
+ { fdct64_col, fidtx64 }, // V_DCT
+ { fidtx64, fdct64_row }, // H_DCT
+ { fhalfright64, fidtx64 }, // V_ADST
+ { fidtx64, fhalfright64 }, // H_ADST
+ { fhalfright64, fidtx64 }, // V_FLIPADST
+ { fidtx64, fhalfright64 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[4096];
+ int i, j;
+ tran_low_t temp_in[64], temp_out[64];
+#if CONFIG_EXT_TX
+ int16_t flipped_input[64 * 64];
+ maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type);
+#endif
+ // Columns
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 64; ++j)
+ out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 64; ++j)
+ output[j + i * 64] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_EXT_TX
+// Forward identity transform.
+void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
+ int bs, int tx_type) {
+ int r, c;
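+ // The shift decreases with block size so that the identity path's overall
+ // gain stays in line with the other transform types at each size.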
+ const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
+ if (tx_type == IDTX) {
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * (1 << shift);
+ src_diff += stride;
+ coeff += bs;
+ }
+ }
+}
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht32x32_c(input, output, stride, tx_type);
+}
+
+#if CONFIG_TX64X64
+void av1_highbd_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht64x64_c(input, output, stride, tx_type);
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_HIGHBITDEPTH
+#endif // !AV1_DCT_GTEST
diff --git a/third_party/aom/av1/encoder/encint.h b/third_party/aom/av1/encoder/encint.h
new file mode 100644
index 000000000..30ea8521f
--- /dev/null
+++ b/third_party/aom/av1/encoder/encint.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+/* clang-format off */
+
+#if !defined(_encint_H)
+# define _encint_H (1)
+
+typedef struct daala_enc_ctx od_enc_ctx;
+typedef struct od_params_ctx od_params_ctx;
+typedef struct od_rollback_buffer od_rollback_buffer;
+
+# include "aom_dsp/entenc.h"
+# include "av1/common/odintrin.h"
+# include "av1/common/pvq_state.h"
+
+struct daala_enc_ctx{
+ /* Stores context-adaptive CDFs for PVQ. */
+ od_state state;
+ /* AOM entropy encoder. */
+ aom_writer w;
+ int use_activity_masking;
+ /* Mode of quantization matrix: FLAT (0) or HVS (1). */
+ int qm;
+ /*Normalized PVQ lambda for use where we've already performed
+ quantization.*/
+ double pvq_norm_lambda;
+ double pvq_norm_lambda_dc;
+};
+
+// from daalaenc.h
+/**The encoder context.*/
+typedef struct daala_enc_ctx daala_enc_ctx;
+
+/** Holds important encoder information so we can roll back decisions */
+struct od_rollback_buffer {
+ od_ec_enc ec;
+ od_adapt_ctx adapt;
+};
+
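+/* A typical pattern: take a checkpoint with od_encode_checkpoint() before a
+ trial encode, then restore it with od_encode_rollback() if the trial is
+ rejected. */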
+void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf);
+void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf);
+
+#endif
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
new file mode 100644
index 000000000..d254157e7
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -0,0 +1,7160 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#if CONFIG_SUPERTX
+#include "av1/encoder/cost.h"
+#endif
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#include "av1/common/warped_motion.h"
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#if CONFIG_GLOBAL_MOTION
+#include "av1/encoder/global_motion.h"
+#endif // CONFIG_GLOBAL_MOTION
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#if CONFIG_LV_MAP
+#include "av1/encoder/encodetxb.h"
+#endif
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#if CONFIG_PVQ
+#include "av1/common/pvq.h"
+#include "av1/encoder/pvq_encoder.h"
+#endif
+#if CONFIG_HIGHBITDEPTH
+#define IF_HBD(...) __VA_ARGS__
+#else
+#define IF_HBD(...)
+#endif // CONFIG_HIGHBITDEPTH
+
+static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int *rate);
+
+#if CONFIG_SUPERTX
+static int check_intra_b(PICK_MODE_CONTEXT *ctx);
+
+static int check_intra_sb(const AV1_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree);
+static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row_pred, int mi_col_pred,
+ BLOCK_SIZE bsize_pred, int b_sub8x8, int block);
+static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
+ PC_TREE *pc_tree);
+static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, int mi_row_ori, int mi_col_ori,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
+ int dst_stride[3], PC_TREE *pc_tree);
+static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run, PC_TREE *pc_tree);
+static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist,
+ TX_TYPE *best_tx, PC_TREE *pc_tree);
+#endif // CONFIG_SUPERTX
+
+// This is used as a reference when computing the source variance for the
+// purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+// which will be faster.
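+// Since the reference is a flat 128, calling a variance function against it
+// effectively returns the variance of the source block itself.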
+static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+#if CONFIG_EXT_PARTITION
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+#endif // CONFIG_EXT_PARTITION
+};
+
+#if CONFIG_HIGHBITDEPTH
+static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+#if CONFIG_EXT_PARTITION
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+#endif // CONFIG_EXT_PARTITION
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+#if CONFIG_EXT_PARTITION
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4
+#endif // CONFIG_EXT_PARTITION
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16,
+#if CONFIG_EXT_PARTITION
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16
+#endif // CONFIG_EXT_PARTITION
+};
+#endif // CONFIG_HIGHBITDEPTH
+
+unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs) {
+ unsigned int sse;
+ const unsigned int var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse);
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+
+#if CONFIG_HIGHBITDEPTH
+unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd) {
+ unsigned int var, sse;
+ switch (bd) {
+ case 10:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10), 0, &sse);
+ break;
+ case 12:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12), 0, &sse);
+ break;
+ case 8:
+ default:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8), 0, &sse);
+ break;
+ }
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi,
+ const struct buf_2d *ref,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bs) {
+ unsigned int sse, var;
+ uint8_t *last_y;
+ const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
+
+ assert(last != NULL);
+ last_y =
+ &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
+ var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+
+static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col) {
+ unsigned int var = get_sby_perpixel_diff_variance(
+ cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64);
+ if (var < 8)
+ return BLOCK_64X64;
+ else if (var < 128)
+ return BLOCK_32X32;
+ else if (var < 2048)
+ return BLOCK_16X16;
+ else
+ return BLOCK_8X8;
+}
+
+// Lighter version of set_offsets that only sets the mode info
+// pointers.
+static void set_mode_info_offsets(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, MACROBLOCKD *const xd,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int idx_str = xd->mi_stride * mi_row + mi_col;
+ xd->mi = cm->mi_grid_visible + idx_str;
+ xd->mi[0] = cm->mi + idx_str;
+ x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+}
+
+static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ set_skip_context(xd, mi_row, mi_col);
+
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ xd->max_tx_size = max_txsize_lookup[bsize];
+#endif
+
+ // Set up destination pointers.
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+
+ // Set up limit values for MV components.
+ // MVs beyond this range cannot produce a new/different prediction block.
+ x->mv_limits.row_min =
+ -(((mi_row + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.col_min = -(((mi_col + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND;
+ x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND;
+
+ set_plane_n4(xd, mi_width, mi_height);
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+ // Set up source buffers.
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col);
+
+ // R/D setup.
+ x->rddiv = cpi->rd.RDDIV;
+ x->rdmult = cpi->rd.RDMULT;
+
+ // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
+ xd->tile = *tile;
+}
+
+static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+ const struct segmentation *const seg = &cm->seg;
+
+ set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+
+ mbmi = &xd->mi[0]->mbmi;
+
+ // Set up the segment ID.
+ if (seg->enabled) {
+ if (!cpi->vaq_refresh) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ }
+ av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
+ } else {
+ mbmi->segment_id = 0;
+ }
+
+#if CONFIG_SUPERTX
+ mbmi->segment_id_supertx = MAX_SEGMENTS;
+#endif // CONFIG_SUPERTX
+}
+
+#if CONFIG_SUPERTX
+static void set_offsets_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ MACROBLOCK *const x = &td->mb;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+#if CONFIG_DEPENDENT_HORZTILES
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col, cm->dependent_horz_tiles);
+#else
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+#endif
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+}
+
+static void set_offsets_extend(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row_pred,
+ int mi_col_pred, int mi_row_ori, int mi_col_ori,
+ BLOCK_SIZE bsize_pred) {
+ // Used in supertx
+ // (mi_row_ori, mi_col_ori, bsize_ori): region for mv
+ // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+ MACROBLOCK *const x = &td->mb;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[bsize_pred];
+ const int mi_height = mi_size_high[bsize_pred];
+
+#if CONFIG_DEPENDENT_HORZTILES
+ set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori,
+ cm->dependent_horz_tiles);
+#else
+ set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori);
+#endif
+
+ // Set up limit values for MV components.
+ // MVs beyond this range cannot produce a new/different prediction block.
+ x->mv_limits.row_min =
+ -(((mi_row_pred + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.col_min =
+ -(((mi_col_pred + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.row_max =
+ (cm->mi_rows - mi_row_pred) * MI_SIZE + AOM_INTERP_EXTEND;
+ x->mv_limits.col_max =
+ (cm->mi_cols - mi_col_pred) * MI_SIZE + AOM_INTERP_EXTEND;
+
+// Set up distance of MB to edge of frame in 1/8th pel units.
+#if !CONFIG_CB4X4
+ assert(!(mi_col_pred & (mi_width - mi_size_wide[BLOCK_8X8])) &&
+ !(mi_row_pred & (mi_height - mi_size_high[BLOCK_8X8])));
+#endif
+ set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+ xd->up_available = (mi_row_ori > tile->mi_row_start);
+ xd->left_available = (mi_col_ori > tile->mi_col_start);
+
+ // R/D setup.
+ x->rddiv = cpi->rd.RDDIV;
+ x->rdmult = cpi->rd.RDMULT;
+}
+
+static void set_segment_id_supertx(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, const int mi_row,
+ const int mi_col, const BLOCK_SIZE bsize) {
+ const AV1_COMMON *cm = &cpi->common;
+ const struct segmentation *seg = &cm->seg;
+ const int miw = AOMMIN(mi_size_wide[bsize], cm->mi_cols - mi_col);
+ const int mih = AOMMIN(mi_size_high[bsize], cm->mi_rows - mi_row);
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
+ MODE_INFO **const mip = cm->mi_grid_visible + mi_offset;
+ int r, c;
+ int seg_id_supertx = MAX_SEGMENTS;
+
+ if (!seg->enabled) {
+ seg_id_supertx = 0;
+ } else {
+ // Find the minimum segment_id
+ for (r = 0; r < mih; r++)
+ for (c = 0; c < miw; c++)
+ seg_id_supertx =
+ AOMMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id, seg_id_supertx);
+ assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS);
+
+ // Initialize the plane quantizers.
+ av1_init_plane_quantizers(cpi, x, seg_id_supertx);
+ }
+
+ // Assign the segment_id back to segment_id_supertx
+ for (r = 0; r < mih; r++)
+ for (c = 0; c < miw; c++)
+ mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx;
+}
+#endif // CONFIG_SUPERTX
+
+static void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
+ const int mi_width = AOMMAX(mi_size_wide[bsize], mi_size_wide[BLOCK_8X8]);
+ const int mi_height = AOMMAX(mi_size_high[bsize], mi_size_high[BLOCK_8X8]);
+ for (int r = 0; r < mi_height; ++r) {
+ for (int c = 0; c < mi_width; ++c) {
+ set_mode_info_offsets(cpi, x, xd, mi_row + r, mi_col + c);
+ xd->mi[0]->mbmi.sb_type = bsize;
+ }
+ }
+ }
+}
+
+static void set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, VAR_TREE *vt, int mi_row,
+ int mi_col, const int64_t *const threshold,
+ const BLOCK_SIZE *const bsize_min) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int hbw = mi_size_wide[vt->bsize] / 2;
+ const int hbh = mi_size_high[vt->bsize] / 2;
+ const int has_cols = mi_col + hbw < cm->mi_cols;
+ const int has_rows = mi_row + hbh < cm->mi_rows;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ assert(vt->bsize >= BLOCK_8X8);
+
+ assert(hbh == hbw);
+
+ if (vt->bsize == BLOCK_8X8 && cm->frame_type != KEY_FRAME) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_8X8);
+ return;
+ }
+
+ if (vt->force_split || (!has_cols && !has_rows)) goto split;
+
+ // For bsize == bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select this
+ // size if the variance is below the threshold; otherwise split is selected.
+ // There is no check for a vert/horiz split, as there are too few samples
+ // for a reliable variance estimate.
+ if (vt->bsize == bsize_min[0]) {
+ if (has_cols && has_rows && vt->variances.none.variance < threshold[0]) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+ return;
+ } else {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_SPLIT);
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ if (vt->bsize > BLOCK_8X8) {
+ set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col + hbw, subsize);
+ }
+ return;
+ }
+ } else if (vt->bsize > bsize_min[0]) {
+ // For key frames: take the split for bsize above 32X32 or for very high
+ // variance.
+ if (cm->frame_type == KEY_FRAME &&
+ (vt->bsize > BLOCK_32X32 ||
+ vt->variances.none.variance > (threshold[0] << 4))) {
+ goto split;
+ }
+ // If variance is low, take the bsize (no split).
+ if (has_cols && has_rows && vt->variances.none.variance < threshold[0]) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+ return;
+ }
+
+ // Check vertical split.
+ if (has_rows) {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_VERT);
+ if (vt->variances.vert[0].variance < threshold[0] &&
+ vt->variances.vert[1].variance < threshold[0] &&
+ get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+ return;
+ }
+ }
+ // Check horizontal split.
+ if (has_cols) {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_HORZ);
+ if (vt->variances.horz[0].variance < threshold[0] &&
+ vt->variances.horz[1].variance < threshold[0] &&
+ get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+ return;
+ }
+ }
+ }
+
+split : {
+ set_vt_partitioning(cpi, x, xd, vt->split[0], mi_row, mi_col, threshold + 1,
+ bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[1], mi_row, mi_col + hbw,
+ threshold + 1, bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[2], mi_row + hbh, mi_col,
+ threshold + 1, bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[3], mi_row + hbh, mi_col + hbw,
+ threshold + 1, bsize_min + 1);
+ return;
+}
+}
+
+// Set the variance split thresholds for the following block sizes:
+// 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16,
+// 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to a 4x4 partition) is
+// currently only used on key frames.
+static void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], int q) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int is_key_frame = (cm->frame_type == KEY_FRAME);
+ const int threshold_multiplier = is_key_frame ? 20 : 1;
+ const int64_t threshold_base =
+ (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]);
+ if (is_key_frame) {
+ thresholds[1] = threshold_base;
+ thresholds[2] = threshold_base >> 2;
+ thresholds[3] = threshold_base >> 2;
+ thresholds[4] = threshold_base << 2;
+ } else {
+ thresholds[2] = threshold_base;
+ if (cm->width <= 352 && cm->height <= 288) {
+ thresholds[1] = threshold_base >> 2;
+ thresholds[3] = threshold_base << 3;
+ } else {
+ thresholds[1] = threshold_base;
+ thresholds[2] = (5 * threshold_base) >> 2;
+ if (cm->width >= 1920 && cm->height >= 1080)
+ thresholds[2] = (7 * threshold_base) >> 2;
+ thresholds[3] = threshold_base << cpi->oxcf.speed;
+ }
+ }
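+ // INT64_MIN at index 0 means the below-threshold test can never pass at
+ // the top level, so the largest block size is never selected outright.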
+ thresholds[0] = INT64_MIN;
+}
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q) {
+ AV1_COMMON *const cm = &cpi->common;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const int is_key_frame = (cm->frame_type == KEY_FRAME);
+ if (sf->partition_search_type != VAR_BASED_PARTITION &&
+ sf->partition_search_type != REFERENCE_PARTITION) {
+ return;
+ } else {
+ set_vbp_thresholds(cpi, cpi->vbp_thresholds, q);
+ // The thresholds below are not changed locally.
+ if (is_key_frame) {
+ cpi->vbp_threshold_sad = 0;
+ cpi->vbp_bsize_min = BLOCK_8X8;
+ } else {
+ if (cm->width <= 352 && cm->height <= 288)
+ cpi->vbp_threshold_sad = 100;
+ else
+ cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000
+ ? (cpi->y_dequant[q][1] << 1)
+ : 1000;
+ cpi->vbp_bsize_min = BLOCK_16X16;
+ }
+ cpi->vbp_threshold_minmax = 15 + (q >> 3);
+ }
+}
+
+// Compute the minmax over the 8x8 subblocks.
+static int compute_minmax_8x8(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+#if CONFIG_HIGHBITDEPTH
+ int highbd,
+#endif
+ int pixels_wide, int pixels_high) {
+ int k;
+ int minmax_max = 0;
+ int minmax_min = 255;
+ // Loop over the 4 8x8 subblocks.
+ for (k = 0; k < 4; k++) {
+ const int x8_idx = ((k & 1) << 3);
+ const int y8_idx = ((k >> 1) << 3);
+ int min = 0;
+ int max = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ const int src_offset = y8_idx * src_stride + x8_idx;
+ const int ref_offset = y8_idx * ref_stride + x8_idx;
+#if CONFIG_HIGHBITDEPTH
+ if (highbd) {
+ aom_highbd_minmax_8x8(src + src_offset, src_stride, ref + ref_offset,
+ ref_stride, &min, &max);
+ } else {
+ aom_minmax_8x8(src + src_offset, src_stride, ref + ref_offset,
+ ref_stride, &min, &max);
+ }
+#else
+ aom_minmax_8x8(src + src_offset, src_stride, ref + ref_offset, ref_stride,
+ &min, &max);
+#endif
+ if ((max - min) > minmax_max) minmax_max = (max - min);
+ if ((max - min) < minmax_min) minmax_min = (max - min);
+ }
+ }
+ return (minmax_max - minmax_min);
+}
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE int avg_4x4(const uint8_t *const src, const int stride,
+ const int highbd) {
+ if (highbd) {
+ return aom_highbd_avg_4x4(src, stride);
+ } else {
+ return aom_avg_4x4(src, stride);
+ }
+}
+#else
+static INLINE int avg_4x4(const uint8_t *const src, const int stride) {
+ return aom_avg_4x4(src, stride);
+}
+#endif
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE int avg_8x8(const uint8_t *const src, const int stride,
+ const int highbd) {
+ if (highbd) {
+ return aom_highbd_avg_8x8(src, stride);
+ } else {
+ return aom_avg_8x8(src, stride);
+ }
+}
+#else
+static INLINE int avg_8x8(const uint8_t *const src, const int stride) {
+ return aom_avg_8x8(src, stride);
+}
+#endif
+
+static void init_variance_tree(VAR_TREE *const vt,
+#if CONFIG_HIGHBITDEPTH
+ const int highbd,
+#endif
+ BLOCK_SIZE bsize, BLOCK_SIZE leaf_size,
+ const int width, const int height,
+ const uint8_t *const src, const int src_stride,
+ const uint8_t *const ref, const int ref_stride) {
+ assert(bsize >= leaf_size);
+
+ vt->bsize = bsize;
+
+ vt->force_split = 0;
+
+ vt->src = src;
+ vt->src_stride = src_stride;
+ vt->ref = ref;
+ vt->ref_stride = ref_stride;
+
+ vt->width = width;
+ vt->height = height;
+
+#if CONFIG_HIGHBITDEPTH
+ vt->highbd = highbd;
+#endif // CONFIG_HIGHBITDEPTH
+
+ if (bsize > leaf_size) {
+ const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+ const int px = block_size_wide[subsize];
+
+ init_variance_tree(vt->split[0],
+#if CONFIG_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_HIGHBITDEPTH
+ subsize, leaf_size, AOMMIN(px, width),
+ AOMMIN(px, height), src, src_stride, ref, ref_stride);
+ init_variance_tree(vt->split[1],
+#if CONFIG_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_HIGHBITDEPTH
+ subsize, leaf_size, width - px, AOMMIN(px, height),
+ src + px, src_stride, ref + px, ref_stride);
+ init_variance_tree(vt->split[2],
+#if CONFIG_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_HIGHBITDEPTH
+ subsize, leaf_size, AOMMIN(px, width), height - px,
+ src + px * src_stride, src_stride, ref + px * ref_stride,
+ ref_stride);
+ init_variance_tree(vt->split[3],
+#if CONFIG_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_HIGHBITDEPTH
+ subsize, leaf_size, width - px, height - px,
+ src + px * src_stride + px, src_stride,
+ ref + px * ref_stride + px, ref_stride);
+ }
+}
+
+// Fill the variance tree based on averaging pixel values (sub-sampling), at
+// the leaf node size.
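+// At a leaf, the difference between the averaged source and reference values
+// stands in for a single sample, so the resulting "variance" is a coarse
+// sub-sampled proxy rather than a true pixel variance.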
+static void fill_variance_tree(VAR_TREE *const vt, const BLOCK_SIZE leaf_size) {
+ if (vt->bsize > leaf_size) {
+ fill_variance_tree(vt->split[0], leaf_size);
+ fill_variance_tree(vt->split[1], leaf_size);
+ fill_variance_tree(vt->split[2], leaf_size);
+ fill_variance_tree(vt->split[3], leaf_size);
+ fill_variance_node(vt);
+ } else if (vt->width <= 0 || vt->height <= 0) {
+ fill_variance(0, 0, 0, &vt->variances.none);
+ } else {
+ unsigned int sse = 0;
+ int sum = 0;
+ int src_avg;
+ int ref_avg;
+ assert(leaf_size == BLOCK_4X4 || leaf_size == BLOCK_8X8);
+ if (leaf_size == BLOCK_4X4) {
+ src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+ ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+ } else {
+ src_avg = avg_8x8(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+ ref_avg = avg_8x8(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+ }
+ sum = src_avg - ref_avg;
+ sse = sum * sum;
+ fill_variance(sse, sum, 0, &vt->variances.none);
+ }
+}
+
+static void refine_variance_tree(VAR_TREE *const vt, const int64_t threshold) {
+ if (vt->bsize >= BLOCK_8X8) {
+ if (vt->bsize == BLOCK_16X16) {
+ if (vt->variances.none.variance <= threshold)
+ return;
+ else
+ vt->force_split = 0;
+ }
+
+ refine_variance_tree(vt->split[0], threshold);
+ refine_variance_tree(vt->split[1], threshold);
+ refine_variance_tree(vt->split[2], threshold);
+ refine_variance_tree(vt->split[3], threshold);
+
+ if (vt->bsize <= BLOCK_16X16) fill_variance_node(vt);
+ } else if (vt->width <= 0 || vt->height <= 0) {
+ fill_variance(0, 0, 0, &vt->variances.none);
+ } else {
+ const int src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+ const int ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+ const int sum = src_avg - ref_avg;
+ const unsigned int sse = sum * sum;
+ assert(vt->bsize == BLOCK_4X4);
+ fill_variance(sse, sum, 0, &vt->variances.none);
+ }
+}
+
+static int check_split_key_frame(VAR_TREE *const vt, const int64_t threshold) {
+ if (vt->bsize == BLOCK_32X32) {
+ vt->force_split = vt->variances.none.variance > threshold;
+ } else {
+ vt->force_split |= check_split_key_frame(vt->split[0], threshold);
+ vt->force_split |= check_split_key_frame(vt->split[1], threshold);
+ vt->force_split |= check_split_key_frame(vt->split[2], threshold);
+ vt->force_split |= check_split_key_frame(vt->split[3], threshold);
+ }
+ return vt->force_split;
+}
+
+static int check_split(AV1_COMP *const cpi, VAR_TREE *const vt,
+ const int segment_id, const int64_t *const thresholds) {
+ if (vt->bsize == BLOCK_16X16) {
+ vt->force_split = vt->variances.none.variance > thresholds[0];
+ if (!vt->force_split && vt->variances.none.variance > thresholds[-1] &&
+ !cyclic_refresh_segment_id_boosted(segment_id)) {
+ // There is a nominal amount of 16x16 variance (based on the average):
+ // compute the minmax over the 8x8 sub-blocks and, if it is above the
+ // threshold, force a split to 8x8 blocks for this 16x16 block.
+ int minmax =
+ compute_minmax_8x8(vt->src, vt->src_stride, vt->ref, vt->ref_stride,
+#if CONFIG_HIGHBITDEPTH
+ vt->highbd,
+#endif
+ vt->width, vt->height);
+ vt->force_split = minmax > cpi->vbp_threshold_minmax;
+ }
+ } else {
+ vt->force_split |=
+ check_split(cpi, vt->split[0], segment_id, thresholds + 1);
+ vt->force_split |=
+ check_split(cpi, vt->split[1], segment_id, thresholds + 1);
+ vt->force_split |=
+ check_split(cpi, vt->split[2], segment_id, thresholds + 1);
+ vt->force_split |=
+ check_split(cpi, vt->split[3], segment_id, thresholds + 1);
+
+ if (vt->bsize == BLOCK_32X32 && !vt->force_split) {
+ vt->force_split = vt->variances.none.variance > thresholds[0];
+ }
+ }
+
+ return vt->force_split;
+}
+
+// This function chooses the partitioning based on the variance between the
+// source and the reconstructed last (or golden) frame, where the variance is
+// computed for down-sampled inputs.
+static void choose_partitioning(AV1_COMP *const cpi, ThreadData *const td,
+ const TileInfo *const tile, MACROBLOCK *const x,
+ const int mi_row, const int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ VAR_TREE *const vt = td->var_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+#if CONFIG_DUAL_FILTER
+ int i;
+#endif
+ const uint8_t *src;
+ const uint8_t *ref;
+ int src_stride;
+ int ref_stride;
+ int pixels_wide = MI_SIZE * mi_size_wide[cm->sb_size];
+ int pixels_high = MI_SIZE * mi_size_high[cm->sb_size];
+ int64_t thresholds[5] = {
+ cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2],
+ cpi->vbp_thresholds[3], cpi->vbp_thresholds[4],
+ };
+ BLOCK_SIZE bsize_min[5] = { BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+ cpi->vbp_bsize_min, BLOCK_8X8 };
+ const int start_level = cm->sb_size == BLOCK_64X64 ? 1 : 0;
+ const int64_t *const thre = thresholds + start_level;
+ const BLOCK_SIZE *const bmin = bsize_min + start_level;
+
+ const int is_key_frame = (cm->frame_type == KEY_FRAME);
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+
+ int segment_id = CR_SEGMENT_ID_BASE;
+
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ const uint8_t *const map =
+ cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
+
+ if (cyclic_refresh_segment_id_boosted(segment_id)) {
+ int q = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ set_vbp_thresholds(cpi, thresholds, q);
+ }
+ }
+
+ set_offsets(cpi, tile, x, mi_row, mi_col, cm->sb_size);
+
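+  // Clamp the sampled region to the visible frame area; the mb_to_*_edge
+  // values are negative past the frame boundary and are in 1/8-pel units.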
+ if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
+ if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+
+ src = x->plane[0].src.buf;
+ src_stride = x->plane[0].src.stride;
+
+ if (!is_key_frame) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ unsigned int y_sad, y_sad_g;
+
+ const int hbs = cm->mib_size / 2;
+ const int split_vert = mi_col + hbs >= cm->mi_cols;
+ const int split_horz = mi_row + hbs >= cm->mi_rows;
+ BLOCK_SIZE bsize;
+
+ if (split_vert && split_horz)
+ bsize = get_subsize(cm->sb_size, PARTITION_SPLIT);
+ else if (split_vert)
+ bsize = get_subsize(cm->sb_size, PARTITION_VERT);
+ else if (split_horz)
+ bsize = get_subsize(cm->sb_size, PARTITION_HORZ);
+ else
+ bsize = cm->sb_size;
+
+ assert(yv12 != NULL);
+
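+    // Measure the zero-mv SAD against the golden frame (when distinct from
+    // last); it is compared below against the motion-estimated SAD for the
+    // last frame to pick the reference used for the variance computation.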
+ if (yv12_g && yv12_g != yv12) {
+ av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ y_sad_g = cpi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
+ } else {
+ y_sad_g = UINT_MAX;
+ }
+
+ av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[LAST_FRAME - 1].sf);
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->sb_type = cm->sb_size;
+ mbmi->mv[0].as_int = 0;
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = BILINEAR;
+#else
+ mbmi->interp_filter = BILINEAR;
+#endif
+
+ y_sad = av1_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+
+ if (y_sad_g < y_sad) {
+ av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ mbmi->ref_frame[0] = GOLDEN_FRAME;
+ mbmi->mv[0].as_int = 0;
+ y_sad = y_sad_g;
+ } else {
+ x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
+ }
+
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, cm->sb_size);
+
+ ref = xd->plane[0].dst.buf;
+ ref_stride = xd->plane[0].dst.stride;
+
+    // If the y_sad is very small, take the largest partition and exit.
+    // Don't check boosted segments for now, as the largest partition is
+    // suppressed there.
+ if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) {
+ if (!split_vert && !split_horz) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, cm->sb_size);
+ return;
+ }
+ }
+ } else {
+ ref = AV1_VAR_OFFS;
+ ref_stride = 0;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (xd->bd) {
+ case 10: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10); break;
+ case 12: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12); break;
+ case 8:
+ default: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8); break;
+ }
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+
+ init_variance_tree(
+ vt,
+#if CONFIG_HIGHBITDEPTH
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH,
+#endif // CONFIG_HIGHBITDEPTH
+ cm->sb_size, (is_key_frame || low_res) ? BLOCK_4X4 : BLOCK_8X8,
+ pixels_wide, pixels_high, src, src_stride, ref, ref_stride);
+
+ // Fill in the entire tree of variances and compute splits.
+ if (is_key_frame) {
+ fill_variance_tree(vt, BLOCK_4X4);
+ check_split_key_frame(vt, thre[1]);
+ } else {
+ fill_variance_tree(vt, BLOCK_8X8);
+ check_split(cpi, vt, segment_id, thre);
+ if (low_res) {
+ refine_variance_tree(vt, thre[1] << 1);
+ }
+ }
+
+ vt->force_split |= mi_col + cm->mib_size > cm->mi_cols ||
+ mi_row + cm->mib_size > cm->mi_rows;
+
+  // Now go through the entire structure, splitting every block size until we
+  // reach one whose variance is lower than our threshold.
+ set_vt_partitioning(cpi, x, xd, vt, mi_row, mi_col, thre, bmin);
+}
+
+#if CONFIG_DUAL_FILTER
+static void reset_intmv_filter_type(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi) {
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ if (!has_subpel_mv_component(xd->mi[0], xd, dir) &&
+ (mbmi->ref_frame[1] == NONE_FRAME ||
+ !has_subpel_mv_component(xd->mi[0], xd, dir + 2)))
+ mbmi->interp_filter[dir] = (cm->interp_filter == SWITCHABLE)
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+ mbmi->interp_filter[dir + 2] = mbmi->interp_filter[dir];
+ }
+}
+
+static void update_filter_type_count(FRAME_COUNTS *counts,
+ const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+ (mbmi->ref_frame[1] > INTRA_FRAME &&
+ has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ ++counts->switchable_interp[ctx][mbmi->interp_filter[dir]];
+ }
+ }
+}
+#endif
+#if CONFIG_GLOBAL_MOTION
+static void update_global_motion_used(PREDICTION_MODE mode, BLOCK_SIZE bsize,
+ const MB_MODE_INFO *mbmi,
+ RD_COUNTS *rdc) {
+ if (mode == ZEROMV
+#if CONFIG_EXT_INTER
+ || mode == ZERO_ZEROMV
+#endif
+ ) {
+ const int num_4x4s =
+ num_4x4_blocks_wide_lookup[bsize] * num_4x4_blocks_high_lookup[bsize];
+ int ref;
+ for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ rdc->global_motion_used[mbmi->ref_frame[ref]] += num_4x4s;
+ }
+ }
+}
+#endif // CONFIG_GLOBAL_MOTION
+
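+// Reset the block's transform size where the coding constraints dictate it:
+// lossless segments must use TX_4X4, and a fixed frame tx_mode forces the
+// size derived from the block size.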
+static void reset_tx_size(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+ const TX_MODE tx_mode) {
+ if (xd->lossless[mbmi->segment_id]) {
+ mbmi->tx_size = TX_4X4;
+ } else if (tx_mode != TX_MODE_SELECT) {
+ mbmi->tx_size =
+ tx_size_from_tx_mode(mbmi->sb_type, tx_mode, is_inter_block(mbmi));
+ }
+}
+
+#if CONFIG_REF_MV
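+// Write the chosen NEWMV motion vectors from the ref-MV stack back into the
+// mbmi_ext reference MV list and the block's predicted-MV fields, clamping
+// them to the allowed range first.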
+static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv,
+ int8_t rf_type) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+ const int bw = xd->n8_w << MI_SIZE_LOG2;
+ const int bh = xd->n8_h << MI_SIZE_LOG2;
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ CANDIDATE_MV *const curr_ref_mv_stack = mbmi_ext->ref_mv_stack[rf_type];
+
+#if CONFIG_EXT_INTER
+ if (has_second_ref(mbmi)) {
+    // Special case: NEAR_NEWMV and NEW_NEARMV modes use 1 + mbmi->ref_mv_idx
+    // (as NEARMV does) instead of mbmi->ref_mv_idx.
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) ref_mv_idx += 1;
+
+ if (compound_ref0_mode(mbmi->mode) == NEWMV) {
+ int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].this_mv;
+ clamp_mv_ref(&this_mv.as_mv, bw, bh, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
+ mbmi->pred_mv[0] = this_mv;
+ mi_pred_mv[0] = this_mv;
+ }
+ if (compound_ref1_mode(mbmi->mode) == NEWMV) {
+ int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, bw, bh, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
+ mbmi->pred_mv[1] = this_mv;
+ mi_pred_mv[1] = this_mv;
+ }
+ } else {
+#endif // CONFIG_EXT_INTER
+ if (mbmi->mode == NEWMV) {
+ int i;
+ for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+ int_mv this_mv = (i == 0) ? curr_ref_mv_stack[ref_mv_idx].this_mv
+ : curr_ref_mv_stack[ref_mv_idx].comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, bw, bh, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0] = this_mv;
+ mbmi->pred_mv[i] = this_mv;
+ mi_pred_mv[i] = this_mv;
+ }
+ }
+#if CONFIG_EXT_INTER
+ }
+#endif // CONFIG_EXT_INTER
+}
+#endif // CONFIG_REF_MV
+
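+// Commit the mode decision held in ctx to the frame-level mi grid, restore
+// the coding buffers used during RDO, and (unless this is a dry run)
+// accumulate the counts and motion-vector state used by later stages.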
+static void update_state(const AV1_COMP *const cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+ int i, x_idx, y;
+ const AV1_COMMON *const cm = &cpi->common;
+ RD_COUNTS *const rdc = &td->rd_counts;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ MODE_INFO *mi = &ctx->mic;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ MODE_INFO *mi_addr = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ const int bw = mi_size_wide[mi->mbmi.sb_type];
+ const int bh = mi_size_high[mi->mbmi.sb_type];
+ const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+ MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
+ const int mis = cm->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int unify_bsize = CONFIG_CB4X4;
+
+#if CONFIG_REF_MV
+ int8_t rf_type;
+#endif
+
+#if !CONFIG_SUPERTX
+ assert(mi->mbmi.sb_type == bsize);
+#endif
+
+ *mi_addr = *mi;
+ *x->mbmi_ext = ctx->mbmi_ext;
+
+#if CONFIG_DUAL_FILTER
+ reset_intmv_filter_type(cm, xd, mbmi);
+#endif
+
+#if CONFIG_REF_MV
+ rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (x->mbmi_ext->ref_mv_count[rf_type] > 1 &&
+ (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) {
+ set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type);
+ }
+#endif // CONFIG_REF_MV
+
+  // If segmentation is in use.
+ if (seg->enabled) {
+    // For in-frame complexity AQ, copy the segment id from the segment map.
+ if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode);
+ }
+    // Else, for cyclic refresh mode, update the segment map, set the segment
+    // id, and then update the quantizer.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col,
+ bsize, ctx->rate, ctx->dist, x->skip);
+ reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode);
+ }
+ }
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ pd[i].dqcoeff = ctx->dqcoeff[i];
+#if CONFIG_PVQ
+ pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
+#endif
+ p[i].eobs = ctx->eobs[i];
+#if CONFIG_LV_MAP
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+#endif // CONFIG_LV_MAP
+ }
+#if CONFIG_PALETTE
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+#endif // CONFIG_PALETTE
+
+  // Restore the coding context of the MB to what was in place when the mode
+  // was picked for it.
+ for (y = 0; y < mi_height; y++)
+ for (x_idx = 0; x_idx < mi_width; x_idx++)
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+ xd->mi[x_idx + y * mis] = mi_addr;
+ }
+
+#if CONFIG_DELTA_Q && !CONFIG_EXT_DELTA_Q
+ if (cpi->oxcf.aq_mode > NO_AQ && cpi->oxcf.aq_mode < DELTA_AQ)
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
+#else
+ if (cpi->oxcf.aq_mode)
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
+#endif
+
+ if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8 && !unify_bsize) {
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ }
+
+ x->skip = ctx->skip;
+
+#if CONFIG_VAR_TX
+ for (i = 0; i < 1; ++i)
+ memcpy(x->blk_skip[i], ctx->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
+
+ if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+ {
+ unsigned int *const mode_chosen_counts =
+ (unsigned int *)cpi->mode_chosen_counts; // Cast const away.
+ if (frame_is_intra_only(cm)) {
+ static const int kf_mode_index[] = {
+ THR_DC /*DC_PRED*/,
+ THR_V_PRED /*V_PRED*/,
+ THR_H_PRED /*H_PRED*/,
+ THR_D45_PRED /*D45_PRED*/,
+ THR_D135_PRED /*D135_PRED*/,
+ THR_D117_PRED /*D117_PRED*/,
+ THR_D153_PRED /*D153_PRED*/,
+ THR_D207_PRED /*D207_PRED*/,
+ THR_D63_PRED /*D63_PRED*/,
+#if CONFIG_ALT_INTRA
+ THR_SMOOTH, /*SMOOTH_PRED*/
+#endif // CONFIG_ALT_INTRA
+ THR_TM /*TM_PRED*/,
+ };
+ ++mode_chosen_counts[kf_mode_index[mbmi->mode]];
+ } else {
+      // Note how often each mode is chosen as best.
+ ++mode_chosen_counts[ctx->best_mode_index];
+ }
+ }
+#endif
+ if (!frame_is_intra_only(cm)) {
+ if (is_inter_block(mbmi)) {
+ av1_update_mv_count(td);
+#if CONFIG_GLOBAL_MOTION
+ if (bsize >= BLOCK_8X8) {
+ // TODO(sarahparker): global motion stats need to be handled per-tile
+ // to be compatible with tile-based threading.
+ update_global_motion_used(mbmi->mode, bsize, mbmi, rdc);
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc);
+ }
+ }
+ }
+#endif // CONFIG_GLOBAL_MOTION
+ if (cm->interp_filter == SWITCHABLE
+#if CONFIG_WARPED_MOTION
+ && mbmi->motion_mode != WARPED_CAUSAL
+#endif // CONFIG_WARPED_MOTION
+#if CONFIG_GLOBAL_MOTION
+ && !is_nontrans_global_motion(xd)
+#endif // CONFIG_GLOBAL_MOTION
+ ) {
+#if CONFIG_DUAL_FILTER
+ update_filter_type_count(td->counts, xd, mbmi);
+#else
+ const int switchable_ctx = av1_get_pred_context_switchable_interp(xd);
+ ++td->counts->switchable_interp[switchable_ctx][mbmi->interp_filter];
+#endif
+ }
+ }
+
+ rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+ }
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
+ }
+}
+
+#if CONFIG_SUPERTX
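+// SuperTX variant of update_state(): commits the chosen mode info while
+// forcing a single transform (and simple translation) over the whole
+// prediction block.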
+static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+ int y, x_idx;
+#if CONFIG_VAR_TX
+ int i;
+#endif
+ const AV1_COMMON *const cm = &cpi->common;
+ RD_COUNTS *const rdc = &td->rd_counts;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *mi = &ctx->mic;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ MODE_INFO *mi_addr = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ const int mis = cm->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int x_mis = AOMMIN(mi_width, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(mi_height, cm->mi_rows - mi_row);
+ const int unify_bsize = CONFIG_CB4X4;
+ MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
+#if CONFIG_REF_MV
+ int8_t rf_type;
+#endif
+
+ *mi_addr = *mi;
+ *x->mbmi_ext = ctx->mbmi_ext;
+ assert(is_inter_block(mbmi));
+ assert(mbmi->tx_size == ctx->mic.mbmi.tx_size);
+
+#if CONFIG_DUAL_FILTER
+ reset_intmv_filter_type(cm, xd, mbmi);
+#endif
+
+#if CONFIG_REF_MV
+ rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (x->mbmi_ext->ref_mv_count[rf_type] > 1 &&
+ (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) {
+ set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type);
+ }
+#endif // CONFIG_REF_MV
+
+  // If segmentation is in use.
+ if (seg->enabled) {
+ if (cpi->vaq_refresh) {
+ const int energy =
+ bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
+ mi_addr->mbmi.segment_id = av1_vaq_segment_id(energy);
+ } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ // For cyclic refresh mode, now update the segment map
+ // and set the segment id.
+ av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col,
+ bsize, ctx->rate, ctx->dist, 1);
+ } else {
+ // Otherwise just set the segment id based on the current segment map
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ }
+ mi_addr->mbmi.segment_id_supertx = MAX_SEGMENTS;
+ }
+
+  // Restore the coding context of the MB to what was in place when the mode
+  // was picked for it.
+ for (y = 0; y < mi_height; y++)
+ for (x_idx = 0; x_idx < mi_width; x_idx++)
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+ xd->mi[x_idx + y * mis] = mi_addr;
+ }
+
+#if !CONFIG_CB4X4
+ if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ }
+#endif
+
+ x->skip = ctx->skip;
+
+#if CONFIG_VAR_TX
+ for (i = 0; i < 1; ++i)
+ memcpy(x->blk_skip[i], ctx->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+
+ if (!is_inter_block(mbmi) || mbmi->skip)
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_VAR_TX
+ {
+ const TX_SIZE mtx = mbmi->tx_size;
+ const int num_4x4_blocks_wide = tx_size_wide_unit[mtx] >> 1;
+ const int num_4x4_blocks_high = tx_size_high_unit[mtx] >> 1;
+ int idy, idx;
+ mbmi->inter_tx_size[0][0] = mtx;
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy)
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
+ mbmi->inter_tx_size[idy][idx] = mtx;
+ }
+#endif // CONFIG_VAR_TX
+ // Turn motion variation off for supertx
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+ if (dry_run) return;
+
+ if (!frame_is_intra_only(cm)) {
+ av1_update_mv_count(td);
+
+#if CONFIG_GLOBAL_MOTION
+ if (is_inter_block(mbmi)) {
+ if (bsize >= BLOCK_8X8) {
+ // TODO(sarahparker): global motion stats need to be handled per-tile
+ // to be compatible with tile-based threading.
+ update_global_motion_used(mbmi->mode, bsize, mbmi, rdc);
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc);
+ }
+ }
+ }
+ }
+#endif // CONFIG_GLOBAL_MOTION
+
+ if (cm->interp_filter == SWITCHABLE
+#if CONFIG_GLOBAL_MOTION
+ && !is_nontrans_global_motion(xd)
+#endif // CONFIG_GLOBAL_MOTION
+ ) {
+#if CONFIG_DUAL_FILTER
+ update_filter_type_count(td->counts, xd, mbmi);
+#else
+ const int pred_ctx = av1_get_pred_context_switchable_interp(xd);
+ ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter];
+#endif
+ }
+
+ rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+ }
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
+ }
+}
+
+static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run, PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ int hbs = mi_size_wide[bsize] / 2;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+ PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_subsize(bsize, partition);
+ int i;
+#if CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+ PICK_MODE_CONTEXT *pmc = NULL;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_block_energy(cpi, x, bsize);
+
+ switch (partition) {
+ case PARTITION_NONE:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col, subsize,
+ dry_run);
+ break;
+ case PARTITION_VERT:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col,
+ subsize, dry_run);
+ if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) {
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+ update_state_supertx(cpi, td, &pc_tree->vertical[1], mi_row,
+ mi_col + hbs, subsize, dry_run);
+ }
+ pmc = &pc_tree->vertical_supertx;
+ break;
+ case PARTITION_HORZ:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col,
+ subsize, dry_run);
+ if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) {
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs,
+ mi_col, subsize, dry_run);
+ }
+ pmc = &pc_tree->horizontal_supertx;
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col,
+ subsize, dry_run);
+ } else {
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, dry_run,
+ pc_tree->split[0]);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize,
+ dry_run, pc_tree->split[1]);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize,
+ dry_run, pc_tree->split[2]);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs,
+ subsize, dry_run, pc_tree->split[3]);
+ }
+ pmc = &pc_tree->split_supertx;
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col,
+ bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row,
+ mi_col + hbs, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs,
+ mi_col, subsize, dry_run);
+ pmc = &pc_tree->horizontala_supertx;
+ break;
+ case PARTITION_HORZ_B:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col,
+ subsize, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs,
+ mi_col, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs,
+ mi_col + hbs, bsize2, dry_run);
+ pmc = &pc_tree->horizontalb_supertx;
+ break;
+ case PARTITION_VERT_A:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col,
+ bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs,
+ mi_col, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+ update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row,
+ mi_col + hbs, subsize, dry_run);
+ pmc = &pc_tree->verticala_supertx;
+ break;
+ case PARTITION_VERT_B:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col,
+ subsize, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row,
+ mi_col + hbs, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs,
+ mi_col + hbs, bsize2, dry_run);
+ pmc = &pc_tree->verticalb_supertx;
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ if (pmc != NULL) {
+ p[i].coeff = pmc->coeff[i];
+ p[i].qcoeff = pmc->qcoeff[i];
+ pd[i].dqcoeff = pmc->dqcoeff[i];
+ p[i].eobs = pmc->eobs[i];
+ } else {
+ // These should never be used
+ p[i].coeff = NULL;
+ p[i].qcoeff = NULL;
+ pd[i].dqcoeff = NULL;
+ p[i].eobs = NULL;
+ }
+ }
+}
+
+static void update_supertx_param(ThreadData *td, PICK_MODE_CONTEXT *ctx,
+ int best_tx, TX_SIZE supertx_size) {
+ MACROBLOCK *const x = &td->mb;
+#if CONFIG_VAR_TX
+ int i;
+
+ for (i = 0; i < 1; ++i)
+ memcpy(ctx->blk_skip[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+ ctx->mic.mbmi.min_tx_size = get_min_tx_size(supertx_size);
+#endif // CONFIG_VAR_TX
+ ctx->mic.mbmi.tx_size = supertx_size;
+ ctx->skip = x->skip;
+ ctx->mic.mbmi.tx_type = best_tx;
+}
+
+static void update_supertx_param_sb(const AV1_COMP *const cpi, ThreadData *td,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int best_tx, TX_SIZE supertx_size,
+ PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int hbs = mi_size_wide[bsize] / 2;
+ PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ int i;
+#endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ update_supertx_param(td, &pc_tree->none, best_tx, supertx_size);
+ break;
+ case PARTITION_VERT:
+ update_supertx_param(td, &pc_tree->vertical[0], best_tx, supertx_size);
+ if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize))
+ update_supertx_param(td, &pc_tree->vertical[1], best_tx, supertx_size);
+ break;
+ case PARTITION_HORZ:
+ update_supertx_param(td, &pc_tree->horizontal[0], best_tx, supertx_size);
+ if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize))
+ update_supertx_param(td, &pc_tree->horizontal[1], best_tx,
+ supertx_size);
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ update_supertx_param(td, pc_tree->leaf_split[0], best_tx, supertx_size);
+ } else {
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, subsize, best_tx,
+ supertx_size, pc_tree->split[0]);
+ update_supertx_param_sb(cpi, td, mi_row, mi_col + hbs, subsize, best_tx,
+ supertx_size, pc_tree->split[1]);
+ update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col, subsize, best_tx,
+ supertx_size, pc_tree->split[2]);
+ update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col + hbs, subsize,
+ best_tx, supertx_size, pc_tree->split[3]);
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ for (i = 0; i < 3; i++)
+ update_supertx_param(td, &pc_tree->horizontala[i], best_tx,
+ supertx_size);
+ break;
+ case PARTITION_HORZ_B:
+ for (i = 0; i < 3; i++)
+ update_supertx_param(td, &pc_tree->horizontalb[i], best_tx,
+ supertx_size);
+ break;
+ case PARTITION_VERT_A:
+ for (i = 0; i < 3; i++)
+ update_supertx_param(td, &pc_tree->verticala[i], best_tx, supertx_size);
+ break;
+ case PARTITION_VERT_B:
+ for (i = 0; i < 3; i++)
+ update_supertx_param(td, &pc_tree->verticalb[i], best_tx, supertx_size);
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+}
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
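+// Re-apply the already-picked modes over a block (or recursively over a
+// superblock) as a dry run, so NCOBMC can see the neighbors' final mode info.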
+static void set_mode_info_b(const AV1_COMP *const cpi,
+ const TileInfo *const tile, ThreadData *td,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCK *const x = &td->mb;
+ set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+ update_state(cpi, td, ctx, mi_row, mi_col, bsize, 1);
+}
+
+static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, TOKENEXTRA **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int hbs = mi_size_wide[bsize] / 2;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+ assert(bsize >= BLOCK_8X8);
+#endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, &pc_tree->none);
+ break;
+ case PARTITION_VERT:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
+ &pc_tree->vertical[0]);
+ if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) {
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize,
+ &pc_tree->vertical[1]);
+ }
+ break;
+ case PARTITION_HORZ:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
+ &pc_tree->horizontal[0]);
+ if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) {
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize,
+ &pc_tree->horizontal[1]);
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
+ pc_tree->leaf_split[0]);
+ } else {
+ set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col, subsize,
+ pc_tree->split[0]);
+ set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, subsize,
+ pc_tree->split[1]);
+ set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, subsize,
+ pc_tree->split[2]);
+ set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, subsize,
+ pc_tree->split[3]);
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2,
+ &pc_tree->horizontala[0]);
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2,
+ &pc_tree->horizontala[1]);
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize,
+ &pc_tree->horizontala[2]);
+ break;
+ case PARTITION_HORZ_B:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
+ &pc_tree->horizontalb[0]);
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2,
+ &pc_tree->horizontalb[1]);
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2,
+ &pc_tree->horizontalb[2]);
+ break;
+ case PARTITION_VERT_A:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2,
+ &pc_tree->verticala[0]);
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2,
+ &pc_tree->verticala[1]);
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize,
+ &pc_tree->verticala[2]);
+ break;
+ case PARTITION_VERT_B:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
+ &pc_tree->verticalb[0]);
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2,
+ &pc_tree->verticalb[1]);
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2,
+ &pc_tree->verticalb[2]);
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0 && "Invalid partition type."); break;
+ }
+}
+#endif
+
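+// Point the encoder's per-plane source buffers at the given frame, offset to
+// the (mi_row, mi_col) block position.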
+void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+ int mi_row, int mi_col) {
+ uint8_t *const buffers[3] = { src->y_buffer, src->u_buffer, src->v_buffer };
+ const int widths[3] = { src->y_crop_width, src->uv_crop_width,
+ src->uv_crop_width };
+ const int heights[3] = { src->y_crop_height, src->uv_crop_height,
+ src->uv_crop_height };
+ const int strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+ int i;
+
+ // Set current frame pointer.
+ x->e_mbd.cur_buf = src;
+
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ setup_pred_plane(&x->plane[i].src, x->e_mbd.mi[0]->mbmi.sb_type, buffers[i],
+ widths[i], heights[i], strides[i], mi_row, mi_col, NULL,
+ x->e_mbd.plane[i].subsampling_x,
+ x->e_mbd.plane[i].subsampling_y);
+}
+
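+// Derive the rate-distortion multiplier from the segment's effective
+// quantizer index.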
+static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ int8_t segment_id) {
+ int segment_qindex;
+ const AV1_COMMON *const cm = &cpi->common;
+ av1_init_plane_quantizers(cpi, x, segment_id);
+ aom_clear_system_state();
+ segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
+}
+
+static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_STATS *rd_cost,
+#if CONFIG_SUPERTX
+ int *totalrate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+ int i, orig_rdmult;
+ const int unify_bsize = CONFIG_CB4X4;
+
+ aom_clear_system_state();
+
+#if CONFIG_PVQ
+ x->pvq_speed = 1;
+ x->pvq_coded = 0;
+#endif
+#if CONFIG_CFL
+ // Don't store luma during RDO (we will store the best mode later).
+ x->cfl_store_y = 0;
+#endif
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ mbmi = &xd->mi[0]->mbmi;
+ mbmi->sb_type = bsize;
+#if CONFIG_RD_DEBUG
+ mbmi->mi_row = mi_row;
+ mbmi->mi_col = mi_col;
+#endif
+#if CONFIG_SUPERTX
+ // We set tx_size here as skip blocks would otherwise not set it.
+ // tx_size needs to be set at this point as supertx_enable in
+ // write_modes_sb is computed based on this, and if the garbage in memory
+ // just happens to be the supertx_size, then the packer will code this
+ // block as a supertx block, even if rdopt did not pick it as such.
+ mbmi->tx_size = max_txsize_lookup[bsize];
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ mbmi->partition = partition;
+#endif
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ pd[i].dqcoeff = ctx->dqcoeff[i];
+#if CONFIG_PVQ
+ pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
+#endif
+ p[i].eobs = ctx->eobs[i];
+#if CONFIG_LV_MAP
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+#endif
+ }
+
+#if CONFIG_PALETTE
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+#endif // CONFIG_PALETTE
+
+ ctx->skippable = 0;
+ ctx->pred_pixel_ready = 0;
+
+  // Set to zero to make sure we do not use the previously encoded frame's
+  // stats.
+ mbmi->skip = 0;
+
+#if CONFIG_CB4X4
+ x->skip_chroma_rd =
+ !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y);
+#endif
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->source_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ x->source_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+#else
+ x->source_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ orig_rdmult = x->rdmult;
+
+ if (aq_mode == VARIANCE_AQ) {
+ if (cpi->vaq_refresh) {
+ const int energy =
+ bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
+ mbmi->segment_id = av1_vaq_segment_id(energy);
+ // Re-initialise quantiser
+ av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
+ }
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == COMPLEXITY_AQ) {
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+ // If segment is boosted, use rdmult for that segment.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
+ x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+ }
+
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (frame_is_intra_only(cm)) {
+ av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+ *totalrate_nocoef = 0;
+#endif // CONFIG_SUPERTX
+ } else {
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost, bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+ *totalrate_nocoef = rd_cost->rate;
+#endif // CONFIG_SUPERTX
+ } else {
+ av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+#if CONFIG_SUPERTX
+ totalrate_nocoef,
+#endif // CONFIG_SUPERTX
+ bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+ assert(*totalrate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ }
+ } else {
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ // The decoder rejects sub8x8 partitions when SEG_LVL_SKIP is set.
+ rd_cost->rate = INT_MAX;
+ } else {
+ av1_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost,
+#if CONFIG_SUPERTX
+ totalrate_nocoef,
+#endif // CONFIG_SUPERTX
+ bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+ assert(*totalrate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ }
+ }
+ }
+
+  // Examine the resulting rate and, for AQ mode 2 (complexity AQ), make a
+  // segment choice.
+ if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) &&
+ (bsize >= BLOCK_16X16) &&
+ (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+ av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+ }
+
+ x->rdmult = orig_rdmult;
+
+  // TODO(jingning) The rate-distortion optimization flow needs to be
+  // refactored to provide proper exit/return handling.
+ if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
+
+ ctx->rate = rd_cost->rate;
+ ctx->dist = rd_cost->dist;
+}
+
+#if CONFIG_REF_MV
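+// Update the cascaded binary counts for the inter mode decision: first NEWMV
+// vs. the rest, then ZEROMV, then NEARESTMV vs. NEARMV under the ref-MV
+// context.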
+static void update_inter_mode_stats(FRAME_COUNTS *counts, PREDICTION_MODE mode,
+ int16_t mode_context) {
+ int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+ if (mode == NEWMV) {
+ ++counts->newmv_mode[mode_ctx][0];
+ return;
+ } else {
+ ++counts->newmv_mode[mode_ctx][1];
+
+ if (mode_context & (1 << ALL_ZERO_FLAG_OFFSET)) {
+ return;
+ }
+
+ mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+ if (mode == ZEROMV) {
+ ++counts->zeromv_mode[mode_ctx][0];
+ return;
+ } else {
+ ++counts->zeromv_mode[mode_ctx][1];
+ mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+ if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
+ if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
+ if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
+
+ ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
+ }
+ }
+}
+#endif
+
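+// Accumulate per-frame symbol counts (delta-q/delta-lf, reference frames,
+// motion modes, inter modes, and DRL indices) used to adapt the entropy
+// coding probabilities.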
+static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
+ int mi_col
+#if CONFIG_SUPERTX
+ ,
+ int supertx_enabled
+#endif
+ ) {
+#if CONFIG_DELTA_Q
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+#else
+ const MACROBLOCK *x = &td->mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+#endif
+ const MODE_INFO *const mi = xd->mi[0];
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int unify_bsize = CONFIG_CB4X4;
+
+#if CONFIG_DELTA_Q
+  // Delta quant applies to both intra and inter blocks.
+ const int super_block_upper_left = ((mi_row & 7) == 0) && ((mi_col & 7) == 0);
+
+ if (cm->delta_q_present_flag && (bsize != BLOCK_64X64 || !mbmi->skip) &&
+ super_block_upper_left) {
+ const int dq = (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res;
+ const int absdq = abs(dq);
+ int i;
+ for (i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
+ td->counts->delta_q[i][1]++;
+ }
+ if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
+ xd->prev_qindex = mbmi->current_q_index;
+#if CONFIG_EXT_DELTA_Q
+ if (cm->delta_lf_present_flag) {
+ const int dlf =
+ (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+ cm->delta_lf_res;
+ const int absdlf = abs(dlf);
+ for (i = 0; i < AOMMIN(absdlf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf[i][1]++;
+ }
+ if (absdlf < DELTA_LF_SMALL) td->counts->delta_lf[absdlf][0]++;
+ xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
+ }
+#endif
+ }
+#else
+ (void)mi_row;
+ (void)mi_col;
+#endif
+ if (!frame_is_intra_only(cm)) {
+ FRAME_COUNTS *const counts = td->counts;
+ const int inter_block = is_inter_block(mbmi);
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+ if (!seg_ref_active) {
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif
+ counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+      // If the segment reference feature is enabled, only a single reference
+      // frame is allowed for the segment, so exclude it from the reference
+      // frame counts used to work out probabilities.
+ if (inter_block) {
+ const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+#if CONFIG_EXT_REFS
+ const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+#endif // CONFIG_EXT_REFS
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+#if !SUB8X8_COMP_REF
+ if (mbmi->sb_type >= BLOCK_8X8)
+ counts->comp_inter[av1_get_reference_mode_context(cm, xd)]
+ [has_second_ref(mbmi)]++;
+#else
+ counts->comp_inter[av1_get_reference_mode_context(cm, xd)]
+ [has_second_ref(mbmi)]++;
+#endif
+ }
+
+ if (has_second_ref(mbmi)) {
+#if CONFIG_EXT_REFS
+ const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+
+ counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0][bit]++;
+ if (!bit) {
+ counts->comp_ref[av1_get_pred_context_comp_ref_p1(cm, xd)][1]
+ [ref0 == LAST_FRAME]++;
+ } else {
+ counts->comp_ref[av1_get_pred_context_comp_ref_p2(cm, xd)][2]
+ [ref0 == GOLDEN_FRAME]++;
+ }
+
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0]
+ [ref1 == ALTREF_FRAME]++;
+#else
+ counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0]
+ [ref0 == GOLDEN_FRAME]++;
+#endif // CONFIG_EXT_REFS
+ } else {
+#if CONFIG_EXT_REFS
+ const int bit = (ref0 == ALTREF_FRAME || ref0 == BWDREF_FRAME);
+
+ counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+ if (bit) {
+ counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+ [ref0 != BWDREF_FRAME]++;
+ } else {
+ const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+ counts
+ ->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+ if (!bit1) {
+ counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
+ [ref0 != LAST_FRAME]++;
+ } else {
+ counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
+ [ref0 != LAST3_FRAME]++;
+ }
+ }
+#else
+ counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0]
+ [ref0 != LAST_FRAME]++;
+ if (ref0 != LAST_FRAME) {
+ counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+ [ref0 != GOLDEN_FRAME]++;
+ }
+#endif // CONFIG_EXT_REFS
+ }
+
+#if CONFIG_EXT_INTER
+ if (cm->reference_mode != COMPOUND_REFERENCE &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif
+ is_interintra_allowed(mbmi)) {
+ const int bsize_group = size_group_lookup[bsize];
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+ counts->interintra[bsize_group][1]++;
+ counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+ if (is_interintra_wedge_used(bsize))
+ counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+ } else {
+ counts->interintra[bsize_group][0]++;
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ const MOTION_MODE motion_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ mi);
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_INTER
+ if (mbmi->ref_frame[1] != INTRA_FRAME)
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ {
+ if (motion_allowed == WARPED_CAUSAL)
+ counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++;
+ else if (motion_allowed == OBMC_CAUSAL)
+ counts->obmc[mbmi->sb_type][mbmi->motion_mode == OBMC_CAUSAL]++;
+ }
+#else
+ if (motion_allowed > SIMPLE_TRANSLATION)
+ counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++;
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+ if (cm->reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode)
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ && mbmi->motion_mode == SIMPLE_TRANSLATION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ ) {
+ counts->compound_interinter[bsize][mbmi->interinter_compound_type]++;
+ }
+#endif // CONFIG_EXT_INTER
+ }
+ }
+
+ if (inter_block &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ int16_t mode_ctx;
+#if !CONFIG_REF_MV
+ mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+#endif
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+ const PREDICTION_MODE mode = mbmi->mode;
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (has_second_ref(mbmi)) {
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+ } else {
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, -1);
+ update_inter_mode_stats(counts, mode, mode_ctx);
+#if CONFIG_EXT_INTER
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_EXT_INTER
+ if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+#else
+ if (mbmi->mode == NEWMV) {
+#endif
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ int idx;
+
+ for (idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+
+ if (mbmi->ref_mv_idx == idx) break;
+ }
+ }
+ }
+
+#if CONFIG_EXT_INTER
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+#else
+ if (mbmi->mode == NEARMV) {
+#endif
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ int idx;
+
+ for (idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+
+ if (mbmi->ref_mv_idx == idx - 1) break;
+ }
+ }
+ }
+#else
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(mode))
+ ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+ else
+#endif // CONFIG_EXT_INTER
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+#endif
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (has_second_ref(mbmi)) {
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ ++counts->inter_compound_mode[mode_ctx]
+ [INTER_COMPOUND_OFFSET(b_mode)];
+ } else {
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, j);
+ update_inter_mode_stats(counts, b_mode, mode_ctx);
+#if CONFIG_EXT_INTER
+ }
+#endif // CONFIG_EXT_INTER
+#else
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(b_mode))
+ ++counts->inter_compound_mode[mode_ctx]
+ [INTER_COMPOUND_OFFSET(b_mode)];
+ else
+#endif // CONFIG_EXT_INTER
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+#endif
+ }
+ }
+ }
+ }
+ }
+}
+
+typedef struct {
+ ENTROPY_CONTEXT a[2 * MAX_MIB_SIZE * MAX_MB_PLANE];
+ ENTROPY_CONTEXT l[2 * MAX_MIB_SIZE * MAX_MB_PLANE];
+ PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+ PARTITION_CONTEXT sl[MAX_MIB_SIZE];
+#if CONFIG_VAR_TX
+ TXFM_CONTEXT *p_ta;
+ TXFM_CONTEXT *p_tl;
+ TXFM_CONTEXT ta[MAX_MIB_SIZE];
+ TXFM_CONTEXT tl[MAX_MIB_SIZE];
+#endif
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
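+// Restore the above/left entropy and partition (and, with CONFIG_VAR_TX,
+// transform) contexts saved by save_context(), undoing any updates made while
+// searching this block.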
+static void restore_context(MACROBLOCK *x,
+ const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row,
+ int mi_col,
+#if CONFIG_PVQ
+ od_rollback_buffer *rdo_buf,
+#endif
+ BLOCK_SIZE bsize) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ int p;
+ const int num_4x4_blocks_wide =
+ block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_blocks_high =
+ block_size_high[bsize] >> tx_size_high_log2[0];
+ int mi_width = mi_size_wide[bsize];
+ int mi_height = mi_size_high[bsize];
+ for (p = 0; p < MAX_MB_PLANE; p++) {
+ memcpy(xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+ ctx->a + num_4x4_blocks_wide * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+ xd->plane[p].subsampling_x);
+ memcpy(xd->left_context[p] +
+ ((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y),
+ ctx->l + num_4x4_blocks_high * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+ xd->plane[p].subsampling_y);
+ }
+ memcpy(xd->above_seg_context + mi_col, ctx->sa,
+ sizeof(*xd->above_seg_context) * mi_width);
+ memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), ctx->sl,
+ sizeof(xd->left_seg_context[0]) * mi_height);
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = ctx->p_ta;
+ xd->left_txfm_context = ctx->p_tl;
+ memcpy(xd->above_txfm_context, ctx->ta,
+ sizeof(*xd->above_txfm_context) * mi_width);
+ memcpy(xd->left_txfm_context, ctx->tl,
+ sizeof(*xd->left_txfm_context) * mi_height);
+#endif
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, rdo_buf);
+#endif
+}
+
+static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col,
+#if CONFIG_PVQ
+ od_rollback_buffer *rdo_buf,
+#endif
+ BLOCK_SIZE bsize) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ int p;
+ const int num_4x4_blocks_wide =
+ block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_blocks_high =
+ block_size_high[bsize] >> tx_size_high_log2[0];
+ int mi_width = mi_size_wide[bsize];
+ int mi_height = mi_size_high[bsize];
+
+  // Buffer the above/left context information of the block being searched.
+ for (p = 0; p < MAX_MB_PLANE; ++p) {
+ memcpy(ctx->a + num_4x4_blocks_wide * p,
+ xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+ xd->plane[p].subsampling_x);
+ memcpy(ctx->l + num_4x4_blocks_high * p,
+ xd->left_context[p] +
+ ((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y),
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+ xd->plane[p].subsampling_y);
+ }
+ memcpy(ctx->sa, xd->above_seg_context + mi_col,
+ sizeof(*xd->above_seg_context) * mi_width);
+ memcpy(ctx->sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK),
+ sizeof(xd->left_seg_context[0]) * mi_height);
+#if CONFIG_VAR_TX
+ memcpy(ctx->ta, xd->above_txfm_context,
+ sizeof(*xd->above_txfm_context) * mi_width);
+ memcpy(ctx->tl, xd->left_txfm_context,
+ sizeof(*xd->left_txfm_context) * mi_height);
+ ctx->p_ta = xd->above_txfm_context;
+ ctx->p_tl = xd->left_txfm_context;
+#endif
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, rdo_buf);
+#endif
+}
+
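+// Encode a single block: apply the mode chosen in ctx, optionally re-check
+// the NCOBMC decision, run encode_superblock(), and update the counts on a
+// non-dry pass.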
+static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile,
+ ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif
+ PICK_MODE_CONTEXT *ctx, int *rate) {
+ MACROBLOCK *const x = &td->mb;
+#if (CONFIG_MOTION_VAR && CONFIG_NCOBMC) || CONFIG_EXT_DELTA_Q
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+ int check_ncobmc;
+#endif
+#endif
+
+ set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+#if CONFIG_EXT_PARTITION_TYPES
+ x->e_mbd.mi[0]->mbmi.partition = partition;
+#endif
+ update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+ mbmi = &xd->mi[0]->mbmi;
+ const MOTION_MODE motion_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ xd->mi[0]);
+ check_ncobmc = is_inter_block(mbmi) && motion_allowed >= OBMC_CAUSAL;
+ if (!dry_run && check_ncobmc) {
+ av1_check_ncobmc_rd(cpi, x, mi_row, mi_col);
+ av1_setup_dst_planes(x->e_mbd.plane, bsize,
+ get_frame_new_buffer(&cpi->common), mi_row, mi_col);
+ }
+#endif
+ encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, ctx, rate);
+
+ if (!dry_run) {
+#if CONFIG_EXT_DELTA_Q
+ mbmi = &xd->mi[0]->mbmi;
+ if (bsize == BLOCK_64X64 && mbmi->skip == 1 && is_inter_block(mbmi) &&
+ cpi->common.delta_lf_present_flag) {
+ mbmi->current_delta_lf_from_base = xd->prev_delta_lf_from_base;
+ }
+#endif
+#if CONFIG_SUPERTX
+ update_stats(&cpi->common, td, mi_row, mi_col, 0);
+#else
+ update_stats(&cpi->common, td, mi_row, mi_col);
+#endif
+ }
+}
+
+static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, TOKENEXTRA **tp, int mi_row,
+ int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int is_partition_root = bsize >= BLOCK_8X8;
+ const int ctx = is_partition_root
+ ? partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ mi_row + hbs < cm->mi_rows,
+ mi_col + hbs < cm->mi_cols,
+#endif
+ bsize)
+ : -1;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+ assert(bsize >= BLOCK_8X8);
+#endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ if (!dry_run && ctx >= 0) td->counts->partition[ctx][partition]++;
+
+#if CONFIG_SUPERTX
+ if (!frame_is_intra_only(cm) && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+ partition != PARTITION_NONE && !xd->lossless[0]) {
+ int supertx_enabled;
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ supertx_enabled = check_supertx_sb(bsize, supertx_size, pc_tree);
+ if (supertx_enabled) {
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ int x_idx, y_idx, i;
+ uint8_t *dst_buf[3];
+ int dst_stride[3];
+ set_skip_context(xd, mi_row, mi_col);
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, dry_run,
+ pc_tree);
+
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ dst_buf[i] = xd->plane[i].dst.buf;
+ dst_stride[i] = xd->plane[i].dst.stride;
+ }
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, dry_run,
+ bsize, bsize, dst_buf, dst_stride, pc_tree);
+
+ set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+ set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
+
+ if (!x->skip) {
+ int this_rate = 0;
+ av1_encode_sb_supertx((AV1_COMMON *)cm, x, bsize);
+ av1_tokenize_sb_supertx(cpi, td, tp, dry_run, bsize, rate);
+ if (rate) *rate += this_rate;
+ } else {
+ xd->mi[0]->mbmi.skip = 1;
+ if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++;
+ reset_skip_context(xd, bsize);
+ }
+ if (!dry_run) {
+ for (y_idx = 0; y_idx < mi_height; y_idx++)
+ for (x_idx = 0; x_idx < mi_width; x_idx++) {
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width >
+ x_idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height >
+ y_idx) {
+ xd->mi[x_idx + y_idx * cm->mi_stride]->mbmi.skip =
+ xd->mi[0]->mbmi.skip;
+ }
+ }
+ td->counts->supertx[partition_supertx_context_lookup[partition]]
+ [supertx_size][1]++;
+ td->counts->supertx_size[supertx_size]++;
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) >
+ 1 &&
+ !xd->mi[0]->mbmi.skip) {
+ const int eset =
+ get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used);
+ if (eset > 0) {
+ ++td->counts
+ ->inter_ext_tx[eset][supertx_size][xd->mi[0]->mbmi.tx_type];
+ }
+ }
+#else
+ if (supertx_size < TX_32X32 && !xd->mi[0]->mbmi.skip) {
+ ++td->counts->inter_ext_tx[supertx_size][xd->mi[0]->mbmi.tx_type];
+ }
+#endif // CONFIG_EXT_TX
+ }
+#if CONFIG_EXT_PARTITION_TYPES
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize,
+ partition);
+#else
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif
+#if CONFIG_VAR_TX
+ set_txfm_ctxs(supertx_size, mi_width, mi_height, xd->mi[0]->mbmi.skip,
+ xd);
+#endif // CONFIG_VAR_TX
+ return;
+ } else {
+ if (!dry_run) {
+ td->counts->supertx[partition_supertx_context_lookup[partition]]
+ [supertx_size][0]++;
+ }
+ }
+ }
+#endif // CONFIG_SUPERTX
+
+ switch (partition) {
+ case PARTITION_NONE:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->none, rate);
+ break;
+ case PARTITION_VERT:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->vertical[0], rate);
+ if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) {
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->vertical[1], rate);
+ }
+ break;
+ case PARTITION_HORZ:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->horizontal[0], rate);
+ if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) {
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->horizontal[1], rate);
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ pc_tree->leaf_split[0], rate);
+ } else {
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col, dry_run, subsize,
+ pc_tree->split[0], rate);
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ pc_tree->split[1], rate);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ pc_tree->split[2], rate);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ subsize, pc_tree->split[3], rate);
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+ &pc_tree->horizontala[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->horizontala[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontala[2], rate);
+ break;
+ case PARTITION_HORZ_B:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+ &pc_tree->horizontalb[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->horizontalb[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->horizontalb[2], rate);
+ break;
+ case PARTITION_VERT_A:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+ &pc_tree->verticala[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->verticala[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, &pc_tree->verticala[2], rate);
+
+ break;
+ case PARTITION_VERT_B:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+ &pc_tree->verticalb[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->verticalb[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->verticalb[2], rate);
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0 && "Invalid partition type."); break;
+ }
+
+#if CONFIG_EXT_PARTITION_TYPES
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif // CONFIG_EXT_PARTITION_TYPES
+}
+
+// Check to see if the given partition size is allowed for a specified number
+// of mi block rows and columns remaining in the image.
+// If not, return the largest allowed partition size.
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+ int cols_left, int *bh, int *bw) {
+ if (rows_left <= 0 || cols_left <= 0) {
+ return AOMMIN(bsize, BLOCK_8X8);
+ } else {
+ for (; bsize > 0; bsize -= 3) {
+ *bh = mi_size_high[bsize];
+ *bw = mi_size_wide[bsize];
+ if ((*bh <= rows_left) && (*bw <= cols_left)) {
+ break;
+ }
+ }
+ }
+ return bsize;
+}
+
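+// Assign partition sizes to all mode info entries of a superblock that
+// straddles the frame border, shrinking block sizes as needed so that every
+// block fits inside the remaining rows and columns.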
+static void set_partial_sb_partition(const AV1_COMMON *const cm, MODE_INFO *mi,
+ int bh_in, int bw_in,
+ int mi_rows_remaining,
+ int mi_cols_remaining, BLOCK_SIZE bsize,
+ MODE_INFO **mib) {
+ int bh = bh_in;
+ int r, c;
+ for (r = 0; r < cm->mib_size; r += bh) {
+ int bw = bw_in;
+ for (c = 0; c < cm->mib_size; c += bw) {
+ const int index = r * cm->mi_stride + c;
+ mib[index] = mi + index;
+ mib[index]->mbmi.sb_type = find_partition_size(
+ bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
+ }
+ }
+}
+
+// This function attempts to set all mode info entries in a given superblock
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed, in which case this code attempts to choose the largest
+// allowable partition.
+static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mib, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_rows_remaining = tile->mi_row_end - mi_row;
+ const int mi_cols_remaining = tile->mi_col_end - mi_col;
+ int block_row, block_col;
+ MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col;
+ int bh = mi_size_high[bsize];
+ int bw = mi_size_wide[bsize];
+
+ assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
+
+  // Apply the requested partition size to the SB if it is all "in image".
+ if ((mi_cols_remaining >= cm->mib_size) &&
+ (mi_rows_remaining >= cm->mib_size)) {
+ for (block_row = 0; block_row < cm->mib_size; block_row += bh) {
+ for (block_col = 0; block_col < cm->mib_size; block_col += bw) {
+ int index = block_row * cm->mi_stride + block_col;
+ mib[index] = mi_upper_left + index;
+ mib[index]->mbmi.sb_type = bsize;
+ }
+ }
+ } else {
+ // Else this is a partial SB.
+ set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining,
+ mi_cols_remaining, bsize, mib);
+ }
+}
+
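+// Evaluate the rate-distortion cost of the partitioning already stored in
+// mib (e.g. carried over from the previous frame), optionally comparing it
+// against PARTITION_NONE and a one-level split, then encode the best of the
+// candidates considered.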
+static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MODE_INFO **mib,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate, int64_t *dist,
+#if CONFIG_SUPERTX
+ int *rate_nocoef,
+#endif
+ int do_recon, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ int i;
+ const int pl = (bsize >= BLOCK_8X8)
+ ? partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ mi_row + hbs < cm->mi_rows,
+ mi_col + hbs < cm->mi_cols,
+#endif
+ bsize)
+ : 0;
+ const PARTITION_TYPE partition =
+ (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS last_part_rdc, none_rdc, chosen_rdc;
+ BLOCK_SIZE sub_subsize = BLOCK_4X4;
+ int splits_below = 0;
+ BLOCK_SIZE bs_type = mib[0]->mbmi.sb_type;
+ int do_partition_search = 1;
+ PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+ const int unify_bsize = CONFIG_CB4X4;
+#if CONFIG_SUPERTX
+ int last_part_rate_nocoef = INT_MAX;
+ int none_rate_nocoef = INT_MAX;
+ int chosen_rate_nocoef = INT_MAX;
+#endif
+#if CONFIG_PVQ
+ od_rollback_buffer pre_rdo_buf;
+#endif
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ assert(num_4x4_blocks_wide_lookup[bsize] ==
+ num_4x4_blocks_high_lookup[bsize]);
+
+ av1_invalid_rd_stats(&last_part_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_invalid_rd_stats(&chosen_rdc);
+
+ pc_tree->partitioning = partition;
+
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
+#if !CONFIG_PVQ
+ save_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ x->mb_energy = av1_block_energy(cpi, x, bsize);
+ }
+
+ if (do_partition_search &&
+ cpi->sf.partition_search_type == SEARCH_PARTITION &&
+ cpi->sf.adjust_partitioning_from_last_frame) {
+ // Check if any of the sub blocks are further split.
+ if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
+ sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
+ splits_below = 1;
+ for (i = 0; i < 4; i++) {
+ int jj = i >> 1, ii = i & 0x01;
+ MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs];
+ if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
+ splits_below = 0;
+ }
+ }
+ }
+
+    // If the partition is not PARTITION_NONE, also try PARTITION_NONE unless
+    // each of the 4 splits is split even further.
+ if (partition != PARTITION_NONE && !splits_below &&
+ mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+ pc_tree->partitioning = PARTITION_NONE;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+#if CONFIG_SUPERTX
+ &none_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_NONE,
+#endif
+ bsize, ctx_none, INT64_MAX);
+
+ if (none_rdc.rate < INT_MAX) {
+ none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, none_rdc.rate, none_rdc.dist);
+#if CONFIG_SUPERTX
+ none_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+#endif
+ }
+
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+ mib[0]->mbmi.sb_type = bs_type;
+ pc_tree->partitioning = partition;
+ }
+ }
+
+ switch (partition) {
+ case PARTITION_NONE:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_NONE,
+#endif
+ bsize, ctx_none, INT64_MAX);
+ break;
+ case PARTITION_HORZ:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[0], INT64_MAX);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_row + hbs < cm->mi_rows) {
+ RD_STATS tmp_rdc;
+#if CONFIG_SUPERTX
+ int rt_nocoef = 0;
+#endif
+ PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
+ av1_init_rd_stats(&tmp_rdc);
+ update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx_h, NULL);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[1], INT64_MAX);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = INT_MAX;
+#endif
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += rt_nocoef;
+#endif
+ }
+ break;
+ case PARTITION_VERT:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[0], INT64_MAX);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_col + hbs < cm->mi_cols) {
+ RD_STATS tmp_rdc;
+#if CONFIG_SUPERTX
+ int rt_nocoef = 0;
+#endif
+ PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0];
+ av1_init_rd_stats(&tmp_rdc);
+ update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx_v, NULL);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
+ INT64_MAX);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = INT_MAX;
+#endif
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += rt_nocoef;
+#endif
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
+ subsize, pc_tree->leaf_split[0], INT64_MAX);
+ break;
+ }
+ last_part_rdc.rate = 0;
+ last_part_rdc.dist = 0;
+ last_part_rdc.rdcost = 0;
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = 0;
+#endif
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ int jj = i >> 1, ii = i & 0x01;
+ RD_STATS tmp_rdc;
+#if CONFIG_SUPERTX
+ int rt_nocoef;
+#endif
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ av1_init_rd_stats(&tmp_rdc);
+ rd_use_partition(cpi, td, tile_data,
+ mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+ &tmp_rdc.dist,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+ i != 3, pc_tree->split[i]);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = INT_MAX;
+#endif
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += rt_nocoef;
+#endif
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B: assert(0 && "Cannot handle extended partition types");
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0); break;
+ }
+
+ if (last_part_rdc.rate < INT_MAX) {
+ last_part_rdc.rate += cpi->partition_cost[pl][partition];
+ last_part_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, last_part_rdc.rate, last_part_rdc.dist);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += cpi->partition_cost[pl][partition];
+#endif
+ }
+
+ if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame &&
+ cpi->sf.partition_search_type == SEARCH_PARTITION &&
+ partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
+ (mi_row + bs < cm->mi_rows || mi_row + hbs == cm->mi_rows) &&
+ (mi_col + bs < cm->mi_cols || mi_col + hbs == cm->mi_cols)) {
+ BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
+ chosen_rdc.rate = 0;
+ chosen_rdc.dist = 0;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = 0;
+#endif
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ // Split partition.
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ RD_STATS tmp_rdc;
+#if CONFIG_SUPERTX
+ int rt_nocoef = 0;
+#endif
+#if CONFIG_PVQ
+ od_rollback_buffer buf;
+#endif
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+#if !CONFIG_PVQ
+ save_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ save_context(x, &x_ctx, mi_row, mi_col, &buf, bsize);
+#endif
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &tmp_rdc,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
+ split_subsize, &pc_tree->split[i]->none, INT64_MAX);
+
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &buf, bsize);
+#endif
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&chosen_rdc);
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = INT_MAX;
+#endif
+ break;
+ }
+
+ chosen_rdc.rate += tmp_rdc.rate;
+ chosen_rdc.dist += tmp_rdc.dist;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef += rt_nocoef;
+#endif
+
+ if (i != 3)
+ encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx,
+ OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
+
+ chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_SPLIT];
+#endif
+ }
+ if (chosen_rdc.rate < INT_MAX) {
+ chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+ chosen_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, chosen_rdc.rate, chosen_rdc.dist);
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+#endif
+ }
+ }
+
+  // If last_part is better, set the partitioning to that.
+ if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+ mib[0]->mbmi.sb_type = bsize;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
+ chosen_rdc = last_part_rdc;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = last_part_rate_nocoef;
+#endif
+ }
+  // If none was better, set the partitioning to that.
+ if (none_rdc.rdcost < chosen_rdc.rdcost) {
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+ chosen_rdc = none_rdc;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = none_rate_nocoef;
+#endif
+ }
+
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+
+  // We must have chosen a partitioning and encoding by this point, or we will
+  // fail later on. There are no further opportunities for success.
+ if (bsize == cm->sb_size)
+ assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+ if (do_recon) {
+ if (bsize == cm->sb_size) {
+ // NOTE: To get estimate for rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ *rate = chosen_rdc.rate;
+ *dist = chosen_rdc.dist;
+#if CONFIG_SUPERTX
+ *rate_nocoef = chosen_rate_nocoef;
+#endif
+}
+
+/* clang-format off */
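+// Lookup tables used to widen an observed partition-size range: each block
+// size maps down to a smaller minimum and up to a larger maximum search
+// bound (clamped at the extremes).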
+static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2
+#endif
+ BLOCK_4X4, // 4x4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 4x8, 8x4, 8x8
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 8x16, 16x8, 16x16
+ BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64
+#if CONFIG_EXT_PARTITION
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 // 64x128, 128x64, 128x128
+#endif // CONFIG_EXT_PARTITION
+};
+
+static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 2x2, 2x4, 4x2
+#endif
+ BLOCK_8X8, // 4x4
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 4x8, 8x4, 8x8
+ BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, // 8x16, 16x8, 16x16
+ BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32
+ BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 32x64, 64x32, 64x64
+#if CONFIG_EXT_PARTITION
+ BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST // 64x128, 128x64, 128x128
+#endif // CONFIG_EXT_PARTITION
+};
+
+// Next square block size less than or equal to the current block size.
+static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2
+#endif
+ BLOCK_4X4, // 4x4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x8, 8x4, 8x8
+ BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 8x16, 16x8, 16x16
+ BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32
+ BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64
+#if CONFIG_EXT_PARTITION
+ BLOCK_64X64, BLOCK_64X64, BLOCK_128X128 // 64x128, 128x64, 128x128
+#endif // CONFIG_EXT_PARTITION
+};
+/* clang-format on */
+
+// Look at all the mode_info entries for blocks that are part of this
+// partition and find the min and max values for sb_type.
+// At the moment this is designed to work on a superblock but could be
+// adjusted to use a size parameter.
+//
+// The min and max are assumed to have been initialized prior to calling this
+// function so repeat calls can accumulate a min and max of more than one
+// superblock.
+static void get_sb_partition_size_range(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd, MODE_INFO **mib,
+ BLOCK_SIZE *min_block_size,
+ BLOCK_SIZE *max_block_size) {
+ int i, j;
+ int index = 0;
+
+ // Check the sb_type for each block that belongs to this region.
+ for (i = 0; i < cm->mib_size; ++i) {
+ for (j = 0; j < cm->mib_size; ++j) {
+ MODE_INFO *mi = mib[index + j];
+ BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : BLOCK_4X4;
+ *min_block_size = AOMMIN(*min_block_size, sb_type);
+ *max_block_size = AOMMAX(*max_block_size, sb_type);
+ }
+ index += xd->mi_stride;
+ }
+}
+
+// Look at neighboring blocks and set a min and max partition size based on
+// what they chose.
+static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col, BLOCK_SIZE *min_block_size,
+ BLOCK_SIZE *max_block_size) {
+ AV1_COMMON *const cm = &cpi->common;
+ MODE_INFO **mi = xd->mi;
+ const int left_in_image = xd->left_available && mi[-1];
+ const int above_in_image = xd->up_available && mi[-xd->mi_stride];
+ const int mi_rows_remaining = tile->mi_row_end - mi_row;
+ const int mi_cols_remaining = tile->mi_col_end - mi_col;
+ int bh, bw;
+ BLOCK_SIZE min_size = BLOCK_4X4;
+ BLOCK_SIZE max_size = BLOCK_LARGEST;
+
+ // Trap case where we do not have a prediction.
+ if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
+    // Defaults: start min at the largest size and max at the smallest so the
+    // scans below can tighten them.
+ min_size = BLOCK_LARGEST;
+ max_size = BLOCK_4X4;
+
+    // NOTE: Each call to get_sb_partition_size_range() uses the previously
+    // passed-in values for min and max as a starting point.
+    // Find the min and max partition sizes used in the previous frame at
+    // this location.
+ if (cm->frame_type != KEY_FRAME) {
+ MODE_INFO **prev_mi =
+ &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
+ get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size);
+ }
+ // Find the min and max partition sizes used in the left superblock
+ if (left_in_image) {
+ MODE_INFO **left_sb_mi = &mi[-cm->mib_size];
+ get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size);
+ }
+    // Find the min and max partition sizes used in the above superblock.
+ if (above_in_image) {
+ MODE_INFO **above_sb_mi = &mi[-xd->mi_stride * cm->mib_size];
+ get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size);
+ }
+
+ // Adjust observed min and max for "relaxed" auto partition case.
+ if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
+ min_size = min_partition_size[min_size];
+ max_size = max_partition_size[max_size];
+ }
+ }
+
+ // Check border cases where max and min from neighbors may not be legal.
+ max_size = find_partition_size(max_size, mi_rows_remaining, mi_cols_remaining,
+ &bh, &bw);
+ min_size = AOMMIN(min_size, max_size);
+
+ // Test for blocks at the edge of the active image.
+ // This may be the actual edge of the image or where there are formatting
+ // bars.
+ if (av1_active_edge_sb(cpi, mi_row, mi_col)) {
+ min_size = BLOCK_4X4;
+ } else {
+ min_size = AOMMIN(cpi->sf.rd_auto_partition_min_limit, min_size);
+ }
+
+ // When use_square_partition_only is true, make sure at least one square
+ // partition is allowed by selecting the next smaller square size as
+ // *min_block_size.
+ if (cpi->sf.use_square_partition_only) {
+ min_size = AOMMIN(min_size, next_square_size[max_size]);
+ }
+
+ *min_block_size = AOMMIN(min_size, cm->sb_size);
+ *max_block_size = AOMMIN(max_size, cm->sb_size);
+}
+
+// TODO(jingning) refactor functions setting partition search range
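+// Derive a [min_bs, max_bs] partition search range for the current block
+// from the collocated blocks of the previous frame and the left/above
+// neighbors.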
+static void set_partition_range(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ BLOCK_SIZE *const min_bs,
+ BLOCK_SIZE *const max_bs) {
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ int idx, idy;
+
+ const int idx_str = cm->mi_stride * mi_row + mi_col;
+ MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str];
+ BLOCK_SIZE min_size = BLOCK_64X64; // default values
+ BLOCK_SIZE max_size = BLOCK_4X4;
+
+ if (prev_mi) {
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ const MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx];
+ const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize;
+ min_size = AOMMIN(min_size, bs);
+ max_size = AOMMAX(max_size, bs);
+ }
+ }
+ }
+
+ if (xd->left_available) {
+ for (idy = 0; idy < mi_height; ++idy) {
+ const MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1];
+ const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize;
+ min_size = AOMMIN(min_size, bs);
+ max_size = AOMMAX(max_size, bs);
+ }
+ }
+
+ if (xd->up_available) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ const MODE_INFO *const mi = xd->mi[idx - cm->mi_stride];
+ const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize;
+ min_size = AOMMIN(min_size, bs);
+ max_size = AOMMAX(max_size, bs);
+ }
+ }
+
+ if (min_size == max_size) {
+ min_size = min_partition_size[min_size];
+ max_size = max_partition_size[max_size];
+ }
+
+ *min_bs = AOMMIN(min_size, cm->sb_size);
+ *max_bs = AOMMIN(max_size, cm->sb_size);
+}
+
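+// Save and restore the predicted motion vectors around a mode search so
+// that adaptive motion search can reuse them across partition candidates.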
+static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+ memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
+}
+
+static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+ memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
+}
+
+#if CONFIG_FP_MB_STATS
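+// Per-block-size thresholds (indexed like BLOCK_SIZES) used with the
+// first-pass macroblock statistics to decide when to skip or force
+// partition decisions.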
+const int qindex_skip_threshold_lookup[BLOCK_SIZES] = {
+ 0,
+ 10,
+ 10,
+ 30,
+ 40,
+ 40,
+ 60,
+ 80,
+ 80,
+ 90,
+ 100,
+ 100,
+ 120,
+#if CONFIG_EXT_PARTITION
+ // TODO(debargha): What are the correct numbers here?
+ 130,
+ 130,
+ 150
+#endif // CONFIG_EXT_PARTITION
+};
+const int qindex_split_threshold_lookup[BLOCK_SIZES] = {
+ 0,
+ 3,
+ 3,
+ 7,
+ 15,
+ 15,
+ 30,
+ 40,
+ 40,
+ 60,
+ 80,
+ 80,
+ 120,
+#if CONFIG_EXT_PARTITION
+ // TODO(debargha): What are the correct numbers here?
+ 160,
+ 160,
+ 240
+#endif // CONFIG_EXT_PARTITION
+};
+const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = {
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 4,
+ 4,
+ 6,
+#if CONFIG_EXT_PARTITION
+ // TODO(debargha): What are the correct numbers here?
+ 8,
+ 8,
+ 10
+#endif // CONFIG_EXT_PARTITION
+};
+
+typedef enum {
+ MV_ZERO = 0,
+ MV_LEFT = 1,
+ MV_UP = 2,
+ MV_RIGHT = 3,
+ MV_DOWN = 4,
+ MV_INVALID
+} MOTION_DIRECTION;
+
+static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
+ if (fp_byte & FPMB_MOTION_ZERO_MASK) {
+ return MV_ZERO;
+ } else if (fp_byte & FPMB_MOTION_LEFT_MASK) {
+ return MV_LEFT;
+ } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) {
+ return MV_RIGHT;
+ } else if (fp_byte & FPMB_MOTION_UP_MASK) {
+ return MV_UP;
+ } else {
+ return MV_DOWN;
+ }
+}
+
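+// Score the disagreement between two first-pass motion directions: 0 when
+// they match, 2 when the direction codes differ by exactly two (e.g. the
+// left/right and up/down pairs), and 1 for any other mismatch.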
+static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
+ MOTION_DIRECTION that_mv) {
+ if (this_mv == that_mv) {
+ return 0;
+ } else {
+ return abs(this_mv - that_mv) == 2 ? 2 : 1;
+ }
+}
+#endif
+
+#if CONFIG_EXT_PARTITION_TYPES
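+// Rate-distortion test for the three-way extended partitions (HORZ_A/B and
+// VERT_A/B): pick modes for the three sub-blocks in sequence, accumulating
+// their cost and bailing out once it can no longer beat *best_rdc.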
+static void rd_test_partition3(
+ const AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TOKENEXTRA **tp, PC_TREE *pc_tree, RD_STATS *best_rdc,
+ PICK_MODE_CONTEXT ctxs[3], PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PARTITION_TYPE partition,
+#if CONFIG_SUPERTX
+ int64_t best_rd, int *best_rate_nocoef, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+#endif
+ int mi_row0, int mi_col0, BLOCK_SIZE subsize0, int mi_row1, int mi_col1,
+ BLOCK_SIZE subsize1, int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_STATS this_rdc, sum_rdc;
+#if CONFIG_SUPERTX
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ int this_rate_nocoef, sum_rate_nocoef;
+ int abort_flag;
+ const int supertx_allowed = !frame_is_intra_only(cm) &&
+ bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+ !xd->lossless[0];
+#endif
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
+
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row0, mi_col0, &sum_rdc,
+#if CONFIG_SUPERTX
+ &sum_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ subsize0, &ctxs[0], best_rdc->rdcost);
+#if CONFIG_SUPERTX
+ abort_flag = sum_rdc.rdcost >= best_rd;
+#endif
+
+#if CONFIG_SUPERTX
+ if (sum_rdc.rdcost < INT64_MAX) {
+#else
+ if (sum_rdc.rdcost < best_rdc->rdcost) {
+#endif
+ PICK_MODE_CONTEXT *ctx_0 = &ctxs[0];
+ update_state(cpi, td, ctx_0, mi_row0, mi_col0, subsize0, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row0, mi_col0, subsize0,
+ ctx_0, NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_0);
+
+#if CONFIG_SUPERTX
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
+ &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ subsize1, &ctxs[1], INT64_MAX - sum_rdc.rdcost);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ subsize1, &ctxs[1], best_rdc->rdcost - sum_rdc.rdcost);
+#endif // CONFIG_SUPERTX
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif
+ }
+
+#if CONFIG_SUPERTX
+ if (sum_rdc.rdcost < INT64_MAX) {
+#else
+ if (sum_rdc.rdcost < best_rdc->rdcost) {
+#endif
+ PICK_MODE_CONTEXT *ctx_1 = &ctxs[1];
+ update_state(cpi, td, ctx_1, mi_row1, mi_col1, subsize1, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row1, mi_col1, subsize1,
+ ctx_1, NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_1);
+
+#if CONFIG_SUPERTX
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
+ &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ subsize2, &ctxs[2], INT64_MAX - sum_rdc.rdcost);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ subsize2, &ctxs[2], best_rdc->rdcost - sum_rdc.rdcost);
+#endif // CONFIG_SUPERTX
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif
+ }
+
+#if CONFIG_SUPERTX
+ if (supertx_allowed && !abort_flag && sum_rdc.rdcost < INT64_MAX) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+ pc_tree->partitioning = partition;
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+ [supertx_size],
+ 0);
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_STATS tmp_rdc = { sum_rate_nocoef, 0, 0 };
+
+ restore_context(x, x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+ &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+ [supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
+
+ if (sum_rdc.rdcost < best_rdc->rdcost) {
+ int pl = partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ has_rows, has_cols,
+#endif
+ bsize);
+ sum_rdc.rate += cpi->partition_cost[pl][partition];
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += cpi->partition_cost[pl][partition];
+#endif
+ if (sum_rdc.rdcost < best_rdc->rdcost) {
+#if CONFIG_SUPERTX
+ *best_rate_nocoef = sum_rate_nocoef;
+ assert(*best_rate_nocoef >= 0);
+#endif
+ *best_rdc = sum_rdc;
+ pc_tree->partitioning = partition;
+ }
+ }
+ }
+ }
+}
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected depending on previous rate-distortion optimization
+// results, for encoding speed-up.
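+// Recursively search for the best partitioning of the block at
+// (mi_row, mi_col): PARTITION_NONE, SPLIT, HORZ and VERT (plus the extended
+// types when enabled) are tried subject to the speed-feature constraints,
+// and the winner is stored in pc_tree.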
+static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost,
+#if CONFIG_SUPERTX
+ int *rate_nocoef,
+#endif
+ int64_t best_rd, PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_step = mi_size_wide[bsize] / 2;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ const TOKENEXTRA *const tp_orig = *tp;
+ PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+#if CONFIG_UNPOISON_PARTITION_CTX
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int has_rows = mi_row + hbs < cm->mi_rows;
+ const int has_cols = mi_col + hbs < cm->mi_cols;
+#else
+ int tmp_partition_cost[PARTITION_TYPES];
+#endif
+ BLOCK_SIZE subsize;
+ RD_STATS this_rdc, sum_rdc, best_rdc;
+ const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+ int do_square_split = bsize_at_least_8x8;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+ const int pl = bsize_at_least_8x8
+ ? partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ has_rows, has_cols,
+#endif
+ bsize)
+ : 0;
+#else
+ const int unify_bsize = 0;
+ const int pl = partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ has_rows, has_cols,
+#endif
+ bsize);
+#endif // CONFIG_CB4X4
+ const int *partition_cost = cpi->partition_cost[pl];
+#if CONFIG_SUPERTX
+ int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX;
+ int abort_flag;
+ const int supertx_allowed = !frame_is_intra_only(cm) &&
+ bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+ !xd->lossless[0];
+#endif // CONFIG_SUPERTX
+
+ int do_rectangular_split = 1;
+#if CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+  // Override any skipping of rectangular partition operations for edge
+  // blocks, where a split in the clipped direction is forced.
+ const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
+ const int force_vert_split = (mi_col + mi_step >= cm->mi_cols);
+ const int xss = x->e_mbd.plane[1].subsampling_x;
+ const int yss = x->e_mbd.plane[1].subsampling_y;
+
+ BLOCK_SIZE min_size = x->min_partition_size;
+ BLOCK_SIZE max_size = x->max_partition_size;
+
+#if CONFIG_FP_MB_STATS
+ unsigned int src_diff_var = UINT_MAX;
+ int none_complexity = 0;
+#endif
+
+ int partition_none_allowed = !force_horz_split && !force_vert_split;
+ int partition_horz_allowed =
+ !force_vert_split && yss <= xss && bsize_at_least_8x8;
+ int partition_vert_allowed =
+ !force_horz_split && xss <= yss && bsize_at_least_8x8;
+
+#if CONFIG_PVQ
+ od_rollback_buffer pre_rdo_buf;
+#endif
+
+ (void)*tp_orig;
+
+#if !CONFIG_UNPOISON_PARTITION_CTX
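+  // At frame edges where a split is forced, the bitstream signals at most a
+  // binary choice, so override the partition costs: disallowed types cost
+  // INT_MAX, and when both directions are forced the split is free.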
+ if (force_horz_split || force_vert_split) {
+ tmp_partition_cost[PARTITION_NONE] = INT_MAX;
+
+ if (!force_vert_split) { // force_horz_split only
+ tmp_partition_cost[PARTITION_VERT] = INT_MAX;
+ tmp_partition_cost[PARTITION_HORZ] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 0);
+ tmp_partition_cost[PARTITION_SPLIT] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 1);
+ } else if (!force_horz_split) { // force_vert_split only
+ tmp_partition_cost[PARTITION_HORZ] = INT_MAX;
+ tmp_partition_cost[PARTITION_VERT] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 0);
+ tmp_partition_cost[PARTITION_SPLIT] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 1);
+    } else {  // force_horz_split && force_vert_split
+ tmp_partition_cost[PARTITION_HORZ] = INT_MAX;
+ tmp_partition_cost[PARTITION_VERT] = INT_MAX;
+ tmp_partition_cost[PARTITION_SPLIT] = 0;
+ }
+
+ partition_cost = tmp_partition_cost;
+ }
+#endif
+
+#if CONFIG_VAR_TX
+#ifndef NDEBUG
+  // Nothing should rely on the default value of this array (it is just
+  // leftover state from encoding the previous block). Set it to a magic
+  // number when debugging.
+ memset(x->blk_skip[0], 234, sizeof(x->blk_skip[0]));
+#endif // NDEBUG
+#endif // CONFIG_VAR_TX
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ av1_init_rd_stats(&this_rdc);
+ av1_init_rd_stats(&sum_rdc);
+ av1_invalid_rd_stats(&best_rdc);
+ best_rdc.rdcost = best_rd;
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_block_energy(cpi, x, bsize);
+
+ if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
+ const int cb_partition_search_ctrl =
+ ((pc_tree->index == 0 || pc_tree->index == 3) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1;
+
+ if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
+ set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
+ }
+
+  // Determine which partition types to search according to the speed
+  // features. The thresholds set here must be square block sizes.
+ if (cpi->sf.auto_min_max_partition_size) {
+ const int no_partition_allowed = (bsize <= max_size && bsize >= min_size);
+ // Note: Further partitioning is NOT allowed when bsize == min_size already.
+ const int partition_allowed = (bsize <= max_size && bsize > min_size);
+ partition_none_allowed &= no_partition_allowed;
+ partition_horz_allowed &= partition_allowed || force_horz_split;
+ partition_vert_allowed &= partition_allowed || force_vert_split;
+ do_square_split &= bsize > min_size;
+ }
+ if (cpi->sf.use_square_partition_only) {
+ partition_horz_allowed &= force_horz_split;
+ partition_vert_allowed &= force_vert_split;
+ }
+
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
+#if !CONFIG_PVQ
+ save_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row,
+ mi_col, bsize);
+ }
+#endif
+
+#if CONFIG_FP_MB_STATS
+  // Decide whether to split directly and skip searching PARTITION_NONE,
+  // based on the first-pass block statistics.
+ if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_square_split &&
+ partition_none_allowed && src_diff_var > 4 &&
+ cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+    // Compute a complexity measure: essentially, the inconsistency of the
+    // motion vectors obtained from the first pass within the current block.
+ for (r = mb_row; r < mb_row_end; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+
+ MOTION_DIRECTION this_mv;
+ MOTION_DIRECTION right_mv;
+ MOTION_DIRECTION bottom_mv;
+
+ this_mv =
+ get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
+
+ // to its right
+ if (c != mb_col_end - 1) {
+ right_mv = get_motion_direction_fp(
+ cpi->twopass.this_frame_mb_stats[mb_index + 1]);
+ none_complexity += get_motion_inconsistency(this_mv, right_mv);
+ }
+
+ // to its bottom
+ if (r != mb_row_end - 1) {
+ bottom_mv = get_motion_direction_fp(
+ cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
+ none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
+ }
+
+ // do not count its left and top neighbors to avoid double counting
+ }
+ }
+
+ if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
+ partition_none_allowed = 0;
+ }
+ }
+#endif
+
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+#if CONFIG_SUPERTX
+ &this_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_NONE,
+#endif
+ bsize, ctx_none, best_rdc.rdcost);
+ if (this_rdc.rate != INT_MAX) {
+ if (bsize_at_least_8x8) {
+ this_rdc.rate += partition_cost[PARTITION_NONE];
+ this_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+#if CONFIG_SUPERTX
+ this_rate_nocoef += partition_cost[PARTITION_NONE];
+#endif
+ }
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ // Adjust dist breakout threshold according to the partition size.
+ const int64_t dist_breakout_thr =
+ cpi->sf.partition_search_breakout_dist_thr >>
+ ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+ (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]));
+ const int rate_breakout_thr =
+ cpi->sf.partition_search_breakout_rate_thr *
+ num_pels_log2_lookup[bsize];
+
+ best_rdc = this_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = this_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif
+ if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
+
+ // If all y, u, v transform blocks in this partition are skippable, and
+ // the dist & rate are within the thresholds, the partition search is
+ // terminated for current branch of the partition search tree.
+ // The dist & rate thresholds are set to 0 at speed 0 to disable the
+ // early termination at that speed.
+ if (!x->e_mbd.lossless[xd->mi[0]->mbmi.segment_id] &&
+ (ctx_none->skippable && best_rdc.dist < dist_breakout_thr &&
+ best_rdc.rate < rate_breakout_thr)) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+
+#if CONFIG_FP_MB_STATS
+        // Check whether every 16x16 first-pass block statistic shows zero
+        // motion and the corresponding first-pass residue is small enough.
+        // If so, check the difference variance between the current frame
+        // and the last frame; if the variance is small enough, stop
+        // further splitting in the RD optimization.
+ if (cpi->use_fp_mb_stats && do_square_split &&
+ cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+ int skip = 1;
+ for (r = mb_row; r < mb_row_end; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+ if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_MOTION_ZERO_MASK) ||
+ !(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_ERROR_SMALL_MASK)) {
+ skip = 0;
+ break;
+ }
+ }
+ if (skip == 0) {
+ break;
+ }
+ }
+ if (skip) {
+ if (src_diff_var == UINT_MAX) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(
+ cpi, &x->plane[0].src, mi_row, mi_col, bsize);
+ }
+ if (src_diff_var < 8) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+ }
+ }
+#endif
+ }
+ }
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+ }
+
+  // Store the estimated motion vector.
+ if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
+
+ // PARTITION_SPLIT
+ // TODO(jingning): use the motion vectors given by the above search as
+ // the starting point of motion search in the following partition type check.
+ if (do_square_split) {
+ int reached_last_index = 0;
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
+ pc_tree->leaf_split[0]->pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
+ if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
+ pc_tree->leaf_split[0]->pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter;
+#endif
+#if CONFIG_SUPERTX
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+ &sum_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
+ subsize, pc_tree->leaf_split[0], INT64_MAX);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
+ subsize, pc_tree->leaf_split[0], best_rdc.rdcost);
+#endif // CONFIG_SUPERTX
+ if (sum_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif
+ }
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup
+ [PARTITION_SPLIT]][supertx_size],
+ 0);
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (is_inter_mode(pc_tree->leaf_split[0]->mic.mbmi.mode)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_STATS tmp_rdc;
+ av1_init_rd_stats(&tmp_rdc);
+ tmp_rdc.rate = sum_rate_nocoef;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+ &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup
+ [PARTITION_SPLIT]][supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
+ reached_last_index = 1;
+ } else {
+ int idx;
+#if CONFIG_SUPERTX
+ for (idx = 0; idx < 4 && sum_rdc.rdcost < INT64_MAX; ++idx) {
+#else
+ for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
+#endif // CONFIG_SUPERTX
+ const int x_idx = (idx & 1) * mi_step;
+ const int y_idx = (idx >> 1) * mi_step;
+
+ if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+ continue;
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+ pc_tree->split[idx]->index = idx;
+#if CONFIG_SUPERTX
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+ mi_col + x_idx, subsize, &this_rdc, &this_rate_nocoef,
+ INT64_MAX - sum_rdc.rdcost, pc_tree->split[idx]);
+#else
+ rd_pick_partition(
+ cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &this_rdc, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[idx]);
+#endif // CONFIG_SUPERTX
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+ break;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif // CONFIG_SUPERTX
+ }
+ }
+ reached_last_index = (idx == 4);
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup
+ [PARTITION_SPLIT]][supertx_size],
+ 0);
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_STATS tmp_rdc;
+ av1_init_rd_stats(&tmp_rdc);
+ tmp_rdc.rate = sum_rate_nocoef;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+ &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup
+ [PARTITION_SPLIT]][supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
+ }
+
+ if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rate += partition_cost[PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += partition_cost[PARTITION_SPLIT];
+#endif // CONFIG_SUPERTX
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = sum_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ } else if (cpi->sf.less_rectangular_check) {
+      // Skip the rectangular partition test when a larger block size gives a
+      // better RD cost.
+ do_rectangular_split &= !partition_none_allowed;
+ }
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+  } // if (do_square_split)
+
+ // PARTITION_HORZ
+ if (partition_horz_allowed &&
+ (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) {
+ subsize = get_subsize(bsize, PARTITION_HORZ);
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[0].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[0].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter;
+#endif
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_SUPERTX
+ &sum_rate_nocoef,
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[0], best_rdc.rdcost);
+
+#if CONFIG_SUPERTX
+ abort_flag =
+ (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) ||
+ (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
+ if (sum_rdc.rdcost < INT64_MAX &&
+#else
+ if (sum_rdc.rdcost < best_rdc.rdcost &&
+#endif // CONFIG_SUPERTX
+ !force_horz_split && (bsize > BLOCK_8X8 || unify_bsize)) {
+ PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
+ update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx_h, NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
+
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[1].pred_interp_filter =
+ ctx_h->mic.mbmi.interp_filter[0];
+#else
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[1].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter;
+#endif
+#if CONFIG_SUPERTX
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+ &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[1], INT64_MAX);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[1],
+ best_rdc.rdcost - sum_rdc.rdcost);
+#endif // CONFIG_SUPERTX
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif // CONFIG_SUPERTX
+ }
+ }
+
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_HORZ;
+
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
+ [supertx_size],
+ 0);
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_STATS tmp_rdc;
+ av1_init_rd_stats(&tmp_rdc);
+ tmp_rdc.rate = sum_rate_nocoef;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc
+ ->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
+ [supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rate += partition_cost[PARTITION_HORZ];
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += partition_cost[PARTITION_HORZ];
+#endif // CONFIG_SUPERTX
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = sum_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ pc_tree->partitioning = PARTITION_HORZ;
+ }
+ }
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+ }
+
+ // PARTITION_VERT
+ if (partition_vert_allowed &&
+ (do_rectangular_split || av1_active_v_edge(cpi, mi_col, mi_step))) {
+ subsize = get_subsize(bsize, PARTITION_VERT);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[0].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[0].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter;
+#endif
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_SUPERTX
+ &sum_rate_nocoef,
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[0], best_rdc.rdcost);
+#if CONFIG_SUPERTX
+ abort_flag =
+ (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) ||
+ (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
+ if (sum_rdc.rdcost < INT64_MAX &&
+#else
+ if (sum_rdc.rdcost < best_rdc.rdcost &&
+#endif // CONFIG_SUPERTX
+ !force_vert_split && (bsize > BLOCK_8X8 || unify_bsize)) {
+ update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ &pc_tree->vertical[0], NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[1].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[1].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter;
+#endif
+#if CONFIG_SUPERTX
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+ &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[1],
+ INT64_MAX - sum_rdc.rdcost);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[1],
+ best_rdc.rdcost - sum_rdc.rdcost);
+#endif // CONFIG_SUPERTX
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif // CONFIG_SUPERTX
+ }
+ }
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_VERT;
+
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
+ [supertx_size],
+ 0);
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_STATS tmp_rdc;
+ av1_init_rd_stats(&tmp_rdc);
+ tmp_rdc.rate = sum_rate_nocoef;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc
+ ->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
+ [supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rate += partition_cost[PARTITION_VERT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += partition_cost[PARTITION_VERT];
+#endif // CONFIG_SUPERTX
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = sum_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ pc_tree->partitioning = PARTITION_VERT;
+ }
+ }
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+ }
+
+#if CONFIG_EXT_PARTITION_TYPES
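+  // Each 3-way partition splits one half of the block into two square
+  // quarters: the *_A variants split the first (top/left) half, the *_B
+  // variants the second.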
+ // PARTITION_HORZ_A
+ if (partition_horz_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_HORZ_A);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_HORZ_A,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, bsize2, mi_row, mi_col + mi_step, bsize2,
+ mi_row + mi_step, mi_col, subsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+ // PARTITION_HORZ_B
+ if (partition_horz_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_HORZ_B);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_HORZ_B,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, subsize, mi_row + mi_step, mi_col,
+ bsize2, mi_row + mi_step, mi_col + mi_step, bsize2);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+ // PARTITION_VERT_A
+ if (partition_vert_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_VERT_A);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_VERT_A,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, bsize2, mi_row + mi_step, mi_col, bsize2,
+ mi_row, mi_col + mi_step, subsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+ // PARTITION_VERT_B
+ if (partition_vert_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_VERT_B);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_VERT_B,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, subsize, mi_row, mi_col + mi_step,
+ bsize2, mi_row + mi_step, mi_col + mi_step, bsize2);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+  // TODO(jbb): This code was added to silence a static analysis warning
+  // about best_rd being unused after this point. It should be refactored
+  // so that the duplicate checks occur in a subfunction and are thus used.
+ (void)best_rd;
+ *rd_cost = best_rdc;
+#if CONFIG_SUPERTX
+ *rate_nocoef = best_rate_nocoef;
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_CFL
+ // Store the luma for the best mode
+ x->cfl_store_y = 1;
+#endif
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+ pc_tree->index != 3) {
+ if (bsize == cm->sb_size) {
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+ set_mode_info_sb(cpi, td, tile_info, tp, mi_row, mi_col, bsize, pc_tree);
+#endif
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+#if CONFIG_CFL
+ x->cfl_store_y = 0;
+#endif
+
+ if (bsize == cm->sb_size) {
+#if !CONFIG_PVQ && !CONFIG_LV_MAP
+ assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip));
+#endif
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+}
+
+static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, int mi_row,
+ TOKENEXTRA **tp) {
+ AV1_COMMON *const cm = &cpi->common;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ int mi_col;
+#if CONFIG_EXT_PARTITION
+ const int leaf_nodes = 256;
+#else
+ const int leaf_nodes = 64;
+#endif // CONFIG_EXT_PARTITION
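+  // One entry per 8x8 block in the superblock: 256 for 128x128 superblocks,
+  // 64 for 64x64.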
+
+ // Initialize the left context for the new SB row
+ av1_zero_left_context(xd);
+
+#if CONFIG_DELTA_Q
+  // Reset the delta-q state at the start of every tile
+ if (cm->delta_q_present_flag)
+ if (mi_row == tile_info->mi_row_start) xd->prev_qindex = cm->base_qindex;
+#if CONFIG_EXT_DELTA_Q
+ if (cm->delta_lf_present_flag)
+ if (mi_row == tile_info->mi_row_start) xd->prev_delta_lf_from_base = 0;
+#endif
+#endif
+
+ // Code each SB in the row
+ for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+ mi_col += cm->mib_size) {
+ const struct segmentation *const seg = &cm->seg;
+ int dummy_rate;
+ int64_t dummy_dist;
+ RD_STATS dummy_rdc;
+#if CONFIG_SUPERTX
+ int dummy_rate_nocoef;
+#endif // CONFIG_SUPERTX
+ int i;
+ int seg_skip = 0;
+
+ const int idx_str = cm->mi_stride * mi_row + mi_col;
+ MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+ PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+
+ av1_update_boundary_info(cm, tile_info, mi_row, mi_col);
+
+ if (sf->adaptive_pred_interp_filter) {
+ for (i = 0; i < leaf_nodes; ++i)
+ td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
+
+ for (i = 0; i < leaf_nodes; ++i) {
+ td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+ }
+ }
+
+ av1_zero(x->pred_mv);
+ pc_root->index = 0;
+
+ if (seg->enabled) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ int segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
+ seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ }
+
+#if CONFIG_DELTA_Q
+ if (cm->delta_q_present_flag) {
+ // Test mode for delta quantization
+ int sb_row = mi_row >> 3;
+ int sb_col = mi_col >> 3;
+ int sb_stride = (cm->width + MAX_SB_SIZE - 1) >> MAX_SB_SIZE_LOG2;
+ int index = ((sb_row * sb_stride + sb_col + 8) & 31) - 16;
+
+ // Ensure divisibility of delta_qindex by delta_q_res
+ int offset_qindex = (index < 0 ? -index - 8 : index - 8);
+ int qmask = ~(cm->delta_q_res - 1);
+ int current_qindex = clamp(cm->base_qindex + offset_qindex,
+ cm->delta_q_res, 256 - cm->delta_q_res);
+
+ current_qindex =
+ ((current_qindex - cm->base_qindex + cm->delta_q_res / 2) & qmask) +
+ cm->base_qindex;
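+      // E.g. with base_qindex 64, offset_qindex 5 and delta_q_res 4, the
+      // clamped qindex 69 rounds to ((69 - 64 + 2) & ~3) + 64 = 68.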
+ assert(current_qindex > 0);
+
+ xd->delta_qindex = current_qindex - cm->base_qindex;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ xd->mi[0]->mbmi.current_q_index = current_qindex;
+#if !CONFIG_EXT_DELTA_Q
+ xd->mi[0]->mbmi.segment_id = 0;
+#endif // CONFIG_EXT_DELTA_Q
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
+#if CONFIG_EXT_DELTA_Q
+ if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) {
+ int j, k;
+ int lfmask = ~(cm->delta_lf_res - 1);
+ int current_delta_lf_from_base = offset_qindex / 2;
+ current_delta_lf_from_base =
+ ((current_delta_lf_from_base + cm->delta_lf_res / 2) & lfmask);
+
+        // Pre-set the delta lf for the loop filter. Note that this value is
+        // set before mi is assigned for each block in the current superblock.
+ for (j = 0; j < AOMMIN(cm->mib_size, cm->mi_rows - mi_row); j++) {
+ for (k = 0; k < AOMMIN(cm->mib_size, cm->mi_cols - mi_col); k++) {
+ cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
+ .mbmi.current_delta_lf_from_base = current_delta_lf_from_base;
+ }
+ }
+ }
+#endif // CONFIG_EXT_DELTA_Q
+ }
+#endif // CONFIG_DELTA_Q
+
+ x->source_variance = UINT_MAX;
+ if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
+ BLOCK_SIZE bsize;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
+ bsize = seg_skip ? cm->sb_size : sf->always_this_block_size;
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+#endif // CONFIG_SUPERTX
+ 1, pc_root);
+ } else if (cpi->partition_search_skippable_frame) {
+ BLOCK_SIZE bsize;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
+ bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+#endif // CONFIG_SUPERTX
+ 1, pc_root);
+ } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
+ choose_partitioning(cpi, td, tile_info, x, mi_row, mi_col);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+#endif // CONFIG_SUPERTX
+ 1, pc_root);
+ } else {
+ // If required set upper and lower partition size limits
+ if (sf->auto_min_max_partition_size) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
+ rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
+ &x->min_partition_size, &x->max_partition_size);
+ }
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rdc,
+#if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+#endif // CONFIG_SUPERTX
+ INT64_MAX, pc_root);
+ }
+ }
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ const int mi_rows_per_update =
+ MI_SIZE * AOMMAX(cm->mi_rows / MI_SIZE / COEF_PROBS_BUFS, 1);
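+    // Adapt the coefficient probabilities at up to COEF_PROBS_BUFS - 1
+    // roughly evenly spaced SB-row boundaries within the frame.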
+ if ((mi_row + MI_SIZE) % mi_rows_per_update == 0 &&
+ mi_row + MI_SIZE < cm->mi_rows &&
+ cm->coef_probs_update_idx < COEF_PROBS_BUFS - 1) {
+ TX_SIZE t;
+ SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+
+ for (t = 0; t < TX_SIZES; ++t)
+ av1_full_to_model_counts(cpi->td.counts->coef[t],
+ cpi->td.rd_counts.coef_counts[t]);
+ av1_partial_adapt_probs(cm, mi_row, mi_col);
+ ++cm->coef_probs_update_idx;
+ av1_copy(subframe_stats->coef_probs_buf[cm->coef_probs_update_idx],
+ cm->fc->coef_probs);
+ av1_copy(subframe_stats->coef_counts_buf[cm->coef_probs_update_idx],
+ cpi->td.rd_counts.coef_counts);
+ av1_copy(subframe_stats->eob_counts_buf[cm->coef_probs_update_idx],
+ cm->counts.eob_branch);
+ av1_fill_token_costs(x->token_costs, cm->fc->coef_probs);
+ }
+ }
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+}
+
+static void init_encode_frame_mb_context(AV1_COMP *cpi) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ // Copy data over into macro block data structures.
+ av1_setup_src_planes(x, cpi->source, 0, 0);
+
+ av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
+}
+
+#if !CONFIG_REF_ADAPT
+static int check_dual_ref_flags(AV1_COMP *cpi) {
+ const int ref_flags = cpi->ref_frame_flags;
+
+ if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
+ return 0;
+ } else {
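+    // Count the enabled reference frames; compound prediction needs at
+    // least two.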
+ return (!!(ref_flags & AOM_GOLD_FLAG) + !!(ref_flags & AOM_LAST_FLAG) +
+#if CONFIG_EXT_REFS
+ !!(ref_flags & AOM_LAST2_FLAG) + !!(ref_flags & AOM_LAST3_FLAG) +
+ !!(ref_flags & AOM_BWD_FLAG) +
+#endif // CONFIG_EXT_REFS
+ !!(ref_flags & AOM_ALT_FLAG)) >= 2;
+ }
+}
+#endif // !CONFIG_REF_ADAPT
+
+#if !CONFIG_VAR_TX
+static void reset_skip_tx_size(AV1_COMMON *cm, TX_SIZE max_tx_size) {
+ int mi_row, mi_col;
+ const int mis = cm->mi_stride;
+ MODE_INFO **mi_ptr = cm->mi_grid_visible;
+
+ for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) {
+ for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+ if (txsize_sqr_up_map[mi_ptr[mi_col]->mbmi.tx_size] > max_tx_size)
+ mi_ptr[mi_col]->mbmi.tx_size = max_tx_size;
+ }
+ }
+}
+#endif
+
+static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
+ if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME;
+#if CONFIG_EXT_REFS
+ // We will not update the golden frame with an internal overlay frame
+ else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) ||
+ cpi->rc.is_src_frame_ext_arf)
+#else
+ else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame)
+#endif
+ return ALTREF_FRAME;
+ else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
+ return GOLDEN_FRAME;
+ else
+    // TODO(zoeliu): Investigate whether a frame_type other than
+    // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
+ return LAST_FRAME;
+}
+
+static TX_MODE select_tx_mode(const AV1_COMP *cpi, MACROBLOCKD *const xd) {
+ int i, all_lossless = 1;
+
+ if (cpi->common.seg.enabled) {
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ if (!xd->lossless[i]) {
+ all_lossless = 0;
+ break;
+ }
+ }
+ } else {
+ all_lossless = xd->lossless[0];
+ }
+ if (all_lossless) return ONLY_4X4;
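+  // ALLOW_32X32 + CONFIG_TX64X64 evaluates to ALLOW_64X64 when 64x64
+  // transforms are compiled in, and to ALLOW_32X32 otherwise.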
+ if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
+ return ALLOW_32X32 + CONFIG_TX64X64;
+ else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
+ cpi->sf.tx_size_search_method == USE_TX_8X8)
+ return TX_MODE_SELECT;
+ else
+ return cpi->common.tx_mode;
+}
+
+void av1_init_tile_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int tile_col, tile_row;
+ TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
+ unsigned int tile_tok = 0;
+
+ if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
+ if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
+ CHECK_MEM_ERROR(
+ cm, cpi->tile_data,
+ aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+ cpi->allocated_tiles = tile_cols * tile_rows;
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ int i, j;
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ for (j = 0; j < MAX_MODES; ++j) {
+ tile_data->thresh_freq_fact[i][j] = 32;
+ tile_data->mode_map[i][j] = j;
+ }
+ }
+#if CONFIG_PVQ
+        // This will be dynamically increased as more PVQ blocks are encoded.
+ tile_data->pvq_q.buf_len = 1000;
+ CHECK_MEM_ERROR(
+ cm, tile_data->pvq_q.buf,
+ aom_malloc(tile_data->pvq_q.buf_len * sizeof(PVQ_INFO)));
+ tile_data->pvq_q.curr_pos = 0;
+#endif
+ }
+ }
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileInfo *const tile_info =
+ &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+ av1_tile_init(tile_info, cm, tile_row, tile_col);
+
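+      // Token buffers are laid out back to back: this tile's tokens start
+      // where the previous tile's allocation ends.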
+ cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
+ pre_tok = cpi->tile_tok[tile_row][tile_col];
+ tile_tok = allocated_tokens(*tile_info);
+#if CONFIG_PVQ
+ cpi->tile_data[tile_row * tile_cols + tile_col].pvq_q.curr_pos = 0;
+#endif
+ }
+ }
+}
+
+void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
+ int tile_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
+ int mi_row;
+
+#if CONFIG_DEPENDENT_HORZTILES
+#if CONFIG_TILE_GROUPS
+ if ((!cm->dependent_horz_tiles) || (tile_row == 0) ||
+ tile_info->tg_horz_boundary) {
+#else
+ if ((!cm->dependent_horz_tiles) || (tile_row == 0)) {
+#endif
+ av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end);
+ }
+#else
+ av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end);
+#endif
+
+ // Set up pointers to per thread motion search counters.
+ this_tile->m_search_count = 0; // Count of motion search hits.
+ this_tile->ex_search_count = 0; // Exhaustive mesh search hits.
+ td->mb.m_search_count_ptr = &this_tile->m_search_count;
+ td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
+
+#if CONFIG_PVQ
+ td->mb.pvq_q = &this_tile->pvq_q;
+
+  // TODO(yushin): Activity masking info needs to be signaled in the bitstream.
+ td->mb.daala_enc.use_activity_masking = AV1_PVQ_ENABLE_ACTIVITY_MASKING;
+
+  if (td->mb.daala_enc.use_activity_masking)
+    td->mb.daala_enc.qm = OD_HVS_QM;  // Hard-coded; enc/dec must sync.
+  else
+    td->mb.daala_enc.qm = OD_FLAT_QM;  // Hard-coded; enc/dec must sync.
+
+ {
+ // FIXME: Multiple segments support
+ int segment_id = 0;
+ int rdmult = set_segment_rdmult(cpi, &td->mb, segment_id);
+ int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+#if CONFIG_HIGHBITDEPTH
+ const int quantizer_shift = td->mb.e_mbd.bd - 8;
+#else
+ const int quantizer_shift = 0;
+#endif // CONFIG_HIGHBITDEPTH
+ int64_t q_ac = OD_MAXI(
+ 1, av1_ac_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift);
+ int64_t q_dc = OD_MAXI(
+ 1, av1_dc_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift);
+ /* td->mb.daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA; */
+ td->mb.daala_enc.pvq_norm_lambda =
+ (double)rdmult * (64 / 16) / (q_ac * q_ac * (1 << RDDIV_BITS));
+ td->mb.daala_enc.pvq_norm_lambda_dc =
+ (double)rdmult * (64 / 16) / (q_dc * q_dc * (1 << RDDIV_BITS));
+ // printf("%f\n", td->mb.daala_enc.pvq_norm_lambda);
+ }
+ od_init_qm(td->mb.daala_enc.state.qm, td->mb.daala_enc.state.qm_inv,
+ td->mb.daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
+
+ if (td->mb.daala_enc.use_activity_masking) {
+ int pli;
+ int use_masking = td->mb.daala_enc.use_activity_masking;
+ int segment_id = 0;
+ int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+
+ for (pli = 0; pli < MAX_MB_PLANE; pli++) {
+ int i;
+ int q;
+
+ q = qindex;
+ if (q <= OD_DEFAULT_QMS[use_masking][0][pli].interp_q << OD_COEFF_SHIFT) {
+ od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q,
+ &OD_DEFAULT_QMS[use_masking][0][pli], NULL);
+ } else {
+ i = 0;
+ while (OD_DEFAULT_QMS[use_masking][i + 1][pli].qm_q4 != NULL &&
+ q > OD_DEFAULT_QMS[use_masking][i + 1][pli].interp_q
+ << OD_COEFF_SHIFT) {
+ i++;
+ }
+ od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q,
+ &OD_DEFAULT_QMS[use_masking][i][pli],
+ &OD_DEFAULT_QMS[use_masking][i + 1][pli]);
+ }
+ }
+ }
+
+#if CONFIG_DAALA_EC
+ od_ec_enc_init(&td->mb.daala_enc.w.ec, 65025);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+
+#if CONFIG_DAALA_EC
+ od_ec_enc_reset(&td->mb.daala_enc.w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+#endif // #if CONFIG_PVQ
+
+#if CONFIG_EC_ADAPT
+ this_tile->tctx = *cm->fc;
+ td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+#endif // #if CONFIG_EC_ADAPT
+
+#if CONFIG_CFL
+ MACROBLOCKD *const xd = &td->mb.e_mbd;
+ xd->cfl = &this_tile->cfl;
+ cfl_init(xd->cfl, cm, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+#endif
+
+#if CONFIG_PVQ
+ td->mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
+#endif // CONFIG_PVQ
+
+ for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+ mi_row += cm->mib_size) {
+ encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+ }
+
+ cpi->tok_count[tile_row][tile_col] =
+ (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
+ assert(cpi->tok_count[tile_row][tile_col] <= allocated_tokens(*tile_info));
+#if CONFIG_PVQ
+#if CONFIG_DAALA_EC
+ od_ec_enc_clear(&td->mb.daala_enc.w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+
+ td->mb.pvq_q->last_pos = td->mb.pvq_q->curr_pos;
+  // Rewind the current position so that the bitstream can be written
+  // from the first PVQ block.
+ td->mb.pvq_q->curr_pos = 0;
+
+ td->mb.pvq_q = NULL;
+#endif
+}
+
+static void encode_tiles(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int tile_col, tile_row;
+
+ av1_init_tile_data(cpi);
+
+ for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col)
+ av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
+}
+
+#if CONFIG_FP_MB_STATS
+static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
+ AV1_COMMON *cm, uint8_t **this_frame_mb_stats) {
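+  // First-pass stats are stored as one byte per macroblock, one frame's
+  // worth after another.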
+ uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start +
+ cm->current_video_frame * cm->MBs * sizeof(uint8_t);
+
+ if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF;
+
+ *this_frame_mb_stats = mb_stats_in;
+
+ return 1;
+}
+#endif
+
+#if CONFIG_GLOBAL_MOTION
+#define GLOBAL_TRANS_TYPES_ENC 3 // highest motion model to search
+static int gm_get_params_cost(WarpedMotionParams *gm,
+ WarpedMotionParams *ref_gm, int allow_hp) {
+ assert(gm->wmtype < GLOBAL_TRANS_TYPES);
+ int params_cost = 0;
+ int trans_bits, trans_prec_diff;
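+  // The cases below deliberately fall through: each model pays the cost of
+  // its own parameters plus those of every simpler model beneath it.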
+ switch (gm->wmtype) {
+ case HOMOGRAPHY:
+ case HORTRAPEZOID:
+ case VERTRAPEZOID:
+ if (gm->wmtype != HORTRAPEZOID)
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF),
+ (gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF));
+ if (gm->wmtype != VERTRAPEZOID)
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF),
+ (gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF));
+ // Fallthrough intended
+ case AFFINE:
+ case ROTZOOM:
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ if (gm->wmtype != VERTRAPEZOID)
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ if (gm->wmtype >= AFFINE) {
+ if (gm->wmtype != HORTRAPEZOID)
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+ // Fallthrough intended
+ case TRANSLATION:
+ trans_bits = (gm->wmtype == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ trans_prec_diff = (gm->wmtype == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[0] >> trans_prec_diff),
+ (gm->wmmat[0] >> trans_prec_diff));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[1] >> trans_prec_diff),
+ (gm->wmmat[1] >> trans_prec_diff));
+ // Fallthrough intended
+ case IDENTITY: break;
+ default: assert(0);
+ }
+ return (params_cost << AV1_PROB_COST_SHIFT);
+}
+#endif // CONFIG_GLOBAL_MOTION
+
+static void encode_frame_internal(AV1_COMP *cpi) {
+ ThreadData *const td = &cpi->td;
+ MACROBLOCK *const x = &td->mb;
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+ int i;
+#if CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS
+ const int last_fb_buf_idx = get_ref_frame_buf_idx(cpi, LAST_FRAME);
+#endif // CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS
+
+#if CONFIG_ADAPT_SCAN
+ av1_deliver_eob_threshold(cm, xd);
+#endif
+
+ x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size);
+ x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size);
+#if CONFIG_REF_MV
+ cm->setup_mi(cm);
+#endif
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+
+ av1_zero(*td->counts);
+ av1_zero(rdc->coef_counts);
+ av1_zero(rdc->comp_pred_diff);
+
+#if CONFIG_GLOBAL_MOTION
+ av1_zero(rdc->global_motion_used);
+ if (cpi->common.frame_type == INTER_FRAME && cpi->source &&
+ !cpi->global_motion_search_done) {
+ YV12_BUFFER_CONFIG *ref_buf;
+ int frame;
+ double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)];
+ const double *params_this_motion;
+ int inliers_by_motion[RANSAC_NUM_MOTIONS];
+ WarpedMotionParams tmp_wm_params;
+ static const double kInfiniteErrAdv = 1e12;
+ static const double kIdentityParams[MAX_PARAMDIM - 1] = {
+ 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
+ };
+
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ ref_buf = get_ref_frame_buffer(cpi, frame);
+ if (ref_buf) {
+ TransformationType model;
+ aom_clear_system_state();
+ for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
+ double best_erroradvantage = kInfiniteErrAdv;
+
+ // Initially set all params to identity.
+ for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ memcpy(params_by_motion + (MAX_PARAMDIM - 1) * i, kIdentityParams,
+ (MAX_PARAMDIM - 1) * sizeof(*params_by_motion));
+ }
+
+ compute_global_motion_feature_based(
+ model, cpi->source, ref_buf,
+#if CONFIG_HIGHBITDEPTH
+ cpi->common.bit_depth,
+#endif // CONFIG_HIGHBITDEPTH
+ inliers_by_motion, params_by_motion, RANSAC_NUM_MOTIONS);
+
+ for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ if (inliers_by_motion[i] == 0) continue;
+
+ params_this_motion = params_by_motion + (MAX_PARAMDIM - 1) * i;
+ convert_model_to_params(params_this_motion, &tmp_wm_params);
+
+ if (tmp_wm_params.wmtype != IDENTITY) {
+ const double erroradv_this_motion = refine_integerized_param(
+ &tmp_wm_params, tmp_wm_params.wmtype,
+#if CONFIG_HIGHBITDEPTH
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+#endif // CONFIG_HIGHBITDEPTH
+ ref_buf->y_buffer, ref_buf->y_width, ref_buf->y_height,
+ ref_buf->y_stride, cpi->source->y_buffer,
+ cpi->source->y_width, cpi->source->y_height,
+ cpi->source->y_stride, 3);
+ if (erroradv_this_motion < best_erroradvantage) {
+ best_erroradvantage = erroradv_this_motion;
+ // Save the wm_params modified by refine_integerized_param()
+ // rather than motion index to avoid rerunning refine() below.
+ memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
+ sizeof(WarpedMotionParams));
+ }
+ }
+ }
+ if (cm->global_motion[frame].wmtype <= AFFINE)
+ if (!get_shear_params(&cm->global_motion[frame]))
+ set_default_warp_params(&cm->global_motion[frame]);
+
+ if (cm->global_motion[frame].wmtype == TRANSLATION) {
+ cm->global_motion[frame].wmmat[0] =
+ convert_to_trans_prec(cm->allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[0]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ cm->global_motion[frame].wmmat[1] =
+ convert_to_trans_prec(cm->allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[1]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ }
+
+ // If the best error advantage found doesn't meet the threshold for
+ // this motion type, revert to IDENTITY.
+ if (!is_enough_erroradvantage(
+ best_erroradvantage,
+ gm_get_params_cost(&cm->global_motion[frame],
+ &cm->prev_frame->global_motion[frame],
+ cm->allow_high_precision_mv))) {
+ set_default_warp_params(&cm->global_motion[frame]);
+ }
+
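+          // Models are searched from simplest upward; keep the first one
+          // that survives the error-advantage check.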
+ if (cm->global_motion[frame].wmtype != IDENTITY) break;
+ }
+ aom_clear_system_state();
+ }
+ cpi->gmparams_cost[frame] =
+ gm_get_params_cost(&cm->global_motion[frame],
+ &cm->prev_frame->global_motion[frame],
+ cm->allow_high_precision_mv) +
+ cpi->gmtype_cost[cm->global_motion[frame].wmtype] -
+ cpi->gmtype_cost[IDENTITY];
+ }
+ cpi->global_motion_search_done = 1;
+ }
+ memcpy(cm->cur_frame->global_motion, cm->global_motion,
+ TOTAL_REFS_PER_FRAME * sizeof(WarpedMotionParams));
+#endif // CONFIG_GLOBAL_MOTION
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = cm->seg.enabled
+ ? av1_get_qindex(&cm->seg, i, cm->base_qindex)
+ : cm->base_qindex;
+ xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
+ cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+ xd->qindex[i] = qindex;
+ }
+
+ if (!cm->seg.enabled && xd->lossless[0]) x->optimize = 0;
+
+ cm->tx_mode = select_tx_mode(cpi, xd);
+
+#if CONFIG_DELTA_Q
+ // Fix delta q resolution for the moment
+ cm->delta_q_res = DEFAULT_DELTA_Q_RES;
+// Set delta_q_present_flag before it is used for the first time
+#if CONFIG_EXT_DELTA_Q
+ cm->delta_lf_res = DEFAULT_DELTA_LF_RES;
+ // update delta_q_present_flag and delta_lf_present_flag based on base_qindex
+ cm->delta_q_present_flag &= cm->base_qindex > 0;
+ cm->delta_lf_present_flag &= cm->base_qindex > 0;
+#else
+ cm->delta_q_present_flag =
+ cpi->oxcf.aq_mode == DELTA_AQ && cm->base_qindex > 0;
+#endif // CONFIG_EXT_DELTA_Q
+#endif
+
+ av1_frame_init_quantizer(cpi);
+
+ av1_initialize_rd_consts(cpi);
+ av1_initialize_me_consts(cpi, x, cm->base_qindex);
+ init_encode_frame_mb_context(cpi);
+#if CONFIG_TEMPMV_SIGNALING
+ if (last_fb_buf_idx != INVALID_IDX) {
+ cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_buf_idx];
+ cm->use_prev_frame_mvs &= !cm->error_resilient_mode &&
+ cm->width == cm->prev_frame->buf.y_width &&
+ cm->height == cm->prev_frame->buf.y_height &&
+ !cm->intra_only && !cm->prev_frame->intra_only;
+ }
+#else
+ cm->use_prev_frame_mvs =
+ !cm->error_resilient_mode && cm->width == cm->last_width &&
+ cm->height == cm->last_height && !cm->intra_only && cm->last_show_frame;
+#endif
+
+#if CONFIG_EXT_REFS
+  // NOTE(zoeliu): Since cm->prev_frame can be neither a frame with
+  //               show_existing_frame=1 nor a frame that is not used as a
+  //               reference, it is possible that by the time it is referred
+  //               to, the frame buffer it originally pointed to has already
+  //               expired and been reassigned to the current newly coded
+  //               frame. Hence, we need to check whether this is the case,
+  //               and if so, we have 2 choices:
+  //               (1) Simply disable the use of previous frame mvs; or
+  //               (2) Have cm->prev_frame point to one reference frame
+  //                   buffer, e.g. LAST_FRAME.
+ if (cm->use_prev_frame_mvs && !enc_is_ref_frame_buf(cpi, cm->prev_frame)) {
+ // Reassign the LAST_FRAME buffer to cm->prev_frame.
+ cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_buf_idx];
+ }
+#endif // CONFIG_EXT_REFS
+
+ // Special case: set prev_mi to NULL when the previous mode info
+ // context cannot be used.
+ cm->prev_mi =
+ cm->use_prev_frame_mvs ? cm->prev_mip + cm->mi_stride + 1 : NULL;
+
+#if CONFIG_VAR_TX
+ x->txb_split_count = 0;
+#if CONFIG_REF_MV
+ av1_zero(x->blk_skip_drl);
+#endif
+#endif
+
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
+ cpi->td.var_root[0] == NULL)
+ av1_setup_var_tree(&cpi->common, &cpi->td);
+
+ {
+ struct aom_usec_timer emr_timer;
+ aom_usec_timer_start(&emr_timer);
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
+ &cpi->twopass.this_frame_mb_stats);
+ }
+#endif
+
+    // If allowed, encode tiles in parallel, with one thread handling one
+    // tile.
+    // TODO(geza.lore): The multi-threaded encoder is not safe with more than
+    // one tile row, as it uses the single above_context et al arrays from
+    // cpi->common.
+ if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols) > 1 && cm->tile_rows == 1)
+ av1_encode_tiles_mt(cpi);
+ else
+ encode_tiles(cpi);
+
+ aom_usec_timer_mark(&emr_timer);
+ cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer);
+ }
+
+#if 0
+ // Keep record of the total distortion this time around for future use
+ cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+}
+
+void av1_encode_frame(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_EXT_TX
+ // Indicates whether or not to use a default reduced set for ext-tx
+ // rather than the potential full set of 16 transforms
+ cm->reduced_tx_set_used = 0;
+#endif // CONFIG_EXT_TX
+
+ // In the longer term the encoder should be generalized to match the
+ // decoder such that we allow compound where one of the 3 buffers has a
+ // different sign bias and that buffer is then the fixed ref. However, this
+ // requires further work in the rd loop. For now the only supported encoder
+ // side behavior is where the ALT ref buffer has opposite sign bias to
+ // the other two.
+ if (!frame_is_intra_only(cm)) {
+#if CONFIG_LOWDELAY_COMPOUND // Normative in encoder
+ cpi->allow_comp_inter_inter = 1;
+#if CONFIG_EXT_REFS
+ cm->comp_fwd_ref[0] = LAST_FRAME;
+ cm->comp_fwd_ref[1] = LAST2_FRAME;
+ cm->comp_fwd_ref[2] = LAST3_FRAME;
+ cm->comp_fwd_ref[3] = GOLDEN_FRAME;
+ cm->comp_bwd_ref[0] = BWDREF_FRAME;
+ cm->comp_bwd_ref[1] = ALTREF_FRAME;
+#else
+ cm->comp_fixed_ref = ALTREF_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = GOLDEN_FRAME;
+#endif // CONFIG_EXT_REFS
+#else
+ if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+ cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
+ (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+ cm->ref_frame_sign_bias[LAST_FRAME])) {
+ cpi->allow_comp_inter_inter = 0;
+ } else {
+ cpi->allow_comp_inter_inter = 1;
+
+#if CONFIG_EXT_REFS
+ cm->comp_fwd_ref[0] = LAST_FRAME;
+ cm->comp_fwd_ref[1] = LAST2_FRAME;
+ cm->comp_fwd_ref[2] = LAST3_FRAME;
+ cm->comp_fwd_ref[3] = GOLDEN_FRAME;
+ cm->comp_bwd_ref[0] = BWDREF_FRAME;
+ cm->comp_bwd_ref[1] = ALTREF_FRAME;
+#else
+ cm->comp_fixed_ref = ALTREF_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = GOLDEN_FRAME;
+#endif // CONFIG_EXT_REFS
+ }
+#endif
+ } else {
+ cpi->allow_comp_inter_inter = 0;
+ }
+
+ if (cpi->sf.frame_parameter_update) {
+ int i;
+ RD_OPT *const rd_opt = &cpi->rd;
+ FRAME_COUNTS *counts = cpi->td.counts;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+
+ // This code does a single RD pass over the whole frame assuming
+ // either compound, single or hybrid prediction as per whatever has
+ // worked best for that type of frame in the past.
+ // It also predicts whether another coding mode would have worked
+ // better than this coding mode. If that is the case, it remembers
+ // that for subsequent frames.
+ // It does the same analysis for transform size selection also.
+ //
+    // TODO(zoeliu): Investigate whether a frame_type other than
+    // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
+ const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
+ int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
+ const int is_alt_ref = frame_type == ALTREF_FRAME;
+
+/* prediction (compound, single or hybrid) mode selection */
+#if CONFIG_REF_ADAPT
+ // NOTE(zoeliu): "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames
+ if (is_alt_ref || !cpi->allow_comp_inter_inter)
+ cm->reference_mode = SINGLE_REFERENCE;
+ else
+ cm->reference_mode = REFERENCE_MODE_SELECT;
+#else
+ if (is_alt_ref || !cpi->allow_comp_inter_inter)
+ cm->reference_mode = SINGLE_REFERENCE;
+ else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
+ mode_thrs[COMPOUND_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT] &&
+ check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
+ cm->reference_mode = COMPOUND_REFERENCE;
+ else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT])
+ cm->reference_mode = SINGLE_REFERENCE;
+ else
+ cm->reference_mode = REFERENCE_MODE_SELECT;
+#endif // CONFIG_REF_ADAPT
+
+#if CONFIG_DUAL_FILTER
+ cm->interp_filter = SWITCHABLE;
+#endif
+
+ encode_frame_internal(cpi);
+
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ int single_count_zero = 0;
+ int comp_count_zero = 0;
+
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
+ single_count_zero += counts->comp_inter[i][0];
+ comp_count_zero += counts->comp_inter[i][1];
+ }
+
+ if (comp_count_zero == 0) {
+ cm->reference_mode = SINGLE_REFERENCE;
+ av1_zero(counts->comp_inter);
+#if !CONFIG_REF_ADAPT
+ } else if (single_count_zero == 0) {
+ cm->reference_mode = COMPOUND_REFERENCE;
+ av1_zero(counts->comp_inter);
+#endif // !CONFIG_REF_ADAPT
+ }
+ }
+
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0)
+ cm->tx_mode = ALLOW_32X32 + CONFIG_TX64X64;
+#else
+ if (cm->tx_mode == TX_MODE_SELECT) {
+#if CONFIG_TX64X64
+ int count4x4 = 0;
+ int count8x8_8x8p = 0, count8x8_lp = 0;
+ int count16x16_16x16p = 0, count16x16_lp = 0;
+ int count32x32_32x32p = 0, count32x32_lp = 0;
+ int count64x64_64x64p = 0;
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+ // counts->tx_size[max_depth][context_idx][this_depth_level]
+ count4x4 += counts->tx_size[0][i][0];
+ count4x4 += counts->tx_size[1][i][0];
+ count4x4 += counts->tx_size[2][i][0];
+ count4x4 += counts->tx_size[3][i][0];
+
+ count8x8_8x8p += counts->tx_size[0][i][1];
+ count8x8_lp += counts->tx_size[1][i][1];
+ count8x8_lp += counts->tx_size[2][i][1];
+ count8x8_lp += counts->tx_size[3][i][1];
+
+ count16x16_16x16p += counts->tx_size[1][i][2];
+ count16x16_lp += counts->tx_size[2][i][2];
+ count16x16_lp += counts->tx_size[3][i][2];
+
+ count32x32_32x32p += counts->tx_size[2][i][3];
+ count32x32_lp += counts->tx_size[3][i][3];
+
+ count64x64_64x64p += counts->tx_size[3][i][4];
+ }
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ count4x4 += counts->tx_size_implied[0][TX_4X4];
+ count4x4 += counts->tx_size_implied[1][TX_4X4];
+ count4x4 += counts->tx_size_implied[2][TX_4X4];
+ count4x4 += counts->tx_size_implied[3][TX_4X4];
+ count8x8_8x8p += counts->tx_size_implied[1][TX_8X8];
+ count8x8_lp += counts->tx_size_implied[2][TX_8X8];
+ count8x8_lp += counts->tx_size_implied[3][TX_8X8];
+ count8x8_lp += counts->tx_size_implied[4][TX_8X8];
+ count16x16_16x16p += counts->tx_size_implied[2][TX_16X16];
+ count16x16_lp += counts->tx_size_implied[3][TX_16X16];
+ count16x16_lp += counts->tx_size_implied[4][TX_16X16];
+ count32x32_32x32p += counts->tx_size_implied[3][TX_32X32];
+ count32x32_lp += counts->tx_size_implied[4][TX_32X32];
+ count64x64_64x64p += counts->tx_size_implied[4][TX_64X64];
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+ count32x32_lp == 0 && count32x32_32x32p == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_16X16] == 0 &&
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+ cm->counts.supertx_size[TX_64X64] == 0 &&
+#endif
+ count64x64_64x64p == 0) {
+ cm->tx_mode = ALLOW_8X8;
+ reset_skip_tx_size(cm, TX_8X8);
+ } else if (count8x8_8x8p == 0 && count8x8_lp == 0 &&
+ count16x16_16x16p == 0 && count16x16_lp == 0 &&
+ count32x32_32x32p == 0 && count32x32_lp == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_8X8] == 0 &&
+ cm->counts.supertx_size[TX_16X16] == 0 &&
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+ cm->counts.supertx_size[TX_64X64] == 0 &&
+#endif
+ count64x64_64x64p == 0) {
+ cm->tx_mode = ONLY_4X4;
+ reset_skip_tx_size(cm, TX_4X4);
+ } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 &&
+ count32x32_lp == 0) {
+ cm->tx_mode = ALLOW_64X64;
+ } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_64X64] == 0 &&
+#endif
+ count64x64_64x64p == 0) {
+ cm->tx_mode = ALLOW_32X32;
+ reset_skip_tx_size(cm, TX_32X32);
+ } else if (count4x4 == 0 && count8x8_lp == 0 && count32x32_lp == 0 &&
+ count32x32_32x32p == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+ cm->counts.supertx_size[TX_64X64] == 0 &&
+#endif
+ count64x64_64x64p == 0) {
+ cm->tx_mode = ALLOW_16X16;
+ reset_skip_tx_size(cm, TX_16X16);
+ }
+
+#else // CONFIG_TX64X64
+
+ int count4x4 = 0;
+ int count8x8_lp = 0, count8x8_8x8p = 0;
+ int count16x16_16x16p = 0, count16x16_lp = 0;
+ int count32x32 = 0;
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+ // counts->tx_size[max_depth][context_idx][this_depth_level]
+ count4x4 += counts->tx_size[0][i][0];
+ count4x4 += counts->tx_size[1][i][0];
+ count4x4 += counts->tx_size[2][i][0];
+
+ count8x8_8x8p += counts->tx_size[0][i][1];
+ count8x8_lp += counts->tx_size[1][i][1];
+ count8x8_lp += counts->tx_size[2][i][1];
+
+ count16x16_16x16p += counts->tx_size[1][i][2];
+ count16x16_lp += counts->tx_size[2][i][2];
+ count32x32 += counts->tx_size[2][i][3];
+ }
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ count4x4 += counts->tx_size_implied[0][TX_4X4];
+ count4x4 += counts->tx_size_implied[1][TX_4X4];
+ count4x4 += counts->tx_size_implied[2][TX_4X4];
+ count4x4 += counts->tx_size_implied[3][TX_4X4];
+ count8x8_8x8p += counts->tx_size_implied[1][TX_8X8];
+ count8x8_lp += counts->tx_size_implied[2][TX_8X8];
+ count8x8_lp += counts->tx_size_implied[3][TX_8X8];
+ count16x16_lp += counts->tx_size_implied[3][TX_16X16];
+ count16x16_16x16p += counts->tx_size_implied[2][TX_16X16];
+ count32x32 += counts->tx_size_implied[3][TX_32X32];
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_16X16] == 0 &&
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif // CONFIG_SUPERTX
+ count32x32 == 0) {
+ cm->tx_mode = ALLOW_8X8;
+ reset_skip_tx_size(cm, TX_8X8);
+ } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
+ count8x8_lp == 0 && count16x16_lp == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_8X8] == 0 &&
+ cm->counts.supertx_size[TX_16X16] == 0 &&
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif // CONFIG_SUPERTX
+ count32x32 == 0) {
+ cm->tx_mode = ONLY_4X4;
+ reset_skip_tx_size(cm, TX_4X4);
+ } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
+ cm->tx_mode = ALLOW_32X32;
+ } else if (count32x32 == 0 && count8x8_lp == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif // CONFIG_SUPERTX
+ count4x4 == 0) {
+ cm->tx_mode = ALLOW_16X16;
+ reset_skip_tx_size(cm, TX_16X16);
+ }
+#endif // CONFIG_TX64X64
+ }
+#endif
+ } else {
+ encode_frame_internal(cpi);
+ }
+}
+
+static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,
+ const MODE_INFO *mi, const MODE_INFO *above_mi,
+ const MODE_INFO *left_mi, const int intraonly,
+ const int mi_row, const int mi_col) {
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const PREDICTION_MODE y_mode = mbmi->mode;
+ const PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int unify_bsize = CONFIG_CB4X4;
+
+ if (bsize < BLOCK_8X8 && !unify_bsize) {
+ int idx, idy;
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ for (idy = 0; idy < 2; idy += num_4x4_h)
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int bidx = idy * 2 + idx;
+ const PREDICTION_MODE bmode = mi->bmi[bidx].as_mode;
+ if (intraonly) {
+ const PREDICTION_MODE a = av1_above_block_mode(mi, above_mi, bidx);
+ const PREDICTION_MODE l = av1_left_block_mode(mi, left_mi, bidx);
+ ++counts->kf_y_mode[a][l][bmode];
+ } else {
+ ++counts->y_mode[0][bmode];
+ }
+ }
+ } else {
+ if (intraonly) {
+ const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, 0);
+ const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, 0);
+ ++counts->kf_y_mode[above][left][y_mode];
+ } else {
+ ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+ }
+#if CONFIG_FILTER_INTRA
+ if (mbmi->mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[0] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ const int use_filter_intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0];
+ ++counts->filter_intra[0][use_filter_intra_mode];
+ }
+ if (mbmi->uv_mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[1] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ const int use_filter_intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1];
+ ++counts->filter_intra[1][use_filter_intra_mode];
+ }
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ if (av1_is_directional_mode(mbmi->mode, bsize)) {
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ const int p_angle =
+ mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle))
+ ++counts->intra_filter[intra_filter_ctx][mbmi->intra_filter];
+ }
+#endif  // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ }
+
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y))
+ return;
+#else
+ (void)mi_row;
+ (void)mi_col;
+ (void)xd;
+#endif
+ ++counts->uv_mode[y_mode][uv_mode];
+}
+
+#if CONFIG_VAR_TX
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
+ int blk_row, int blk_col) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int tx_row = blk_row >> 1;
+ const int tx_col = blk_col >> 1;
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+ int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row,
+ mbmi->sb_type, tx_size);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size) {
+ ++counts->txfm_partition[ctx][0];
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size, tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bs = tx_size_wide_unit[sub_txs];
+ int i;
+
+ ++counts->txfm_partition[ctx][1];
+ ++x->txb_split_count;
+
+ if (tx_size == TX_8X8) {
+ mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, TX_4X4, tx_size);
+ return;
+ }
+
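+    // Recurse into the four quadrants of the split transform block.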
+ for (i = 0; i < 4; ++i) {
+ int offsetr = (i >> 1) * bs;
+ int offsetc = (i & 0x01) * bs;
+ update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+ blk_col + offsetc);
+ }
+ }
+}
+
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+ BLOCK_SIZE plane_bsize, int mi_row,
+ int mi_col, FRAME_COUNTS *td_counts) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+ TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ for (idy = 0; idy < mi_height; idy += bh)
+ for (idx = 0; idx < mi_width; idx += bw)
+ update_txfm_count(x, xd, td_counts, max_tx_size, mi_width != mi_height,
+ idy, idx);
+}
+
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
+ int blk_col) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int tx_row = blk_row >> 1;
+ const int tx_col = blk_col >> 1;
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size) {
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size, tx_size);
+
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ if (tx_size == TX_8X8) {
+ mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, TX_4X4, tx_size);
+ return;
+ }
+
+ assert(bsl > 0);
+ for (i = 0; i < 4; ++i) {
+ int offsetr = (i >> 1) * bsl;
+ int offsetc = (i & 0x01) * bsl;
+ set_txfm_context(xd, sub_txs, blk_row + offsetr, blk_col + offsetc);
+ }
+ }
+}
+
+static void tx_partition_set_contexts(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
+ int mi_row, int mi_col) {
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ for (idy = 0; idy < mi_height; idy += bh)
+ for (idx = 0; idx < mi_width; idx += bw)
+ set_txfm_context(xd, max_tx_size, idy, idx);
+}
+#endif
+
+void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
+#if CONFIG_TXK_SEL
+ int block, int plane,
+#endif
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ FRAME_COUNTS *counts) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ int is_inter = is_inter_block(mbmi);
+#if !CONFIG_TXK_SEL
+ TX_TYPE tx_type = mbmi->tx_type;
+#else
+  // Only the y plane's tx_type is updated.
+ if (plane > 0) return;
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
+#endif
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
+ cm->base_qindex > 0 && !mbmi->skip &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int eset =
+ get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
+ if (eset > 0) {
+ if (is_inter) {
+ ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type];
+ } else {
+ ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode]
+ [tx_type];
+ }
+ }
+ }
+#else
+ (void)bsize;
+ if (tx_size < TX_32X32 &&
+ ((!cm->seg.enabled && cm->base_qindex > 0) ||
+ (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+ !mbmi->skip &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ if (is_inter) {
+ ++counts->inter_ext_tx[tx_size][tx_type];
+ } else {
+ ++counts->intra_ext_tx[tx_size][intra_mode_to_tx_type_context[mbmi->mode]]
+ [tx_type];
+ }
+ }
+#endif // CONFIG_EXT_TX
+}
+
+static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO **mi_8x8 = xd->mi;
+ MODE_INFO *mi = mi_8x8[0];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ const int seg_skip =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ const int mis = cm->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int is_inter = is_inter_block(mbmi);
+#if CONFIG_CB4X4
+ const BLOCK_SIZE block_size = bsize;
+#else
+ const BLOCK_SIZE block_size = AOMMAX(bsize, BLOCK_8X8);
+#endif
+
+#if CONFIG_PVQ
+ x->pvq_speed = 0;
+ x->pvq_coded = (dry_run == OUTPUT_ENABLED) ? 1 : 0;
+#endif
+#if CONFIG_CFL
+ x->cfl_store_y = (dry_run == OUTPUT_ENABLED) ? 1 : 0;
+#endif
+
+ if (!is_inter) {
+ int plane;
+ mbmi->skip = 1;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ av1_encode_intra_block_plane((AV1_COMMON *)cm, x, block_size, plane, 1,
+ mi_row, mi_col);
+ }
+ if (!dry_run) {
+ sum_intra_stats(td->counts, xd, mi, xd->above_mi, xd->left_mi,
+ frame_is_intra_only(cm), mi_row, mi_col);
+ }
+#if CONFIG_PALETTE
+ if (bsize >= BLOCK_8X8 && !dry_run) {
+ for (plane = 0; plane <= 1; ++plane) {
+ if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+ mbmi->palette_mode_info.palette_first_color_idx[plane] =
+ xd->plane[plane].color_index_map[0];
+          // TODO(huisu): This increases the use of the token buffer. Needs a
+          // stress test to verify.
+ av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate);
+ }
+ }
+ }
+#endif // CONFIG_PALETTE
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif
+#if CONFIG_LV_MAP
+ av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col);
+#else // CONFIG_LV_MAP
+ av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col);
+#endif // CONFIG_LV_MAP
+ } else {
+ int ref;
+ const int is_compound = has_second_ref(mbmi);
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+#if CONFIG_INTRABC
+ assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
+#else
+ assert(cfg != NULL);
+#endif // !CONFIG_INTRABC
+ av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+ &xd->block_refs[ref]->sf);
+ }
+ if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip)
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, NULL, block_size);
+
+ av1_build_inter_predictors_sbuv(xd, mi_row, mi_col, NULL, block_size);
+#if CONFIG_MOTION_VAR
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+#if CONFIG_NCOBMC
+ if (dry_run == OUTPUT_ENABLED)
+ av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+ else
+#endif
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+ }
+#endif // CONFIG_MOTION_VAR
+
+ av1_encode_sb((AV1_COMMON *)cm, x, block_size, mi_row, mi_col);
+#if CONFIG_VAR_TX
+ if (mbmi->skip) mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+ av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, block_size,
+ rate);
+#else
+#if CONFIG_LV_MAP
+ av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col);
+#else // CONFIG_LV_MAP
+ av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col);
+#endif // CONFIG_LV_MAP
+#endif
+ }
+
+ if (!dry_run) {
+#if CONFIG_VAR_TX
+ TX_SIZE tx_size =
+ is_inter && !mbmi->skip ? mbmi->min_tx_size : mbmi->tx_size;
+#else
+ TX_SIZE tx_size = mbmi->tx_size;
+#endif
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id] &&
+#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
+ mbmi->sb_type > BLOCK_4X4 &&
+#else
+ mbmi->sb_type >= BLOCK_8X8 &&
+#endif
+ !(is_inter && (mbmi->skip || seg_skip))) {
+#if CONFIG_VAR_TX
+ if (is_inter) {
+ tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts);
+ } else {
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+ : intra_tx_size_cat_lookup[bsize];
+ const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
+ ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
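+ // A coded tx size below the block's maximum implies the tx tree was
+ // split at least once.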
+ if (tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count;
+ }
+#else
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+ : intra_tx_size_cat_lookup[bsize];
+ const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
+
+ ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
+#endif
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ } else {
+ int i, j;
+ TX_SIZE intra_tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter) {
+ if (xd->lossless[mbmi->segment_id]) {
+ intra_tx_size = TX_4X4;
+ } else {
+ intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1);
+ }
+ } else {
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ intra_tx_size = tx_size;
+#else
+ intra_tx_size = (bsize >= BLOCK_8X8) ? tx_size : TX_4X4;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ }
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ ++td->counts->tx_size_implied[max_txsize_lookup[bsize]]
+ [txsize_sqr_up_map[tx_size]];
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ for (j = 0; j < mi_height; j++)
+ for (i = 0; i < mi_width; i++)
+ if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows)
+ mi_8x8[mis * j + i]->mbmi.tx_size = intra_tx_size;
+
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(intra_tx_size);
+ if (intra_tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count;
+#endif
+ }
+
+ ++td->counts->tx_size_totals[txsize_sqr_map[tx_size]];
+ ++td->counts
+ ->tx_size_totals[txsize_sqr_map[get_uv_tx_size(mbmi, &xd->plane[1])]];
+#if !CONFIG_TXK_SEL
+ av1_update_tx_type_count(cm, xd, bsize, tx_size, td->counts);
+#endif
+ }
+
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT &&
+#if CONFIG_CB4X4
+ mbmi->sb_type > BLOCK_4X4 &&
+#else
+ mbmi->sb_type >= BLOCK_8X8 &&
+#endif
+ is_inter && !(mbmi->skip || seg_skip)) {
+ if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+ } else {
+ TX_SIZE tx_size = mbmi->tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter)
+ tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, is_inter);
+ else
+ tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
+ mbmi->tx_size = tx_size;
+ set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, (mbmi->skip || seg_skip), xd);
+ }
+#endif // CONFIG_VAR_TX
+}
+
+#if CONFIG_SUPERTX
+static int check_intra_b(PICK_MODE_CONTEXT *ctx) {
+ if (!is_inter_mode((&ctx->mic)->mbmi.mode)) return 1;
+#if CONFIG_EXT_INTER
+ if (ctx->mic.mbmi.ref_frame[1] == INTRA_FRAME) return 1;
+#endif // CONFIG_EXT_INTER
+ return 0;
+}
+
+static int check_intra_sb(const AV1_COMP *const cpi, const TileInfo *const tile,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int hbs = mi_size_wide[bsize] / 2;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ int i;
+#endif
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+
+#if !CONFIG_CB4X4
+ assert(bsize >= BLOCK_8X8);
+#endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return 1;
+
+ switch (partition) {
+ case PARTITION_NONE: return check_intra_b(&pc_tree->none); break;
+ case PARTITION_VERT:
+ if (check_intra_b(&pc_tree->vertical[0])) return 1;
+ if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) {
+ if (check_intra_b(&pc_tree->vertical[1])) return 1;
+ }
+ break;
+ case PARTITION_HORZ:
+ if (check_intra_b(&pc_tree->horizontal[0])) return 1;
+ if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) {
+ if (check_intra_b(&pc_tree->horizontal[1])) return 1;
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ if (check_intra_b(pc_tree->leaf_split[0])) return 1;
+ } else {
+ if (check_intra_sb(cpi, tile, mi_row, mi_col, subsize,
+ pc_tree->split[0]))
+ return 1;
+ if (check_intra_sb(cpi, tile, mi_row, mi_col + hbs, subsize,
+ pc_tree->split[1]))
+ return 1;
+ if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col, subsize,
+ pc_tree->split[2]))
+ return 1;
+ if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col + hbs, subsize,
+ pc_tree->split[3]))
+ return 1;
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ for (i = 0; i < 3; i++) {
+ if (check_intra_b(&pc_tree->horizontala[i])) return 1;
+ }
+ break;
+ case PARTITION_HORZ_B:
+ for (i = 0; i < 3; i++) {
+ if (check_intra_b(&pc_tree->horizontalb[i])) return 1;
+ }
+ break;
+ case PARTITION_VERT_A:
+ for (i = 0; i < 3; i++) {
+ if (check_intra_b(&pc_tree->verticala[i])) return 1;
+ }
+ break;
+ case PARTITION_VERT_B:
+ for (i = 0; i < 3; i++) {
+ if (check_intra_b(&pc_tree->verticalb[i])) return 1;
+ }
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+ return 0;
+}
+
+static int check_supertx_b(TX_SIZE supertx_size, PICK_MODE_CONTEXT *ctx) {
+ return ctx->mic.mbmi.tx_size == supertx_size;
+}
+
+static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
+ PC_TREE *pc_tree) {
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+
+ partition = pc_tree->partitioning;
+ subsize = get_subsize(bsize, partition);
+ switch (partition) {
+ case PARTITION_NONE: return check_supertx_b(supertx_size, &pc_tree->none);
+ case PARTITION_VERT:
+ return check_supertx_b(supertx_size, &pc_tree->vertical[0]);
+ case PARTITION_HORZ:
+ return check_supertx_b(supertx_size, &pc_tree->horizontal[0]);
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize)
+ return check_supertx_b(supertx_size, pc_tree->leaf_split[0]);
+ else
+ return check_supertx_sb(subsize, supertx_size, pc_tree->split[0]);
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ return check_supertx_b(supertx_size, &pc_tree->horizontala[0]);
+ case PARTITION_HORZ_B:
+ return check_supertx_b(supertx_size, &pc_tree->horizontalb[0]);
+ case PARTITION_VERT_A:
+ return check_supertx_b(supertx_size, &pc_tree->verticala[0]);
+ case PARTITION_VERT_B:
+ return check_supertx_b(supertx_size, &pc_tree->verticalb[0]);
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0); return 0;
+ }
+}
+
+static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row_pred, int mi_col_pred,
+ BLOCK_SIZE bsize_pred, int b_sub8x8, int block) {
+ // Used in supertx
+ // (mi_row_ori, mi_col_ori): location for mv
+ // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *mi_8x8 = xd->mi[0];
+ MODE_INFO *mi = mi_8x8;
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ int ref;
+ const int is_compound = has_second_ref(mbmi);
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+ av1_setup_pre_planes(xd, ref, cfg, mi_row_pred, mi_col_pred,
+ &xd->block_refs[ref]->sf);
+ }
+
+ if (!b_sub8x8)
+ av1_build_inter_predictors_sb_extend(xd,
+#if CONFIG_EXT_INTER
+ mi_row_ori, mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ mi_row_pred, mi_col_pred, bsize_pred);
+ else
+ av1_build_inter_predictors_sb_sub8x8_extend(xd,
+#if CONFIG_EXT_INTER
+ mi_row_ori, mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ mi_row_pred, mi_col_pred,
+ bsize_pred, block);
+}
+
+static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int block,
+ int mi_row_ori, int mi_col_ori, int mi_row_pred,
+ int mi_col_pred, int mi_row_top, int mi_col_top,
+ uint8_t *dst_buf[3], int dst_stride[3],
+ BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred,
+ RUN_TYPE dry_run, int b_sub8x8, int bextend) {
+ // Used in supertx
+ // (mi_row_ori, mi_col_ori): location for mv
+ // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+ // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
+ // block: sub location of sub8x8 blocks
+ // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
+ // bextend: 1: region to predict is an extension of ori; 0: not
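+ // For example, with (mi_row_top, mi_col_top) = (0, 0) and mi_row_pred = 2,
+ // the luma destination below starts r = 2 * MI_SIZE pixel rows into
+ // dst_buf[0].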
+
+ MACROBLOCK *const x = &td->mb;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int r = (mi_row_pred - mi_row_top) * MI_SIZE;
+ int c = (mi_col_pred - mi_col_top) * MI_SIZE;
+ const int mi_width_top = mi_size_wide[bsize_top];
+ const int mi_height_top = mi_size_high[bsize_top];
+
+ if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
+ mi_row_pred >= mi_row_top + mi_height_top ||
+ mi_col_pred >= mi_col_top + mi_width_top || mi_row_pred >= cm->mi_rows ||
+ mi_col_pred >= cm->mi_cols)
+ return;
+
+ set_offsets_extend(cpi, td, tile, mi_row_pred, mi_col_pred, mi_row_ori,
+ mi_col_ori, bsize_pred);
+ xd->plane[0].dst.stride = dst_stride[0];
+ xd->plane[1].dst.stride = dst_stride[1];
+ xd->plane[2].dst.stride = dst_stride[2];
+ xd->plane[0].dst.buf = dst_buf[0] +
+ (r >> xd->plane[0].subsampling_y) * dst_stride[0] +
+ (c >> xd->plane[0].subsampling_x);
+ xd->plane[1].dst.buf = dst_buf[1] +
+ (r >> xd->plane[1].subsampling_y) * dst_stride[1] +
+ (c >> xd->plane[1].subsampling_x);
+ xd->plane[2].dst.buf = dst_buf[2] +
+ (r >> xd->plane[2].subsampling_y) * dst_stride[2] +
+ (c >> xd->plane[2].subsampling_x);
+
+ predict_superblock(cpi, td,
+#if CONFIG_EXT_INTER
+ mi_row_ori, mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ mi_row_pred, mi_col_pred, bsize_pred, b_sub8x8, block);
+
+ if (!dry_run && !bextend)
+ update_stats(&cpi->common, td, mi_row_pred, mi_col_pred, 1);
+}
+
+static void extend_dir(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int block, BLOCK_SIZE bsize,
+ BLOCK_SIZE top_bsize, int mi_row, int mi_col,
+ int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
+ uint8_t *dst_buf[3], int dst_stride[3], int dir) {
+ // dir: 0-lower, 1-upper, 2-left, 3-right
+ // 4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
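+ // e.g. dir == 0 extends into the mi_height rows just below the block
+ // (mi_row + mi_height), dir == 1 into the rows just above it, and the
+ // diagonal cases (4-7) combine a row and a column offset.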
+ MACROBLOCKD *xd = &td->mb.e_mbd;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ int xss = xd->plane[1].subsampling_x;
+ int yss = xd->plane[1].subsampling_y;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+ int b_sub8x8 = (bsize < BLOCK_8X8) && !unify_bsize ? 1 : 0;
+ int wide_unit, high_unit;
+ int i, j;
+ int ext_offset = 0;
+
+ BLOCK_SIZE extend_bsize;
+ int mi_row_pred, mi_col_pred;
+
+ if (dir == 0 || dir == 1) { // lower and upper
+ extend_bsize =
+ (mi_width == mi_size_wide[BLOCK_8X8] || bsize < BLOCK_8X8 || xss < yss)
+ ? BLOCK_8X8
+ : BLOCK_16X8;
+
+#if CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) {
+ extend_bsize = BLOCK_4X4;
+ ext_offset = mi_size_wide[BLOCK_8X8];
+ }
+#endif
+ wide_unit = mi_size_wide[extend_bsize];
+ high_unit = mi_size_high[extend_bsize];
+
+ mi_row_pred = mi_row + ((dir == 0) ? mi_height : -(mi_height + ext_offset));
+ mi_col_pred = mi_col;
+
+ for (j = 0; j < mi_height + ext_offset; j += high_unit)
+ for (i = 0; i < mi_width + ext_offset; i += wide_unit)
+ predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j,
+ mi_col_pred + i, mi_row_top, mi_col_top, dst_buf,
+ dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+ 1);
+ } else if (dir == 2 || dir == 3) { // left and right
+ extend_bsize =
+ (mi_height == mi_size_high[BLOCK_8X8] || bsize < BLOCK_8X8 || yss < xss)
+ ? BLOCK_8X8
+ : BLOCK_8X16;
+#if CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) {
+ extend_bsize = BLOCK_4X4;
+ ext_offset = mi_size_wide[BLOCK_8X8];
+ }
+#endif
+ wide_unit = mi_size_wide[extend_bsize];
+ high_unit = mi_size_high[extend_bsize];
+
+ mi_row_pred = mi_row;
+ mi_col_pred = mi_col + ((dir == 3) ? mi_width : -(mi_width + ext_offset));
+
+ for (j = 0; j < mi_height + ext_offset; j += high_unit)
+ for (i = 0; i < mi_width + ext_offset; i += wide_unit)
+ predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j,
+ mi_col_pred + i, mi_row_top, mi_col_top, dst_buf,
+ dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+ 1);
+ } else {
+ extend_bsize = BLOCK_8X8;
+#if CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) {
+ extend_bsize = BLOCK_4X4;
+ ext_offset = mi_size_wide[BLOCK_8X8];
+ }
+#endif
+ wide_unit = mi_size_wide[extend_bsize];
+ high_unit = mi_size_high[extend_bsize];
+
+ mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height
+ : -(mi_height + ext_offset));
+ mi_col_pred =
+ mi_col + ((dir == 6 || dir == 7) ? mi_width : -(mi_width + ext_offset));
+
+ for (j = 0; j < mi_height + ext_offset; j += high_unit)
+ for (i = 0; i < mi_width + ext_offset; i += wide_unit)
+ predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j,
+ mi_col_pred + i, mi_row_top, mi_col_top, dst_buf,
+ dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+ 1);
+ }
+}
+
+static void extend_all(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int block, BLOCK_SIZE bsize,
+ BLOCK_SIZE top_bsize, int mi_row, int mi_col,
+ int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
+ uint8_t *dst_buf[3], int dst_stride[3]) {
+ assert(block >= 0 && block < 4);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 0);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 1);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 2);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 3);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 4);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 5);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 6);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 7);
+}
+
+// This function generates the prediction for multiple blocks, between which
+// the discontinuity around the boundary is reduced by smoothing masks. The
+// basic smoothing mask is a soft step function along the horz/vert direction.
+// In the more complicated case where a block is split into 4 subblocks, the
+// basic mask is first applied to the neighboring subblocks (2 pairs) in the
+// horizontal direction and then to the 2 masked predictions obtained above in
+// the vertical direction. If the block is split over more than one level, the
+// masked prediction at every stage is stored in the dst_buf[] passed down
+// from the higher level.
+static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, int mi_row_top, int mi_col_top,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
+ int dst_stride[3], PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int is_partition_root = bsize >= BLOCK_8X8;
+ const int ctx = is_partition_root
+ ? partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ mi_row + hbs < cm->mi_rows,
+ mi_col + hbs < cm->mi_cols,
+#endif
+ bsize)
+ : -1;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+ int i;
+ uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ int dst_stride1[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+ int dst_stride2[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+ int dst_stride3[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+ assert(bsize >= BLOCK_8X8);
+#endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
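+ // Each high-bitdepth sample is 16 bits wide, so plane offsets inside the
+ // temporary buffers are scaled by len before the byte-pointer conversion.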
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+ dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len);
+ dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+ dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len);
+ dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len);
+ dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
+ dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len);
+ dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ dst_buf1[0] = tmp_buf1;
+ dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE;
+ dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE;
+ dst_buf2[0] = tmp_buf2;
+ dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE;
+ dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE;
+ dst_buf3[0] = tmp_buf3;
+ dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE;
+ dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ if (!dry_run && ctx >= 0 && bsize < top_bsize) {
+ // Explicitly cast away const.
+ FRAME_COUNTS *const frame_counts = (FRAME_COUNTS *)&cm->counts;
+ frame_counts->partition[ctx][partition]++;
+ }
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ }
+
+ switch (partition) {
+ case PARTITION_NONE:
+ assert(bsize < top_bsize);
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ bsize, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride);
+ break;
+ case PARTITION_HORZ:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ // First half
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ BLOCK_8X8, dry_run, 1, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ // Second half
+ predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ // Smooth
+ xd->plane[0].dst.buf = dst_buf[0];
+ xd->plane[0].dst.stride = dst_stride[0];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ 0);
+ } else {
+ // First half
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
+
+ if (mi_row + hbs < cm->mi_rows) {
+ // Second half
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1, 1);
+
+ // Smooth
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ }
+ }
+ break;
+ case PARTITION_VERT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ // First half
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ BLOCK_8X8, dry_run, 1, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ // Second half
+ predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ // Smooth
+ xd->plane[0].dst.buf = dst_buf[0];
+ xd->plane[0].dst.stride = dst_stride[0];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ 0);
+ } else {
+ // First half
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
+
+ if (mi_col + hbs < cm->mi_cols) {
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1, 2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ }
+ }
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ BLOCK_8X8, dry_run, 1, 0);
+ predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf3, dst_stride3,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+
+ if (bsize < top_bsize) {
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+ extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
+ extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf3, dst_stride3);
+ }
+ } else {
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf,
+ dst_stride, pc_tree->split[0]);
+ if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs, mi_row_top,
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf1,
+ dst_stride1, pc_tree->split[1]);
+ if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
+ predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col, mi_row_top,
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf2,
+ dst_stride2, pc_tree->split[2]);
+ if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+ predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, subsize,
+ top_bsize, dst_buf3, dst_stride3,
+ pc_tree->split[3]);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+#if !CONFIG_CB4X4
+ if (bsize == BLOCK_8X8 && i != 0)
+ continue; // Skip <4x4 chroma smoothing
+#endif
+ if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ if (mi_row + hbs < cm->mi_rows) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf2[i], dst_stride2[i], dst_buf3[i], dst_stride3[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
+ if (bsize == BLOCK_8X8 && i != 0)
+ continue; // Skip <4x4 chroma smoothing
+
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+ top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 1);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+
+ break;
+ case PARTITION_VERT_A:
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2, top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ break;
+ case PARTITION_HORZ_B:
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
+ mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+ dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
+ dst_stride2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf1[i];
+ xd->plane[i].dst.stride = dst_stride1[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ break;
+ case PARTITION_VERT_B:
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
+ mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+ dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
+ dst_stride2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf1[i];
+ xd->plane[i].dst.stride = dst_stride1[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+
+#if CONFIG_EXT_PARTITION_TYPES
+ if (bsize < top_bsize)
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
+ if (bsize < top_bsize && (partition != PARTITION_SPLIT || bsize == BLOCK_8X8))
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif // CONFIG_EXT_PARTITION_TYPES
+}
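+
+// Illustrative sketch (not part of libaom; the helper name and the 8-pixel
+// overlap width are assumptions for exposition): the 1-D soft-step blend
+// that av1_build_masked_inter_predictor_complex performs along a partition
+// boundary. A 6-bit mask ramps from 0 to 64 across the overlap, so the
+// output moves smoothly from pred0 to pred1 instead of switching abruptly.
+#if 0
+static void soft_step_blend_row(const uint8_t *pred0, const uint8_t *pred1,
+ uint8_t *dst, int width, int boundary) {
+ int c;
+ for (c = 0; c < width; ++c) {
+ // Linear ramp centred on the boundary, clamped to [0, 64] outside the
+ // overlap so each side keeps its own prediction untouched.
+ int m = (c - boundary + 4) * 16;
+ if (m < 0) m = 0;
+ if (m > 64) m = 64;
+ dst[c] = (uint8_t)((pred0[c] * (64 - m) + pred1[c] * m + 32) >> 6);
+ }
+}
+#endif // illustrative sketch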
+
+static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist,
+ TX_TYPE *best_tx, PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int plane, pnskip, skippable, skippable_uv, rate_uv, this_rate,
+ base_rate = *tmp_rate;
+ int64_t sse, pnsse, sse_uv, this_dist, dist_uv;
+ uint8_t *dst_buf[3];
+ int dst_stride[3];
+ TX_SIZE tx_size;
+ MB_MODE_INFO *mbmi;
+ TX_TYPE tx_type, best_tx_nostx;
+#if CONFIG_EXT_TX
+ int ext_tx_set;
+#endif // CONFIG_EXT_TX
+ int tmp_rate_tx = 0, skip_tx = 0;
+ int64_t tmp_dist_tx = 0, rd_tx, bestrd_tx = INT64_MAX;
+
+ set_skip_context(xd, mi_row, mi_col);
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 1, pc_tree);
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+ for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+ dst_buf[plane] = xd->plane[plane].dst.buf;
+ dst_stride[plane] = xd->plane[plane].dst.stride;
+ }
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 1, bsize,
+ bsize, dst_buf, dst_stride, pc_tree);
+
+ set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+ set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
+
+ mbmi = &xd->mi[0]->mbmi;
+ best_tx_nostx = mbmi->tx_type;
+
+ *best_tx = DCT_DCT;
+
+ // chroma
+ skippable_uv = 1;
+ rate_uv = 0;
+ dist_uv = 0;
+ sse_uv = 0;
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VAR_TX
+ ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+
+ tx_size = max_txsize_lookup[bsize];
+ tx_size =
+ uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y];
+ av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
+
+ av1_subtract_plane(x, bsize, plane);
+ av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, plane, 0,
+ get_plane_block_size(bsize, pd), &ctxa[0], &ctxl[0],
+ &this_rd_stats);
+
+ this_rate = this_rd_stats.rate;
+ this_dist = this_rd_stats.dist;
+ pnsse = this_rd_stats.sse;
+ pnskip = this_rd_stats.skip;
+#else
+ tx_size = max_txsize_lookup[bsize];
+ tx_size =
+ uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y];
+ av1_subtract_plane(x, bsize, plane);
+ av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip,
+ &pnsse, INT64_MAX, plane, bsize, tx_size, 0);
+#endif // CONFIG_VAR_TX
+
+ rate_uv += this_rate;
+ dist_uv += this_dist;
+ sse_uv += pnsse;
+ skippable_uv &= pnskip;
+ }
+
+ // luma
+ tx_size = max_txsize_lookup[bsize];
+ av1_subtract_plane(x, bsize, 0);
+#if CONFIG_EXT_TX
+ ext_tx_set = get_ext_tx_set(tx_size, bsize, 1, cm->reduced_tx_set_used);
+#endif // CONFIG_EXT_TX
+ for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+#if CONFIG_VAR_TX
+ ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ RD_STATS this_rd_stats;
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_EXT_TX
+ if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
+#else
+ if (tx_size >= TX_32X32 && tx_type != DCT_DCT) continue;
+#endif // CONFIG_EXT_TX
+ mbmi->tx_type = tx_type;
+
+#if CONFIG_VAR_TX
+ av1_init_rd_stats(&this_rd_stats);
+ av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
+ av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, 0, 0, bsize, &ctxa[0], &ctxl[0],
+ &this_rd_stats);
+
+ this_rate = this_rd_stats.rate;
+ this_dist = this_rd_stats.dist;
+ pnsse = this_rd_stats.sse;
+ pnskip = this_rd_stats.skip;
+#else
+ av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip,
+ &pnsse, INT64_MAX, 0, bsize, tx_size, 0);
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(tx_size, bsize, 1, cm->reduced_tx_set_used) > 1 &&
+ !xd->lossless[xd->mi[0]->mbmi.segment_id] && this_rate != INT_MAX) {
+ if (ext_tx_set > 0)
+ this_rate +=
+ cpi->inter_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->tx_type];
+ }
+#else
+ if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+ this_rate != INT_MAX) {
+ this_rate += cpi->inter_tx_type_costs[tx_size][mbmi->tx_type];
+ }
+#endif // CONFIG_EXT_TX
+ *tmp_rate = rate_uv + this_rate;
+ *tmp_dist = dist_uv + this_dist;
+ sse = sse_uv + pnsse;
+ skippable = skippable_uv && pnskip;
+ if (skippable) {
+ *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ x->skip = 1;
+ } else {
+ if (RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist) <
+ RDCOST(x->rdmult, x->rddiv, 0, sse)) {
+ *tmp_rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ x->skip = 0;
+ } else {
+ *tmp_dist = sse;
+ *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ x->skip = 1;
+ }
+ }
+ *tmp_rate += base_rate;
+ rd_tx = RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist);
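+ // A non-DCT tx_type replaces the current best only if it improves the RD
+ // cost by at least about 1%; otherwise DCT_DCT is kept.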
+ if (rd_tx < bestrd_tx * 0.99 || tx_type == DCT_DCT) {
+ *best_tx = tx_type;
+ bestrd_tx = rd_tx;
+ tmp_rate_tx = *tmp_rate;
+ tmp_dist_tx = *tmp_dist;
+ skip_tx = x->skip;
+ }
+ }
+ *tmp_rate = tmp_rate_tx;
+ *tmp_dist = tmp_dist_tx;
+ x->skip = skip_tx;
+#if CONFIG_VAR_TX
+ for (plane = 0; plane < 1; ++plane)
+ memset(x->blk_skip[plane], x->skip,
+ sizeof(uint8_t) * pc_tree->none.num_4x4_blk);
+#endif // CONFIG_VAR_TX
+ xd->mi[0]->mbmi.tx_type = best_tx_nostx;
+}
+#endif // CONFIG_SUPERTX
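+
+// Illustrative sketch (hypothetical helper, not a libaom function) of the
+// skip-vs-code decision applied in rd_supertx_sb above: the coefficients are
+// kept only when their rate-distortion cost beats signalling skip and taking
+// the full prediction error (sse) as distortion. skip0_cost and skip1_cost
+// stand in for av1_cost_bit(skip_prob, 0) and av1_cost_bit(skip_prob, 1).
+#if 0
+static int decide_skip(const MACROBLOCK *x, int rate, int64_t dist,
+ int64_t sse, int skip0_cost, int skip1_cost) {
+ const int64_t rd_code = RDCOST(x->rdmult, x->rddiv, rate + skip0_cost, dist);
+ const int64_t rd_skip = RDCOST(x->rdmult, x->rddiv, skip1_cost, sse);
+ return rd_skip <= rd_code; // 1: set x->skip, 0: code the coefficients
+}
+#endif // illustrative sketch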
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
new file mode 100644
index 000000000..08d6d20de
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ENCODEFRAME_H_
+#define AV1_ENCODER_ENCODEFRAME_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct macroblock;
+struct yv12_buffer_config;
+struct AV1_COMP;
+struct ThreadData;
+
+// Constants used in SOURCE_VAR_BASED_PARTITION
+#define VAR_HIST_MAX_BG_VAR 1000
+#define VAR_HIST_FACTOR 10
+#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1)
+#define VAR_HIST_LARGE_CUT_OFF 75
+#define VAR_HIST_SMALL_CUT_OFF 45
+
+void av1_setup_src_planes(struct macroblock *x,
+ const struct yv12_buffer_config *src, int mi_row,
+ int mi_col);
+
+void av1_encode_frame(struct AV1_COMP *cpi);
+
+void av1_init_tile_data(struct AV1_COMP *cpi);
+void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
+ int tile_col);
+
+void av1_set_variance_partition_thresholds(struct AV1_COMP *cpi, int q);
+
+void av1_update_tx_type_count(const struct AV1Common *cm, MACROBLOCKD *xd,
+#if CONFIG_TXK_SEL
+ int block, int plane,
+#endif
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ FRAME_COUNTS *counts);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_ENCODEFRAME_H_
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
new file mode 100644
index 000000000..c450244b1
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -0,0 +1,1671 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodemb.h"
+#if CONFIG_LV_MAP
+#include "av1/encoder/encodetxb.h"
+#endif
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/tokenize.h"
+
+#if CONFIG_PVQ
+#include "av1/encoder/encint.h"
+#include "av1/common/partition.h"
+#include "av1/encoder/pvq_encoder.h"
+#endif
+
+#if CONFIG_CFL
+#include "av1/common/cfl.h"
+#endif
+
+// Check whether the plain C version of the subtraction must be used (the
+// SIMD versions require a block width and height of at least 4).
+static int check_subtract_block_size(int w, int h) { return w < 4 || h < 4; }
+
+static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
+ int16_t *diff, ptrdiff_t diff_stride,
+ const uint8_t *src8, ptrdiff_t src_stride,
+ const uint8_t *pred8, ptrdiff_t pred_stride) {
+#if !CONFIG_HIGHBITDEPTH
+ (void)xd;
+#endif
+
+ if (check_subtract_block_size(rows, cols)) {
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8,
+ src_stride, pred8, pred_stride, xd->bd);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ aom_subtract_block_c(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+ pred_stride);
+
+ return;
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
+ pred8, pred_stride, xd->bd);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+ pred_stride);
+}
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const int tx1d_width = tx_size_wide[tx_size];
+ const int tx1d_height = tx_size_high[tx_size];
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ uint8_t *src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+ int16_t *src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+ subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src,
+ src_stride, dst, dst_stride);
+}
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+}
+
+// These numbers are empirically obtained.
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+#if CONFIG_EC_ADAPT
+ { 10, 7 }, { 8, 5 },
+#else
+ { 10, 6 }, { 8, 5 },
+#endif
+};
+
+#define UPDATE_RD_COST() \
+ { \
+ rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \
+ rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \
+ }
+
+static INLINE int64_t
+get_token_bit_costs(unsigned int token_costs[2][COEFF_CONTEXTS][ENTROPY_TOKENS],
+ int skip_eob, int ctx, int token) {
+#if CONFIG_NEW_TOKENSET
+ (void)skip_eob;
+ return token_costs[token == ZERO_TOKEN || token == EOB_TOKEN][ctx][token];
+#else
+ return token_costs[skip_eob][ctx][token];
+#endif
+}
+
+#define USE_GREEDY_OPTIMIZE_B 0
+
+#if USE_GREEDY_OPTIMIZE_B
+
+typedef struct av1_token_state {
+ int16_t token;
+ tran_low_t qc;
+ tran_low_t dqc;
+} av1_token_state;
+
+int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, int ctx) {
+#if !CONFIG_PVQ
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ struct macroblock_plane *const p = &mb->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ref = is_inter_block(&xd->mi[0]->mbmi);
+ av1_token_state tokens[MAX_TX_SQUARE + 1][2];
+ uint8_t token_cache[MAX_TX_SQUARE];
+ const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const int eob = p->eobs[block];
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const int16_t *const dequant_ptr = pd->dequant;
+ const uint8_t *const band_translate = get_band_translate(tx_size);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+ const int16_t *const scan = scan_order->scan;
+ const int16_t *const nb = scan_order->neighbors;
+ int dqv;
+ const int shift = av1_get_tx_scale(tx_size);
+#if CONFIG_AOM_QM
+ int seg_id = xd->mi[0]->mbmi.segment_id;
+ const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size];
+#endif
+#if CONFIG_NEW_QUANT
+ int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
+ const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
+#elif !CONFIG_AOM_QM
+ const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
+#endif // CONFIG_NEW_QUANT
+ int sz = 0;
+ const int64_t rddiv = mb->rddiv;
+ int64_t rd_cost0, rd_cost1;
+ int16_t t0, t1;
+ int i, final_eob;
+#if CONFIG_HIGHBITDEPTH
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
+#else
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8);
+#endif
+ unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref];
+ const int default_eob = tx_size_2d[tx_size];
+
+ assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+
+ assert((!plane_type && !plane) || (plane_type && plane));
+ assert(eob <= default_eob);
+
+ int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
+/* CpuSpeedTest uses "--min-q=0 --max-q=0" and expects 100dB PSNR.
+ * This creates a conflict with the search for a better EOB position:
+ * the check below makes sure the EOB search is disabled in that corner
+ * case. */
+#if !CONFIG_NEW_QUANT && !CONFIG_AOM_QM
+ if (dq_step[1] <= 4) {
+ rdmult = 1;
+ }
+#endif
+
+ int64_t rate0, rate1;
+ for (i = 0; i < eob; i++) {
+ const int rc = scan[i];
+ int x = qcoeff[rc];
+ t0 = av1_get_token(x);
+
+ tokens[i][0].qc = x;
+ tokens[i][0].token = t0;
+ tokens[i][0].dqc = dqcoeff[rc];
+
+ token_cache[rc] = av1_pt_energy_class[t0];
+ }
+ tokens[eob][0].token = EOB_TOKEN;
+ tokens[eob][0].qc = 0;
+ tokens[eob][0].dqc = 0;
+ tokens[eob][1] = tokens[eob][0];
+
+ unsigned int(*token_costs_ptr)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ token_costs;
+
+ final_eob = 0;
+
+ int64_t eob_cost0, eob_cost1;
+
+ const int ctx0 = ctx;
+ /* Record the r-d cost */
+ int64_t accu_rate = 0;
+ int64_t accu_error = 0;
+
+ rate0 = get_token_bit_costs(*(token_costs_ptr + band_translate[0]), 0, ctx0,
+ EOB_TOKEN);
+ int64_t best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error);
+
+ // int64_t best_block_rd_cost_all0 = best_block_rd_cost;
+
+ int x_prev = 1;
+
+ for (i = 0; i < eob; i++) {
+ const int rc = scan[i];
+ int x = qcoeff[rc];
+ sz = -(x < 0);
+
+ int band_cur = band_translate[i];
+ int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
+ int token_tree_sel_cur = (x_prev == 0);
+
+ if (x == 0) {
+ // no need to search when x == 0
+ rate0 =
+ get_token_bit_costs(*(token_costs_ptr + band_cur), token_tree_sel_cur,
+ ctx_cur, tokens[i][0].token);
+ accu_rate += rate0;
+ x_prev = 0;
+ // accu_error does not change when x==0
+ } else {
+ /* Computing distortion. */
+ // compute the distortion for the first candidate
+ // and the distortion for quantizing to 0.
+ int dx0 = (-coeff[rc]) * (1 << shift);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx0 >>= xd->bd - 8;
+ }
+#endif
+ int64_t d0 = (int64_t)dx0 * dx0;
+
+ int x_a = x - 2 * sz - 1;
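+ // With sz = -(x < 0), this reduces |x| by one while keeping the sign
+ // (5 -> 4, -5 -> -4): x_a is the alternative rounding candidate.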
+ int64_t d2, d2_a;
+
+ int dx;
+
+#if CONFIG_AOM_QM
+ int iwt = iqmatrix[rc];
+ dqv = dequant_ptr[rc != 0];
+ dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+#else
+ dqv = dequant_ptr[rc != 0];
+#endif
+
+ dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx >>= xd->bd - 8;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ d2 = (int64_t)dx * dx;
+
+ /* Compute the distortion for the second candidate:
+ * x_a = x - 2 * sz - 1 (the magnitude of x reduced by one).
+ */
+ if (x_a != 0) {
+#if CONFIG_NEW_QUANT
+ dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) -
+ (coeff[rc] << shift);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx >>= xd->bd - 8;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#else // CONFIG_NEW_QUANT
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
+ } else {
+ dx -= (dqv + sz) ^ sz;
+ }
+#else
+ dx -= (dqv + sz) ^ sz;
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_NEW_QUANT
+ d2_a = (int64_t)dx * dx;
+ } else {
+ d2_a = d0;
+ }
+ /* Computing rates and r-d cost. */
+
+ int best_x, best_eob_x;
+ int64_t base_bits, next_bits0, next_bits1;
+ int64_t next_eob_bits0, next_eob_bits1;
+
+ // rate cost of x
+ base_bits = av1_get_token_cost(x, &t0, cat6_bits);
+ rate0 = base_bits + get_token_bit_costs(*(token_costs_ptr + band_cur),
+ token_tree_sel_cur, ctx_cur, t0);
+
+ base_bits = av1_get_token_cost(x_a, &t1, cat6_bits);
+ rate1 = base_bits + get_token_bit_costs(*(token_costs_ptr + band_cur),
+ token_tree_sel_cur, ctx_cur, t1);
+
+ next_bits0 = 0;
+ next_bits1 = 0;
+ next_eob_bits0 = 0;
+ next_eob_bits1 = 0;
+
+ if (i < default_eob - 1) {
+ int ctx_next, token_tree_sel_next;
+ int band_next = band_translate[i + 1];
+
+ token_cache[rc] = av1_pt_energy_class[t0];
+ ctx_next = get_coef_context(nb, token_cache, i + 1);
+ token_tree_sel_next = (x == 0);
+
+ next_bits0 = get_token_bit_costs(*(token_costs_ptr + band_next),
+ token_tree_sel_next, ctx_next,
+ tokens[i + 1][0].token);
+ next_eob_bits0 =
+ get_token_bit_costs(*(token_costs_ptr + band_next),
+ token_tree_sel_next, ctx_next, EOB_TOKEN);
+
+ token_cache[rc] = av1_pt_energy_class[t1];
+ ctx_next = get_coef_context(nb, token_cache, i + 1);
+ token_tree_sel_next = (x_a == 0);
+
+ next_bits1 = get_token_bit_costs(*(token_costs_ptr + band_next),
+ token_tree_sel_next, ctx_next,
+ tokens[i + 1][0].token);
+
+ if (x_a != 0) {
+ next_eob_bits1 =
+ get_token_bit_costs(*(token_costs_ptr + band_next),
+ token_tree_sel_next, ctx_next, EOB_TOKEN);
+ }
+ }
+
+ rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), d2);
+ rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), d2_a);
+
+ best_x = (rd_cost1 < rd_cost0);
+
+ eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0),
+ (accu_error + d2 - d0));
+ eob_cost1 = eob_cost0;
+ if (x_a != 0) {
+ eob_cost1 = RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1),
+ (accu_error + d2_a - d0));
+ best_eob_x = (eob_cost1 < eob_cost0);
+ } else {
+ best_eob_x = 0;
+ }
+
+ int dqc, dqc_a = 0;
+
+ dqc = dqcoeff[rc];
+ if (best_x + best_eob_x) {
+ if (x_a != 0) {
+#if CONFIG_NEW_QUANT
+ dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv,
+ dequant_val[band_translate[i]]);
+ dqc_a = shift ? ROUND_POWER_OF_TWO(dqc_a, shift) : dqc_a;
+ if (sz) dqc_a = -dqc_a;
+#else
+// The 32x32 transform coefficient uses a half quantization step size.
+// Account for the rounding difference in the dequantized coefficient
+// value when the quantization index is dropped from an even number
+// to an odd number.
+
+#if CONFIG_AOM_QM
+ tran_low_t offset = dqv >> shift;
+#else
+ tran_low_t offset = dq_step[rc != 0];
+#endif
+ if (shift & x_a) offset += (dqv & 0x01);
+
+ if (sz == 0)
+ dqc_a = dqcoeff[rc] - offset;
+ else
+ dqc_a = dqcoeff[rc] + offset;
+#endif // CONFIG_NEW_QUANT
+ } else {
+ dqc_a = 0;
+ } // if (x_a != 0)
+ }
+
+ // record the better quantized value
+ if (best_x) {
+ qcoeff[rc] = x_a;
+ dqcoeff[rc] = dqc_a;
+
+ accu_rate += rate1;
+ accu_error += d2_a - d0;
+ assert(d2_a <= d0);
+
+ token_cache[rc] = av1_pt_energy_class[t1];
+ } else {
+ accu_rate += rate0;
+ accu_error += d2 - d0;
+ assert(d2 <= d0);
+
+ token_cache[rc] = av1_pt_energy_class[t0];
+ }
+
+ x_prev = qcoeff[rc];
+
+ // determine whether to move the eob position to i+1
+ int64_t best_eob_cost_i = eob_cost0;
+
+ tokens[i][1].token = t0;
+ tokens[i][1].qc = x;
+ tokens[i][1].dqc = dqc;
+
+ if ((x_a != 0) && (best_eob_x)) {
+ best_eob_cost_i = eob_cost1;
+
+ tokens[i][1].token = t1;
+ tokens[i][1].qc = x_a;
+ tokens[i][1].dqc = dqc_a;
+ }
+
+ if (best_eob_cost_i < best_block_rd_cost) {
+ best_block_rd_cost = best_eob_cost_i;
+ final_eob = i + 1;
+ }
+ } // if (x==0)
+ } // for (i)
+
+ assert(final_eob <= eob);
+ if (final_eob > 0) {
+ assert(tokens[final_eob - 1][1].qc != 0);
+ i = final_eob - 1;
+ int rc = scan[i];
+ qcoeff[rc] = tokens[i][1].qc;
+ dqcoeff[rc] = tokens[i][1].dqc;
+ }
+
+ for (i = final_eob; i < eob; i++) {
+ int rc = scan[i];
+ qcoeff[rc] = 0;
+ dqcoeff[rc] = 0;
+ }
+
+ mb->plane[plane].eobs[block] = final_eob;
+ return final_eob;
+
+#else // !CONFIG_PVQ
+ (void)cm;
+ (void)tx_size;
+ (void)ctx;
+ struct macroblock_plane *const p = &mb->plane[plane];
+ return p->eobs[block];
+#endif // !CONFIG_PVQ
+}
+
+#else // USE_GREEDY_OPTIMIZE_B
+
+typedef struct av1_token_state {
+ int64_t error;
+ int rate;
+ int16_t next;
+ int16_t token;
+ tran_low_t qc;
+ tran_low_t dqc;
+ uint8_t best_index;
+} av1_token_state;
+
+int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, int ctx) {
+#if !CONFIG_PVQ
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ struct macroblock_plane *const p = &mb->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ref = is_inter_block(&xd->mi[0]->mbmi);
+ av1_token_state tokens[MAX_TX_SQUARE + 1][2];
+ uint8_t token_cache[MAX_TX_SQUARE];
+ const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const int eob = p->eobs[block];
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const int default_eob = tx_size_2d[tx_size];
+ const int16_t *const dequant_ptr = pd->dequant;
+ const uint8_t *const band_translate = get_band_translate(tx_size);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+ const int16_t *const scan = scan_order->scan;
+ const int16_t *const nb = scan_order->neighbors;
+ int dqv;
+ const int shift = av1_get_tx_scale(tx_size);
+#if CONFIG_AOM_QM
+ int seg_id = xd->mi[0]->mbmi.segment_id;
+ const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size];
+#endif
+#if CONFIG_NEW_QUANT
+ int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
+ const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
+#elif !CONFIG_AOM_QM
+ const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
+#endif // CONFIG_NEW_QUANT
+ int next = eob, sz = 0;
+ const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
+ const int64_t rddiv = mb->rddiv;
+ int64_t rd_cost0, rd_cost1;
+ int rate0, rate1;
+ int64_t error0, error1;
+ int16_t t0, t1;
+ int best, band = (eob < default_eob) ? band_translate[eob]
+ : band_translate[eob - 1];
+ int pt, i, final_eob;
+#if CONFIG_HIGHBITDEPTH
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
+#else
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8);
+#endif
+ unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref];
+ const uint16_t *band_counts = &band_count_table[tx_size][band];
+ uint16_t band_left = eob - band_cum_count_table[tx_size][band] + 1;
+ int shortcut = 0;
+ int next_shortcut = 0;
+
+#if CONFIG_EXT_DELTA_Q
+ const int qindex = cm->seg.enabled
+ ? av1_get_qindex(&cm->seg, xd->mi[0]->mbmi.segment_id,
+ cm->base_qindex)
+ : cm->base_qindex;
+ if (qindex == 0) {
+ assert((qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+ }
+#else
+ assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+#endif
+
+ token_costs += band;
+
+ assert((!plane_type && !plane) || (plane_type && plane));
+ assert(eob <= default_eob);
+
+ /* Now set up a Viterbi trellis to evaluate alternative roundings. */
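+  /* Each position i carries two candidate states: tokens[i][0] keeps the
+   * original quantized value, tokens[i][1] its magnitude reduced by one
+   * (possibly to zero, which can move the EOB). The reverse-scan loop
+   * below fills both, and the final pass picks the cheaper path. */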
+ /* Initialize the sentinel node of the trellis. */
+ tokens[eob][0].rate = 0;
+ tokens[eob][0].error = 0;
+ tokens[eob][0].next = default_eob;
+ tokens[eob][0].token = EOB_TOKEN;
+ tokens[eob][0].qc = 0;
+ tokens[eob][1] = tokens[eob][0];
+
+ for (i = 0; i < eob; i++) {
+ const int rc = scan[i];
+ tokens[i][0].rate = av1_get_token_cost(qcoeff[rc], &t0, cat6_bits);
+ tokens[i][0].token = t0;
+ token_cache[rc] = av1_pt_energy_class[t0];
+ }
+
+ for (i = eob; i-- > 0;) {
+ int base_bits, dx;
+ int64_t d2;
+ const int rc = scan[i];
+ int x = qcoeff[rc];
+#if CONFIG_AOM_QM
+ int iwt = iqmatrix[rc];
+ dqv = dequant_ptr[rc != 0];
+ dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+#else
+ dqv = dequant_ptr[rc != 0];
+#endif
+ next_shortcut = shortcut;
+
+ /* Only add a trellis state for non-zero coefficients. */
+ if (UNLIKELY(x)) {
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ /* Evaluate the first possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+
+ if (next_shortcut) {
+ /* Consider both possible successor states. */
+ if (next < default_eob) {
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 +=
+ get_token_bit_costs(*token_costs, 0, pt, tokens[next][0].token);
+ rate1 +=
+ get_token_bit_costs(*token_costs, 0, pt, tokens[next][1].token);
+ }
+ UPDATE_RD_COST();
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ } else {
+ if (next < default_eob) {
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 +=
+ get_token_bit_costs(*token_costs, 0, pt, tokens[next][0].token);
+ }
+ best = 0;
+ }
+
+ dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx >>= xd->bd - 8;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ d2 = (int64_t)dx * dx;
+ tokens[i][0].rate += (best ? rate1 : rate0);
+ tokens[i][0].error = d2 + (best ? error1 : error0);
+ tokens[i][0].next = next;
+ tokens[i][0].qc = x;
+ tokens[i][0].dqc = dqcoeff[rc];
+ tokens[i][0].best_index = best;
+
+ /* Evaluate the second possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+
+ // The threshold of 3 is empirically obtained.
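+      // The decrement is only evaluated when the quantizer rounded the
+      // coefficient up: the dequantized magnitude must lie above the
+      // original (scaled) coefficient by less than one full step, so the
+      // next-lower index is a plausible alternative.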
+ if (UNLIKELY(abs(x) > 3)) {
+ shortcut = 0;
+ } else {
+#if CONFIG_NEW_QUANT
+ shortcut = ((av1_dequant_abscoeff_nuq(abs(x), dqv,
+ dequant_val[band_translate[i]]) >
+ (abs(coeff[rc]) << shift)) &&
+ (av1_dequant_abscoeff_nuq(abs(x) - 1, dqv,
+ dequant_val[band_translate[i]]) <
+ (abs(coeff[rc]) << shift)));
+#else // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+ if ((abs(x) * dequant_ptr[rc != 0] * iwt >
+ ((abs(coeff[rc]) << shift) << AOM_QM_BITS)) &&
+ (abs(x) * dequant_ptr[rc != 0] * iwt <
+ (((abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])
+ << AOM_QM_BITS)))
+#else
+ if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
+ (abs(x) * dequant_ptr[rc != 0] <
+ (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0]))
+#endif // CONFIG_AOM_QM
+ shortcut = 1;
+ else
+ shortcut = 0;
+#endif // CONFIG_NEW_QUANT
+ }
+
+ if (shortcut) {
+ sz = -(x < 0);
+ x -= 2 * sz + 1;
+ } else {
+ tokens[i][1] = tokens[i][0];
+ next = i;
+
+ if (UNLIKELY(!(--band_left))) {
+ --band_counts;
+ band_left = *band_counts;
+ --token_costs;
+ }
+ continue;
+ }
+
+ /* Consider both possible successor states. */
+ if (!x) {
+ /* If we reduced this coefficient to zero, check to see if
+ * we need to move the EOB back here.
+ */
+ t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+ t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+ base_bits = 0;
+ } else {
+ base_bits = av1_get_token_cost(x, &t0, cat6_bits);
+ t1 = t0;
+ }
+
+ if (next_shortcut) {
+ if (LIKELY(next < default_eob)) {
+ if (t0 != EOB_TOKEN) {
+ token_cache[rc] = av1_pt_energy_class[t0];
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 += get_token_bit_costs(*token_costs, !x, pt,
+ tokens[next][0].token);
+ }
+ if (t1 != EOB_TOKEN) {
+ token_cache[rc] = av1_pt_energy_class[t1];
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate1 += get_token_bit_costs(*token_costs, !x, pt,
+ tokens[next][1].token);
+ }
+ }
+
+ UPDATE_RD_COST();
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ } else {
+        // The two states in the next stage are identical.
+ if (next < default_eob && t0 != EOB_TOKEN) {
+ token_cache[rc] = av1_pt_energy_class[t0];
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 +=
+ get_token_bit_costs(*token_costs, !x, pt, tokens[next][0].token);
+ }
+ best = 0;
+ }
+
+#if CONFIG_NEW_QUANT
+ dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) -
+ (coeff[rc] << shift);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx >>= xd->bd - 8;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#else // CONFIG_NEW_QUANT
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
+ } else {
+ dx -= (dqv + sz) ^ sz;
+ }
+#else
+ dx -= (dqv + sz) ^ sz;
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_NEW_QUANT
+ d2 = (int64_t)dx * dx;
+
+ tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+ tokens[i][1].error = d2 + (best ? error1 : error0);
+ tokens[i][1].next = next;
+ tokens[i][1].token = best ? t1 : t0;
+ tokens[i][1].qc = x;
+
+ if (x) {
+#if CONFIG_NEW_QUANT
+ tokens[i][1].dqc = av1_dequant_abscoeff_nuq(
+ abs(x), dqv, dequant_val[band_translate[i]]);
+ tokens[i][1].dqc = shift ? ROUND_POWER_OF_TWO(tokens[i][1].dqc, shift)
+ : tokens[i][1].dqc;
+ if (sz) tokens[i][1].dqc = -tokens[i][1].dqc;
+#else
+// The 32x32 transform coefficient uses half the quantization step size.
+// Account for the rounding difference in the dequantized coefficient
+// value when the quantization index is dropped from an even number
+// to an odd number.
+
+#if CONFIG_AOM_QM
+ tran_low_t offset = dqv >> shift;
+#else
+ tran_low_t offset = dq_step[rc != 0];
+#endif
+ if (shift & x) offset += (dqv & 0x01);
+
+ if (sz == 0)
+ tokens[i][1].dqc = dqcoeff[rc] - offset;
+ else
+ tokens[i][1].dqc = dqcoeff[rc] + offset;
+#endif // CONFIG_NEW_QUANT
+ } else {
+ tokens[i][1].dqc = 0;
+ }
+
+ tokens[i][1].best_index = best;
+ /* Finally, make this the new head of the trellis. */
+ next = i;
+ } else {
+ /* There's no choice to make for a zero coefficient, so we don't
+ * add a new trellis node, but we do need to update the costs.
+ */
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ pt = get_coef_context(nb, token_cache, i + 1);
+ /* Update the cost of each path if we're past the EOB token. */
+ if (t0 != EOB_TOKEN) {
+ tokens[next][0].rate += get_token_bit_costs(*token_costs, 1, pt, t0);
+ tokens[next][0].token = ZERO_TOKEN;
+ }
+ if (t1 != EOB_TOKEN) {
+ tokens[next][1].rate += get_token_bit_costs(*token_costs, 1, pt, t1);
+ tokens[next][1].token = ZERO_TOKEN;
+ }
+ tokens[i][0].best_index = tokens[i][1].best_index = 0;
+ shortcut = (tokens[next][0].rate != tokens[next][1].rate);
+ /* Don't update next, because we didn't add a new node. */
+ }
+
+ if (UNLIKELY(!(--band_left))) {
+ --band_counts;
+ band_left = *band_counts;
+ --token_costs;
+ }
+ }
+
+ /* Now pick the best path through the whole trellis. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ rate0 += get_token_bit_costs(*token_costs, 0, ctx, t0);
+ rate1 += get_token_bit_costs(*token_costs, 0, ctx, t1);
+ UPDATE_RD_COST();
+ best = rd_cost1 < rd_cost0;
+
+ final_eob = -1;
+
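+  /* Trace the winning path via the stored next/best_index links, writing
+   * back the chosen quantized/dequantized values; final_eob ends up one
+   * past the last non-zero coefficient. */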
+ for (i = next; i < eob; i = next) {
+ const int x = tokens[i][best].qc;
+ const int rc = scan[i];
+ if (x) final_eob = i;
+ qcoeff[rc] = x;
+ dqcoeff[rc] = tokens[i][best].dqc;
+
+ next = tokens[i][best].next;
+ best = tokens[i][best].best_index;
+ }
+ final_eob++;
+
+ mb->plane[plane].eobs[block] = final_eob;
+ assert(final_eob <= default_eob);
+ return final_eob;
+#else // !CONFIG_PVQ
+ (void)cm;
+ (void)tx_size;
+ (void)ctx;
+ struct macroblock_plane *const p = &mb->plane[plane];
+ return p->eobs[block];
+#endif // !CONFIG_PVQ
+}
+
+#endif // USE_GREEDY_OPTIMIZE_B
+
+#if !CONFIG_PVQ
+#if CONFIG_HIGHBITDEPTH
+typedef enum QUANT_FUNC {
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_HIGHBD = 1,
+ QUANT_FUNC_TYPES = 2
+} QUANT_FUNC;
+
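+// quant_func_list is indexed by AV1_XFORM_QUANT_{FP,B,DC} in that order;
+// the final all-NULL row corresponds to AV1_XFORM_QUANT_SKIP_QUANT, for
+// which no quantizer is invoked (see av1_xform_quant below).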
+static AV1_QUANT_FACADE
+ quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
+#if !CONFIG_NEW_QUANT
+ { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
+ { av1_quantize_b_facade, av1_highbd_quantize_b_facade },
+ { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
+#else // !CONFIG_NEW_QUANT
+ { av1_quantize_fp_nuq_facade, av1_highbd_quantize_fp_nuq_facade },
+ { av1_quantize_b_nuq_facade, av1_highbd_quantize_b_nuq_facade },
+ { av1_quantize_dc_nuq_facade, av1_highbd_quantize_dc_nuq_facade },
+#endif // !CONFIG_NEW_QUANT
+ { NULL, NULL }
+ };
+
+#else
+
+typedef enum QUANT_FUNC {
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_TYPES = 1
+} QUANT_FUNC;
+
+static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES]
+ [QUANT_FUNC_TYPES] = {
+#if !CONFIG_NEW_QUANT
+ { av1_quantize_fp_facade },
+ { av1_quantize_b_facade },
+ { av1_quantize_dc_facade },
+#else // !CONFIG_NEW_QUANT
+ { av1_quantize_fp_nuq_facade },
+ { av1_quantize_b_nuq_facade },
+ { av1_quantize_dc_nuq_facade },
+#endif // !CONFIG_NEW_QUANT
+ { NULL }
+ };
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_PVQ
+
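+// Forward-transforms the residual for one block and, unless
+// xform_quant_idx is AV1_XFORM_QUANT_SKIP_QUANT, quantizes it with the
+// facade selected from quant_func_list above (PVQ builds encode with
+// od_pvq_encode instead).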
+void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int ctx,
+ AV1_XFORM_QUANT xform_quant_idx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if !(CONFIG_PVQ || CONFIG_DAALA_DIST)
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+#else
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+#endif
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const int is_inter = is_inter_block(mbmi);
+ const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, is_inter);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = block_size_wide[plane_bsize];
+#if CONFIG_AOM_QM
+ int seg_id = mbmi->segment_id;
+ const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][!is_inter][tx_size];
+ const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!is_inter][tx_size];
+#endif
+
+ FWD_TXFM_PARAM fwd_txfm_param;
+
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+ uint8_t *dst;
+ int16_t *pred;
+ const int dst_stride = pd->dst.stride;
+ int tx_blk_size;
+ int i, j;
+#endif
+
+#if !CONFIG_PVQ
+ const int tx2d_size = tx_size_2d[tx_size];
+ QUANT_PARAM qparam;
+ const int16_t *src_diff;
+
+ src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+ qparam.log_scale = av1_get_tx_scale(tx_size);
+#if CONFIG_NEW_QUANT
+ qparam.tx_size = tx_size;
+ qparam.dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type);
+#endif // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+ qparam.qmatrix = qmatrix;
+ qparam.iqmatrix = iqmatrix;
+#endif // CONFIG_AOM_QM
+#else
+ tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
+ int skip = 1;
+ PVQ_INFO *pvq_info = NULL;
+ uint8_t *src;
+ int16_t *src_int16;
+ const int src_stride = p->src.stride;
+
+ (void)ctx;
+ (void)scan_order;
+ (void)qcoeff;
+
+ if (x->pvq_coded) {
+ assert(block < MAX_PVQ_BLOCKS_IN_SB);
+ pvq_info = &x->pvq[block][plane];
+ }
+ src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+ src_int16 =
+ &p->src_int16[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+
+ // transform block size in pixels
+ tx_blk_size = tx_size_wide[tx_size];
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ src_int16[diff_stride * j + i] =
+ CONVERT_TO_SHORTPTR(src)[src_stride * j + i];
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ src_int16[diff_stride * j + i] = src[src_stride * j + i];
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#endif
+
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+ dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+
+ // transform block size in pixels
+ tx_blk_size = tx_size_wide[tx_size];
+
+// Copy the uint8 original and predicted blocks to an int16 buffer
+// so that the existing VP10 transform functions can be reused.
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ pred[diff_stride * j + i] =
+ CONVERT_TO_SHORTPTR(dst)[dst_stride * j + i];
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ pred[diff_stride * j + i] = dst[dst_stride * j + i];
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#endif
+
+ (void)ctx;
+
+ fwd_txfm_param.tx_type = tx_type;
+ fwd_txfm_param.tx_size = tx_size;
+ fwd_txfm_param.lossless = xd->lossless[mbmi->segment_id];
+
+#if !CONFIG_PVQ
+#if CONFIG_HIGHBITDEPTH
+ fwd_txfm_param.bd = xd->bd;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ if (LIKELY(!x->skip_block)) {
+ quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD](
+ coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam);
+ } else {
+ av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+ }
+ }
+#if CONFIG_LV_MAP
+ p->txb_entropy_ctx[block] =
+ (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+#endif // CONFIG_LV_MAP
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ av1_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ if (LIKELY(!x->skip_block)) {
+ quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD](
+ coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam);
+ } else {
+ av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+ }
+ }
+#if CONFIG_LV_MAP
+ p->txb_entropy_ctx[block] =
+ (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+#endif // CONFIG_LV_MAP
+#else // #if !CONFIG_PVQ
+ (void)xform_quant_idx;
+#if CONFIG_HIGHBITDEPTH
+ fwd_txfm_param.bd = xd->bd;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param);
+ av1_highbd_fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param);
+ } else {
+#endif
+ av1_fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param);
+ av1_fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+
+ // PVQ for inter mode block
+ if (!x->skip_block) {
+ PVQ_SKIP_TYPE ac_dc_coded =
+ av1_pvq_encode_helper(x,
+ coeff, // target original vector
+ ref_coeff, // reference vector
+ dqcoeff, // de-quantized vector
+ eob, // End of Block marker
+ pd->dequant, // aom's quantizers
+ plane, // image plane
+ tx_size, // block size in log_2 - 2
+ tx_type,
+ &x->rate, // rate measured
+ x->pvq_speed,
+ pvq_info); // PVQ info for a block
+ skip = ac_dc_coded == PVQ_SKIP;
+ }
+ x->pvq_skip[plane] = skip;
+
+ if (!skip) mbmi->skip = 0;
+#endif // #if !CONFIG_PVQ
+}
+
+static void encode_block(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct encode_b_args *const args = arg;
+ AV1_COMMON *cm = args->cm;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int ctx;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *dst;
+#if !CONFIG_PVQ
+ ENTROPY_CONTEXT *a, *l;
+#endif
+#if CONFIG_VAR_TX
+ int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+#endif
+ dst = &pd->dst
+ .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+
+#if !CONFIG_PVQ
+ a = &args->ta[blk_col];
+ l = &args->tl[blk_row];
+#if CONFIG_VAR_TX
+ ctx = get_entropy_context(tx_size, a, l);
+#else
+ ctx = combine_entropy_contexts(*a, *l);
+#endif
+#else
+ ctx = 0;
+#endif // CONFIG_PVQ
+
+#if CONFIG_VAR_TX
+ // Assert not magic number (uninitialized).
+ assert(x->blk_skip[plane][blk_row * bw + blk_col] != 234);
+
+ if (x->blk_skip[plane][blk_row * bw + blk_col] == 0) {
+#else
+ {
+#endif
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ ctx, AV1_XFORM_QUANT_FP);
+ }
+#if CONFIG_VAR_TX
+ else {
+ p->eobs[block] = 0;
+ }
+#endif
+
+#if !CONFIG_PVQ
+ if (p->eobs[block] && !xd->lossless[xd->mi[0]->mbmi.segment_id])
+ av1_optimize_b(cm, x, plane, block, tx_size, ctx);
+
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+
+ if (p->eobs[block]) *(args->skip) = 0;
+
+ if (p->eobs[block] == 0) return;
+#else
+ (void)ctx;
+ if (!x->pvq_skip[plane]) *(args->skip) = 0;
+
+ if (x->pvq_skip[plane]) return;
+#endif
+ TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block, tx_size);
+ av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, dst,
+ pd->dst.stride, p->eobs[block]);
+}
+
+#if CONFIG_VAR_TX
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ TX_SIZE plane_tx_size;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (tx_size == plane_tx_size) {
+ encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ // This is the square transform block partition entry point.
+ int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+ assert(bsl > 0);
+ assert(tx_size < TX_SIZES_ALL);
+
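+    // Recurse into the four quadrants of this transform block; quadrants
+    // outside the visible area are skipped.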
+ for (i = 0; i < 4; ++i) {
+ const int offsetr = blk_row + ((i >> 1) * bsl);
+ const int offsetc = blk_col + ((i & 0x01) * bsl);
+ int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
+ arg);
+ block += step;
+ }
+ }
+}
+#endif
+
+typedef struct encode_block_pass1_args {
+ AV1_COMMON *cm;
+ MACROBLOCK *x;
+} encode_block_pass1_args;
+
+static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ encode_block_pass1_args *args = (encode_block_pass1_args *)arg;
+ AV1_COMMON *cm = args->cm;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *dst;
+ int ctx = 0;
+ dst = &pd->dst
+ .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ ctx, AV1_XFORM_QUANT_B);
+#if !CONFIG_PVQ
+ if (p->eobs[block] > 0) {
+#else
+ if (!x->pvq_skip[plane]) {
+ {
+ int tx_blk_size;
+ int i, j;
+ // transform block size in pixels
+ tx_blk_size = tx_size_wide[tx_size];
+
+// av1 has no function that performs only the inverse transform;
+// av1_inv_txfm_add_*x*() also adds the predicted image to the inverse
+// transformed one. So pass a blank dummy image to av1_inv_txfm_add_*x*(),
+// i.e. set dst to zeros.
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ CONVERT_TO_SHORTPTR(dst)[j * pd->dst.stride + i] = 0;
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+#endif // !CONFIG_PVQ
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ av1_highbd_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+ xd->bd);
+ } else {
+ av1_highbd_idct4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+ xd->bd);
+ }
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ av1_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ } else {
+ av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ }
+ }
+}
+
+void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) {
+ encode_block_pass1_args args = { cm, x };
+ av1_subtract_plane(x, bsize, 0);
+ av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+ encode_block_pass1, &args);
+}
+
+void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize,
+ const int mi_row, const int mi_col) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct optimize_ctx ctx;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 };
+ int plane;
+
+ mbmi->skip = 1;
+
+ if (x->skip) return;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+ const int subsampling_x = xd->plane[plane].subsampling_x;
+ const int subsampling_y = xd->plane[plane].subsampling_y;
+
+ if (!is_chroma_reference(mi_row, mi_col, bsize, subsampling_x,
+ subsampling_y))
+ continue;
+
+ bsize = scale_chroma_bsize(bsize, subsampling_x, subsampling_y);
+#else
+ (void)mi_row;
+ (void)mi_col;
+#endif
+
+#if CONFIG_VAR_TX
+ // TODO(jingning): Clean this up.
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+ const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ av1_get_entropy_contexts(bsize, 0, pd, ctx.ta[plane], ctx.tl[plane]);
+#else
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+ av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
+#endif
+
+#if !CONFIG_PVQ
+ av1_subtract_plane(x, bsize, plane);
+#endif
+ arg.ta = ctx.ta[plane];
+ arg.tl = ctx.tl[plane];
+
+#if CONFIG_VAR_TX
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ encode_block_inter(plane, block, idy, idx, plane_bsize, max_tx_size,
+ &arg);
+ block += step;
+ }
+ }
+#else
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+ &arg);
+#endif
+ }
+}
+
+#if CONFIG_SUPERTX
+void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct optimize_ctx ctx;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 };
+ int plane;
+
+ mbmi->skip = 1;
+ if (x->skip) return;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_VAR_TX
+ const TX_SIZE tx_size = TX_4X4;
+#else
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+#endif
+ av1_subtract_plane(x, bsize, plane);
+ av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
+ arg.ta = ctx.ta[plane];
+ arg.tl = ctx.tl[plane];
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+ &arg);
+ }
+}
+#endif // CONFIG_SUPERTX
+
+#if !CONFIG_PVQ
+void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+ (void)tx_size;
+ struct macroblock_plane *p = &x->plane[plane];
+
+#if !CONFIG_LV_MAP
+ *a = *l = p->eobs[block] > 0;
+#else // !CONFIG_LV_MAP
+ *a = *l = p->txb_entropy_ctx[block];
+#endif // !CONFIG_LV_MAP
+
+#if CONFIG_VAR_TX || CONFIG_LV_MAP
+ int i;
+ for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) a[i] = a[0];
+
+ for (i = 0; i < tx_size_high_unit[tx_size]; ++i) l[i] = l[0];
+#endif
+}
+#endif
+
+static void encode_block_intra_and_set_context(int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ arg);
+#if !CONFIG_PVQ
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *x = args->x;
+ ENTROPY_CONTEXT *a = &args->ta[blk_col];
+ ENTROPY_CONTEXT *l = &args->tl[blk_row];
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+#endif
+}
+
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct encode_b_args *const args = arg;
+ AV1_COMMON *cm = args->cm;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ uint16_t *eob = &p->eobs[block];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size);
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+
+ const ENTROPY_CONTEXT *a = &args->ta[blk_col];
+ const ENTROPY_CONTEXT *l = &args->tl[blk_row];
+ int ctx = combine_entropy_contexts(*a, *l);
+ if (args->enable_optimize_b) {
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ ctx, AV1_XFORM_QUANT_FP);
+ if (p->eobs[block]) {
+ av1_optimize_b(cm, x, plane, block, tx_size, ctx);
+ }
+ } else {
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ ctx, AV1_XFORM_QUANT_B);
+ }
+
+#if CONFIG_PVQ
+ // *(args->skip) == mbmi->skip
+ if (!x->pvq_skip[plane]) *(args->skip) = 0;
+
+ if (x->pvq_skip[plane]) return;
+#endif // CONFIG_PVQ
+ av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, dst, dst_stride,
+ *eob);
+#if !CONFIG_PVQ
+ if (*eob) *(args->skip) = 0;
+#else
+// Note : *(args->skip) == mbmi->skip
+#endif
+#if CONFIG_CFL
+ if (plane == AOM_PLANE_Y && x->cfl_store_y) {
+ cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size);
+ }
+#endif
+}
+
+void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int plane,
+ int enable_optimize_b, const int mi_row,
+ const int mi_col) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE] = { 0 };
+ ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE] = { 0 };
+
+ struct encode_b_args arg = {
+ cm, x, NULL, &xd->mi[0]->mbmi.skip, ta, tl, enable_optimize_b
+ };
+
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y))
+ return;
+#else
+ (void)mi_row;
+ (void)mi_col;
+#endif
+
+ if (enable_optimize_b) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+ av1_get_entropy_contexts(bsize, tx_size, pd, ta, tl);
+ }
+ av1_foreach_transformed_block_in_plane(
+ xd, bsize, plane, encode_block_intra_and_set_context, &arg);
+}
+
+#if CONFIG_PVQ
+PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff,
+ tran_low_t *ref_coeff,
+ tran_low_t *const dqcoeff, uint16_t *eob,
+ const int16_t *quant, int plane,
+ int tx_size, TX_TYPE tx_type, int *rate,
+ int speed, PVQ_INFO *pvq_info) {
+ const int tx_blk_size = tx_size_wide[tx_size];
+ daala_enc_ctx *daala_enc = &x->daala_enc;
+ PVQ_SKIP_TYPE ac_dc_coded;
+ int coeff_shift = 3 - av1_get_tx_scale(tx_size);
+ int hbd_downshift = 0;
+ int rounding_mask;
+ int pvq_dc_quant;
+ int use_activity_masking = daala_enc->use_activity_masking;
+ int tell;
+ int has_dc_skip = 1;
+ int i;
+ int off = od_qm_offset(tx_size, plane ? 1 : 0);
+
+ DECLARE_ALIGNED(16, tran_low_t, coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+ DECLARE_ALIGNED(16, tran_low_t, ref_coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+
+ DECLARE_ALIGNED(16, int32_t, in_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+ DECLARE_ALIGNED(16, int32_t, ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+ DECLARE_ALIGNED(16, int32_t, out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+
+#if CONFIG_HIGHBITDEPTH
+ hbd_downshift = x->e_mbd.bd - 8;
+#endif
+
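+  // Outline: reorder coefficients into coding order, scale into the Daala
+  // OD_COEFF_SHIFT fixed-point domain, PVQ-encode the AC bands, code the
+  // DC residue separately, then scale and reorder the result back.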
+ assert(OD_COEFF_SHIFT >= 4);
+ // DC quantizer for PVQ
+ if (use_activity_masking)
+ pvq_dc_quant =
+ OD_MAXI(1, (quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift) *
+ daala_enc->state
+ .pvq_qm_q4[plane][od_qm_get_index(tx_size, 0)] >>
+ 4);
+ else
+ pvq_dc_quant =
+ OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift);
+
+ *eob = 0;
+
+#if CONFIG_DAALA_EC
+ tell = od_ec_enc_tell_frac(&daala_enc->w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+
+ // Change coefficient ordering for pvq encoding.
+ od_raster_to_coding_order(coeff_pvq, tx_blk_size, tx_type, coeff,
+ tx_blk_size);
+ od_raster_to_coding_order(ref_coeff_pvq, tx_blk_size, tx_type, ref_coeff,
+ tx_blk_size);
+
+ // copy int16 inputs to int32
+ for (i = 0; i < tx_blk_size * tx_blk_size; i++) {
+ ref_int32[i] =
+ AOM_SIGNED_SHL(ref_coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >>
+ hbd_downshift;
+ in_int32[i] = AOM_SIGNED_SHL(coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >>
+ hbd_downshift;
+ }
+
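+  // Quantize the DC residual with an enlarged dead zone: differences
+  // smaller than roughly 0.55 * pvq_dc_quant collapse to zero.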
+ if (abs(in_int32[0] - ref_int32[0]) < pvq_dc_quant * 141 / 256) { /* 0.55 */
+ out_int32[0] = 0;
+ } else {
+ out_int32[0] = OD_DIV_R0(in_int32[0] - ref_int32[0], pvq_dc_quant);
+ }
+
+ ac_dc_coded =
+ od_pvq_encode(daala_enc, ref_int32, in_int32, out_int32,
+ OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >>
+ hbd_downshift), // scale/quantizer
+ OD_MAXI(1, quant[1] << (OD_COEFF_SHIFT - 3) >>
+ hbd_downshift), // scale/quantizer
+ plane,
+ tx_size, OD_PVQ_BETA[use_activity_masking][plane][tx_size],
+ 0, // is_keyframe,
+ daala_enc->state.qm + off, daala_enc->state.qm_inv + off,
+ speed, // speed
+ pvq_info);
+
+ // Encode residue of DC coeff, if required.
+ if (!has_dc_skip || out_int32[0]) {
+ generic_encode(&daala_enc->w, &daala_enc->state.adapt->model_dc[plane],
+ abs(out_int32[0]) - has_dc_skip,
+ &daala_enc->state.adapt->ex_dc[plane][tx_size][0], 2);
+ }
+ if (out_int32[0]) {
+ aom_write_bit(&daala_enc->w, out_int32[0] < 0);
+ }
+
+  // Save the quantized residue of the DC coefficient so that the final
+  // PVQ bitstream writing knows whether DC is coded.
+ if (pvq_info) pvq_info->dq_dc_residue = out_int32[0];
+
+ out_int32[0] = out_int32[0] * pvq_dc_quant;
+ out_int32[0] += ref_int32[0];
+
+ // copy int32 result back to int16
+ assert(OD_COEFF_SHIFT > coeff_shift);
+ rounding_mask = (1 << (OD_COEFF_SHIFT - coeff_shift - 1)) - 1;
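+  // The shift below rounds to nearest with ties toward zero; the
+  // "+ (out_int32[i] < 0)" term keeps negative values symmetric.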
+ for (i = 0; i < tx_blk_size * tx_blk_size; i++) {
+ out_int32[i] = AOM_SIGNED_SHL(out_int32[i], hbd_downshift);
+ dqcoeff_pvq[i] = (out_int32[i] + (out_int32[i] < 0) + rounding_mask) >>
+ (OD_COEFF_SHIFT - coeff_shift);
+ }
+
+ // Back to original coefficient order
+ od_coding_order_to_raster(dqcoeff, tx_blk_size, tx_type, dqcoeff_pvq,
+ tx_blk_size);
+
+ *eob = tx_blk_size * tx_blk_size;
+
+#if CONFIG_DAALA_EC
+ *rate = (od_ec_enc_tell_frac(&daala_enc->w.ec) - tell)
+ << (AV1_PROB_COST_SHIFT - OD_BITRES);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ assert(*rate >= 0);
+
+ return ac_dc_coded;
+}
+
+void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k,
+ od_coeff *y, int nb_bands, const int *off,
+ int *size, int skip_rest, int skip_dir,
+                            int bs) {  // block size in log_2 - 2
+ int i;
+ const int tx_blk_size = tx_size_wide[bs];
+
+ for (i = 0; i < nb_bands; i++) {
+ pvq_info->qg[i] = qg[i];
+ pvq_info->theta[i] = theta[i];
+ pvq_info->k[i] = k[i];
+ pvq_info->off[i] = off[i];
+ pvq_info->size[i] = size[i];
+ }
+
+ memcpy(pvq_info->y, y, tx_blk_size * tx_blk_size * sizeof(od_coeff));
+
+ pvq_info->nb_bands = nb_bands;
+ pvq_info->skip_rest = skip_rest;
+ pvq_info->skip_dir = skip_dir;
+ pvq_info->bs = bs;
+}
+#endif
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
new file mode 100644
index 000000000..73fde1d88
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ENCODEMB_H_
+#define AV1_ENCODER_ENCODEMB_H_
+
+#include "./aom_config.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct optimize_ctx {
+ ENTROPY_CONTEXT ta[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT tl[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+};
+
+struct encode_b_args {
+ AV1_COMMON *cm;
+ MACROBLOCK *x;
+ struct optimize_ctx *ctx;
+ int8_t *skip;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ int8_t enable_optimize_b;
+};
+
+typedef enum AV1_XFORM_QUANT {
+ AV1_XFORM_QUANT_FP = 0,
+ AV1_XFORM_QUANT_B = 1,
+ AV1_XFORM_QUANT_DC = 2,
+ AV1_XFORM_QUANT_SKIP_QUANT,
+ AV1_XFORM_QUANT_TYPES,
+} AV1_XFORM_QUANT;
+
+void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col);
+#if CONFIG_SUPERTX
+void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+#endif // CONFIG_SUPERTX
+void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int ctx, AV1_XFORM_QUANT xform_quant_idx);
+
+int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, int ctx);
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size);
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l);
+
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int plane,
+ int enable_optimize_b, int mi_row,
+ int mi_col);
+
+#if CONFIG_PVQ
+PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff,
+ tran_low_t *ref_coeff,
+ tran_low_t *const dqcoeff, uint16_t *eob,
+ const int16_t *quant, int plane,
+ int tx_size, TX_TYPE tx_type, int *rate,
+ int speed, PVQ_INFO *pvq_info);
+
+void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k,
+ od_coeff *y, int nb_bands, const int *off,
+ int *size, int skip_rest, int skip_dir, int bs);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_ENCODEMB_H_
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
new file mode 100644
index 000000000..a2a53f840
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/subexp.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+static struct av1_token mv_joint_encodings[MV_JOINTS];
+static struct av1_token mv_class_encodings[MV_CLASSES];
+static struct av1_token mv_fp_encodings[MV_FP_SIZE];
+
+void av1_entropy_mv_init(void) {
+ av1_tokens_from_tree(mv_joint_encodings, av1_mv_joint_tree);
+ av1_tokens_from_tree(mv_class_encodings, av1_mv_class_tree);
+ av1_tokens_from_tree(mv_fp_encodings, av1_mv_fp_tree);
+}
+
+static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
+ int usehp) {
+ int offset;
+ const int sign = comp < 0;
+ const int mag = sign ? -comp : comp;
+ const int mv_class = av1_get_mv_class(mag - 1, &offset);
+ const int d = offset >> 3; // int mv data
+ const int fr = (offset >> 1) & 3; // fractional mv data
+ const int hp = offset & 1; // high precision mv data
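+  // Illustrative walk-through (hypothetical value, assuming class bases
+  // 0/16/32/64/...): comp == -41 gives sign == 1 and mag - 1 == 40, which
+  // falls in MV_CLASS_2 (base 32); offset == 8, so d == 1, fr == 0, hp == 0.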
+
+ assert(comp != 0);
+
+ // Sign
+ aom_write(w, sign, mvcomp->sign);
+
+// Class
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES);
+#else
+ av1_write_token(w, av1_mv_class_tree, mvcomp->classes,
+ &mv_class_encodings[mv_class]);
+#endif
+
+ // Integer bits
+ if (mv_class == MV_CLASS_0) {
+ aom_write(w, d, mvcomp->class0[0]);
+ } else {
+ int i;
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+ for (i = 0; i < n; ++i) aom_write(w, (d >> i) & 1, mvcomp->bits[i]);
+ }
+
+// Fractional bits
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(
+ w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+ MV_FP_SIZE);
+#else
+ av1_write_token(w, av1_mv_fp_tree,
+ mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp,
+ &mv_fp_encodings[fr]);
+#endif
+
+ // High precision bit
+ if (usehp)
+ aom_write(w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
+}
+
+static void build_nmv_component_cost_table(int *mvcost,
+ const nmv_component *const mvcomp,
+ int usehp) {
+ int i, v;
+ int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+ int bits_cost[MV_OFFSET_BITS][2];
+ int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
+ int class0_hp_cost[2], hp_cost[2];
+
+ sign_cost[0] = av1_cost_zero(mvcomp->sign);
+ sign_cost[1] = av1_cost_one(mvcomp->sign);
+ av1_cost_tokens(class_cost, mvcomp->classes, av1_mv_class_tree);
+ av1_cost_tokens(class0_cost, mvcomp->class0, av1_mv_class0_tree);
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ bits_cost[i][0] = av1_cost_zero(mvcomp->bits[i]);
+ bits_cost[i][1] = av1_cost_one(mvcomp->bits[i]);
+ }
+
+ for (i = 0; i < CLASS0_SIZE; ++i)
+ av1_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], av1_mv_fp_tree);
+ av1_cost_tokens(fp_cost, mvcomp->fp, av1_mv_fp_tree);
+
+ if (usehp) {
+ class0_hp_cost[0] = av1_cost_zero(mvcomp->class0_hp);
+ class0_hp_cost[1] = av1_cost_one(mvcomp->class0_hp);
+ hp_cost[0] = av1_cost_zero(mvcomp->hp);
+ hp_cost[1] = av1_cost_one(mvcomp->hp);
+ }
+ mvcost[0] = 0;
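+  // For each magnitude v, accumulate the same component costs that
+  // encode_mv_component would write, then derive +v / -v via the sign cost.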
+ for (v = 1; v <= MV_MAX; ++v) {
+ int z, c, o, d, e, f, cost = 0;
+ z = v - 1;
+ c = av1_get_mv_class(z, &o);
+ cost += class_cost[c];
+ d = (o >> 3); /* int mv data */
+ f = (o >> 1) & 3; /* fractional pel mv data */
+ e = (o & 1); /* high precision mv data */
+ if (c == MV_CLASS_0) {
+ cost += class0_cost[d];
+ } else {
+ const int b = c + CLASS0_BITS - 1; /* number of bits */
+ for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)];
+ }
+ if (c == MV_CLASS_0) {
+ cost += class0_fp_cost[d][f];
+ } else {
+ cost += fp_cost[f];
+ }
+ if (usehp) {
+ if (c == MV_CLASS_0) {
+ cost += class0_hp_cost[e];
+ } else {
+ cost += hp_cost[e];
+ }
+ }
+ mvcost[v] = cost + sign_cost[0];
+ mvcost[-v] = cost + sign_cost[1];
+ }
+}
+
+static void update_mv(aom_writer *w, const unsigned int ct[2], aom_prob *cur_p,
+ aom_prob upd_p) {
+ (void)upd_p;
+#if CONFIG_TILE_GROUPS
+  // Just use the default maximum number of tile groups to avoid passing in
+  // the actual number.
+ av1_cond_prob_diff_update(w, cur_p, ct, DEFAULT_MAX_NUM_TG);
+#else
+ av1_cond_prob_diff_update(w, cur_p, ct, 1);
+#endif
+}
+
+#if !CONFIG_EC_ADAPT
+static void write_mv_update(const aom_tree_index *tree,
+ aom_prob probs[/*n - 1*/],
+ const unsigned int counts[/*n - 1*/], int n,
+ aom_writer *w) {
+ int i;
+ unsigned int branch_ct[32][2];
+
+ // Assuming max number of probabilities <= 32
+ assert(n <= 32);
+
+ av1_tree_probs_from_distribution(tree, branch_ct, counts);
+ for (i = 0; i < n - 1; ++i)
+ update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
+}
+#endif
+
+void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w,
+ nmv_context_counts *const nmv_counts) {
+ int i;
+#if CONFIG_REF_MV
+ int nmv_ctx = 0;
+ for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
+ nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx];
+ nmv_context_counts *const counts = &nmv_counts[nmv_ctx];
+#if !CONFIG_EC_ADAPT
+ write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS,
+ w);
+
+ for (i = 0; i < 2; ++i) {
+ int j;
+ nmv_component *comp = &mvc->comps[i];
+ nmv_component_counts *comp_counts = &counts->comps[i];
+
+ update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
+ write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes,
+ MV_CLASSES, w);
+ write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0,
+ CLASS0_SIZE, w);
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB);
+ }
+
+ for (i = 0; i < 2; ++i) {
+ int j;
+ for (j = 0; j < CLASS0_SIZE; ++j)
+ write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
+ counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
+
+ write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+ MV_FP_SIZE, w);
+ }
+#endif
+
+ if (usehp) {
+ for (i = 0; i < 2; ++i) {
+ update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
+ MV_UPDATE_PROB);
+ update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB);
+ }
+ }
+ }
+#else
+ nmv_context *const mvc = &cm->fc->nmvc;
+ nmv_context_counts *const counts = nmv_counts;
+
+#if !CONFIG_EC_ADAPT
+ write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
+
+ for (i = 0; i < 2; ++i) {
+ int j;
+ nmv_component *comp = &mvc->comps[i];
+ nmv_component_counts *comp_counts = &counts->comps[i];
+
+ update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
+ write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes,
+ MV_CLASSES, w);
+ write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0,
+ CLASS0_SIZE, w);
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB);
+ }
+
+ for (i = 0; i < 2; ++i) {
+ int j;
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
+ counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
+ }
+ write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+ MV_FP_SIZE, w);
+ }
+#endif // !CONFIG_EC_ADAPT
+
+ if (usehp) {
+ for (i = 0; i < 2; ++i) {
+ update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
+ MV_UPDATE_PROB);
+ update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB);
+ }
+ }
+#endif
+}
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx, int usehp) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
+#else
+ av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
+#endif
+ if (mv_joint_vertical(j))
+ encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
+
+ if (mv_joint_horizontal(j))
+ encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
+
+ // If auto_mv_step_size is enabled then keep track of the largest
+ // motion vector component used.
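+  // (Components are stored in 1/8-pel units, so >> 3 converts to pixels.)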
+ if (cpi->sf.mv.auto_mv_step_size) {
+ unsigned int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3;
+ cpi->max_mv_magnitude = AOMMAX(maxv, cpi->max_mv_magnitude);
+ }
+}
+
+#if CONFIG_INTRABC
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
+#else
+ av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
+#endif
+ if (mv_joint_vertical(j))
+ encode_mv_component(w, diff.row, &mvctx->comps[0], 0);
+
+ if (mv_joint_horizontal(j))
+ encode_mv_component(w, diff.col, &mvctx->comps[1], 0);
+}
+#endif // CONFIG_INTRABC
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *ctx, int usehp) {
+ av1_cost_tokens(mvjoint, ctx->joints, av1_mv_joint_tree);
+ build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp);
+ build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
+}
+
+#if CONFIG_EXT_INTER
+static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
+ const int_mv mvs[2],
+#if CONFIG_REF_MV
+ const int_mv pred_mvs[2],
+#endif
+ nmv_context_counts *nmv_counts) {
+ int i;
+ PREDICTION_MODE mode = mbmi->mode;
+#if !CONFIG_REF_MV
+ nmv_context_counts *counts = nmv_counts;
+#endif
+
+ if (mode == NEWMV || mode == NEW_NEWMV) {
+ for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+ const MV diff = { mvs[i].as_mv.row - ref->row,
+ mvs[i].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+ (void)pred_mvs;
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv;
+ const MV diff = { mvs[1].as_mv.row - ref->row,
+ mvs[1].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
+ const MV diff = { mvs[0].as_mv.row - ref->row,
+ mvs[0].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+}
+
+static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2],
+#if CONFIG_REF_MV
+ const MB_MODE_INFO_EXT *mbmi_ext,
+#endif
+ nmv_context_counts *nmv_counts) {
+ int i;
+ PREDICTION_MODE mode = mi->bmi[block].as_mode;
+#if CONFIG_REF_MV
+ const MB_MODE_INFO *mbmi = &mi->mbmi;
+#else
+ nmv_context_counts *counts = nmv_counts;
+#endif
+
+ if (mode == NEWMV || mode == NEW_NEWMV) {
+ for (i = 0; i < 1 + has_second_ref(&mi->mbmi); ++i) {
+ const MV *ref = &mi->bmi[block].ref_mv[i].as_mv;
+ const MV diff = { mvs[i].as_mv.row - ref->row,
+ mvs[i].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+ const MV *ref = &mi->bmi[block].ref_mv[1].as_mv;
+ const MV diff = { mvs[1].as_mv.row - ref->row,
+ mvs[1].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+ const MV *ref = &mi->bmi[block].ref_mv[0].as_mv;
+ const MV diff = { mvs[0].as_mv.row - ref->row,
+ mvs[0].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+}
+#else
+static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
+ const int_mv mvs[2],
+#if CONFIG_REF_MV
+ const int_mv pred_mvs[2],
+#endif
+ nmv_context_counts *nmv_counts) {
+ int i;
+#if !CONFIG_REF_MV
+ nmv_context_counts *counts = nmv_counts;
+#endif
+
+ for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+ const MV *ref = &pred_mvs[i].as_mv;
+#else
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+#endif
+ const MV diff = { mvs[i].as_mv.row - ref->row,
+ mvs[i].as_mv.col - ref->col };
+ av1_inc_mv(&diff, counts, 1);
+ }
+}
+#endif // CONFIG_EXT_INTER
+
+void av1_update_mv_count(ThreadData *td) {
+ const MACROBLOCKD *xd = &td->mb.e_mbd;
+ const MODE_INFO *mi = xd->mi[0];
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+
+ if (mbmi->sb_type < BLOCK_8X8 && !unify_bsize) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type];
+ int idx, idy;
+
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int i = idy * 2 + idx;
+
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(mi->bmi[i].as_mode))
+ inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv,
+#if CONFIG_REF_MV
+ mbmi_ext, td->counts->mv);
+#else
+ &td->counts->mv);
+#endif
+#else
+ if (mi->bmi[i].as_mode == NEWMV)
+ inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv,
+#if CONFIG_REF_MV
+ mi->bmi[i].pred_mv, td->counts->mv);
+#else
+ &td->counts->mv);
+#endif
+#endif // CONFIG_EXT_INTER
+ }
+ }
+ } else {
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(mbmi->mode))
+#else
+ if (mbmi->mode == NEWMV)
+#endif // CONFIG_EXT_INTER
+ inc_mvs(mbmi, mbmi_ext, mbmi->mv,
+#if CONFIG_REF_MV
+ mbmi->pred_mv, td->counts->mv);
+#else
+ &td->counts->mv);
+#endif
+ }
+}
diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h
new file mode 100644
index 000000000..6d442147f
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ENCODEMV_H_
+#define AV1_ENCODER_ENCODEMV_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_entropy_mv_init(void);
+
+void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w,
+ nmv_context_counts *const counts);
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx, int usehp);
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *mvctx, int usehp);
+
+void av1_update_mv_count(ThreadData *td);
+
+#if CONFIG_INTRABC
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx);
+#endif // CONFIG_INTRABC
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_ENCODEMV_H_
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
new file mode 100644
index 000000000..027109151
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -0,0 +1,5980 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./aom_config.h"
+
+#include "av1/common/alloccommon.h"
+#if CONFIG_CDEF
+#include "av1/common/cdef.h"
+#include "av1/common/clpf.h"
+#endif // CONFIG_CDEF
+#include "av1/common/filter.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/bitstream.h"
+#if CONFIG_ANS
+#include "aom_dsp/buf_ans.h"
+#endif
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#if CONFIG_LV_MAP
+#include "av1/encoder/encodetxb.h"
+#endif
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mbgraph.h"
+#include "av1/encoder/picklpf.h"
+#if CONFIG_LOOP_RESTORATION
+#include "av1/encoder/pickrst.h"
+#endif // CONFIG_LOOP_RESTORATION
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/temporal_filter.h"
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+#include "./aom_scale_rtcd.h"
+#include "aom_dsp/psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_ENTROPY_STATS
+FRAME_COUNTS aggregate_fc;
+#endif // CONFIG_ENTROPY_STATS
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+
+#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */
+
+#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv
+ // for altref computation.
+#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision
+ // mv. Choose a very high value for
+ // now so that HIGH_PRECISION is always
+ // chosen.
+// #define OUTPUT_YUV_REC
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#define FILE_NAME_LEN 100
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+#if CONFIG_CFL
+CFL_CTX NULL_CFL;
+#endif
+
+#if CONFIG_INTERNAL_STATS
+typedef enum { Y, U, V, ALL } STAT_TYPE;
+#endif // CONFIG_INTERNAL_STATS
+
+static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
+ switch (mode) {
+ case NORMAL:
+ *hr = 1;
+ *hs = 1;
+ break;
+ case FOURFIVE:
+ *hr = 4;
+ *hs = 5;
+ break;
+ case THREEFIVE:
+ *hr = 3;
+ *hs = 5;
+ break;
+ case ONETWO:
+ *hr = 1;
+ *hs = 2;
+ break;
+ default:
+ *hr = 1;
+ *hs = 1;
+ assert(0);
+ break;
+ }
+}
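+
+// Illustrative usage (editorial sketch, not part of the imported source):
+// Scale2Ratio() returns the scale as numerator *hr over denominator *hs,
+// so a hypothetical 1280-pixel-wide frame scaled with FOURFIVE (hr = 4,
+// hs = 5) would come out at 1280 * 4 / 5 = 1024 pixels:
+//
+//   int hr, hs;
+//   Scale2Ratio(FOURFIVE, &hr, &hs);    // hr == 4, hs == 5
+//   int scaled_width = 1280 * hr / hs;  // 1024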
+
+// Mark all inactive blocks as active. Other segmentation features may be
+// set, so memset cannot be used; instead, only inactive blocks should be
+// reset.
+static void suppress_active_map(AV1_COMP *cpi) {
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i;
+ if (cpi->active_map.enabled || cpi->active_map.update)
+ for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+ seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+}
+
+static void apply_active_map(AV1_COMP *cpi) {
+ struct segmentation *const seg = &cpi->common.seg;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ const unsigned char *const active_map = cpi->active_map.map;
+ int i;
+
+ assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+ if (frame_is_intra_only(&cpi->common)) {
+ cpi->active_map.enabled = 0;
+ cpi->active_map.update = 1;
+ }
+
+ if (cpi->active_map.update) {
+ if (cpi->active_map.enabled) {
+ for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
+ av1_enable_segmentation(seg);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+ // Setting the data to -MAX_LOOP_FILTER will result in the computed loop
+ // filter level being zero regardless of the value of seg->abs_delta.
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF,
+ -MAX_LOOP_FILTER);
+ } else {
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+ if (seg->enabled) {
+ seg->update_data = 1;
+ seg->update_map = 1;
+ }
+ }
+ cpi->active_map.update = 0;
+ }
+}
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+ unsigned char *const active_map_8x8 = cpi->active_map.map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
+ const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+ cpi->active_map.update = 1;
+ if (new_map_16x16) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ active_map_8x8[r * mi_cols + c] =
+ new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)]
+ ? AM_SEGMENT_ID_ACTIVE
+ : AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ cpi->active_map.enabled = 1;
+ } else {
+ cpi->active_map.enabled = 0;
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
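+
+// Usage sketch (editorial, hypothetical caller; not part of the imported
+// source): the map carries one byte per 16x16 block, non-zero meaning
+// active, and its dimensions must match the frame's macroblock grid or the
+// call is rejected:
+//
+//   // unsigned char map16[mb_rows * mb_cols], filled by the application.
+//   if (av1_set_active_map(cpi, map16, mb_rows, mb_cols) != 0) {
+//     // Dimension mismatch: map rejected, active map left unchanged.
+//   }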
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols &&
+ new_map_16x16) {
+ unsigned char *const seg_map_8x8 = cpi->segmentation_map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
+ const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+
+ memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+ if (cpi->active_map.enabled) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ // Cyclic refresh segments are considered active despite not having
+ // AM_SEGMENT_ID_ACTIVE
+ new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)] |=
+ seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+void av1_set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv) {
+ MACROBLOCK *const mb = &cpi->td.mb;
+ cpi->common.allow_high_precision_mv = allow_high_precision_mv;
+
+#if CONFIG_REF_MV
+ if (cpi->common.allow_high_precision_mv) {
+ int i;
+ for (i = 0; i < NMV_CONTEXTS; ++i) {
+ mb->mv_cost_stack[i] = mb->nmvcost_hp[i];
+ mb->mvsadcost = mb->nmvsadcost_hp;
+ }
+ } else {
+ int i;
+ for (i = 0; i < NMV_CONTEXTS; ++i) {
+ mb->mv_cost_stack[i] = mb->nmvcost[i];
+ mb->mvsadcost = mb->nmvsadcost;
+ }
+ }
+#else
+ if (cpi->common.allow_high_precision_mv) {
+ mb->mvcost = mb->nmvcost_hp;
+ mb->mvsadcost = mb->nmvcost_hp;
+ } else {
+ mb->mvcost = mb->nmvcost;
+ mb->mvsadcost = mb->nmvcost;
+ }
+#endif
+}
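+
+// Note (editorial): av1_set_high_precision_mv() only switches which
+// precomputed cost tables the search consults (the *_hp variants versus the
+// regular ones); the tables themselves are built elsewhere, e.g. via
+// av1_build_nmv_cost_table().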
+
+static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
+#if CONFIG_EXT_PARTITION
+ if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64)
+ return BLOCK_64X64;
+
+ if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
+ return BLOCK_128X128;
+
+ assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
+
+ assert(IMPLIES(cpi->common.tile_cols > 1,
+ cpi->common.tile_width % MAX_MIB_SIZE == 0));
+ assert(IMPLIES(cpi->common.tile_rows > 1,
+ cpi->common.tile_height % MAX_MIB_SIZE == 0));
+
+  // TODO(any): This could possibly be improved with a heuristic.
+ return BLOCK_128X128;
+#else
+ (void)cpi;
+ return BLOCK_64X64;
+#endif // CONFIG_EXT_PARTITION
+}
+
+static void setup_frame(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ // Set up entropy context depending on frame type. The decoder mandates
+ // the use of the default context, index 0, for keyframes and inter
+ // frames where the error_resilient_mode or intra_only flag is set. For
+ // other inter-frames the encoder currently uses only two contexts;
+ // context 1 for ALTREF frames and context 0 for the others.
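+  // For example (illustrative): a key frame is always coded with context 0,
+  // while an inter frame that refreshes the alt-ref buffer selects the
+  // ARF_FRAME context below.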
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+ av1_setup_past_independence(cm);
+ } else {
+#if CONFIG_EXT_REFS
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+ cm->frame_context_idx = EXT_ARF_FRAME;
+ else if (cpi->refresh_alt_ref_frame)
+ cm->frame_context_idx = ARF_FRAME;
+#else
+ if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME;
+#endif // CONFIG_EXT_REFS
+ else if (cpi->rc.is_src_frame_alt_ref)
+ cm->frame_context_idx = OVERLAY_FRAME;
+ else if (cpi->refresh_golden_frame)
+ cm->frame_context_idx = GLD_FRAME;
+#if CONFIG_EXT_REFS
+ else if (cpi->refresh_bwd_ref_frame)
+ cm->frame_context_idx = BRF_FRAME;
+#endif // CONFIG_EXT_REFS
+ else
+ cm->frame_context_idx = REGULAR_FRAME;
+ }
+
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 1;
+ av1_zero(cpi->interp_filter_selected);
+ } else {
+ *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ av1_zero(cpi->interp_filter_selected[0]);
+ }
+#if CONFIG_EXT_REFS
+#if CONFIG_LOWDELAY_COMPOUND // No change to bitstream
+ if (cpi->sf.recode_loop == DISALLOW_RECODE) {
+ cpi->refresh_bwd_ref_frame = cpi->refresh_last_frame;
+ cpi->rc.is_bipred_frame = 1;
+ }
+#endif  // CONFIG_LOWDELAY_COMPOUND
+#endif  // CONFIG_EXT_REFS
+
+ cpi->vaq_refresh = 0;
+
+ set_sb_size(cm, select_sb_size(cpi));
+}
+
+static void av1_enc_setup_mi(AV1_COMMON *cm) {
+ int i;
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+ // Clear top border row
+ memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
+ // Clear left border column
+ for (i = 1; i < cm->mi_rows + 1; ++i)
+ memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
+
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+ cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+
+ memset(cm->mi_grid_base, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
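+
+// Layout note (editorial, derived from the offsets above): cm->mip carries
+// one extra border row and column, so the visible grid begins at offset
+// mi_stride + 1:
+//
+//   mip: B B B B ...   <- top border row (cleared)
+//        B V V V ...   <- B = left border column, V = visible mi units
+//        B V V V ...
+//
+// Hence cm->mi (and cm->prev_mi) point at the first visible unit.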
+
+static int av1_enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
+ cm->mip = aom_calloc(mi_size, sizeof(*cm->mip));
+ if (!cm->mip) return 1;
+ cm->prev_mip = aom_calloc(mi_size, sizeof(*cm->prev_mip));
+ if (!cm->prev_mip) return 1;
+ cm->mi_alloc_size = mi_size;
+
+ cm->mi_grid_base = (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *));
+ if (!cm->mi_grid_base) return 1;
+ cm->prev_mi_grid_base =
+ (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *));
+ if (!cm->prev_mi_grid_base) return 1;
+
+ return 0;
+}
+
+static void av1_enc_free_mi(AV1_COMMON *cm) {
+ aom_free(cm->mip);
+ cm->mip = NULL;
+ aom_free(cm->prev_mip);
+ cm->prev_mip = NULL;
+ aom_free(cm->mi_grid_base);
+ cm->mi_grid_base = NULL;
+ aom_free(cm->prev_mi_grid_base);
+ cm->prev_mi_grid_base = NULL;
+}
+
+static void av1_swap_mi_and_prev_mi(AV1_COMMON *cm) {
+ // Current mip will be the prev_mip for the next frame.
+ MODE_INFO **temp_base = cm->prev_mi_grid_base;
+ MODE_INFO *temp = cm->prev_mip;
+ cm->prev_mip = cm->mip;
+ cm->mip = temp;
+
+ // Update the upper left visible macroblock ptrs.
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+
+ cm->prev_mi_grid_base = cm->mi_grid_base;
+ cm->mi_grid_base = temp_base;
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+ cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+}
+
+void av1_initialize_enc(void) {
+ static volatile int init_done = 0;
+
+ if (!init_done) {
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+ av1_init_me_luts();
+#if !CONFIG_XIPHRC
+ av1_rc_init_minq_luts();
+#endif
+ av1_entropy_mv_init();
+ av1_encode_token_init();
+#if CONFIG_EXT_INTER
+ av1_init_wedge_masks();
+#endif
+ init_done = 1;
+ }
+}
+
+static void dealloc_compressor_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int i;
+
+ aom_free(cpi->mbmi_ext_base);
+ cpi->mbmi_ext_base = NULL;
+
+#if CONFIG_PVQ
+ if (cpi->oxcf.pass != 1) {
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int tile_col, tile_row;
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ aom_free(tile_data->pvq_q.buf);
+ }
+ }
+#endif
+ aom_free(cpi->tile_data);
+ cpi->tile_data = NULL;
+
+  // Delete segmentation map
+ aom_free(cpi->segmentation_map);
+ cpi->segmentation_map = NULL;
+
+ av1_cyclic_refresh_free(cpi->cyclic_refresh);
+ cpi->cyclic_refresh = NULL;
+
+ aom_free(cpi->active_map.map);
+ cpi->active_map.map = NULL;
+
+ // Free up-sampled reference buffers.
+ for (i = 0; i < (REF_FRAMES + 1); i++)
+ aom_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
+
+ av1_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_LV_MAP
+ av1_free_txb_buf(cpi);
+#endif
+ av1_free_context_buffers(cm);
+
+ aom_free_frame_buffer(&cpi->last_frame_uf);
+#if CONFIG_LOOP_RESTORATION
+ av1_free_restoration_buffers(cm);
+ aom_free_frame_buffer(&cpi->last_frame_db);
+ aom_free_frame_buffer(&cpi->trial_frame_rst);
+ aom_free(cpi->extra_rstbuf);
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ av1_free_restoration_struct(&cpi->rst_search[i]);
+#endif // CONFIG_LOOP_RESTORATION
+ aom_free_frame_buffer(&cpi->scaled_source);
+ aom_free_frame_buffer(&cpi->scaled_last_source);
+ aom_free_frame_buffer(&cpi->alt_ref_buffer);
+ av1_lookahead_destroy(cpi->lookahead);
+
+ aom_free(cpi->tile_tok[0][0]);
+ cpi->tile_tok[0][0] = 0;
+
+ av1_free_pc_tree(&cpi->td);
+ av1_free_var_tree(&cpi->td);
+
+#if CONFIG_PALETTE
+ if (cpi->common.allow_screen_content_tools)
+ aom_free(cpi->td.mb.palette_buffer);
+#endif // CONFIG_PALETTE
+
+ if (cpi->source_diff_var != NULL) {
+ aom_free(cpi->source_diff_var);
+ cpi->source_diff_var = NULL;
+ }
+#if CONFIG_ANS
+ aom_buf_ans_free(&cpi->buf_ans);
+#endif // CONFIG_ANS
+}
+
+static void save_coding_context(AV1_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ AV1_COMMON *cm = &cpi->common;
+#if CONFIG_REF_MV
+ int i;
+#endif
+
+// Stores a snapshot of key state variables which can subsequently be
+// restored with a call to av1_restore_coding_context. These functions are
+// intended for use in a re-code loop in av1_compress_frame where the
+// quantizer value is adjusted between loop iterations.
+#if CONFIG_REF_MV
+ for (i = 0; i < NMV_CONTEXTS; ++i) {
+ av1_copy(cc->nmv_vec_cost[i], cpi->td.mb.nmv_vec_cost[i]);
+ av1_copy(cc->nmv_costs, cpi->nmv_costs);
+ av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp);
+ }
+#else
+ av1_copy(cc->nmvjointcost, cpi->td.mb.nmvjointcost);
+#endif
+
+ av1_copy(cc->nmvcosts, cpi->nmvcosts);
+ av1_copy(cc->nmvcosts_hp, cpi->nmvcosts_hp);
+
+ av1_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
+ av1_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
+
+ cc->fc = *cm->fc;
+}
+
+static void restore_coding_context(AV1_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ AV1_COMMON *cm = &cpi->common;
+#if CONFIG_REF_MV
+ int i;
+#endif
+
+// Restore key state variables to the snapshot state stored in the
+// previous call to av1_save_coding_context.
+#if CONFIG_REF_MV
+ for (i = 0; i < NMV_CONTEXTS; ++i) {
+ av1_copy(cpi->td.mb.nmv_vec_cost[i], cc->nmv_vec_cost[i]);
+ av1_copy(cpi->nmv_costs, cc->nmv_costs);
+ av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp);
+ }
+#else
+ av1_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
+#endif
+
+ av1_copy(cpi->nmvcosts, cc->nmvcosts);
+ av1_copy(cpi->nmvcosts_hp, cc->nmvcosts_hp);
+
+ av1_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
+ av1_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
+
+ *cm->fc = cc->fc;
+}
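+
+// Recode-loop sketch (editorial; the actual loop lives in the encoder's
+// recode path and these names are hypothetical):
+//
+//   save_coding_context(cpi);
+//   do {
+//     /* adjust the quantizer and re-encode the frame */
+//   } while (frame_needs_recode);
+//   restore_coding_context(cpi);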
+
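+// Overview (editorial summary of the function below): key frames and frames
+// that refresh the alt-ref buffer reset the segmentation map and features;
+// for other frames, segmentation is adjusted only at the first normal frame
+// of a gf/arf group, or on frames coded over a previous alt-ref, and is
+// otherwise left as-is.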
+static void configure_static_seg_features(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ struct segmentation *const seg = &cm->seg;
+
+ int high_q = (int)(rc->avg_q > 48.0);
+ int qi_delta;
+
+ // Disable and clear down for KF
+ if (cm->frame_type == KEY_FRAME) {
+ // Clear down the global segmentation map
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation
+ av1_disable_segmentation(seg);
+
+ // Clear down the segment features.
+ av1_clearall_segfeatures(seg);
+ } else if (cpi->refresh_alt_ref_frame) {
+ // If this is an alt ref frame
+ // Clear down the global segmentation map
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation and individual segment features by default
+ av1_disable_segmentation(seg);
+ av1_clearall_segfeatures(seg);
+
+ // Scan frames from current to arf frame.
+ // This function re-enables segmentation if appropriate.
+ av1_update_mbgraph_stats(cpi);
+
+ // If segmentation was enabled set those features needed for the
+ // arf itself.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+
+ qi_delta =
+ av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+      // Where relevant, assume segment data is delta data.
+ seg->abs_delta = SEGMENT_DELTADATA;
+ }
+ } else if (seg->enabled) {
+ // All other frames if segmentation has been enabled
+
+ // First normal frame in a valid gf or alt ref group
+ if (rc->frames_since_golden == 0) {
+ // Set up segment features for normal frames in an arf group
+ if (rc->source_alt_ref_active) {
+ seg->update_map = 0;
+ seg->update_data = 1;
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ qi_delta =
+ av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, cm->bit_depth);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+        // Segment coding disabled for compound prediction (compred) testing.
+ if (high_q || (cpi->static_mb_pct == 100)) {
+ av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ } else {
+ // Disable segmentation and clear down features if alt ref
+ // is not active for this group
+
+ av1_disable_segmentation(seg);
+
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ av1_clearall_segfeatures(seg);
+ }
+ } else if (rc->is_src_frame_alt_ref) {
+ // Special case where we are coding over the top of a previous
+ // alt ref frame.
+      // Segment coding disabled for compound prediction (compred) testing.
+
+ // Enable ref frame features for segment 0 as well
+ av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+ // All mbs should use ALTREF_FRAME
+ av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+ // Skip all MBs if high Q (0,0 mv and skip coeffs)
+ if (high_q) {
+ av1_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ // Enable data update
+ seg->update_data = 1;
+ } else {
+ // All other frames.
+
+      // No updates; leave things as they are.
+ seg->update_map = 0;
+ seg->update_data = 0;
+ }
+ }
+}
+
+static void update_reference_segmentation_map(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible;
+ uint8_t *cache_ptr = cm->last_frame_seg_map;
+ int row, col;
+
+ for (row = 0; row < cm->mi_rows; row++) {
+ MODE_INFO **mi_8x8 = mi_8x8_ptr;
+ uint8_t *cache = cache_ptr;
+ for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
+ cache[0] = mi_8x8[0]->mbmi.segment_id;
+ mi_8x8_ptr += cm->mi_stride;
+ cache_ptr += cm->mi_cols;
+ }
+}
+
+static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+
+ if (!cpi->lookahead)
+ cpi->lookahead = av1_lookahead_init(oxcf->width, oxcf->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ oxcf->lag_in_frames);
+ if (!cpi->lookahead)
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate lag buffers");
+
+ // TODO(agrange) Check if ARF is enabled and skip allocation if not.
+ if (aom_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate altref buffer");
+}
+
+static void alloc_util_frame_buffers(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (aom_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate last frame buffer");
+
+#if CONFIG_LOOP_RESTORATION
+ if (aom_realloc_frame_buffer(&cpi->last_frame_db, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate last frame deblocked buffer");
+ if (aom_realloc_frame_buffer(&cpi->trial_frame_rst, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate trial restored frame buffer");
+ int extra_rstbuf_sz = RESTORATION_EXTBUF_SIZE;
+ if (extra_rstbuf_sz > 0) {
+ aom_free(cpi->extra_rstbuf);
+ CHECK_MEM_ERROR(cm, cpi->extra_rstbuf,
+ (uint8_t *)aom_malloc(extra_rstbuf_sz));
+ } else {
+ cpi->extra_rstbuf = NULL;
+ }
+#endif // CONFIG_LOOP_RESTORATION
+
+ if (aom_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate scaled source buffer");
+
+ if (aom_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate scaled last source buffer");
+}
+
+static int alloc_context_buffers_ext(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ int mi_size = cm->mi_cols * cm->mi_rows;
+
+ cpi->mbmi_ext_base = aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base));
+ if (!cpi->mbmi_ext_base) return 1;
+
+ return 0;
+}
+
+void av1_alloc_compressor_data(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+
+ av1_alloc_context_buffers(cm, cm->width, cm->height);
+
+#if CONFIG_LV_MAP
+ av1_alloc_txb_buf(cpi);
+#endif
+
+ alloc_context_buffers_ext(cpi);
+
+ aom_free(cpi->tile_tok[0][0]);
+
+ {
+ unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
+ CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+ aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
+#if CONFIG_ANS && !ANS_MAX_SYMBOLS
+ aom_buf_ans_alloc(&cpi->buf_ans, &cm->error, (int)tokens);
+#endif // CONFIG_ANS
+ }
+
+ av1_setup_pc_tree(&cpi->common, &cpi->td);
+}
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate) {
+ cpi->framerate = framerate < 0.1 ? 30 : framerate;
+#if CONFIG_XIPHRC
+ if (!cpi->od_rc.cur_frame) return;
+ cpi->od_rc.framerate = cpi->framerate;
+ od_enc_rc_resize(&cpi->od_rc);
+#else
+ av1_rc_update_framerate(cpi);
+#endif
+}
+
+static void set_tile_info(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_TILE_GROUPS && CONFIG_DEPENDENT_HORZTILES
+ int tile_row, tile_col, num_tiles_in_tg;
+ int tg_row_start, tg_col_start;
+#endif
+#if CONFIG_EXT_TILE
+#if CONFIG_EXT_PARTITION
+ if (cpi->oxcf.superblock_size != AOM_SUPERBLOCK_SIZE_64X64) {
+ cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 32);
+ cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32);
+ cm->tile_width <<= MAX_MIB_SIZE_LOG2;
+ cm->tile_height <<= MAX_MIB_SIZE_LOG2;
+ } else {
+ cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64);
+ cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
+ cm->tile_width <<= MAX_MIB_SIZE_LOG2 - 1;
+ cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1;
+ }
+#else
+ cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64);
+ cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
+ cm->tile_width <<= MAX_MIB_SIZE_LOG2;
+ cm->tile_height <<= MAX_MIB_SIZE_LOG2;
+#endif // CONFIG_EXT_PARTITION
+
+ cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
+ cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
+
+ assert(cm->tile_width >> MAX_MIB_SIZE <= 32);
+ assert(cm->tile_height >> MAX_MIB_SIZE <= 32);
+
+ // Get the number of tiles
+ cm->tile_cols = 1;
+ while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols;
+
+ cm->tile_rows = 1;
+ while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows;
+#else
+ int min_log2_tile_cols, max_log2_tile_cols;
+ av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+
+ cm->log2_tile_cols =
+ clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
+ cm->log2_tile_rows = cpi->oxcf.tile_rows;
+
+ cm->tile_cols = 1 << cm->log2_tile_cols;
+ cm->tile_rows = 1 << cm->log2_tile_rows;
+
+ cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+ cm->tile_width >>= cm->log2_tile_cols;
+ cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+ cm->tile_height >>= cm->log2_tile_rows;
+
+  // Round to integer multiples of the max superblock size.
+ cm->tile_width = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2);
+ cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2);
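+  // Worked example (illustrative, assuming MAX_MIB_SIZE_LOG2 == 4): with
+  // mi_cols == 100 and log2_tile_cols == 1,
+  //   ALIGN_POWER_OF_TWO(100, 4) == 112, then 112 >> 1 == 56,
+  //   ALIGN_POWER_OF_TWO(56, 4)  == 64,
+  // so two tile columns of 64 mi units cover the 100-unit-wide frame.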
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles = cpi->oxcf.dependent_horz_tiles;
+ if (cm->log2_tile_rows == 0) cm->dependent_horz_tiles = 0;
+#if CONFIG_TILE_GROUPS
+ if (cpi->oxcf.mtu == 0) {
+ cm->num_tg = cpi->oxcf.num_tile_groups;
+ } else {
+ // Use a default value for the purposes of weighting costs in probability
+ // updates
+ cm->num_tg = DEFAULT_MAX_NUM_TG;
+ }
+ num_tiles_in_tg =
+ (cm->tile_cols * cm->tile_rows + cm->num_tg - 1) / cm->num_tg;
+ tg_row_start = 0;
+ tg_col_start = 0;
+ for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
+ if ((tile_row * cm->tile_cols + tile_col) % num_tiles_in_tg == 0) {
+ tg_row_start = tile_row;
+ tg_col_start = tile_col;
+ }
+ cm->tile_group_start_row[tile_row][tile_col] = tg_row_start;
+ cm->tile_group_start_col[tile_row][tile_col] = tg_col_start;
+ }
+ }
+#endif  // CONFIG_TILE_GROUPS
+#endif  // CONFIG_DEPENDENT_HORZTILES
+
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ cm->loop_filter_across_tiles_enabled =
+ cpi->oxcf.loop_filter_across_tiles_enabled;
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+}
+
+static void update_frame_size(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ av1_set_mb_mi(cm, cm->width, cm->height);
+ av1_init_context_buffers(cm);
+ av1_init_macroblockd(cm, xd,
+#if CONFIG_PVQ
+ NULL,
+#endif
+#if CONFIG_CFL
+ &NULL_CFL,
+#endif
+ NULL);
+ memset(cpi->mbmi_ext_base, 0,
+ cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
+
+ set_tile_info(cpi);
+}
+
+static void init_buffer_indices(AV1_COMP *cpi) {
+#if CONFIG_EXT_REFS
+ int fb_idx;
+ for (fb_idx = 0; fb_idx < LAST_REF_FRAMES; ++fb_idx)
+ cpi->lst_fb_idxes[fb_idx] = fb_idx;
+ cpi->gld_fb_idx = LAST_REF_FRAMES;
+ cpi->bwd_fb_idx = LAST_REF_FRAMES + 1;
+ cpi->alt_fb_idx = LAST_REF_FRAMES + 2;
+ for (fb_idx = 0; fb_idx < MAX_EXT_ARFS + 1; ++fb_idx)
+ cpi->arf_map[fb_idx] = LAST_REF_FRAMES + 2 + fb_idx;
+#else
+ cpi->lst_fb_idx = 0;
+ cpi->gld_fb_idx = 1;
+ cpi->alt_fb_idx = 2;
+#endif // CONFIG_EXT_REFS
+}
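+
+// Illustrative layout (assuming LAST_REF_FRAMES == 3 under CONFIG_EXT_REFS):
+// lst_fb_idxes == {0, 1, 2}, gld_fb_idx == 3, bwd_fb_idx == 4,
+// alt_fb_idx == 5, and arf_map[i] == 5 + i for the extra ARFs.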
+
+static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ cpi->oxcf = *oxcf;
+ cpi->framerate = oxcf->init_framerate;
+
+ cm->profile = oxcf->profile;
+ cm->bit_depth = oxcf->bit_depth;
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth = oxcf->use_highbitdepth;
+#endif
+ cm->color_space = oxcf->color_space;
+ cm->color_range = oxcf->color_range;
+
+ cm->width = oxcf->width;
+ cm->height = oxcf->height;
+ av1_alloc_compressor_data(cpi);
+
+ // Single thread case: use counts in common.
+ cpi->td.counts = &cm->counts;
+
+ // change includes all joint functionality
+ av1_change_config(cpi, oxcf);
+
+ cpi->static_mb_pct = 0;
+ cpi->ref_frame_flags = 0;
+
+ init_buffer_indices(cpi);
+}
+
+static void set_rc_buffer_sizes(RATE_CONTROL *rc,
+ const AV1EncoderConfig *oxcf) {
+ const int64_t bandwidth = oxcf->target_bandwidth;
+ const int64_t starting = oxcf->starting_buffer_level_ms;
+ const int64_t optimal = oxcf->optimal_buffer_level_ms;
+ const int64_t maximum = oxcf->maximum_buffer_size_ms;
+
+ rc->starting_buffer_level = starting * bandwidth / 1000;
+ rc->optimal_buffer_level =
+ (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000;
+ rc->maximum_buffer_size =
+ (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
+}
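+
+// Worked example (illustrative, assuming target_bandwidth is in bits per
+// second): with target_bandwidth == 1000000 and
+// starting_buffer_level_ms == 4000, the starting level becomes
+// 4000 * 1000000 / 1000 == 4000000 bits; the optimal and maximum levels
+// fall back to bandwidth / 8 == 125000 bits when configured as 0.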
+
+#if CONFIG_HIGHBITDEPTH
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx3f = SDX3F; \
+ cpi->fn_ptr[BT].sdx8f = SDX8F; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF;
+
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+ }
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 4; \
+ }
+
+#define MAKE_BFP_SAD3_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 3; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 3; i++) sad_array[i] >>= 4; \
+ }
+
+#define MAKE_BFP_SAD8_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 8; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 8; i++) sad_array[i] >>= 4; \
+ }
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
+ }
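+
+// Note (editorial): the _bits10 and _bits12 wrappers above shift the SAD
+// right by 2 and 4 respectively, i.e. divide by 2^(bit_depth - 8), so that
+// distortion from 10- and 12-bit sources is normalized back to the 8-bit
+// scale used elsewhere in the search code.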
+
+#if CONFIG_EXT_PARTITION
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad128x128x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad128x128x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+#endif // CONFIG_EXT_PARTITION
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad32x32x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad32x32x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad64x64x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad64x64x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x16x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x16x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x8x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x8x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x16x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x16x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x8x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x8x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x4x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x8x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad4x4x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x4x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+
+#if CONFIG_EXT_INTER
+#define HIGHBD_MBFP(BT, MSDF, MVF, MSVF) \
+ cpi->fn_ptr[BT].msdf = MSDF; \
+ cpi->fn_ptr[BT].mvf = MVF; \
+ cpi->fn_ptr[BT].msvf = MSVF;
+
+#define MAKE_MBFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *m, int m_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *m, int m_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *m, int m_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \
+ 4; \
+ }
+
+#if CONFIG_EXT_PARTITION
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x128)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x64)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x128)
+#endif // CONFIG_EXT_PARTITION
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x64)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x32)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x64)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x32)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x16)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x32)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x16)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x8)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x16)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x8)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x4)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x8)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x4)
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR
+#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
+ cpi->fn_ptr[BT].osdf = OSDF; \
+ cpi->fn_ptr[BT].ovf = OVF; \
+ cpi->fn_ptr[BT].osvf = OSVF;
+
+#define MAKE_OBFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk); \
+ } \
+ static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 2; \
+ } \
+ static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 4; \
+ }
+
+#if CONFIG_EXT_PARTITION
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
+#endif // CONFIG_EXT_PARTITION
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
+#endif // CONFIG_MOTION_VAR
+
+static void highbd_set_var_fns(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case AOM_BITS_8:
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8,
+ aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16,
+ aom_highbd_8_sub_pixel_variance32x16,
+ aom_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL,
+ aom_highbd_sad32x16x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8,
+ aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32,
+ aom_highbd_8_sub_pixel_variance16x32,
+ aom_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL,
+ aom_highbd_sad16x32x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8,
+ aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32,
+ aom_highbd_8_sub_pixel_variance64x32,
+ aom_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL,
+ aom_highbd_sad64x32x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8,
+ aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64,
+ aom_highbd_8_sub_pixel_variance32x64,
+ aom_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL,
+ aom_highbd_sad32x64x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8,
+ aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32,
+ aom_highbd_8_sub_pixel_variance32x32,
+ aom_highbd_8_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x3_bits8, aom_highbd_sad32x32x8_bits8,
+ aom_highbd_sad32x32x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8,
+ aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64,
+ aom_highbd_8_sub_pixel_variance64x64,
+ aom_highbd_8_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x3_bits8, aom_highbd_sad64x64x8_bits8,
+ aom_highbd_sad64x64x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8,
+ aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16,
+ aom_highbd_8_sub_pixel_variance16x16,
+ aom_highbd_8_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x3_bits8, aom_highbd_sad16x16x8_bits8,
+ aom_highbd_sad16x16x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8,
+ aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8,
+ aom_highbd_8_sub_pixel_avg_variance16x8, aom_highbd_sad16x8x3_bits8,
+ aom_highbd_sad16x8x8_bits8, aom_highbd_sad16x8x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8,
+ aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16,
+ aom_highbd_8_sub_pixel_avg_variance8x16, aom_highbd_sad8x16x3_bits8,
+ aom_highbd_sad8x16x8_bits8, aom_highbd_sad8x16x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8,
+ aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8,
+ aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits8,
+ aom_highbd_sad8x8x8_bits8, aom_highbd_sad8x8x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits8,
+ aom_highbd_sad8x4_avg_bits8, aom_highbd_8_variance8x4,
+ aom_highbd_8_sub_pixel_variance8x4,
+ aom_highbd_8_sub_pixel_avg_variance8x4, NULL,
+ aom_highbd_sad8x4x8_bits8, aom_highbd_sad8x4x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits8,
+ aom_highbd_sad4x8_avg_bits8, aom_highbd_8_variance4x8,
+ aom_highbd_8_sub_pixel_variance4x8,
+ aom_highbd_8_sub_pixel_avg_variance4x8, NULL,
+ aom_highbd_sad4x8x8_bits8, aom_highbd_sad4x8x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8,
+ aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4,
+ aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits8,
+ aom_highbd_sad4x4x8_bits8, aom_highbd_sad4x4x4d_bits8)
+
+#if CONFIG_CB4X4
+ HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_8_variance2x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_8_variance4x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_8_variance2x4, NULL, NULL,
+ NULL, NULL, NULL)
+#endif
+
+#if CONFIG_EXT_PARTITION
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8,
+ aom_highbd_sad128x128_avg_bits8,
+ aom_highbd_8_variance128x128,
+ aom_highbd_8_sub_pixel_variance128x128,
+ aom_highbd_8_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x3_bits8, aom_highbd_sad128x128x8_bits8,
+ aom_highbd_sad128x128x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8,
+ aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64,
+ aom_highbd_8_sub_pixel_variance128x64,
+ aom_highbd_8_sub_pixel_avg_variance128x64, NULL, NULL,
+ aom_highbd_sad128x64x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8,
+ aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128,
+ aom_highbd_8_sub_pixel_variance64x128,
+ aom_highbd_8_sub_pixel_avg_variance64x128, NULL, NULL,
+ aom_highbd_sad64x128x4d_bits8)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
+ aom_highbd_masked_variance128x128,
+ aom_highbd_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8,
+ aom_highbd_masked_variance128x64,
+ aom_highbd_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8,
+ aom_highbd_masked_variance64x128,
+ aom_highbd_masked_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8,
+ aom_highbd_masked_variance64x64,
+ aom_highbd_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8,
+ aom_highbd_masked_variance64x32,
+ aom_highbd_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8,
+ aom_highbd_masked_variance32x64,
+ aom_highbd_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8,
+ aom_highbd_masked_variance32x32,
+ aom_highbd_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8,
+ aom_highbd_masked_variance32x16,
+ aom_highbd_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8,
+ aom_highbd_masked_variance16x32,
+ aom_highbd_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8,
+ aom_highbd_masked_variance16x16,
+ aom_highbd_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8,
+ aom_highbd_masked_variance8x16,
+ aom_highbd_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8,
+ aom_highbd_masked_variance16x8,
+ aom_highbd_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8,
+ aom_highbd_masked_variance8x8,
+ aom_highbd_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8,
+ aom_highbd_masked_variance4x8,
+ aom_highbd_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8,
+ aom_highbd_masked_variance8x4,
+ aom_highbd_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8,
+ aom_highbd_masked_variance4x4,
+ aom_highbd_masked_sub_pixel_variance4x4)
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8,
+ aom_highbd_obmc_variance128x128,
+ aom_highbd_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits8,
+ aom_highbd_obmc_variance128x64,
+ aom_highbd_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8,
+ aom_highbd_obmc_variance64x128,
+ aom_highbd_obmc_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8,
+ aom_highbd_obmc_variance64x64,
+ aom_highbd_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits8,
+ aom_highbd_obmc_variance64x32,
+ aom_highbd_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits8,
+ aom_highbd_obmc_variance32x64,
+ aom_highbd_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits8,
+ aom_highbd_obmc_variance32x32,
+ aom_highbd_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits8,
+ aom_highbd_obmc_variance32x16,
+ aom_highbd_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits8,
+ aom_highbd_obmc_variance16x32,
+ aom_highbd_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits8,
+ aom_highbd_obmc_variance16x16,
+ aom_highbd_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits8,
+ aom_highbd_obmc_variance8x16,
+ aom_highbd_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits8,
+ aom_highbd_obmc_variance16x8,
+ aom_highbd_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits8,
+ aom_highbd_obmc_variance8x8,
+ aom_highbd_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits8,
+ aom_highbd_obmc_variance4x8,
+ aom_highbd_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits8,
+ aom_highbd_obmc_variance8x4,
+ aom_highbd_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8,
+ aom_highbd_obmc_variance4x4,
+ aom_highbd_obmc_sub_pixel_variance4x4)
+#endif // CONFIG_MOTION_VAR
+ break;
+
+ case AOM_BITS_10:
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10,
+ aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16,
+ aom_highbd_10_sub_pixel_variance32x16,
+ aom_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL,
+ aom_highbd_sad32x16x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10,
+ aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32,
+ aom_highbd_10_sub_pixel_variance16x32,
+ aom_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL,
+ aom_highbd_sad16x32x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10,
+ aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32,
+ aom_highbd_10_sub_pixel_variance64x32,
+ aom_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL,
+ aom_highbd_sad64x32x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10,
+ aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64,
+ aom_highbd_10_sub_pixel_variance32x64,
+ aom_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL,
+ aom_highbd_sad32x64x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10,
+ aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32,
+ aom_highbd_10_sub_pixel_variance32x32,
+ aom_highbd_10_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x3_bits10, aom_highbd_sad32x32x8_bits10,
+ aom_highbd_sad32x32x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10,
+ aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64,
+ aom_highbd_10_sub_pixel_variance64x64,
+ aom_highbd_10_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x3_bits10, aom_highbd_sad64x64x8_bits10,
+ aom_highbd_sad64x64x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10,
+ aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16,
+ aom_highbd_10_sub_pixel_variance16x16,
+ aom_highbd_10_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x3_bits10, aom_highbd_sad16x16x8_bits10,
+ aom_highbd_sad16x16x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10,
+ aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8,
+ aom_highbd_10_sub_pixel_variance16x8,
+ aom_highbd_10_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x3_bits10, aom_highbd_sad16x8x8_bits10,
+ aom_highbd_sad16x8x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10,
+ aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16,
+ aom_highbd_10_sub_pixel_variance8x16,
+ aom_highbd_10_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x3_bits10, aom_highbd_sad8x16x8_bits10,
+ aom_highbd_sad8x16x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10,
+ aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8,
+ aom_highbd_10_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits10,
+ aom_highbd_sad8x8x8_bits10, aom_highbd_sad8x8x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits10,
+ aom_highbd_sad8x4_avg_bits10, aom_highbd_10_variance8x4,
+ aom_highbd_10_sub_pixel_variance8x4,
+ aom_highbd_10_sub_pixel_avg_variance8x4, NULL,
+ aom_highbd_sad8x4x8_bits10, aom_highbd_sad8x4x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits10,
+ aom_highbd_sad4x8_avg_bits10, aom_highbd_10_variance4x8,
+ aom_highbd_10_sub_pixel_variance4x8,
+ aom_highbd_10_sub_pixel_avg_variance4x8, NULL,
+ aom_highbd_sad4x8x8_bits10, aom_highbd_sad4x8x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10,
+ aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4,
+ aom_highbd_10_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits10,
+ aom_highbd_sad4x4x8_bits10, aom_highbd_sad4x4x4d_bits10)
+
+#if CONFIG_CB4X4
+ HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_10_variance2x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_10_variance4x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_10_variance2x4, NULL, NULL,
+ NULL, NULL, NULL)
+#endif
+
+#if CONFIG_EXT_PARTITION
+ HIGHBD_BFP(
+ BLOCK_128X128, aom_highbd_sad128x128_bits10,
+ aom_highbd_sad128x128_avg_bits10, aom_highbd_10_variance128x128,
+ aom_highbd_10_sub_pixel_variance128x128,
+ aom_highbd_10_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x3_bits10, aom_highbd_sad128x128x8_bits10,
+ aom_highbd_sad128x128x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10,
+ aom_highbd_sad128x64_avg_bits10,
+ aom_highbd_10_variance128x64,
+ aom_highbd_10_sub_pixel_variance128x64,
+ aom_highbd_10_sub_pixel_avg_variance128x64, NULL, NULL,
+ aom_highbd_sad128x64x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10,
+ aom_highbd_sad64x128_avg_bits10,
+ aom_highbd_10_variance64x128,
+ aom_highbd_10_sub_pixel_variance64x128,
+ aom_highbd_10_sub_pixel_avg_variance64x128, NULL, NULL,
+ aom_highbd_sad64x128x4d_bits10)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
+ aom_highbd_10_masked_variance128x128,
+ aom_highbd_10_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
+ aom_highbd_10_masked_variance128x64,
+ aom_highbd_10_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
+ aom_highbd_10_masked_variance64x128,
+ aom_highbd_10_masked_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
+ aom_highbd_10_masked_variance64x64,
+ aom_highbd_10_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
+ aom_highbd_10_masked_variance64x32,
+ aom_highbd_10_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10,
+ aom_highbd_10_masked_variance32x64,
+ aom_highbd_10_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10,
+ aom_highbd_10_masked_variance32x32,
+ aom_highbd_10_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10,
+ aom_highbd_10_masked_variance32x16,
+ aom_highbd_10_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10,
+ aom_highbd_10_masked_variance16x32,
+ aom_highbd_10_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10,
+ aom_highbd_10_masked_variance16x16,
+ aom_highbd_10_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10,
+ aom_highbd_10_masked_variance8x16,
+ aom_highbd_10_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10,
+ aom_highbd_10_masked_variance16x8,
+ aom_highbd_10_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10,
+ aom_highbd_10_masked_variance8x8,
+ aom_highbd_10_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10,
+ aom_highbd_10_masked_variance4x8,
+ aom_highbd_10_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10,
+ aom_highbd_10_masked_variance8x4,
+ aom_highbd_10_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
+ aom_highbd_10_masked_variance4x4,
+ aom_highbd_10_masked_sub_pixel_variance4x4)
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10,
+ aom_highbd_10_obmc_variance128x128,
+ aom_highbd_10_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits10,
+ aom_highbd_10_obmc_variance128x64,
+ aom_highbd_10_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10,
+ aom_highbd_10_obmc_variance64x128,
+ aom_highbd_10_obmc_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10,
+ aom_highbd_10_obmc_variance64x64,
+ aom_highbd_10_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits10,
+ aom_highbd_10_obmc_variance64x32,
+ aom_highbd_10_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits10,
+ aom_highbd_10_obmc_variance32x64,
+ aom_highbd_10_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits10,
+ aom_highbd_10_obmc_variance32x32,
+ aom_highbd_10_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits10,
+ aom_highbd_10_obmc_variance32x16,
+ aom_highbd_10_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits10,
+ aom_highbd_10_obmc_variance16x32,
+ aom_highbd_10_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits10,
+ aom_highbd_10_obmc_variance16x16,
+ aom_highbd_10_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits10,
+ aom_highbd_10_obmc_variance8x16,
+ aom_highbd_10_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits10,
+ aom_highbd_10_obmc_variance16x8,
+ aom_highbd_10_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits10,
+ aom_highbd_10_obmc_variance8x8,
+ aom_highbd_10_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits10,
+ aom_highbd_10_obmc_variance4x8,
+ aom_highbd_10_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits10,
+ aom_highbd_10_obmc_variance8x4,
+ aom_highbd_10_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10,
+ aom_highbd_10_obmc_variance4x4,
+ aom_highbd_10_obmc_sub_pixel_variance4x4)
+#endif // CONFIG_MOTION_VAR
+ break;
+
+ case AOM_BITS_12:
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12,
+ aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16,
+ aom_highbd_12_sub_pixel_variance32x16,
+ aom_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL,
+ aom_highbd_sad32x16x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12,
+ aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32,
+ aom_highbd_12_sub_pixel_variance16x32,
+ aom_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL,
+ aom_highbd_sad16x32x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12,
+ aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32,
+ aom_highbd_12_sub_pixel_variance64x32,
+ aom_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL,
+ aom_highbd_sad64x32x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12,
+ aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64,
+ aom_highbd_12_sub_pixel_variance32x64,
+ aom_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL,
+ aom_highbd_sad32x64x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12,
+ aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32,
+ aom_highbd_12_sub_pixel_variance32x32,
+ aom_highbd_12_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x3_bits12, aom_highbd_sad32x32x8_bits12,
+ aom_highbd_sad32x32x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12,
+ aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64,
+ aom_highbd_12_sub_pixel_variance64x64,
+ aom_highbd_12_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x3_bits12, aom_highbd_sad64x64x8_bits12,
+ aom_highbd_sad64x64x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12,
+ aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16,
+ aom_highbd_12_sub_pixel_variance16x16,
+ aom_highbd_12_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x3_bits12, aom_highbd_sad16x16x8_bits12,
+ aom_highbd_sad16x16x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12,
+ aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8,
+ aom_highbd_12_sub_pixel_variance16x8,
+ aom_highbd_12_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x3_bits12, aom_highbd_sad16x8x8_bits12,
+ aom_highbd_sad16x8x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12,
+ aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16,
+ aom_highbd_12_sub_pixel_variance8x16,
+ aom_highbd_12_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x3_bits12, aom_highbd_sad8x16x8_bits12,
+ aom_highbd_sad8x16x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12,
+ aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8,
+ aom_highbd_12_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits12,
+ aom_highbd_sad8x8x8_bits12, aom_highbd_sad8x8x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits12,
+ aom_highbd_sad8x4_avg_bits12, aom_highbd_12_variance8x4,
+ aom_highbd_12_sub_pixel_variance8x4,
+ aom_highbd_12_sub_pixel_avg_variance8x4, NULL,
+ aom_highbd_sad8x4x8_bits12, aom_highbd_sad8x4x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits12,
+ aom_highbd_sad4x8_avg_bits12, aom_highbd_12_variance4x8,
+ aom_highbd_12_sub_pixel_variance4x8,
+ aom_highbd_12_sub_pixel_avg_variance4x8, NULL,
+ aom_highbd_sad4x8x8_bits12, aom_highbd_sad4x8x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12,
+ aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4,
+ aom_highbd_12_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits12,
+ aom_highbd_sad4x4x8_bits12, aom_highbd_sad4x4x4d_bits12)
+
+#if CONFIG_CB4X4
+ HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_12_variance2x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_12_variance4x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_12_variance2x4, NULL, NULL,
+ NULL, NULL, NULL)
+#endif
+
+#if CONFIG_EXT_PARTITION
+ HIGHBD_BFP(
+ BLOCK_128X128, aom_highbd_sad128x128_bits12,
+ aom_highbd_sad128x128_avg_bits12, aom_highbd_12_variance128x128,
+ aom_highbd_12_sub_pixel_variance128x128,
+ aom_highbd_12_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x3_bits12, aom_highbd_sad128x128x8_bits12,
+ aom_highbd_sad128x128x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12,
+ aom_highbd_sad128x64_avg_bits12,
+ aom_highbd_12_variance128x64,
+ aom_highbd_12_sub_pixel_variance128x64,
+ aom_highbd_12_sub_pixel_avg_variance128x64, NULL, NULL,
+ aom_highbd_sad128x64x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12,
+ aom_highbd_sad64x128_avg_bits12,
+ aom_highbd_12_variance64x128,
+ aom_highbd_12_sub_pixel_variance64x128,
+ aom_highbd_12_sub_pixel_avg_variance64x128, NULL, NULL,
+ aom_highbd_sad64x128x4d_bits12)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
+ aom_highbd_12_masked_variance128x128,
+ aom_highbd_12_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
+ aom_highbd_12_masked_variance128x64,
+ aom_highbd_12_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
+ aom_highbd_12_masked_variance64x128,
+ aom_highbd_12_masked_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
+ aom_highbd_12_masked_variance64x64,
+ aom_highbd_12_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
+ aom_highbd_12_masked_variance64x32,
+ aom_highbd_12_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12,
+ aom_highbd_12_masked_variance32x64,
+ aom_highbd_12_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12,
+ aom_highbd_12_masked_variance32x32,
+ aom_highbd_12_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12,
+ aom_highbd_12_masked_variance32x16,
+ aom_highbd_12_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12,
+ aom_highbd_12_masked_variance16x32,
+ aom_highbd_12_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12,
+ aom_highbd_12_masked_variance16x16,
+ aom_highbd_12_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12,
+ aom_highbd_12_masked_variance8x16,
+ aom_highbd_12_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12,
+ aom_highbd_12_masked_variance16x8,
+ aom_highbd_12_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12,
+ aom_highbd_12_masked_variance8x8,
+ aom_highbd_12_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12,
+ aom_highbd_12_masked_variance4x8,
+ aom_highbd_12_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12,
+ aom_highbd_12_masked_variance8x4,
+ aom_highbd_12_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
+ aom_highbd_12_masked_variance4x4,
+ aom_highbd_12_masked_sub_pixel_variance4x4)
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12,
+ aom_highbd_12_obmc_variance128x128,
+ aom_highbd_12_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits12,
+ aom_highbd_12_obmc_variance128x64,
+ aom_highbd_12_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12,
+ aom_highbd_12_obmc_variance64x128,
+ aom_highbd_12_obmc_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12,
+ aom_highbd_12_obmc_variance64x64,
+ aom_highbd_12_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits12,
+ aom_highbd_12_obmc_variance64x32,
+ aom_highbd_12_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits12,
+ aom_highbd_12_obmc_variance32x64,
+ aom_highbd_12_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits12,
+ aom_highbd_12_obmc_variance32x32,
+ aom_highbd_12_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits12,
+ aom_highbd_12_obmc_variance32x16,
+ aom_highbd_12_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits12,
+ aom_highbd_12_obmc_variance16x32,
+ aom_highbd_12_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits12,
+ aom_highbd_12_obmc_variance16x16,
+ aom_highbd_12_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits12,
+ aom_highbd_12_obmc_variance8x16,
+ aom_highbd_12_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits12,
+ aom_highbd_12_obmc_variance16x8,
+ aom_highbd_12_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits12,
+ aom_highbd_12_obmc_variance8x8,
+ aom_highbd_12_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits12,
+ aom_highbd_12_obmc_variance4x8,
+ aom_highbd_12_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits12,
+ aom_highbd_12_obmc_variance8x4,
+ aom_highbd_12_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12,
+ aom_highbd_12_obmc_variance4x4,
+ aom_highbd_12_obmc_sub_pixel_variance4x4)
+#endif // CONFIG_MOTION_VAR
+ break;
+
+ default:
+ assert(0 &&
+ "cm->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ }
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
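+
+/* For reference, HIGHBD_BFP is presumed to mirror the BFP macro defined in
+ * av1_create_compressor() below, wiring the same eight fn_ptr slots with the
+ * bit-depth-specific variants, roughly (a sketch, not the verbatim macro):
+ *
+ *   #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+ *     cpi->fn_ptr[BT].sdf = SDF;                                           \
+ *     ...                                                                  \
+ *     cpi->fn_ptr[BT].sdx4df = SDX4DF;
+ */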
+
+static void realloc_segmentation_maps(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ // Create the encoder segmentation map and set all entries to 0
+ aom_free(cpi->segmentation_map);
+ CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+ aom_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+ // Create a map used for cyclic background refresh.
+ if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
+ CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+ av1_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
+
+ // Create a map used to mark inactive areas.
+ aom_free(cpi->active_map.map);
+ CHECK_MEM_ERROR(cm, cpi->active_map.map,
+ aom_calloc(cm->mi_rows * cm->mi_cols, 1));
+}
+
+void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ if (cm->profile != oxcf->profile) cm->profile = oxcf->profile;
+ cm->bit_depth = oxcf->bit_depth;
+ cm->color_space = oxcf->color_space;
+ cm->color_range = oxcf->color_range;
+
+ if (cm->profile <= PROFILE_1)
+ assert(cm->bit_depth == AOM_BITS_8);
+ else
+ assert(cm->bit_depth > AOM_BITS_8);
+
+ cpi->oxcf = *oxcf;
+#if CONFIG_HIGHBITDEPTH
+ cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
+#endif // CONFIG_HIGHBITDEPTH
+#if CONFIG_GLOBAL_MOTION
+ cpi->td.mb.e_mbd.global_motion = cm->global_motion;
+#endif // CONFIG_GLOBAL_MOTION
+
+ if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
+ rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+ }
+
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 0;
+#endif // CONFIG_EXT_REFS
+
+ cm->refresh_frame_context =
+ (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_FORWARD
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
+
+#if CONFIG_PALETTE
+ cm->allow_screen_content_tools = (cpi->oxcf.content == AOM_CONTENT_SCREEN);
+ if (cm->allow_screen_content_tools) {
+ MACROBLOCK *x = &cpi->td.mb;
+ if (x->palette_buffer == 0) {
+ CHECK_MEM_ERROR(cm, x->palette_buffer,
+ aom_memalign(16, sizeof(*x->palette_buffer)));
+ }
+    // Reallocate the pc_tree, as its contents depend on
+    // the state of cm->allow_screen_content_tools.
+ av1_free_pc_tree(&cpi->td);
+ av1_setup_pc_tree(&cpi->common, &cpi->td);
+ }
+#endif // CONFIG_PALETTE
+
+ av1_reset_segment_features(cm);
+ av1_set_high_precision_mv(cpi, 0);
+
+ set_rc_buffer_sizes(rc, &cpi->oxcf);
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = AOMMIN(rc->buffer_level, rc->maximum_buffer_size);
+
+  // Set up the frame rate and related rate control parameter values.
+ av1_new_framerate(cpi, cpi->framerate);
+
+ // Set absolute upper and lower quality limits
+ rc->worst_quality = cpi->oxcf.worst_allowed_q;
+ rc->best_quality = cpi->oxcf.best_allowed_q;
+
+ cm->interp_filter = cpi->sf.default_interp_filter;
+
+ if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
+ cm->render_width = cpi->oxcf.render_width;
+ cm->render_height = cpi->oxcf.render_height;
+ } else {
+ cm->render_width = cpi->oxcf.width;
+ cm->render_height = cpi->oxcf.height;
+ }
+ cm->width = cpi->oxcf.width;
+ cm->height = cpi->oxcf.height;
+
+ if (cpi->initial_width) {
+ if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) {
+ av1_free_context_buffers(cm);
+ av1_alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ }
+ }
+ update_frame_size(cpi);
+
+ cpi->alt_ref_source = NULL;
+ rc->is_src_frame_alt_ref = 0;
+
+#if CONFIG_EXT_REFS
+ rc->is_bwd_ref_frame = 0;
+ rc->is_last_bipred_frame = 0;
+ rc->is_bipred_frame = 0;
+#endif // CONFIG_EXT_REFS
+
+#if 0
+ // Experimental RD Code
+ cpi->frame_distortion = 0;
+ cpi->last_frame_distortion = 0;
+#endif
+
+ set_tile_info(cpi);
+
+ cpi->ext_refresh_frame_flags_pending = 0;
+ cpi->ext_refresh_frame_context_pending = 0;
+
+#if CONFIG_HIGHBITDEPTH
+ highbd_set_var_fns(cpi);
+#endif
+
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ cpi->common.ans_window_size_log2 = cpi->oxcf.ans_window_size_log2;
+ if (cpi->buf_ans.size != (1 << cpi->common.ans_window_size_log2)) {
+ aom_buf_ans_free(&cpi->buf_ans);
+ aom_buf_ans_alloc(&cpi->buf_ans, &cpi->common.error,
+ 1 << cpi->common.ans_window_size_log2);
+ }
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+}
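+
+/* A minimal usage sketch (hypothetical caller; an AV1EncoderConfig would
+ * normally be populated by the aom_codec_enc_* layer in av1_cx_iface.c):
+ *
+ *   AV1_COMP *cpi = av1_create_compressor(&oxcf, pool);
+ *   // ... encode frames ...
+ *   oxcf.worst_allowed_q = 48;      // tighten the allowed quality range
+ *   av1_change_config(cpi, &oxcf);  // re-derives rc limits, buffers, fn ptrs
+ */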
+
+#ifndef M_LOG2_E
+#define M_LOG2_E 0.693147180559945309417
+#endif
+#define log2f(x) (log(x) / (float)M_LOG2_E)
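+// Note: the value defined above is ln(2) (i.e. M_LN2 = 0.6931...), not
+// log2(e) (= 1.4427...) as the macro name suggests; log(x) / ln(2) does
+// nevertheless compute log2(x) as intended.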
+
+#if !CONFIG_REF_MV
+static void cal_nmvjointsadcost(int *mvjointsadcost) {
+ mvjointsadcost[0] = 600;
+ mvjointsadcost[1] = 300;
+ mvjointsadcost[2] = 300;
+ mvjointsadcost[3] = 300;
+}
+#endif
+
+static void cal_nmvsadcosts(int *mvsadcost[2]) {
+ int i = 1;
+
+ mvsadcost[0][0] = 0;
+ mvsadcost[1][0] = 0;
+
+ do {
+ double z = 256 * (2 * (log2f(8 * i) + .6));
+ mvsadcost[0][i] = (int)z;
+ mvsadcost[1][i] = (int)z;
+ mvsadcost[0][-i] = (int)z;
+ mvsadcost[1][-i] = (int)z;
+ } while (++i <= MV_MAX);
+}
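+
+/* Worked example of the curve above: for i == 1,
+ *   z = 256 * (2 * (log2(8 * 1) + 0.6)) = 256 * (2 * 3.6) = 1843.2,
+ * so mvsadcost[0][1] == mvsadcost[1][1] == 1843, and the cost grows
+ * logarithmically with motion vector magnitude thereafter. */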
+
+static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
+ int i = 1;
+
+ mvsadcost[0][0] = 0;
+ mvsadcost[1][0] = 0;
+
+ do {
+ double z = 256 * (2 * (log2f(8 * i) + .6));
+ mvsadcost[0][i] = (int)z;
+ mvsadcost[1][i] = (int)z;
+ mvsadcost[0][-i] = (int)z;
+ mvsadcost[1][-i] = (int)z;
+ } while (++i <= MV_MAX);
+}
+
+static INLINE void init_upsampled_ref_frame_bufs(AV1_COMP *cpi) {
+ int i;
+
+ for (i = 0; i < (REF_FRAMES + 1); ++i) {
+ cpi->upsampled_ref_bufs[i].ref_count = 0;
+ cpi->upsampled_ref_idx[i] = INVALID_IDX;
+ }
+}
+
+AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+ BufferPool *const pool) {
+ unsigned int i;
+ AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
+ AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+
+ if (!cm) return NULL;
+
+ av1_zero(*cpi);
+
+ if (setjmp(cm->error.jmp)) {
+ cm->error.setjmp = 0;
+ av1_remove_compressor(cpi);
+ return 0;
+ }
+
+ cm->error.setjmp = 1;
+ cm->alloc_mi = av1_enc_alloc_mi;
+ cm->free_mi = av1_enc_free_mi;
+ cm->setup_mi = av1_enc_setup_mi;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(cm, cm->frame_contexts,
+ (FRAME_CONTEXT *)aom_memalign(
+ 32, FRAME_CONTEXTS * sizeof(*cm->frame_contexts)));
+ memset(cm->fc, 0, sizeof(*cm->fc));
+ memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts));
+
+ cpi->resize_state = 0;
+ cpi->resize_avg_qp = 0;
+ cpi->resize_buffer_underflow = 0;
+ cpi->common.buffer_pool = pool;
+
+ init_config(cpi, oxcf);
+#if CONFIG_XIPHRC
+ cpi->od_rc.framerate = cpi->framerate;
+ cpi->od_rc.frame_width = cm->render_width;
+ cpi->od_rc.frame_height = cm->render_height;
+ cpi->od_rc.keyframe_rate = oxcf->key_freq;
+ cpi->od_rc.goldenframe_rate = FIXED_GF_INTERVAL;
+ cpi->od_rc.altref_rate = 25;
+ cpi->od_rc.firstpass_quant = 1;
+ cpi->od_rc.bit_depth = cm->bit_depth;
+ cpi->od_rc.minq = oxcf->best_allowed_q;
+ cpi->od_rc.maxq = oxcf->worst_allowed_q;
+ if (cpi->oxcf.rc_mode == AOM_CQ) cpi->od_rc.minq = cpi->od_rc.quality;
+ cpi->od_rc.quality = cpi->oxcf.rc_mode == AOM_Q ? oxcf->cq_level : -1;
+ cpi->od_rc.periodic_boosts = oxcf->frame_periodic_boost;
+ od_enc_rc_init(&cpi->od_rc,
+ cpi->oxcf.rc_mode == AOM_Q ? -1 : oxcf->target_bandwidth,
+ oxcf->maximum_buffer_size_ms);
+#else
+ av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+#endif
+
+ cm->current_video_frame = 0;
+ cpi->partition_search_skippable_frame = 0;
+ cpi->tile_data = NULL;
+ cpi->last_show_frame_buf_idx = INVALID_IDX;
+
+ realloc_segmentation_maps(cpi);
+
+#if CONFIG_REF_MV
+ for (i = 0; i < NMV_CONTEXTS; ++i) {
+ memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
+ memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
+ }
+#endif
+
+ memset(cpi->nmvcosts, 0, sizeof(cpi->nmvcosts));
+ memset(cpi->nmvcosts_hp, 0, sizeof(cpi->nmvcosts_hp));
+ memset(cpi->nmvsadcosts, 0, sizeof(cpi->nmvsadcosts));
+ memset(cpi->nmvsadcosts_hp, 0, sizeof(cpi->nmvsadcosts_hp));
+
+ for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
+ i++) {
+ CHECK_MEM_ERROR(
+ cm, cpi->mbgraph_stats[i].mb_stats,
+ aom_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
+ }
+
+#if CONFIG_FP_MB_STATS
+ cpi->use_fp_mb_stats = 0;
+ if (cpi->use_fp_mb_stats) {
+    // A placeholder used to store the first-pass mb stats during the first
+    // pass.
+ CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
+ aom_calloc(cm->MBs * sizeof(uint8_t), 1));
+ } else {
+ cpi->twopass.frame_mb_stats_buf = NULL;
+ }
+#endif
+
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->multi_arf_last_grp_enabled = 0;
+
+ cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+ cpi->b_calculate_blockiness = 1;
+ cpi->b_calculate_consistency = 1;
+ cpi->total_inconsistency = 0;
+ cpi->psnr.worst = 100.0;
+ cpi->worst_ssim = 100.0;
+
+ cpi->count = 0;
+ cpi->bytes = 0;
+
+ if (cpi->b_calculate_psnr) {
+ cpi->total_sq_error = 0;
+ cpi->total_samples = 0;
+ cpi->tot_recode_hits = 0;
+ cpi->summed_quality = 0;
+ cpi->summed_weights = 0;
+ }
+
+ cpi->fastssim.worst = 100.0;
+ cpi->psnrhvs.worst = 100.0;
+
+ if (cpi->b_calculate_blockiness) {
+ cpi->total_blockiness = 0;
+ cpi->worst_blockiness = 0.0;
+ }
+
+ if (cpi->b_calculate_consistency) {
+ CHECK_MEM_ERROR(cm, cpi->ssim_vars,
+ aom_malloc(sizeof(*cpi->ssim_vars) * 4 *
+ cpi->common.mi_rows * cpi->common.mi_cols));
+ cpi->worst_consistency = 100.0;
+ }
+#endif
+#if CONFIG_ENTROPY_STATS
+ av1_zero(aggregate_fc);
+#endif // CONFIG_ENTROPY_STATS
+
+ cpi->first_time_stamp_ever = INT64_MAX;
+
+#if CONFIG_REF_MV
+ for (i = 0; i < NMV_CONTEXTS; ++i) {
+ cpi->td.mb.nmvcost[i][0] = &cpi->nmv_costs[i][0][MV_MAX];
+ cpi->td.mb.nmvcost[i][1] = &cpi->nmv_costs[i][1][MV_MAX];
+ cpi->td.mb.nmvcost_hp[i][0] = &cpi->nmv_costs_hp[i][0][MV_MAX];
+ cpi->td.mb.nmvcost_hp[i][1] = &cpi->nmv_costs_hp[i][1][MV_MAX];
+ }
+#else
+ cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
+ cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+ cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+ cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+ cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+#endif
+ cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+ cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
+ cal_nmvsadcosts(cpi->td.mb.nmvsadcost);
+
+ cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+ cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
+ cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp);
+
+#ifdef OUTPUT_YUV_SKINMAP
+ yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+ yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+#if 0
+ framepsnr = fopen("framepsnr.stt", "a");
+ kf_list = fopen("kf_list.stt", "w");
+#endif
+
+#if CONFIG_XIPHRC
+ if (oxcf->pass == 2) {
+ cpi->od_rc.twopass_allframes_buf = oxcf->two_pass_stats_in.buf;
+ cpi->od_rc.twopass_allframes_buf_size = oxcf->two_pass_stats_in.sz;
+ }
+#else
+ if (oxcf->pass == 1) {
+ av1_init_first_pass(cpi);
+ } else if (oxcf->pass == 2) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ const size_t psz = cpi->common.MBs * sizeof(uint8_t);
+ const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
+
+ cpi->twopass.firstpass_mb_stats.mb_stats_start =
+ oxcf->firstpass_mb_stats_in.buf;
+ cpi->twopass.firstpass_mb_stats.mb_stats_end =
+ cpi->twopass.firstpass_mb_stats.mb_stats_start +
+ (ps - 1) * cpi->common.MBs * sizeof(uint8_t);
+ }
+#endif
+
+ cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+ cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+ cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
+
+ av1_init_second_pass(cpi);
+ }
+#endif
+
+ init_upsampled_ref_frame_bufs(cpi);
+
+ av1_set_speed_features_framesize_independent(cpi);
+ av1_set_speed_features_framesize_dependent(cpi);
+
+ // Allocate memory to store variances for a frame.
+ CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+ aom_calloc(cm->MBs, sizeof(*cpi->source_diff_var)));
+ cpi->source_var_thresh = 0;
+ cpi->frames_till_next_var_check = 0;
+
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx3f = SDX3F; \
+ cpi->fn_ptr[BT].sdx8f = SDX8F; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF;
+
+#if CONFIG_EXT_PARTITION
+ BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
+ aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
+ aom_sad128x128x3, aom_sad128x128x8, aom_sad128x128x4d)
+
+ BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
+ aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, NULL,
+ NULL, aom_sad128x64x4d)
+
+ BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
+ aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, NULL,
+ NULL, aom_sad64x128x4d)
+#endif // CONFIG_EXT_PARTITION
+
+ BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
+ aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, NULL, NULL,
+ aom_sad32x16x4d)
+
+ BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
+ aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, NULL, NULL,
+ aom_sad16x32x4d)
+
+ BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
+ aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, NULL, NULL,
+ aom_sad64x32x4d)
+
+ BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
+ aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, NULL, NULL,
+ aom_sad32x64x4d)
+
+ BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
+ aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
+ aom_sad32x32x3, aom_sad32x32x8, aom_sad32x32x4d)
+
+ BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
+ aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
+ aom_sad64x64x3, aom_sad64x64x8, aom_sad64x64x4d)
+
+ BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
+ aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
+ aom_sad16x16x3, aom_sad16x16x8, aom_sad16x16x4d)
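+
+  /* As the BFP macro above shows, each line expands to plain assignments;
+   * the BLOCK_16X16 entry, for example, becomes:
+   *   cpi->fn_ptr[BLOCK_16X16].sdf    = aom_sad16x16;
+   *   cpi->fn_ptr[BLOCK_16X16].sdaf   = aom_sad16x16_avg;
+   *   cpi->fn_ptr[BLOCK_16X16].vf     = aom_variance16x16;
+   *   ...
+   *   cpi->fn_ptr[BLOCK_16X16].sdx4df = aom_sad16x16x4d;
+   */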
+
+ BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
+ aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8, aom_sad16x8x3,
+ aom_sad16x8x8, aom_sad16x8x4d)
+
+ BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
+ aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, aom_sad8x16x3,
+ aom_sad8x16x8, aom_sad8x16x4d)
+
+ BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
+ aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x3,
+ aom_sad8x8x8, aom_sad8x8x4d)
+
+ BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
+ aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, NULL,
+ aom_sad8x4x8, aom_sad8x4x4d)
+
+ BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
+ aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, NULL,
+ aom_sad4x8x8, aom_sad4x8x4d)
+
+ BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
+ aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x3,
+ aom_sad4x4x8, aom_sad4x4x4d)
+
+#if CONFIG_CB4X4
+ BFP(BLOCK_2X2, NULL, NULL, aom_variance2x2, NULL, NULL, NULL, NULL, NULL)
+ BFP(BLOCK_2X4, NULL, NULL, aom_variance2x4, NULL, NULL, NULL, NULL, NULL)
+ BFP(BLOCK_4X2, NULL, NULL, aom_variance4x2, NULL, NULL, NULL, NULL, NULL)
+#endif
+
+#if CONFIG_MOTION_VAR
+#define OBFP(BT, OSDF, OVF, OSVF) \
+ cpi->fn_ptr[BT].osdf = OSDF; \
+ cpi->fn_ptr[BT].ovf = OVF; \
+ cpi->fn_ptr[BT].osvf = OSVF;
+
+#if CONFIG_EXT_PARTITION
+ OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
+ aom_obmc_sub_pixel_variance128x128)
+ OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
+ aom_obmc_sub_pixel_variance128x64)
+ OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
+ aom_obmc_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
+ aom_obmc_sub_pixel_variance64x64)
+ OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
+ aom_obmc_sub_pixel_variance64x32)
+ OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64,
+ aom_obmc_sub_pixel_variance32x64)
+ OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32,
+ aom_obmc_sub_pixel_variance32x32)
+ OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16,
+ aom_obmc_sub_pixel_variance32x16)
+ OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32,
+ aom_obmc_sub_pixel_variance16x32)
+ OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16,
+ aom_obmc_sub_pixel_variance16x16)
+ OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8,
+ aom_obmc_sub_pixel_variance16x8)
+ OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16,
+ aom_obmc_sub_pixel_variance8x16)
+ OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8,
+ aom_obmc_sub_pixel_variance8x8)
+ OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8,
+ aom_obmc_sub_pixel_variance4x8)
+ OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4,
+ aom_obmc_sub_pixel_variance8x4)
+ OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4,
+ aom_obmc_sub_pixel_variance4x4)
+#endif // CONFIG_MOTION_VAR
+
+#if CONFIG_EXT_INTER
+#define MBFP(BT, MSDF, MVF, MSVF) \
+ cpi->fn_ptr[BT].msdf = MSDF; \
+ cpi->fn_ptr[BT].mvf = MVF; \
+ cpi->fn_ptr[BT].msvf = MSVF;
+
+#if CONFIG_EXT_PARTITION
+ MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_variance128x128,
+ aom_masked_sub_pixel_variance128x128)
+ MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_variance128x64,
+ aom_masked_sub_pixel_variance128x64)
+ MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_variance64x128,
+ aom_masked_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_variance64x64,
+ aom_masked_sub_pixel_variance64x64)
+ MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_variance64x32,
+ aom_masked_sub_pixel_variance64x32)
+ MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_variance32x64,
+ aom_masked_sub_pixel_variance32x64)
+ MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_variance32x32,
+ aom_masked_sub_pixel_variance32x32)
+ MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_variance32x16,
+ aom_masked_sub_pixel_variance32x16)
+ MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_variance16x32,
+ aom_masked_sub_pixel_variance16x32)
+ MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_variance16x16,
+ aom_masked_sub_pixel_variance16x16)
+ MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_variance16x8,
+ aom_masked_sub_pixel_variance16x8)
+ MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_variance8x16,
+ aom_masked_sub_pixel_variance8x16)
+ MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_variance8x8,
+ aom_masked_sub_pixel_variance8x8)
+ MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_variance4x8,
+ aom_masked_sub_pixel_variance4x8)
+ MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_variance8x4,
+ aom_masked_sub_pixel_variance8x4)
+ MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_variance4x4,
+ aom_masked_sub_pixel_variance4x4)
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_HIGHBITDEPTH
+ highbd_set_var_fns(cpi);
+#endif
+
+  /* av1_init_quantizer() is first called here. Add a check in
+   * av1_frame_init_quantizer() so that av1_init_quantizer() is only
+   * called later when needed. This will avoid unnecessary calls to
+   * av1_init_quantizer() for every frame.
+   */
+ av1_init_quantizer(cpi);
+#if CONFIG_AOM_QM
+ aom_qm_init(cm);
+#endif
+
+ av1_loop_filter_init(cm);
+#if CONFIG_LOOP_RESTORATION
+ av1_loop_restoration_precal();
+#endif // CONFIG_LOOP_RESTORATION
+
+ cm->error.setjmp = 0;
+
+ return cpi;
+}
+
+#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
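+
+/* SNPRINT/SNPRINT2 append in place at the current end of a fixed buffer;
+ * e.g. (illustrative):
+ *   char buf[512] = "Bitrate";
+ *   SNPRINT2(buf, "\t%7.3f", 1.0);  // buf now holds "Bitrate\t  1.000"
+ */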
+
+void av1_remove_compressor(AV1_COMP *cpi) {
+ AV1_COMMON *cm;
+ unsigned int i;
+ int t;
+
+ if (!cpi) return;
+
+ cm = &cpi->common;
+ if (cm->current_video_frame > 0) {
+#if CONFIG_ENTROPY_STATS
+ if (cpi->oxcf.pass != 1) {
+ fprintf(stderr, "Writing counts.stt\n");
+ FILE *f = fopen("counts.stt", "wb");
+ fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
+ fclose(f);
+ }
+#endif // CONFIG_ENTROPY_STATS
+#if CONFIG_INTERNAL_STATS
+ aom_clear_system_state();
+
+ if (cpi->oxcf.pass != 1) {
+ char headings[512] = { 0 };
+ char results[512] = { 0 };
+ FILE *f = fopen("opsnr.stt", "a");
+ double time_encoded =
+ (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+ 10000000.000;
+ double total_encode_time =
+ (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
+ const double dr =
+ (double)cpi->bytes * (double)8 / (double)1000 / time_encoded;
+ const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+ const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+ const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+ if (cpi->b_calculate_psnr) {
+ const double total_psnr = aom_sse_to_psnr(
+ (double)cpi->total_samples, peak, (double)cpi->total_sq_error);
+ const double total_ssim =
+ 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+ snprintf(headings, sizeof(headings),
+ "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+ "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+ "WstPsnr\tWstSsim\tWstFast\tWstHVS");
+ snprintf(results, sizeof(results),
+ "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f",
+ dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
+ cpi->psnr.stat[ALL] / cpi->count, total_psnr, total_ssim,
+ total_ssim, cpi->fastssim.stat[ALL] / cpi->count,
+ cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst,
+ cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst);
+
+ if (cpi->b_calculate_blockiness) {
+ SNPRINT(headings, "\t Block\tWstBlck");
+ SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+ }
+
+ if (cpi->b_calculate_consistency) {
+ double consistency =
+ aom_sse_to_psnr((double)cpi->total_samples, peak,
+ (double)cpi->total_inconsistency);
+
+ SNPRINT(headings, "\tConsist\tWstCons");
+ SNPRINT2(results, "\t%7.3f", consistency);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+ }
+ fprintf(f, "%s\t Time\tRcErr\tAbsErr\n", headings);
+ fprintf(f, "%s\t%8.0f\t%7.2f\t%7.2f\n", results, total_encode_time,
+ rate_err, fabs(rate_err));
+ }
+
+ fclose(f);
+ }
+
+#endif
+
+#if 0
+ {
+ printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+ printf("\n_frames recive_data encod_mb_row compress_frame Total\n");
+ printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame,
+ cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000,
+ cpi->time_compress_data / 1000,
+ (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+ }
+#endif
+ }
+
+ for (t = 0; t < cpi->num_workers; ++t) {
+ AVxWorker *const worker = &cpi->workers[t];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+ // Deallocate allocated threads.
+ aom_get_worker_interface()->end(worker);
+
+ // Deallocate allocated thread data.
+ if (t < cpi->num_workers - 1) {
+#if CONFIG_PALETTE
+ if (cpi->common.allow_screen_content_tools)
+ aom_free(thread_data->td->mb.palette_buffer);
+#endif // CONFIG_PALETTE
+ aom_free(thread_data->td->counts);
+ av1_free_pc_tree(thread_data->td);
+ av1_free_var_tree(thread_data->td);
+ aom_free(thread_data->td);
+ }
+ }
+ aom_free(cpi->tile_thr_data);
+ aom_free(cpi->workers);
+
+ if (cpi->num_workers > 1) av1_loop_filter_dealloc(&cpi->lf_row_sync);
+
+ dealloc_compressor_data(cpi);
+
+ for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]);
+ ++i) {
+ aom_free(cpi->mbgraph_stats[i].mb_stats);
+ }
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ aom_free(cpi->twopass.frame_mb_stats_buf);
+ cpi->twopass.frame_mb_stats_buf = NULL;
+ }
+#endif
+#if CONFIG_INTERNAL_STATS
+ aom_free(cpi->ssim_vars);
+ cpi->ssim_vars = NULL;
+#endif // CONFIG_INTERNAL_STATS
+
+ av1_remove_common(cm);
+ av1_free_ref_frame_buffers(cm->buffer_pool);
+ aom_free(cpi);
+
+#ifdef OUTPUT_YUV_SKINMAP
+ fclose(yuv_skinmap_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+ fclose(yuv_rec_file);
+#endif
+
+#if 0
+
+ if (keyfile)
+ fclose(keyfile);
+
+ if (framepsnr)
+ fclose(framepsnr);
+
+ if (kf_list)
+ fclose(kf_list);
+
+#endif
+}
+
+static void generate_psnr_packet(AV1_COMP *cpi) {
+ struct aom_codec_cx_pkt pkt;
+ int i;
+ PSNR_STATS psnr;
+#if CONFIG_HIGHBITDEPTH
+ aom_calc_highbd_psnr(cpi->source, cpi->common.frame_to_show, &psnr,
+ cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+#else
+ aom_calc_psnr(cpi->source, cpi->common.frame_to_show, &psnr);
+#endif
+
+ for (i = 0; i < 4; ++i) {
+ pkt.data.psnr.samples[i] = psnr.samples[i];
+ pkt.data.psnr.sse[i] = psnr.sse[i];
+ pkt.data.psnr.psnr[i] = psnr.psnr[i];
+ }
+ pkt.kind = AOM_CODEC_PSNR_PKT;
+ aom_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) {
+ if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1;
+
+ cpi->ref_frame_flags = ref_frame_flags;
+ return 0;
+}
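+
+/* Example: INTER_REFS_PER_FRAME is presumably 6 in the CONFIG_EXT_REFS build
+ * of this snapshot (LAST, LAST2, LAST3, GOLDEN, BWDREF, ALTREF), so any
+ * flags value above 0x3F would be rejected:
+ *   av1_use_as_reference(cpi, AOM_LAST_FLAG | AOM_GOLD_FLAG);  // returns 0
+ *   av1_use_as_reference(cpi, 1 << INTER_REFS_PER_FRAME);      // returns -1
+ */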
+
+void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags) {
+ cpi->ext_refresh_golden_frame = (ref_frame_flags & AOM_GOLD_FLAG) != 0;
+ cpi->ext_refresh_alt_ref_frame = (ref_frame_flags & AOM_ALT_FLAG) != 0;
+ cpi->ext_refresh_last_frame = (ref_frame_flags & AOM_LAST_FLAG) != 0;
+ cpi->ext_refresh_frame_flags_pending = 1;
+}
+
+static YV12_BUFFER_CONFIG *get_av1_ref_frame_buffer(
+ AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag) {
+ MV_REFERENCE_FRAME ref_frame = NONE_FRAME;
+ if (ref_frame_flag == AOM_LAST_FLAG) ref_frame = LAST_FRAME;
+#if CONFIG_EXT_REFS
+ else if (ref_frame_flag == AOM_LAST2_FLAG)
+ ref_frame = LAST2_FRAME;
+ else if (ref_frame_flag == AOM_LAST3_FLAG)
+ ref_frame = LAST3_FRAME;
+#endif // CONFIG_EXT_REFS
+ else if (ref_frame_flag == AOM_GOLD_FLAG)
+ ref_frame = GOLDEN_FRAME;
+#if CONFIG_EXT_REFS
+ else if (ref_frame_flag == AOM_BWD_FLAG)
+ ref_frame = BWDREF_FRAME;
+#endif // CONFIG_EXT_REFS
+ else if (ref_frame_flag == AOM_ALT_FLAG)
+ ref_frame = ALTREF_FRAME;
+
+ return ref_frame == NONE_FRAME ? NULL : get_ref_frame_buffer(cpi, ref_frame);
+}
+
+int av1_copy_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd) {
+ YV12_BUFFER_CONFIG *cfg = get_av1_ref_frame_buffer(cpi, ref_frame_flag);
+ if (cfg) {
+ aom_yv12_copy_frame(cfg, sd);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int av1_set_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd) {
+ YV12_BUFFER_CONFIG *cfg = get_av1_ref_frame_buffer(cpi, ref_frame_flag);
+ if (cfg) {
+ aom_yv12_copy_frame(sd, cfg);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int av1_update_entropy(AV1_COMP *cpi, int update) {
+ cpi->ext_refresh_frame_context = update;
+ cpi->ext_refresh_frame_context_pending = 1;
+ return 0;
+}
+
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
+// The denoiser buffer is allocated as a YUV 440 buffer. This function writes
+// it out as YUV 420. We simply use the top-left pixels of the UV buffers,
+// since we do not denoise the UV channels at this time. If we ever implement
+// UV-channel denoising, we will have to modify this.
+void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+ uint8_t *src = s->y_buffer;
+ int h = s->y_height;
+
+ do {
+ fwrite(src, s->y_width, 1, f);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
+ } while (--h);
+}
+#endif
+
+#if CONFIG_EXT_REFS && !CONFIG_XIPHRC
+static void check_show_existing_frame(AV1_COMP *cpi) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ AV1_COMMON *const cm = &cpi->common;
+ const FRAME_UPDATE_TYPE next_frame_update_type =
+ gf_group->update_type[gf_group->index];
+ const int which_arf = gf_group->arf_update_idx[gf_group->index];
+
+ if (cm->show_existing_frame == 1) {
+ cm->show_existing_frame = 0;
+ } else if (cpi->rc.is_last_bipred_frame) {
+    // NOTE(zoeliu): If the current frame is a last bi-predictive frame, we
+    //               next need to show the BWDREF_FRAME, which is pointed to
+    //               by lst_fb_idxes[0] after the reference frame buffer
+    //               update.
+ cpi->rc.is_last_bipred_frame = 0;
+ cm->show_existing_frame = 1;
+ cpi->existing_fb_idx_to_show = cpi->lst_fb_idxes[0];
+ } else if (cpi->is_arf_filter_off[which_arf] &&
+ (next_frame_update_type == OVERLAY_UPDATE ||
+ next_frame_update_type == INTNL_OVERLAY_UPDATE)) {
+ // Other parameters related to OVERLAY_UPDATE will be taken care of
+ // in av1_rc_get_second_pass_params(cpi)
+ cm->show_existing_frame = 1;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ cpi->existing_fb_idx_to_show = cpi->alt_fb_idx;
+ cpi->is_arf_filter_off[which_arf] = 0;
+ }
+ cpi->rc.is_src_frame_ext_arf = 0;
+}
+#endif // CONFIG_EXT_REFS && !CONFIG_XIPHRC
+
+#ifdef OUTPUT_YUV_REC
+void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+ uint8_t *src = s->y_buffer;
+ int h = cm->height;
+
+#if CONFIG_HIGHBITDEPTH
+ if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+
+ do {
+ fwrite(src16, s->y_width, 2, yuv_rec_file);
+ src16 += s->y_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_rec_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+}
+#endif // OUTPUT_YUV_REC
+
+#if CONFIG_HIGHBITDEPTH
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int bd) {
+#else
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+#endif // CONFIG_HIGHBITDEPTH
+ // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
+ int i;
+ const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+ src->v_buffer };
+ const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+ const int src_widths[3] = { src->y_crop_width, src->uv_crop_width,
+ src->uv_crop_width };
+ const int src_heights[3] = { src->y_crop_height, src->uv_crop_height,
+ src->uv_crop_height };
+ uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+ const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+ const int dst_widths[3] = { dst->y_crop_width, dst->uv_crop_width,
+ dst->uv_crop_width };
+ const int dst_heights[3] = { dst->y_crop_height, dst->uv_crop_height,
+ dst->uv_crop_height };
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_HIGHBITDEPTH
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_resize_plane(srcs[i], src_heights[i], src_widths[i],
+ src_strides[i], dsts[i], dst_heights[i],
+ dst_widths[i], dst_strides[i], bd);
+ } else {
+ av1_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+ dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+ }
+#else
+ av1_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+ dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ aom_extend_frame_borders(dst);
+}
+
+#if CONFIG_HIGHBITDEPTH
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int planes,
+ int bd) {
+#else
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int planes) {
+#endif // CONFIG_HIGHBITDEPTH
+ const int src_w = src->y_crop_width;
+ const int src_h = src->y_crop_height;
+ const int dst_w = dst->y_crop_width;
+ const int dst_h = dst->y_crop_height;
+ const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+ src->v_buffer };
+ const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+ uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+ const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+ const InterpFilterParams interp_filter_params =
+ av1_get_interp_filter_params(EIGHTTAP_REGULAR);
+ const int16_t *kernel = interp_filter_params.filter_ptr;
+ const int taps = interp_filter_params.taps;
+ int x, y, i;
+
+ assert(planes <= 3);
+ for (y = 0; y < dst_h; y += 16) {
+ for (x = 0; x < dst_w; x += 16) {
+ for (i = 0; i < planes; ++i) {
+ const int factor = (i == 0 || i == 3 ? 1 : 2);
+ const int x_q4 = x * (16 / factor) * src_w / dst_w;
+ const int y_q4 = y * (16 / factor) * src_h / dst_h;
+ const int src_stride = src_strides[i];
+ const int dst_stride = dst_strides[i];
+ const uint8_t *src_ptr = srcs[i] +
+ (y / factor) * src_h / dst_h * src_stride +
+ (x / factor) * src_w / dst_w;
+ uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+#if CONFIG_HIGHBITDEPTH
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
+ &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
+ 16 / factor, 16 / factor, bd);
+ } else {
+ aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
+ &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
+ &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
+ 16 / factor, 16 / factor);
+ }
+#else
+ aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
+ &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
+ &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
+ 16 / factor, 16 / factor);
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ }
+ }
+
+ if (planes == 1)
+ aom_extend_frame_borders_y(dst);
+ else
+ aom_extend_frame_borders(dst);
+}
+
+static int scale_down(AV1_COMP *cpi, int q) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int scale = 0;
+ assert(frame_is_kf_gf_arf(cpi));
+
+ if (rc->frame_size_selector == UNSCALED &&
+ q >= rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) {
+ const int max_size_thresh =
+ (int)(rate_thresh_mult[SCALE_STEP1] *
+ AOMMAX(rc->this_frame_target, rc->avg_frame_bandwidth));
+ scale = rc->projected_frame_size > max_size_thresh ? 1 : 0;
+ }
+ return scale;
+}
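+
+/* In other words: a kf/gf/arf frame coded at (or above) the top of its rate
+ * level's q range that still overshoots max(this_frame_target,
+ * avg_frame_bandwidth) by the SCALE_STEP1 multiplier requests a resolution
+ * step-down for the group. */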
+
+#if CONFIG_GLOBAL_MOTION
+#define GM_RECODE_LOOP_NUM4X4_FACTOR 192
+static int recode_loop_test_global_motion(AV1_COMP *cpi) {
+ int i;
+ int recode = 0;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+ AV1_COMMON *const cm = &cpi->common;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ if (cm->global_motion[i].wmtype != IDENTITY &&
+ rdc->global_motion_used[i] * GM_RECODE_LOOP_NUM4X4_FACTOR <
+ cpi->gmparams_cost[i]) {
+ set_default_warp_params(&cm->global_motion[i]);
+ cpi->gmparams_cost[i] = 0;
+#if CONFIG_REF_MV
+ recode = 1;
+#else
+ recode |= (rdc->global_motion_used[i] > 0);
+#endif
+ }
+ }
+ return recode;
+}
+#endif // CONFIG_GLOBAL_MOTION
+
+// Function to test for conditions that indicate we should loop
+// back and recode a frame.
+static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q,
+ int maxq, int minq) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
+ int force_recode = 0;
+
+ if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+ (cpi->sf.recode_loop == ALLOW_RECODE) ||
+ (frame_is_kfgfarf && (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+ if (frame_is_kfgfarf && (oxcf->resize_mode == RESIZE_DYNAMIC) &&
+ scale_down(cpi, q)) {
+ // Code this group at a lower resolution.
+ cpi->resize_pending = 1;
+ return 1;
+ }
+
+ // TODO(agrange) high_limit could be greater than the scale-down threshold.
+ if ((rc->projected_frame_size > high_limit && q < maxq) ||
+ (rc->projected_frame_size < low_limit && q > minq)) {
+ force_recode = 1;
+ } else if (cpi->oxcf.rc_mode == AOM_CQ) {
+      // Deal with frame undershoot: force a recode if we are below the
+      // automatically set cq level and the frame came in under 7/8 of its
+      // target size.
+ if (q > oxcf->cq_level &&
+ rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
+ force_recode = 1;
+ }
+ }
+ }
+ return force_recode;
+}
+
+static INLINE int get_free_upsampled_ref_buf(EncRefCntBuffer *ubufs) {
+ int i;
+
+ for (i = 0; i < (REF_FRAMES + 1); i++) {
+ if (!ubufs[i].ref_count) {
+ return i;
+ }
+ }
+ return INVALID_IDX;
+}
+
+// Up-sample 1 reference frame.
+static INLINE int upsample_ref_frame(AV1_COMP *cpi,
+ const YV12_BUFFER_CONFIG *const ref) {
+ AV1_COMMON *const cm = &cpi->common;
+ EncRefCntBuffer *ubufs = cpi->upsampled_ref_bufs;
+ int new_uidx = get_free_upsampled_ref_buf(ubufs);
+
+ if (new_uidx == INVALID_IDX) {
+ return INVALID_IDX;
+ } else {
+ YV12_BUFFER_CONFIG *upsampled_ref = &ubufs[new_uidx].buf;
+
+    // Note: only the Y plane is used, so allocating the buffer for the Y
+    // plane alone would suffice.
+ if (upsampled_ref->buffer_alloc_sz < (ref->buffer_alloc_sz << 6))
+ if (aom_realloc_frame_buffer(upsampled_ref, (cm->width << 3),
+ (cm->height << 3), cm->subsampling_x,
+ cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ (AOM_BORDER_IN_PIXELS << 3),
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate up-sampled frame buffer");
+
+// Currently, only the Y plane is up-sampled; U and V are not used.
+#if CONFIG_HIGHBITDEPTH
+ scale_and_extend_frame(ref, upsampled_ref, 1, (int)cm->bit_depth);
+#else
+ scale_and_extend_frame(ref, upsampled_ref, 1);
+#endif
+ return new_uidx;
+ }
+}
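+
+/* Sizing note for the reallocation above: the up-sampled reference is 8x the
+ * source in each dimension (width << 3, height << 3), hence the 64x (<< 6)
+ * byte-size check against the source buffer. */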
+
+#define DUMP_REF_FRAME_IMAGES 0
+
+#if DUMP_REF_FRAME_IMAGES == 1
+static int dump_one_image(AV1_COMMON *cm,
+ const YV12_BUFFER_CONFIG *const ref_buf,
+ char *file_name) {
+ int h;
+ FILE *f_ref = NULL;
+
+ if (ref_buf == NULL) {
+ printf("Frame data buffer is NULL.\n");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ if ((f_ref = fopen(file_name, "wb")) == NULL) {
+ printf("Unable to open file %s to write.\n", file_name);
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ // --- Y ---
+ for (h = 0; h < cm->height; ++h) {
+ fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref);
+ }
+ // --- U ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+ // --- V ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+
+ fclose(f_ref);
+
+ return AOM_CODEC_OK;
+}
+
+static void dump_ref_frame_images(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ char file_name[256] = "";
+ snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv",
+ cm->current_video_frame, ref_frame);
+ dump_one_image(cm, get_ref_frame_buffer(cpi, ref_frame), file_name);
+ }
+}
+#endif // DUMP_REF_FRAME_IMAGES == 1
+
+#if CONFIG_EXT_REFS
+// This function is used to shift the virtual indices of last reference frames
+// as follows:
+// LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
+// when the LAST_FRAME is updated.
+static INLINE void shift_last_ref_frames(AV1_COMP *cpi) {
+ int ref_frame;
+ for (ref_frame = LAST_REF_FRAMES - 1; ref_frame > 0; --ref_frame) {
+ cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1];
+
+ // [0] is allocated to the current coded frame. The statistics for the
+ // reference frames start at [LAST_FRAME], i.e. [1].
+ if (!cpi->rc.is_src_frame_alt_ref) {
+ memcpy(cpi->interp_filter_selected[ref_frame + LAST_FRAME],
+ cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME],
+ sizeof(cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME]));
+ }
+ }
+}
+#endif // CONFIG_EXT_REFS
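+
+/* Illustration: with lst_fb_idxes == {3, 5, 7} the loop above leaves
+ * {3, 3, 5}; the caller then overwrites entry [0] (e.g. with bwd_fb_idx),
+ * completing LAST <- new, LAST2 <- old LAST, LAST3 <- old LAST2. */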
+
+void av1_update_reference_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ const int use_upsampled_ref = cpi->sf.use_upsampled_references;
+ int new_uidx = 0;
+
+  // NOTE: Save the new show-frame buffer index for --test-code=warn, i.e.,
+  //       to verify that there is no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
+
+ if (use_upsampled_ref) {
+#if CONFIG_EXT_REFS
+ if (cm->show_existing_frame) {
+ new_uidx = cpi->upsampled_ref_idx[cpi->existing_fb_idx_to_show];
+      // TODO(zoeliu): Once the following assertion is confirmed, remove it.
+ assert(cpi->upsampled_ref_bufs[new_uidx].ref_count > 0);
+ } else {
+#endif // CONFIG_EXT_REFS
+ // Up-sample the current encoded frame.
+ RefCntBuffer *bufs = pool->frame_bufs;
+ const YV12_BUFFER_CONFIG *const ref = &bufs[cm->new_fb_idx].buf;
+
+ new_uidx = upsample_ref_frame(cpi, ref);
+#if CONFIG_EXT_REFS
+ assert(new_uidx != INVALID_IDX);
+ }
+#endif // CONFIG_EXT_REFS
+ }
+ // At this point the new frame has been encoded.
+ // If any buffer copy / swapping is signaled it should be done here.
+ if (cm->frame_type == KEY_FRAME) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->new_fb_idx);
+#if CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
+ cm->new_fb_idx);
+#endif // CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
+ cm->new_fb_idx);
+
+ if (use_upsampled_ref) {
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+#if CONFIG_EXT_REFS
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx);
+#endif // CONFIG_EXT_REFS
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+ }
+ } else if (av1_preserve_existing_gf(cpi)) {
+    // We have decided to preserve the previously existing golden frame as our
+    // new ARF frame. However, in the short term, in
+    // av1_bitstream.c::get_refresh_mask(), we left it in the GF slot and, if
+    // we're updating the GF with the current decoded frame, we save it to the
+    // ARF slot instead.
+ // We now have to update the ARF with the current frame and swap gld_fb_idx
+ // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF
+ // slot and, if we're updating the GF, the current frame becomes the new GF.
+ int tmp;
+
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
+ cm->new_fb_idx);
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+
+ tmp = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->gld_fb_idx;
+ cpi->gld_fb_idx = tmp;
+
+#if CONFIG_EXT_REFS
+ // We need to modify the mapping accordingly
+ cpi->arf_map[0] = cpi->alt_fb_idx;
+#endif // CONFIG_EXT_REFS
+// TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
+// cpi->interp_filter_selected[GOLDEN_FRAME]?
+#if CONFIG_EXT_REFS
+ } else if (cpi->rc.is_last_bipred_frame) {
+ // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the LAST3_FRAME
+ // by updating the virtual indices. Note that the frame BWDREF_FRAME
+ // currently points to should be retired, and it must not be used until it
+ // has been refreshed.
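+ // E.g. with lst_fb_idxes = {a, b, c} and bwd_fb_idx = d on entry, this
+ // yields lst_fb_idxes = {d, a, b} and bwd_fb_idx = c.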
+ int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
+
+ shift_last_ref_frames(cpi);
+ cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx;
+ cpi->bwd_fb_idx = tmp;
+
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[BWDREF_FRAME],
+ sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
+ } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) {
+ // Deal with the special case of showing an existing internal ALTREF_FRAME:
+ // refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
+ // by updating the virtual indices.
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int which_arf = gf_group->arf_ref_idx[gf_group->index];
+ int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
+
+ shift_last_ref_frames(cpi);
+ cpi->lst_fb_idxes[0] = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = tmp;
+
+ // We need to modify the mapping accordingly
+ cpi->arf_map[which_arf] = cpi->alt_fb_idx;
+
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
+ sizeof(cpi->interp_filter_selected[ALTREF_FRAME + which_arf]));
+#endif // CONFIG_EXT_REFS
+ } else { /* For non key/golden frames */
+ if (cpi->refresh_alt_ref_frame) {
+ int arf_idx = cpi->alt_fb_idx;
+ int which_arf = 0;
+#if CONFIG_EXT_REFS
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ which_arf = gf_group->arf_update_idx[gf_group->index];
+ arf_idx = cpi->arf_map[which_arf];
+ }
+#else
+ if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ arf_idx = gf_group->arf_update_idx[gf_group->index];
+ }
+#endif // CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs, &cpi->upsampled_ref_idx[arf_idx],
+ new_uidx);
+
+ memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+
+ if (cpi->refresh_golden_frame) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->new_fb_idx);
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+
+#if !CONFIG_EXT_REFS
+ if (!cpi->rc.is_src_frame_alt_ref)
+#endif // !CONFIG_EXT_REFS
+ memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+
+#if CONFIG_EXT_REFS
+ if (cpi->refresh_bwd_ref_frame) {
+ if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) {
+ // We have swapped the virtual indices to allow bwd_ref_frame to use
+ // ALT0 as a reference frame. We need to swap them back.
+ // NOTE: The ALT_REFs are indexed in reverse, so ALT0 refers to the
+ // farthest ALT_REF from the first frame in the gf group.
+ int tmp = cpi->arf_map[0];
+ cpi->arf_map[0] = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->bwd_fb_idx;
+ cpi->bwd_fb_idx = tmp;
+ }
+
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
+ cm->new_fb_idx);
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx);
+
+ memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+#endif // CONFIG_EXT_REFS
+ }
+
+ if (cpi->refresh_last_frame) {
+#if CONFIG_EXT_REFS
+ // NOTE(zoeliu): We have two layers of mapping (1) from the per-frame
+ // reference to the reference frame buffer virtual index; and then (2) from
+ // the virtual index to the reference frame buffer physical index:
+ //
+ // LAST_FRAME, ..., LAST3_FRAME, ..., ALTREF_FRAME
+ // | | |
+ // v v v
+ // lst_fb_idxes[0], ..., lst_fb_idxes[2], ..., alt_fb_idx
+ // | | |
+ // v v v
+ // ref_frame_map[], ..., ref_frame_map[], ..., ref_frame_map[]
+ //
+ // When refresh_last_frame is set, the intent is to retire LAST3_FRAME,
+ // shift the other 2 LAST reference frames as follows:
+ // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
+ // and then have LAST_FRAME refreshed by the newly coded frame.
+ //
+ // To fulfill this, the decoder is notified to execute the following 2
+ // steps:
+ //
+ // (a) Change ref_frame_map[] so that the virtual index of LAST3_FRAME
+ // points to the newly coded frame, i.e.
+ // ref_frame_map[lst_fb_idxes[2]] => new_fb_idx;
+ //
+ // (b) Change the first-layer mapping so that LAST_FRAME maps to the
+ // original virtual index of LAST3_FRAME and the other mappings are
+ // shifted as follows:
+ // LAST_FRAME, LAST2_FRAME, LAST3_FRAME
+ // | | |
+ // v v v
+ // lst_fb_idxes[2], lst_fb_idxes[0], lst_fb_idxes[1]
+ int ref_frame;
+
+ if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) {
+ // We have swapped the virtual indices to use ALT0 as BWD_REF
+ // and we need to swap them back.
+ int tmp = cpi->arf_map[0];
+ cpi->arf_map[0] = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->bwd_fb_idx;
+ cpi->bwd_fb_idx = tmp;
+ }
+
+ if (cm->frame_type == KEY_FRAME) {
+ for (ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]],
+ cm->new_fb_idx);
+
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[ref_frame]],
+ new_uidx);
+ }
+ } else {
+ int tmp;
+
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]],
+ cm->new_fb_idx);
+
+ if (use_upsampled_ref)
+ uref_cnt_fb(
+ cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]],
+ new_uidx);
+
+ tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
+
+ shift_last_ref_frames(cpi);
+ cpi->lst_fb_idxes[0] = tmp;
+
+ assert(cm->show_existing_frame == 0);
+ // NOTE: Currently only LF_UPDATE and INTNL_OVERLAY_UPDATE frames may
+ // refresh the LAST_FRAME.
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+#else
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+ cm->new_fb_idx);
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
+ if (!cpi->rc.is_src_frame_alt_ref) {
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+#endif // CONFIG_EXT_REFS
+ }
+
+#if DUMP_REF_FRAME_IMAGES == 1
+ // Dump out all reference frame images.
+ dump_ref_frame_images(cpi);
+#endif // DUMP_REF_FRAME_IMAGES
+}
+
+static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+ struct loopfilter *lf = &cm->lf;
+ if (is_lossless_requested(&cpi->oxcf)) {
+ lf->filter_level = 0;
+ } else {
+ struct aom_usec_timer timer;
+
+ aom_clear_system_state();
+
+ aom_usec_timer_start(&timer);
+
+ av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_pick);
+
+ aom_usec_timer_mark(&timer);
+ cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer);
+ }
+
+ if (lf->filter_level > 0) {
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
+ av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+#else
+ if (cpi->num_workers > 1)
+ av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
+ lf->filter_level, 0, 0, cpi->workers,
+ cpi->num_workers, &cpi->lf_row_sync);
+ else
+ av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+#endif
+ }
+#if CONFIG_CDEF
+ if (is_lossless_requested(&cpi->oxcf)) {
+ cm->cdef_bits = 0;
+ cm->cdef_strengths[0] = 0;
+ cm->nb_cdef_strengths = 1;
+ } else {
+ // Search for the best CDEF strengths for this frame
+ av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd);
+
+ // Apply the filter
+ av1_cdef_frame(cm->frame_to_show, cm, xd);
+ }
+#endif
+#if CONFIG_LOOP_RESTORATION
+ av1_pick_filter_restoration(cpi->source, cpi, cpi->sf.lpf_pick);
+ if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+ av1_loop_restoration_frame(cm->frame_to_show, cm, cm->rst_info, 7, 0, NULL);
+ }
+#endif // CONFIG_LOOP_RESTORATION
+ aom_extend_frame_inner_borders(cm->frame_to_show);
+}
+
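+// (Re)allocate the motion vector storage attached to the given frame buffer
+// whenever it is missing or smaller than the current mi_rows x mi_cols grid.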
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) {
+ RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+ if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
+ new_fb_ptr->mi_cols < cm->mi_cols) {
+ aom_free(new_fb_ptr->mvs);
+ CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
+ (MV_REF *)aom_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(*new_fb_ptr->mvs)));
+ new_fb_ptr->mi_rows = cm->mi_rows;
+ new_fb_ptr->mi_cols = cm->mi_cols;
+ }
+}
+
+void av1_scale_references(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+ const AOM_REFFRAME ref_mask[INTER_REFS_PER_FRAME] = {
+ AOM_LAST_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_GOLD_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_BWD_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_ALT_FLAG
+ };
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ // ref_mask[] is 0-based, so convert from MV_REFERENCE_FRAME (subtract 1).
+ if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+ BufferPool *const pool = cm->buffer_pool;
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_buffer(cpi, ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ continue;
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ RefCntBuffer *new_fb_ptr = NULL;
+ int force_scaling = 0;
+ int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
+ if (new_fb == INVALID_IDX) {
+ new_fb = get_free_fb(cm);
+ force_scaling = 1;
+ }
+ if (new_fb == INVALID_IDX) return;
+ new_fb_ptr = &pool->frame_bufs[new_fb];
+ if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
+ new_fb_ptr->buf.y_crop_height != cm->height) {
+ if (aom_realloc_frame_buffer(
+ &new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x,
+ cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE,
+ (int)cm->bit_depth);
+ cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+ alloc_frame_mvs(cm, new_fb);
+ }
+#else
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ RefCntBuffer *new_fb_ptr = NULL;
+ int force_scaling = 0;
+ int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
+ if (new_fb == INVALID_IDX) {
+ new_fb = get_free_fb(cm);
+ force_scaling = 1;
+ }
+ if (new_fb == INVALID_IDX) return;
+ new_fb_ptr = &pool->frame_bufs[new_fb];
+ if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
+ new_fb_ptr->buf.y_crop_height != cm->height) {
+ if (aom_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE);
+ cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+ alloc_frame_mvs(cm, new_fb);
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ if (cpi->sf.use_upsampled_references &&
+ (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
+ new_fb_ptr->buf.y_crop_height != cm->height)) {
+ const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+ EncRefCntBuffer *ubuf =
+ &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[map_idx]];
+
+ if (aom_realloc_frame_buffer(&ubuf->buf, (cm->width << 3),
+ (cm->height << 3), cm->subsampling_x,
+ cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ (AOM_BORDER_IN_PIXELS << 3),
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate up-sampled frame buffer");
+#if CONFIG_HIGHBITDEPTH
+ scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1,
+ (int)cm->bit_depth);
+#else
+ scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1);
+#endif
+ }
+ } else {
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
+ buf->buf.y_crop_width = ref->y_crop_width;
+ buf->buf.y_crop_height = ref->y_crop_height;
+ cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+ ++buf->ref_count;
+ }
+ } else {
+ if (cpi->oxcf.pass != 0) cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ }
+ }
+}
+
+static void release_scaled_references(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ int i;
+ if (cpi->oxcf.pass == 0) {
+ // Only release scaled references under certain conditions: if the
+ // reference will be updated, or if the scaled reference has the same
+ // resolution as the unscaled one.
+ int refresh[INTER_REFS_PER_FRAME];
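+ // refresh[] is indexed by (ref_frame - LAST_FRAME); under CONFIG_EXT_REFS
+ // the order is LAST, LAST2, LAST3, GOLDEN, BWDREF, ALTREF.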
+ refresh[0] = (cpi->refresh_last_frame) ? 1 : 0;
+#if CONFIG_EXT_REFS
+ refresh[1] = refresh[2] = 0;
+ refresh[3] = (cpi->refresh_golden_frame) ? 1 : 0;
+ refresh[4] = (cpi->refresh_bwd_ref_frame) ? 1 : 0;
+ refresh[5] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+#else
+ refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0;
+ refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+#endif // CONFIG_EXT_REFS
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ const int idx = cpi->scaled_ref_idx[i - 1];
+ RefCntBuffer *const buf =
+ idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL;
+ const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i);
+ if (buf != NULL &&
+ (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width &&
+ buf->buf.y_crop_height == ref->y_crop_height))) {
+ --buf->ref_count;
+ cpi->scaled_ref_idx[i - 1] = INVALID_IDX;
+ }
+ }
+ } else {
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) {
+ const int idx = cpi->scaled_ref_idx[i];
+ RefCntBuffer *const buf =
+ idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL;
+ if (buf != NULL) {
+ --buf->ref_count;
+ cpi->scaled_ref_idx[i] = INVALID_IDX;
+ }
+ }
+ }
+}
+
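+// Collapse a full per-token count distribution into the model distribution:
+// ZERO, ONE and TWO token counts are kept as-is, all higher-value tokens are
+// folded into TWO_TOKEN, and EOB is mapped to EOB_MODEL_TOKEN.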
+static void full_to_model_count(unsigned int *model_count,
+ unsigned int *full_count) {
+ int n;
+ model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
+ model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+ model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
+ for (n = THREE_TOKEN; n < EOB_TOKEN; ++n)
+ model_count[TWO_TOKEN] += full_count[n];
+ model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
+}
+
+void av1_full_to_model_counts(av1_coeff_count_model *model_count,
+ av1_coeff_count *full_count) {
+ int i, j, k, l;
+
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
+ full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]);
+}
+
+#if 0 && CONFIG_INTERNAL_STATS
+static void output_frame_level_debug_stats(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
+ int64_t recon_err;
+
+ aom_clear_system_state();
+
+ recon_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+
+ if (cpi->twopass.total_left_stats.coded_error != 0.0)
+ fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
+ "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
+ "%10"PRId64" %10"PRId64" %10d "
+ "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
+ "%6d %6d %5d %5d %5d "
+ "%10"PRId64" %10.3lf"
+ "%10lf %8u %10"PRId64" %10d %10d %10d\n",
+ cpi->common.current_video_frame,
+ cm->width, cm->height,
+ cpi->rc.source_alt_ref_pending,
+ cpi->rc.source_alt_ref_active,
+ cpi->rc.this_frame_target,
+ cpi->rc.projected_frame_size,
+ cpi->rc.projected_frame_size / cpi->common.MBs,
+ (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
+ cpi->rc.vbr_bits_off_target,
+ cpi->rc.vbr_bits_off_target_fast,
+ cpi->twopass.extend_minq,
+ cpi->twopass.extend_minq_fast,
+ cpi->rc.total_target_vs_actual,
+ (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
+ cpi->rc.total_actual_bits, cm->base_qindex,
+ av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
+ (double)av1_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
+ av1_convert_qindex_to_q(cpi->twopass.active_worst_quality,
+ cm->bit_depth),
+ cpi->rc.avg_q,
+ av1_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
+ cpi->refresh_last_frame, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
+ cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats.coded_error,
+ cpi->twopass.bits_left /
+ (1 + cpi->twopass.total_left_stats.coded_error),
+ cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
+ cpi->twopass.kf_zeromotion_pct,
+ cpi->twopass.fr_content_type);
+
+ fclose(f);
+
+ if (0) {
+ FILE *const fmodes = fopen("Modes.stt", "a");
+ int i;
+
+ fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
+ cm->frame_type, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame);
+
+ for (i = 0; i < MAX_MODES; ++i)
+ fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+ fprintf(fmodes, "\n");
+
+ fclose(fmodes);
+ }
+}
+#endif
+
+static void set_mv_search_params(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const unsigned int max_mv_def = AOMMIN(cm->width, cm->height);
+
+ // Default based on max resolution.
+ cpi->mv_step_param = av1_init_search_range(max_mv_def);
+
+ if (cpi->sf.mv.auto_mv_step_size) {
+ if (frame_is_intra_only(cm)) {
+ // Initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame.
+ cpi->max_mv_magnitude = max_mv_def;
+ } else {
+ if (cm->show_frame) {
+ // Allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution.
+ cpi->mv_step_param = av1_init_search_range(
+ AOMMIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+ }
+ cpi->max_mv_magnitude = 0;
+ }
+ }
+}
+
+static void set_size_independent_vars(AV1_COMP *cpi) {
+#if CONFIG_GLOBAL_MOTION
+ int i;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ set_default_warp_params(&cpi->common.global_motion[i]);
+ }
+ cpi->global_motion_search_done = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ av1_set_speed_features_framesize_independent(cpi);
+ av1_set_rd_speed_thresholds(cpi);
+ av1_set_rd_speed_thresholds_sub8x8(cpi);
+ cpi->common.interp_filter = cpi->sf.default_interp_filter;
+}
+
+static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+ int *top_index) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Setup variables that depend on the dimensions of the frame.
+ av1_set_speed_features_framesize_dependent(cpi);
+
+// Decide q and q bounds.
+#if CONFIG_XIPHRC
+ int frame_type = cm->frame_type == KEY_FRAME ? OD_I_FRAME : OD_P_FRAME;
+ *q = od_enc_rc_select_quantizers_and_lambdas(
+ &cpi->od_rc, cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
+ frame_type, bottom_index, top_index);
+#else
+ *q = av1_rc_pick_q_and_bounds(cpi, bottom_index, top_index);
+#endif
+
+ if (!frame_is_intra_only(cm)) {
+ av1_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
+ }
+
+ // Configure experimental use of segmentation for enhanced coding of
+ // static regions if indicated.
+ // Only allowed in the second pass of a two pass encode, as it requires
+ // lagged coding, and if the relevant speed feature flag is set.
+ if (oxcf->pass == 2 && cpi->sf.static_segmentation)
+ configure_static_seg_features(cpi);
+}
+
+static void init_motion_estimation(AV1_COMP *cpi) {
+ int y_stride = cpi->scaled_source.y_stride;
+
+ if (cpi->sf.mv.search_method == NSTEP) {
+ av1_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+ } else if (cpi->sf.mv.search_method == DIAMOND) {
+ av1_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+ }
+}
+
+#if CONFIG_LOOP_RESTORATION
+static void set_restoration_tilesize(int width, int height,
+ RestorationInfo *rst) {
+ (void)width;
+ (void)height;
+ rst[0].restoration_tilesize = (RESTORATION_TILESIZE_MAX >> 1);
+ rst[1].restoration_tilesize = rst[0].restoration_tilesize;
+ rst[2].restoration_tilesize = rst[0].restoration_tilesize;
+}
+#endif // CONFIG_LOOP_RESTORATION
+
+static void set_frame_size(AV1_COMP *cpi) {
+ int ref_frame;
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ if (oxcf->pass == 2 && oxcf->rc_mode == AOM_VBR &&
+ ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) ||
+ (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) {
+ av1_calculate_coded_size(cpi, &oxcf->scaled_frame_width,
+ &oxcf->scaled_frame_height);
+
+ // There has been a change in frame size.
+ av1_set_size_literal(cpi, oxcf->scaled_frame_width,
+ oxcf->scaled_frame_height);
+ }
+
+ if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR &&
+ oxcf->resize_mode == RESIZE_DYNAMIC) {
+ if (cpi->resize_pending == 1) {
+ oxcf->scaled_frame_width =
+ (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den;
+ oxcf->scaled_frame_height =
+ (cm->height * cpi->resize_scale_num) / cpi->resize_scale_den;
+ } else if (cpi->resize_pending == -1) {
+ // Go back up to original size.
+ oxcf->scaled_frame_width = oxcf->width;
+ oxcf->scaled_frame_height = oxcf->height;
+ }
+ if (cpi->resize_pending != 0) {
+ // There has been a change in frame size.
+ av1_set_size_literal(cpi, oxcf->scaled_frame_width,
+ oxcf->scaled_frame_height);
+
+ // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+ set_mv_search_params(cpi);
+ }
+ }
+
+#if !CONFIG_XIPHRC
+ if (oxcf->pass == 2) {
+ av1_set_target_rate(cpi);
+ }
+#endif
+
+ alloc_frame_mvs(cm, cm->new_fb_idx);
+
+ // Reset the frame pointers to the current frame size.
+ if (aom_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+
+#if CONFIG_LOOP_RESTORATION
+ set_restoration_tilesize(cm->width, cm->height, cm->rst_info);
+ for (int i = 0; i < MAX_MB_PLANE; ++i)
+ cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+ av1_alloc_restoration_buffers(cm);
+ for (int i = 0; i < MAX_MB_PLANE; ++i) {
+ cpi->rst_search[i].restoration_tilesize =
+ cm->rst_info[i].restoration_tilesize;
+ av1_alloc_restoration_struct(cm, &cpi->rst_search[i], cm->width,
+ cm->height);
+ }
+#endif // CONFIG_LOOP_RESTORATION
+ alloc_util_frame_buffers(cpi);
+ init_motion_estimation(cpi);
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME];
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+ ref_buf->idx = buf_idx;
+
+ if (buf_idx != INVALID_IDX) {
+ YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+ ref_buf->buf = buf;
+#if CONFIG_HIGHBITDEPTH
+ av1_setup_scale_factors_for_frame(
+ &ref_buf->sf, buf->y_crop_width, buf->y_crop_height, cm->width,
+ cm->height, (buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0);
+#else
+ av1_setup_scale_factors_for_frame(&ref_buf->sf, buf->y_crop_width,
+ buf->y_crop_height, cm->width,
+ cm->height);
+#endif // CONFIG_HIGHBITDEPTH
+ if (av1_is_scaled(&ref_buf->sf)) aom_extend_frame_borders(buf);
+ } else {
+ ref_buf->buf = NULL;
+ }
+ }
+
+ set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+static void reset_use_upsampled_references(AV1_COMP *cpi) {
+ MV_REFERENCE_FRAME ref_frame;
+
+ // Reset the up-sampled reference buffer structure.
+ init_upsampled_ref_frame_bufs(cpi);
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, ref_frame);
+ int new_uidx = upsample_ref_frame(cpi, ref);
+
+ // Update the up-sampled reference index.
+ cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)] = new_uidx;
+ cpi->upsampled_ref_bufs[new_uidx].ref_count++;
+ }
+}
+
+static void encode_without_recode_loop(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int q = 0, bottom_index = 0, top_index = 0; // Dummy variables.
+ const int use_upsampled_ref = cpi->sf.use_upsampled_references;
+
+ aom_clear_system_state();
+
+ set_frame_size(cpi);
+
+ // For 1 pass CBR under dynamic resize mode: use faster scaling for source.
+ // Only for 2x2 scaling for now.
+ if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR &&
+ cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+ cpi->un_scaled_source->y_width == (cm->width << 1) &&
+ cpi->un_scaled_source->y_height == (cm->height << 1)) {
+ cpi->source = av1_scale_if_required_fast(cm, cpi->un_scaled_source,
+ &cpi->scaled_source);
+ if (cpi->unscaled_last_source != NULL)
+ cpi->last_source = av1_scale_if_required_fast(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source);
+ } else {
+ cpi->source =
+ av1_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source);
+ if (cpi->unscaled_last_source != NULL)
+ cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+ }
+
+ if (frame_is_intra_only(cm) == 0) {
+ av1_scale_references(cpi);
+ }
+
+ set_size_independent_vars(cpi);
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+ // cpi->sf.use_upsampled_references can change from frame to frame. Every
+ // time it changes from 0 to 1, the reference frames for this frame have to
+ // be up-sampled before encoding.
+ if (!use_upsampled_ref && cpi->sf.use_upsampled_references &&
+ cm->frame_type != KEY_FRAME)
+ reset_use_upsampled_references(cpi);
+
+ av1_set_quantizer(cm, q);
+ av1_set_variance_partition_thresholds(cpi, q);
+
+ setup_frame(cpi);
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1;
+ av1_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+ av1_copy(cpi->subframe_stats.enc_starting_coef_probs, cm->fc->coef_probs);
+ cm->coef_probs_update_idx = 0;
+ av1_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs);
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+ suppress_active_map(cpi);
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ av1_vaq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ av1_setup_in_frame_q_adj(cpi);
+ } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_setup(cpi);
+ }
+ apply_active_map(cpi);
+
+ // Transform / motion compensation: build the reconstruction frame.
+ av1_encode_frame(cpi);
+
+ // Update some stats from cyclic refresh, and check whether the golden
+ // reference update should be skipped, for 1 pass CBR.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->frame_type != KEY_FRAME &&
+ (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR))
+ av1_cyclic_refresh_check_golden_update(cpi);
+
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ // update_base_skip_probs(cpi);
+ aom_clear_system_state();
+}
+
+static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int bottom_index, top_index;
+ int loop_count = 0;
+ int loop_at_this_size = 0;
+ int loop = 0;
+#if !CONFIG_XIPHRC
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+#endif
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+ int q = 0, q_low = 0, q_high = 0;
+ const int use_upsampled_ref = cpi->sf.use_upsampled_references;
+
+ set_size_independent_vars(cpi);
+
+ do {
+ aom_clear_system_state();
+
+ set_frame_size(cpi);
+
+ if (loop_count == 0 || cpi->resize_pending != 0) {
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+ // cpi->sf.use_upsampled_references can change from frame to frame. Every
+ // time it changes from 0 to 1, the reference frames for this frame have
+ // to be up-sampled before encoding.
+ if (!use_upsampled_ref && cpi->sf.use_upsampled_references &&
+ cm->frame_type != KEY_FRAME)
+ reset_use_upsampled_references(cpi);
+
+ // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+ set_mv_search_params(cpi);
+
+#if !CONFIG_XIPHRC
+ // Reset the loop state for new frame size.
+ overshoot_seen = 0;
+ undershoot_seen = 0;
+#endif
+
+ // Reconfiguration for change in frame size has concluded.
+ cpi->resize_pending = 0;
+
+ q_low = bottom_index;
+ q_high = top_index;
+
+ loop_at_this_size = 0;
+ }
+
+ // Decide frame size bounds first time through.
+ if (loop_count == 0) {
+ av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
+ }
+
+ cpi->source =
+ av1_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source);
+
+ if (cpi->unscaled_last_source != NULL)
+ cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+
+ if (frame_is_intra_only(cm) == 0) {
+ if (loop_count > 0) {
+ release_scaled_references(cpi);
+ }
+ av1_scale_references(cpi);
+ }
+
+ av1_set_quantizer(cm, q);
+
+ if (loop_count == 0) setup_frame(cpi);
+
+#if CONFIG_Q_ADAPT_PROBS
+ // Base q-index may have changed, so we need to assign proper default coef
+ // probs before every iteration.
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+ int i;
+ av1_default_coef_probs(cm);
+ if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
+ cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
+ for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
+ } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+ }
+ }
+#endif // CONFIG_Q_ADAPT_PROBS
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1;
+ if (loop_count == 0 || frame_is_intra_only(cm) ||
+ cm->error_resilient_mode) {
+ av1_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+ av1_copy(cpi->subframe_stats.enc_starting_coef_probs, cm->fc->coef_probs);
+ } else {
+ if (cm->do_subframe_update) {
+ av1_copy(cm->fc->coef_probs,
+ cpi->subframe_stats.enc_starting_coef_probs);
+ av1_copy(cm->starting_coef_probs,
+ cpi->subframe_stats.enc_starting_coef_probs);
+ av1_zero(cpi->subframe_stats.coef_counts_buf);
+ av1_zero(cpi->subframe_stats.eob_counts_buf);
+ }
+ }
+ cm->coef_probs_update_idx = 0;
+ av1_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs);
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ av1_vaq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ av1_setup_in_frame_q_adj(cpi);
+ }
+
+ // Transform / motion compensation: build the reconstruction frame.
+ av1_encode_frame(cpi);
+
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ // update_base_skip_probs(cpi);
+
+ aom_clear_system_state();
+
+ // Dummy pack of the bitstream using up to date stats to get an
+ // accurate estimate of output frame size to determine if we need
+ // to recode.
+ if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
+ save_coding_context(cpi);
+
+ av1_pack_bitstream(cpi, dest, size);
+
+ rc->projected_frame_size = (int)(*size) << 3;
+ restore_coding_context(cpi);
+
+ if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+ }
+
+ if (cpi->oxcf.rc_mode == AOM_Q) {
+ loop = 0;
+ } else {
+ if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced &&
+ (rc->projected_frame_size < rc->max_frame_bandwidth)) {
+ int last_q = q;
+ int64_t kf_err;
+
+ int64_t high_err_target = cpi->ambient_err;
+ int64_t low_err_target = cpi->ambient_err >> 1;
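+ // These targets are derived from the ambient error recorded on the frame
+ // before the forced key frame; they bound how far the key frame's
+ // reconstruction error may drift before Q is adjusted and the frame is
+ // recoded.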
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ kf_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ } else {
+ kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ }
+#else
+ kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += !kf_err;
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if ((kf_err > high_err_target &&
+ rc->projected_frame_size <= frame_over_shoot_limit) ||
+ (kf_err > low_err_target &&
+ rc->projected_frame_size <= frame_under_shoot_limit)) {
+ // Lower q_high
+ q_high = q > q_low ? q - 1 : q_low;
+
+ // Adjust Q
+ q = (int)((q * high_err_target) / kf_err);
+ q = AOMMIN(q, (q_high + q_low) >> 1);
+ } else if (kf_err < low_err_target &&
+ rc->projected_frame_size >= frame_under_shoot_limit) {
+ // The key frame is much better than the previous frame
+ // Raise q_low
+ q_low = q < q_high ? q + 1 : q_high;
+
+ // Adjust Q
+ q = (int)((q * low_err_target) / kf_err);
+ q = AOMMIN(q, (q_high + q_low + 1) >> 1);
+ }
+
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
+
+ loop = q != last_q;
+ } else if (recode_loop_test(cpi, frame_over_shoot_limit,
+ frame_under_shoot_limit, q,
+ AOMMAX(q_high, top_index), bottom_index)) {
+ // Is the projected frame size out of range, and are we allowed to
+ // attempt a recode?
+ int last_q = q;
+#if !CONFIG_XIPHRC
+ int retries = 0;
+#endif
+
+ if (cpi->resize_pending == 1) {
+ // Change in frame size so go back around the recode loop.
+ cpi->rc.frame_size_selector =
+ SCALE_STEP1 - cpi->rc.frame_size_selector;
+ cpi->rc.next_frame_size_selector = cpi->rc.frame_size_selector;
+
+#if CONFIG_INTERNAL_STATS
+ ++cpi->tot_recode_hits;
+#endif
+ ++loop_count;
+ loop = 1;
+ continue;
+ }
+
+#if !CONFIG_XIPHRC
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+ // Frame is too large
+ if (rc->projected_frame_size > rc->this_frame_target) {
+ // Special case if the projected size is > the max allowed.
+ if (rc->projected_frame_size >= rc->max_frame_bandwidth)
+ q_high = rc->worst_quality;
+
+ // Raise Qlow to at least the current value
+ q_low = q < q_high ? q + 1 : q_high;
+
+ if (undershoot_seen || loop_at_this_size > 1) {
+ // Update the rate correction factor.
+ av1_rc_update_rate_correction_factors(cpi);
+
+ q = (q_high + q_low + 1) / 2;
+ } else {
+ // Update the rate correction factor.
+ av1_rc_update_rate_correction_factors(cpi);
+
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index));
+
+ while (q < q_low && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index));
+ retries++;
+ }
+ }
+
+ overshoot_seen = 1;
+ } else {
+ // Frame is too small
+ q_high = q > q_low ? q - 1 : q_low;
+
+ if (overshoot_seen || loop_at_this_size > 1) {
+ av1_rc_update_rate_correction_factors(cpi);
+ q = (q_high + q_low) / 2;
+ } else {
+ av1_rc_update_rate_correction_factors(cpi);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index);
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+ // the user passed-in value.
+ if (cpi->oxcf.rc_mode == AOM_CQ && q < q_low) {
+ q_low = q;
+ }
+
+ while (q > q_high && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index);
+ retries++;
+ }
+ }
+
+ undershoot_seen = 1;
+ }
+#endif
+
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
+
+ loop = (q != last_q);
+ } else {
+ loop = 0;
+ }
+ }
+
+ // Special case for overlay frame.
+ if (rc->is_src_frame_alt_ref &&
+ rc->projected_frame_size < rc->max_frame_bandwidth)
+ loop = 0;
+
+#if CONFIG_GLOBAL_MOTION
+ if (recode_loop_test_global_motion(cpi)) {
+ loop = 1;
+ }
+#endif // CONFIG_GLOBAL_MOTION
+
+ if (loop) {
+ ++loop_count;
+ ++loop_at_this_size;
+
+#if CONFIG_INTERNAL_STATS
+ ++cpi->tot_recode_hits;
+#endif
+ }
+ } while (loop);
+}
+
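+// Compute the set of reference frames the current frame may use. When two
+// virtual references resolve to the same physical buffer in ref_frame_map,
+// the duplicate is masked out so the same buffer is not searched twice.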
+static int get_ref_frame_flags(const AV1_COMP *cpi) {
+ const int *const map = cpi->common.ref_frame_map;
+
+#if CONFIG_EXT_REFS
+ const int last2_is_last =
+ map[cpi->lst_fb_idxes[1]] == map[cpi->lst_fb_idxes[0]];
+ const int last3_is_last =
+ map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[0]];
+ const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[0]];
+#if CONFIG_LOWDELAY_COMPOUND
+ const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]];
+ const int last3_is_last2 =
+ map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]];
+ const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]];
+ const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]];
+#else
+ const int bwd_is_last = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[0]];
+ const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]];
+
+ const int last3_is_last2 =
+ map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]];
+ const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]];
+ const int bwd_is_last2 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[1]];
+
+ const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]];
+ const int bwd_is_last3 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[2]];
+
+ const int bwd_is_gld = map[cpi->bwd_fb_idx] == map[cpi->gld_fb_idx];
+
+#endif
+ const int last2_is_alt = map[cpi->lst_fb_idxes[1]] == map[cpi->alt_fb_idx];
+ const int last3_is_alt = map[cpi->lst_fb_idxes[2]] == map[cpi->alt_fb_idx];
+ const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+ const int bwd_is_alt = map[cpi->bwd_fb_idx] == map[cpi->alt_fb_idx];
+#else
+ const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+ const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+ const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
+#endif // CONFIG_EXT_REFS
+
+ int flags = AOM_REFFRAME_ALL;
+
+#if CONFIG_EXT_REFS
+ // Disable the use of BWDREF_FRAME for non-bipredictive frames.
+ if (!(cpi->rc.is_bipred_frame || cpi->rc.is_last_bipred_frame ||
+ (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs)))
+ flags &= ~AOM_BWD_FLAG;
+#endif // CONFIG_EXT_REFS
+
+ if (gld_is_last || gld_is_alt) flags &= ~AOM_GOLD_FLAG;
+
+ if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG;
+
+ if (alt_is_last) flags &= ~AOM_ALT_FLAG;
+
+#if CONFIG_EXT_REFS
+ if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG;
+
+ if (last3_is_last || last3_is_last2 || last3_is_alt) flags &= ~AOM_LAST3_FLAG;
+
+ if (gld_is_last2 || gld_is_last3) flags &= ~AOM_GOLD_FLAG;
+
+#if CONFIG_LOWDELAY_COMPOUND // Changes LL & HL bitstream
+ /* Allow biprediction between two identical frames (e.g. bwd_is_last = 1) */
+ if (bwd_is_alt && (flags & AOM_BWD_FLAG)) flags &= ~AOM_BWD_FLAG;
+#else
+ if ((bwd_is_last || bwd_is_last2 || bwd_is_last3 || bwd_is_gld ||
+ bwd_is_alt) &&
+ (flags & AOM_BWD_FLAG))
+ flags &= ~AOM_BWD_FLAG;
+#endif
+#endif // CONFIG_EXT_REFS
+
+ return flags;
+}
+
+static void set_ext_overrides(AV1_COMP *cpi) {
+ // Override the defaults with the externally supplied values set via the
+ // av1_update_reference() and av1_update_entropy() calls.
+ // Note: The overrides are valid only for the next frame passed to the
+ // encode_frame_to_data_rate() function.
+ if (cpi->ext_refresh_frame_context_pending) {
+ cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context;
+ cpi->ext_refresh_frame_context_pending = 0;
+ }
+ if (cpi->ext_refresh_frame_flags_pending) {
+ cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
+ cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+ cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+ cpi->ext_refresh_frame_flags_pending = 0;
+ }
+}
+
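+// Return the source scaled down to the coded frame size using the fast 2x2
+// scaler, or the unscaled source when the sizes already match.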
+YV12_BUFFER_CONFIG *av1_scale_if_required_fast(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled) {
+ if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+ cm->mi_rows * MI_SIZE != unscaled->y_height) {
+ // For 2x2 scaling down.
+ aom_scale_frame(unscaled, scaled, unscaled->y_buffer, 9, 2, 1, 2, 1, 0);
+ aom_extend_frame_borders(scaled);
+ return scaled;
+ } else {
+ return unscaled;
+ }
+}
+
+YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled) {
+ if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+ cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_HIGHBITDEPTH
+ scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
+#else
+ scale_and_extend_frame_nonnormative(unscaled, scaled);
+#endif // CONFIG_HIGHBITDEPTH
+ return scaled;
+ } else {
+ return unscaled;
+ }
+}
+
+static void set_arf_sign_bias(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int arf_sign_bias;
+#if CONFIG_EXT_REFS
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ // The arf_sign_bias will be one for internal ARFs.
+ arf_sign_bias = cpi->rc.source_alt_ref_active &&
+ (!cpi->refresh_alt_ref_frame ||
+ (gf_group->rf_level[gf_group->index] == GF_ARF_LOW));
+#else
+ if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ arf_sign_bias = cpi->rc.source_alt_ref_active &&
+ (!cpi->refresh_alt_ref_frame ||
+ (gf_group->rf_level[gf_group->index] == GF_ARF_LOW));
+ } else {
+ arf_sign_bias =
+ (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame);
+ }
+#endif // CONFIG_EXT_REFS
+
+ cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
+#if CONFIG_EXT_REFS
+ cm->ref_frame_sign_bias[BWDREF_FRAME] = cm->ref_frame_sign_bias[ALTREF_FRAME];
+#endif // CONFIG_EXT_REFS
+}
+
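+// Build a mask of interpolation filters the search may skip: a filter is
+// masked out when LAST_FRAME has stats but never selected it, and every
+// other reference selected it in fewer than 1 in 50 blocks.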
+static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
+ InterpFilter ifilter;
+ int ref_total[TOTAL_REFS_PER_FRAME] = { 0 };
+ MV_REFERENCE_FRAME ref;
+ int mask = 0;
+ int arf_idx = ALTREF_FRAME;
+
+#if CONFIG_EXT_REFS
+ // Determine which ARF is used as the ALTREF_FRAME
+ if (cpi->oxcf.pass == 2)
+ arf_idx += cpi->twopass.gf_group.arf_ref_idx[cpi->twopass.gf_group.index];
+#endif // CONFIG_EXT_REFS
+
+ if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
+ return mask;
+
+#if CONFIG_EXT_REFS
+ for (ref = LAST_FRAME; ref < ALTREF_FRAME; ++ref)
+ for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
+ ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
+
+ for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
+ ref_total[ref] += cpi->interp_filter_selected[arf_idx][ifilter];
+#else
+ for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
+ for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
+ ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
+#endif // CONFIG_EXT_REFS
+
+ for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) {
+ if ((ref_total[LAST_FRAME] &&
+ cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
+#if CONFIG_EXT_REFS
+ (ref_total[LAST2_FRAME] == 0 ||
+ cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50 <
+ ref_total[LAST2_FRAME]) &&
+ (ref_total[LAST3_FRAME] == 0 ||
+ cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50 <
+ ref_total[LAST3_FRAME]) &&
+#endif // CONFIG_EXT_REFS
+ (ref_total[GOLDEN_FRAME] == 0 ||
+ cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 <
+ ref_total[GOLDEN_FRAME]) &&
+#if CONFIG_EXT_REFS
+ (ref_total[BWDREF_FRAME] == 0 ||
+ cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50 <
+ ref_total[BWDREF_FRAME]) &&
+#endif // CONFIG_EXT_REFS
+ (ref_total[ALTREF_FRAME] == 0 ||
+ cpi->interp_filter_selected[arf_idx][ifilter] * 50 <
+ ref_total[ALTREF_FRAME]))
+ mask |= 1 << ifilter;
+ }
+ return mask;
+}
+
+#define DUMP_RECON_FRAMES 0
+
+#if DUMP_RECON_FRAMES == 1
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+static void dump_filtered_recon_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show;
+ int h;
+ char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+ FILE *f_recon = NULL;
+
+ if (recon_buf == NULL || !cm->show_frame) {
+ printf("Frame %d is not ready or no show to dump.\n",
+ cm->current_video_frame);
+ return;
+ }
+
+ if (cm->current_video_frame == 0) {
+ if ((f_recon = fopen(file_name, "wb")) == NULL) {
+ printf("Unable to open file %s to write.\n", file_name);
+ return;
+ }
+ } else {
+ if ((f_recon = fopen(file_name, "ab")) == NULL) {
+ printf("Unable to open file %s to append.\n", file_name);
+ return;
+ }
+ }
+ printf(
+ "\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, "
+ "y_stride=%4d, uv_stride=%4d, width=%4d, height=%4d\n",
+ cm->current_video_frame, cpi->twopass.gf_group.index,
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
+ cm->show_existing_frame, recon_buf->y_stride, recon_buf->uv_stride,
+ cm->width, cm->height);
+
+ // --- Y ---
+ for (h = 0; h < cm->height; ++h) {
+ fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
+ f_recon);
+ }
+ // --- U ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+ f_recon);
+ }
+ // --- V ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+ f_recon);
+ }
+
+ fclose(f_recon);
+}
+#endif // DUMP_RECON_FRAMES
+
+#if CONFIG_EC_ADAPT
+static void make_update_tile_list_enc(AV1_COMP *cpi, const int tile_rows,
+ const int tile_cols,
+ FRAME_CONTEXT *ec_ctxs[]) {
+ int i;
+ for (i = 0; i < tile_rows * tile_cols; ++i)
+ ec_ctxs[i] = &cpi->tile_data[i].tctx;
+}
+#endif // CONFIG_EC_ADAPT
+
+static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest, int skip_adapt,
+ unsigned int *frame_flags) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ struct segmentation *const seg = &cm->seg;
+ TX_SIZE t;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT **tile_ctxs = aom_malloc(cm->tile_rows * cm->tile_cols *
+ sizeof(&cpi->tile_data[0].tctx));
+ aom_cdf_prob **cdf_ptrs =
+ aom_malloc(cm->tile_rows * cm->tile_cols *
+ sizeof(&cpi->tile_data[0].tctx.partition_cdf[0][0]));
+#endif
+#if CONFIG_XIPHRC
+ int frame_type;
+ int drop_this_frame = 0;
+#endif // CONFIG_XIPHRC
+ set_ext_overrides(cpi);
+ aom_clear_system_state();
+
+ // Set the arf sign bias for this frame.
+ set_arf_sign_bias(cpi);
+#if CONFIG_TEMPMV_SIGNALING
+ // The frame type has been decided outside of this function call.
+ cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only;
+ cm->use_prev_frame_mvs =
+ !cpi->oxcf.disable_tempmv && !cm->cur_frame->intra_only;
+#endif
+
+#if CONFIG_EXT_REFS
+ // NOTE:
+ // (1) The setup of ref_frame_flags is moved upfront, as it is determined
+ // by the current frame's properties;
+ // (2) The setup of ref_frame_flags applies both to show_existing_frame
+ // cases and to all other cases.
+ if (cm->current_video_frame > 0)
+ cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+
+ if (cm->show_existing_frame) {
+ // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
+ // BWDREF_FRAME in the reference frame buffer.
+ cm->frame_type = INTER_FRAME;
+ cm->show_frame = 1;
+ cpi->frame_flags = *frame_flags;
+
+ // In the show_existing_frame case, we do not send refresh flags to the
+ // decoder; any change in the reference frame buffer can be made by
+ // switching the virtual indices.
+
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_bwd_ref_frame = 0;
+ cpi->rc.is_last_bipred_frame = 0;
+ cpi->rc.is_bipred_frame = 0;
+
+ // Build the bitstream
+ av1_pack_bitstream(cpi, dest, size);
+
+ // Set up frame to show to get ready for stats collection.
+ cm->frame_to_show = get_frame_new_buffer(cm);
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ // Update the LAST_FRAME in the reference frame buffer.
+ av1_update_reference_frames(cpi);
+
+ // Update frame flags
+ cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+ cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+ cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+ *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+
+ // Update the frame type
+ cm->last_frame_type = cm->frame_type;
+
+ // Since we allocate a spot for the OVERLAY frame in the gf group, we need
+ // to do post-encoding update accordingly.
+ if (cpi->rc.is_src_frame_alt_ref) {
+ av1_set_target_rate(cpi);
+#if CONFIG_XIPHRC
+ frame_type = cm->frame_type == INTER_FRAME ? OD_P_FRAME : OD_I_FRAME;
+ drop_this_frame = od_enc_rc_update_state(
+ &cpi->od_rc, *size << 3, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame, frame_type, cpi->droppable);
+#else
+ av1_rc_postencode_update(cpi, *size);
+#endif
+ }
+
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+
+ ++cm->current_video_frame;
+
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+ return;
+ }
+#endif // CONFIG_EXT_REFS
+
+ // Set default state for segment based loop filter update flags.
+ cm->lf.mode_ref_delta_update = 0;
+
+ if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search)
+ cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi);
+
+ // Set various flags etc to special state if it is a key frame.
+ if (frame_is_intra_only(cm)) {
+ // Reset the loop filter deltas and segmentation map.
+ av1_reset_segment_features(cm);
+
+ // If segmentation is enabled force a map update for key frames.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+ }
+
+ // The alternate reference frame cannot be active for a key frame.
+ cpi->rc.source_alt_ref_active = 0;
+
+ cm->error_resilient_mode = oxcf->error_resilient_mode;
+
+ // By default, the encoder assumes the decoder can use prev_mi.
+ if (cm->error_resilient_mode) {
+ cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD;
+ } else if (cm->intra_only) {
+ // Only reset the current context.
+ cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT;
+ }
+ }
+#if CONFIG_TILE_GROUPS
+ if (cpi->oxcf.mtu == 0) {
+ cm->num_tg = cpi->oxcf.num_tile_groups;
+ } else {
+ // Use a default value for the purposes of weighting costs in probability
+ // updates
+ cm->num_tg = DEFAULT_MAX_NUM_TG;
+ }
+#endif
+
+#if CONFIG_EXT_TILE
+ cm->tile_encoding_mode = cpi->oxcf.tile_encoding_mode;
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_XIPHRC
+ if (drop_this_frame) {
+ av1_rc_postencode_update_drop_frame(cpi);
+ ++cm->current_video_frame;
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+ return;
+ }
+#else
+ // For 1 pass CBR, check if we are dropping this frame.
+ // Never drop on key frame.
+ if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR &&
+ cm->frame_type != KEY_FRAME) {
+ if (av1_rc_drop_frame(cpi)) {
+ av1_rc_postencode_update_drop_frame(cpi);
+ ++cm->current_video_frame;
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+ return;
+ }
+ }
+#endif
+
+ aom_clear_system_state();
+
+#if CONFIG_INTERNAL_STATS
+ memset(cpi->mode_chosen_counts, 0,
+ MAX_MODES * sizeof(*cpi->mode_chosen_counts));
+#endif
+
+#if CONFIG_REFERENCE_BUFFER
+ {
+ /* Non-normative definition of current_frame_id ("frame counter" with
+ * wraparound) */
+ const int frame_id_length = FRAME_ID_LENGTH_MINUS7 + 7;
+ if (cm->current_frame_id == -1) {
+ int lsb, msb;
+/* quasi-random initialization of current_frame_id for a key frame */
+#if CONFIG_HIGHBITDEPTH
+ if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff;
+ msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff;
+ } else {
+#endif
+ lsb = cpi->source->y_buffer[0] & 0xff;
+ msb = cpi->source->y_buffer[1] & 0xff;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+ cm->current_frame_id = ((msb << 8) + lsb) % (1 << frame_id_length);
+ } else {
+ cm->current_frame_id =
+ (cm->current_frame_id + 1 + (1 << frame_id_length)) %
+ (1 << frame_id_length);
+ }
+ }
+#endif
+
+#if CONFIG_EXT_DELTA_Q
+ cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
+ cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF;
+#endif
+
+ if (cpi->sf.recode_loop == DISALLOW_RECODE) {
+ encode_without_recode_loop(cpi);
+ } else {
+ encode_with_recode_loop(cpi, size, dest);
+ }
+
+#ifdef OUTPUT_YUV_SKINMAP
+ if (cpi->common.current_video_frame > 1) {
+ av1_compute_skin_map(cpi, yuv_skinmap_file);
+ }
+#endif // OUTPUT_YUV_SKINMAP
+
+ // Special case code to reduce pulsing when key frames are forced at a
+ // fixed interval. Note the reconstruction error if this is the frame
+ // before the forced key frame.
+ if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ cpi->ambient_err =
+ aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ } else {
+ cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ }
+#else
+ cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+#endif // CONFIG_HIGHBITDEPTH
+ }
+
+ // If the encoder forced a KEY_FRAME decision, LAST_FRAME must be refreshed.
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->refresh_last_frame = 1;
+ }
+
+ cm->frame_to_show = get_frame_new_buffer(cm);
+ cm->frame_to_show->color_space = cm->color_space;
+ cm->frame_to_show->color_range = cm->color_range;
+ cm->frame_to_show->render_width = cm->render_width;
+ cm->frame_to_show->render_height = cm->render_height;
+
+#if CONFIG_EXT_REFS
+// TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
+// off.
+#endif // CONFIG_EXT_REFS
+
+ // Pick the loop filter level for the frame.
+ loopfilter_frame(cpi, cm);
+
+ // Build the bitstream
+ av1_pack_bitstream(cpi, dest, size);
+
+ if (skip_adapt) {
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+ return;
+ }
+
+#if CONFIG_REFERENCE_BUFFER
+ {
+ int i;
+ /* Update reference frame id values based on the value of refresh_mask */
+ for (i = 0; i < REF_FRAMES; i++) {
+ if ((cm->refresh_mask >> i) & 1) {
+ cm->ref_frame_id[i] = cm->current_frame_id;
+ }
+ }
+ }
+#endif
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ if (cm->show_frame) dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ if (cm->seg.update_map) update_reference_segmentation_map(cpi);
+
+ if (frame_is_intra_only(cm) == 0) {
+ release_scaled_references(cpi);
+ }
+
+ av1_update_reference_frames(cpi);
+
+ for (t = 0; t < TX_SIZES; t++)
+ av1_full_to_model_counts(cpi->td.counts->coef[t],
+ cpi->td.rd_counts.coef_counts[t]);
+#if CONFIG_ENTROPY_STATS
+ av1_accumulate_frame_counts(&aggregate_fc, &cm->counts);
+#endif // CONFIG_ENTROPY_STATS
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ cm->partial_prob_update = 0;
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+ av1_adapt_coef_probs(cm);
+ av1_adapt_intra_frame_probs(cm);
+#if CONFIG_EC_ADAPT
+ make_update_tile_list_enc(cpi, cm->tile_rows, cm->tile_cols, tile_ctxs);
+ av1_average_tile_coef_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs,
+ cm->tile_rows * cm->tile_cols);
+ av1_average_tile_intra_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs,
+ cm->tile_rows * cm->tile_cols);
+#if CONFIG_PVQ
+ av1_average_tile_pvq_cdfs(cpi->common.fc, tile_ctxs,
+ cm->tile_rows * cm->tile_cols);
+#endif // CONFIG_PVQ
+#endif // CONFIG_EC_ADAPT
+#if CONFIG_ADAPT_SCAN
+ av1_adapt_scan_order(cm);
+#endif // CONFIG_ADAPT_SCAN
+ }
+
+ if (!frame_is_intra_only(cm)) {
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ av1_adapt_inter_frame_probs(cm);
+ av1_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+#if CONFIG_EC_ADAPT
+ av1_average_tile_inter_cdfs(&cpi->common, cpi->common.fc, tile_ctxs,
+ cdf_ptrs, cm->tile_rows * cm->tile_cols);
+ av1_average_tile_mv_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs,
+ cm->tile_rows * cm->tile_cols);
+#endif
+ }
+ }
+
+ if (cpi->refresh_golden_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+
+ if (cpi->refresh_alt_ref_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_ALTREF;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+#if CONFIG_EXT_REFS
+ if (cpi->refresh_bwd_ref_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_BWDREF;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+#endif // CONFIG_EXT_REFS
+
+#if !CONFIG_EXT_REFS
+ cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+#endif // !CONFIG_EXT_REFS
+
+ cm->last_frame_type = cm->frame_type;
+
+#if CONFIG_XIPHRC
+ frame_type = cm->frame_type == KEY_FRAME ? OD_I_FRAME : OD_P_FRAME;
+
+ drop_this_frame =
+ od_enc_rc_update_state(&cpi->od_rc, *size << 3, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame, frame_type, 0);
+ if (drop_this_frame) {
+ av1_rc_postencode_update_drop_frame(cpi);
+ ++cm->current_video_frame;
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+ return;
+ }
+#else // !CONFIG_XIPHRC
+ av1_rc_postencode_update(cpi, *size);
+#endif // CONFIG_XIPHRC
+
+#if 0
+ output_frame_level_debug_stats(cpi);
+#endif
+
+ if (cm->frame_type == KEY_FRAME) {
+ // Tell the caller that the frame was coded as a key frame
+ *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
+ } else {
+ *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+ }
+
+  // Clear the one-shot update flags for the segmentation map and mode/ref
+  // loop filter deltas.
+ cm->seg.update_map = 0;
+ cm->seg.update_data = 0;
+ cm->lf.mode_ref_delta_update = 0;
+
+ // keep track of the last coded dimensions
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+
+ // reset to normal state now that we are done.
+ if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame;
+
+ if (cm->show_frame) {
+#if CONFIG_EXT_REFS
+// TODO(zoeliu): We may only need to swap mi and prev_mi for those frames
+// that are used as references.
+#endif // CONFIG_EXT_REFS
+ av1_swap_mi_and_prev_mi(cm);
+    // Don't increment frame counters if this was an altref buffer update,
+    // not a real frame.
+ ++cm->current_video_frame;
+ }
+
+#if CONFIG_EXT_REFS
+ // NOTE: Shall not refer to any frame not used as reference.
+ if (cm->is_reference_frame)
+#endif // CONFIG_EXT_REFS
+ cm->prev_frame = cm->cur_frame;
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+}
+
+static void Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+ int skip_adapt, unsigned int *frame_flags) {
+#if CONFIG_XIPHRC
+ int64_t ip_count;
+ int frame_type, is_golden, is_altref;
+
+ /* Not updated during init so update it here */
+ if (cpi->oxcf.rc_mode == AOM_Q) cpi->od_rc.quality = cpi->oxcf.cq_level;
+
+ frame_type = od_frame_type(&cpi->od_rc, cpi->od_rc.cur_frame, &is_golden,
+ &is_altref, &ip_count);
+
+ if (frame_type == OD_I_FRAME) {
+ frame_type = KEY_FRAME;
+    cpi->frame_flags |= FRAMEFLAGS_KEY;
+ } else if (frame_type == OD_P_FRAME) {
+ frame_type = INTER_FRAME;
+ }
+
+ if (is_altref) {
+ cpi->refresh_alt_ref_frame = 1;
+ cpi->rc.source_alt_ref_active = 1;
+ }
+
+ cpi->refresh_golden_frame = is_golden;
+ cpi->common.frame_type = frame_type;
+  if (is_golden) cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
+#else
+ if (cpi->oxcf.rc_mode == AOM_CBR) {
+ av1_rc_get_one_pass_cbr_params(cpi);
+ } else {
+ av1_rc_get_one_pass_vbr_params(cpi);
+ }
+#endif
+ encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags);
+}
+
+#if !CONFIG_XIPHRC
+static void Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+ unsigned int *frame_flags) {
+ encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags);
+
+#if CONFIG_EXT_REFS
+ // Do not do post-encoding update for those frames that do not have a spot in
+ // a gf group, but note that an OVERLAY frame always has a spot in a gf group,
+ // even when show_existing_frame is used.
+ if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref) {
+ av1_twopass_postencode_update(cpi);
+ }
+ check_show_existing_frame(cpi);
+#else
+ av1_twopass_postencode_update(cpi);
+#endif // CONFIG_EXT_REFS
+}
+#endif
+
+static void init_ref_frame_bufs(AV1_COMMON *cm) {
+ int i;
+ BufferPool *const pool = cm->buffer_pool;
+ cm->new_fb_idx = INVALID_IDX;
+ for (i = 0; i < REF_FRAMES; ++i) {
+ cm->ref_frame_map[i] = INVALID_IDX;
+ pool->frame_bufs[i].ref_count = 0;
+ }
+}
+
+static void check_initial_width(AV1_COMP *cpi,
+#if CONFIG_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int subsampling_x, int subsampling_y) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ if (!cpi->initial_width ||
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth != use_highbitdepth ||
+#endif
+ cm->subsampling_x != subsampling_x ||
+ cm->subsampling_y != subsampling_y) {
+ cm->subsampling_x = subsampling_x;
+ cm->subsampling_y = subsampling_y;
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth = use_highbitdepth;
+#endif
+
+ alloc_raw_frame_buffers(cpi);
+ init_ref_frame_bufs(cm);
+ alloc_util_frame_buffers(cpi);
+
+ init_motion_estimation(cpi); // TODO(agrange) This can be removed.
+
+ cpi->initial_width = cm->width;
+ cpi->initial_height = cm->height;
+ cpi->initial_mbs = cm->MBs;
+ }
+}
+
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct aom_usec_timer timer;
+ int res = 0;
+ const int subsampling_x = sd->subsampling_x;
+ const int subsampling_y = sd->subsampling_y;
+#if CONFIG_HIGHBITDEPTH
+ const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+#endif
+
+#if CONFIG_HIGHBITDEPTH
+ check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#else
+ check_initial_width(cpi, subsampling_x, subsampling_y);
+#endif // CONFIG_HIGHBITDEPTH
+
+ aom_usec_timer_start(&timer);
+
+ if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+#if CONFIG_HIGHBITDEPTH
+ use_highbitdepth,
+#endif // CONFIG_HIGHBITDEPTH
+ frame_flags))
+ res = -1;
+ aom_usec_timer_mark(&timer);
+ cpi->time_receive_data += aom_usec_timer_elapsed(&timer);
+
+ if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) &&
+ (subsampling_x != 1 || subsampling_y != 1)) {
+ aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+ "Non-4:2:0 color format requires profile 1 or 3");
+ res = -1;
+ }
+ if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) &&
+ (subsampling_x == 1 && subsampling_y == 1)) {
+ aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+ "4:2:0 color format requires profile 0 or 2");
+ res = -1;
+ }
+
+ return res;
+}
+
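+// A sketch of the rule encoded below: a frame counts as a "reference" if it
+// refreshes any reference buffer, or if it carries state later frames depend
+// on (frame-context updates when not in error-resilient mode, loop-filter
+// delta updates, or segmentation updates). Only frames for which none of
+// these hold are droppable (see the cpi->droppable assignment in
+// av1_get_compressed_data()).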
+static int frame_is_reference(const AV1_COMP *cpi) {
+ const AV1_COMMON *cm = &cpi->common;
+
+ return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame ||
+ cpi->refresh_golden_frame ||
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame ||
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_alt_ref_frame || !cm->error_resilient_mode ||
+ cm->lf.mode_ref_delta_update || cm->seg.update_map ||
+ cm->seg.update_data;
+}
+
+static void adjust_frame_rate(AV1_COMP *cpi,
+ const struct lookahead_entry *source) {
+ int64_t this_duration;
+ int step = 0;
+
+ if (source->ts_start == cpi->first_time_stamp_ever) {
+ this_duration = source->ts_end - source->ts_start;
+ step = 1;
+ } else {
+ int64_t last_duration =
+ cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+
+ this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
+
+    // Do a step update if the duration changes by 10% or more.
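+    // Worked example (illustrative values): with last_duration == 33333us
+    // (~30fps) and this_duration == 40000us (25fps),
+    // (40000 - 33333) * 10 / 33333 == 2, so a step update is triggered.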
+ if (last_duration)
+ step = (int)((this_duration - last_duration) * 10 / last_duration);
+ }
+
+ if (this_duration) {
+ if (step) {
+ av1_new_framerate(cpi, 10000000.0 / this_duration);
+ } else {
+ // Average this frame's rate into the last second's average
+ // frame rate. If we haven't seen 1 second yet, then average
+ // over the whole interval seen.
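+      // Worked example (illustrative values): at 30fps, avg_duration starts
+      // at 10000000 / 30 ~= 333333us; with interval == 10000000us and
+      // this_duration == 400000us, the update below gives
+      // 333333 * (10000000 - 333333 + 400000) / 10000000 ~= 335556us,
+      // i.e. the average moves only slightly, to ~29.8fps.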
+ const double interval = AOMMIN(
+ (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
+ double avg_duration = 10000000.0 / cpi->framerate;
+ avg_duration *= (interval - avg_duration + this_duration);
+ avg_duration /= interval;
+
+ av1_new_framerate(cpi, 10000000.0 / avg_duration);
+ }
+ }
+ cpi->last_time_stamp_seen = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_end;
+}
+
+// Returns 0 if this is not an alt-ref frame; otherwise returns the offset of
+// the source frame used as the ARF midpoint.
+static int get_arf_src_index(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int arf_src_index = 0;
+ if (is_altref_enabled(cpi)) {
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ arf_src_index = gf_group->arf_src_offset[gf_group->index];
+ }
+ } else if (rc->source_alt_ref_pending) {
+ arf_src_index = rc->frames_till_gf_update_due;
+ }
+ }
+ return arf_src_index;
+}
+
+#if CONFIG_EXT_REFS
+static int get_brf_src_index(AV1_COMP *cpi) {
+ int brf_src_index = 0;
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+  // TODO(zoeliu): Add a check on the -bwd_ref command-line flag.
+ if (gf_group->bidir_pred_enabled[gf_group->index]) {
+ if (cpi->oxcf.pass == 2) {
+ if (gf_group->update_type[gf_group->index] == BRF_UPDATE)
+ brf_src_index = gf_group->brf_src_offset[gf_group->index];
+ } else {
+ // TODO(zoeliu): To re-visit the setup for this scenario
+ brf_src_index = cpi->rc.bipred_group_interval - 1;
+ }
+ }
+
+ return brf_src_index;
+}
+#endif // CONFIG_EXT_REFS
+
+static void check_src_altref(AV1_COMP *cpi,
+ const struct lookahead_entry *source) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ // If pass == 2, the parameters set here will be reset in
+ // av1_rc_get_second_pass_params()
+
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ rc->is_src_frame_alt_ref =
+#if CONFIG_EXT_REFS
+ (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) ||
+#endif // CONFIG_EXT_REFS
+ (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
+ } else {
+ rc->is_src_frame_alt_ref =
+ cpi->alt_ref_source && (source == cpi->alt_ref_source);
+ }
+
+ if (rc->is_src_frame_alt_ref) {
+ // Current frame is an ARF overlay frame.
+ cpi->alt_ref_source = NULL;
+
+ // Don't refresh the last buffer for an ARF overlay frame. It will
+ // become the GF so preserve last as an alternative prediction option.
+ cpi->refresh_last_frame = 0;
+ }
+}
+
+#if CONFIG_INTERNAL_STATS
+extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch,
+ int width, int height);
+
+static void adjust_image_stat(double y, double u, double v, double all,
+ ImageStat *s) {
+ s->stat[Y] += y;
+ s->stat[U] += u;
+ s->stat[V] += v;
+ s->stat[ALL] += all;
+ s->worst = AOMMIN(s->worst, all);
+}
+
+static void compute_internal_stats(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ double samples = 0.0;
+ uint32_t in_bit_depth = 8;
+ uint32_t bit_depth = 8;
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ in_bit_depth = cpi->oxcf.input_bit_depth;
+ bit_depth = cm->bit_depth;
+ }
+#endif
+ if (cm->show_frame) {
+ const YV12_BUFFER_CONFIG *orig = cpi->source;
+ const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ double y, u, v, frame_all;
+
+ cpi->count++;
+ if (cpi->b_calculate_psnr) {
+ PSNR_STATS psnr;
+ double frame_ssim2 = 0.0, weight = 0.0;
+ aom_clear_system_state();
+// TODO(yaowu): unify these two versions into one.
+#if CONFIG_HIGHBITDEPTH
+ aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(orig, recon, &psnr);
+#endif // CONFIG_HIGHBITDEPTH
+
+ adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
+ &cpi->psnr);
+ cpi->total_sq_error += psnr.sse[0];
+ cpi->total_samples += psnr.samples[0];
+ samples = psnr.samples[0];
+// TODO(yaowu): unify these two versions into one.
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ frame_ssim2 =
+ aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth);
+ else
+ frame_ssim2 = aom_calc_ssim(orig, recon, &weight);
+#else
+ frame_ssim2 = aom_calc_ssim(orig, recon, &weight);
+#endif // CONFIG_HIGHBITDEPTH
+
+ cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2);
+ cpi->summed_quality += frame_ssim2 * weight;
+ cpi->summed_weights += weight;
+
+#if 0
+      {
+        // Debug output of per-plane PSNR and frame SSIM.
+        FILE *f = fopen("q_used.stt", "a");
+        fprintf(f, "%5d : Y%7.3f:U%7.3f:V%7.3f:F%7.3f:S%7.3f\n",
+                cpi->common.current_video_frame, psnr.psnr[1], psnr.psnr[2],
+                psnr.psnr[3], psnr.psnr[0], frame_ssim2);
+        fclose(f);
+      }
+#endif
+ }
+ if (cpi->b_calculate_blockiness) {
+#if CONFIG_HIGHBITDEPTH
+ if (!cm->use_highbitdepth)
+#endif
+ {
+ const double frame_blockiness =
+ av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
+ recon->y_stride, orig->y_width, orig->y_height);
+ cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness);
+ cpi->total_blockiness += frame_blockiness;
+ }
+
+ if (cpi->b_calculate_consistency) {
+#if CONFIG_HIGHBITDEPTH
+ if (!cm->use_highbitdepth)
+#endif
+ {
+ const double this_inconsistency = aom_get_ssim_metrics(
+ orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
+ orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1);
+
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const double consistency =
+ aom_sse_to_psnr(samples, peak, cpi->total_inconsistency);
+ if (consistency > 0.0)
+ cpi->worst_consistency =
+ AOMMIN(cpi->worst_consistency, consistency);
+ cpi->total_inconsistency += this_inconsistency;
+ }
+ }
+ }
+
+ frame_all =
+ aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+ frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
+ size_t *size, uint8_t *dest, int64_t *time_stamp,
+ int64_t *time_end, int flush) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ RATE_CONTROL *const rc = &cpi->rc;
+ struct aom_usec_timer cmptimer;
+ YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+ struct lookahead_entry *last_source = NULL;
+ struct lookahead_entry *source = NULL;
+ int arf_src_index;
+#if CONFIG_EXT_REFS
+ int brf_src_index;
+#endif // CONFIG_EXT_REFS
+ int i;
+
+#if CONFIG_XIPHRC
+ cpi->od_rc.end_of_input = flush;
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+ assert(cpi->oxcf.max_threads == 0 &&
+ "bitstream debug tool does not support multithreading");
+ bitstream_queue_record_write();
+ bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame);
+#endif
+
+ aom_usec_timer_start(&cmptimer);
+
+ av1_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+
+  // Is multi-arf enabled?
+  // Note that, at the moment, multi-arf is only configured for two-pass VBR.
+ if ((oxcf->pass == 2) && (cpi->oxcf.enable_auto_arf > 1))
+ cpi->multi_arf_allowed = 1;
+ else
+ cpi->multi_arf_allowed = 0;
+
+ // Normal defaults
+ cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
+ cm->refresh_frame_context =
+ (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_FORWARD
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 0;
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_alt_ref_frame = 0;
+
+#if CONFIG_EXT_REFS && !CONFIG_XIPHRC
+ if (oxcf->pass == 2 && cm->show_existing_frame) {
+ // Manage the source buffer and flush out the source frame that has been
+ // coded already; Also get prepared for PSNR calculation if needed.
+ if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) {
+ *size = 0;
+ return -1;
+ }
+ cpi->source = &source->img;
+    // TODO(zoeliu): Determine whether the frame rate needs to be adjusted
+    // here.
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+
+ // We need to adjust frame rate for an overlay frame
+ if (cpi->rc.is_src_frame_alt_ref) adjust_frame_rate(cpi, source);
+
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+ if (cm->new_fb_idx != INVALID_IDX) {
+ --pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
+ cm->new_fb_idx = get_free_fb(cm);
+
+ if (cm->new_fb_idx == INVALID_IDX) return -1;
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ // Start with a 0 size frame.
+ *size = 0;
+
+ // We need to update the gf_group for show_existing overlay frame
+ if (cpi->rc.is_src_frame_alt_ref) av1_rc_get_second_pass_params(cpi);
+
+ Pass2Encode(cpi, size, dest, frame_flags);
+
+ if (cpi->b_calculate_psnr) generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+ compute_internal_stats(cpi);
+ cpi->bytes += (int)(*size);
+#endif // CONFIG_INTERNAL_STATS
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ cm->show_existing_frame = 0;
+ return 0;
+ }
+#endif // CONFIG_EXT_REFS && !CONFIG_XIPHRC
+
+  // Should we encode an arf frame?
+ arf_src_index = get_arf_src_index(cpi);
+ if (arf_src_index) {
+ for (i = 0; i <= arf_src_index; ++i) {
+ struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
+ // Avoid creating an alt-ref if there's a forced keyframe pending.
+ if (e == NULL) {
+ break;
+ } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+ arf_src_index = 0;
+ flush = 1;
+ break;
+ }
+ }
+ }
+
+ if (arf_src_index) {
+ assert(arf_src_index <= rc->frames_to_key);
+
+ if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+ cpi->alt_ref_source = source;
+
+ if (oxcf->arnr_max_frames > 0) {
+ // Produce the filtered ARF frame.
+ av1_temporal_filter(cpi, arf_src_index);
+ aom_extend_frame_borders(&cpi->alt_ref_buffer);
+ force_src_buffer = &cpi->alt_ref_buffer;
+ }
+
+ cm->show_frame = 0;
+ cm->intra_only = 0;
+ cpi->refresh_alt_ref_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_last_frame = 0;
+ rc->is_src_frame_alt_ref = 0;
+ }
+ rc->source_alt_ref_pending = 0;
+ }
+
+#if CONFIG_EXT_REFS
+ rc->is_bwd_ref_frame = 0;
+ brf_src_index = get_brf_src_index(cpi);
+ if (brf_src_index) {
+ assert(brf_src_index <= rc->frames_to_key);
+ if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) {
+ cm->show_frame = 0;
+ cm->intra_only = 0;
+
+ cpi->refresh_bwd_ref_frame = 1;
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ rc->is_bwd_ref_frame = 1;
+ }
+ }
+#endif // CONFIG_EXT_REFS
+
+ if (!source) {
+ // Get last frame source.
+ if (cm->current_video_frame > 0) {
+ if ((last_source = av1_lookahead_peek(cpi->lookahead, -1)) == NULL)
+ return -1;
+ }
+
+ // Read in the source frame.
+ source = av1_lookahead_pop(cpi->lookahead, flush);
+
+ if (source != NULL) {
+ cm->show_frame = 1;
+ cm->intra_only = 0;
+
+ // Check to see if the frame should be encoded as an arf overlay.
+ check_src_altref(cpi, source);
+ }
+ }
+
+ if (source) {
+ cpi->un_scaled_source = cpi->source =
+ force_src_buffer ? force_src_buffer : &source->img;
+
+ cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
+
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+ *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+ } else {
+ *size = 0;
+ if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+#if CONFIG_XIPHRC
+ od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 1);
+#else
+ av1_end_first_pass(cpi); /* get last stats packet */
+#endif
+ cpi->twopass.first_pass_done = 1;
+ }
+ return -1;
+ }
+
+ if (source->ts_start < cpi->first_time_stamp_ever) {
+ cpi->first_time_stamp_ever = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_start;
+ }
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ // adjust frame rates based on timestamps given
+ if (cm->show_frame) adjust_frame_rate(cpi, source);
+
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+ if (cm->new_fb_idx != INVALID_IDX) {
+ --pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
+ cm->new_fb_idx = get_free_fb(cm);
+
+ if (cm->new_fb_idx == INVALID_IDX) return -1;
+
+ cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
+#if CONFIG_EXT_REFS
+ if (oxcf->pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ cpi->alt_fb_idx = cpi->arf_map[gf_group->arf_ref_idx[gf_group->index]];
+ }
+#else
+ if (cpi->multi_arf_allowed) {
+ if (cm->frame_type == KEY_FRAME) {
+ init_buffer_indices(cpi);
+ } else if (oxcf->pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
+ }
+ }
+#endif // CONFIG_EXT_REFS
+
+ // Start with a 0 size frame.
+ *size = 0;
+
+ cpi->frame_flags = *frame_flags;
+
+ if (oxcf->pass == 2) {
+#if CONFIG_XIPHRC
+ if (od_enc_rc_2pass_in(&cpi->od_rc) < 0) return -1;
+ }
+#else
+ av1_rc_get_second_pass_params(cpi);
+ } else if (oxcf->pass == 1) {
+ set_frame_size(cpi);
+ }
+#endif
+
+ if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) {
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i)
+ cpi->scaled_ref_idx[i] = INVALID_IDX;
+ }
+
+#if CONFIG_AOM_QM
+ cm->using_qmatrix = cpi->oxcf.using_qm;
+ cm->min_qmlevel = cpi->oxcf.qm_minlevel;
+ cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
+#endif
+
+#if CONFIG_REFERENCE_BUFFER
+ if (*time_stamp == 0) {
+ cpi->common.current_frame_id = -1;
+ }
+#endif
+
+#if CONFIG_XIPHRC
+ if (oxcf->pass == 1) {
+ size_t tmp;
+ if (cpi->od_rc.cur_frame == 0) Pass0Encode(cpi, &tmp, dest, 1, frame_flags);
+ cpi->od_rc.firstpass_quant = cpi->od_rc.target_quantizer;
+ Pass0Encode(cpi, &tmp, dest, 0, frame_flags);
+ od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 0);
+ } else if (oxcf->pass == 2) {
+ Pass0Encode(cpi, size, dest, 0, frame_flags);
+ } else {
+ if (cpi->od_rc.cur_frame == 0) {
+ size_t tmp;
+ Pass0Encode(cpi, &tmp, dest, 1, frame_flags);
+ }
+ Pass0Encode(cpi, size, dest, 0, frame_flags);
+ }
+#else
+ if (oxcf->pass == 1) {
+ cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf);
+ av1_first_pass(cpi, source);
+ } else if (oxcf->pass == 2) {
+ Pass2Encode(cpi, size, dest, frame_flags);
+ } else {
+ // One pass encode
+ Pass0Encode(cpi, size, dest, 0, frame_flags);
+ }
+#endif
+
+ if (!cm->error_resilient_mode)
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+
+ // No frame encoded, or frame was dropped, release scaled references.
+ if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
+ release_scaled_references(cpi);
+ }
+
+ if (*size > 0) {
+ cpi->droppable = !frame_is_reference(cpi);
+ }
+
+ aom_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
+
+ if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
+ generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+ if (oxcf->pass != 1) {
+ compute_internal_stats(cpi);
+ cpi->bytes += (int)(*size);
+ }
+#endif // CONFIG_INTERNAL_STATS
+
+#if CONFIG_XIPHRC
+ cpi->od_rc.cur_frame++;
+#endif
+
+ aom_clear_system_state();
+
+ return 0;
+}
+
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
+ AV1_COMMON *cm = &cpi->common;
+ if (!cm->show_frame) {
+ return -1;
+ } else {
+ int ret;
+ if (cm->frame_to_show) {
+ *dest = *cm->frame_to_show;
+ dest->y_width = cm->width;
+ dest->y_height = cm->height;
+ dest->uv_width = cm->width >> cm->subsampling_x;
+ dest->uv_height = cm->height >> cm->subsampling_y;
+ ret = 0;
+ } else {
+ ret = -1;
+ }
+ aom_clear_system_state();
+ return ret;
+ }
+}
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
+ if (cpi->last_show_frame_buf_idx == INVALID_IDX) return -1;
+
+ *frame =
+ cpi->common.buffer_pool->frame_bufs[cpi->last_show_frame_buf_idx].buf;
+ return 0;
+}
+
+int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
+ AOM_SCALING vert_mode) {
+ AV1_COMMON *cm = &cpi->common;
+ int hr = 0, hs = 0, vr = 0, vs = 0;
+
+ if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1;
+
+ Scale2Ratio(horiz_mode, &hr, &hs);
+ Scale2Ratio(vert_mode, &vr, &vs);
+
+  // Always round up to the next whole number.
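+  // For example, at a 1/2 scale (hr == 1, hs == 2) and oxcf.width == 1921,
+  // this gives (2 - 1 + 1921 * 1) / 2 == 961 rather than truncating to 960.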
+ cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs;
+ cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs;
+ assert(cm->width <= cpi->initial_width);
+ assert(cm->height <= cpi->initial_height);
+
+ update_frame_size(cpi);
+
+ return 0;
+}
+
+int av1_set_size_literal(AV1_COMP *cpi, unsigned int width,
+ unsigned int height) {
+ AV1_COMMON *cm = &cpi->common;
+#if CONFIG_HIGHBITDEPTH
+ check_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+#else
+ check_initial_width(cpi, 1, 1);
+#endif // CONFIG_HIGHBITDEPTH
+
+ if (width) {
+ cm->width = width;
+ if (cm->width > cpi->initial_width) {
+ cm->width = cpi->initial_width;
+ printf("Warning: Desired width too large, changed to %d\n", cm->width);
+ }
+ }
+
+ if (height) {
+ cm->height = height;
+ if (cm->height > cpi->initial_height) {
+ cm->height = cpi->initial_height;
+ printf("Warning: Desired height too large, changed to %d\n", cm->height);
+ }
+ }
+ assert(cm->width <= cpi->initial_width);
+ assert(cm->height <= cpi->initial_height);
+
+ update_frame_size(cpi);
+
+ return 0;
+}
+
+int av1_get_quantizer(AV1_COMP *cpi) { return cpi->common.base_qindex; }
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
+ if (flags &
+ (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF)) {
+ int ref = AOM_REFFRAME_ALL;
+
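+    // Starting from AOM_REFFRAME_ALL (all reference bits set), each XOR
+    // below toggles the corresponding bit off; this relies on each flag bit
+    // being set in AOM_REFFRAME_ALL and being cleared at most once.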
+ if (flags & AOM_EFLAG_NO_REF_LAST) {
+ ref ^= AOM_LAST_FLAG;
+#if CONFIG_EXT_REFS
+ ref ^= AOM_LAST2_FLAG;
+ ref ^= AOM_LAST3_FLAG;
+#endif // CONFIG_EXT_REFS
+ }
+
+ if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG;
+
+ if (flags & AOM_EFLAG_NO_REF_ARF) ref ^= AOM_ALT_FLAG;
+
+ av1_use_as_reference(cpi, ref);
+ }
+
+ if (flags &
+ (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+ AOM_EFLAG_FORCE_GF | AOM_EFLAG_FORCE_ARF)) {
+ int upd = AOM_REFFRAME_ALL;
+
+ if (flags & AOM_EFLAG_NO_UPD_LAST) {
+ upd ^= AOM_LAST_FLAG;
+#if CONFIG_EXT_REFS
+ upd ^= AOM_LAST2_FLAG;
+ upd ^= AOM_LAST3_FLAG;
+#endif // CONFIG_EXT_REFS
+ }
+
+ if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG;
+
+ if (flags & AOM_EFLAG_NO_UPD_ARF) upd ^= AOM_ALT_FLAG;
+
+ av1_update_reference(cpi, upd);
+ }
+
+ if (flags & AOM_EFLAG_NO_UPD_ENTROPY) {
+ av1_update_entropy(cpi, 0);
+ }
+}
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
new file mode 100644
index 000000000..4e7aef8fc
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -0,0 +1,883 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ENCODER_H_
+#define AV1_ENCODER_ENCODER_H_
+
+#include <stdio.h>
+
+#include "./aom_config.h"
+#include "aom/aomcx.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#if CONFIG_ANS
+#include "aom_dsp/ans.h"
+#include "aom_dsp/buf_ans.h"
+#endif
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/mbgraph.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/variance_tree.h"
+#if CONFIG_XIPHRC
+#include "av1/encoder/ratectrl_xiph.h"
+#endif
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/variance.h"
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ int nmvjointcost[MV_JOINTS];
+ int nmvcosts[2][MV_VALS];
+ int nmvcosts_hp[2][MV_VALS];
+
+#if CONFIG_REF_MV
+ int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS];
+ int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
+ int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
+#endif
+
+ // 0 = Intra, Last, GF, ARF
+ signed char last_ref_lf_deltas[TOTAL_REFS_PER_FRAME];
+ // 0 = ZERO_MV, MV
+ signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+
+ FRAME_CONTEXT fc;
+} CODING_CONTEXT;
+
+typedef enum {
+ // regular inter frame
+ REGULAR_FRAME = 0,
+ // alternate reference frame
+ ARF_FRAME = 1,
+ // overlay frame
+ OVERLAY_FRAME = 2,
+ // golden frame
+ GLD_FRAME = 3,
+#if CONFIG_EXT_REFS
+ // backward reference frame
+ BRF_FRAME = 4,
+ // extra alternate reference frame
+ EXT_ARF_FRAME = 5
+#endif
+} FRAME_CONTEXT_INDEX;
+
+typedef enum {
+ NORMAL = 0,
+ FOURFIVE = 1,
+ THREEFIVE = 2,
+ ONETWO = 3
+} AOM_SCALING;
+
+typedef enum {
+ // Good Quality Fast Encoding. The encoder balances quality with the amount of
+ // time it takes to encode the output. Speed setting controls how fast.
+ GOOD
+} MODE;
+
+typedef enum {
+ FRAMEFLAGS_KEY = 1 << 0,
+ FRAMEFLAGS_GOLDEN = 1 << 1,
+#if CONFIG_EXT_REFS
+ FRAMEFLAGS_BWDREF = 1 << 2,
+ FRAMEFLAGS_ALTREF = 1 << 3,
+#else
+ FRAMEFLAGS_ALTREF = 1 << 2,
+#endif // CONFIG_EXT_REFS
+} FRAMETYPE_FLAGS;
+
+typedef enum {
+ NO_AQ = 0,
+ VARIANCE_AQ = 1,
+ COMPLEXITY_AQ = 2,
+ CYCLIC_REFRESH_AQ = 3,
+#if CONFIG_DELTA_Q && !CONFIG_EXT_DELTA_Q
+ DELTA_AQ = 4,
+#endif
+ AQ_MODE_COUNT // This should always be the last member of the enum
+} AQ_MODE;
+#if CONFIG_EXT_DELTA_Q
+typedef enum {
+ NO_DELTA_Q = 0,
+ DELTA_Q_ONLY = 1,
+ DELTA_Q_LF = 2,
+ DELTAQ_MODE_COUNT // This should always be the last member of the enum
+} DELTAQ_MODE;
+#endif
+typedef enum {
+ RESIZE_NONE = 0, // No frame resizing allowed.
+ RESIZE_FIXED = 1, // All frames are coded at the specified dimension.
+ RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec.
+} RESIZE_TYPE;
+
+typedef struct AV1EncoderConfig {
+ BITSTREAM_PROFILE profile;
+ aom_bit_depth_t bit_depth; // Codec bit-depth.
+ int width; // width of data passed to the compressor
+ int height; // height of data passed to the compressor
+ unsigned int input_bit_depth; // Input bit depth.
+  double init_framerate; // set to the passed-in framerate
+  int64_t target_bandwidth; // bandwidth to be used in bits per second
+
+  int noise_sensitivity; // pre-processing blur: recommendation 0
+  int sharpness; // sharpening output: recommendation 0
+ int speed;
+ // maximum allowed bitrate for any intra frame in % of bitrate target.
+ unsigned int rc_max_intra_bitrate_pct;
+ // maximum allowed bitrate for any inter frame in % of bitrate target.
+ unsigned int rc_max_inter_bitrate_pct;
+ // percent of rate boost for golden frame in CBR mode.
+ unsigned int gf_cbr_boost_pct;
+
+ MODE mode;
+ int pass;
+
+ // Key Framing Operations
+ int auto_key; // autodetect cut scenes and set the keyframes
+ int key_freq; // maximum distance to key frame.
+
+ int lag_in_frames; // how many frames lag before we start encoding
+
+ // ----------------------------------------------------------------
+ // DATARATE CONTROL OPTIONS
+
+ // vbr, cbr, constrained quality or constant quality
+ enum aom_rc_mode rc_mode;
+
+ // buffer targeting aggressiveness
+ int under_shoot_pct;
+ int over_shoot_pct;
+
+ // buffering parameters
+ int64_t starting_buffer_level_ms;
+ int64_t optimal_buffer_level_ms;
+ int64_t maximum_buffer_size_ms;
+
+ // Frame drop threshold.
+ int drop_frames_water_mark;
+
+ // controlling quality
+ int fixed_q;
+ int worst_allowed_q;
+ int best_allowed_q;
+ int cq_level;
+ AQ_MODE aq_mode; // Adaptive Quantization mode
+#if CONFIG_EXT_DELTA_Q
+ DELTAQ_MODE deltaq_mode;
+#endif
+#if CONFIG_AOM_QM
+ int using_qm;
+ int qm_minlevel;
+ int qm_maxlevel;
+#endif
+#if CONFIG_TILE_GROUPS
+ unsigned int num_tile_groups;
+ unsigned int mtu;
+#endif
+
+#if CONFIG_TEMPMV_SIGNALING
+ unsigned int disable_tempmv;
+#endif
+ // Internal frame size scaling.
+ RESIZE_TYPE resize_mode;
+ int scaled_frame_width;
+ int scaled_frame_height;
+
+ // Enable feature to reduce the frame quantization every x frames.
+ int frame_periodic_boost;
+
+ // two pass datarate control
+ int two_pass_vbrbias; // two pass datarate control tweaks
+ int two_pass_vbrmin_section;
+ int two_pass_vbrmax_section;
+ // END DATARATE CONTROL OPTIONS
+ // ----------------------------------------------------------------
+
+ int enable_auto_arf;
+#if CONFIG_EXT_REFS
+ int enable_auto_brf; // (b)ackward (r)ef (f)rame
+#endif // CONFIG_EXT_REFS
+
+ /* Bitfield defining the error resiliency features to enable.
+ * Can provide decodable frames after losses in previous
+ * frames and decodable partitions after losses in the same frame.
+ */
+ unsigned int error_resilient_mode;
+
+ /* Bitfield defining the parallel decoding mode where the
+ * decoding in successive frames may be conducted in parallel
+ * just by decoding the frame headers.
+ */
+ unsigned int frame_parallel_decoding_mode;
+
+ int arnr_max_frames;
+ int arnr_strength;
+
+ int min_gf_interval;
+ int max_gf_interval;
+
+ int tile_columns;
+ int tile_rows;
+#if CONFIG_DEPENDENT_HORZTILES
+ int dependent_horz_tiles;
+#endif
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ int loop_filter_across_tiles_enabled;
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+
+ int max_threads;
+
+ aom_fixed_buf_t two_pass_stats_in;
+ struct aom_codec_pkt_list *output_pkt_list;
+
+#if CONFIG_FP_MB_STATS
+ aom_fixed_buf_t firstpass_mb_stats_in;
+#endif
+
+ aom_tune_metric tuning;
+ aom_tune_content content;
+#if CONFIG_HIGHBITDEPTH
+ int use_highbitdepth;
+#endif
+ aom_color_space_t color_space;
+ int color_range;
+ int render_width;
+ int render_height;
+
+#if CONFIG_EXT_PARTITION
+ aom_superblock_size_t superblock_size;
+#endif // CONFIG_EXT_PARTITION
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ int ans_window_size_log2;
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+#if CONFIG_EXT_TILE
+ unsigned int tile_encoding_mode;
+#endif // CONFIG_EXT_TILE
+
+ unsigned int motion_vector_unit_test;
+} AV1EncoderConfig;
+
+static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
+ return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
+}
+
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {
+ TileInfo tile_info;
+ int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+ int mode_map[BLOCK_SIZES][MAX_MODES];
+ int m_search_count;
+ int ex_search_count;
+#if CONFIG_PVQ
+ PVQ_QUEUE pvq_q;
+#endif
+#if CONFIG_CFL
+ CFL_CTX cfl;
+#endif
+#if CONFIG_EC_ADAPT
+ DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+#endif
+} TileDataEnc;
+
+typedef struct RD_COUNTS {
+ av1_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+ int64_t comp_pred_diff[REFERENCE_MODES];
+#if CONFIG_GLOBAL_MOTION
+ // Stores number of 4x4 blocks using global motion per reference frame.
+ int global_motion_used[TOTAL_REFS_PER_FRAME];
+#endif // CONFIG_GLOBAL_MOTION
+} RD_COUNTS;
+
+typedef struct ThreadData {
+ MACROBLOCK mb;
+ RD_COUNTS rd_counts;
+ FRAME_COUNTS *counts;
+
+ PICK_MODE_CONTEXT *leaf_tree;
+ PC_TREE *pc_tree;
+ PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+
+ VAR_TREE *var_tree;
+ VAR_TREE *var_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+} ThreadData;
+
+struct EncWorkerData;
+
+typedef struct ActiveMap {
+ int enabled;
+ int update;
+ unsigned char *map;
+} ActiveMap;
+
+#define NUM_STAT_TYPES 4 // types of stats: Y, U, V and ALL
+
+typedef struct IMAGE_STAT {
+ double stat[NUM_STAT_TYPES];
+ double worst;
+} ImageStat;
+
+#undef NUM_STAT_TYPES
+
+typedef struct {
+ int ref_count;
+ YV12_BUFFER_CONFIG buf;
+} EncRefCntBuffer;
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+typedef struct SUBFRAME_STATS {
+ av1_coeff_probs_model coef_probs_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+ av1_coeff_count coef_counts_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+ unsigned int eob_counts_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES][REF_TYPES]
+ [COEF_BANDS][COEFF_CONTEXTS];
+ av1_coeff_probs_model enc_starting_coef_probs[TX_SIZES][PLANE_TYPES];
+} SUBFRAME_STATS;
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+typedef struct TileBufferEnc {
+ uint8_t *data;
+ size_t size;
+} TileBufferEnc;
+
+typedef struct AV1_COMP {
+ QUANTS quants;
+ ThreadData td;
+ MB_MODE_INFO_EXT *mbmi_ext_base;
+ DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); // 8: SIMD width
+#if CONFIG_NEW_QUANT
+ DECLARE_ALIGNED(16, dequant_val_type_nuq,
+ y_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
+ DECLARE_ALIGNED(16, dequant_val_type_nuq,
+ uv_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
+#endif // CONFIG_NEW_QUANT
+ AV1_COMMON common;
+ AV1EncoderConfig oxcf;
+ struct lookahead_ctx *lookahead;
+ struct lookahead_entry *alt_ref_source;
+
+ YV12_BUFFER_CONFIG *source;
+ YV12_BUFFER_CONFIG *last_source; // NULL for first frame and alt_ref frames
+ YV12_BUFFER_CONFIG *un_scaled_source;
+ YV12_BUFFER_CONFIG scaled_source;
+ YV12_BUFFER_CONFIG *unscaled_last_source;
+ YV12_BUFFER_CONFIG scaled_last_source;
+
+ // Up-sampled reference buffers
+  // NOTE(zoeliu): Sufficient space must be allocated for the up-sampled
+  // reference buffers, including the up-sampled versions of all possibly
+  // stored references plus the currently coded frame itself.
+ EncRefCntBuffer upsampled_ref_bufs[REF_FRAMES + 1];
+ int upsampled_ref_idx[REF_FRAMES + 1];
+
+ // For a still frame, this flag is set to 1 to skip partition search.
+ int partition_search_skippable_frame;
+
+ int scaled_ref_idx[TOTAL_REFS_PER_FRAME];
+#if CONFIG_EXT_REFS
+ int lst_fb_idxes[LAST_REF_FRAMES];
+#else
+ int lst_fb_idx;
+#endif // CONFIG_EXT_REFS
+ int gld_fb_idx;
+#if CONFIG_EXT_REFS
+ int bwd_fb_idx; // BWD_REF_FRAME
+#endif // CONFIG_EXT_REFS
+ int alt_fb_idx;
+
+ int last_show_frame_buf_idx; // last show frame buffer index
+
+ int refresh_last_frame;
+ int refresh_golden_frame;
+#if CONFIG_EXT_REFS
+ int refresh_bwd_ref_frame;
+#endif // CONFIG_EXT_REFS
+ int refresh_alt_ref_frame;
+
+ int ext_refresh_frame_flags_pending;
+ int ext_refresh_last_frame;
+ int ext_refresh_golden_frame;
+ int ext_refresh_alt_ref_frame;
+
+ int ext_refresh_frame_context_pending;
+ int ext_refresh_frame_context;
+
+ YV12_BUFFER_CONFIG last_frame_uf;
+#if CONFIG_LOOP_RESTORATION
+ YV12_BUFFER_CONFIG last_frame_db;
+ YV12_BUFFER_CONFIG trial_frame_rst;
+ uint8_t *extra_rstbuf; // Extra buffers used in restoration search
+ RestorationInfo rst_search[MAX_MB_PLANE]; // Used for encoder side search
+#endif // CONFIG_LOOP_RESTORATION
+
+ // Ambient reconstruction err target for force key frames
+ int64_t ambient_err;
+
+ RD_OPT rd;
+
+ CODING_CONTEXT coding_context;
+
+#if CONFIG_REF_MV
+ int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
+ int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
+#endif
+
+ int nmvcosts[2][MV_VALS];
+ int nmvcosts_hp[2][MV_VALS];
+ int nmvsadcosts[2][MV_VALS];
+ int nmvsadcosts_hp[2][MV_VALS];
+
+ int64_t last_time_stamp_seen;
+ int64_t last_end_time_stamp_seen;
+ int64_t first_time_stamp_ever;
+
+ RATE_CONTROL rc;
+#if CONFIG_XIPHRC
+ od_rc_state od_rc;
+#endif
+ double framerate;
+
+  // NOTE(zoeliu): Any inter frame allows a maximum of REF_FRAMES inter
+  // references; including the currently coded frame itself, sufficient space
+  // must be allocated for the maximum possible number of frames.
+ int interp_filter_selected[REF_FRAMES + 1][SWITCHABLE];
+
+ struct aom_codec_pkt_list *output_pkt_list;
+
+ MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
+ int mbgraph_n_frames; // number of frames filled in the above
+ int static_mb_pct; // % forced skip mbs by segmentation
+ int ref_frame_flags;
+
+ SPEED_FEATURES sf;
+
+ unsigned int max_mv_magnitude;
+ int mv_step_param;
+
+ int allow_comp_inter_inter;
+
+ uint8_t *segmentation_map;
+
+ CYCLIC_REFRESH *cyclic_refresh;
+ ActiveMap active_map;
+
+ fractional_mv_step_fp *find_fractional_mv_step;
+ av1_full_search_fn_t full_search_sad; // It is currently unused.
+ av1_diamond_search_fn_t diamond_search_sad;
+ aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
+ uint64_t time_receive_data;
+ uint64_t time_compress_data;
+ uint64_t time_pick_lpf;
+ uint64_t time_encode_sb_row;
+
+#if CONFIG_FP_MB_STATS
+ int use_fp_mb_stats;
+#endif
+
+ TWO_PASS twopass;
+
+ YV12_BUFFER_CONFIG alt_ref_buffer;
+
+#if CONFIG_INTERNAL_STATS
+ unsigned int mode_chosen_counts[MAX_MODES];
+
+ int count;
+ uint64_t total_sq_error;
+ uint64_t total_samples;
+ ImageStat psnr;
+
+ double total_blockiness;
+ double worst_blockiness;
+
+ int bytes;
+ double summed_quality;
+ double summed_weights;
+ unsigned int tot_recode_hits;
+ double worst_ssim;
+
+ ImageStat fastssim;
+ ImageStat psnrhvs;
+
+ int b_calculate_blockiness;
+ int b_calculate_consistency;
+
+ double total_inconsistency;
+ double worst_consistency;
+ Ssimv *ssim_vars;
+ Metrics metrics;
+#endif
+ int b_calculate_psnr;
+
+ int droppable;
+
+ int initial_width;
+ int initial_height;
+ int initial_mbs; // Number of MBs in the full-size frame; to be used to
+ // normalize the firstpass stats. This will differ from the
+ // number of MBs in the current frame when the frame is
+ // scaled.
+
+ // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type.
+ DIFF *source_diff_var;
+ // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
+ unsigned int source_var_thresh;
+ int frames_till_next_var_check;
+
+ int frame_flags;
+
+ search_site_config ss_cfg;
+
+ int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+#if CONFIG_REF_MV
+ int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+ int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2];
+ int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+ int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+#endif
+
+ unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+#if CONFIG_EXT_INTER
+ unsigned int inter_compound_mode_cost[INTER_MODE_CONTEXTS]
+ [INTER_COMPOUND_MODES];
+ unsigned int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ int motion_mode_cost[BLOCK_SIZES][MOTION_MODES];
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ int motion_mode_cost1[BLOCK_SIZES][2];
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ int intra_uv_mode_cost[INTRA_MODES][INTRA_MODES];
+ int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+#if CONFIG_EXT_PARTITION_TYPES
+ int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX]
+ [EXT_PARTITION_TYPES];
+#else
+ int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX]
+ [PARTITION_TYPES];
+#endif
+#if CONFIG_PALETTE
+ int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
+ int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
+ int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+#endif // CONFIG_PALETTE
+ int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+#if CONFIG_EXT_TX
+ int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+#else
+ int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
+ int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
+#endif // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+ int intra_filter_cost[INTRA_FILTERS + 1][INTRA_FILTERS];
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_LOOP_RESTORATION
+ int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
+#endif // CONFIG_LOOP_RESTORATION
+#if CONFIG_GLOBAL_MOTION
+ int gmtype_cost[TRANS_TYPES];
+ int gmparams_cost[TOTAL_REFS_PER_FRAME];
+#endif // CONFIG_GLOBAL_MOTION
+
+ int multi_arf_allowed;
+ int multi_arf_enabled;
+ int multi_arf_last_grp_enabled;
+
+ TileDataEnc *tile_data;
+ int allocated_tiles; // Keep track of memory allocated for tiles.
+
+ TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+ unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+ TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+ int resize_pending;
+ int resize_state;
+ int resize_scale_num;
+ int resize_scale_den;
+ int resize_avg_qp;
+ int resize_buffer_underflow;
+ int resize_count;
+
+ // VAR_BASED_PARTITION thresholds
+ // 0 - threshold_128x128;
+ // 1 - threshold_64x64;
+ // 2 - threshold_32x32;
+ // 3 - threshold_16x16;
+ // 4 - threshold_8x8;
+ int64_t vbp_thresholds[5];
+ int64_t vbp_threshold_minmax;
+ int64_t vbp_threshold_sad;
+ BLOCK_SIZE vbp_bsize_min;
+
+ // VARIANCE_AQ segment map refresh
+ int vaq_refresh;
+
+ // Multi-threading
+ int num_workers;
+ AVxWorker *workers;
+ struct EncWorkerData *tile_thr_data;
+ AV1LfSync lf_row_sync;
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ SUBFRAME_STATS subframe_stats;
+ // TODO(yaowu): minimize the size of count buffers
+ SUBFRAME_STATS wholeframe_stats;
+ av1_coeff_stats branch_ct_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+#if CONFIG_ANS
+ struct BufAnsCoder buf_ans;
+#endif
+#if CONFIG_EXT_REFS
+ int refresh_frame_mask;
+ int existing_fb_idx_to_show;
+ int is_arf_filter_off[MAX_EXT_ARFS + 1];
+ int num_extra_arfs;
+ int arf_map[MAX_EXT_ARFS + 1];
+#endif // CONFIG_EXT_REFS
+#if CONFIG_GLOBAL_MOTION
+ int global_motion_search_done;
+#endif
+#if CONFIG_REFERENCE_BUFFER
+ SequenceHeader seq_params;
+#endif
+#if CONFIG_LV_MAP
+ tran_low_t *tcoeff_buf[MAX_MB_PLANE];
+#endif
+} AV1_COMP;
+
+void av1_initialize_enc(void);
+
+struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+ BufferPool *const pool);
+void av1_remove_compressor(AV1_COMP *cpi);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf);
+
+// Receive a frame's worth of data. The caller can assume that a copy of the
+// frame is made, not just a copy of the pointer.
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time_stamp);
+
+int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
+ size_t *size, uint8_t *dest, int64_t *time_stamp,
+ int64_t *time_end, int flush);
+
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
+int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags);
+
+void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags);
+
+int av1_copy_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+int av1_set_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+int av1_update_entropy(AV1_COMP *cpi, int update);
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
+ AOM_SCALING vert_mode);
+
+int av1_set_size_literal(AV1_COMP *cpi, unsigned int width,
+ unsigned int height);
+
+int av1_get_quantizer(struct AV1_COMP *cpi);
+
+void av1_full_to_model_counts(av1_coeff_count_model *model_count,
+ av1_coeff_count *full_count);
+
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+ return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
+
+static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi,
+ MV_REFERENCE_FRAME ref_frame) {
+#if CONFIG_EXT_REFS
+ if (ref_frame >= LAST_FRAME && ref_frame <= LAST3_FRAME)
+ return cpi->lst_fb_idxes[ref_frame - 1];
+#else
+ if (ref_frame == LAST_FRAME) return cpi->lst_fb_idx;
+#endif // CONFIG_EXT_REFS
+ else if (ref_frame == GOLDEN_FRAME)
+ return cpi->gld_fb_idx;
+#if CONFIG_EXT_REFS
+ else if (ref_frame == BWDREF_FRAME)
+ return cpi->bwd_fb_idx;
+#endif // CONFIG_EXT_REFS
+ else
+ return cpi->alt_fb_idx;
+}
+
+static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi,
+ MV_REFERENCE_FRAME ref_frame) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+ return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
+ const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf
+ : NULL;
+}
+
+static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(
+ const AV1_COMP *cpi, const MV_REFERENCE_FRAME ref_frame) {
+ // Use up-sampled reference frames.
+ const int buf_idx =
+ cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)];
+ return &cpi->upsampled_ref_bufs[buf_idx].buf;
+}
+
+#if CONFIG_EXT_REFS
+static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) {
+ MV_REFERENCE_FRAME ref_frame;
+ AV1_COMMON *const cm = &cpi->common;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ if (buf_idx == INVALID_IDX) continue;
+ if (frame_buf == &cm->buffer_pool->frame_bufs[buf_idx]) break;
+ }
+ return (ref_frame <= ALTREF_FRAME);
+}
+#endif // CONFIG_EXT_REFS
+
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) {
+  // We assume 3 planes, all at full resolution. We assume up to 1 token per
+  // pixel, and then allow headroom of 1 EOSB token per 4x4 block per plane,
+  // plus 1 EOSB token per plane.
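+  // For example, a single 16x16 MB budgets 16 * 16 == 256 tokens plus 17
+  // EOSB slots per plane, i.e. (256 + 17) * 3 == 819 tokens in total.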
+ return mb_rows * mb_cols * (16 * 16 + 17) * 3;
+}
+
+// Get the allocated token size for a tile. It does the same calculation as in
+// the frame token allocation.
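+// The shifts below convert mi units (4x4 when CONFIG_CB4X4 is on, 8x8
+// otherwise) into 16x16 MB units, with the +2/+1 terms rounding the tile
+// dimensions to roughly whole MBs before the per-MB budget is applied.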
+static INLINE unsigned int allocated_tokens(TileInfo tile) {
+#if CONFIG_CB4X4
+ int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 2) >> 2;
+ int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 2) >> 2;
+#else
+ int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1;
+ int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1;
+#endif
+
+ return get_token_alloc(tile_mb_rows, tile_mb_cols);
+}
+
+void av1_alloc_compressor_data(AV1_COMP *cpi);
+
+void av1_scale_references(AV1_COMP *cpi);
+
+void av1_update_reference_frames(AV1_COMP *cpi);
+
+void av1_set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv);
+#if CONFIG_TEMPMV_SIGNALING
+void av1_set_temporal_mv_prediction(AV1_COMP *cpi, int allow_tempmv_prediction);
+#endif
+
+YV12_BUFFER_CONFIG *av1_scale_if_required_fast(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled);
+
+YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled);
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
+
+static INLINE int is_altref_enabled(const AV1_COMP *const cpi) {
+ return cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.enable_auto_arf;
+}
+
+// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
+#if 0 && CONFIG_EXT_REFS
+static INLINE int is_bwdref_enabled(const AV1_COMP *const cpi) {
+  // NOTE(zoeliu): The enabling of bi-predictive frames depends on the use of
+  // alt_ref, and is currently off when the alt_ref interval is not
+  // sufficiently large.
+ return is_altref_enabled(cpi) && cpi->oxcf.enable_auto_brf;
+}
+#endif // CONFIG_EXT_REFS
+
+static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ MV_REFERENCE_FRAME ref0,
+ MV_REFERENCE_FRAME ref1) {
+ xd->block_refs[0] =
+ &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0];
+ xd->block_refs[1] =
+ &cm->frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME : 0];
+}
+
+static INLINE int get_chessboard_index(int frame_index) {
+ return frame_index & 0x1;
+}
+
+static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
+ return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
+}
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate);
+
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
+
+// Update up-sampled reference frame index.
+static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx,
+ int new_uidx) {
+ const int ref_index = *uidx;
+
+ if (ref_index >= 0 && ubufs[ref_index].ref_count > 0)
+ ubufs[ref_index].ref_count--;
+
+ *uidx = new_uidx;
+ ubufs[new_uidx].ref_count++;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_ENCODER_H_
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
new file mode 100644
index 000000000..3f71a4472
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/scan.h"
+#include "av1/common/blockd.h"
+#include "av1/common/idct.h"
+#include "av1/common/pred_common.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/subexp.h"
+#include "av1/encoder/tokenize.h"
+
+void av1_alloc_txb_buf(AV1_COMP *cpi) {
+#if 0
+ AV1_COMMON *cm = &cpi->common;
+ int mi_block_size = 1 << MI_SIZE_LOG2;
+ // TODO(angiebird): Make sure cm->subsampling_x/y is set correctly, and then
+ // use precise buffer size according to cm->subsampling_x/y
+ int pixel_stride = mi_block_size * cm->mi_cols;
+ int pixel_height = mi_block_size * cm->mi_rows;
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ CHECK_MEM_ERROR(
+ cm, cpi->tcoeff_buf[i],
+ aom_malloc(sizeof(*cpi->tcoeff_buf[i]) * pixel_stride * pixel_height));
+ }
+#else
+ (void)cpi;
+#endif
+}
+
+void av1_free_txb_buf(AV1_COMP *cpi) {
+#if 0
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ aom_free(cpi->tcoeff_buf[i]);
+ }
+#else
+ (void)cpi;
+#endif
+}
+
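+// Writes `level` with a 0th-order Exp-Golomb code: for x = level + 1 with
+// bit-length L, emit L - 1 zero bits followed by the L bits of x. For
+// example, level == 3 gives x == 4 (binary 100), so the bits written are
+// 0 0 1 0 0.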
+static void write_golomb(aom_writer *w, int level) {
+ int x = level + 1;
+ int i = x;
+ int length = 0;
+
+ while (i) {
+ i >>= 1;
+ ++length;
+ }
+ assert(length > 0);
+
+ for (i = 0; i < length - 1; ++i) aom_write_bit(w, 0);
+
+ for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01);
+}
+
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ aom_writer *w, int block, int plane,
+ const tran_low_t *tcoeff, uint16_t eob,
+ TXB_CTX *txb_ctx) {
+ aom_prob *nz_map;
+ aom_prob *eob_flag;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ const int16_t *scan = scan_order->scan;
+ int c;
+ int is_nz;
+ const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+ const int seg_eob = tx_size_2d[tx_size];
+ uint8_t txb_mask[32 * 32] = { 0 };
+ uint16_t update_eob = 0;
+
+ aom_write(w, eob == 0, cm->fc->txb_skip[tx_size][txb_ctx->txb_skip_ctx]);
+
+ if (eob == 0) return;
+#if CONFIG_TXK_SEL
+ av1_write_tx_type(cm, xd, block, plane, w);
+#endif
+
+ nz_map = cm->fc->nz_map[tx_size][plane_type];
+ eob_flag = cm->fc->eob_flag[tx_size][plane_type];
+
+ for (c = 0; c < eob; ++c) {
+ int coeff_ctx = get_nz_map_ctx(tcoeff, txb_mask, scan[c], bwl);
+ int eob_ctx = get_eob_ctx(tcoeff, scan[c], bwl);
+
+ tran_low_t v = tcoeff[scan[c]];
+ is_nz = (v != 0);
+
+ if (c == seg_eob - 1) break;
+
+ aom_write(w, is_nz, nz_map[coeff_ctx]);
+
+ if (is_nz) {
+ aom_write(w, c == (eob - 1), eob_flag[eob_ctx]);
+ }
+ txb_mask[scan[c]] = 1;
+ }
+
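+  // Base-level coding: pass i (i = 0 .. NUM_BASE_LEVELS - 1) walks the coded
+  // coefficients in reverse scan order and, for each coefficient whose
+  // magnitude exceeds i, signals whether the magnitude equals i + 1 (coding
+  // the sign at the same time); anything still unresolved is finished by the
+  // coeff_lps / Golomb loop that follows.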
+ int i;
+ for (i = 0; i < NUM_BASE_LEVELS; ++i) {
+ aom_prob *coeff_base = cm->fc->coeff_base[tx_size][plane_type][i];
+
+ update_eob = 0;
+ for (c = eob - 1; c >= 0; --c) {
+ tran_low_t v = tcoeff[scan[c]];
+ tran_low_t level = abs(v);
+ int sign = (v < 0) ? 1 : 0;
+ int ctx;
+
+ if (level <= i) continue;
+
+ ctx = get_base_ctx(tcoeff, scan[c], bwl, i + 1);
+
+ if (level == i + 1) {
+ aom_write(w, 1, coeff_base[ctx]);
+ if (c == 0) {
+ aom_write(w, sign, cm->fc->dc_sign[plane_type][txb_ctx->dc_sign_ctx]);
+ } else {
+ aom_write_bit(w, sign);
+ }
+ continue;
+ }
+ aom_write(w, 0, coeff_base[ctx]);
+ update_eob = AOMMAX(update_eob, c);
+ }
+ }
+
+ for (c = update_eob; c >= 0; --c) {
+ tran_low_t v = tcoeff[scan[c]];
+ tran_low_t level = abs(v);
+ int sign = (v < 0) ? 1 : 0;
+ int idx;
+ int ctx;
+
+ if (level <= NUM_BASE_LEVELS) continue;
+
+ if (c == 0) {
+ aom_write(w, sign, cm->fc->dc_sign[plane_type][txb_ctx->dc_sign_ctx]);
+ } else {
+ aom_write_bit(w, sign);
+ }
+
+    // The level is above NUM_BASE_LEVELS.
+ ctx = get_level_ctx(tcoeff, scan[c], bwl);
+ for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
+ if (level == (idx + 1 + NUM_BASE_LEVELS)) {
+ aom_write(w, 1, cm->fc->coeff_lps[tx_size][plane_type][ctx]);
+ break;
+ }
+ aom_write(w, 0, cm->fc->coeff_lps[tx_size][plane_type][ctx]);
+ }
+ if (idx < COEFF_BASE_RANGE) continue;
+
+    // Use a zeroth-order Golomb code for the residual level.
+ write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS);
+ }
+}
+
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+ aom_writer *w, int plane) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ struct macroblockd_plane *pd = &xd->plane[plane];
+
+#if CONFIG_CB4X4
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#else
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
+#endif
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ TX_SIZE tx_size = get_tx_size(plane, xd);
+ const int bkw = tx_size_wide_unit[tx_size];
+ const int bkh = tx_size_high_unit[tx_size];
+ const int step = tx_size_wide_unit[tx_size] * tx_size_high_unit[tx_size];
+ int row, col;
+ int block = 0;
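+  // Visit the plane's transform blocks in raster order; 'block' advances by
+  // the number of 4x4 units each transform block covers.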
+ for (row = 0; row < max_blocks_high; row += bkh) {
+ for (col = 0; col < max_blocks_wide; col += bkw) {
+ tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+ uint16_t eob = x->mbmi_ext->eobs[plane][block];
+ TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
+ x->mbmi_ext->dc_sign_ctx[plane][block] };
+ av1_write_coeffs_txb(cm, xd, w, block, plane, tcoeff, eob, &txb_ctx);
+ block += step;
+ }
+ }
+}
+
+static INLINE void get_base_ctx_set(const tran_low_t *tcoeffs,
+ int c, // raster order
+ const int bwl,
+ int ctx_set[NUM_BASE_LEVELS]) {
+ const int row = c >> bwl;
+ const int col = c - (row << bwl);
+ const int stride = 1 << bwl;
+ int mag[NUM_BASE_LEVELS] = { 0 };
+ int idx;
+ tran_low_t abs_coeff;
+ int i;
+
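+  // Accumulate, over the causal neighbours listed in base_ref_offset, how
+  // many coefficients exceed each base level; 'mag' records whether any
+  // neighbour with non-negative offsets exceeds the next level up.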
+ for (idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) {
+ int ref_row = row + base_ref_offset[idx][0];
+ int ref_col = col + base_ref_offset[idx][1];
+ int pos = (ref_row << bwl) + ref_col;
+
+ if (ref_row < 0 || ref_col < 0 || ref_row >= stride || ref_col >= stride)
+ continue;
+
+ abs_coeff = abs(tcoeffs[pos]);
+
+ for (i = 0; i < NUM_BASE_LEVELS; ++i) {
+ ctx_set[i] += abs_coeff > i;
+ if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0)
+ mag[i] |= abs_coeff > (i + 1);
+ }
+ }
+
+ for (i = 0; i < NUM_BASE_LEVELS; ++i) {
+ ctx_set[i] = (ctx_set[i] + 1) >> 1;
+
+ if (row == 0 && col == 0)
+ ctx_set[i] = (ctx_set[i] << 1) + mag[i];
+ else if (row == 0)
+ ctx_set[i] = 8 + (ctx_set[i] << 1) + mag[i];
+ else if (col == 0)
+ ctx_set[i] = 18 + (ctx_set[i] << 1) + mag[i];
+ else
+ ctx_set[i] = 28 + (ctx_set[i] << 1) + mag[i];
+ }
+ return;
+}
+
+int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
+ int block, TXB_CTX *txb_ctx) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const struct macroblock_plane *p = &x->plane[plane];
+ const int eob = p->eobs[block];
+ const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ int c, cost;
+ const int seg_eob = AOMMIN(eob, tx_size_2d[tx_size] - 1);
+ int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+ aom_prob *nz_map = xd->fc->nz_map[tx_size][plane_type];
+
+ const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+  // txb_mask is initialized only once, here. After that, it is set while
+  // coding the zero map and then reset while coding the level-1 info.
+ uint8_t txb_mask[32 * 32] = { 0 };
+ aom_prob(*coeff_base)[COEFF_BASE_CONTEXTS] =
+ xd->fc->coeff_base[tx_size][plane_type];
+
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ const int16_t *scan = scan_order->scan;
+
+ cost = 0;
+
+ if (eob == 0) {
+ cost = av1_cost_bit(xd->fc->txb_skip[tx_size][txb_skip_ctx], 1);
+ return cost;
+ }
+
+ cost = av1_cost_bit(xd->fc->txb_skip[tx_size][txb_skip_ctx], 0);
+
+#if CONFIG_TXK_SEL
+ cost += av1_tx_type_cost(cpi, xd, mbmi->sb_type, plane, tx_size, tx_type);
+#endif
+
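+  // Walk the coefficients in forward scan order and accumulate the cost of
+  // every symbol the writer would emit: significance map, sign, base levels,
+  // level remainder (raw Golomb bits) and EOB flag. A probability of 128
+  // prices a raw, equiprobable bit.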
+ for (c = 0; c < eob; ++c) {
+ tran_low_t v = qcoeff[scan[c]];
+ int is_nz = (v != 0);
+ int level = abs(v);
+
+ if (c < seg_eob) {
+ int coeff_ctx = get_nz_map_ctx(qcoeff, txb_mask, scan[c], bwl);
+ cost += av1_cost_bit(nz_map[coeff_ctx], is_nz);
+ }
+
+ if (is_nz) {
+ int ctx_ls[NUM_BASE_LEVELS] = { 0 };
+ int sign = (v < 0) ? 1 : 0;
+
+ // sign bit cost
+ if (c == 0) {
+ int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+
+ cost += av1_cost_bit(xd->fc->dc_sign[plane_type][dc_sign_ctx], sign);
+ } else {
+ cost += av1_cost_bit(128, sign);
+ }
+
+ get_base_ctx_set(qcoeff, scan[c], bwl, ctx_ls);
+
+ int i;
+ for (i = 0; i < NUM_BASE_LEVELS; ++i) {
+ if (level <= i) continue;
+
+ if (level == i + 1) {
+ cost += av1_cost_bit(coeff_base[i][ctx_ls[i]], 1);
+ continue;
+ }
+ cost += av1_cost_bit(coeff_base[i][ctx_ls[i]], 0);
+ }
+
+ if (level > NUM_BASE_LEVELS) {
+ int idx;
+ int ctx;
+
+ ctx = get_level_ctx(qcoeff, scan[c], bwl);
+
+ for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
+ if (level == (idx + 1 + NUM_BASE_LEVELS)) {
+ cost +=
+ av1_cost_bit(xd->fc->coeff_lps[tx_size][plane_type][ctx], 1);
+ break;
+ }
+ cost += av1_cost_bit(xd->fc->coeff_lps[tx_size][plane_type][ctx], 0);
+ }
+
+ if (idx >= COEFF_BASE_RANGE) {
+ // residual cost
+ int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+ int ri = r;
+ int length = 0;
+
+ while (ri) {
+ ri >>= 1;
+ ++length;
+ }
+
+ for (ri = 0; ri < length - 1; ++ri) cost += av1_cost_bit(128, 0);
+
+ for (ri = length - 1; ri >= 0; --ri)
+ cost += av1_cost_bit(128, (r >> ri) & 0x01);
+ }
+ }
+
+ if (c < seg_eob) {
+ int eob_ctx = get_eob_ctx(qcoeff, scan[c], bwl);
+ cost += av1_cost_bit(xd->fc->eob_flag[tx_size][plane_type][eob_ctx],
+ c == (eob - 1));
+ }
+ }
+
+ txb_mask[scan[c]] = 1;
+ }
+
+ return cost;
+}
+
+typedef struct TxbParams {
+ const AV1_COMP *cpi;
+ ThreadData *td;
+ int rate;
+} TxbParams;
+
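+// Returns the entropy context for a coded transform block: the sum of the
+// absolute quantized coefficients, clamped to COEFF_CONTEXT_MASK, with the
+// sign of the DC coefficient folded in via set_dc_sign().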
+int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob) {
+ const int16_t *scan = scan_order->scan;
+ int cul_level = 0;
+ int c;
+ for (c = 0; c < eob; ++c) {
+ cul_level += abs(qcoeff[scan[c]]);
+ }
+
+ cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
+ set_dc_sign(&cul_level, qcoeff[0]);
+
+ return cul_level;
+}
+
+static void update_txb_context(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ TxbParams *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const uint16_t eob = p->eobs[block];
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ (void)plane_bsize;
+
+ int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob);
+ av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row);
+}
+
+static void update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ TxbParams *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ int eob = p->eobs[block], update_eob = 0;
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+ const int segment_id = mbmi->segment_id;
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ const int16_t *scan = scan_order->scan;
+ const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ int c, i;
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col,
+ pd->left_context + blk_row, &txb_ctx);
+ const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+ int cul_level = 0;
+ unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2];
+ uint8_t txb_mask[32 * 32] = { 0 };
+
+ nz_map_count = &td->counts->nz_map[tx_size][plane_type];
+
+ memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+ ++td->counts->txb_skip[tx_size][txb_ctx.txb_skip_ctx][eob == 0];
+ x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx;
+
+ x->mbmi_ext->eobs[plane][block] = eob;
+
+ if (eob == 0) {
+ av1_set_contexts(xd, pd, plane, tx_size, 0, blk_col, blk_row);
+ return;
+ }
+
+#if CONFIG_TXK_SEL
+ av1_update_tx_type_count(cm, xd, block, plane, mbmi->sb_type, tx_size,
+ td->counts);
+#endif
+
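+  // Forward pass over the scan, mirroring the write side: accumulate
+  // significance-map counts and, for non-zero coefficients, EOB-flag counts.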
+ for (c = 0; c < eob; ++c) {
+ tran_low_t v = qcoeff[scan[c]];
+ int is_nz = (v != 0);
+ int coeff_ctx = get_nz_map_ctx(tcoeff, txb_mask, scan[c], bwl);
+ int eob_ctx = get_eob_ctx(tcoeff, scan[c], bwl);
+
+ if (c == seg_eob - 1) break;
+
+ ++(*nz_map_count)[coeff_ctx][is_nz];
+
+ if (is_nz) {
+ ++td->counts->eob_flag[tx_size][plane_type][eob_ctx][c == (eob - 1)];
+ }
+ txb_mask[scan[c]] = 1;
+ }
+
+  // Process in reverse scan order to count the coefficient levels and signs.
+ for (i = 0; i < NUM_BASE_LEVELS; ++i) {
+ update_eob = 0;
+ for (c = eob - 1; c >= 0; --c) {
+ tran_low_t v = qcoeff[scan[c]];
+ tran_low_t level = abs(v);
+ int ctx;
+
+ if (level <= i) continue;
+
+ ctx = get_base_ctx(tcoeff, scan[c], bwl, i + 1);
+
+ if (level == i + 1) {
+ ++td->counts->coeff_base[tx_size][plane_type][i][ctx][1];
+ if (c == 0) {
+ int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0];
+ x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
+ }
+ cul_level += level;
+ continue;
+ }
+ ++td->counts->coeff_base[tx_size][plane_type][i][ctx][0];
+ update_eob = AOMMAX(update_eob, c);
+ }
+ }
+
+ for (c = update_eob; c >= 0; --c) {
+ tran_low_t v = qcoeff[scan[c]];
+ tran_low_t level = abs(v);
+ int idx;
+ int ctx;
+
+ if (level <= NUM_BASE_LEVELS) continue;
+
+ cul_level += level;
+ if (c == 0) {
+ int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0];
+ x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
+ }
+
+    // The level is above NUM_BASE_LEVELS.
+ ctx = get_level_ctx(tcoeff, scan[c], bwl);
+ for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
+ if (level == (idx + 1 + NUM_BASE_LEVELS)) {
+ ++td->counts->coeff_lps[tx_size][plane_type][ctx][1];
+ break;
+ }
+ ++td->counts->coeff_lps[tx_size][plane_type][ctx][0];
+ }
+ if (idx < COEFF_BASE_RANGE) continue;
+
+    // The residual level uses a zeroth-order Golomb code made of raw bits,
+    // so there are no probability counts to update here.
+ }
+
+ cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
+
+  // Record the sign of the DC value.
+ set_dc_sign(&cul_level, tcoeff[0]);
+ av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row);
+
+#if CONFIG_ADAPT_SCAN
+ // Since dqcoeff is not available here, we pass qcoeff into
+ // av1_update_scan_count_facade(). The update behavior should be the same
+ // because av1_update_scan_count_facade() only cares if coefficients are zero
+ // or not.
+ av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type,
+ qcoeff, eob);
+#endif
+}
+
+void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int ctx = av1_get_skip_context(xd);
+ const int skip_inc =
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ struct TxbParams arg = { cpi, td, 0 };
+ (void)rate;
+ (void)mi_row;
+ (void)mi_col;
+ if (mbmi->skip) {
+ if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
+ reset_skip_context(xd, bsize);
+ return;
+ }
+
+ if (!dry_run) {
+ td->counts->skip[ctx][0] += skip_inc;
+ av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
+ update_and_record_txb_context, &arg);
+ } else if (dry_run == DRY_RUN_NORMAL) {
+ av1_foreach_transformed_block(xd, bsize, mi_row, mi_col, update_txb_context,
+ &arg);
+ } else {
+ printf("DRY_RUN_COSTCOEFFS is not supported yet\n");
+ assert(0);
+ }
+}
+
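+// Measures whether replacing *oldp with a probability estimated from
+// branch_cnt saves bits once the update signalling overhead is paid. When
+// 'update' is non-NULL only the savings and the update decision are tallied;
+// when it is NULL the decision (and, if positive, the new probability) is
+// written to the bitstream.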
+static void find_new_prob(unsigned int *branch_cnt, aom_prob *oldp,
+ int *savings, int *update, aom_writer *const bc) {
+ const aom_prob upd = DIFF_UPDATE_PROB;
+ int u = 0;
+ aom_prob newp = get_binary_prob(branch_cnt[0], branch_cnt[1]);
+ int s = av1_prob_diff_update_savings_search(branch_cnt, *oldp, &newp, upd, 1);
+
+ if (s > 0 && newp != *oldp) u = 1;
+
+ if (u)
+ *savings += s - (int)(av1_cost_zero(upd)); // TODO(jingning): 1?
+ else
+ *savings -= (int)(av1_cost_zero(upd));
+
+ if (update) {
+ ++update[u];
+ return;
+ }
+
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+}
+
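+// Two-phase probability update for one tx_size: a first pass over all the
+// coefficient-coding probabilities accumulates the potential savings, a
+// single bit then signals whether the model is updated at all, and if so a
+// second pass repeats the calls in write mode.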
+static void write_txb_probs(aom_writer *const bc, AV1_COMP *cpi,
+ TX_SIZE tx_size) {
+ FRAME_CONTEXT *fc = cpi->common.fc;
+ FRAME_COUNTS *counts = cpi->td.counts;
+ int savings = 0;
+ int update[2] = { 0, 0 };
+ int plane, ctx, level;
+
+ for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) {
+ find_new_prob(counts->txb_skip[tx_size][ctx], &fc->txb_skip[tx_size][ctx],
+ &savings, update, bc);
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
+ find_new_prob(counts->nz_map[tx_size][plane][ctx],
+ &fc->nz_map[tx_size][plane][ctx], &savings, update, bc);
+ }
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) {
+ find_new_prob(counts->eob_flag[tx_size][plane][ctx],
+ &fc->eob_flag[tx_size][plane][ctx], &savings, update, bc);
+ }
+ }
+
+ for (level = 0; level < NUM_BASE_LEVELS; ++level) {
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) {
+ find_new_prob(counts->coeff_base[tx_size][plane][level][ctx],
+ &fc->coeff_base[tx_size][plane][level][ctx], &savings,
+ update, bc);
+ }
+ }
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ find_new_prob(counts->coeff_lps[tx_size][plane][ctx],
+ &fc->coeff_lps[tx_size][plane][ctx], &savings, update, bc);
+ }
+ }
+
+  // Decide whether to update the model for this tx_size.
+ if (update[1] == 0 || savings < 0) {
+ aom_write_bit(bc, 0);
+ return;
+ }
+ aom_write_bit(bc, 1);
+
+ for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) {
+ find_new_prob(counts->txb_skip[tx_size][ctx], &fc->txb_skip[tx_size][ctx],
+ &savings, NULL, bc);
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
+ find_new_prob(counts->nz_map[tx_size][plane][ctx],
+ &fc->nz_map[tx_size][plane][ctx], &savings, NULL, bc);
+ }
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) {
+ find_new_prob(counts->eob_flag[tx_size][plane][ctx],
+ &fc->eob_flag[tx_size][plane][ctx], &savings, NULL, bc);
+ }
+ }
+
+ for (level = 0; level < NUM_BASE_LEVELS; ++level) {
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) {
+ find_new_prob(counts->coeff_base[tx_size][plane][level][ctx],
+ &fc->coeff_base[tx_size][plane][level][ctx], &savings,
+ NULL, bc);
+ }
+ }
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ find_new_prob(counts->coeff_lps[tx_size][plane][ctx],
+ &fc->coeff_lps[tx_size][plane][ctx], &savings, NULL, bc);
+ }
+ }
+}
+
+void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w) {
+ const TX_MODE tx_mode = cpi->common.tx_mode;
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+ int ctx, plane;
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane)
+ for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
+ av1_cond_prob_diff_update(w, &cpi->common.fc->dc_sign[plane][ctx],
+ cpi->td.counts->dc_sign[plane][ctx], 1);
+
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ write_txb_probs(w, cpi, tx_size);
+}
+
+#if CONFIG_TXK_SEL
+int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
+ int use_fast_coef_costing, RD_STATS *rd_stats) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ TX_TYPE txk_start = DCT_DCT;
+ TX_TYPE txk_end = TX_TYPES - 1;
+ TX_TYPE best_tx_type = txk_start;
+ int64_t best_rd = INT64_MAX;
+ const int coeff_ctx = combine_entropy_contexts(*a, *l);
+ TX_TYPE tx_type;
+ for (tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
+ if (plane == 0) mbmi->txk_type[block] = tx_type;
+ TX_TYPE ref_tx_type =
+ get_tx_type(get_plane_type(plane), xd, block, tx_size);
+ if (tx_type != ref_tx_type) {
+      // Use get_tx_type() to check whether tx_type is valid for the current
+      // mode; if it is not, skip it here.
+ continue;
+ }
+ RD_STATS this_rd_stats;
+ av1_invalid_rd_stats(&this_rd_stats);
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ coeff_ctx, AV1_XFORM_QUANT_FP);
+ if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id])
+ av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
+ av1_dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size,
+ &this_rd_stats.dist, &this_rd_stats.sse,
+ OUTPUT_HAS_PREDICTED_PIXELS);
+ const SCAN_ORDER *scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ this_rd_stats.rate = av1_cost_coeffs(
+ cpi, x, plane, block, tx_size, scan_order, a, l, use_fast_coef_costing);
+ int rd =
+ RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);
+ if (rd < best_rd) {
+ best_rd = rd;
+ *rd_stats = this_rd_stats;
+ best_tx_type = tx_type;
+ }
+ }
+ if (plane == 0) mbmi->txk_type[block] = best_tx_type;
+  // TODO(angiebird): Instead of re-calling av1_xform_quant and av1_optimize_b,
+  // copy the best result from the tx_type search loop above.
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ coeff_ctx, AV1_XFORM_QUANT_FP);
+ if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id])
+ av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
+ if (!is_inter_block(mbmi)) {
+    // Intra mode needs the decoded result so that the next transform block
+    // can use it for prediction.
+ av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
+ x->plane[plane].eobs[block]);
+ }
+ return best_rd;
+}
+#endif // CONFIG_TXK_SEL
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
new file mode 100644
index 000000000..552d47b54
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef ENCODETXB_H_
+#define ENCODETXB_H_
+
+#include "./aom_config.h"
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/bitwriter.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+void av1_alloc_txb_buf(AV1_COMP *cpi);
+void av1_free_txb_buf(AV1_COMP *cpi);
+int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
+ int block, TXB_CTX *txb_ctx);
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ aom_writer *w, int block, int plane,
+ const tran_low_t *tcoeff, uint16_t eob,
+ TXB_CTX *txb_ctx);
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+ aom_writer *w, int plane);
+int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob);
+void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ const int mi_row, const int mi_col);
+void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w);
+
+#if CONFIG_TXK_SEL
+int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
+ int use_fast_coef_costing, RD_STATS *rd_stats);
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // ENCODETXB_H_
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
new file mode 100644
index 000000000..34f0b9566
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+ int i, j, k, l, m, n;
+
+ for (i = 0; i < REFERENCE_MODES; i++)
+ td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
+
+#if CONFIG_GLOBAL_MOTION
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; i++)
+ td->rd_counts.global_motion_used[i] +=
+ td_t->rd_counts.global_motion_used[i];
+#endif // CONFIG_GLOBAL_MOTION
+
+ for (i = 0; i < TX_SIZES; i++)
+ for (j = 0; j < PLANE_TYPES; j++)
+ for (k = 0; k < REF_TYPES; k++)
+ for (l = 0; l < COEF_BANDS; l++)
+ for (m = 0; m < COEFF_CONTEXTS; m++)
+ for (n = 0; n < ENTROPY_TOKENS; n++)
+ td->rd_counts.coef_counts[i][j][k][l][m][n] +=
+ td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+}
+
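+// Worker hook: tiles are distributed round-robin, so the worker starting at
+// tile t also encodes t + num_workers, t + 2 * num_workers, and so on.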
+static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+ AV1_COMP *const cpi = thread_data->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int t;
+
+ (void)unused;
+
+ for (t = thread_data->start; t < tile_rows * tile_cols;
+ t += cpi->num_workers) {
+ int tile_row = t / tile_cols;
+ int tile_col = t % tile_cols;
+
+ av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
+ }
+
+ return 0;
+}
+
+void av1_encode_tiles_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ const int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols);
+ int i;
+
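+  // At most one worker per tile column, capped by the configured thread
+  // limit. Workers and their thread data are created once, on the first
+  // call; the last worker runs synchronously on the calling thread while
+  // the others are launched asynchronously.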
+ av1_init_tile_data(cpi);
+
+ // Only run once to create threads and allocate thread data.
+ if (cpi->num_workers == 0) {
+ CHECK_MEM_ERROR(cm, cpi->workers,
+ aom_malloc(num_workers * sizeof(*cpi->workers)));
+
+ CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+ aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+
+ for (i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+
+ ++cpi->num_workers;
+ winterface->init(worker);
+
+ thread_data->cpi = cpi;
+
+ if (i < num_workers - 1) {
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ aom_memalign(32, sizeof(*thread_data->td)));
+ av1_zero(*thread_data->td);
+
+ // Set up pc_tree.
+ thread_data->td->leaf_tree = NULL;
+ thread_data->td->pc_tree = NULL;
+ av1_setup_pc_tree(cm, thread_data->td);
+
+ // Set up variance tree if needed.
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+ av1_setup_var_tree(cm, thread_data->td);
+
+ // Allocate frame counters in thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td->counts,
+ aom_calloc(1, sizeof(*thread_data->td->counts)));
+
+ // Create threads
+ if (!winterface->reset(worker))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ } else {
+ // Main thread acts as a worker and uses the thread data in cpi.
+ thread_data->td = &cpi->td;
+ }
+
+ winterface->sync(worker);
+ }
+ }
+
+ for (i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *thread_data;
+
+ worker->hook = (AVxWorkerHook)enc_worker_hook;
+ worker->data1 = &cpi->tile_thr_data[i];
+ worker->data2 = NULL;
+ thread_data = (EncWorkerData *)worker->data1;
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ }
+ if (thread_data->td->counts != &cpi->common.counts) {
+ memcpy(thread_data->td->counts, &cpi->common.counts,
+ sizeof(cpi->common.counts));
+ }
+
+#if CONFIG_PALETTE
+ // Allocate buffers used by palette coding mode.
+ if (cpi->common.allow_screen_content_tools && i < num_workers - 1) {
+ MACROBLOCK *x = &thread_data->td->mb;
+ CHECK_MEM_ERROR(cm, x->palette_buffer,
+ aom_memalign(16, sizeof(*x->palette_buffer)));
+ }
+#endif // CONFIG_PALETTE
+ }
+
+ // Encode a frame
+ for (i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ if (i == cpi->num_workers - 1)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+
+ // Encoding ends.
+ for (i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ winterface->sync(worker);
+ }
+
+ for (i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+ // Accumulate counters.
+ if (i < cpi->num_workers - 1) {
+ av1_accumulate_frame_counts(&cm->counts, thread_data->td->counts);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h
new file mode 100644
index 000000000..6c30a3e5c
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ETHREAD_H_
+#define AV1_ENCODER_ETHREAD_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct ThreadData;
+
+typedef struct EncWorkerData {
+ struct AV1_COMP *cpi;
+ struct ThreadData *td;
+ int start;
+} EncWorkerData;
+
+void av1_encode_tiles_mt(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_ETHREAD_H_
diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c
new file mode 100644
index 000000000..007694a38
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/encoder/extend.h"
+
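+// Extends a plane into its border by pixel replication: each source row's
+// edge pixels are smeared into the left/right borders, then the completed
+// top and bottom rows (extensions included) are replicated upward and
+// downward, filling the corners as well.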
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+ uint8_t *dst, int dst_pitch, int w, int h,
+ int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i, linesize;
+
+  // Copy the left-most and right-most columns out.
+ const uint8_t *src_ptr1 = src;
+ const uint8_t *src_ptr2 = src + w - 1;
+ uint8_t *dst_ptr1 = dst - extend_left;
+ uint8_t *dst_ptr2 = dst + w;
+
+ for (i = 0; i < h; i++) {
+ memset(dst_ptr1, src_ptr1[0], extend_left);
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+ memset(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * h - extend_left;
+ linesize = extend_left + extend_right + w;
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize);
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize);
+ dst_ptr2 += dst_pitch;
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+ uint8_t *dst8, int dst_pitch, int w,
+ int h, int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i, linesize;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+  // Copy the left-most and right-most columns out.
+ const uint16_t *src_ptr1 = src;
+ const uint16_t *src_ptr2 = src + w - 1;
+ uint16_t *dst_ptr1 = dst - extend_left;
+ uint16_t *dst_ptr2 = dst + w;
+
+ for (i = 0; i < h; i++) {
+ aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0]));
+ aom_memset16(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * h - extend_left;
+ linesize = extend_left + extend_right + w;
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0]));
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0]));
+ dst_ptr2 += dst_pitch;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+ // Extend src frame in buffer
+ // Altref filtering assumes 16 pixel extension
+ const int et_y = 16;
+ const int el_y = 16;
+ // Motion estimation may use src block variance with the block size up
+ // to 64x64, so the right and bottom need to be extended to 64 multiple
+ // or up to 16, whichever is greater.
+ const int er_y =
+ AOMMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
+ src->y_crop_width;
+ const int eb_y =
+ AOMMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6)) -
+ src->y_crop_height;
+ const int uv_width_subsampling = (src->uv_width != src->y_width);
+ const int uv_height_subsampling = (src->uv_height != src->y_height);
+ const int et_uv = et_y >> uv_height_subsampling;
+ const int el_uv = el_y >> uv_width_subsampling;
+ const int eb_uv = eb_y >> uv_height_subsampling;
+ const int er_uv = er_y >> uv_width_subsampling;
+
+#if CONFIG_HIGHBITDEPTH
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width,
+ src->y_crop_height, et_y, el_y, eb_y, er_y);
+
+ highbd_copy_and_extend_plane(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+
+ highbd_copy_and_extend_plane(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width, src->y_crop_height,
+ et_y, el_y, eb_y, er_y);
+
+ copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
+ et_uv, el_uv, eb_uv, er_uv);
+
+ copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
+ et_uv, el_uv, eb_uv, er_uv);
+}
+
+void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int srcy,
+ int srcx, int srch, int srcw) {
+  // If the side does not touch the frame boundary, don't extend.
+ const int et_y = srcy ? 0 : dst->border;
+ const int el_y = srcx ? 0 : dst->border;
+ const int eb_y = srcy + srch != src->y_height
+ ? 0
+ : dst->border + dst->y_height - src->y_height;
+ const int er_y = srcx + srcw != src->y_width
+ ? 0
+ : dst->border + dst->y_width - src->y_width;
+ const int src_y_offset = srcy * src->y_stride + srcx;
+ const int dst_y_offset = srcy * dst->y_stride + srcx;
+
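+  // The chroma offsets and extension amounts below assume 4:2:0 subsampling
+  // (everything is halved with rounding).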
+ const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
+ const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
+ const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
+ const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
+ const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+ const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+ const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
+ const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
+
+ copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
+ dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch,
+ et_y, el_y, eb_y, er_y);
+
+ copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
+ dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
+ srch_uv, et_uv, el_uv, eb_uv, er_uv);
+
+ copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
+ dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
+ srch_uv, et_uv, el_uv, eb_uv, er_uv);
+}
diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h
new file mode 100644
index 000000000..48178b964
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_EXTEND_H_
+#define AV1_ENCODER_EXTEND_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+
+void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int srcy,
+ int srcx, int srch, int srcw);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_EXTEND_H_
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
new file mode 100644
index 000000000..e35a54ef2
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -0,0 +1,3026 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_scale/yv12config.h"
+
+#include "aom_dsp/variance.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h" // av1_setup_dst_planes()
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rd.h"
+
+#define OUTPUT_FPF 0
+#define ARF_STATS_OUTPUT 0
+
+#define GROUP_ADAPTIVE_MAXQ 1
+
+#define BOOST_BREAKOUT 12.5
+#define BOOST_FACTOR 12.5
+#define FACTOR_PT_LOW 0.70
+#define FACTOR_PT_HIGH 0.90
+#define FIRST_PASS_Q 10.0
+#define GF_MAX_BOOST 96.0
+#define INTRA_MODE_PENALTY 1024
+#define KF_MAX_BOOST 128.0
+#define MIN_ARF_GF_BOOST 240
+#define MIN_DECAY_FACTOR 0.01
+#define MIN_KF_BOOST 300
+#define NEW_MV_MODE_PENALTY 32
+#define DARK_THRESH 64
+#define DEFAULT_GRP_WEIGHT 1.0
+#define RC_FACTOR_MIN 0.75
+#define RC_FACTOR_MAX 1.75
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+#define NCOUNT_FRAME_II_THRESH 5.0
+
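+// Nudges a near-zero value away from zero so it can be used as a divisor.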
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
+
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+
+// Resets the first pass stats read position to the given location.
+static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
+ p->stats_in = position;
+}
+
+// Read frame stats at an offset from the current position.
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
+ if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
+ (offset < 0 && p->stats_in + offset < p->stats_in_start)) {
+ return NULL;
+ }
+
+ return &p->stats_in[offset];
+}
+
+static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
+ if (p->stats_in >= p->stats_in_end) return EOF;
+
+ *fps = *p->stats_in;
+ ++p->stats_in;
+ return 1;
+}
+
+static void output_stats(FIRSTPASS_STATS *stats,
+ struct aom_codec_pkt_list *pktlist) {
+ struct aom_codec_cx_pkt pkt;
+ pkt.kind = AOM_CODEC_STATS_PKT;
+ pkt.data.twopass_stats.buf = stats;
+ pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+ aom_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+ {
+ FILE *fpfile;
+ fpfile = fopen("firstpass.stt", "a");
+
+ fprintf(fpfile,
+ "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n",
+ stats->frame, stats->weight, stats->intra_error, stats->coded_error,
+ stats->sr_coded_error, stats->pcnt_inter, stats->pcnt_motion,
+ stats->pcnt_second_ref, stats->pcnt_neutral, stats->intra_skip_pct,
+ stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr,
+ stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv,
+ stats->MVcv, stats->mv_in_out_count, stats->new_mv_count,
+ stats->count, stats->duration);
+ fclose(fpfile);
+ }
+#endif
+}
+
+#if CONFIG_FP_MB_STATS
+static void output_fpmb_stats(uint8_t *this_frame_mb_stats, int stats_size,
+ struct aom_codec_pkt_list *pktlist) {
+ struct aom_codec_cx_pkt pkt;
+ pkt.kind = AOM_CODEC_FPMB_STATS_PKT;
+ pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
+ pkt.data.firstpass_mb_stats.sz = stats_size * sizeof(*this_frame_mb_stats);
+ aom_codec_pkt_list_add(pktlist, &pkt);
+}
+#endif
+
+static void zero_stats(FIRSTPASS_STATS *section) {
+ section->frame = 0.0;
+ section->weight = 0.0;
+ section->intra_error = 0.0;
+ section->coded_error = 0.0;
+ section->sr_coded_error = 0.0;
+ section->pcnt_inter = 0.0;
+ section->pcnt_motion = 0.0;
+ section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
+ section->intra_skip_pct = 0.0;
+ section->inactive_zone_rows = 0.0;
+ section->inactive_zone_cols = 0.0;
+ section->MVr = 0.0;
+ section->mvr_abs = 0.0;
+ section->MVc = 0.0;
+ section->mvc_abs = 0.0;
+ section->MVrv = 0.0;
+ section->MVcv = 0.0;
+ section->mv_in_out_count = 0.0;
+ section->new_mv_count = 0.0;
+ section->count = 0.0;
+ section->duration = 1.0;
+}
+
+static void accumulate_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame += frame->frame;
+ section->weight += frame->weight;
+ section->intra_error += frame->intra_error;
+ section->coded_error += frame->coded_error;
+ section->sr_coded_error += frame->sr_coded_error;
+ section->pcnt_inter += frame->pcnt_inter;
+ section->pcnt_motion += frame->pcnt_motion;
+ section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
+ section->intra_skip_pct += frame->intra_skip_pct;
+ section->inactive_zone_rows += frame->inactive_zone_rows;
+ section->inactive_zone_cols += frame->inactive_zone_cols;
+ section->MVr += frame->MVr;
+ section->mvr_abs += frame->mvr_abs;
+ section->MVc += frame->MVc;
+ section->mvc_abs += frame->mvc_abs;
+ section->MVrv += frame->MVrv;
+ section->MVcv += frame->MVcv;
+ section->mv_in_out_count += frame->mv_in_out_count;
+ section->new_mv_count += frame->new_mv_count;
+ section->count += frame->count;
+ section->duration += frame->duration;
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame -= frame->frame;
+ section->weight -= frame->weight;
+ section->intra_error -= frame->intra_error;
+ section->coded_error -= frame->coded_error;
+ section->sr_coded_error -= frame->sr_coded_error;
+ section->pcnt_inter -= frame->pcnt_inter;
+ section->pcnt_motion -= frame->pcnt_motion;
+ section->pcnt_second_ref -= frame->pcnt_second_ref;
+ section->pcnt_neutral -= frame->pcnt_neutral;
+ section->intra_skip_pct -= frame->intra_skip_pct;
+ section->inactive_zone_rows -= frame->inactive_zone_rows;
+ section->inactive_zone_cols -= frame->inactive_zone_cols;
+ section->MVr -= frame->MVr;
+ section->mvr_abs -= frame->mvr_abs;
+ section->MVc -= frame->MVc;
+ section->mvc_abs -= frame->mvc_abs;
+ section->MVrv -= frame->MVrv;
+ section->MVcv -= frame->MVcv;
+ section->mv_in_out_count -= frame->mv_in_out_count;
+ section->new_mv_count -= frame->new_mv_count;
+ section->count -= frame->count;
+ section->duration -= frame->duration;
+}
+
+// Calculate the linear size factor relative to a baseline of 1080P.
+#define BASE_SIZE 2073600.0 // 1920x1080
+static double get_linear_size_factor(const AV1_COMP *cpi) {
+ const double this_area = cpi->initial_width * cpi->initial_height;
+ return pow(this_area / BASE_SIZE, 0.5);
+}
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *this_frame) {
+ double active_pct;
+
+ active_pct =
+ 1.0 -
+ ((this_frame->intra_skip_pct / 2) +
+ ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows));
+ return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Calculate a modified error used in distributing bits between easier and
+// harder frames.
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err(const AV1_COMP *cpi,
+ const TWO_PASS *twopass,
+ const AV1EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame) {
+ const FIRSTPASS_STATS *const stats = &twopass->total_stats;
+ const double av_weight = stats->weight / stats->count;
+ const double av_err = (stats->coded_error * av_weight) / stats->count;
+ double modified_error =
+ av_err * pow(this_frame->coded_error * this_frame->weight /
+ DOUBLE_DIVIDE_CHECK(av_err),
+ oxcf->two_pass_vbrbias / 100.0);
+
+ // Correction for active area. Frames with a reduced active area
+ // (eg due to formatting bars) have a higher error per mb for the
+ // remaining active MBs. The correction here assumes that coding
+ // 0.5N blocks of complexity 2X is a little easier than coding N
+ // blocks of complexity X.
+ modified_error *=
+ pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
+ return fclamp(modified_error, twopass->modified_error_min,
+ twopass->modified_error_max);
+}
+
+// This function returns the maximum target rate per frame.
+static int frame_max_bits(const RATE_CONTROL *rc,
+ const AV1EncoderConfig *oxcf) {
+ int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
+ (int64_t)oxcf->two_pass_vbrmax_section) /
+ 100;
+ if (max_bits < 0)
+ max_bits = 0;
+ else if (max_bits > rc->max_frame_bandwidth)
+ max_bits = rc->max_frame_bandwidth;
+
+ return (int)max_bits;
+}
+
+void av1_init_first_pass(AV1_COMP *cpi) {
+ zero_stats(&cpi->twopass.total_stats);
+}
+
+void av1_end_first_pass(AV1_COMP *cpi) {
+ output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
+}
+
+static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+ switch (bsize) {
+ case BLOCK_8X8: return aom_mse8x8;
+ case BLOCK_16X8: return aom_mse16x8;
+ case BLOCK_8X16: return aom_mse8x16;
+ default: return aom_mse16x16;
+ }
+}
+
+static unsigned int get_prediction_error(BLOCK_SIZE bsize,
+ const struct buf_2d *src,
+ const struct buf_2d *ref) {
+ unsigned int sse;
+ const aom_variance_fn_t fn = get_block_variance_fn(bsize);
+ fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+ return sse;
+}
+
+#if CONFIG_HIGHBITDEPTH
+static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+ int bd) {
+ switch (bd) {
+ default:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_8_mse8x8;
+ case BLOCK_16X8: return aom_highbd_8_mse16x8;
+ case BLOCK_8X16: return aom_highbd_8_mse8x16;
+ default: return aom_highbd_8_mse16x16;
+ }
+ break;
+ case 10:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_10_mse8x8;
+ case BLOCK_16X8: return aom_highbd_10_mse16x8;
+ case BLOCK_8X16: return aom_highbd_10_mse8x16;
+ default: return aom_highbd_10_mse16x16;
+ }
+ break;
+ case 12:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_12_mse8x8;
+ case BLOCK_16X8: return aom_highbd_12_mse16x8;
+ case BLOCK_8X16: return aom_highbd_12_mse8x16;
+ default: return aom_highbd_12_mse16x16;
+ }
+ break;
+ }
+}
+
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+ const struct buf_2d *src,
+ const struct buf_2d *ref,
+ int bd) {
+ unsigned int sse;
+ const aom_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+ fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+ return sse;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+// Refine the motion search range according to the frame dimensions
+// for the first pass test.
+static int get_search_range(const AV1_COMP *cpi) {
+ int sr = 0;
+ const int dim = AOMMIN(cpi->initial_width, cpi->initial_height);
+
+ while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr;
+ return sr;
+}
+
+static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+ const MV *ref_mv, MV *best_mv,
+ int *best_motion_err) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV tmp_mv = { 0, 0 };
+ MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ int num00, tmp_err, n;
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+ const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
+
+ int step_param = 3;
+ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+ const int sr = get_search_range(cpi);
+ step_param += sr;
+ further_steps -= sr;
+
+ // Override the default variance function to use MSE.
+ v_fn_ptr.vf = get_block_variance_fn(bsize);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Center the initial step/diamond search on best mv.
+ tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
+ step_param, x->sadperbit16, &num00,
+ &v_fn_ptr, ref_mv);
+ if (tmp_err < INT_MAX)
+ tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+ if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ *best_mv = tmp_mv;
+ }
+
+ // Carry out further step/diamond searches as necessary.
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ --num00;
+ } else {
+ tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
+ step_param + n, x->sadperbit16, &num00,
+ &v_fn_ptr, ref_mv);
+ if (tmp_err < INT_MAX)
+ tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+ if (tmp_err < INT_MAX - new_mv_mode_penalty)
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ *best_mv = tmp_mv;
+ }
+ }
+ }
+}
+
+static BLOCK_SIZE get_bsize(const AV1_COMMON *cm, int mb_row, int mb_col) {
+ if (mi_size_wide[BLOCK_16X16] * mb_col + mi_size_wide[BLOCK_8X8] <
+ cm->mi_cols) {
+ return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] <
+ cm->mi_rows
+ ? BLOCK_16X16
+ : BLOCK_16X8;
+ } else {
+ return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] <
+ cm->mi_rows
+ ? BLOCK_8X16
+ : BLOCK_8X8;
+ }
+}
+
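+// Returns the lowest qindex whose real quantizer value reaches FIRST_PASS_Q,
+// clamped to the last valid index if none does.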
+static int find_fp_qindex(aom_bit_depth_t bit_depth) {
+ int i;
+
+ for (i = 0; i < QINDEX_RANGE; ++i)
+ if (av1_convert_qindex_to_q(i, bit_depth) >= FIRST_PASS_Q) break;
+
+ if (i == QINDEX_RANGE) i--;
+
+ return i;
+}
+
+static void set_first_pass_params(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (!cpi->refresh_alt_ref_frame &&
+ (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) {
+ cm->frame_type = KEY_FRAME;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ // Do not use periodic key frames.
+ cpi->rc.frames_to_key = INT_MAX;
+}
+
+#define UL_INTRA_THRESH 50
+#define INVALID_ROW -1
+void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
+ int mb_row, mb_col;
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo tile;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const PICK_MODE_CONTEXT *ctx =
+ &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
+ int i;
+
+ int recon_yoffset, recon_uvoffset;
+ int64_t intra_error = 0;
+ int64_t coded_error = 0;
+ int64_t sr_coded_error = 0;
+
+ int sum_mvr = 0, sum_mvc = 0;
+ int sum_mvr_abs = 0, sum_mvc_abs = 0;
+ int64_t sum_mvrs = 0, sum_mvcs = 0;
+ int mvcount = 0;
+ int intercount = 0;
+ int second_ref_count = 0;
+ const int intrapenalty = INTRA_MODE_PENALTY;
+ double neutral_count;
+ int intra_skip_count = 0;
+ int image_data_start_row = INVALID_ROW;
+ int new_mv_count = 0;
+ int sum_in_vectors = 0;
+ MV lastmv = { 0, 0 };
+ TWO_PASS *twopass = &cpi->twopass;
+ const MV zero_mv = { 0, 0 };
+ int recon_y_stride, recon_uv_stride, uv_mb_height;
+
+ YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+ const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+ double intra_factor;
+ double brightness_factor;
+ BufferPool *const pool = cm->buffer_pool;
+ const int qindex = find_fp_qindex(cm->bit_depth);
+ const int mb_scale = mi_size_wide[BLOCK_16X16];
+#if CONFIG_PVQ
+ PVQ_QUEUE pvq_q;
+ od_adapt_ctx pvq_context;
+#endif
+
+ // First pass code requires valid last and new frame buffers.
+ assert(new_yv12 != NULL);
+ assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ av1_zero_array(cpi->twopass.frame_mb_stats_buf, cpi->initial_mbs);
+ }
+#endif
+
+ aom_clear_system_state();
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+ x->e_mbd.mi[0]->mbmi.sb_type = BLOCK_16X16;
+
+ intra_factor = 0.0;
+ brightness_factor = 0.0;
+ neutral_count = 0.0;
+
+ set_first_pass_params(cpi);
+ av1_set_quantizer(cm, qindex);
+
+ av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+
+ av1_setup_src_planes(x, cpi->source, 0, 0);
+ av1_setup_dst_planes(xd->plane, cm->sb_size, new_yv12, 0, 0);
+
+ if (!frame_is_intra_only(cm)) {
+ av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+ }
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+
+#if CONFIG_CFL
+  // Don't store luma on the first pass since chroma is not computed.
+ x->cfl_store_y = 0;
+#endif
+ av1_frame_init_quantizer(cpi);
+
+#if CONFIG_PVQ
+ // For pass 1 of 2-pass encoding, init here for PVQ for now.
+ {
+ pvq_q.buf_len = 5000;
+ CHECK_MEM_ERROR(cm, pvq_q.buf,
+ aom_malloc(pvq_q.buf_len * sizeof(PVQ_INFO)));
+ pvq_q.curr_pos = 0;
+ x->pvq_coded = 0;
+
+ x->pvq_q = &pvq_q;
+
+    // TODO(yushin): Since this init step is also called in the 2nd pass,
+    // or in 1-pass encoding, consider factoring it out as a function.
+ // TODO(yushin)
+ // If activity masking is enabled, change below to OD_HVS_QM
+ x->daala_enc.qm = OD_FLAT_QM; // Hard coded. Enc/dec required to sync.
+ x->daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA;
+ x->daala_enc.pvq_norm_lambda_dc = OD_PVQ_LAMBDA;
+
+ od_init_qm(x->daala_enc.state.qm, x->daala_enc.state.qm_inv,
+ x->daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
+#if CONFIG_DAALA_EC
+ od_ec_enc_init(&x->daala_enc.w.ec, 65025);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+
+#if CONFIG_DAALA_EC
+ od_ec_enc_reset(&x->daala_enc.w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ }
+#endif
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ pd[i].dqcoeff = ctx->dqcoeff[i];
+#if CONFIG_PVQ
+ pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
+#endif
+ p[i].eobs = ctx->eobs[i];
+#if CONFIG_LV_MAP
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+#endif
+ }
+
+ av1_init_mv_probs(cm);
+#if CONFIG_ADAPT_SCAN
+ av1_init_scan_order(cm);
+#endif
+ av1_convolve_init(cm);
+#if CONFIG_PVQ
+ od_adapt_ctx_reset(&pvq_context, 0);
+ x->daala_enc.state.adapt = &pvq_context;
+#endif // CONFIG_PVQ
+ av1_initialize_rd_consts(cpi);
+
+ // Tiling is ignored in the first pass.
+ av1_tile_init(&tile, cm, 0, 0);
+
+ recon_y_stride = new_yv12->y_stride;
+ recon_uv_stride = new_yv12->uv_stride;
+ uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
+
+ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+ MV best_ref_mv = { 0, 0 };
+
+ // Reset above block coeffs.
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
+ x->mv_limits.row_max =
+ ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+ int this_error;
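+      // use_dc_pred below is true only when exactly one of mb_row/mb_col is
+      // zero, i.e. for blocks on the frame's first row or first column but
+      // not the top-left corner.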
+ const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+ const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+ double log_intra;
+ int level_sample;
+
+#if CONFIG_FP_MB_STATS
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
+
+ aom_clear_system_state();
+
+ xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+ xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+ xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+ xd->mi[0]->mbmi.sb_type = bsize;
+ xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+ set_mi_row_col(xd, &tile, mb_row * mb_scale, mi_size_high[bsize],
+ mb_col * mb_scale, mi_size_wide[bsize],
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize]);
+
+ // Do intra 16x16 prediction.
+ xd->mi[0]->mbmi.segment_id = 0;
+#if CONFIG_SUPERTX
+ xd->mi[0]->mbmi.segment_id_supertx = 0;
+#endif // CONFIG_SUPERTX
+ xd->lossless[xd->mi[0]->mbmi.segment_id] = (qindex == 0);
+ xd->mi[0]->mbmi.mode = DC_PRED;
+ xd->mi[0]->mbmi.tx_size =
+ use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
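+      // First-row/first-column blocks (use_dc_pred) get a larger transform
+      // above (16x16 or 8x8 depending on bsize); all other blocks use 4x4.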
+ av1_encode_intra_block_plane(cm, x, bsize, 0, 0, mb_row * 2, mb_col * 2);
+ this_error = aom_get_mb_ss(x->plane[0].src_diff);
+
+ // Keep a record of blocks that have almost no intra error residual
+ // (i.e. are in effect completely flat and untextured in the intra
+ // domain). In natural videos this is uncommon, but it is much more
+ // common in animations, graphics and screen content, so may be used
+ // as a signal to detect these types of content.
+ if (this_error < UL_INTRA_THRESH) {
+ ++intra_skip_count;
+ } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
+ image_data_start_row = mb_row;
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case AOM_BITS_8: break;
+ case AOM_BITS_10: this_error >>= 4; break;
+ case AOM_BITS_12: this_error >>= 8; break;
+ default:
+ assert(0 &&
+ "cm->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ return;
+ }
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ aom_clear_system_state();
+ log_intra = log(this_error + 1.0);
+ if (log_intra < 10.0)
+ intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+ else
+ intra_factor += 1.0;
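+      // e.g. a low-detail block with log_intra == 6.0 adds
+      // 1.0 + (10.0 - 6.0) * 0.05 = 1.2 to intra_factor above; any block
+      // with log_intra >= 10.0 adds exactly 1.0.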
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+ else
+ level_sample = x->plane[0].src.buf[0];
+#else
+ level_sample = x->plane[0].src.buf[0];
+#endif
+ if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
+ brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+ else
+ brightness_factor += 1.0;
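+      // e.g. a flat dark block whose sample is 20 below DARK_THRESH adds
+      // 1.0 + 0.01 * 20 = 1.2 to brightness_factor above; brighter or more
+      // detailed blocks add exactly 1.0.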
+
+ // Intrapenalty below deals with situations where the intra and inter
+ // error scores are very low (e.g. a plain black frame).
+      // We do not have special cases in first pass for 0,0 and nearest etc.,
+      // so all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick all or lots of
+ // INTRA modes and throw lots of key frames.
+ // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+ this_error += intrapenalty;
+
+ // Accumulate the intra error.
+ intra_error += (int64_t)this_error;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ // initialization
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ }
+#endif
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
+ x->mv_limits.col_max =
+ ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
+
+ if (!frame_is_intra_only(cm)) { // Do a motion search
+ int tmp_err, motion_error, raw_motion_error;
+ // Assume 0,0 motion with no mv overhead.
+ MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
+ struct buf_2d unscaled_last_source_buf_2d;
+
+ xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ } else {
+ motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
+ }
+#else
+ motion_error =
+ get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Compute the motion error of the 0,0 motion using the last source
+ // frame as the reference. Skip the further motion search on
+ // reconstructed frame if this error is small.
+ unscaled_last_source_buf_2d.buf =
+ cpi->unscaled_last_source->y_buffer + recon_yoffset;
+ unscaled_last_source_buf_2d.stride =
+ cpi->unscaled_last_source->y_stride;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ raw_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+ } else {
+ raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
+ }
+#else
+ raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
+#endif // CONFIG_HIGHBITDEPTH
+
+ // TODO(pengchong): Replace the hard-coded threshold
+ if (raw_motion_error > 25) {
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search.
+ first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
+
+ // If the current best reference mv is not centered on 0,0 then do a
+ // 0,0 based search as well.
+ if (!is_zero_mv(&best_ref_mv)) {
+ tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
+
+ if (tmp_err < motion_error) {
+ motion_error = tmp_err;
+ mv = tmp_mv;
+ }
+ }
+
+ // Search in an older reference frame.
+ if ((cm->current_video_frame > 1) && gld_yv12 != NULL) {
+ // Assume 0,0 motion with no mv overhead.
+ int gf_motion_error;
+
+ xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ gf_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ } else {
+ gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
+ }
+#else
+ gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
+#endif // CONFIG_HIGHBITDEPTH
+
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
+ &gf_motion_error);
+
+ if (gf_motion_error < motion_error && gf_motion_error < this_error)
+ ++second_ref_count;
+
+ // Reset to last frame as reference buffer.
+ xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+ xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+ xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+            // In accumulating a score for the older reference frame take the
+            // best of the motion predicted score and the intra coded error
+            // (just as is done in the accumulation of "coded_error" for the
+            // last frame).
+ if (gf_motion_error < this_error)
+ sr_coded_error += gf_motion_error;
+ else
+ sr_coded_error += this_error;
+ } else {
+ sr_coded_error += motion_error;
+ }
+ } else {
+ sr_coded_error += motion_error;
+ }
+
+ // Start by assuming that intra mode is best.
+ best_ref_mv.row = 0;
+ best_ref_mv.col = 0;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+        // intra prediction statistics
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+ if (this_error > FPMB_ERROR_LARGE_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
+ } else if (this_error < FPMB_ERROR_SMALL_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
+ }
+ }
+#endif
+
+ if (motion_error <= this_error) {
+ aom_clear_system_state();
+
+ // Keep a count of cases where the inter and intra were very close
+ // and very low. This helps with scene cut detection for example in
+ // cropped clips with black bars at the sides or top and bottom.
+ if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+ (this_error < (2 * intrapenalty))) {
+ neutral_count += 1.0;
+ // Also track cases where the intra is not much worse than the inter
+ // and use this in limiting the GF/arf group length.
+ } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+ (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+ neutral_count +=
+ (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+ }
+
+ mv.row *= 8;
+ mv.col *= 8;
+ this_error = motion_error;
+ xd->mi[0]->mbmi.mode = NEWMV;
+ xd->mi[0]->mbmi.mv[0].as_mv = mv;
+ xd->mi[0]->mbmi.tx_size = TX_4X4;
+ xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
+ xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME;
+ av1_build_inter_predictors_sby(xd, mb_row * mb_scale,
+ mb_col * mb_scale, NULL, bsize);
+ av1_encode_sby_pass1(cm, x, bsize);
+ sum_mvr += mv.row;
+ sum_mvr_abs += abs(mv.row);
+ sum_mvc += mv.col;
+ sum_mvc_abs += abs(mv.col);
+ sum_mvrs += mv.row * mv.row;
+ sum_mvcs += mv.col * mv.col;
+ ++intercount;
+
+ best_ref_mv = mv;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+            // inter prediction statistics
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+ if (this_error > FPMB_ERROR_LARGE_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_LARGE_MASK;
+ } else if (this_error < FPMB_ERROR_SMALL_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_SMALL_MASK;
+ }
+ }
+#endif
+
+ if (!is_zero_mv(&mv)) {
+ ++mvcount;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] &=
+ ~FPMB_MOTION_ZERO_MASK;
+ // check estimated motion direction
+ if (mv.col > 0 && mv.col >= abs(mv.row)) {
+ // right direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_RIGHT_MASK;
+ } else if (mv.row < 0 && abs(mv.row) >= abs(mv.col)) {
+ // up direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_UP_MASK;
+ } else if (mv.col < 0 && abs(mv.col) >= abs(mv.row)) {
+ // left direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_LEFT_MASK;
+ } else {
+ // down direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_DOWN_MASK;
+ }
+ }
+#endif
+
+            // Non-zero vector: was it different from the last non-zero vector?
+ if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count;
+ lastmv = mv;
+
+ // Does the row vector point inwards or outwards?
+ if (mb_row < cm->mb_rows / 2) {
+ if (mv.row > 0)
+ --sum_in_vectors;
+ else if (mv.row < 0)
+ ++sum_in_vectors;
+ } else if (mb_row > cm->mb_rows / 2) {
+ if (mv.row > 0)
+ ++sum_in_vectors;
+ else if (mv.row < 0)
+ --sum_in_vectors;
+ }
+
+ // Does the col vector point inwards or outwards?
+ if (mb_col < cm->mb_cols / 2) {
+ if (mv.col > 0)
+ --sum_in_vectors;
+ else if (mv.col < 0)
+ ++sum_in_vectors;
+ } else if (mb_col > cm->mb_cols / 2) {
+ if (mv.col > 0)
+ ++sum_in_vectors;
+ else if (mv.col < 0)
+ --sum_in_vectors;
+ }
+ }
+ }
+ } else {
+ sr_coded_error += (int64_t)this_error;
+ }
+ coded_error += (int64_t)this_error;
+
+ // Adjust to the next column of MBs.
+ x->plane[0].src.buf += 16;
+ x->plane[1].src.buf += uv_mb_height;
+ x->plane[2].src.buf += uv_mb_height;
+
+ recon_yoffset += 16;
+ recon_uvoffset += uv_mb_height;
+ }
+
+ // Adjust to the next row of MBs.
+ x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
+ x->plane[1].src.buf +=
+ uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
+    x->plane[2].src.buf +=
+        uv_mb_height * x->plane[2].src.stride - uv_mb_height * cm->mb_cols;
+
+ aom_clear_system_state();
+ }
+
+#if CONFIG_PVQ
+#if CONFIG_DAALA_EC
+ od_ec_enc_clear(&x->daala_enc.w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+
+ x->pvq_q->last_pos = x->pvq_q->curr_pos;
+ x->pvq_q->curr_pos = 0;
+ x->pvq_q = NULL;
+
+ aom_free(pvq_q.buf);
+#endif
+
+  // Clamp the image start to mb_rows / 2. This number of rows is discarded
+  // at both the top and bottom as dead data, so a start of mb_rows / 2 means
+  // the frame is entirely blank.
+ if ((image_data_start_row > cm->mb_rows / 2) ||
+ (image_data_start_row == INVALID_ROW)) {
+ image_data_start_row = cm->mb_rows / 2;
+ }
+  // Exclude any image dead zone (rows are discarded at both the top and
+  // bottom of the frame, hence the factor of 2 below).
+ if (image_data_start_row > 0) {
+ intra_skip_count =
+ AOMMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
+ }
+
+ {
+ FIRSTPASS_STATS fps;
+    // The minimum error here ensures some bit allocation to frames even
+ // in static regions. The allocation per MB declines for larger formats
+ // where the typical "real" energy per MB also falls.
+ // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+ // number of mbs is proportional to the image area.
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.MBs;
+ const double min_err = 200 * sqrt(num_mbs);
+
+ intra_factor = intra_factor / (double)num_mbs;
+ brightness_factor = brightness_factor / (double)num_mbs;
+ fps.weight = intra_factor * brightness_factor;
+
+ fps.frame = cm->current_video_frame;
+ fps.coded_error = (double)(coded_error >> 8) + min_err;
+ fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
+ fps.intra_error = (double)(intra_error >> 8) + min_err;
+ fps.count = 1.0;
+ fps.pcnt_inter = (double)intercount / num_mbs;
+ fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
+ fps.pcnt_neutral = (double)neutral_count / num_mbs;
+ fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
+ fps.inactive_zone_rows = (double)image_data_start_row;
+ fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix
+
+ if (mvcount > 0) {
+ fps.MVr = (double)sum_mvr / mvcount;
+ fps.mvr_abs = (double)sum_mvr_abs / mvcount;
+ fps.MVc = (double)sum_mvc / mvcount;
+ fps.mvc_abs = (double)sum_mvc_abs / mvcount;
+ fps.MVrv =
+ ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
+ fps.MVcv =
+ ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
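+      // MVrv and MVcv above are the population variances of the mv row/col
+      // components: (sum_sq - sum * sum / n) / n, i.e. E[v^2] - (E[v])^2.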
+ fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
+ fps.new_mv_count = new_mv_count;
+ fps.pcnt_motion = (double)mvcount / num_mbs;
+ } else {
+ fps.MVr = 0.0;
+ fps.mvr_abs = 0.0;
+ fps.MVc = 0.0;
+ fps.mvc_abs = 0.0;
+ fps.MVrv = 0.0;
+ fps.MVcv = 0.0;
+ fps.mv_in_out_count = 0.0;
+ fps.new_mv_count = 0.0;
+ fps.pcnt_motion = 0.0;
+ }
+
+ // TODO(paulwilkins): Handle the case when duration is set to 0, or
+ // something less than the full time between subsequent values of
+ // cpi->source_time_stamp.
+ fps.duration = (double)(source->ts_end - source->ts_start);
+
+ // Don't want to do output stats with a stack variable!
+ twopass->this_frame_stats = fps;
+ output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
+ accumulate_stats(&twopass->total_stats, &fps);
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ output_fpmb_stats(twopass->frame_mb_stats_buf, cpi->initial_mbs,
+ cpi->output_pkt_list);
+ }
+#endif
+ }
+
+  // Copy the previous Last Frame back into gf and arf buffers if
+ // the prediction is good enough... but also don't allow it to lag too far.
+ if ((twopass->sr_update_lag > 3) ||
+ ((cm->current_video_frame > 0) &&
+ (twopass->this_frame_stats.pcnt_inter > 0.20) &&
+ ((twopass->this_frame_stats.intra_error /
+ DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
+ if (gld_yv12 != NULL) {
+#if CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]);
+#else
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idx]);
+#endif // CONFIG_EXT_REFS
+ }
+ twopass->sr_update_lag = 1;
+ } else {
+ ++twopass->sr_update_lag;
+ }
+
+ aom_extend_frame_borders(new_yv12);
+
+// The frame we just compressed now becomes the last frame.
+#if CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]],
+ cm->new_fb_idx);
+#else
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+ cm->new_fb_idx);
+#endif // CONFIG_EXT_REFS
+
+ // Special case for the first frame. Copy into the GF buffer as a second
+ // reference.
+ if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) {
+#if CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]);
+#else
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idx]);
+#endif // CONFIG_EXT_REFS
+ }
+
+ // Use this to see what the first pass reconstruction looks like.
+ if (0) {
+ char filename[512];
+ FILE *recon_file;
+ snprintf(filename, sizeof(filename), "enc%04d.yuv",
+ (int)cm->current_video_frame);
+
+ if (cm->current_video_frame == 0)
+ recon_file = fopen(filename, "wb");
+ else
+ recon_file = fopen(filename, "ab");
+
+ (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
+ fclose(recon_file);
+ }
+
+ ++cm->current_video_frame;
+}
+
+static double calc_correction_factor(double err_per_mb, double err_divisor,
+ double pt_low, double pt_high, int q,
+ aom_bit_depth_t bit_depth) {
+ const double error_term = err_per_mb / err_divisor;
+
+ // Adjustment based on actual quantizer to power term.
+ const double power_term =
+ AOMMIN(av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
+
+ // Calculate correction factor.
+ if (power_term < 1.0) assert(error_term >= 0.0);
+
+ return fclamp(pow(error_term, power_term), 0.05, 5.0);
+}
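+
+// Illustrative numbers for calc_correction_factor (the real FACTOR_PT_*
+// constants are defined elsewhere in this file): err_per_mb == 200 and
+// err_divisor == 100.0 give an error term of 2.0; if
+// av1_convert_qindex_to_q(q, bit_depth) == 20.0, pt_low == 0.70 and
+// pt_high >= 0.90, the power term is 20.0 * 0.01 + 0.70 = 0.90, so the
+// returned factor is pow(2.0, 0.90) ~= 1.87 (within the [0.05, 5.0] clamp).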
+
+#define ERR_DIVISOR 100.0
+static int get_twopass_worst_quality(const AV1_COMP *cpi,
+ const double section_err,
+ double inactive_zone,
+ int section_target_bandwidth,
+ double group_weight_factor) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+
+ if (section_target_bandwidth <= 0) {
+ return rc->worst_quality; // Highest value allowed
+ } else {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.MBs;
+ const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+ const double av_err_per_mb = section_err / active_mbs;
+ const double speed_term = 1.0 + 0.04 * oxcf->speed;
+ double ediv_size_correction;
+ const int target_norm_bits_per_mb =
+ (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
+ active_mbs;
+ int q;
+
+ // Larger image formats are expected to be a little harder to code
+ // relatively given the same prediction error score. This in part at
+ // least relates to the increased size and hence coding overheads of
+ // motion vectors. Some account of this is made through adjustment of
+ // the error divisor.
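+    // Size factors below 1.0 map to -(1.0 / factor), so after the 4x scaling
+    // the subtraction from ERR_DIVISOR below effectively enlarges the divisor
+    // for small formats and shrinks it for large ones.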
+ ediv_size_correction =
+ AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi)));
+ if (ediv_size_correction < 1.0)
+ ediv_size_correction = -(1.0 / ediv_size_correction);
+ ediv_size_correction *= 4.0;
+
+    // Try to pick a max Q that will be high enough to encode the
+    // content at the given rate.
+ for (q = rc->best_quality; q < rc->worst_quality; ++q) {
+ const double factor = calc_correction_factor(
+ av_err_per_mb, ERR_DIVISOR - ediv_size_correction, FACTOR_PT_LOW,
+ FACTOR_PT_HIGH, q, cpi->common.bit_depth);
+ const int bits_per_mb = av1_rc_bits_per_mb(
+ INTER_FRAME, q, factor * speed_term * group_weight_factor,
+ cpi->common.bit_depth);
+ if (bits_per_mb <= target_norm_bits_per_mb) break;
+ }
+
+ // Restriction on active max q for constrained quality mode.
+ if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level);
+ return q;
+ }
+}
+
+static void setup_rf_level_maxq(AV1_COMP *cpi) {
+ int i;
+ RATE_CONTROL *const rc = &cpi->rc;
+ for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) {
+ int qdelta = av1_frame_type_qdelta(cpi, i, rc->worst_quality);
+ rc->rf_level_maxq[i] = AOMMAX(rc->worst_quality + qdelta, rc->best_quality);
+ }
+}
+
+void av1_init_subsampling(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int w = cm->width;
+ const int h = cm->height;
+ int i;
+
+ for (i = 0; i < FRAME_SCALE_STEPS; ++i) {
+ // Note: Frames with odd-sized dimensions may result from this scaling.
+ rc->frame_width[i] = (w * 16) / frame_scale_factor[i];
+ rc->frame_height[i] = (h * 16) / frame_scale_factor[i];
+ }
+
+ setup_rf_level_maxq(cpi);
+}
+
+void av1_calculate_coded_size(AV1_COMP *cpi, int *scaled_frame_width,
+ int *scaled_frame_height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ *scaled_frame_width = rc->frame_width[rc->frame_size_selector];
+ *scaled_frame_height = rc->frame_height[rc->frame_size_selector];
+}
+
+void av1_init_second_pass(AV1_COMP *cpi) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ double frame_rate;
+ FIRSTPASS_STATS *stats;
+
+ zero_stats(&twopass->total_stats);
+ zero_stats(&twopass->total_left_stats);
+
+ if (!twopass->stats_in_end) return;
+
+ stats = &twopass->total_stats;
+
+ *stats = *twopass->stats_in_end;
+ twopass->total_left_stats = *stats;
+
+ frame_rate = 10000000.0 * stats->count / stats->duration;
+ // Each frame can have a different duration, as the frame rate in the source
+ // isn't guaranteed to be constant. The frame rate prior to the first frame
+ // encoded in the second pass is a guess. However, the sum duration is not.
+ // It is calculated based on the actual durations of all frames from the
+ // first pass.
+ av1_new_framerate(cpi, frame_rate);
+ twopass->bits_left =
+ (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
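+  // Timestamps and durations are in units of 1/10,000,000 s (hence the
+  // 10000000.0 scale factors above): bits_left is target_bandwidth (bits/s)
+  // integrated over the total first-pass duration in seconds.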
+
+ // This variable monitors how far behind the second ref update is lagging.
+ twopass->sr_update_lag = 1;
+
+ // Scan the first pass file and calculate a modified total error based upon
+ // the bias/power function used to allocate bits.
+ {
+ const double avg_error =
+ stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
+ const FIRSTPASS_STATS *s = twopass->stats_in;
+ double modified_error_total = 0.0;
+ twopass->modified_error_min =
+ (avg_error * oxcf->two_pass_vbrmin_section) / 100;
+ twopass->modified_error_max =
+ (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+ while (s < twopass->stats_in_end) {
+ modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
+ ++s;
+ }
+ twopass->modified_error_left = modified_error_total;
+ }
+
+ // Reset the vbr bits off target counters
+ cpi->rc.vbr_bits_off_target = 0;
+ cpi->rc.vbr_bits_off_target_fast = 0;
+
+ cpi->rc.rate_error_estimate = 0;
+
+ // Static sequence monitor variables.
+ twopass->kf_zeromotion_pct = 100;
+ twopass->last_kfgroup_zeromotion_pct = 100;
+
+ if (oxcf->resize_mode != RESIZE_NONE) {
+ av1_init_subsampling(cpi);
+ }
+}
+
+#define SR_DIFF_PART 0.0015
+#define MOTION_AMP_PART 0.003
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_THRESH 0.1
+#define SR_DIFF_MAX 128.0
+
+static double get_sr_decay_rate(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *frame) {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+ double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
+ double sr_decay = 1.0;
+ double modified_pct_inter;
+ double modified_pcnt_intra;
+ const double motion_amplitude_factor =
+ frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
+
+ modified_pct_inter = frame->pcnt_inter;
+ if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+ (double)NCOUNT_FRAME_II_THRESH) {
+ modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+ }
+ modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+  if (sr_diff > LOW_SR_DIFF_THRESH) {
+ sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX);
+ sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
+ (MOTION_AMP_PART * motion_amplitude_factor) -
+ (INTRA_PART * modified_pcnt_intra);
+ }
+ return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
+}
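+
+// Illustrative numbers for get_sr_decay_rate: sr_diff == 40.0, a motion
+// amplitude factor of 2.0 and a modified intra pct of 10.0 give
+// sr_decay = 1.0 - 0.0015 * 40.0 - 0.003 * 2.0 - 0.005 * 10.0 = 0.884,
+// before the final AOMMAX floor is applied.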
+
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_zero_motion_factor(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *frame) {
+ const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
+ double sr_decay = get_sr_decay_rate(cpi, frame);
+ return AOMMIN(sr_decay, zero_motion_pct);
+}
+
+#define ZM_POWER_FACTOR 0.75
+
+static double get_prediction_decay_rate(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *next_frame) {
+ const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
+ const double zero_motion_factor =
+ (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
+ ZM_POWER_FACTOR));
+
+ return AOMMAX(zero_motion_factor,
+ (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
+}
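+
+// Illustrative numbers for get_prediction_decay_rate: a zero-motion pct
+// (pcnt_inter - pcnt_motion) of 0.81 gives a zero-motion factor of
+// 0.95 * pow(0.81, 0.75) ~= 0.81; with sr_decay_rate == 0.9 the function
+// returns AOMMAX(0.81, 0.9 + 0.1 * 0.81) ~= 0.98.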
+
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval,
+ int still_interval,
+ double loop_decay_rate,
+ double last_decay_rate) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+  // Break clause to detect very still sections after motion. For example,
+  // a static image after a fade or other transition, instead of a clean
+  // scene cut.
+ if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 &&
+ last_decay_rate < 0.9) {
+ int j;
+
+ // Look ahead a few frames to see if static condition persists...
+ for (j = 0; j < still_interval; ++j) {
+ const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
+ if (stats >= twopass->stats_in_end) break;
+
+ if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+ }
+
+ // Only if it does do we signal a transition to still.
+ return j == still_interval;
+ }
+
+ return 0;
+}
+
+// This function detects a flash through the high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this.
+static int detect_flash(const TWO_PASS *twopass, int offset) {
+ const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
+
+ // What we are looking for here is a situation where there is a
+ // brief break in prediction (such as a flash) but subsequent frames
+ // are reasonably well predicted by an earlier (pre flash) frame.
+ // The recovery after a flash is indicated by a high pcnt_second_ref
+ // compared to pcnt_inter.
+ return next_frame != NULL &&
+ next_frame->pcnt_second_ref > next_frame->pcnt_inter &&
+ next_frame->pcnt_second_ref >= 0.5;
+}
+
+// Update the motion related elements to the GF arf boost calculation.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+ double *mv_in_out,
+ double *mv_in_out_accumulator,
+ double *abs_mv_in_out_accumulator,
+ double *mv_ratio_accumulator) {
+ const double pct = stats->pcnt_motion;
+
+ // Accumulate Motion In/Out of frame stats.
+ *mv_in_out = stats->mv_in_out_count * pct;
+ *mv_in_out_accumulator += *mv_in_out;
+ *abs_mv_in_out_accumulator += fabs(*mv_in_out);
+
+  // Accumulate a measure of how uniform (or conversely how random) the motion
+  // field is (a ratio of the mean abs(mv) to the abs(mean mv)).
+ if (pct > 0.05) {
+ const double mvr_ratio =
+ fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+ const double mvc_ratio =
+ fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+
+ *mv_ratio_accumulator +=
+ pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs);
+ *mv_ratio_accumulator +=
+ pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs);
+ }
+}
+
+#define BASELINE_ERR_PER_MB 1000.0
+static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame,
+ double this_frame_mv_in_out, double max_boost) {
+ double frame_boost;
+ const double lq = av1_convert_qindex_to_q(
+ cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
+ const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
+ int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+
+ // Correct for any inactive region in the image
+ num_mbs = (int)AOMMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+
+ // Underlying boost factor is based on inter error ratio.
+ frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+ frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+
+  // Increase boost for frames where new data is coming into the frame
+  // (e.g. zoom out). Slightly reduce boost if there is a net balance of
+  // motion out of the frame (zoom in). The range for this_frame_mv_in_out
+  // is -1.0 to +1.0.
+  if (this_frame_mv_in_out > 0.0)
+    frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+  else
+    // In the extreme case (this_frame_mv_in_out == -1.0) the boost is halved.
+    frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+ return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames,
+ int *f_boost, int *b_boost) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ int i;
+ double boost_score = 0.0;
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ int arf_boost;
+ int flash_detected = 0;
+
+ // Search forward from the proposed arf/next gf position.
+ for (i = 0; i < f_frames; ++i) {
+ const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(
+ this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, i + offset) ||
+ detect_flash(twopass, i + offset + 1);
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+ decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : decay_accumulator;
+ }
+
+ boost_score +=
+ decay_accumulator *
+ calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ }
+
+ *f_boost = (int)boost_score;
+
+ // Reset for backward looking loop.
+ boost_score = 0.0;
+ mv_ratio_accumulator = 0.0;
+ decay_accumulator = 1.0;
+ this_frame_mv_in_out = 0.0;
+ mv_in_out_accumulator = 0.0;
+ abs_mv_in_out_accumulator = 0.0;
+
+ // Search backward towards last gf position.
+ for (i = -1; i >= -b_frames; --i) {
+ const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(
+ this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, i + offset) ||
+ detect_flash(twopass, i + offset + 1);
+
+ // Cumulative effect of prediction quality decay.
+ if (!flash_detected) {
+ decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+ decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : decay_accumulator;
+ }
+
+ boost_score +=
+ decay_accumulator *
+ calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ }
+ *b_boost = (int)boost_score;
+
+ arf_boost = (*f_boost + *b_boost);
+ if (arf_boost < ((b_frames + f_frames) * 20))
+ arf_boost = ((b_frames + f_frames) * 20);
+ arf_boost = AOMMAX(arf_boost, MIN_ARF_GF_BOOST);
+
+ return arf_boost;
+}
+
+// Calculate a section intra ratio used in setting max loop filter.
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+ const FIRSTPASS_STATS *end,
+ int section_length) {
+ const FIRSTPASS_STATS *s = begin;
+ double intra_error = 0.0;
+ double coded_error = 0.0;
+ int i = 0;
+
+ while (s < end && i < section_length) {
+ intra_error += s->intra_error;
+ coded_error += s->coded_error;
+ ++s;
+ ++i;
+ }
+
+ return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
+}
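+
+// e.g. for calculate_section_intra_ratio: a section whose summed intra_error
+// is 2400.0 against a summed coded_error of 800.0 yields a ratio of 3.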
+
+// Calculate the total bits to allocate in this GF/ARF group.
+static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
+ double gf_group_err) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const TWO_PASS *const twopass = &cpi->twopass;
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+ int64_t total_group_bits;
+
+ // Calculate the bits to be allocated to the group as a whole.
+ if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+ total_group_bits = (int64_t)(twopass->kf_group_bits *
+ (gf_group_err / twopass->kf_group_error_left));
+ } else {
+ total_group_bits = 0;
+ }
+
+  // Clamp odd edge cases.
+  if (total_group_bits < 0)
+    total_group_bits = 0;
+  else if (total_group_bits > twopass->kf_group_bits)
+    total_group_bits = twopass->kf_group_bits;
+
+ // Clip based on user supplied data rate variability limit.
+ if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
+ total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+
+ return total_group_bits;
+}
+
+// Calculate the number of extra bits to assign to boosted frames in a group.
+static int calculate_boost_bits(int frame_count, int boost,
+ int64_t total_group_bits) {
+ int allocation_chunks;
+
+ // return 0 for invalid inputs (could arise e.g. through rounding errors)
+ if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0;
+
+ allocation_chunks = (frame_count * 100) + boost;
+
+ // Prevent overflow.
+ if (boost > 1023) {
+ int divisor = boost >> 10;
+ boost /= divisor;
+ allocation_chunks /= divisor;
+ }
+
+ // Calculate the number of extra bits for use in the boosted frame or frames.
+ return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks),
+ 0);
+}
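+
+// Illustrative numbers for calculate_boost_bits: frame_count == 10,
+// boost == 300 and total_group_bits == 100000 give
+// allocation_chunks == 10 * 100 + 300 == 1300 and an extra allocation of
+// (300 * 100000) / 1300 ~= 23076 bits.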
+
+#if !CONFIG_EXT_REFS
+// Current limit on maximum number of active arfs in a GF/ARF group.
+#define MAX_ACTIVE_ARFS 2
+#define ARF_SLOT1 2
+#define ARF_SLOT2 3
+// This function provides an indirection for the choice of buffers for arfs.
+// At the moment the values are fixed but this may change as part of
+// the integration process with other codec features that swap buffers around.
+static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
+ arf_buffer_indices[0] = ARF_SLOT1;
+ arf_buffer_indices[1] = ARF_SLOT2;
+}
+#endif
+
+static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
+ double group_error, int gf_arf_bits) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ FIRSTPASS_STATS frame_stats;
+ int i;
+ int frame_index = 0;
+ int target_frame_size;
+ int key_frame;
+ const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+ int64_t total_group_bits = gf_group_bits;
+ double modified_err = 0.0;
+ double err_fraction;
+ int mid_boost_bits = 0;
+#if CONFIG_EXT_REFS
+  // The use of bi-predictive frames is only enabled when the following 3
+  // conditions are met:
+ // (1) Alt-ref is enabled;
+ // (2) The bi-predictive group interval is at least 2; and
+ // (3) The bi-predictive group interval is strictly smaller than the
+ // golden group interval.
+ const int is_bipred_enabled =
+ rc->source_alt_ref_pending && rc->bipred_group_interval &&
+ rc->bipred_group_interval <=
+ (rc->baseline_gf_interval - rc->source_alt_ref_pending);
+ int bipred_group_end = 0;
+ int bipred_frame_index = 0;
+
+ int arf_pos[MAX_EXT_ARFS + 1];
+ const unsigned char ext_arf_interval =
+ (unsigned char)(rc->baseline_gf_interval / (cpi->num_extra_arfs + 1) - 1);
+ int which_arf = cpi->num_extra_arfs;
+ int subgroup_interval[MAX_EXT_ARFS + 1];
+ int ext_arf_boost[MAX_EXT_ARFS];
+ int is_sg_bipred_enabled = is_bipred_enabled;
+ int accumulative_subgroup_interval = 0;
+#else
+ int mid_frame_idx;
+ unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ av1_zero_array(ext_arf_boost, MAX_EXT_ARFS);
+#endif // CONFIG_EXT_REFS
+
+ key_frame = cpi->common.frame_type == KEY_FRAME;
+
+#if !CONFIG_EXT_REFS
+ get_arf_buffer_indices(arf_buffer_indices);
+#endif // !CONFIG_EXT_REFS
+
+  // For key frames the frame target rate is already set and the key frame
+  // is also the golden frame.
+ if (!key_frame) {
+ if (rc->source_alt_ref_active) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->bit_allocation[frame_index] = 0;
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ gf_group->bit_allocation[frame_index] = gf_arf_bits;
+ }
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+#else
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+ gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
+#endif // CONFIG_EXT_REFS
+ // Step over the golden frame / overlay frame
+ if (EOF == input_stats(twopass, &frame_stats)) return;
+ }
+
+#if CONFIG_EXT_REFS
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+#endif // CONFIG_EXT_REFS
+
+ // Deduct the boost bits for arf (or gf if it is not a key frame)
+ // from the group total.
+ if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
+
+ frame_index++;
+
+#if CONFIG_EXT_REFS
+ bipred_frame_index++;
+#endif // CONFIG_EXT_REFS
+
+ // Store the bits to spend on the ARF if there is one.
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+ gf_group->arf_src_offset[frame_index] =
+ (unsigned char)(rc->baseline_gf_interval - 1);
+
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+// NOTE: "bipred_frame_index" stays unchanged for ARF_UPDATE frames.
+#else
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+ gf_group->arf_ref_idx[frame_index] =
+ arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
+ rc->source_alt_ref_active];
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ // Work out the ARFs' positions in this gf group
+    // NOTE(weitinglin): ALT_REFs are indexed inversely, but coded in display
+    // order (except for the original ARF). In the example of three ALT_REFs,
+    // we index the ALT_REFs as: KEY ----- ALT2 ----- ALT1 ----- ALT0
+ // but code them in the following order:
+ // KEY-ALT0-ALT2 ----- OVERLAY2-ALT1 ----- OVERLAY1 ----- OVERLAY0
+ arf_pos[0] =
+ frame_index + cpi->num_extra_arfs + gf_group->arf_src_offset[1] + 1;
+ for (i = 0; i < cpi->num_extra_arfs; ++i) {
+ arf_pos[i + 1] =
+ frame_index + (cpi->num_extra_arfs - i) * (ext_arf_interval + 2);
+ subgroup_interval[i] = arf_pos[i] - arf_pos[i + 1] - (i == 0 ? 1 : 2);
+ }
+ subgroup_interval[cpi->num_extra_arfs] = arf_pos[cpi->num_extra_arfs] -
+ frame_index -
+ (cpi->num_extra_arfs == 0 ? 1 : 2);
+#endif // CONFIG_EXT_REFS
+
+ ++frame_index;
+
+#if CONFIG_EXT_REFS
+ // Insert an extra ARF
+ if (cpi->num_extra_arfs) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+      // NOTE(weitinglin): GF_ARF_LOW is also used as an identifier
+      // for internal ALT_REFs:
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ ++frame_index;
+ }
+ accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs];
+#else
+ if (cpi->multi_arf_enabled) {
+ // Set aside a slot for a level 1 arf.
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] =
+ (unsigned char)((rc->baseline_gf_interval >> 1) - 1);
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1];
+ gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
+ ++frame_index;
+ }
+#endif  // CONFIG_EXT_REFS
+ }
+
+#if !CONFIG_EXT_REFS
+ // Define middle frame
+ mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
+#endif // !CONFIG_EXT_REFS
+
+ // Allocate bits to the other frames in the group.
+ for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
+#if !CONFIG_EXT_REFS
+ int arf_idx = 0;
+#endif // !CONFIG_EXT_REFS
+
+ if (EOF == input_stats(twopass, &frame_stats)) break;
+
+ modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
+
+ if (group_error > 0)
+ err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error);
+ else
+ err_fraction = 0.0;
+
+ target_frame_size = (int)((double)total_group_bits * err_fraction);
+
+ if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
+ mid_boost_bits += (target_frame_size >> 4);
+ target_frame_size -= (target_frame_size >> 4);
+#if !CONFIG_EXT_REFS
+ if (frame_index <= mid_frame_idx) arf_idx = 1;
+#endif // !CONFIG_EXT_REFS
+ }
+
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = which_arf;
+#else
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
+ gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
+#endif // CONFIG_EXT_REFS
+
+ target_frame_size =
+ clamp(target_frame_size, 0, AOMMIN(max_bits, (int)total_group_bits));
+
+#if CONFIG_EXT_REFS
+ // If we are going to have ARFs, check if we can have BWDREF in this
+ // subgroup.
+ if (rc->source_alt_ref_pending) {
+ is_sg_bipred_enabled =
+ is_bipred_enabled &&
+ (subgroup_interval[which_arf] > rc->bipred_group_interval);
+ }
+
+ // NOTE: BIDIR_PRED is only enabled when the length of the bi-predictive
+ // frame group interval is strictly smaller than that of the GOLDEN
+ // FRAME group interval.
+ // TODO(zoeliu): Currently BIDIR_PRED is only enabled when alt-ref is on.
+ if (is_sg_bipred_enabled && !bipred_group_end) {
+ const int cur_brf_src_offset = rc->bipred_group_interval - 1;
+
+ // --- BRF_UPDATE ---
+ if (bipred_frame_index == 1) {
+ gf_group->update_type[frame_index] = BRF_UPDATE;
+ gf_group->bidir_pred_enabled[frame_index] = 1;
+ gf_group->brf_src_offset[frame_index] = cur_brf_src_offset;
+ // --- LAST_BIPRED_UPDATE ---
+ } else if (bipred_frame_index == rc->bipred_group_interval) {
+ gf_group->update_type[frame_index] = LAST_BIPRED_UPDATE;
+ gf_group->bidir_pred_enabled[frame_index] = 1;
+ gf_group->brf_src_offset[frame_index] = 0;
+ // Reset the bi-predictive frame index.
+ bipred_frame_index = 0;
+ // --- BIPRED_UPDATE ---
+ } else {
+ gf_group->update_type[frame_index] = BIPRED_UPDATE;
+ gf_group->bidir_pred_enabled[frame_index] = 1;
+ gf_group->brf_src_offset[frame_index] = 0;
+ }
+
+ bipred_frame_index++;
+ // Check whether the next bi-predictive frame group would entirely be
+ // included within the current golden frame group.
+ // In addition, we need to avoid coding a BRF right before an ARF.
+ if (bipred_frame_index == 1 &&
+ (i + 2 + cur_brf_src_offset) >= accumulative_subgroup_interval) {
+ bipred_group_end = 1;
+ }
+ } else {
+#endif // CONFIG_EXT_REFS
+ gf_group->update_type[frame_index] = LF_UPDATE;
+#if CONFIG_EXT_REFS
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+ }
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ if (gf_group->update_type[frame_index] == BRF_UPDATE) {
+ // Boost up the allocated bits on BWDREF_FRAME
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->bit_allocation[frame_index] =
+ target_frame_size + (target_frame_size >> 2);
+ } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) {
+ // Press down the allocated bits on LAST_BIPRED_UPDATE frames
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->bit_allocation[frame_index] =
+ target_frame_size - (target_frame_size >> 1);
+ } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) {
+ // TODO(zoeliu): To investigate whether the allocated bits on
+ // BIPRED_UPDATE frames need to be further adjusted.
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->bit_allocation[frame_index] = target_frame_size;
+ } else {
+#endif // CONFIG_EXT_REFS
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->bit_allocation[frame_index] = target_frame_size;
+#if CONFIG_EXT_REFS
+ }
+#endif // CONFIG_EXT_REFS
+
+ ++frame_index;
+
+#if CONFIG_EXT_REFS
+ // Check if we need to update the ARF
+ if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 &&
+ frame_index > arf_pos[which_arf]) {
+ --which_arf;
+ accumulative_subgroup_interval += subgroup_interval[which_arf] + 1;
+      // We have reached a new subgroup; reset the bipred_group_end flag.
+ bipred_group_end = 0;
+ // Insert another extra ARF after the overlay frame
+ if (which_arf) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ ++frame_index;
+ }
+ }
+#endif // CONFIG_EXT_REFS
+ }
+
+// Note:
+// We need to configure the frame at the end of the sequence + 1 that will be
+// the start frame for the next group. Otherwise prior to the call to
+// av1_rc_get_second_pass_params() the data will be undefined.
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+#else
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+ gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
+#endif // CONFIG_EXT_REFS
+
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+
+#if CONFIG_EXT_REFS
+ if (cpi->num_extra_arfs) {
+ for (i = cpi->num_extra_arfs; i > 0; --i) {
+ int arf_pos_in_gf = (i == cpi->num_extra_arfs ? 2 : arf_pos[i + 1] + 1);
+ gf_group->bit_allocation[arf_pos_in_gf] =
+ gf_group->bit_allocation[arf_pos[i]];
+ gf_group->update_type[arf_pos[i]] = INTNL_OVERLAY_UPDATE;
+ gf_group->bit_allocation[arf_pos[i]] = 0;
+ gf_group->rf_level[arf_pos[i]] = INTER_NORMAL;
+ }
+ }
+#else
+ // Final setup for second arf and its overlay.
+ if (cpi->multi_arf_enabled) {
+ gf_group->bit_allocation[2] =
+ gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
+ gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
+ gf_group->bit_allocation[mid_frame_idx] = 0;
+ }
+#endif // CONFIG_EXT_REFS
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ }
+
+#if CONFIG_EXT_REFS
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+#endif // CONFIG_EXT_REFS
+
+ // Note whether multi-arf was enabled this group for next time.
+ cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled;
+}
+
+// Analyse and define a gf/arf group.
+static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+ int i;
+
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+ double gf_group_raw_error = 0.0;
+#endif
+ double gf_group_skip_pct = 0.0;
+ double gf_group_inactive_zone_rows = 0.0;
+ double gf_first_frame_err = 0.0;
+ double mod_frame_err = 0.0;
+
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double zero_motion_accumulator = 1.0;
+
+ double loop_decay_rate = 1.00;
+ double last_loop_decay_rate = 1.00;
+
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ double mv_ratio_accumulator_thresh;
+ unsigned int allow_alt_ref = is_altref_enabled(cpi);
+
+ int f_boost = 0;
+ int b_boost = 0;
+ int flash_detected;
+ int active_max_gf_interval;
+ int active_min_gf_interval;
+ int64_t gf_group_bits;
+ double gf_group_error_left;
+ int gf_arf_bits;
+ const int is_key_frame = frame_is_intra_only(cm);
+ const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
+
+ // Reset the GF group data structures unless this is a key
+ // frame in which case it will already have been done.
+ if (is_key_frame == 0) {
+ av1_zero(twopass->gf_group);
+ }
+
+ aom_clear_system_state();
+ av1_zero(next_frame);
+
+ // Load stats for the current frame.
+ mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Note the error of the frame at the start of the group. This will be
+ // the GF frame error if we code a normal gf.
+ gf_first_frame_err = mod_frame_err;
+
+ // If this is a key frame or the overlay from a previous arf then
+ // the error score / cost of this frame has already been accounted for.
+ if (arf_active_or_kf) {
+ gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error -= this_frame->coded_error;
+#endif
+ gf_group_skip_pct -= this_frame->intra_skip_pct;
+ gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
+ }
+
+ // Motion breakout threshold for loop below depends on image size.
+ mv_ratio_accumulator_thresh =
+ (cpi->initial_height + cpi->initial_width) / 4.0;
+
+ // Set a maximum and minimum interval for the GF group.
+ // If the image appears almost completely static we can extend beyond this.
+ {
+ int int_max_q = (int)(av1_convert_qindex_to_q(twopass->active_worst_quality,
+ cpi->common.bit_depth));
+ int int_lbq = (int)(av1_convert_qindex_to_q(rc->last_boosted_qindex,
+ cpi->common.bit_depth));
+
+ active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200);
+ if (active_min_gf_interval > rc->max_gf_interval)
+ active_min_gf_interval = rc->max_gf_interval;
+
+ if (cpi->multi_arf_allowed) {
+ active_max_gf_interval = rc->max_gf_interval;
+ } else {
+ // The value chosen depends on the active Q range. At low Q we have
+ // bits to spare and are better with a smaller interval and smaller boost.
+ // At high Q when there are few bits to spare we are better with a longer
+ // interval to spread the cost of the GF.
+ active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
+
+ // We have: active_min_gf_interval <= rc->max_gf_interval
+ if (active_max_gf_interval < active_min_gf_interval)
+ active_max_gf_interval = active_min_gf_interval;
+ else if (active_max_gf_interval > rc->max_gf_interval)
+ active_max_gf_interval = rc->max_gf_interval;
+ }
+ }
+
+ i = 0;
+ while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
+ ++i;
+
+ // Accumulate error score of frames in this gf group.
+ mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error += this_frame->coded_error;
+#endif
+ gf_group_skip_pct += this_frame->intra_skip_pct;
+ gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
+
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ // Test for the case where there is a brief flash but the prediction
+ // quality back to an earlier frame is then restored.
+ flash_detected = detect_flash(twopass, 0);
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(
+ &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ last_loop_decay_rate = loop_decay_rate;
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+
+ // Monitor for static sections.
+ zero_motion_accumulator = AOMMIN(
+ zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+
+ // Break clause to detect very still sections after motion. For example,
+ // a static image after a fade or other transition.
+ if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+ last_loop_decay_rate)) {
+ allow_alt_ref = 0;
+ break;
+ }
+ }
+
+ // Calculate a boost number for this frame.
+ boost_score +=
+ decay_accumulator *
+ calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+
+ // Break out conditions.
+ if (
+ // Break at active_max_gf_interval unless almost totally static.
+ (i >= (active_max_gf_interval + arf_active_or_kf) &&
+ zero_motion_accumulator < 0.995) ||
+ (
+ // Don't break out with a very short interval.
+ (i >= active_min_gf_interval + arf_active_or_kf) &&
+ (!flash_detected) &&
+ ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ||
+ ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
+ boost_score = old_boost_score;
+ break;
+ }
+
+ *this_frame = next_frame;
+ old_boost_score = boost_score;
+ }
+
+ twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+
+ // Was the group length constrained by the requirement for a new KF?
+ rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+
+  // Should we use the alternate reference frame?
+ if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
+ (i >= rc->min_gf_interval)) {
+ // Calculate the boost for alt ref.
+ rc->gfu_boost =
+ calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+ rc->source_alt_ref_pending = 1;
+
+ // Test to see if multi arf is appropriate.
+ cpi->multi_arf_enabled =
+ (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
+ (zero_motion_accumulator < 0.995))
+ ? 1
+ : 0;
+ } else {
+ rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
+ rc->source_alt_ref_pending = 0;
+ }
+
+ // Set the interval until the next gf.
+ rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+
+#if CONFIG_EXT_REFS
+ // Compute how many extra alt_refs we can have
+ cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
+ rc->source_alt_ref_pending);
+  // Currently at most two extra ARFs are allowed.
+ assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
+#endif // CONFIG_EXT_REFS
+
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+#if CONFIG_EXT_REFS
+ rc->bipred_group_interval = BFG_INTERVAL;
+  // The minimum bi-predictive frame group interval is 2; any smaller value
+  // disables bi-predictive grouping.
+  if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0;
+#endif // CONFIG_EXT_REFS
+
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
+
+ // Calculate the bits to be allocated to the gf/arf group as a whole
+ gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+
+#if GROUP_ADAPTIVE_MAXQ
+ // Calculate an estimate of the maxq needed for the group.
+  // We are more aggressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
+ const int vbr_group_bits_per_frame =
+ (int)(gf_group_bits / rc->baseline_gf_interval);
+ const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
+ const double group_av_skip_pct =
+ gf_group_skip_pct / rc->baseline_gf_interval;
+ const double group_av_inactive_zone =
+ ((gf_group_inactive_zone_rows * 2) /
+ (rc->baseline_gf_interval * (double)cm->mb_rows));
+
+ int tmp_q;
+ // rc factor is a weight factor that corrects for local rate control drift.
+ double rc_factor = 1.0;
+ if (rc->rate_error_estimate > 0) {
+ rc_factor = AOMMAX(RC_FACTOR_MIN,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ } else {
+ rc_factor = AOMMIN(RC_FACTOR_MAX,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ }
+ tmp_q = get_twopass_worst_quality(
+ cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+ vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor);
+ twopass->active_worst_quality =
+ AOMMAX(tmp_q, twopass->active_worst_quality >> 1);
+ }
+#endif
+
+ // Calculate the extra bits to be used for boosted frame(s)
+ gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost,
+ gf_group_bits);
+
+ // Adjust KF group bits and error remaining.
+ twopass->kf_group_error_left -= (int64_t)gf_group_err;
+
+ // If this is an arf update we want to remove the score for the overlay
+ // frame at the end which will usually be very cheap to code.
+ // The overlay frame has already, in effect, been coded so we want to spread
+ // the remaining bits among the other frames.
+ // For normal GFs remove the score for the GF itself unless this is
+ // also a key frame in which case it has already been accounted for.
+ if (rc->source_alt_ref_pending) {
+ gf_group_error_left = gf_group_err - mod_frame_err;
+ } else if (is_key_frame == 0) {
+ gf_group_error_left = gf_group_err - gf_first_frame_err;
+ } else {
+ gf_group_error_left = gf_group_err;
+ }
+
+ // Allocate bits to each of the frames in the GF group.
+ allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits);
+
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ if (cpi->common.frame_type != KEY_FRAME) {
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
+ }
+
+ if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+ // Default to starting GF groups at normal frame size.
+ cpi->rc.next_frame_size_selector = UNSCALED;
+ }
+}
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+#define SECOND_REF_USEAGE_THRESH 0.1
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letter box
+ // format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+ // In such a case, even if the frame is not a scene cut, coding a key frame
+ // may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+ // For real scene cuts we expect an improvement in the intra/inter error
+ // ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+
+static int test_candidate_kf(TWO_PASS *twopass,
+ const FIRSTPASS_STATS *last_frame,
+ const FIRSTPASS_STATS *this_frame,
+ const FIRSTPASS_STATS *next_frame) {
+ int is_viable_kf = 0;
+ double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+ double modified_pcnt_inter =
+ this_frame->pcnt_inter - this_frame->pcnt_neutral;
+
+ // Does the frame satisfy the primary criteria of a key frame?
+ // See above for an explanation of the test criteria.
+ // If so, then examine how well it predicts subsequent frames.
+ if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+ ((pcnt_intra > MIN_INTRA_LEVEL) &&
+ (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+ ((this_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+ KF_II_ERR_THRESHOLD) &&
+ ((fabs(last_frame->coded_error - this_frame->coded_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ (fabs(last_frame->intra_error - this_frame->intra_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ ((next_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+ II_IMPROVEMENT_THRESHOLD))))) {
+ int i;
+ const FIRSTPASS_STATS *start_pos = twopass->stats_in;
+ FIRSTPASS_STATS local_next_frame = *next_frame;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+
+ // Examine how well the key frame predicts subsequent frames.
+ for (i = 0; i < 16; ++i) {
+ double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+
+ if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
+
+ // Cumulative effect of decay in prediction quality.
+ if (local_next_frame.pcnt_inter > 0.85)
+ decay_accumulator *= local_next_frame.pcnt_inter;
+ else
+ decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+
+ // Keep a running total.
+ boost_score += (decay_accumulator * next_iiratio);
+
+ // Test various breakout clauses.
+ if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+ (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
+ 0.20) &&
+ (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 3.0) ||
+ (local_next_frame.intra_error < 200)) {
+ break;
+ }
+
+ old_boost_score = boost_score;
+
+ // Get the next frame details
+ if (EOF == input_stats(twopass, &local_next_frame)) break;
+ }
+
+ // If there is tolerable prediction for at least the next 3 frames then
+ // accept this candidate; otherwise discard this potential key frame and
+ // move on.
+ if (boost_score > 30.0 && (i > 3)) {
+ is_viable_kf = 1;
+ } else {
+ // Reset the file position
+ reset_fpf_position(twopass, start_pos);
+
+ is_viable_kf = 0;
+ }
+ }
+
+ return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8
+
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ int i, j;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const FIRSTPASS_STATS first_frame = *this_frame;
+ const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS last_frame;
+ int kf_bits = 0;
+ int loop_decay_counter = 0;
+ double decay_accumulator = 1.0;
+ double av_decay_accumulator = 0.0;
+ double zero_motion_accumulator = 1.0;
+ double boost_score = 0.0;
+ double kf_mod_err = 0.0;
+ double kf_group_err = 0.0;
+ double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+
+ av1_zero(next_frame);
+
+ cpi->common.frame_type = KEY_FRAME;
+
+ // Reset the GF group data structures.
+ av1_zero(*gf_group);
+
+ // Is this a forced key frame by interval?
+ rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+ // Clear the alt ref active flag and last group multi arf flags as they
+ // can never be set for a key frame.
+ rc->source_alt_ref_active = 0;
+ cpi->multi_arf_last_grp_enabled = 0;
+
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+
+ rc->frames_to_key = 1;
+
+ twopass->kf_group_bits = 0; // Total bits available to kf group
+ twopass->kf_group_error_left = 0; // Group modified error score.
+
+ kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Initialize the decay rates for the recent frames to check
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+
+ // Find the next keyframe.
+ i = 0;
+ while (twopass->stats_in < twopass->stats_in_end &&
+ rc->frames_to_key < cpi->oxcf.key_freq) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Load the next frame's stats.
+ last_frame = *this_frame;
+ input_stats(twopass, this_frame);
+
+ // Provided that we are not at the end of the file...
+ if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
+ double loop_decay_rate;
+
+ // Check for a scene cut.
+ if (test_candidate_kf(twopass, &last_frame, this_frame,
+ twopass->stats_in))
+ break;
+
+ // How fast is the prediction quality decaying?
+ loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
+
+ // Here we are interested in decay over the recent past only, rather than,
+ // as elsewhere, in the decay in prediction quality since the last GF or KF.
+ recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+ decay_accumulator *= recent_loop_decay[j];
+
+ // Special check for transition or high motion followed by a
+ // static scene.
+ if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
+ loop_decay_rate, decay_accumulator))
+ break;
+
+ // Step on to the next frame.
+ ++rc->frames_to_key;
+
+ // If we don't have a real key frame within the next two
+ // key_freq intervals then break out of the loop.
+ if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+ } else {
+ ++rc->frames_to_key;
+ }
+ ++i;
+ }
+
+ // If there is a max kf interval set by the user we must obey it.
+ // We already break out of the loop above at 2x max.
+ // This code centers the extra kf if the actual natural interval
+ // is between 1x and 2x.
+ if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
+ FIRSTPASS_STATS tmp_frame = first_frame;
+
+ rc->frames_to_key /= 2;
+
+ // Reset to the start of the group.
+ reset_fpf_position(twopass, start_position);
+
+ kf_group_err = 0.0;
+
+ // Rescan to get the correct error data for the forced kf group.
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
+ input_stats(twopass, &tmp_frame);
+ }
+ rc->next_key_frame_forced = 1;
+ } else if (twopass->stats_in == twopass->stats_in_end ||
+ rc->frames_to_key >= cpi->oxcf.key_freq) {
+ rc->next_key_frame_forced = 1;
+ } else {
+ rc->next_key_frame_forced = 0;
+ }
+
+ // Special case for the last key frame of the file.
+ if (twopass->stats_in >= twopass->stats_in_end) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ }
+
+ // Calculate the number of bits that should be assigned to the kf group.
+ if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+ // Maximum number of bits for a single normal frame (not key frame).
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+
+ // Maximum number of bits allocated to the key frame group.
+ int64_t max_grp_bits;
+
+ // Default allocation based on bits left and relative
+ // complexity of the section.
+ twopass->kf_group_bits = (int64_t)(
+ twopass->bits_left * (kf_group_err / twopass->modified_error_left));
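+ // For illustration (hypothetical numbers, not from any real clip): with
+ // bits_left = 10,000,000 and this kf group accounting for 20% of the
+ // remaining modified error, the default allocation above gives the group
+ // roughly 2,000,000 bits, before the per-frame clip applied below.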
+
+ // Clip based on maximum per frame rate defined by the user.
+ max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+ if (twopass->kf_group_bits > max_grp_bits)
+ twopass->kf_group_bits = max_grp_bits;
+ } else {
+ twopass->kf_group_bits = 0;
+ }
+ twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
+
+ // Reset the first pass file position.
+ reset_fpf_position(twopass, start_position);
+
+ // Scan through the kf group collating various stats used to determine
+ // how many bits to spend on it.
+ decay_accumulator = 1.0;
+ boost_score = 0.0;
+ for (i = 0; i < (rc->frames_to_key - 1); ++i) {
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ // Monitor for static sections.
+ zero_motion_accumulator = AOMMIN(zero_motion_accumulator,
+ get_zero_motion_factor(cpi, &next_frame));
+
+ // Not all frames in the group are necessarily used in calculating boost.
+ if ((i <= rc->max_gf_interval) ||
+ ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
+ const double frame_boost =
+ calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
+
+ // How fast is prediction quality decaying?
+ if (!detect_flash(twopass, 0)) {
+ const double loop_decay_rate =
+ get_prediction_decay_rate(cpi, &next_frame);
+ decay_accumulator *= loop_decay_rate;
+ decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR);
+ av_decay_accumulator += decay_accumulator;
+ ++loop_decay_counter;
+ }
+ boost_score += (decay_accumulator * frame_boost);
+ }
+ }
+ av_decay_accumulator /= (double)loop_decay_counter;
+
+ reset_fpf_position(twopass, start_position);
+
+ // Store the zero motion percentage
+ twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_position, twopass->stats_in_end, rc->frames_to_key);
+
+ // Apply various clamps for min and max boost
+ rc->kf_boost = (int)(av_decay_accumulator * boost_score);
+ rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
+ rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
+
+ // Work out how many bits to allocate for the key frame itself.
+ kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
+ twopass->kf_group_bits);
+
+ // Work out the fraction of the kf group bits reserved for the inter frames
+ // within the group after discounting the bits for the kf itself.
+ if (twopass->kf_group_bits) {
+ twopass->kfgroup_inter_fraction =
+ (double)(twopass->kf_group_bits - kf_bits) /
+ (double)twopass->kf_group_bits;
+ } else {
+ twopass->kfgroup_inter_fraction = 1.0;
+ }
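+ // Worked example with hypothetical numbers: kf_group_bits = 1,000,000 and
+ // kf_bits = 150,000 give kfgroup_inter_fraction = 850,000 / 1,000,000 =
+ // 0.85, i.e. 85% of the group's bits remain for the inter frames.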
+
+ twopass->kf_group_bits -= kf_bits;
+
+ // Save the bits to spend on the key frame.
+ gf_group->bit_allocation[0] = kf_bits;
+ gf_group->update_type[0] = KF_UPDATE;
+ gf_group->rf_level[0] = KF_STD;
+
+ // Note the total error score of the kf group minus the key frame itself.
+ twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+ // Adjust the count of total modified error left.
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
+ twopass->modified_error_left -= kf_group_err;
+
+ if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+ // Default to normal-sized frame on keyframes.
+ cpi->rc.next_frame_size_selector = UNSCALED;
+ }
+}
+
+// Define the reference buffers that will be updated post encode.
+static void configure_buffer_updates(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->twopass;
+
+ // Wei-Ting: Should we define another function to take care of the
+ // cpi->rc.is_$Source_Type flags, so that this function only does what the
+ // comment above describes?
+
+ cpi->rc.is_src_frame_alt_ref = 0;
+#if CONFIG_EXT_REFS
+ cpi->rc.is_bwd_ref_frame = 0;
+ cpi->rc.is_last_bipred_frame = 0;
+ cpi->rc.is_bipred_frame = 0;
+ cpi->rc.is_src_frame_ext_arf = 0;
+#endif // CONFIG_EXT_REFS
+
+ switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
+ case KF_UPDATE:
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 1;
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 1;
+ break;
+
+ case LF_UPDATE:
+#if CONFIG_EXT_REFS
+ // If we have extra ALT_REFs, we can use the farthest ALT (ALT0) as
+ // the BWD_REF.
+ if (cpi->num_extra_arfs) {
+ int tmp = cpi->bwd_fb_idx;
+
+ cpi->bwd_fb_idx = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->arf_map[0];
+ cpi->arf_map[0] = tmp;
+
+ cpi->rc.is_bwd_ref_frame = 1;
+ } else {
+ cpi->rc.is_bwd_ref_frame = 0;
+ }
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ break;
+
+ case GF_UPDATE:
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 0;
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 0;
+ break;
+
+ case OVERLAY_UPDATE:
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 1;
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 0;
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ break;
+
+ case ARF_UPDATE:
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 1;
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 1;
+ break;
+
+#if CONFIG_EXT_REFS
+ case BRF_UPDATE:
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 1;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_bwd_ref_frame = 1;
+ if (cpi->num_extra_arfs) {
+ // Allow BRF to use the farthest ALT_REF (ALT0) as BWD_REF by swapping
+ // the virtual indices.
+ // NOTE: The indices will be swapped back after this frame is encoded
+ // (in av1_update_reference_frames()).
+ int tmp = cpi->bwd_fb_idx;
+
+ cpi->bwd_fb_idx = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->arf_map[0];
+ cpi->arf_map[0] = tmp;
+ }
+ break;
+
+ case LAST_BIPRED_UPDATE:
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_last_bipred_frame = 1;
+ break;
+
+ case BIPRED_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_bipred_frame = 1;
+ break;
+
+ case INTNL_OVERLAY_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ cpi->rc.is_src_frame_ext_arf = 1;
+ break;
+#endif // CONFIG_EXT_REFS
+
+ default: assert(0); break;
+ }
+}
+
+static int is_skippable_frame(const AV1_COMP *cpi) {
+ // If no non-zero motion vector was detected for the current frame in the
+ // first pass, and the same holds for its previous and forward frames, then
+ // this frame can skip the partition check and have its partition size
+ // assigned according to the variance. (pcnt_inter - pcnt_motion == 1 means
+ // every block in a frame was inter coded with a zero motion vector.)
+ const TWO_PASS *const twopass = &cpi->twopass;
+
+ return (!frame_is_intra_only(&cpi->common) &&
+ twopass->stats_in - 2 > twopass->stats_in_start &&
+ twopass->stats_in < twopass->stats_in_end &&
+ (twopass->stats_in - 1)->pcnt_inter -
+ (twopass->stats_in - 1)->pcnt_motion ==
+ 1 &&
+ (twopass->stats_in - 2)->pcnt_inter -
+ (twopass->stats_in - 2)->pcnt_motion ==
+ 1 &&
+ twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
+
+void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ int frames_left;
+ FIRSTPASS_STATS this_frame;
+
+ int target_rate;
+
+ frames_left = (int)(twopass->total_stats.count - cm->current_video_frame);
+
+ if (!twopass->stats_in) return;
+
+ // If this is an arf frame then we don't want to read the stats file or
+ // advance the input pointer as we already have what we need.
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ configure_buffer_updates(cpi);
+ target_rate = gf_group->bit_allocation[gf_group->index];
+ target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
+ rc->base_frame_target = target_rate;
+
+ cm->frame_type = INTER_FRAME;
+
+ // Do the firstpass stats indicate that this frame is skippable for the
+ // partition search?
+ if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+ cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+ }
+
+ return;
+ }
+
+ aom_clear_system_state();
+
+ if (cpi->oxcf.rc_mode == AOM_Q) {
+ twopass->active_worst_quality = cpi->oxcf.cq_level;
+ } else if (cm->current_video_frame == 0) {
+ // Special case code for first frame.
+ const int section_target_bandwidth =
+ (int)(twopass->bits_left / frames_left);
+ const double section_length = twopass->total_left_stats.count;
+ const double section_error =
+ twopass->total_left_stats.coded_error / section_length;
+ const double section_intra_skip =
+ twopass->total_left_stats.intra_skip_pct / section_length;
+ const double section_inactive_zone =
+ (twopass->total_left_stats.inactive_zone_rows * 2) /
+ ((double)cm->mb_rows * section_length);
+ const int tmp_q = get_twopass_worst_quality(
+ cpi, section_error, section_intra_skip + section_inactive_zone,
+ section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
+ twopass->active_worst_quality = tmp_q;
+ twopass->baseline_active_worst_quality = tmp_q;
+ rc->ni_av_qi = tmp_q;
+ rc->last_q[INTER_FRAME] = tmp_q;
+ rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->bit_depth);
+ rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+ rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+ rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+ }
+
+ av1_zero(this_frame);
+ if (EOF == input_stats(twopass, &this_frame)) return;
+
+ // Set the frame content type flag.
+ if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
+ twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+ else
+ twopass->fr_content_type = FC_NORMAL;
+
+ // Keyframe and section processing.
+ if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+ FIRSTPASS_STATS this_frame_copy;
+ this_frame_copy = this_frame;
+ // Define next KF group and assign bits to it.
+ find_next_key_frame(cpi, &this_frame);
+ this_frame = this_frame_copy;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+
+ // Define a new GF/ARF group. (Should always enter here for key frames).
+ if (rc->frames_till_gf_update_due == 0) {
+ define_gf_group(cpi, &this_frame);
+
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+#if ARF_STATS_OUTPUT
+ {
+ FILE *fpfile;
+ fpfile = fopen("arf.stt", "a");
+ ++arf_count;
+ fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n", cm->current_video_frame,
+ rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
+ rc->gfu_boost);
+
+ fclose(fpfile);
+ }
+#endif
+ }
+
+ configure_buffer_updates(cpi);
+
+ // Do the firstpass stats indicate that this frame is skippable for the
+ // partition search?
+ if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+ cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+ }
+
+ target_rate = gf_group->bit_allocation[gf_group->index];
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate);
+ else
+ target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
+
+ rc->base_frame_target = target_rate;
+
+ {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.MBs;
+ // The multiplication by 256 reverses a scaling factor of (>> 8)
+ // applied when combining MB error values for the frame.
+ twopass->mb_av_energy =
+ log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
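+ // Example with made-up numbers: a 1280x720 frame has num_mbs = 3600, so
+ // intra_error = 36000 gives 36000 * 256 / 3600 = 2560 and
+ // mb_av_energy = log(2561) ~= 7.85.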
+ }
+
+ // Update the total stats remaining structure.
+ subtract_stats(&twopass->total_left_stats, &this_frame);
+}
+
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
+void av1_twopass_postencode_update(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int bits_used = rc->base_frame_target;
+
+ // VBR correction is done through rc->vbr_bits_off_target. Based on the
+ // sign of this value, a limited % adjustment is made to the target rate
+ // of subsequent frames, to try and push it back towards 0. This method
+ // is designed to prevent extreme behaviour at the end of a clip
+ // or group of frames.
+ rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+ twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0);
+
+ // Calculate the pct rc error.
+ if (rc->total_actual_bits) {
+ rc->rate_error_estimate =
+ (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
+ rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+ } else {
+ rc->rate_error_estimate = 0;
+ }
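+
+ // Example with hypothetical numbers: vbr_bits_off_target = -50,000 against
+ // total_actual_bits = 1,000,000 gives rate_error_estimate = -5, i.e. a 5%
+ // overshoot, and the clamp above keeps the estimate within [-100, 100].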
+
+ if (cpi->common.frame_type != KEY_FRAME) {
+ twopass->kf_group_bits -= bits_used;
+ twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+ }
+ twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
+
+ // Increment the gf group index ready for the next frame.
+ ++twopass->gf_group.index;
+
+ // If the rate control is drifting consider adjustment to min or maxq.
+ if ((cpi->oxcf.rc_mode != AOM_Q) &&
+ (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
+ !cpi->rc.is_src_frame_alt_ref) {
+ const int maxq_adj_limit =
+ rc->worst_quality - twopass->active_worst_quality;
+ const int minq_adj_limit =
+ (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+
+ // Undershoot.
+ if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+ --twopass->extend_maxq;
+ if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+ ++twopass->extend_minq;
+ // Overshoot.
+ } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+ --twopass->extend_minq;
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ ++twopass->extend_maxq;
+ } else {
+ // Adjustment for extreme local overshoot.
+ if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+ rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+ ++twopass->extend_maxq;
+
+ // Unwind undershoot or overshoot adjustment.
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ --twopass->extend_minq;
+ else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+ --twopass->extend_maxq;
+ }
+
+ twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
+ twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+ // If there is a big and unexpected undershoot then feed the extra
+ // bits back in quickly. One situation where this may happen is if a
+ // frame is unexpectedly almost perfectly predicted by the ARF or GF
+ // but not very well predicted by the previous frame.
+ if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+ int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+ if (rc->projected_frame_size < fast_extra_thresh) {
+ rc->vbr_bits_off_target_fast +=
+ fast_extra_thresh - rc->projected_frame_size;
+ rc->vbr_bits_off_target_fast =
+ AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+ // Fast adaptation of minQ if necessary to use up the extra bits.
+ if (rc->avg_frame_bandwidth) {
+ twopass->extend_minq_fast =
+ (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+ }
+ twopass->extend_minq_fast = AOMMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else if (rc->vbr_bits_off_target_fast) {
+ twopass->extend_minq_fast = AOMMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else {
+ twopass->extend_minq_fast = 0;
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
new file mode 100644
index 000000000..db459cc22
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_FIRSTPASS_H_
+#define AV1_ENCODER_FIRSTPASS_H_
+
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_FP_MB_STATS
+
+#define FPMB_DCINTRA_MASK 0x01
+
+#define FPMB_MOTION_ZERO_MASK 0x02
+#define FPMB_MOTION_LEFT_MASK 0x04
+#define FPMB_MOTION_RIGHT_MASK 0x08
+#define FPMB_MOTION_UP_MASK 0x10
+#define FPMB_MOTION_DOWN_MASK 0x20
+
+#define FPMB_ERROR_SMALL_MASK 0x40
+#define FPMB_ERROR_LARGE_MASK 0x80
+#define FPMB_ERROR_SMALL_TH 2000
+#define FPMB_ERROR_LARGE_TH 48000
+
+typedef struct {
+ uint8_t *mb_stats_start;
+ uint8_t *mb_stats_end;
+} FIRSTPASS_MB_STATS;
+#endif
+
+#if CONFIG_EXT_REFS
+// Length of the bi-predictive frame group (BFG)
+// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
+// number of bi-predictive frames.
+#define BFG_INTERVAL 2
+// The maximum number of extra ALT_REFs
+// NOTE: This number cannot be greater than 2 or the reference frame buffer will
+// overflow.
+#define MAX_EXT_ARFS 2
+#define MIN_EXT_ARF_INTERVAL 4
+#endif // CONFIG_EXT_REFS
+
+#define VLOW_MOTION_THRESHOLD 950
+
+typedef struct {
+ double frame;
+ double weight;
+ double intra_error;
+ double coded_error;
+ double sr_coded_error;
+ double pcnt_inter;
+ double pcnt_motion;
+ double pcnt_second_ref;
+ double pcnt_neutral;
+ double intra_skip_pct;
+ double inactive_zone_rows; // Image mask rows top and bottom.
+ double inactive_zone_cols; // Image mask columns at left and right edges.
+ double MVr;
+ double mvr_abs;
+ double MVc;
+ double mvc_abs;
+ double MVrv;
+ double MVcv;
+ double mv_in_out_count;
+ double new_mv_count;
+ double duration;
+ double count;
+} FIRSTPASS_STATS;
+
+typedef enum {
+ KF_UPDATE = 0,
+ LF_UPDATE = 1,
+ GF_UPDATE = 2,
+ ARF_UPDATE = 3,
+ OVERLAY_UPDATE = 4,
+#if CONFIG_EXT_REFS
+ BRF_UPDATE = 5, // Backward Reference Frame
+ LAST_BIPRED_UPDATE = 6, // Last Bi-predictive Frame
+ BIPRED_UPDATE = 7, // Bi-predictive Frame, but not the last one
+ INTNL_OVERLAY_UPDATE = 8, // Internal Overlay Frame
+ FRAME_UPDATE_TYPES = 9
+#else
+ FRAME_UPDATE_TYPES = 5
+#endif // CONFIG_EXT_REFS
+} FRAME_UPDATE_TYPE;
+
+#define FC_ANIMATION_THRESH 0.15
+typedef enum {
+ FC_NORMAL = 0,
+ FC_GRAPHICS_ANIMATION = 1,
+ FRAME_CONTENT_TYPES = 2
+} FRAME_CONTENT_TYPE;
+
+typedef struct {
+ unsigned char index;
+ RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
+ FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+#if CONFIG_EXT_REFS
+ unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
+#endif // CONFIG_EXT_REFS
+ int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+} GF_GROUP;
+
+typedef struct {
+ unsigned int section_intra_rating;
+ FIRSTPASS_STATS total_stats;
+ FIRSTPASS_STATS this_frame_stats;
+ const FIRSTPASS_STATS *stats_in;
+ const FIRSTPASS_STATS *stats_in_start;
+ const FIRSTPASS_STATS *stats_in_end;
+ FIRSTPASS_STATS total_left_stats;
+ int first_pass_done;
+ int64_t bits_left;
+ double modified_error_min;
+ double modified_error_max;
+ double modified_error_left;
+ double mb_av_energy;
+
+#if CONFIG_FP_MB_STATS
+ uint8_t *frame_mb_stats_buf;
+ uint8_t *this_frame_mb_stats;
+ FIRSTPASS_MB_STATS firstpass_mb_stats;
+#endif
+ // An indication of the content type of the current frame
+ FRAME_CONTENT_TYPE fr_content_type;
+
+ // Projected total bits available for a key frame group of frames
+ int64_t kf_group_bits;
+
+ // Error score of frames still to be coded in kf group
+ int64_t kf_group_error_left;
+
+ // The fraction of a kf group's total bits allocated to the inter frames
+ double kfgroup_inter_fraction;
+
+ int sr_update_lag;
+
+ int kf_zeromotion_pct;
+ int last_kfgroup_zeromotion_pct;
+ int gf_zeromotion_pct;
+ int active_worst_quality;
+ int baseline_active_worst_quality;
+ int extend_minq;
+ int extend_maxq;
+ int extend_minq_fast;
+
+ GF_GROUP gf_group;
+} TWO_PASS;
+
+struct AV1_COMP;
+
+void av1_init_first_pass(struct AV1_COMP *cpi);
+void av1_rc_get_first_pass_params(struct AV1_COMP *cpi);
+void av1_first_pass(struct AV1_COMP *cpi, const struct lookahead_entry *source);
+void av1_end_first_pass(struct AV1_COMP *cpi);
+
+void av1_init_second_pass(struct AV1_COMP *cpi);
+void av1_rc_get_second_pass_params(struct AV1_COMP *cpi);
+
+// Post encode update of the rate control parameters for 2-pass
+void av1_twopass_postencode_update(struct AV1_COMP *cpi);
+
+void av1_init_subsampling(struct AV1_COMP *cpi);
+
+void av1_calculate_coded_size(struct AV1_COMP *cpi, int *scaled_frame_width,
+ int *scaled_frame_height);
+
+#if CONFIG_EXT_REFS
+static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
+ if (arf_pending && MAX_EXT_ARFS > 0)
+ return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)
+ ? MAX_EXT_ARFS
+ : interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS
+ ? MAX_EXT_ARFS - 1
+ : 0;
+ else
+ return 0;
+}
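+
+// For example, with MIN_EXT_ARF_INTERVAL = 4 and MAX_EXT_ARFS = 2 as defined
+// above, a pending ARF yields 2 extra ARFs for intervals >= 12, 1 for
+// intervals of 8 to 11, and 0 otherwise.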
+#endif // CONFIG_EXT_REFS
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_FIRSTPASS_H_
diff --git a/third_party/aom/av1/encoder/generic_encoder.c b/third_party/aom/av1/encoder/generic_encoder.c
new file mode 100644
index 000000000..a31bb9ef6
--- /dev/null
+++ b/third_party/aom/av1/encoder/generic_encoder.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+
+#include "aom_dsp/bitwriter.h"
+#include "av1/common/generic_code.h"
+#include "av1/common/odintrin.h"
+#include "pvq_encoder.h"
+
+/** Encodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts
+ * the cdf accordingly.
+ *
+ * @param [in,out] w multi-symbol entropy encoder
+ * @param [in] val variable being encoded
+ * @param [in,out] cdf CDF of the variable (Q15)
+ * @param [in] n number of values possible
+ * @param [in,out] count number of symbols encoded with that cdf so far
+ * @param [in] rate adaptation rate shift (smaller is faster)
+ */
+void aom_encode_cdf_adapt_q15(aom_writer *w, int val, uint16_t *cdf, int n,
+ int *count, int rate) {
+ int i;
+ if (*count == 0) {
+ /* On the first call, we normalize the cdf to (32768 - n). This should
+ eventually be moved to the state init, but for now it makes it much
+ easier to experiment and convert symbols to the Q15 adaptation.*/
+ int ft;
+ ft = cdf[n - 1];
+ for (i = 0; i < n; i++) {
+ cdf[i] = AOM_ICDF(cdf[i]*32768/ft);
+ }
+ }
+ aom_write_cdf(w, val, cdf, n);
+ aom_cdf_adapt_q15(val, cdf, n, count, rate);
+}
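+
+/* Sketch of the first-call normalization above, with hypothetical counts:
+   a raw cdf of {2, 4, 6, 8} with n = 4 has ft = 8, so each entry is scaled
+   by 32768/8, giving {8192, 16384, 24576, 32768} (before the AOM_ICDF
+   mapping), i.e. a total probability mass of 32768 in Q15. */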
+
+/** Encodes a random variable using a "generic" model, assuming that the
+ * distribution is one-sided (zero and up), has a single mode, and decays
+ * exponentially past the mode.
+ *
+ * @param [in,out] w multi-symbol entropy encoder
+ * @param [in,out] model generic probability model
+ * @param [in] x variable being encoded
+ * @param [in,out] ExQ16 expectation of x (adapted)
+ * @param [in] integration integration period of ExQ16 (leaky average over
+ * 1<<integration samples)
+ */
+void generic_encode(aom_writer *w, generic_encoder *model, int x,
+ int *ex_q16, int integration) {
+ int lg_q1;
+ int shift;
+ int id;
+ uint16_t *cdf;
+ int xs;
+ lg_q1 = log_ex(*ex_q16);
+ OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG,
+ "%d %d", *ex_q16, lg_q1));
+ /* If expectation is too large, shift x to ensure that
+ all we have past xs=15 is the exponentially decaying tail
+ of the distribution */
+ shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
+ /* Choose the cdf to use: we have two per "octave" of ExQ16 */
+ id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
+ cdf = model->cdf[id];
+ xs = (x + (1 << shift >> 1)) >> shift;
+ aom_write_symbol_pvq(w, OD_MINI(15, xs), cdf, 16);
+ if (xs >= 15) {
+ int e;
+ unsigned decay;
+ /* Estimate decay based on the assumption that the distribution is close
+ to Laplacian for large values. We should probably have an adaptive
+ estimate instead. Note: The 2* is a kludge that's not fully understood
+ yet. */
+ OD_ASSERT(*ex_q16 < INT_MAX >> 1);
+ e = ((2 * *ex_q16 >> 8) + (1 << shift >> 1)) >> shift;
+ decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256)));
+ /* Encode the tail of the distribution assuming exponential decay. */
+ aom_laplace_encode_special(w, xs - 15, decay);
+ }
+ if (shift != 0) {
+ int special;
+ /* Because of the rounding, there's only half the number of possibilities
+ for xs=0. */
+ special = xs == 0;
+ if (shift - special > 0) {
+ aom_write_literal(w, x - (xs << shift) + (!special << (shift - 1)),
+ shift - special);
+ }
+ }
+ generic_model_update(ex_q16, x, integration);
+ OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG,
+ "enc: %d %d %d %d %d %x", *ex_q16, x, shift, id, xs, enc->rng));
+}
+
+/** Estimates the cost of encoding a value with generic_encode().
+ *
+ * @param [in,out] model generic probability model
+ * @param [in] x variable being encoded
+ * @param [in,out] ExQ16 expectation of x (adapted)
+ * @return number of bits (approximation)
+ */
+double generic_encode_cost(generic_encoder *model, int x, int *ex_q16) {
+ int lg_q1;
+ int shift;
+ int id;
+ uint16_t *cdf;
+ int xs;
+ int extra;
+ lg_q1 = log_ex(*ex_q16);
+ /* If expectation is too large, shift x to ensure that
+ all we have past xs=15 is the exponentially decaying tail
+ of the distribution */
+ shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
+ /* Choose the cdf to use: we have two per "octave" of ExQ16 */
+ id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
+ cdf = model->cdf[id];
+ xs = (x + (1 << shift >> 1)) >> shift;
+ extra = 0;
+ if (shift) extra = shift - (xs == 0);
+ xs = OD_MINI(15, xs);
+ /* Shortcut: assume it's going to cost 2 bits for the Laplace coder. */
+ if (xs == 15) extra += 2;
+ return
+ extra - OD_LOG2((double)(cdf[xs] - (xs == 0 ? 0 : cdf[xs - 1]))/cdf[15]);
+}
+
+/*Estimates the cost of encoding a value with a given CDF.*/
+double od_encode_cdf_cost(int val, uint16_t *cdf, int n) {
+ int total_prob;
+ int prev_prob;
+ double val_prob;
+ OD_ASSERT(n > 0);
+ total_prob = cdf[n - 1];
+ if (val == 0) {
+ prev_prob = 0;
+ } else {
+ prev_prob = cdf[val - 1];
+ }
+ val_prob = (cdf[val] - prev_prob) / (double)total_prob;
+ return -OD_LOG2(val_prob);
+}
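+
+/* For instance, with cdf = {16384, 32768} (a uniform binary symbol in Q15),
+   either value has probability 16384/32768 = 0.5, so the estimated cost is
+   exactly -log2(0.5) = 1 bit. */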
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
new file mode 100644
index 000000000..2a6204939
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "av1/encoder/global_motion.h"
+
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/corner_detect.h"
+#include "av1/encoder/corner_match.h"
+#include "av1/encoder/ransac.h"
+
+#define MAX_CORNERS 4096
+#define MIN_INLIER_PROB 0.1
+
+#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR)
+
+// Border over which to compute the global motion
+#define ERRORADV_BORDER 0
+
+#define ERRORADV_MAX_THRESH 0.995
+#define ERRORADV_COST_PRODUCT_THRESH 26000
+
+int is_enough_erroradvantage(double best_erroradvantage, int params_cost) {
+ return best_erroradvantage < ERRORADV_MAX_THRESH &&
+ best_erroradvantage * params_cost < ERRORADV_COST_PRODUCT_THRESH;
+}
+
+static void convert_to_params(const double *params, int32_t *model) {
+ int i;
+ int alpha_present = 0;
+ model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+ model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+ model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) *
+ GM_TRANS_DECODE_FACTOR;
+ model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) *
+ GM_TRANS_DECODE_FACTOR;
+
+ for (i = 2; i < 6; ++i) {
+ const int diag_value = ((i == 2 || i == 5) ? (1 << GM_ALPHA_PREC_BITS) : 0);
+ model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5);
+ model[i] =
+ (int32_t)clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX);
+ alpha_present |= (model[i] != 0);
+ model[i] = (model[i] + diag_value) * GM_ALPHA_DECODE_FACTOR;
+ }
+ for (; i < 8; ++i) {
+ model[i] = (int32_t)floor(params[i] * (1 << GM_ROW3HOMO_PREC_BITS) + 0.5);
+ model[i] = (int32_t)clamp(model[i], GM_ROW3HOMO_MIN, GM_ROW3HOMO_MAX) *
+ GM_ROW3HOMO_DECODE_FACTOR;
+ alpha_present |= (model[i] != 0);
+ }
+
+ if (!alpha_present) {
+ if (abs(model[0]) < MIN_TRANS_THRESH && abs(model[1]) < MIN_TRANS_THRESH) {
+ model[0] = 0;
+ model[1] = 0;
+ }
+ }
+}
+
+void convert_model_to_params(const double *params, WarpedMotionParams *model) {
+ convert_to_params(params, model->wmmat);
+ model->wmtype = get_gmtype(model);
+}
+
+// Adds some offset to a global motion parameter and handles
+// all of the necessary precision shifts, clamping, and
+// zero-centering.
+static int32_t add_param_offset(int param_index, int32_t param_value,
+ int32_t offset) {
+ const int scale_vals[3] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF,
+ GM_ROW3HOMO_PREC_DIFF };
+ const int clamp_vals[3] = { GM_TRANS_MAX, GM_ALPHA_MAX, GM_ROW3HOMO_MAX };
+ // type of param: 0 - translation, 1 - affine, 2 - homography
+ const int param_type = (param_index < 2 ? 0 : (param_index < 6 ? 1 : 2));
+ const int is_one_centered = (param_index == 2 || param_index == 5);
+
+ // Make the parameter zero-centered and undo the precision shift that was
+ // applied to make it compatible with the warped model
+ param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >>
+ scale_vals[param_type];
+ // Add desired offset to the rescaled/zero-centered parameter
+ param_value += offset;
+ // Clamp the parameter so it does not overflow the number of bits allotted
+ // to it in the bitstream
+ param_value = (int32_t)clamp(param_value, -clamp_vals[param_type],
+ clamp_vals[param_type]);
+ // Rescale the parameter to WARPEDMODEL_PREC_BITS so it is compatible
+ // with the warped motion library
+ param_value *= (1 << scale_vals[param_type]);
+
+ // Undo the zero-centering step if necessary
+ return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS);
+}
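+
+// For example, the diagonal affine parameters (indices 2 and 5) are stored
+// one-centered: a stored value of (1 << WARPEDMODEL_PREC_BITS), i.e. an
+// identity scale, is zero-centered to 0 above, the offset is applied and
+// clamped at the reduced precision, and the result is scaled back up and
+// re-centered.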
+
+static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) {
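+ // NOTE: The cases below intentionally fall through: forcing a simpler model
+ // type also forces the parameters specific to the more complex types to
+ // their identity values.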
+ switch (wmtype) {
+ case IDENTITY: wm->wmmat[0] = 0; wm->wmmat[1] = 0;
+ case TRANSLATION:
+ wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS;
+ wm->wmmat[3] = 0;
+ case ROTZOOM: wm->wmmat[4] = -wm->wmmat[3]; wm->wmmat[5] = wm->wmmat[2];
+ case AFFINE: wm->wmmat[6] = wm->wmmat[7] = 0; break;
+ case HORTRAPEZOID: wm->wmmat[6] = wm->wmmat[4] = 0; break;
+ case VERTRAPEZOID: wm->wmmat[7] = wm->wmmat[3] = 0; break;
+ case HOMOGRAPHY: break;
+ default: assert(0);
+ }
+ wm->wmtype = wmtype;
+}
+
+double refine_integerized_param(WarpedMotionParams *wm,
+ TransformationType wmtype,
+#if CONFIG_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_HIGHBITDEPTH
+ uint8_t *ref, int r_width, int r_height,
+ int r_stride, uint8_t *dst, int d_width,
+ int d_height, int d_stride, int n_refinements) {
+ static const int max_trans_model_params[TRANS_TYPES] = {
+ 0, 2, 4, 6, 8, 8, 8
+ };
+ const int border = ERRORADV_BORDER;
+ int i = 0, p;
+ int n_params = max_trans_model_params[wmtype];
+ int32_t *param_mat = wm->wmmat;
+ double step_error;
+ int32_t step;
+ int32_t *param;
+ int32_t curr_param;
+ int32_t best_param;
+ double best_error;
+
+ force_wmtype(wm, wmtype);
+ best_error = av1_warp_erroradv(wm,
+#if CONFIG_HIGHBITDEPTH
+ use_hbd, bd,
+#endif // CONFIG_HIGHBITDEPTH
+ ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, border,
+ border, d_width - 2 * border,
+ d_height - 2 * border, d_stride, 0, 0, 16, 16);
+ step = 1 << (n_refinements + 1);
+ for (i = 0; i < n_refinements; i++, step >>= 1) {
+ for (p = 0; p < n_params; ++p) {
+ int step_dir = 0;
+ // Skip searches for parameters that are forced to be 0
+ if (wmtype == HORTRAPEZOID && (p == 4 || p == 6)) continue;
+ if (wmtype == VERTRAPEZOID && (p == 3 || p == 7)) continue;
+ param = param_mat + p;
+ curr_param = *param;
+ best_param = curr_param;
+ // look to the left
+ *param = add_param_offset(p, curr_param, -step);
+ step_error = av1_warp_erroradv(
+ wm,
+#if CONFIG_HIGHBITDEPTH
+ use_hbd, bd,
+#endif // CONFIG_HIGHBITDEPTH
+ ref, r_width, r_height, r_stride, dst + border * d_stride + border,
+ border, border, d_width - 2 * border, d_height - 2 * border, d_stride,
+ 0, 0, 16, 16);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = -1;
+ }
+
+ // look to the right
+ *param = add_param_offset(p, curr_param, step);
+ step_error = av1_warp_erroradv(
+ wm,
+#if CONFIG_HIGHBITDEPTH
+ use_hbd, bd,
+#endif // CONFIG_HIGHBITDEPTH
+ ref, r_width, r_height, r_stride, dst + border * d_stride + border,
+ border, border, d_width - 2 * border, d_height - 2 * border, d_stride,
+ 0, 0, 16, 16);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = 1;
+ }
+ *param = best_param;
+
+ // keep stepping in the direction chosen above until the error increases
+ // for the current step size
+ while (step_dir) {
+ *param = add_param_offset(p, best_param, step * step_dir);
+ step_error = av1_warp_erroradv(
+ wm,
+#if CONFIG_HIGHBITDEPTH
+ use_hbd, bd,
+#endif // CONFIG_HIGHBITDEPTH
+ ref, r_width, r_height, r_stride, dst + border * d_stride + border,
+ border, border, d_width - 2 * border, d_height - 2 * border,
+ d_stride, 0, 0, 16, 16);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ } else {
+ *param = best_param;
+ step_dir = 0;
+ }
+ }
+ }
+ }
+ force_wmtype(wm, wmtype);
+ wm->wmtype = get_gmtype(wm);
+ return best_error;
+}
+
+static INLINE RansacFunc get_ransac_type(TransformationType type) {
+ switch (type) {
+ case HOMOGRAPHY: return ransac_homography;
+ case HORTRAPEZOID: return ransac_hortrapezoid;
+ case VERTRAPEZOID: return ransac_vertrapezoid;
+ case AFFINE: return ransac_affine;
+ case ROTZOOM: return ransac_rotzoom;
+ case TRANSLATION: return ransac_translation;
+ default: assert(0); return NULL;
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm,
+ int bit_depth) {
+ int i, j;
+ uint16_t *orig_buf = CONVERT_TO_SHORTPTR(frm->y_buffer);
+ uint8_t *buf = malloc(frm->y_height * frm->y_stride * sizeof(*buf));
+
+ for (i = 0; i < frm->y_height; ++i)
+ for (j = 0; j < frm->y_width; ++j)
+ buf[i * frm->y_stride + j] =
+ orig_buf[i * frm->y_stride + j] >> (bit_depth - 8);
+
+ return buf;
+}
+#endif
+
+int compute_global_motion_feature_based(
+ TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
+#if CONFIG_HIGHBITDEPTH
+ int bit_depth,
+#endif
+ int *num_inliers_by_motion, double *params_by_motion, int num_motions) {
+ int i;
+ int num_frm_corners, num_ref_corners;
+ int num_correspondences;
+ int *correspondences;
+ int frm_corners[2 * MAX_CORNERS], ref_corners[2 * MAX_CORNERS];
+ unsigned char *frm_buffer = frm->y_buffer;
+ unsigned char *ref_buffer = ref->y_buffer;
+ RansacFunc ransac = get_ransac_type(type);
+
+#if CONFIG_HIGHBITDEPTH
+ if (frm->flags & YV12_FLAG_HIGHBITDEPTH) {
+ // The frame buffer is 16-bit, so we need to convert to 8 bits for the
+ // following code. We cache the result until the frame is released.
+ if (frm->y_buffer_8bit)
+ frm_buffer = frm->y_buffer_8bit;
+ else
+ frm_buffer = frm->y_buffer_8bit = downconvert_frame(frm, bit_depth);
+ }
+ if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (ref->y_buffer_8bit)
+ ref_buffer = ref->y_buffer_8bit;
+ else
+ ref_buffer = ref->y_buffer_8bit = downconvert_frame(ref, bit_depth);
+ }
+#endif
+
+ // compute interest points in images using FAST features
+ num_frm_corners = fast_corner_detect(frm_buffer, frm->y_width, frm->y_height,
+ frm->y_stride, frm_corners, MAX_CORNERS);
+ num_ref_corners = fast_corner_detect(ref_buffer, ref->y_width, ref->y_height,
+ ref->y_stride, ref_corners, MAX_CORNERS);
+
+ // find correspondences between the two images
+ correspondences =
+ (int *)malloc(num_frm_corners * 4 * sizeof(*correspondences));
+ num_correspondences = determine_correspondence(
+ frm_buffer, (int *)frm_corners, num_frm_corners, ref_buffer,
+ (int *)ref_corners, num_ref_corners, frm->y_width, frm->y_height,
+ frm->y_stride, ref->y_stride, correspondences);
+
+ ransac(correspondences, num_correspondences, num_inliers_by_motion,
+ params_by_motion, num_motions);
+
+ free(correspondences);
+
+ // Set num_inliers = 0 for motions with too few inliers so they are ignored.
+ for (i = 0; i < num_motions; ++i) {
+ if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) {
+ num_inliers_by_motion[i] = 0;
+ }
+ }
+
+ // Return true if any one of the motions has inliers.
+ for (i = 0; i < num_motions; ++i) {
+ if (num_inliers_by_motion[i] > 0) return 1;
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
new file mode 100644
index 000000000..8fc757f38
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_GLOBAL_MOTION_H_
+#define AV1_ENCODER_GLOBAL_MOTION_H_
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RANSAC_NUM_MOTIONS 1
+
+void convert_model_to_params(const double *params, WarpedMotionParams *model);
+
+int is_enough_erroradvantage(double erroradv, int params_cost);
+
+double refine_integerized_param(WarpedMotionParams *wm,
+ TransformationType wmtype,
+#if CONFIG_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_HIGHBITDEPTH
+ uint8_t *ref, int r_width, int r_height,
+ int r_stride, uint8_t *dst, int d_width,
+ int d_height, int d_stride, int n_refinements);
+
+/*
+ Computes "num_motions" candidate global motion parameters between two frames.
+ The array "params_by_motion" should be length 8 * "num_motions". The ordering
+ of each set of parameters is best described by the homography:
+
+      [x'    (m2 m3 m0   [x
+  z . y' =    m4 m5 m1 *  y
+      1]      m6 m7 1)    1]
+
+ where m{i} represents the ith value in any given set of parameters.
+
+ "num_inliers" should be length "num_motions", and will be populated with the
+ number of inlier feature points for each motion. Params for which the
+ num_inliers entry is 0 should be ignored by the caller.
+*/
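+/*
+  For example (following the ordering above), a pure translation by (tx, ty)
+  corresponds to params = { tx, ty, 1, 0, 0, 1, 0, 0 }, since
+  x' = m2 * x + m3 * y + m0 and y' = m4 * x + m5 * y + m1, with z = 1 when
+  m6 = m7 = 0.
+*/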
+int compute_global_motion_feature_based(
+ TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
+#if CONFIG_HIGHBITDEPTH
+ int bit_depth,
+#endif
+ int *num_inliers_by_motion, double *params_by_motion, int num_motions);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
new file mode 100644
index 000000000..4fd563163
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "av1/common/idct.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+#if CONFIG_CB4X4
+static void fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type, int lossless) {
+ tran_high_t a1 = src_diff[0];
+ tran_high_t b1 = src_diff[1];
+ tran_high_t c1 = src_diff[diff_stride];
+ tran_high_t d1 = src_diff[1 + diff_stride];
+
+ tran_high_t a2 = a1 + c1;
+ tran_high_t b2 = b1 + d1;
+ tran_high_t c2 = a1 - c1;
+ tran_high_t d2 = b1 - d1;
+
+ a1 = a2 + b2;
+ b1 = a2 - b2;
+ c1 = c2 + d2;
+ d1 = c2 - d2;
+
+ coeff[0] = (tran_low_t)(4 * a1);
+ coeff[1] = (tran_low_t)(4 * b1);
+ coeff[2] = (tran_low_t)(4 * c1);
+ coeff[3] = (tran_low_t)(4 * d1);
+
+ (void)tx_type;
+ (void)lossless;
+}
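+
+/* Sanity check for the 2x2 transform above: a constant residual block
+   {1, 1; 1, 1} gives a2 = b2 = 2 and c2 = d2 = 0, hence a1 = 4, so
+   coeff = {16, 0, 0, 0} and all of the energy lands in the DC coefficient. */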
+#endif
+
+static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type, int lossless) {
+ if (lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_fwht4x4(src_diff, coeff, diff_stride);
+ return;
+ }
+
+ av1_fht4x4(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht4x8(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht8x4(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht8x16(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht16x8(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht16x32(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht32x16(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht8x8(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht16x16(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
+}
+
+#if CONFIG_TX64X64
+static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+#if CONFIG_EXT_TX
+ if (tx_type == IDTX)
+ av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type);
+ else
+#endif
+ av1_fht64x64(src_diff, coeff, diff_stride, tx_type);
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_HIGHBITDEPTH
+#if CONFIG_CB4X4
+static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type, int lossless,
+ const int bd) {
+ tran_high_t a1 = src_diff[0];
+ tran_high_t b1 = src_diff[1];
+ tran_high_t c1 = src_diff[diff_stride];
+ tran_high_t d1 = src_diff[1 + diff_stride];
+
+ tran_high_t a2 = a1 + c1;
+ tran_high_t b2 = b1 + d1;
+ tran_high_t c2 = a1 - c1;
+ tran_high_t d2 = b1 - d1;
+
+ a1 = a2 + b2;
+ b1 = a2 - b2;
+ c1 = c2 + d2;
+ d1 = c2 - d2;
+
+ coeff[0] = (tran_low_t)(4 * a1);
+ coeff[1] = (tran_low_t)(4 * b1);
+ coeff[2] = (tran_low_t)(4 * c1);
+ coeff[3] = (tran_low_t)(4 * d1);
+
+ (void)tx_type;
+ (void)lossless;
+ (void)bd;
+}
+#endif
+
+static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type, int lossless,
+ const int bd) {
+ if (lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_highbd_fwht4x4(src_diff, coeff, diff_stride);
+ return;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ av1_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ av1_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ av1_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+ break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht4x8(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht8x4(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht8x16(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht16x8(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht16x32(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht32x16(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ av1_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ av1_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST exists only in C
+ av1_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
+ break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ av1_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ av1_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST exists only in C
+ av1_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
+ break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_fwd_txfm2d_32x32(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ av1_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+ break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+#if CONFIG_TX64X64
+static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_highbd_fht64x64(src_diff, coeff, diff_stride, tx_type);
+ break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ av1_highbd_fht64x64(src_diff, coeff, diff_stride, tx_type);
+ break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_HIGHBITDEPTH
+
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+ FWD_TXFM_PARAM *fwd_txfm_param) {
+ const int fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
+ const TX_TYPE tx_type = fwd_txfm_param->tx_type;
+ const TX_SIZE tx_size = fwd_txfm_param->tx_size;
+ const int lossless = fwd_txfm_param->lossless;
+ switch (tx_size) {
+#if CONFIG_TX64X64
+ case TX_64X64:
+ fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+#endif // CONFIG_TX64X64
+ case TX_32X32:
+ fwd_txfm_32x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_16X16:
+ fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_8X8:
+ fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_4X8:
+ fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_8X4:
+ fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_8X16:
+ fwd_txfm_8x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_16X8:
+ fwd_txfm_16x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_16X32:
+ fwd_txfm_16x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_32X16:
+ fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_4X4:
+ fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless);
+ break;
+#if CONFIG_CB4X4
+ case TX_2X2:
+ fwd_txfm_2x2(src_diff, coeff, diff_stride, tx_type, lossless);
+ break;
+#endif
+ default: assert(0); break;
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param) {
+ const int fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
+ const TX_TYPE tx_type = fwd_txfm_param->tx_type;
+ const TX_SIZE tx_size = fwd_txfm_param->tx_size;
+ const int lossless = fwd_txfm_param->lossless;
+ const int bd = fwd_txfm_param->bd;
+ switch (tx_size) {
+#if CONFIG_TX64X64
+ case TX_64X64:
+ highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+#endif // CONFIG_TX64X64
+ case TX_32X32:
+ highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_16X16:
+ highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_8X8:
+ highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_4X8:
+ highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_8X4:
+ highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_8X16:
+ highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_16X8:
+ highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_16X32:
+ highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_32X16:
+ highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_4X4:
+ highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless, bd);
+ break;
+#if CONFIG_CB4X4
+ case TX_2X2:
+ highbd_fwd_txfm_2x2(src_diff, coeff, diff_stride, tx_type, lossless, bd);
+ break;
+#endif
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
new file mode 100644
index 000000000..e6fd17275
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#define AV1_ENCODER_HYBRID_FWD_TXFM_H_
+
+#include "./aom_config.h"
+
+typedef enum FWD_TXFM_OPT { FWD_TXFM_OPT_NORMAL } FWD_TXFM_OPT;
+
+typedef struct FWD_TXFM_PARAM {
+ TX_TYPE tx_type;
+ TX_SIZE tx_size;
+ int lossless;
+#if CONFIG_HIGHBITDEPTH
+ int bd;
+#endif // CONFIG_HIGHBITDEPTH
+} FWD_TXFM_PARAM;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+ FWD_TXFM_PARAM *fwd_txfm_param);
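+
+/* A minimal usage sketch (illustrative values, not code from this tree):
+ * callers fill a FWD_TXFM_PARAM and dispatch through av1_fwd_txfm; bd is
+ * only present (and needed) on the CONFIG_HIGHBITDEPTH path.
+ *
+ *   FWD_TXFM_PARAM param;
+ *   param.tx_type = DCT_DCT;
+ *   param.tx_size = TX_8X8;
+ *   param.lossless = 0;
+ *   av1_fwd_txfm(src_diff, coeff, diff_stride, &param);
+ */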
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param);
+#endif // CONFIG_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/third_party/aom/av1/encoder/laplace_encoder.c b/third_party/aom/av1/encoder/laplace_encoder.c
new file mode 100644
index 000000000..54ffc88fb
--- /dev/null
+++ b/third_party/aom/av1/encoder/laplace_encoder.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+
+#include "aom_dsp/bitwriter.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/pvq.h"
+#include "pvq_encoder.h"
+
+static void aom_encode_pvq_split(aom_writer *w, od_pvq_codeword_ctx *adapt,
+ int count, int sum, int ctx) {
+ int shift;
+ int rest;
+ int fctx;
+ if (sum == 0) return;
+ shift = OD_MAXI(0, OD_ILOG(sum) - 3);
+ if (shift) {
+ rest = count & ((1 << shift) - 1);
+ count >>= shift;
+ sum >>= shift;
+ }
+ fctx = 7*ctx + sum - 1;
+ aom_write_symbol_pvq(w, count, adapt->pvq_split_cdf[fctx], sum + 1);
+ if (shift) aom_write_literal(w, rest, shift);
+}
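+
+/* Worked example for the split coder above (values assumed for
+ * illustration): with sum = 100 and count = 37, OD_ILOG(100) = 7 gives
+ * shift = 4, so the coder writes the coarse symbol 37 >> 4 = 2 out of
+ * (100 >> 4) + 1 = 7 possibilities, then the 4 masked-off bits
+ * 37 & 15 = 5 as a raw literal. */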
+
+void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt,
+ const int *y, int n, int k, int level) {
+ int mid;
+ int i;
+ int count_right;
+ if (n <= 1 || k == 0) return;
+ if (k == 1 && n <= 16) {
+ int cdf_id;
+ int pos;
+ cdf_id = od_pvq_k1_ctx(n, level == 0);
+ for (pos = 0; !y[pos]; pos++);
+ OD_ASSERT(pos < n);
+ aom_write_symbol_pvq(w, pos, adapt->pvq_k1_cdf[cdf_id], n);
+ }
+ else {
+ mid = n >> 1;
+ count_right = k;
+ for (i = 0; i < mid; i++) count_right -= abs(y[i]);
+ aom_encode_pvq_split(w, adapt, count_right, k, od_pvq_size_ctx(n));
+ aom_encode_band_pvq_splits(w, adapt, y, mid, k - count_right, level + 1);
+ aom_encode_band_pvq_splits(w, adapt, y + mid, n - mid, count_right,
+ level + 1);
+ }
+}
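+
+/* For intuition (an assumed example, not from the source): with
+ * y = {0, 2, -1, 1} (n = 4, k = 4), the first call splits at mid = 2 and
+ * count_right = 4 - (|0| + |2|) = 2, so it codes "2 of the 4 pulses lie in
+ * the right half" and then recurses on {0, 2} with k = 2 and on {-1, 1}
+ * with k = 2. */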
+
+/** Encodes the tail of a Laplace-distributed variable, i.e. it doesn't
+ * do anything special for the zero case.
+ *
+ * @param [in,out] w range encoder
+ * @param [in] x variable to encode (has to be positive)
+ * @param [in] decay decay factor of the distribution in Q8 format,
+ * i.e. pdf ~= decay^x
+ */
+void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay) {
+ int shift;
+ int xs;
+ int sym;
+ const uint16_t *cdf;
+ shift = 0;
+ /* We don't want a large decay value because that would require too many
+ symbols. */
+ while (decay > 235) {
+ decay = (decay*decay + 128) >> 8;
+ shift++;
+ }
+ decay = OD_MINI(decay, 254);
+ decay = OD_MAXI(decay, 2);
+ xs = x >> shift;
+ cdf = EXP_CDF_TABLE[(decay + 1) >> 1];
+ OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d", decay));
+ do {
+ sym = OD_MINI(xs, 15);
+ {
+ int i;
+ OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d %d\n", x, xs, shift,
+ sym));
+ for (i = 0; i < 16; i++) {
+ OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i]));
+ }
+ OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "\n"));
+ }
+ aom_write_cdf(w, sym, cdf, 16);
+ xs -= 15;
+ } while (sym >= 15);
+ if (shift) aom_write_literal(w, x & ((1 << shift) - 1), shift);
+}
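+
+/* Worked example for the escape loop above (decay assumed <= 235 so that
+ * shift stays 0): encoding x = 40 emits the escape symbol 15 (xs becomes
+ * 25), emits 15 again (xs becomes 10), then emits the terminating symbol
+ * 10 -- two escapes followed by the remainder. */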
diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c
new file mode 100644
index 000000000..591ca6152
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "./aom_config.h"
+
+#include "av1/common/common.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/lookahead.h"
+
+/* Return the buffer at the given absolute index and increment the index */
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) {
+ int index = *idx;
+ struct lookahead_entry *buf = ctx->buf + index;
+
+ assert(index < ctx->max_sz);
+ if (++index >= ctx->max_sz) index -= ctx->max_sz;
+ *idx = index;
+ return buf;
+}
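+
+/* E.g. with ctx->max_sz = 5 and *idx = 4, pop() returns &ctx->buf[4] and
+ * wraps the index back to 0, so the buffer array behaves as a ring. */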
+
+void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
+ if (ctx) {
+ if (ctx->buf) {
+ int i;
+
+ for (i = 0; i < ctx->max_sz; i++) aom_free_frame_buffer(&ctx->buf[i].img);
+ free(ctx->buf);
+ }
+ free(ctx);
+ }
+}
+
+struct lookahead_ctx *av1_lookahead_init(unsigned int width,
+ unsigned int height,
+ unsigned int subsampling_x,
+ unsigned int subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ unsigned int depth) {
+ struct lookahead_ctx *ctx = NULL;
+
+ // Clamp the lookahead queue depth
+ depth = clamp(depth, 1, MAX_LAG_BUFFERS);
+
+ // Allocate memory to keep previous source frames available.
+ depth += MAX_PRE_FRAMES;
+
+ // Allocate the lookahead structures
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx) {
+ const int legacy_byte_alignment = 0;
+ unsigned int i;
+ ctx->max_sz = depth;
+ ctx->buf = calloc(depth, sizeof(*ctx->buf));
+ if (!ctx->buf) goto bail;
+ for (i = 0; i < depth; i++)
+ if (aom_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x,
+ subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, legacy_byte_alignment))
+ goto bail;
+ }
+ return ctx;
+bail:
+ av1_lookahead_destroy(ctx);
+ return NULL;
+}
+
+#define USE_PARTIAL_COPY 0
+
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end,
+#if CONFIG_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ aom_enc_frame_flags_t flags) {
+ struct lookahead_entry *buf;
+#if USE_PARTIAL_COPY
+ int row, col, active_end;
+ int mb_rows = (src->y_height + 15) >> 4;
+ int mb_cols = (src->y_width + 15) >> 4;
+#endif
+ int width = src->y_crop_width;
+ int height = src->y_crop_height;
+ int uv_width = src->uv_crop_width;
+ int uv_height = src->uv_crop_height;
+ int subsampling_x = src->subsampling_x;
+ int subsampling_y = src->subsampling_y;
+ int larger_dimensions, new_dimensions;
+
+ if (ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz) return 1;
+ ctx->sz++;
+ buf = pop(ctx, &ctx->write_idx);
+
+ new_dimensions = width != buf->img.y_crop_width ||
+ height != buf->img.y_crop_height ||
+ uv_width != buf->img.uv_crop_width ||
+ uv_height != buf->img.uv_crop_height;
+ larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
+ uv_width > buf->img.uv_width ||
+ uv_height > buf->img.uv_height;
+ assert(!larger_dimensions || new_dimensions);
+
+#if USE_PARTIAL_COPY
+ // TODO(jkoleszar): This is disabled for now, as
+ // av1_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
+
+ // Only do this partial copy if the following conditions are all met:
+ // 1. Lookahead queue has a size of 1.
+ // 2. Active map is provided.
+ // 3. This is not a key frame, golden nor altref frame.
+ if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
+ for (row = 0; row < mb_rows; ++row) {
+ col = 0;
+
+ while (1) {
+ // Find the first active macroblock in this row.
+ for (; col < mb_cols; ++col) {
+ if (active_map[col]) break;
+ }
+
+ // No more active macroblocks in this row.
+ if (col == mb_cols) break;
+
+ // Find the end of active region in this row.
+ active_end = col;
+
+ for (; active_end < mb_cols; ++active_end) {
+ if (!active_map[active_end]) break;
+ }
+
+ // Only copy this active region.
+ av1_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4,
+ 16, (active_end - col) << 4);
+
+ // Start again from the end of this active region.
+ col = active_end;
+ }
+
+ active_map += mb_cols;
+ }
+ } else {
+#endif
+ if (larger_dimensions) {
+ YV12_BUFFER_CONFIG new_img;
+ memset(&new_img, 0, sizeof(new_img));
+ if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+ subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, 0))
+ return 1;
+ aom_free_frame_buffer(&buf->img);
+ buf->img = new_img;
+ } else if (new_dimensions) {
+ buf->img.y_crop_width = src->y_crop_width;
+ buf->img.y_crop_height = src->y_crop_height;
+ buf->img.uv_crop_width = src->uv_crop_width;
+ buf->img.uv_crop_height = src->uv_crop_height;
+ buf->img.subsampling_x = src->subsampling_x;
+ buf->img.subsampling_y = src->subsampling_y;
+ }
+ // Partial copy not implemented yet
+ av1_copy_and_extend_frame(src, &buf->img);
+#if USE_PARTIAL_COPY
+ }
+#endif
+
+ buf->ts_start = ts_start;
+ buf->ts_end = ts_end;
+ buf->flags = flags;
+ return 0;
+}
+
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx,
+ int drain) {
+ struct lookahead_entry *buf = NULL;
+
+ if (ctx && ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
+ buf = pop(ctx, &ctx->read_idx);
+ ctx->sz--;
+ }
+ return buf;
+}
+
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx,
+ int index) {
+ struct lookahead_entry *buf = NULL;
+
+ if (index >= 0) {
+ // Forward peek
+ if (index < ctx->sz) {
+ index += ctx->read_idx;
+ if (index >= ctx->max_sz) index -= ctx->max_sz;
+ buf = ctx->buf + index;
+ }
+ } else if (index < 0) {
+ // Backward peek
+ if (-index <= MAX_PRE_FRAMES) {
+ index += (int)(ctx->read_idx);
+ if (index < 0) index += (int)(ctx->max_sz);
+ buf = ctx->buf + index;
+ }
+ }
+
+ return buf;
+}
+
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx) { return ctx->sz; }
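+
+/* Typical push/pop flow (an illustrative sketch, not code from this tree;
+ * the CONFIG_HIGHBITDEPTH argument is omitted and encode_frame() is a
+ * hypothetical consumer):
+ *
+ *   struct lookahead_ctx *la = av1_lookahead_init(w, h, ss_x, ss_y, lag);
+ *   if (av1_lookahead_push(la, &src, ts_start, ts_end, flags)) {
+ *     // Queue is full; it must be drained before pushing again.
+ *   }
+ *   struct lookahead_entry *e = av1_lookahead_pop(la, flush);
+ *   if (e) encode_frame(&e->img, e->ts_start, e->ts_end);
+ *   av1_lookahead_destroy(la);
+ */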
diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h
new file mode 100644
index 000000000..19f75d7e4
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_LOOKAHEAD_H_
+#define AV1_ENCODER_LOOKAHEAD_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LAG_BUFFERS 25
+
+struct lookahead_entry {
+ YV12_BUFFER_CONFIG img;
+ int64_t ts_start;
+ int64_t ts_end;
+ aom_enc_frame_flags_t flags;
+};
+
+// The maximum number of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
+struct lookahead_ctx {
+ int max_sz; /* Absolute size of the queue */
+ int sz; /* Number of buffers currently in the queue */
+ int read_idx; /* Read index */
+ int write_idx; /* Write index */
+ struct lookahead_entry *buf; /* Buffer list */
+};
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ */
+struct lookahead_ctx *av1_lookahead_init(unsigned int width,
+ unsigned int height,
+ unsigned int subsampling_x,
+ unsigned int subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ unsigned int depth);
+
+/**\brief Destroys the lookahead stage
+ */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx);
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * An active-map based partial copy exists behind USE_PARTIAL_COPY but is
+ * currently disabled, so the whole frame is always copied.
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] src Pointer to the image to enqueue
+ * \param[in] ts_start Timestamp for the start of this frame
+ * \param[in] ts_end Timestamp for the end of this frame
+ * \param[in] flags Flags set on this frame
+ *
+ * \return 0 on success, 1 if the queue is full or allocation fails
+ */
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end,
+#if CONFIG_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ aom_enc_frame_flags_t flags);
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] drain Flag indicating the buffer should be drained
+ * (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain);
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] index Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx,
+ int index);
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_LOOKAHEAD_H_
diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c
new file mode 100644
index 000000000..1296027dc
--- /dev/null
+++ b/third_party/aom/av1/encoder/mbgraph.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/common/blockd.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
+ int mb_row, int mb_col) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ const aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ MV ref_full;
+ int cost_list[5];
+
+ // Further step/diamond searches as necessary
+ int step_param = mv_sf->reduce_first_step_size;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ av1_set_mv_search_range(&x->mv_limits, ref_mv);
+
+ ref_full.col = ref_mv->col >> 3;
+ ref_full.row = ref_mv->row >> 3;
+
+ /*cpi->sf.search_method == HEX*/
+ av1_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
+ cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv);
+
+ // Try sub-pixel MC
+ // if (bestsme > error_thresh && bestsme < INT_MAX)
+ {
+ int distortion;
+ unsigned int sse;
+ cpi->find_fractional_mv_step(x, ref_mv, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &v_fn_ptr, 0,
+ mv_sf->subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), NULL, NULL,
+ &distortion, &sse, NULL, 0, 0, 0);
+ }
+
+#if CONFIG_EXT_INTER
+ if (has_second_ref(&xd->mi[0]->mbmi))
+ xd->mi[0]->mbmi.mode = NEW_NEWMV;
+ else
+#endif // CONFIG_EXT_INTER
+ xd->mi[0]->mbmi.mode = NEWMV;
+
+ xd->mi[0]->mbmi.mv[0] = x->best_mv;
+#if CONFIG_EXT_INTER
+ xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME;
+#endif // CONFIG_EXT_INTER
+
+ av1_build_inter_predictors_sby(xd, mb_row, mb_col, NULL, BLOCK_16X16);
+
+ /* restore UMV window */
+ x->mv_limits = tmp_mv_limits;
+
+ return aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+}
+
+static int do_16x16_motion_search(AV1_COMP *cpi, const MV *ref_mv, int mb_row,
+ int mb_col) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ unsigned int err, tmp_err;
+ MV best_mv;
+
+ // Try zero MV first
+ // FIXME should really use something like near/nearest MV and/or MV prediction
+ err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+ best_mv.col = best_mv.row = 0;
+
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search
+ tmp_err = do_16x16_motion_iteration(cpi, ref_mv, mb_row, mb_col);
+ if (tmp_err < err) {
+ err = tmp_err;
+ best_mv = x->best_mv.as_mv;
+ }
+
+ // If the current best reference mv is not centered on 0,0 then do a 0,0
+ // based search as well.
+ if (ref_mv->row != 0 || ref_mv->col != 0) {
+ MV zero_ref_mv = { 0, 0 };
+
+ tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, mb_row, mb_col);
+ if (tmp_err < err) {
+ err = tmp_err;
+ best_mv = x->best_mv.as_mv;
+ }
+ }
+
+ x->best_mv.as_mv = best_mv;
+ return err;
+}
+
+static int do_16x16_zerozero_search(AV1_COMP *cpi, int_mv *dst_mv) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ unsigned int err;
+
+ // Try zero MV first
+ // FIXME should really use something like near/nearest MV and/or MV prediction
+ err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+
+ dst_mv->as_int = 0;
+
+ return err;
+}
+
+static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ PREDICTION_MODE best_mode = -1, mode;
+ unsigned int best_err = INT_MAX;
+
+ // calculate the SAD for each intra prediction mode;
+ // we're intentionally not doing 4x4, we just want a rough estimate
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ unsigned int err;
+
+ xd->mi[0]->mbmi.mode = mode;
+ av1_predict_intra_block(xd, 16, 16, BLOCK_16X16, mode, x->plane[0].src.buf,
+ x->plane[0].src.stride, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, 0, 0, 0);
+ err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+
+ // find best
+ if (err < best_err) {
+ best_err = err;
+ best_mode = mode;
+ }
+ }
+
+ if (pbest_mode) *pbest_mode = best_mode;
+
+ return best_err;
+}
+
+static void update_mbgraph_mb_stats(AV1_COMP *cpi, MBGRAPH_MB_STATS *stats,
+ YV12_BUFFER_CONFIG *buf, int mb_y_offset,
+ YV12_BUFFER_CONFIG *golden_ref,
+ const MV *prev_golden_ref_mv,
+ YV12_BUFFER_CONFIG *alt_ref, int mb_row,
+ int mb_col) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int intra_error;
+ AV1_COMMON *cm = &cpi->common;
+
+ // FIXME in practice we're completely ignoring chroma here
+ x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
+ x->plane[0].src.stride = buf->y_stride;
+
+ xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset;
+ xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
+
+ // do intra 16x16 prediction
+ intra_error = find_best_16x16_intra(cpi, &stats->ref[INTRA_FRAME].m.mode);
+ if (intra_error <= 0) intra_error = 1;
+ stats->ref[INTRA_FRAME].err = intra_error;
+
+ // Golden frame MV search, if it exists and is different than last frame
+ if (golden_ref) {
+ int g_motion_error;
+ xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
+ xd->plane[0].pre[0].stride = golden_ref->y_stride;
+ g_motion_error =
+ do_16x16_motion_search(cpi, prev_golden_ref_mv, mb_row, mb_col);
+ stats->ref[GOLDEN_FRAME].m.mv = x->best_mv;
+ stats->ref[GOLDEN_FRAME].err = g_motion_error;
+ } else {
+ stats->ref[GOLDEN_FRAME].err = INT_MAX;
+ stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
+ }
+
+ // Do an Alt-ref frame MV search, if it exists and is different than
+ // last/golden frame.
+ if (alt_ref) {
+ int a_motion_error;
+ xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
+ xd->plane[0].pre[0].stride = alt_ref->y_stride;
+ a_motion_error =
+ do_16x16_zerozero_search(cpi, &stats->ref[ALTREF_FRAME].m.mv);
+
+ stats->ref[ALTREF_FRAME].err = a_motion_error;
+ } else {
+ stats->ref[ALTREF_FRAME].err = INT_MAX;
+ stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
+ }
+}
+
+static void update_mbgraph_frame_stats(AV1_COMP *cpi,
+ MBGRAPH_FRAME_STATS *stats,
+ YV12_BUFFER_CONFIG *buf,
+ YV12_BUFFER_CONFIG *golden_ref,
+ YV12_BUFFER_CONFIG *alt_ref) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ AV1_COMMON *const cm = &cpi->common;
+
+ int mb_col, mb_row, offset = 0;
+ int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
+ MV gld_top_mv = { 0, 0 };
+ MODE_INFO mi_local;
+
+ av1_zero(mi_local);
+ // Set up limit values for motion vectors to prevent them extending outside
+ // the UMV borders.
+ x->mv_limits.row_min = -BORDER_MV_PIXELS_B16;
+ x->mv_limits.row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
+ xd->up_available = 0;
+ xd->plane[0].dst.stride = buf->y_stride;
+ xd->plane[0].pre[0].stride = buf->y_stride;
+ xd->plane[1].dst.stride = buf->uv_stride;
+ xd->mi[0] = &mi_local;
+ mi_local.mbmi.sb_type = BLOCK_16X16;
+ mi_local.mbmi.ref_frame[0] = LAST_FRAME;
+ mi_local.mbmi.ref_frame[1] = NONE_FRAME;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ MV gld_left_mv = gld_top_mv;
+ int mb_y_in_offset = mb_y_offset;
+ int arf_y_in_offset = arf_y_offset;
+ int gld_y_in_offset = gld_y_offset;
+
+ // Set up limit values for motion vectors to prevent them extending outside
+ // the UMV borders.
+ x->mv_limits.col_min = -BORDER_MV_PIXELS_B16;
+ x->mv_limits.col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
+ xd->left_available = 0;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
+
+ update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, golden_ref,
+ &gld_left_mv, alt_ref, mb_row, mb_col);
+ gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv;
+ if (mb_col == 0) {
+ gld_top_mv = gld_left_mv;
+ }
+ xd->left_available = 1;
+ mb_y_in_offset += 16;
+ gld_y_in_offset += 16;
+ arf_y_in_offset += 16;
+ x->mv_limits.col_min -= 16;
+ x->mv_limits.col_max -= 16;
+ }
+ xd->up_available = 1;
+ mb_y_offset += buf->y_stride * 16;
+ gld_y_offset += golden_ref->y_stride * 16;
+ if (alt_ref) arf_y_offset += alt_ref->y_stride * 16;
+ x->mv_limits.row_min -= 16;
+ x->mv_limits.row_max -= 16;
+ offset += cm->mb_cols;
+ }
+}
+
+// void separate_arf_mbs_byzz
+static void separate_arf_mbs(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int mb_col, mb_row, offset, i;
+ int mi_row, mi_col;
+ int ncnt[4] = { 0 };
+ int n_frames = cpi->mbgraph_n_frames;
+
+ int *arf_not_zz;
+
+ CHECK_MEM_ERROR(
+ cm, arf_not_zz,
+ aom_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
+
+ // We are not interested in results beyond the alt ref itself.
+ if (n_frames > cpi->rc.frames_till_gf_update_due)
+ n_frames = cpi->rc.frames_till_gf_update_due;
+
+ // defer cost to reference frames
+ for (i = n_frames - 1; i >= 0; i--) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+
+ for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+ offset += cm->mb_cols, mb_row++) {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col];
+
+ int altref_err = mb_stats->ref[ALTREF_FRAME].err;
+ int intra_err = mb_stats->ref[INTRA_FRAME].err;
+ int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
+
+ // Mark the MB when the zero-mv altref error is large, or worse than
+ // the intra or golden-frame error.
+ if (altref_err > 1000 || altref_err > intra_err ||
+ altref_err > golden_err) {
+ arf_not_zz[offset + mb_col]++;
+ }
+ }
+ }
+ }
+
+ // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid
+ // out-of-bounds accesses in segmentation_map
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ // If any of the blocks in the sequence failed then the MB
+ // goes in segment 0
+ if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) {
+ ncnt[0]++;
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
+ } else {
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1;
+ ncnt[1]++;
+ }
+ }
+ }
+
+ // Only bother with segmentation if over 10% of the MBs are in the static
+ // segment
+ // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
+ if (1) {
+ // Note % of blocks that are marked as static
+ if (cm->MBs)
+ cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
+
+ // This error case should not be reachable as this function should
+ // never be called with the common data structure uninitialized.
+ else
+ cpi->static_mb_pct = 0;
+
+ av1_enable_segmentation(&cm->seg);
+ } else {
+ cpi->static_mb_pct = 0;
+ av1_disable_segmentation(&cm->seg);
+ }
+
+ // Free locally allocated storage
+ aom_free(arf_not_zz);
+}
+
+void av1_update_mbgraph_stats(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int i, n_frames = av1_lookahead_depth(cpi->lookahead);
+ YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+
+ assert(golden_ref != NULL);
+
+ // we need to look ahead beyond where the ARF transitions into
+ // being a GF - so exit if we don't look ahead beyond that
+ if (n_frames <= cpi->rc.frames_till_gf_update_due) return;
+
+ if (n_frames > MAX_LAG_BUFFERS) n_frames = MAX_LAG_BUFFERS;
+
+ cpi->mbgraph_n_frames = n_frames;
+ for (i = 0; i < n_frames; i++) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+ memset(frame_stats->mb_stats, 0,
+ cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
+ }
+
+ // do motion search to find contribution of each reference to data
+ // later on in this GF group
+ // FIXME really, the GF/last MC search should be done forward, and
+ // the ARF MC search backwards, to get optimal results for MV caching
+ for (i = 0; i < n_frames; i++) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+ struct lookahead_entry *q_cur = av1_lookahead_peek(cpi->lookahead, i);
+
+ assert(q_cur != NULL);
+
+ update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img, golden_ref,
+ cpi->source);
+ }
+
+ aom_clear_system_state();
+
+ separate_arf_mbs(cpi);
+}
diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h
new file mode 100644
index 000000000..758e2ad15
--- /dev/null
+++ b/third_party/aom/av1/encoder/mbgraph.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_MBGRAPH_H_
+#define AV1_ENCODER_MBGRAPH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ struct {
+ int err;
+ union {
+ int_mv mv;
+ PREDICTION_MODE mode;
+ } m;
+ } ref[TOTAL_REFS_PER_FRAME];
+} MBGRAPH_MB_STATS;
+
+typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS;
+
+struct AV1_COMP;
+
+void av1_update_mbgraph_stats(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_MBGRAPH_H_
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
new file mode 100644
index 000000000..d069eefb0
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -0,0 +1,3493 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rdopt.h"
+
+// #define NEW_DIAMOND_SEARCH
+
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+ const MV *mv) {
+ return &buf->buf[mv->row * buf->stride + mv->col];
+}
+
+void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv) {
+ int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
+ int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
+ int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
+ int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
+
+ col_min = AOMMAX(col_min, (MV_LOW >> 3) + 1);
+ row_min = AOMMAX(row_min, (MV_LOW >> 3) + 1);
+ col_max = AOMMIN(col_max, (MV_UPP >> 3) - 1);
+ row_max = AOMMIN(row_max, (MV_UPP >> 3) - 1);
+
+ // Get intersection of UMV window and valid MV window to reduce # of checks
+ // in diamond search.
+ if (mv_limits->col_min < col_min) mv_limits->col_min = col_min;
+ if (mv_limits->col_max > col_max) mv_limits->col_max = col_max;
+ if (mv_limits->row_min < row_min) mv_limits->row_min = row_min;
+ if (mv_limits->row_max > row_max) mv_limits->row_max = row_max;
+}
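+
+/* The "+ (mv->col & 7 ? 1 : 0)" terms above shrink the window by one full
+ * pel on the low side whenever the reference mv has a fractional part, so
+ * that every full-pel candidate stays within MAX_FULL_PEL_VAL of the
+ * reference when distances are measured in 1/8-pel units. */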
+
+static void av1_set_subpel_mv_search_range(const MvLimits *mv_limits,
+ int *col_min, int *col_max,
+ int *row_min, int *row_max,
+ const MV *ref_mv) {
+ const int max_mv = MAX_FULL_PEL_VAL * 8;
+ const int minc = AOMMAX(mv_limits->col_min * 8, ref_mv->col - max_mv);
+ const int maxc = AOMMIN(mv_limits->col_max * 8, ref_mv->col + max_mv);
+ const int minr = AOMMAX(mv_limits->row_min * 8, ref_mv->row - max_mv);
+ const int maxr = AOMMIN(mv_limits->row_max * 8, ref_mv->row + max_mv);
+
+ *col_min = AOMMAX(MV_LOW + 1, minc);
+ *col_max = AOMMIN(MV_UPP - 1, maxc);
+ *row_min = AOMMAX(MV_LOW + 1, minr);
+ *row_max = AOMMIN(MV_UPP - 1, maxr);
+}
+
+int av1_init_search_range(int size) {
+ int sr = 0;
+ // Minimum search size, no matter what value was passed in.
+ size = AOMMAX(16, size);
+
+ while ((size << sr) < MAX_FULL_PEL_VAL) sr++;
+
+ sr = AOMMIN(sr, MAX_MVSEARCH_STEPS - 2);
+ return sr;
+}
+
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+ int *const comp_cost[2]) {
+ return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] +
+ comp_cost[1][mv->col];
+}
+
+int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
+ int *mvcost[2], int weight) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);
+}
+
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
+ int *mvcost[2], int error_per_bit) {
+ if (mvcost) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ return (int)ROUND_POWER_OF_TWO_64(
+ (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+ RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT +
+ PIXEL_TRANSFORM_ERROR_SCALE);
+ }
+ return 0;
+}
+
+static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
+ int sad_per_bit) {
+ const MV diff = { (mv->row - ref->row) * 8, (mv->col - ref->col) * 8 };
+ return ROUND_POWER_OF_TWO(
+ (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->mvsadcost) * sad_per_bit,
+ AV1_PROB_COST_SHIFT);
+}
+
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) {
+ int len, ss_count = 1;
+
+ cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
+ cfg->ss[0].offset = 0;
+
+ for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
+ // Generate offsets for 4 search sites per step.
+ const MV ss_mvs[] = { { -len, 0 }, { len, 0 }, { 0, -len }, { 0, len } };
+ int i;
+ for (i = 0; i < 4; ++i) {
+ search_site *const ss = &cfg->ss[ss_count++];
+ ss->mv = ss_mvs[i];
+ ss->offset = ss->mv.row * stride + ss->mv.col;
+ }
+ }
+
+ cfg->ss_count = ss_count;
+ cfg->searches_per_step = 4;
+}
+
+void av1_init3smotion_compensation(search_site_config *cfg, int stride) {
+ int len, ss_count = 1;
+
+ cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
+ cfg->ss[0].offset = 0;
+
+ for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
+ // Generate offsets for 8 search sites per step.
+ const MV ss_mvs[8] = { { -len, 0 }, { len, 0 }, { 0, -len },
+ { 0, len }, { -len, -len }, { -len, len },
+ { len, -len }, { len, len } };
+ int i;
+ for (i = 0; i < 8; ++i) {
+ search_site *const ss = &cfg->ss[ss_count++];
+ ss->mv = ss_mvs[i];
+ ss->offset = ss->mv.row * stride + ss->mv.col;
+ }
+ }
+
+ cfg->ss_count = ss_count;
+ cfg->searches_per_step = 8;
+}
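+
+/* In both site generators above, ss[i].offset precomputes
+ * mv.row * stride + mv.col; e.g. with an assumed stride of 80, the site
+ * { -len, 0 } sits -80 * len samples from the current candidate, so the
+ * full-pel search can address candidates by pointer arithmetic alone. */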
+
+/*
+ * To avoid the penalty of cache-line-crossing reads, preload the reference
+ * area into a small aligned buffer so that reads from it never straddle a
+ * cache line. This reduces the CPU cycles spent reading ref data in the
+ * sub-pixel filter functions.
+ * TODO: Since the sub-pixel search range here is -3 ~ 3, copying a 22-row x
+ * 32-col area is enough for a 16x16 macroblock. Later, for SPLITMV, we
+ * could reduce the area.
+ */
+
+// convert motion vector component to offset for sv[a]f calc
+static INLINE int sp(int x) { return x & 7; }
+
+static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
+ return &buf[(r >> 3) * stride + (c >> 3)];
+}
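+
+/* These two helpers split a 1/8-pel coordinate into integer and fractional
+ * parts: for c = 21 (2 + 5/8 pel), pre() addresses the full-pel sample at
+ * column 21 >> 3 = 2 and sp() returns the subpel phase 21 & 7 = 5 that is
+ * fed to the svf/svaf functions. */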
+
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+ if (second_pred == NULL) \
+ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse); \
+ else \
+ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse, second_pred); \
+ v += thismse; \
+ if (v < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+static INLINE const uint8_t *upre(const uint8_t *buf, int stride, int r,
+ int c) {
+ return &buf[(r)*stride + (c)];
+}
+
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, \
+ upre(y, y_stride, r, c), y_stride, \
+ second_pred, w, h, &sse); \
+ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+ v += thismse; \
+ if (v < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#define FIRST_LEVEL_CHECKS \
+ { \
+ unsigned int left, right, up, down, diag; \
+ CHECK_BETTER(left, tr, tc - hstep); \
+ CHECK_BETTER(right, tr, tc + hstep); \
+ CHECK_BETTER(up, tr - hstep, tc); \
+ CHECK_BETTER(down, tr + hstep, tc); \
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); \
+ switch (whichdir) { \
+ case 0: CHECK_BETTER(diag, tr - hstep, tc - hstep); break; \
+ case 1: CHECK_BETTER(diag, tr - hstep, tc + hstep); break; \
+ case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \
+ case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \
+ } \
+ }
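+
+/* FIRST_LEVEL_CHECKS probes the four axis-aligned neighbors at the current
+ * step size and then the single diagonal implied by the best
+ * horizontal/vertical pair, e.g. if "left" and "up" win, only the up-left
+ * diagonal is tried. */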
+
+#define SECOND_LEVEL_CHECKS \
+ { \
+ int kr, kc; \
+ unsigned int second; \
+ if (tr != br && tc != bc) { \
+ kr = br - tr; \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + kr, tc + 2 * kc); \
+ CHECK_BETTER(second, tr + 2 * kr, tc + kc); \
+ } else if (tr == br && tc != bc) { \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + hstep, tc + 2 * kc); \
+ CHECK_BETTER(second, tr - hstep, tc + 2 * kc); \
+ switch (whichdir) { \
+ case 0: \
+ case 1: CHECK_BETTER(second, tr + hstep, tc + kc); break; \
+ case 2: \
+ case 3: CHECK_BETTER(second, tr - hstep, tc + kc); break; \
+ } \
+ } else if (tr != br && tc == bc) { \
+ kr = br - tr; \
+ CHECK_BETTER(second, tr + 2 * kr, tc + hstep); \
+ CHECK_BETTER(second, tr + 2 * kr, tc - hstep); \
+ switch (whichdir) { \
+ case 0: \
+ case 2: CHECK_BETTER(second, tr + kr, tc + hstep); break; \
+ case 1: \
+ case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \
+ } \
+ } \
+ }
+
+// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrite of
+// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
+// later in the same way.
+#define SECOND_LEVEL_CHECKS_BEST(k) \
+ { \
+ unsigned int second; \
+ int br0 = br; \
+ int bc0 = bc; \
+ assert(tr == br || tc == bc); \
+ if (tr == br && tc != bc) { \
+ kc = bc - tc; \
+ } else if (tr != br && tc == bc) { \
+ kr = br - tr; \
+ } \
+ CHECK_BETTER##k(second, br0 + kr, bc0); \
+ CHECK_BETTER##k(second, br0, bc0 + kc); \
+ if (br0 != br || bc0 != bc) { \
+ CHECK_BETTER##k(second, br0 + kr, bc0 + kc); \
+ } \
+ }
+
+#define SETUP_SUBPEL_SEARCH \
+ const uint8_t *const src_address = x->plane[0].src.buf; \
+ const int src_stride = x->plane[0].src.stride; \
+ const MACROBLOCKD *xd = &x->e_mbd; \
+ unsigned int besterr = INT_MAX; \
+ unsigned int sse; \
+ unsigned int whichdir; \
+ int thismse; \
+ MV *bestmv = &x->best_mv.as_mv; \
+ const unsigned int halfiters = iters_per_step; \
+ const unsigned int quarteriters = iters_per_step; \
+ const unsigned int eighthiters = iters_per_step; \
+ const int y_stride = xd->plane[0].pre[0].stride; \
+ const int offset = bestmv->row * y_stride + bestmv->col; \
+ const uint8_t *const y = xd->plane[0].pre[0].buf; \
+ \
+ int br = bestmv->row * 8; \
+ int bc = bestmv->col * 8; \
+ int hstep = 4; \
+ int minc, maxc, minr, maxr; \
+ int tr = br; \
+ int tc = bc; \
+ \
+ av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \
+ ref_mv); \
+ \
+ bestmv->row *= 8; \
+ bestmv->col *= 8;
+
+static unsigned int setup_center_error(
+ const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
+ int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ const uint8_t *const src, const int src_stride, const uint8_t *const y,
+ int y_stride, const uint8_t *second_pred, int w, int h, int offset,
+ int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) {
+ unsigned int besterr;
+#if CONFIG_HIGHBITDEPTH
+ if (second_pred != NULL) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+ aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+ y_stride);
+ besterr =
+ vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+ aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ }
+ } else {
+ besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+ }
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+#else
+ (void)xd;
+ if (second_pred != NULL) {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+ aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ } else {
+ besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+ }
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+#endif // CONFIG_HIGHBITDEPTH
+ return besterr;
+}
+
+static INLINE int divide_and_round(int n, int d) {
+ return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
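+
+/* divide_and_round() rounds to the nearest integer with ties away from
+ * zero, e.g. divide_and_round(7, 2) = 4 and divide_and_round(-7, 2) = -4,
+ * where plain integer division would truncate to 3 and -3. */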
+
+static INLINE int is_cost_list_wellbehaved(int *cost_list) {
+ return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] &&
+ cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4];
+}
+
+// Returns surface minima estimate at given precision in 1/2^n bits.
+// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minima (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
+ *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
+ (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
+ *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
+ (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+}
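+
+/* Illustration with assumed costs: for cost_list = {100, 108, 112, 104,
+ * 120} and bits = 2, ic = round((108 - 104) * 2 / (108 - 200 + 104)) = 1
+ * and ir = round((120 - 112) * 2 / (120 - 200 + 112)) = 1, i.e. the
+ * minimum is estimated at (+1, +1) in units of 1/4 of the search step. */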
+
+int av1_find_best_sub_pixel_tree_pruned_evenmore(
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ SETUP_SUBPEL_SEARCH;
+ besterr = setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
+ (void)halfiters;
+ (void)quarteriters;
+ (void)eighthiters;
+ (void)whichdir;
+ (void)allow_hp;
+ (void)forced_stop;
+ (void)hstep;
+ (void)use_upsampled_ref;
+
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ int ir, ic;
+ unsigned int minpt;
+ get_cost_surf_min(cost_list, &ir, &ic, 2);
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+
+ tr = br;
+ tc = bc;
+
+ // Each subsequent iteration checks at least one point in common with
+ // the last iteration (two if the diagonal was selected), at 1/4 pel.
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ if (allow_hp && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned_more(
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ SETUP_SUBPEL_SEARCH;
+ (void)use_upsampled_ref;
+
+ besterr = setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ unsigned int minpt;
+ int ir, ic;
+ get_cost_surf_min(cost_list, &ir, &ic, 1);
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ // Each subsequent iteration checks at least one point in common with
+ // the last iteration (two if the diagonal was selected), at 1/4 pel.
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ tr = br;
+ tc = bc;
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ if (allow_hp && forced_stop == 0) {
+ tr = br;
+ tc = bc;
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned(
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ SETUP_SUBPEL_SEARCH;
+ (void)use_upsampled_ref;
+
+ besterr = setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX) {
+ unsigned int left, right, up, down, diag;
+ whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+ (cost_list[2] < cost_list[4] ? 0 : 2);
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ // Each subsequent iteration checks at least one point in common with
+ // the last iteration (two points if the diagonal was selected).
+ // Note forced_stop: 0 - full, 1 - quarter-pel only, 2 - half-pel only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+
+ if (allow_hp && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+/* clang-format off */
+static const MV search_step_table[12] = {
+ // left, right, up, down
+ { 0, -4 }, { 0, 4 }, { -4, 0 }, { 4, 0 },
+ { 0, -2 }, { 0, 2 }, { -2, 0 }, { 2, 0 },
+ { 0, -1 }, { 0, 1 }, { -1, 0 }, { 1, 0 }
+};
+/* clang-format on */
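+ // The table above is consumed four entries at a time, one row of
+ // left/right/up/down offsets per refinement level. Offsets are in
+ // 1/8-pel units, so { 0, -4 } is a half-pel step left, { 0, -2 } a
+ // quarter-pel step, and { 0, -1 } an eighth-pel step, matching the
+ // hstep values 4, 2, 1 used by av1_find_best_sub_pixel_tree() below.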
+
+static int upsampled_pref_error(const MACROBLOCKD *xd,
+ const aom_variance_fn_ptr_t *vfp,
+ const uint8_t *const src, const int src_stride,
+ const uint8_t *const y, int y_stride,
+ const uint8_t *second_pred, int w, int h,
+ unsigned int *sse) {
+ unsigned int besterr;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+ if (second_pred != NULL)
+ aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y,
+ y_stride);
+ else
+ aom_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+ besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+#else
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+ (void)xd;
+#endif // CONFIG_HIGHBITDEPTH
+ if (second_pred != NULL)
+ aom_comp_avg_upsampled_pred(pred, second_pred, w, h, y, y_stride);
+ else
+ aom_upsampled_pred(pred, w, h, y, y_stride);
+
+ besterr = vfp->vf(pred, w, src, src_stride, sse);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+ return besterr;
+}
+
+static unsigned int upsampled_setup_center_error(
+ const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
+ int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ const uint8_t *const src, const int src_stride, const uint8_t *const y,
+ int y_stride, const uint8_t *second_pred, int w, int h, int offset,
+ int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) {
+ unsigned int besterr = upsampled_pref_error(
+ xd, vfp, src, src_stride, y + offset, y_stride, second_pred, w, h, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
+int av1_find_best_sub_pixel_tree(MACROBLOCK *x, const MV *ref_mv, int allow_hp,
+ int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ const uint8_t *const src_address = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ unsigned int thismse;
+ const int y_stride = xd->plane[0].pre[0].stride;
+ MV *bestmv = &x->best_mv.as_mv;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ int iter, round = 3 - forced_stop;
+ int tr = br;
+ int tc = bc;
+ const MV *search_step = search_step_table;
+ int idx, best_idx = -1;
+ unsigned int cost_array[5];
+ int kr, kc;
+ int minc, maxc, minr, maxr;
+
+ av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
+ ref_mv);
+
+ if (!allow_hp)
+ if (round == 3) round = 2;
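+ // E.g. forced_stop == 0 gives round == 3 (half-, quarter- and
+ // eighth-pel passes); with allow_hp == 0 the eighth-pel pass is
+ // dropped, so round is clamped to 2.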
+
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+
+ // use_upsampled_ref can be 0 or 1
+ if (use_upsampled_ref)
+ besterr = upsampled_setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, (offset * 8), mvjcost, mvcost, sse1,
+ distortion);
+ else
+ besterr = setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
+
+ (void)cost_list; // to silence compiler warning
+
+ for (iter = 0; iter < round; ++iter) {
+ // Check vertical and horizontal sub-pixel positions.
+ for (idx = 0; idx < 4; ++idx) {
+ tr = br + search_step[idx].row;
+ tc = bc + search_step[idx].col;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_upsampled_ref) {
+ const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+ thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
+ pre_address, y_stride, second_pred, w,
+ h, &sse);
+ } else {
+ const uint8_t *const pre_address =
+ y + (tr >> 3) * y_stride + (tc >> 3);
+ if (second_pred == NULL)
+ thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, src_stride, &sse);
+ else
+ thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, src_stride, &sse, second_pred);
+ }
+
+ cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+ mvcost, error_per_bit);
+
+ if (cost_array[idx] < besterr) {
+ best_idx = idx;
+ besterr = cost_array[idx];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX;
+ }
+ }
+
+ // Check diagonal sub-pixel position
+ kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+ kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+ tc = bc + kc;
+ tr = br + kr;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_upsampled_ref) {
+ const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+ thismse =
+ upsampled_pref_error(xd, vfp, src_address, src_stride, pre_address,
+ y_stride, second_pred, w, h, &sse);
+ } else {
+ const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+ if (second_pred == NULL)
+ thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
+ src_stride, &sse);
+ else
+ thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, src_stride, &sse, second_pred);
+ }
+
+ cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+ error_per_bit);
+
+ if (cost_array[4] < besterr) {
+ best_idx = 4;
+ besterr = cost_array[4];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[4] = INT_MAX;
+ }
+
+ if (best_idx < 4 && best_idx >= 0) {
+ br += search_step[best_idx].row;
+ bc += search_step[best_idx].col;
+ } else if (best_idx == 4) {
+ br = tr;
+ bc = tc;
+ }
+
+ if (iters_per_step > 1 && best_idx != -1) {
+ if (use_upsampled_ref) {
+ SECOND_LEVEL_CHECKS_BEST(1);
+ } else {
+ SECOND_LEVEL_CHECKS_BEST(0);
+ }
+ }
+
+ search_step += 4;
+ hstep >>= 1;
+ best_idx = -1;
+ }
+
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+#undef PRE
+#undef CHECK_BETTER
+
+static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col,
+ int range) {
+ return ((row - range) >= mv_limits->row_min) &
+ ((row + range) <= mv_limits->row_max) &
+ ((col - range) >= mv_limits->col_min) &
+ ((col + range) <= mv_limits->col_max);
+}
+
+static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) {
+ return (mv->col >= mv_limits->col_min) && (mv->col <= mv_limits->col_max) &&
+ (mv->row >= mv_limits->row_min) && (mv->row <= mv_limits->row_max);
+}
+
+#define CHECK_BETTER \
+ { \
+ if (thissad < bestsad) { \
+ if (use_mvcost) \
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); \
+ if (thissad < bestsad) { \
+ bestsad = thissad; \
+ best_site = i; \
+ } \
+ } \
+ }
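+ // The two-stage compare above avoids the mv cost computation when the
+ // raw SAD already loses. E.g., with hypothetical values bestsad == 1000
+ // and thissad == 900, the first test passes; if the mv cost then pushes
+ // thissad to 1050, the second test fails and best_site is unchanged.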
+
+#define MAX_PATTERN_SCALES 11
+ #define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
+
+// Calculate and return a sad+mvcost list around an integer best pel.
+static INLINE void calc_int_cost_list(const MACROBLOCK *x,
+ const MV *const ref_mv, int sadpb,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *best_mv, int *cost_list) {
+ static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
+ const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ const int br = best_mv->row;
+ const int bc = best_mv->col;
+ int i;
+ unsigned int sse;
+ const MV this_mv = { br, bc };
+
+ cost_list[0] =
+ fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv),
+ in_what->stride, &sse) +
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+ if (check_bounds(&x->mv_limits, br, bc, 1)) {
+ for (i = 0; i < 4; i++) {
+ const MV neighbor_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &neighbor_mv),
+ in_what->stride, &sse) +
+ mv_err_cost(&neighbor_mv, &fcenter_mv, x->nmvjointcost,
+ x->mvcost, x->errorperbit);
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ const MV neighbor_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ if (!is_mv_in(&x->mv_limits, &neighbor_mv))
+ cost_list[i + 1] = INT_MAX;
+ else
+ cost_list[i + 1] =
+ fn_ptr->vf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &neighbor_mv), in_what->stride,
+ &sse) +
+ mv_err_cost(&neighbor_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit);
+ }
+ }
+}
+
+static INLINE void calc_int_sad_list(const MACROBLOCK *x,
+ const MV *const ref_mv, int sadpb,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *best_mv, int *cost_list,
+ const int use_mvcost, const int bestsad) {
+ static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
+ const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ int i;
+ const int br = best_mv->row;
+ const int bc = best_mv->col;
+
+ if (cost_list[0] == INT_MAX) {
+ cost_list[0] = bestsad;
+ if (check_bounds(&x->mv_limits, br, bc, 1)) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ cost_list[i + 1] =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv))
+ cost_list[i + 1] = INT_MAX;
+ else
+ cost_list[i + 1] =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ }
+ }
+ } else {
+ if (use_mvcost) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ if (cost_list[i + 1] != INT_MAX) {
+ cost_list[i + 1] += mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+ }
+ }
+ }
+ }
+}
+
+ // Generic pattern search function that searches over multiple scales.
+ // Each scale can have a different number and shape of candidates, as
+ // indicated by the num_candidates and candidates arrays passed into
+ // this function.
+ //
+static int pattern_search(
+ MACROBLOCK *x, MV *start_mv, int search_param, int sad_per_bit,
+ int do_init_search, int *cost_list, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost, const MV *center_mv,
+ const int num_candidates[MAX_PATTERN_SCALES],
+ const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ };
+ int i, s, t;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const int last_is_4 = num_candidates[0] == 4;
+ int br, bc;
+ int bestsad = INT_MAX;
+ int thissad;
+ int k = -1;
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ int best_init_s = search_param_to_steps[search_param];
+ // adjust ref_mv to make sure it is within MV range
+ clamp_mv(start_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ br = start_mv->row;
+ bc = start_mv->col;
+ if (cost_list != NULL) {
+ cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
+ INT_MAX;
+ }
+
+ // Work out the start point for the search
+ bestsad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, start_mv), in_what->stride) +
+ mvsad_err_cost(x, start_mv, &fcenter_mv, sad_per_bit);
+
+ // Search all possible scales up to the search param around the center
+ // point, and pick the scale of the best point as the starting scale for
+ // further steps around it.
+ if (do_init_search) {
+ s = best_init_s;
+ best_init_s = -1;
+ for (t = 0; t <= s; ++t) {
+ int best_site = -1;
+ if (check_bounds(&x->mv_limits, br, bc, 1 << t)) {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+ if (best_site == -1) {
+ continue;
+ } else {
+ best_init_s = t;
+ k = best_site;
+ }
+ }
+ if (best_init_s != -1) {
+ br += candidates[best_init_s][k].row;
+ bc += candidates[best_init_s][k].col;
+ }
+ }
+
+ // If the center point is still the best, just skip this and move to
+ // the refinement step.
+ if (best_init_s != -1) {
+ const int last_s = (last_is_4 && cost_list != NULL);
+ int best_site = -1;
+ s = best_init_s;
+
+ for (; s >= last_s; s--) {
+ // No need to search all points the 1st time if initial search was used
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1) {
+ continue;
+ } else {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+
+ do {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ } while (best_site != -1);
+ }
+
+ // Note: If we enter the if below, then cost_list must be non-NULL.
+ if (s == 0) {
+ cost_list[0] = bestsad;
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ cost_list[i + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ cost_list[i + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+ while (best_site != -1) {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+ cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+ cost_list[((k + 2) % 4) + 1] = cost_list[0];
+ cost_list[0] = bestsad;
+
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ cost_list[next_chkpts_indices[i] + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) {
+ cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
+ continue;
+ }
+ cost_list[next_chkpts_indices[i] + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ }
+ }
+ }
+
+ // Returns the one-away integer pel cost/sad around the best as follows:
+ // cost_list[0]: cost/sad at the best integer pel
+ // cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel
+ // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
+ // cost_list[3]: cost/sad at delta { 0, 1} (right) from the best integer pel
+ // cost_list[4]: cost/sad at delta {-1, 0} (top) from the best integer pel
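+ // I.e., relative to the best integer pel at (br, bc):
+ //
+ //          [4]
+ //     [1]  [0]  [3]
+ //          [2]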
+ if (cost_list) {
+ const MV best_int_mv = { br, bc };
+ if (last_is_4) {
+ calc_int_sad_list(x, center_mv, sad_per_bit, vfp, &best_int_mv, cost_list,
+ use_mvcost, bestsad);
+ } else {
+ calc_int_cost_list(x, center_mv, sad_per_bit, vfp, &best_int_mv,
+ cost_list);
+ }
+ }
+ x->best_mv.as_mv.row = br;
+ x->best_mv.as_mv.col = bc;
+ return bestsad;
+}
+
+int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ return vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
+ in_what->stride, &unused) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
+int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const uint8_t *second_pred,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
+ what->buf, what->stride, &unused, second_pred) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
+int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ // The first scale has the 8 closest points; the rest have 6 points in a
+ // hex shape at increasing scales.
+ static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6 };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 },
+ { -1, 0 } },
+ { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
+ { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
+ { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
+ { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } },
+ { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
+ { -32, 0 } },
+ { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
+ { -64, 0 } },
+ { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 },
+ { -128, 0 } },
+ { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 },
+ { -256, 0 } },
+ { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 },
+ { -512, 0 } },
+ { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
+ { -512, 1024 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ hex_num_candidates, hex_candidates);
+}
+
+static int bigdia_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ // The first scale has the 4 closest points; the rest have 8 points in a
+ // diamond shape at increasing scales.
+ static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+ 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV
+ bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } },
+ { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
+ { -1, 1 }, { -2, 0 } },
+ { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
+ { -2, 2 }, { -4, 0 } },
+ { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
+ { -4, 4 }, { -8, 0 } },
+ { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
+ { -8, 8 }, { -16, 0 } },
+ { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
+ { 0, 32 }, { -16, 16 }, { -32, 0 } },
+ { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
+ { 0, 64 }, { -32, 32 }, { -64, 0 } },
+ { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
+ { 0, 128 }, { -64, 64 }, { -128, 0 } },
+ { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 },
+ { 0, 256 }, { -128, 128 }, { -256, 0 } },
+ { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 },
+ { 0, 512 }, { -256, 256 }, { -512, 0 } },
+ { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
+ { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ bigdia_num_candidates, bigdia_candidates);
+}
+
+static int square_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ // All scales have the 8 closest points in a square shape.
+ static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV
+ square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
+ { -2, 2 }, { -2, 0 } },
+ { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
+ { -4, 4 }, { -4, 0 } },
+ { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
+ { -8, 8 }, { -8, 0 } },
+ { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
+ { 0, 16 }, { -16, 16 }, { -16, 0 } },
+ { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
+ { 0, 32 }, { -32, 32 }, { -32, 0 } },
+ { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
+ { 0, 64 }, { -64, 64 }, { -64, 0 } },
+ { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 },
+ { 0, 128 }, { -128, 128 }, { -128, 0 } },
+ { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 },
+ { 0, 256 }, { -256, 256 }, { -256, 0 } },
+ { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 },
+ { 0, 512 }, { -512, 512 }, { -512, 0 } },
+ { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
+ { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ square_num_candidates, square_candidates);
+}
+
+static int fast_hex_search(MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit,
+ int do_init_search, // must be zero for fast_hex
+ int *cost_list, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost, const MV *center_mv) {
+ return av1_hex_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+ sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
+ center_mv);
+}
+
+static int fast_dia_search(MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ return bigdia_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+ sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
+ center_mv);
+}
+
+#undef CHECK_BETTER
+
+ // Exhaustive motion search around a given centre position with a given
+ // step size.
+static int exhuastive_mesh_search(MACROBLOCK *x, MV *ref_mv, MV *best_mv,
+ int range, int step, int sad_per_bit,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ MV fcenter_mv = { center_mv->row, center_mv->col };
+ unsigned int best_sad = INT_MAX;
+ int r, c, i;
+ int start_col, end_col, start_row, end_row;
+ int col_step = (step > 1) ? step : 4;
+
+ assert(step >= 1);
+
+ clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ *best_mv = fcenter_mv;
+ best_sad =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+ mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+ start_row = AOMMAX(-range, x->mv_limits.row_min - fcenter_mv.row);
+ start_col = AOMMAX(-range, x->mv_limits.col_min - fcenter_mv.col);
+ end_row = AOMMIN(range, x->mv_limits.row_max - fcenter_mv.row);
+ end_col = AOMMIN(range, x->mv_limits.col_max - fcenter_mv.col);
+
+ for (r = start_row; r <= end_row; r += step) {
+ for (c = start_col; c <= end_col; c += col_step) {
+ // Step > 1 means we are not checking every location in this pass.
+ if (step > 1) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c };
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
+ *best_mv = mv;
+ }
+ }
+ } else {
+ // 4 sads in a single call if we are checking every location
+ if (c + 3 <= end_col) {
+ unsigned int sads[4];
+ const uint8_t *addrs[4];
+ for (i = 0; i < 4; ++i) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ addrs[i] = get_buf_from_mv(in_what, &mv);
+ }
+ fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
+
+ for (i = 0; i < 4; ++i) {
+ if (sads[i] < best_sad) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ const unsigned int sad =
+ sads[i] + mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
+ *best_mv = mv;
+ }
+ }
+ }
+ } else {
+ for (i = 0; i < end_col - c; ++i) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
+ *best_mv = mv;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return best_sad;
+}
+
+int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg,
+ MV *ref_mv, MV *best_mv, int search_param,
+ int sad_per_bit, int *num00,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ int i, j, step;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ uint8_t *what = x->plane[0].src.buf;
+ const int what_stride = x->plane[0].src.stride;
+ const uint8_t *in_what;
+ const int in_what_stride = xd->plane[0].pre[0].stride;
+ const uint8_t *best_address;
+
+ unsigned int bestsad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row;
+ int ref_col;
+
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
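+ // E.g. search_param == 2 skips the two largest-step entries of the
+ // search site table and runs the remaining tot_steps iterations with
+ // the initial step size quartered.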
+ const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+ const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ ref_row = ref_mv->row;
+ ref_col = ref_mv->col;
+ *num00 = 0;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // Work out the start point for the search
+ in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+ best_address = in_what;
+
+ // Check the starting position
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+ i = 1;
+
+ for (step = 0; step < tot_steps; step++) {
+ int all_in = 1, t;
+
+ // all_in is true if every one of the points we are checking is within
+ // the bounds of the image.
+ all_in &= ((best_mv->row + ss[i].mv.row) > x->mv_limits.row_min);
+ all_in &= ((best_mv->row + ss[i + 1].mv.row) < x->mv_limits.row_max);
+ all_in &= ((best_mv->col + ss[i + 2].mv.col) > x->mv_limits.col_min);
+ all_in &= ((best_mv->col + ss[i + 3].mv.col) < x->mv_limits.col_max);
+
+ // If all the pixels are within the bounds, we don't check whether the
+ // search point is valid in this loop; otherwise we check each point
+ // for validity.
+ if (all_in) {
+ unsigned int sad_array[4];
+
+ for (j = 0; j < cfg->searches_per_step; j += 4) {
+ unsigned char const *block_offset[4];
+
+ for (t = 0; t < 4; t++)
+ block_offset[t] = ss[i + t].offset + best_address;
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
+
+ for (t = 0; t < 4; t++, i++) {
+ if (sad_array[t] < bestsad) {
+ const MV this_mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+ sad_array[t] +=
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (sad_array[t] < bestsad) {
+ bestsad = sad_array[t];
+ best_site = i;
+ }
+ }
+ }
+ }
+ } else {
+ for (j = 0; j < cfg->searches_per_step; j++) {
+ // Trap illegal vectors
+ const MV this_mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ const uint8_t *const check_here = ss[i].offset + best_address;
+ unsigned int thissad =
+ fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ i++;
+ }
+ }
+ if (best_site != last_site) {
+ x->second_best_mv.as_mv = *best_mv;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+ while (1) {
+ const MV this_mv = { best_mv->row + ss[best_site].mv.row,
+ best_mv->col + ss[best_site].mv.col };
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ const uint8_t *const check_here = ss[best_site].offset + best_address;
+ unsigned int thissad =
+ fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+#endif
+ } else if (best_address == in_what) {
+ (*num00)++;
+ }
+ }
+ return bestsad;
+}
+
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+ int best_sad = INT_MAX;
+ int this_sad;
+ int d;
+ int center, offset = 0;
+ int bw = 4 << bwl; // redundant variable, to be changed in the experiments.
+ for (d = 0; d <= bw; d += 16) {
+ this_sad = aom_vector_var(&ref[d], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ offset = d;
+ }
+ }
+ center = offset;
+
+ for (d = -8; d <= 8; d += 16) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -4; d <= 4; d += 8) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -2; d <= 2; d += 4) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -1; d <= 1; d += 2) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+
+ return (center - (bw >> 1));
+}
+
+static const MV search_pos[4] = {
+ { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
+};
+
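+ // Estimates an integer motion vector by projecting the reference and
+ // source blocks onto 1-D row/column sums (aom_int_pro_row /
+ // aom_int_pro_col), matching each projection with vector_match(), and
+ // then refining the result with a small SAD search around it.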
+unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ DECLARE_ALIGNED(16, int16_t, hbuf[2 * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, int16_t, vbuf[2 * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, int16_t, src_hbuf[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, int16_t, src_vbuf[MAX_SB_SQUARE]);
+ int idx;
+ const int bw = 4 << b_width_log2_lookup[bsize];
+ const int bh = 4 << b_height_log2_lookup[bsize];
+ const int search_width = bw << 1;
+ const int search_height = bh << 1;
+ const int src_stride = x->plane[0].src.stride;
+ const int ref_stride = xd->plane[0].pre[0].stride;
+ uint8_t const *ref_buf, *src_buf;
+ MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv;
+ unsigned int best_sad, tmp_sad, sad_arr[4];
+ MV this_mv;
+ const int norm_factor = 3 + (bw >> 5);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
+
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+ av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ {
+ unsigned int this_sad;
+ tmp_mv->row = 0;
+ tmp_mv->col = 0;
+ this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+ xd->plane[0].pre[0].buf, ref_stride);
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ return this_sad;
+ }
+#endif
+
+ // Set up prediction 1-D reference set
+ ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
+ for (idx = 0; idx < search_width; idx += 16) {
+ aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+ ref_buf += 16;
+ }
+
+ ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
+ for (idx = 0; idx < search_height; ++idx) {
+ vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor;
+ ref_buf += ref_stride;
+ }
+
+ // Set up src 1-D reference set
+ for (idx = 0; idx < bw; idx += 16) {
+ src_buf = x->plane[0].src.buf + idx;
+ aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+ }
+
+ src_buf = x->plane[0].src.buf;
+ for (idx = 0; idx < bh; ++idx) {
+ src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor;
+ src_buf += src_stride;
+ }
+
+ // Find the best match per 1-D search
+ tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+ tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
+
+ this_mv = *tmp_mv;
+ src_buf = x->plane[0].src.buf;
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+ best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+ {
+ const uint8_t *const pos[4] = {
+ ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride,
+ };
+
+ cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, sad_arr);
+ }
+
+ for (idx = 0; idx < 4; ++idx) {
+ if (sad_arr[idx] < best_sad) {
+ best_sad = sad_arr[idx];
+ tmp_mv->row = search_pos[idx].row + this_mv.row;
+ tmp_mv->col = search_pos[idx].col + this_mv.col;
+ }
+ }
+
+ if (sad_arr[0] < sad_arr[3])
+ this_mv.row -= 1;
+ else
+ this_mv.row += 1;
+
+ if (sad_arr[1] < sad_arr[2])
+ this_mv.col -= 1;
+ else
+ this_mv.col += 1;
+
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+ tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+ if (best_sad > tmp_sad) {
+ *tmp_mv = this_mv;
+ best_sad = tmp_sad;
+ }
+
+ tmp_mv->row *= 8;
+ tmp_mv->col *= 8;
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+
+ return best_sad;
+}
+
+/* do_refine: If last step (1-away) of n-step search doesn't pick the center
+ point as the best match, we will do a final 1-away diamond
+ refining search */
+static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine, int *cost_list,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv) {
+ MV temp_mv;
+ int thissme, n, num00 = 0;
+ int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+ step_param, sadpb, &n, fn_ptr, ref_mv);
+ if (bestsme < INT_MAX)
+ bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+ x->best_mv.as_mv = temp_mv;
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ if (n > further_steps) do_refine = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ num00--;
+ } else {
+ thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+ step_param + n, sadpb, &num00, fn_ptr,
+ ref_mv);
+ if (thissme < INT_MAX)
+ thissme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+
+ // check to see if refining search is needed.
+ if (num00 > further_steps - n) do_refine = 0;
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ x->best_mv.as_mv = temp_mv;
+ }
+ }
+ }
+
+ // final 1-away diamond refining search
+ if (do_refine) {
+ const int search_range = 8;
+ MV best_mv = x->best_mv.as_mv;
+ thissme = av1_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr,
+ ref_mv);
+ if (thissme < INT_MAX)
+ thissme = av1_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ x->best_mv.as_mv = best_mv;
+ }
+ }
+
+ // Return cost list.
+ if (cost_list) {
+ calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, &x->best_mv.as_mv, cost_list);
+ }
+ return bestsme;
+}
+
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+ // Runs a limited-range exhaustive mesh search using a pattern set
+ // according to the encode speed profile.
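+ // E.g. with hypothetical mesh_patterns of (range, interval) pairs
+ // { 64, 4 }, { 32, 2 }, { 16, 1 }, the initial search scans +/-64 pels
+ // at step 4, then +/-32 at step 2, then +/-16 at step 1, at which point
+ // the loop below breaks out.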
+static int full_pixel_exhaustive(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const MV *centre_mv_full, int sadpb,
+ int *cost_list,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MV temp_mv = { centre_mv_full->row, centre_mv_full->col };
+ MV f_ref_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ int bestsme;
+ int i;
+ int interval = sf->mesh_patterns[0].interval;
+ int range = sf->mesh_patterns[0].range;
+ int baseline_interval_divisor;
+
+ // Keep track of number of exhaustive calls (this frame in this thread).
+ ++(*x->ex_search_count_ptr);
+
+ // Trap illegal values for interval and range for this function.
+ if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) ||
+ (interval > range))
+ return INT_MAX;
+
+ baseline_interval_divisor = range / interval;
+
+ // Check size of proposed first range against magnitude of the centre
+ // value used as a starting point.
+ range = AOMMAX(range, (5 * AOMMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+ range = AOMMIN(range, MAX_RANGE);
+ interval = AOMMAX(interval, range / baseline_interval_divisor);
+
+ // initial search
+ bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
+ sadpb, fn_ptr, &temp_mv);
+
+ if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+ // Progressive searches with range and step size decreasing each time
+ // till we reach a step size of 1. Then break out.
+ for (i = 1; i < MAX_MESH_STEP; ++i) {
+ // First pass with coarser step and longer range
+ bestsme = exhuastive_mesh_search(
+ x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range,
+ sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv);
+
+ if (sf->mesh_patterns[i].interval == 1) break;
+ }
+ }
+
+ if (bestsme < INT_MAX)
+ bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+ *dst_mv = temp_mv;
+
+ // Return cost list.
+ if (cost_list) {
+ calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+ }
+ return bestsme;
+}
+
+int av1_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, MV *best_mv) {
+ int r, c;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min);
+ const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max);
+ const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min);
+ const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max);
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ int best_sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+ *best_mv = *ref_mv;
+
+ for (r = row_min; r < row_max; ++r) {
+ for (c = col_min; c < col_max; ++c) {
+ const MV mv = { r, c };
+ const int sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride) +
+ mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ }
+ return best_sad;
+}
+
+int av1_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, MV *best_mv) {
+ int r;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min);
+ const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max);
+ const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min);
+ const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max);
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ unsigned int best_sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+ *best_mv = *ref_mv;
+
+ for (r = row_min; r < row_max; ++r) {
+ int c = col_min;
+ const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
+
+ if (fn_ptr->sdx3f != NULL) {
+ while ((c + 2) < col_max) {
+ int i;
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
+
+ fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
+ sads);
+
+ for (i = 0; i < 3; ++i) {
+ unsigned int sad = sads[i];
+ if (sad < best_sad) {
+ const MV mv = { r, c };
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ ++check_here;
+ ++c;
+ }
+ }
+ }
+
+ while (c < col_max) {
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride);
+ if (sad < best_sad) {
+ const MV mv = { r, c };
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ ++check_here;
+ ++c;
+ }
+ }
+
+ return best_sad;
+}
+
+int av1_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, MV *best_mv) {
+ int r;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min);
+ const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max);
+ const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min);
+ const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max);
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ unsigned int best_sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+ *best_mv = *ref_mv;
+
+ for (r = row_min; r < row_max; ++r) {
+ int c = col_min;
+ const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
+
+ if (fn_ptr->sdx8f != NULL) {
+ while ((c + 7) < col_max) {
+ int i;
+ DECLARE_ALIGNED(16, uint32_t, sads[8]);
+
+ fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
+ sads);
+
+ for (i = 0; i < 8; ++i) {
+ unsigned int sad = sads[i];
+ if (sad < best_sad) {
+ const MV mv = { r, c };
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ ++check_here;
+ ++c;
+ }
+ }
+ }
+
+ if (fn_ptr->sdx3f != NULL) {
+ while ((c + 2) < col_max) {
+ int i;
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
+
+ fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
+ sads);
+
+ for (i = 0; i < 3; ++i) {
+ unsigned int sad = sads[i];
+ if (sad < best_sad) {
+ const MV mv = { r, c };
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ ++check_here;
+ ++c;
+ }
+ }
+ }
+
+ while (c < col_max) {
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride);
+ if (sad < best_sad) {
+ const MV mv = { r, c };
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ ++check_here;
+ ++c;
+ }
+ }
+
+ return best_sad;
+}
+
+int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit,
+ int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
+ unsigned int best_sad =
+ fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+ int i, j;
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;
+ const int all_in = ((ref_mv->row - 1) > x->mv_limits.row_min) &
+ ((ref_mv->row + 1) < x->mv_limits.row_max) &
+ ((ref_mv->col - 1) > x->mv_limits.col_min) &
+ ((ref_mv->col + 1) < x->mv_limits.col_max);
+
+ if (all_in) {
+ unsigned int sads[4];
+ const uint8_t *const positions[4] = { best_address - in_what->stride,
+ best_address - 1, best_address + 1,
+ best_address + in_what->stride };
+
+ fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
+
+ for (j = 0; j < 4; ++j) {
+ if (sads[j] < best_sad) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+ sads[j] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sads[j] < best_sad) {
+ best_sad = sads[j];
+ best_site = j;
+ }
+ }
+ }
+ } else {
+ for (j = 0; j < 4; ++j) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ x->second_best_mv.as_mv = *ref_mv;
+ ref_mv->row += neighbors[best_site].row;
+ ref_mv->col += neighbors[best_site].col;
+ best_address = get_buf_from_mv(in_what, ref_mv);
+ }
+ }
+
+ return best_sad;
+}
+
+// This function is called when we do joint motion search in comp_inter_inter
+// mode.
+int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, const uint8_t *second_pred) {
+ const MV neighbors[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
+ { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } };
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ MV *best_mv = &x->best_mv.as_mv;
+ unsigned int best_sad = INT_MAX;
+ int i, j;
+
+ clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ best_sad =
+ fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
+ in_what->stride, second_pred) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
+
+ for (i = 0; i < search_range; ++i) {
+ int best_site = -1;
+
+ for (j = 0; j < 8; ++j) {
+ const MV mv = { best_mv->row + neighbors[j].row,
+ best_mv->col + neighbors[j].col };
+
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ unsigned int sad =
+ fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride, second_pred);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ best_mv->row += neighbors[best_site].row;
+ best_mv->col += neighbors[best_site].col;
+ }
+ }
+ return best_sad;
+}
+
+#define MIN_EX_SEARCH_LIMIT 128
+static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const int max_ex =
+ AOMMAX(MIN_EX_SEARCH_LIMIT,
+ (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+
+ return sf->allow_exhaustive_searches &&
+ (sf->exhaustive_searches_thresh < INT_MAX) &&
+ (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref;
+}
+
+int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ MV *mvp_full, int step_param, int error_per_bit,
+ int *cost_list, const MV *ref_mv, int var_max,
+ int rd) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const SEARCH_METHODS method = sf->mv.search_method;
+ const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+ int var = 0;
+
+ if (cost_list) {
+ cost_list[0] = INT_MAX;
+ cost_list[1] = INT_MAX;
+ cost_list[2] = INT_MAX;
+ cost_list[3] = INT_MAX;
+ cost_list[4] = INT_MAX;
+ }
+
+ // Keep track of number of searches (this frame in this thread).
+ ++(*x->m_search_count_ptr);
+
+ switch (method) {
+ case FAST_DIAMOND:
+ var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+ cost_list, fn_ptr, 1, ref_mv);
+ break;
+ case FAST_HEX:
+ var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+ cost_list, fn_ptr, 1, ref_mv);
+ break;
+ case HEX:
+ var = av1_hex_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case SQUARE:
+ var = square_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case BIGDIA:
+ var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case NSTEP:
+ var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+ MAX_MVSEARCH_STEPS - 1 - step_param, 1,
+ cost_list, fn_ptr, ref_mv);
+
+ // Should we allow a follow on exhaustive search?
+ if (is_exhaustive_allowed(cpi, x)) {
+ int exhuastive_thr = sf->exhaustive_searches_thresh;
+ exhuastive_thr >>=
+ 10 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+
+ // Threshold variance for an exhaustive full search.
+ if (var > exhuastive_thr) {
+ int var_ex;
+ MV tmp_mv_ex;
+ var_ex =
+ full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
+ cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
+
+ if (var_ex < var) {
+ var = var_ex;
+ x->best_mv.as_mv = tmp_mv_ex;
+ }
+ }
+ }
+ break;
+ default: assert(0 && "Invalid search method.");
+ }
+
+ if (method != NSTEP && rd && var < var_max)
+ var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
+
+ return var;
+}
+
+#if CONFIG_EXT_INTER
+ /* returns the subpixel variance error for (r, c) */
+#define DIST(r, c) \
+ vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, src_stride, \
+ mask, mask_stride, &sse)
+
+/* checks if (r, c) has better score than previous best */
+
+#define MVC(r, c) \
+ (mvcost \
+ ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + mvcost[0][((r)-rr)] + \
+ mvcost[1][((c)-rc)]) * \
+ error_per_bit + \
+ 4096) >> \
+ 13 \
+ : 0)
+
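+ // The MVC() macro converts a bit cost into the same fixed-point domain
+ // as the distortion: (bits * error_per_bit + 4096) >> 13 is a
+ // round-to-nearest division by 8192.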
+#define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = (DIST(r, c)); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#undef CHECK_BETTER0
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+#undef CHECK_BETTER1
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = upsampled_masked_pref_error(xd, mask, mask_stride, vfp, z, \
+ src_stride, upre(y, y_stride, r, c), \
+ y_stride, w, h, &sse); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+int av1_find_best_masked_sub_pixel_tree(
+ const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second) {
+ const uint8_t *const z = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ int thismse;
+ unsigned int whichdir;
+ unsigned int halfiters = iters_per_step;
+ unsigned int quarteriters = iters_per_step;
+ unsigned int eighthiters = iters_per_step;
+
+ const int y_stride = xd->plane[0].pre[is_second].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ const uint8_t *const y = xd->plane[0].pre[is_second].buf;
+
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ int tr = br;
+ int tc = bc;
+ int minc, maxc, minr, maxr;
+
+ av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
+ ref_mv);
+
+ // central mv
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+
+ // calculate central point error
+ besterr =
+ vfp->mvf(y + offset, y_stride, z, src_stride, mask, mask_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+ // 1/2 pel
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+
+ if (allow_hp && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+static unsigned int setup_masked_center_error(
+ const uint8_t *mask, int mask_stride, const MV *bestmv, const MV *ref_mv,
+ int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ const uint8_t *const src, const int src_stride, const uint8_t *const y,
+ int y_stride, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
+ int *distortion) {
+ unsigned int besterr;
+ besterr =
+ vfp->mvf(y + offset, y_stride, src, src_stride, mask, mask_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
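+// Measures the masked prediction error against an upsampled reference,
+// with separate paths for 8-bit and high-bitdepth buffers.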
+static int upsampled_masked_pref_error(const MACROBLOCKD *xd,
+ const uint8_t *mask, int mask_stride,
+ const aom_variance_fn_ptr_t *vfp,
+ const uint8_t *const src,
+ const int src_stride,
+ const uint8_t *const y, int y_stride,
+ int w, int h, unsigned int *sse) {
+ unsigned int besterr;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+ aom_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+ besterr = vfp->mvf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, mask,
+ mask_stride, sse);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+#else
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+ (void)xd;
+#endif // CONFIG_HIGHBITDEPTH
+ aom_upsampled_pred(pred, w, h, y, y_stride);
+
+ besterr = vfp->mvf(pred, w, src, src_stride, mask, mask_stride, sse);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+ return besterr;
+}
+
+static unsigned int upsampled_setup_masked_center_error(
+ const MACROBLOCKD *xd, const uint8_t *mask, int mask_stride,
+ const MV *bestmv, const MV *ref_mv, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
+ const int src_stride, const uint8_t *const y, int y_stride, int w, int h,
+ int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
+ int *distortion) {
+ unsigned int besterr =
+ upsampled_masked_pref_error(xd, mask, mask_stride, vfp, src, src_stride,
+ y + offset, y_stride, w, h, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
+int av1_find_best_masked_sub_pixel_tree_up(
+ const AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask, int mask_stride,
+ int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, int allow_hp,
+ int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
+ int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, int is_second, int use_upsampled_ref) {
+ const uint8_t *const z = x->plane[0].src.buf;
+ const uint8_t *const src_address = z;
+ const int src_stride = x->plane[0].src.stride;
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ unsigned int thismse;
+
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ int iter;
+ int round = 3 - forced_stop;
+ int tr = br;
+ int tc = bc;
+ const MV *search_step = search_step_table;
+ int idx, best_idx = -1;
+ unsigned int cost_array[5];
+ int kr, kc;
+ const int w = block_size_wide[mbmi->sb_type];
+ const int h = block_size_high[mbmi->sb_type];
+ int offset;
+ int y_stride;
+ const uint8_t *y;
+
+ const struct buf_2d backup_pred = pd->pre[is_second];
+ int minc, maxc, minr, maxr;
+
+ av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
+ ref_mv);
+
+ if (use_upsampled_ref) {
+ int ref = xd->mi[0]->mbmi.ref_frame[is_second];
+ const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+ setup_pred_plane(&pd->pre[is_second], mbmi->sb_type,
+ upsampled_ref->y_buffer, upsampled_ref->y_crop_width,
+ upsampled_ref->y_crop_height, upsampled_ref->y_stride,
+ (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
+ pd->subsampling_y);
+ }
+ y = pd->pre[is_second].buf;
+ y_stride = pd->pre[is_second].stride;
+ offset = bestmv->row * y_stride + bestmv->col;
+
+ if (!allow_hp)
+ if (round == 3) round = 2;
+
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+
+  // In the upsampled reference, a full-pel step spans 8 samples, so the
+  // full-pel offset is scaled by 8 when indexing into it.
+ if (use_upsampled_ref)
+ besterr = upsampled_setup_masked_center_error(
+ xd, mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z,
+ src_stride, y, y_stride, w, h, (offset * 8), mvjcost, mvcost, sse1,
+ distortion);
+ else
+ besterr = setup_masked_center_error(
+ mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y,
+ y_stride, offset, mvjcost, mvcost, sse1, distortion);
+
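+  // Each pass halves hstep: 4 = 1/2 pel, 2 = 1/4 pel, 1 = 1/8 pel (in
+  // 1/8-pel units), iterating 'round' times.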
+ for (iter = 0; iter < round; ++iter) {
+ // Check vertical and horizontal sub-pixel positions.
+ for (idx = 0; idx < 4; ++idx) {
+ tr = br + search_step[idx].row;
+ tc = bc + search_step[idx].col;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_upsampled_ref) {
+ const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+ thismse = upsampled_masked_pref_error(
+ xd, mask, mask_stride, vfp, src_address, src_stride, pre_address,
+ y_stride, w, h, &sse);
+ } else {
+ const uint8_t *const pre_address =
+ y + (tr >> 3) * y_stride + (tc >> 3);
+ thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, src_stride, mask, mask_stride, &sse);
+ }
+
+ cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+ mvcost, error_per_bit);
+
+ if (cost_array[idx] < besterr) {
+ best_idx = idx;
+ besterr = cost_array[idx];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX;
+ }
+ }
+
+ // Check diagonal sub-pixel position
+ kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+ kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+ tc = bc + kc;
+ tr = br + kr;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_upsampled_ref) {
+ const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+ thismse = upsampled_masked_pref_error(
+ xd, mask, mask_stride, vfp, src_address, src_stride, pre_address,
+ y_stride, w, h, &sse);
+ } else {
+ const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+ thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), src_address,
+ src_stride, mask, mask_stride, &sse);
+ }
+
+ cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+ error_per_bit);
+
+ if (cost_array[4] < besterr) {
+ best_idx = 4;
+ besterr = cost_array[4];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+      cost_array[4] = INT_MAX;
+ }
+
+ if (best_idx < 4 && best_idx >= 0) {
+ br += search_step[best_idx].row;
+ bc += search_step[best_idx].col;
+ } else if (best_idx == 4) {
+ br = tr;
+ bc = tc;
+ }
+
+ if (iters_per_step > 1 && best_idx != -1) {
+ if (use_upsampled_ref) {
+ SECOND_LEVEL_CHECKS_BEST(1);
+ } else {
+ SECOND_LEVEL_CHECKS_BEST(0);
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ search_step += 4;
+ hstep >>= 1;
+ best_idx = -1;
+ }
+
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ if (use_upsampled_ref) {
+ pd->pre[is_second] = backup_pred;
+ }
+
+ return besterr;
+}
+
+#undef DIST
+#undef MVC
+#undef CHECK_BETTER
+
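+// Recomputes the masked variance plus MV rate cost for a full-pel MV, so
+// full-pel search results can be compared in RD terms.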
+static int get_masked_mvpred_var(const MACROBLOCK *x, const uint8_t *mask,
+ int mask_stride, const MV *best_mv,
+ const MV *center_mv,
+ const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost, int is_second) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ return vfp->mvf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
+ in_what->stride, mask, mask_stride, &unused) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
+int masked_refining_search_sad(const MACROBLOCK *x, const uint8_t *mask,
+ int mask_stride, MV *ref_mv, int error_per_bit,
+ int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, int is_second) {
+ const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ unsigned int best_sad =
+ fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ in_what->stride, mask, mask_stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+ int i, j;
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;
+
+ for (j = 0; j < 4; j++) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ unsigned int sad =
+ fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride, mask, mask_stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ ref_mv->row += neighbors[best_site].row;
+ ref_mv->col += neighbors[best_site].col;
+ }
+ }
+ return best_sad;
+}
+
+int masked_diamond_search_sad(const MACROBLOCK *x,
+ const search_site_config *cfg,
+ const uint8_t *mask, int mask_stride, MV *ref_mv,
+ MV *best_mv, int search_param, int sad_per_bit,
+ int *num00, const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, int is_second) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  // search_param determines the length of the initial step and hence the
+  // number of iterations: 0 = initial step (MAX_FIRST_STEP) pel,
+  // 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel, etc.
+ const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+ const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ const uint8_t *best_address, *in_what_ref;
+ int best_sad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+ int i, j, step;
+
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ in_what_ref = get_buf_from_mv(in_what, ref_mv);
+ best_address = in_what_ref;
+ *num00 = 0;
+ *best_mv = *ref_mv;
+
+ // Check the starting position
+ best_sad = fn_ptr->msdf(what->buf, what->stride, best_address,
+ in_what->stride, mask, mask_stride) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+ i = 1;
+
+ for (step = 0; step < tot_steps; step++) {
+ for (j = 0; j < cfg->searches_per_step; j++) {
+ const MV mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ int sad =
+ fn_ptr->msdf(what->buf, what->stride, best_address + ss[i].offset,
+ in_what->stride, mask, mask_stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = i;
+ }
+ }
+ }
+
+ i++;
+ }
+
+ if (best_site != last_site) {
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+ while (1) {
+ const MV this_mv = { best_mv->row + ss[best_site].mv.row,
+ best_mv->col + ss[best_site].mv.col };
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ int sad = fn_ptr->msdf(what->buf, what->stride,
+ best_address + ss[best_site].offset,
+ in_what->stride, mask, mask_stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+#endif
+ } else if (best_address == in_what_ref) {
+ (*num00)++;
+ }
+ }
+ return best_sad;
+}
+
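+// Runs the masked diamond search over successively smaller step sizes and
+// optionally finishes with a 1-away refining search.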
+int av1_masked_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
+ const uint8_t *mask, int mask_stride,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second) {
+ MV temp_mv;
+ int thissme, n, num00 = 0;
+ int bestsme = masked_diamond_search_sad(x, &cpi->ss_cfg, mask, mask_stride,
+ mvp_full, &temp_mv, step_param, sadpb,
+ &n, fn_ptr, ref_mv, is_second);
+ if (bestsme < INT_MAX)
+ bestsme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv,
+ fn_ptr, 1, is_second);
+ *dst_mv = temp_mv;
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ if (n > further_steps) do_refine = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ num00--;
+ } else {
+ thissme = masked_diamond_search_sad(
+ x, &cpi->ss_cfg, mask, mask_stride, mvp_full, &temp_mv,
+ step_param + n, sadpb, &num00, fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv,
+ fn_ptr, 1, is_second);
+
+ // check to see if refining search is needed.
+ if (num00 > further_steps - n) do_refine = 0;
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *dst_mv = temp_mv;
+ }
+ }
+ }
+
+ // final 1-away diamond refining search
+ if (do_refine) {
+ const int search_range = 8;
+ MV best_mv = *dst_mv;
+ thissme =
+ masked_refining_search_sad(x, mask, mask_stride, &best_mv, sadpb,
+ search_range, fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_masked_mvpred_var(x, mask, mask_stride, &best_mv, ref_mv,
+ fn_ptr, 1, is_second);
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *dst_mv = best_mv;
+ }
+ }
+ return bestsme;
+}
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR
+/* returns the OBMC sub-pixel variance error at (r, c) */
+#define DIST(r, c) \
+ vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, mask, &sse)
+
+/* estimates the rate cost of motion vector (r, c) in distortion units */
+#define MVC(r, c) \
+ (mvcost \
+ ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + mvcost[0][((r)-rr)] + \
+ mvcost[1][((c)-rc)]) * \
+ error_per_bit + \
+ 4096) >> \
+ 13 \
+ : 0)
+
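+/* checks if (r, c) has better score than previous best */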
+#define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = (DIST(r, c)); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#undef CHECK_BETTER0
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+#undef CHECK_BETTER1
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = upsampled_obmc_pref_error( \
+ xd, mask, vfp, z, upre(y, y_stride, r, c), y_stride, w, h, &sse); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
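+// OBMC variants of the center-error helpers: wsrc is the weighted
+// (overlap-predicted) source and mask holds the per-pixel OBMC weights.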
+static unsigned int setup_obmc_center_error(
+ const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
+ const uint8_t *const y, int y_stride, int offset, int *mvjcost,
+ int *mvcost[2], unsigned int *sse1, int *distortion) {
+ unsigned int besterr;
+ besterr = vfp->ovf(y + offset, y_stride, wsrc, mask, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
+static int upsampled_obmc_pref_error(const MACROBLOCKD *xd, const int32_t *mask,
+ const aom_variance_fn_ptr_t *vfp,
+ const int32_t *const wsrc,
+ const uint8_t *const y, int y_stride,
+ int w, int h, unsigned int *sse) {
+ unsigned int besterr;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+ aom_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+ besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+#else
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+ (void)xd;
+#endif // CONFIG_HIGHBITDEPTH
+ aom_upsampled_pred(pred, w, h, y, y_stride);
+
+ besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+ return besterr;
+}
+
+static unsigned int upsampled_setup_obmc_center_error(
+ const MACROBLOCKD *xd, const int32_t *mask, const MV *bestmv,
+ const MV *ref_mv, int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ const int32_t *const wsrc, const uint8_t *const y, int y_stride, int w,
+ int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
+ int *distortion) {
+ unsigned int besterr = upsampled_obmc_pref_error(
+ xd, mask, vfp, wsrc, y + offset, y_stride, w, h, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
+int av1_find_best_obmc_sub_pixel_tree_up(
+ const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, MV *bestmv,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second, int use_upsampled_ref) {
+ const int32_t *wsrc = x->wsrc_buf;
+ const int32_t *mask = x->mask_buf;
+ const int *const z = wsrc;
+ const int *const src_address = z;
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ unsigned int thismse;
+
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ int iter;
+ int round = 3 - forced_stop;
+ int tr = br;
+ int tc = bc;
+ const MV *search_step = search_step_table;
+ int idx, best_idx = -1;
+ unsigned int cost_array[5];
+ int kr, kc;
+ const int w = block_size_wide[mbmi->sb_type];
+ const int h = block_size_high[mbmi->sb_type];
+ int offset;
+ int y_stride;
+ const uint8_t *y;
+
+ const struct buf_2d backup_pred = pd->pre[is_second];
+ int minc, maxc, minr, maxr;
+
+ av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
+ ref_mv);
+
+ if (use_upsampled_ref) {
+ int ref = xd->mi[0]->mbmi.ref_frame[is_second];
+ const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+ setup_pred_plane(&pd->pre[is_second], mbmi->sb_type,
+ upsampled_ref->y_buffer, upsampled_ref->y_crop_width,
+ upsampled_ref->y_crop_height, upsampled_ref->y_stride,
+ (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
+ pd->subsampling_y);
+ }
+ y = pd->pre[is_second].buf;
+ y_stride = pd->pre[is_second].stride;
+ offset = bestmv->row * y_stride + bestmv->col;
+
+ if (!allow_hp)
+ if (round == 3) round = 2;
+
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+  // In the upsampled reference, a full-pel step spans 8 samples, so the
+  // full-pel offset is scaled by 8 when indexing into it.
+ if (use_upsampled_ref)
+ besterr = upsampled_setup_obmc_center_error(
+ xd, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, w, h,
+ (offset * 8), mvjcost, mvcost, sse1, distortion);
+ else
+ besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
+ z, y, y_stride, offset, mvjcost, mvcost,
+ sse1, distortion);
+
+ for (iter = 0; iter < round; ++iter) {
+ // Check vertical and horizontal sub-pixel positions.
+ for (idx = 0; idx < 4; ++idx) {
+ tr = br + search_step[idx].row;
+ tc = bc + search_step[idx].col;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_upsampled_ref) {
+ const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+ thismse = upsampled_obmc_pref_error(
+ xd, mask, vfp, src_address, pre_address, y_stride, w, h, &sse);
+ } else {
+ const uint8_t *const pre_address =
+ y + (tr >> 3) * y_stride + (tc >> 3);
+ thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, mask, &sse);
+ }
+
+ cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+ mvcost, error_per_bit);
+ if (cost_array[idx] < besterr) {
+ best_idx = idx;
+ besterr = cost_array[idx];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX;
+ }
+ }
+
+ // Check diagonal sub-pixel position
+ kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+ kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+ tc = bc + kc;
+ tr = br + kr;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_upsampled_ref) {
+ const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+ thismse = upsampled_obmc_pref_error(xd, mask, vfp, src_address,
+ pre_address, y_stride, w, h, &sse);
+ } else {
+ const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+ thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr), src_address,
+ mask, &sse);
+ }
+
+ cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+ error_per_bit);
+
+ if (cost_array[4] < besterr) {
+ best_idx = 4;
+ besterr = cost_array[4];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+      cost_array[4] = INT_MAX;
+ }
+
+ if (best_idx < 4 && best_idx >= 0) {
+ br += search_step[best_idx].row;
+ bc += search_step[best_idx].col;
+ } else if (best_idx == 4) {
+ br = tr;
+ bc = tc;
+ }
+
+ if (iters_per_step > 1 && best_idx != -1) {
+ if (use_upsampled_ref) {
+ SECOND_LEVEL_CHECKS_BEST(1);
+ } else {
+ SECOND_LEVEL_CHECKS_BEST(0);
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ search_step += 4;
+ hstep >>= 1;
+ best_idx = -1;
+ }
+
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ if (use_upsampled_ref) {
+ pd->pre[is_second] = backup_pred;
+ }
+
+ return besterr;
+}
+
+#undef DIST
+#undef MVC
+#undef CHECK_BETTER
+
+static int get_obmc_mvpred_var(const MACROBLOCK *x, const int32_t *wsrc,
+ const int32_t *mask, const MV *best_mv,
+ const MV *center_mv,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ int is_second) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ return vfp->ovf(get_buf_from_mv(in_what, best_mv), in_what->stride, wsrc,
+ mask, &unused) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
+int obmc_refining_search_sad(const MACROBLOCK *x, const int32_t *wsrc,
+ const int32_t *mask, MV *ref_mv, int error_per_bit,
+ int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, int is_second) {
+ const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ unsigned int best_sad = fn_ptr->osdf(get_buf_from_mv(in_what, ref_mv),
+ in_what->stride, wsrc, mask) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+ int i, j;
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;
+
+ for (j = 0; j < 4; j++) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ unsigned int sad = fn_ptr->osdf(get_buf_from_mv(in_what, &mv),
+ in_what->stride, wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ ref_mv->row += neighbors[best_site].row;
+ ref_mv->col += neighbors[best_site].col;
+ }
+ }
+ return best_sad;
+}
+
+int obmc_diamond_search_sad(const MACROBLOCK *x, const search_site_config *cfg,
+ const int32_t *wsrc, const int32_t *mask,
+ MV *ref_mv, MV *best_mv, int search_param,
+ int sad_per_bit, int *num00,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, int is_second) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  // search_param determines the length of the initial step and hence the
+  // number of iterations: 0 = initial step (MAX_FIRST_STEP) pel,
+  // 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel, etc.
+ const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+ const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ const uint8_t *best_address, *in_what_ref;
+ int best_sad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+ int i, j, step;
+
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ in_what_ref = in_what->buf + ref_mv->row * in_what->stride + ref_mv->col;
+ best_address = in_what_ref;
+ *num00 = 0;
+ *best_mv = *ref_mv;
+
+ // Check the starting position
+ best_sad = fn_ptr->osdf(best_address, in_what->stride, wsrc, mask) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+ i = 1;
+
+ for (step = 0; step < tot_steps; step++) {
+ for (j = 0; j < cfg->searches_per_step; j++) {
+ const MV mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ int sad = fn_ptr->osdf(best_address + ss[i].offset, in_what->stride,
+ wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = i;
+ }
+ }
+ }
+
+ i++;
+ }
+
+ if (best_site != last_site) {
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+ while (1) {
+ const MV this_mv = { best_mv->row + ss[best_site].mv.row,
+ best_mv->col + ss[best_site].mv.col };
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ int sad = fn_ptr->osdf(best_address + ss[best_site].offset,
+ in_what->stride, wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+#endif
+ } else if (best_address == in_what_ref) {
+ (*num00)++;
+ }
+ }
+ return best_sad;
+}
+
+int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second) {
+ const int32_t *wsrc = x->wsrc_buf;
+ const int32_t *mask = x->mask_buf;
+ MV temp_mv;
+ int thissme, n, num00 = 0;
+ int bestsme =
+ obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full, &temp_mv,
+ step_param, sadpb, &n, fn_ptr, ref_mv, is_second);
+ if (bestsme < INT_MAX)
+ bestsme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr, 1,
+ is_second);
+ *dst_mv = temp_mv;
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ if (n > further_steps) do_refine = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ num00--;
+ } else {
+ thissme = obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full,
+ &temp_mv, step_param + n, sadpb, &num00,
+ fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr,
+ 1, is_second);
+
+ // check to see if refining search is needed.
+ if (num00 > further_steps - n) do_refine = 0;
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *dst_mv = temp_mv;
+ }
+ }
+ }
+
+ // final 1-away diamond refining search
+ if (do_refine) {
+ const int search_range = 8;
+ MV best_mv = *dst_mv;
+ thissme = obmc_refining_search_sad(x, wsrc, mask, &best_mv, sadpb,
+ search_range, fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_obmc_mvpred_var(x, wsrc, mask, &best_mv, ref_mv, fn_ptr, 1,
+ is_second);
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *dst_mv = best_mv;
+ }
+ }
+ return bestsme;
+}
+#endif // CONFIG_MOTION_VAR
+
+// Note(yunqingwang): The following 2 functions are only used in the motion
+// vector unit test; they return the extreme motion vectors allowed by the
+// MV limits.
+#define COMMON_MV_TEST \
+ SETUP_SUBPEL_SEARCH; \
+ \
+ (void)error_per_bit; \
+ (void)vfp; \
+ (void)src_address; \
+ (void)src_stride; \
+ (void)y; \
+ (void)y_stride; \
+ (void)second_pred; \
+ (void)w; \
+ (void)h; \
+ (void)use_upsampled_ref; \
+ (void)offset; \
+ (void)mvjcost; \
+ (void)mvcost; \
+ (void)sse1; \
+ (void)distortion; \
+ \
+ (void)halfiters; \
+ (void)quarteriters; \
+ (void)eighthiters; \
+ (void)whichdir; \
+ (void)forced_stop; \
+ (void)hstep; \
+ \
+ (void)tr; \
+ (void)tc; \
+ (void)sse; \
+ (void)thismse; \
+ (void)cost_list;
+// Return the maximum MV.
+int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const MV *ref_mv, int allow_hp,
+ int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ COMMON_MV_TEST;
+ (void)minr;
+ (void)minc;
+ bestmv->row = maxr;
+ bestmv->col = maxc;
+ besterr = 0;
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp);
+ return besterr;
+}
+// Return the minimum MV.
+int av1_return_min_sub_pixel_mv(MACROBLOCK *x, const MV *ref_mv, int allow_hp,
+ int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ COMMON_MV_TEST;
+ (void)maxr;
+ (void)maxc;
+ bestmv->row = minr;
+ bestmv->col = minc;
+ besterr = 0;
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp);
+ return besterr;
+}
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
new file mode 100644
index 000000000..8465860ad
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_MCOMP_H_
+#define AV1_ENCODER_MCOMP_H_
+
+#include "av1/encoder/block.h"
+#include "aom_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The maximum number of steps in a step search given the largest
+// allowed initial step
+#define MAX_MVSEARCH_STEPS 11
+// Maximum full-pel MV magnitude, specified in full-pixel units.
+// Enables the use of motion vectors in the range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
+// Maximum size of the first step in full pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
+// Allowed motion vector pixel distance outside the image border
+// for 16x16 blocks
+#define BORDER_MV_PIXELS_B16 (16 + AOM_INTERP_EXTEND)
+
+// motion search site
+typedef struct search_site {
+ MV mv;
+ int offset;
+} search_site;
+
+typedef struct search_site_config {
+ search_site ss[8 * MAX_MVSEARCH_STEPS + 1];
+ int ss_count;
+ int searches_per_step;
+} search_site_config;
+
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
+void av1_init3smotion_compensation(search_site_config *cfg, int stride);
+
+void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv);
+
+int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
+ int *mvcost[2], int weight);
+
+// Utility to compute variance + MV rate cost for a given MV
+int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost);
+int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const uint8_t *second_pred,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost);
+
+struct AV1_COMP;
+struct SPEED_FEATURES;
+
+int av1_init_search_range(int size);
+
+int av1_refining_search_sad(struct macroblock *x, struct mv *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const struct mv *center_mv);
+
+// Runs sequence of diamond searches in smaller steps for RD.
+int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine, int *cost_list,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv);
+
+// Perform integral projection based motion estimation.
+unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col);
+
+int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv);
+
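+// Function pointer type for sub-pixel MV refinement; the "pruned" variants
+// below trade search accuracy for speed by skipping candidate positions.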
+typedef int(fractional_mv_step_fp)(
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w,
+ int h, int use_upsampled_ref);
+
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_evenmore;
+extern fractional_mv_step_fp av1_return_max_sub_pixel_mv;
+extern fractional_mv_step_fp av1_return_min_sub_pixel_mv;
+
+typedef int (*av1_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, MV *best_mv);
+
+typedef int (*av1_diamond_search_fn_t)(
+ MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ const aom_variance_fn_ptr_t *fn_ptr, const MV *center_mv);
+
+int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, const uint8_t *second_pred);
+
+int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+ int error_per_bit, int *cost_list, const MV *ref_mv,
+ int var_max, int rd);
+
+#if CONFIG_EXT_INTER
+int av1_find_best_masked_sub_pixel_tree(
+ const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second);
+int av1_find_best_masked_sub_pixel_tree_up(
+ const struct AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask,
+ int mask_stride, int mi_row, int mi_col, MV *bestmv, const MV *ref_mv,
+ int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1, int is_second, int use_upsampled_ref);
+int av1_masked_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ const uint8_t *mask, int mask_stride,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second);
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR
+int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second);
+int av1_find_best_obmc_sub_pixel_tree_up(
+ const struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
+ MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second, int use_upsampled_ref);
+#endif // CONFIG_MOTION_VAR
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_MCOMP_H_
diff --git a/third_party/aom/av1/encoder/mips/msa/error_msa.c b/third_party/aom/av1/encoder/mips/msa/error_msa.c
new file mode 100644
index 000000000..8d13af7ad
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/error_msa.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "aom_dsp/mips/macros_msa.h"
+
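+// Defines an MSA-vectorized block-error kernel for a fixed block size: it
+// returns the sum of squared (coeff - dqcoeff) differences and stores the
+// sum of squared coefficients in *ssz, 16 coefficients per iteration.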
+#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
+ static int64_t block_error_##BSize##size_msa( \
+ const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \
+ int64_t err = 0; \
+ uint32_t loop_cnt; \
+ v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \
+ v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \
+ v2i64 sq_coeff_r, sq_coeff_l; \
+ v2i64 err0, err_dup0, err1, err_dup1; \
+ \
+ coeff = LD_SH(coeff_ptr); \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r, \
+ sq_coeff_l); \
+ DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ \
+ for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \
+ coeff = LD_SH(coeff_ptr); \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ } \
+ \
+ err_dup0 = __msa_splati_d(sq_coeff_r, 1); \
+ err_dup1 = __msa_splati_d(sq_coeff_l, 1); \
+ sq_coeff_r += err_dup0; \
+ sq_coeff_l += err_dup1; \
+ *ssz = __msa_copy_s_d(sq_coeff_r, 0); \
+ *ssz += __msa_copy_s_d(sq_coeff_l, 0); \
+ \
+ err_dup0 = __msa_splati_d(err0, 1); \
+ err_dup1 = __msa_splati_d(err1, 1); \
+ err0 += err_dup0; \
+ err1 += err_dup1; \
+ err = __msa_copy_s_d(err0, 0); \
+ err += __msa_copy_s_d(err1, 0); \
+ \
+ return err; \
+ }
+
+/* clang-format off */
+BLOCK_ERROR_BLOCKSIZE_MSA(16)
+BLOCK_ERROR_BLOCKSIZE_MSA(64)
+BLOCK_ERROR_BLOCKSIZE_MSA(256)
+BLOCK_ERROR_BLOCKSIZE_MSA(1024)
+/* clang-format on */
+
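+// Dispatch to the size-specialized MSA kernel; block sizes without a
+// specialized kernel fall back to the C implementation.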
+int64_t av1_block_error_msa(const tran_low_t *coeff_ptr,
+ const tran_low_t *dq_coeff_ptr, intptr_t blk_size,
+ int64_t *ssz) {
+ int64_t err;
+ const int16_t *coeff = (const int16_t *)coeff_ptr;
+ const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr;
+
+ switch (blk_size) {
+ case 16: err = block_error_16size_msa(coeff, dq_coeff, ssz); break;
+ case 64: err = block_error_64size_msa(coeff, dq_coeff, ssz); break;
+ case 256: err = block_error_256size_msa(coeff, dq_coeff, ssz); break;
+ case 1024: err = block_error_1024size_msa(coeff, dq_coeff, ssz); break;
+ default:
+ err = av1_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
+ break;
+ }
+
+ return err;
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c
new file mode 100644
index 000000000..4b0364d6c
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+#include "av1/encoder/mips/msa/fdct_msa.h"
+#include "aom_dsp/mips/fwd_txfm_msa.h"
+
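+// First step of the 16-point ADST applied to columns: rows are loaded in
+// butterfly order, pre-scaled by 4 (<< 2), and the stage-1/2 intermediates
+// are written to int_buf for the second step.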
+static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
+ const int32_t *const0, int16_t *int_buf) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+ v4i32 k0, k1, k2, k3;
+
+ /* load input data */
+ r0 = LD_SH(input);
+ r15 = LD_SH(input + 15 * stride);
+ r7 = LD_SH(input + 7 * stride);
+ r8 = LD_SH(input + 8 * stride);
+ SLLI_4V(r0, r15, r7, r8, 2);
+
+ /* stage 1 */
+ LD_SW2(const0, 4, k0, k1);
+ LD_SW2(const0 + 8, 4, k2, k3);
+ MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ r3 = LD_SH(input + 3 * stride);
+ r4 = LD_SH(input + 4 * stride);
+ r11 = LD_SH(input + 11 * stride);
+ r12 = LD_SH(input + 12 * stride);
+ SLLI_4V(r3, r4, r11, r12, 2);
+
+ LD_SW2(const0 + 4 * 4, 4, k0, k1);
+ LD_SW2(const0 + 4 * 6, 4, k2, k3);
+ MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+ /* stage 2 */
+ BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+ ST_SH2(tp0, tp2, int_buf, 8);
+ ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);
+
+ LD_SW2(const0 + 4 * 8, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 10);
+ MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+ ST_SH2(h0, h1, int_buf + 8 * 8, 8);
+ ST_SH2(h3, h2, int_buf + 12 * 8, 8);
+
+ r9 = LD_SH(input + 9 * stride);
+ r6 = LD_SH(input + 6 * stride);
+ r1 = LD_SH(input + stride);
+ r14 = LD_SH(input + 14 * stride);
+ SLLI_4V(r9, r6, r1, r14, 2);
+
+ LD_SW2(const0 + 4 * 11, 4, k0, k1);
+ LD_SW2(const0 + 4 * 13, 4, k2, k3);
+ MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+ r13 = LD_SH(input + 13 * stride);
+ r2 = LD_SH(input + 2 * stride);
+ r5 = LD_SH(input + 5 * stride);
+ r10 = LD_SH(input + 10 * stride);
+ SLLI_4V(r13, r2, r5, r10, 2);
+
+ LD_SW2(const0 + 4 * 15, 4, k0, k1);
+ LD_SW2(const0 + 4 * 17, 4, k2, k3);
+ MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+
+ ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+
+ BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+ ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_step2_msa_helper(int16_t *int_buf, const int32_t *const0,
+ int16_t *out, int16_t *out_ptr) {
+ v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+ v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+ v4i32 k0, k1, k2, k3;
+
+ LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
+ LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
+ LD_SW2(const0 + 4 * 19, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 21);
+ MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+ tp0 = LD_SH(int_buf + 4 * 8);
+ tp1 = LD_SH(int_buf + 5 * 8);
+ tp3 = LD_SH(int_buf + 10 * 8);
+ tp2 = LD_SH(int_buf + 14 * 8);
+ LD_SW2(const0 + 4 * 22, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 24);
+ MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+ out4 = -out4;
+ ST_SH(out4, (out + 3 * 16));
+ ST_SH(out5, (out_ptr + 4 * 16));
+
+ h1 = LD_SH(int_buf + 9 * 8);
+ h3 = LD_SH(int_buf + 12 * 8);
+ MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+ out13 = -out13;
+ ST_SH(out12, (out + 2 * 16));
+ ST_SH(out13, (out_ptr + 5 * 16));
+
+ tp0 = LD_SH(int_buf);
+ tp1 = LD_SH(int_buf + 8);
+ tp2 = LD_SH(int_buf + 2 * 8);
+ tp3 = LD_SH(int_buf + 6 * 8);
+
+ BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+ out1 = -out1;
+ ST_SH(out0, (out));
+ ST_SH(out1, (out_ptr + 7 * 16));
+
+ h0 = LD_SH(int_buf + 8 * 8);
+ h2 = LD_SH(int_buf + 13 * 8);
+
+ BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+ out8 = -out8;
+ ST_SH(out8, (out + 16));
+ ST_SH(out9, (out_ptr + 6 * 16));
+
+ /* stage 4 */
+ LD_SW2(const0 + 4 * 25, 4, k0, k1);
+ LD_SW2(const0 + 4 * 27, 4, k2, k3);
+ MADD_SHORT(h10, h11, k1, k2, out2, out3);
+ ST_SH(out2, (out + 7 * 16));
+ ST_SH(out3, (out_ptr));
+
+ MADD_SHORT(out6, out7, k0, k3, out6, out7);
+ ST_SH(out6, (out + 4 * 16));
+ ST_SH(out7, (out_ptr + 3 * 16));
+
+ MADD_SHORT(out10, out11, k0, k3, out10, out11);
+ ST_SH(out10, (out + 6 * 16));
+ ST_SH(out11, (out_ptr + 16));
+
+ MADD_SHORT(out14, out15, k1, k2, out14, out15);
+ ST_SH(out14, (out + 5 * 16));
+ ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
+ int16_t *out) {
+ fadst16_step2_msa_helper(int_buf, const0, out, out + 128);
+}
+
+static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data */
+ LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ FDCT_POSTPROC_2V_NEG_H(r0, r1);
+ FDCT_POSTPROC_2V_NEG_H(r2, r3);
+ FDCT_POSTPROC_2V_NEG_H(r4, r5);
+ FDCT_POSTPROC_2V_NEG_H(r6, r7);
+ ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+ out += 64;
+
+ LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ FDCT_POSTPROC_2V_NEG_H(r8, r9);
+ FDCT_POSTPROC_2V_NEG_H(r10, r11);
+ FDCT_POSTPROC_2V_NEG_H(r12, r13);
+ FDCT_POSTPROC_2V_NEG_H(r14, r15);
+ ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+ out += 64;
+
+ /* load input data */
+ input += 128;
+ LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ FDCT_POSTPROC_2V_NEG_H(r0, r1);
+ FDCT_POSTPROC_2V_NEG_H(r2, r3);
+ FDCT_POSTPROC_2V_NEG_H(r4, r5);
+ FDCT_POSTPROC_2V_NEG_H(r6, r7);
+ ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+ out += 64;
+
+ LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ FDCT_POSTPROC_2V_NEG_H(r8, r9);
+ FDCT_POSTPROC_2V_NEG_H(r10, r11);
+ FDCT_POSTPROC_2V_NEG_H(r12, r13);
+ FDCT_POSTPROC_2V_NEG_H(r14, r15);
+ ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+}
+
+static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
+ int16_t *int_buf) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+ v4i32 k0, k1, k2, k3;
+
+ /* load input data */
+ r0 = LD_SH(input);
+ r7 = LD_SH(input + 7 * 8);
+ r8 = LD_SH(input + 8 * 8);
+ r15 = LD_SH(input + 15 * 8);
+
+ /* stage 1 */
+ LD_SW2(const0, 4, k0, k1);
+ LD_SW2(const0 + 4 * 2, 4, k2, k3);
+ MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ r3 = LD_SH(input + 3 * 8);
+ r4 = LD_SH(input + 4 * 8);
+ r11 = LD_SH(input + 11 * 8);
+ r12 = LD_SH(input + 12 * 8);
+
+ LD_SW2(const0 + 4 * 4, 4, k0, k1);
+ LD_SW2(const0 + 4 * 6, 4, k2, k3);
+ MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+ /* stage 2 */
+ BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+ ST_SH2(tp0, tp1, int_buf, 4 * 8);
+ ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);
+
+ LD_SW2(const0 + 4 * 8, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 10);
+ MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+ ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
+ ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);
+
+ r1 = LD_SH(input + 8);
+ r6 = LD_SH(input + 6 * 8);
+ r9 = LD_SH(input + 9 * 8);
+ r14 = LD_SH(input + 14 * 8);
+
+ LD_SW2(const0 + 4 * 11, 4, k0, k1);
+ LD_SW2(const0 + 4 * 13, 4, k2, k3);
+ MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+ ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+ r2 = LD_SH(input + 2 * 8);
+ r5 = LD_SH(input + 5 * 8);
+ r10 = LD_SH(input + 10 * 8);
+ r13 = LD_SH(input + 13 * 8);
+
+ LD_SW2(const0 + 4 * 15, 4, k0, k1);
+ LD_SW2(const0 + 4 * 17, 4, k2, k3);
+ MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+ ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+ BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+ ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,
+ int16_t *out) {
+ fadst16_step2_msa_helper(int_buf, const0, out, out + 8);
+}
+
+static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data */
+ LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
+ l7, l15);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+ ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+ out += 16 * 8;
+
+ /* load input data */
+ input += 128;
+ LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
+ l7, l15);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+ ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+}
+
+static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
+ int16_t *temp = intermediate;
+ int16_t *out = output;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
+ v8i16 in12, in13, in14, in15;
+
+ LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+ temp = intermediate + 8;
+ LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ FDCT_POSTPROC_2V_NEG_H(in0, in1);
+ FDCT_POSTPROC_2V_NEG_H(in2, in3);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ FDCT_POSTPROC_2V_NEG_H(in6, in7);
+ FDCT_POSTPROC_2V_NEG_H(in8, in9);
+ FDCT_POSTPROC_2V_NEG_H(in10, in11);
+ FDCT_POSTPROC_2V_NEG_H(in12, in13);
+ FDCT_POSTPROC_2V_NEG_H(in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
+ tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
+ temp = intermediate;
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ temp = intermediate;
+ LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+ tmp1, in1, tmp2, in2, tmp3, in3);
+ ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
+ TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+ tmp5, in5, tmp6, in6, tmp7, in7);
+ out = output + 8;
+ ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
+}
+
+void av1_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride,
+ int32_t tx_type) {
+ DECLARE_ALIGNED(32, int16_t, tmp[256]);
+ DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
+ int32_t i;
+ int16_t *ptmpbuf = &tmp_buf[0];
+ int16_t *trans = &trans_buf[0];
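+  // Packed trig constants for the ADST butterflies: each 32-bit entry holds
+  // a pair of 16-bit cospi values, and each constant is repeated four times
+  // so a single vector load fills all lanes.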
+ const int32_t const_arr[29 * 4] = {
+ 52707308, 52707308, 52707308, 52707308, -1072430300,
+ -1072430300, -1072430300, -1072430300, 795618043, 795618043,
+ 795618043, 795618043, -721080468, -721080468, -721080468,
+ -721080468, 459094491, 459094491, 459094491, 459094491,
+ -970646691, -970646691, -970646691, -970646691, 1010963856,
+ 1010963856, 1010963856, 1010963856, -361743294, -361743294,
+ -361743294, -361743294, 209469125, 209469125, 209469125,
+ 209469125, -1053094788, -1053094788, -1053094788, -1053094788,
+ 1053160324, 1053160324, 1053160324, 1053160324, 639644520,
+ 639644520, 639644520, 639644520, -862444000, -862444000,
+ -862444000, -862444000, 1062144356, 1062144356, 1062144356,
+ 1062144356, -157532337, -157532337, -157532337, -157532337,
+ 260914709, 260914709, 260914709, 260914709, -1041559667,
+ -1041559667, -1041559667, -1041559667, 920985831, 920985831,
+ 920985831, 920985831, -551995675, -551995675, -551995675,
+ -551995675, 596522295, 596522295, 596522295, 596522295,
+ 892853362, 892853362, 892853362, 892853362, -892787826,
+ -892787826, -892787826, -892787826, 410925857, 410925857,
+ 410925857, 410925857, -992012162, -992012162, -992012162,
+ -992012162, 992077698, 992077698, 992077698, 992077698,
+ 759246145, 759246145, 759246145, 759246145, -759180609,
+ -759180609, -759180609, -759180609, -759222975, -759222975,
+ -759222975, -759222975, 759288511, 759288511, 759288511,
+ 759288511
+ };
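+ /* Each 32-bit constant above is replicated four times, one per v4i32 lane;
+ the values appear to pack pairs of 16-bit trigonometric coefficients
+ consumed by the fadst16 column/row helpers below. */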
+
+ switch (tx_type) {
+ case DCT_DCT:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+ }
+ break;
+ case ADST_DCT:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+ fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+ }
+ break;
+ case DCT_ADST:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+ }
+
+ fadst16_transpose_postproc_msa(tmp, trans);
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+ fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+ }
+
+ fadst16_transpose_msa(tmp, output);
+ break;
+ case ADST_ADST:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+ fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+ }
+
+ fadst16_transpose_postproc_msa(tmp, trans);
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+ fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+ }
+
+ fadst16_transpose_msa(tmp, output);
+ break;
+ default: assert(0); break;
+ }
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c
new file mode 100644
index 000000000..da1ac74f0
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+#include "av1/encoder/mips/msa/fdct_msa.h"
+
+void av1_fwht4x4_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ v8i16 in0, in1, in2, in3, in4;
+
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+ in0 += in1;
+ in3 -= in2;
+ in4 = (in0 - in3) >> 1;
+ SUB2(in4, in1, in4, in2, in1, in2);
+ in0 -= in2;
+ in3 += in1;
+
+ TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+
+ in0 += in2;
+ in1 -= in3;
+ in4 = (in0 - in1) >> 1;
+ SUB2(in4, in2, in4, in3, in2, in3);
+ in0 -= in3;
+ in1 += in2;
+
+ SLLI_4V(in0, in1, in2, in3, 2);
+
+ TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
+
+ ST4x2_UB(in0, output, 4);
+ ST4x2_UB(in3, output + 4, 4);
+ ST4x2_UB(in1, output + 8, 4);
+ ST4x2_UB(in2, output + 12, 4);
+}
+
+void av1_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride,
+ int32_t tx_type) {
+ v8i16 in0, in1, in2, in3;
+
+ LD_SH4(input, stride, in0, in1, in2, in3);
+
+ /* fdct4 pre-process */
+ {
+ v8i16 temp, mask;
+ v16i8 zero = { 0 };
+ v16i8 one = __msa_ldi_b(1);
+
+ mask = (v8i16)__msa_sldi_b(zero, one, 15);
+ SLLI_4V(in0, in1, in2, in3, 4);
+ temp = __msa_ceqi_h(in0, 0);
+ temp = (v8i16)__msa_xori_b((v16u8)temp, 255);
+ temp = mask & temp;
+ in0 += temp;
+ }
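+ /* The block above mirrors the scalar fdct4x4 pre-processing: inputs are
+ scaled up by 16 (<< 4) and the very first sample is incremented by 1 when
+ it is non-zero; the sldi_b/ceqi_h mask selects only that lane of in0. */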
+
+ switch (tx_type) {
+ case DCT_DCT:
+ AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_DCT:
+ AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_ADST:
+ AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ default: assert(0); break;
+ }
+
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ SRA_4V(in0, in1, in2, in3, 2);
+ PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
+ ST_SH2(in0, in2, output, 8);
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c
new file mode 100644
index 000000000..4cbf60a11
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+#include "av1/encoder/mips/msa/fdct_msa.h"
+
+void av1_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride,
+ int32_t tx_type) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+ LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ case ADST_DCT:
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ case DCT_ADST:
+ AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ case ADST_ADST:
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ default: assert(0); break;
+ }
+
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+ ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct_msa.h b/third_party/aom/av1/encoder/mips/msa/fdct_msa.h
new file mode 100644
index 000000000..52bcf790c
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/fdct_msa.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_
+#define AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_
+
+#include "aom_dsp/mips/fwd_txfm_msa.h"
+#include "aom_dsp/mips/txfm_macros_msa.h"
+#include "aom_ports/mem.h"
+
+#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
+ v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
+ cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
+ v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \
+ cospi_24_64, -cospi_24_64, 0, 0 }; \
+ \
+ SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
+ cnst2_m, cnst3_m, in7, in0, in4, in3); \
+ \
+ SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
+ cnst2_m, cnst3_m, in5, in2, in6, in1); \
+ BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
+ out7 = -s0_m; \
+ out0 = s1_m; \
+ \
+ SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
+ \
+ ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ cnst1_m = cnst0_m; \
+ \
+ ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \
+ cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \
+ \
+ SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
+ \
+ out1 = -out1; \
+ out3 = -out3; \
+ out5 = -out5; \
+ }
+
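+// Math computed by AOM_FADST4 below (derived from the code; sk = sinpi_k_9,
+// inputs x0..x3, each output rounded by DCT_CONST_BITS before packing):
+//   out0 = s1*x0 + s2*x1 + s3*x2 + s4*x3
+//   out1 = s3*(x0 + x1 - x3)
+//   out2 = s4*x0 - s1*x1 - s3*x2 + s2*x3
+//   out3 = (s4 - s1)*x0 - (s1 + s2)*x1 + s3*x2 + (s2 - s4)*x3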
+#define AOM_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \
+ v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \
+ \
+ UNPCK_R_SH_SW(in0, in0_r_m); \
+ UNPCK_R_SH_SW(in1, in1_r_m); \
+ UNPCK_R_SH_SW(in2, in2_r_m); \
+ UNPCK_R_SH_SW(in3, in3_r_m); \
+ \
+ constant_m = __msa_fill_w(sinpi_4_9); \
+ MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); \
+ \
+ constant_m = __msa_fill_w(sinpi_1_9); \
+ s0_m += in0_r_m * constant_m; \
+ s1_m -= in1_r_m * constant_m; \
+ \
+ constant_m = __msa_fill_w(sinpi_2_9); \
+ s0_m += in1_r_m * constant_m; \
+ s1_m += in3_r_m * constant_m; \
+ \
+ s2_m = in0_r_m + in1_r_m - in3_r_m; \
+ \
+ constant_m = __msa_fill_w(sinpi_3_9); \
+ MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m); \
+ \
+ in0_r_m = s0_m + s3_m; \
+ s2_m = s1_m - s3_m; \
+ s3_m = s1_m - s0_m + s3_m; \
+ \
+ SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS); \
+ PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \
+ out0, out1, out2, out3); \
+ }
+#endif // AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_
diff --git a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 000000000..4ec679642
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "aom_dsp/mips/macros_msa.h"
+
+static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride,
+ uint8_t *frm2_ptr, int32_t filt_sth,
+ int32_t filt_wgt, uint32_t *acc,
+ uint16_t *cnt) {
+ uint32_t row;
+ uint64_t f0, f1, f2, f3;
+ v16i8 frm2, frm1 = { 0 };
+ v16i8 frm4, frm3 = { 0 };
+ v16u8 frm_r, frm_l;
+ v8i16 frm2_r, frm2_l;
+ v8i16 diff0, diff1, mod0_h, mod1_h;
+ v4i32 cnst3, cnst16, filt_wt, strength;
+ v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+ v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+ v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+ v4i32 acc0, acc1, acc2, acc3;
+ v8i16 cnt0, cnt1;
+
+ filt_wt = __msa_fill_w(filt_wgt);
+ strength = __msa_fill_w(filt_sth);
+ cnst3 = __msa_ldi_w(3);
+ cnst16 = __msa_ldi_w(16);
+
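+ /* Per-pixel rule, following the C fallback av1_temporal_filter_apply_c:
+ diff = frm1 - frm2; mod = rounded((3 * diff * diff) >> strength);
+ weight = (16 - mod) * filt_wgt, or 0 when mod >= 16;
+ cnt += weight; acc += weight * frm2.
+ The (mod < cnst16) comparisons below implement the clamp as a mask. */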
+ for (row = 2; row--;) {
+ LD4(frm1_ptr, stride, f0, f1, f2, f3);
+ frm1_ptr += (4 * stride);
+
+ LD_SB2(frm2_ptr, 16, frm2, frm4);
+ frm2_ptr += 32;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ INSERT_D2_SB(f0, f1, frm1);
+ INSERT_D2_SB(f2, f3, frm3);
+ ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ UNPCK_UB_SH(frm2, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+ UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+ }
+}
+
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
+ uint8_t *frm2_ptr,
+ int32_t filt_sth, int32_t filt_wgt,
+ uint32_t *acc, uint16_t *cnt) {
+ uint32_t row;
+ v16i8 frm1, frm2, frm3, frm4;
+ v16u8 frm_r, frm_l;
+ v16i8 zero = { 0 };
+ v8u16 frm2_r, frm2_l;
+ v8i16 diff0, diff1, mod0_h, mod1_h;
+ v4i32 cnst3, cnst16, filt_wt, strength;
+ v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+ v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+ v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+ v4i32 acc0, acc1, acc2, acc3;
+ v8i16 cnt0, cnt1;
+
+ filt_wt = __msa_fill_w(filt_wgt);
+ strength = __msa_fill_w(filt_sth);
+ cnst3 = __msa_ldi_w(3);
+ cnst16 = __msa_ldi_w(16);
+
+ for (row = 8; row--;) {
+ LD_SB2(frm1_ptr, stride, frm1, frm3);
+ frm1_ptr += stride;
+
+ LD_SB2(frm2_ptr, 16, frm2, frm4);
+ frm2_ptr += 16;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ frm1_ptr += stride;
+ frm2_ptr += 16;
+ }
+}
+
+void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
+ uint8_t *frame2_ptr, uint32_t blk_w,
+ uint32_t blk_h, int32_t strength,
+ int32_t filt_wgt, uint32_t *accu,
+ uint16_t *cnt) {
+ if (8 == (blk_w * blk_h)) {
+ temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
+ filt_wgt, accu, cnt);
+ } else if (16 == (blk_w * blk_h)) {
+ temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
+ filt_wgt, accu, cnt);
+ } else {
+ av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
+ strength, filt_wgt, accu, cnt);
+ }
+}
diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c
new file mode 100644
index 000000000..355141de5
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/palette.h"
+
+static float calc_dist(const float *p1, const float *p2, int dim) {
+ float dist = 0;
+ int i;
+ for (i = 0; i < dim; ++i) {
+ const float diff = p1[i] - p2[i];
+ dist += diff * diff;
+ }
+ return dist;
+}
+
+void av1_calc_indices(const float *data, const float *centroids,
+ uint8_t *indices, int n, int k, int dim) {
+ int i, j;
+ for (i = 0; i < n; ++i) {
+ float min_dist = calc_dist(data + i * dim, centroids, dim);
+ indices[i] = 0;
+ for (j = 1; j < k; ++j) {
+ const float this_dist =
+ calc_dist(data + i * dim, centroids + j * dim, dim);
+ if (this_dist < min_dist) {
+ min_dist = this_dist;
+ indices[i] = j;
+ }
+ }
+ }
+}
+
+// Generate a random number in the range [0, 32768).
+static unsigned int lcg_rand16(unsigned int *state) {
+ *state = (unsigned int)(*state * 1103515245ULL + 12345);
+ return *state / 65536 % 32768;
+}
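+// The multiplier/increment pair (1103515245, 12345) is the classic ANSI C
+// LCG; dividing by 65536 then taking % 32768 keeps bits 16..30 of the
+// state, avoiding the weak low-order bits.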
+
+static void calc_centroids(const float *data, float *centroids,
+ const uint8_t *indices, int n, int k, int dim) {
+ int i, j, index;
+ int count[PALETTE_MAX_SIZE];
+ unsigned int rand_state = (unsigned int)data[0];
+
+ assert(n <= 32768);
+
+ memset(count, 0, sizeof(count[0]) * k);
+ memset(centroids, 0, sizeof(centroids[0]) * k * dim);
+
+ for (i = 0; i < n; ++i) {
+ index = indices[i];
+ assert(index < k);
+ ++count[index];
+ for (j = 0; j < dim; ++j) {
+ centroids[index * dim + j] += data[i * dim + j];
+ }
+ }
+
+ for (i = 0; i < k; ++i) {
+ if (count[i] == 0) {
+ memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim,
+ sizeof(centroids[0]) * dim);
+ } else {
+ const float norm = 1.0f / count[i];
+ for (j = 0; j < dim; ++j) centroids[i * dim + j] *= norm;
+ }
+ }
+
+ // Round to nearest integers.
+ for (i = 0; i < k * dim; ++i) {
+ centroids[i] = roundf(centroids[i]);
+ }
+}
+
+static float calc_total_dist(const float *data, const float *centroids,
+ const uint8_t *indices, int n, int k, int dim) {
+ float dist = 0;
+ int i;
+ (void)k;
+
+ for (i = 0; i < n; ++i)
+ dist += calc_dist(data + i * dim, centroids + indices[i] * dim, dim);
+
+ return dist;
+}
+
+void av1_k_means(const float *data, float *centroids, uint8_t *indices, int n,
+ int k, int dim, int max_itr) {
+ int i;
+ float this_dist;
+ float pre_centroids[2 * PALETTE_MAX_SIZE];
+ uint8_t pre_indices[MAX_SB_SQUARE];
+
+ av1_calc_indices(data, centroids, indices, n, k, dim);
+ this_dist = calc_total_dist(data, centroids, indices, n, k, dim);
+
+ for (i = 0; i < max_itr; ++i) {
+ const float pre_dist = this_dist;
+ memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
+ memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+
+ calc_centroids(data, centroids, indices, n, k, dim);
+ av1_calc_indices(data, centroids, indices, n, k, dim);
+ this_dist = calc_total_dist(data, centroids, indices, n, k, dim);
+
+ if (this_dist > pre_dist) {
+ memcpy(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim);
+ memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
+ break;
+ }
+ if (!memcmp(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim))
+ break;
+ }
+}
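+/* Illustrative sketch (hypothetical values, not part of the import):
+ cluster four 1-D samples into a k = 2 palette.
+
+ float data[4] = { 10.f, 12.f, 200.f, 202.f };
+ float centroids[2] = { 0.f, 255.f }; // initial guess
+ uint8_t indices[4];
+ av1_k_means(data, centroids, indices, 4, 2, 1, 50);
+ // centroids now hold the rounded cluster means ({ 11, 201 } here) and
+ // indices[i] maps each sample to its nearest centroid.
+*/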
+
+static int float_comparer(const void *a, const void *b) {
+ const float fa = *(const float *)a;
+ const float fb = *(const float *)b;
+ return (fa > fb) - (fa < fb);
+}
+
+int av1_remove_duplicates(float *centroids, int num_centroids) {
+ int num_unique; // number of unique centroids
+ int i;
+ qsort(centroids, num_centroids, sizeof(*centroids), float_comparer);
+ // Remove duplicates.
+ num_unique = 1;
+ for (i = 1; i < num_centroids; ++i) {
+ if (centroids[i] != centroids[i - 1]) { // found a new unique centroid
+ centroids[num_unique++] = centroids[i];
+ }
+ }
+ return num_unique;
+}
+
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) {
+ int n = 0, r, c, i, val_count[256];
+ uint8_t val;
+ memset(val_count, 0, sizeof(val_count));
+
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ val = src[r * stride + c];
+ ++val_count[val];
+ }
+ }
+
+ for (i = 0; i < 256; ++i) {
+ if (val_count[i]) {
+ ++n;
+ }
+ }
+
+ return n;
+}
+
+#if CONFIG_PALETTE_DELTA_ENCODING
+int av1_get_palette_delta_bits_y(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *min_bits) {
+ const int n = pmi->palette_size[0];
+ int max_d = 0, i;
+ *min_bits = bit_depth - 3;
+ for (i = 1; i < n; ++i) {
+ const int delta = pmi->palette_colors[i] - pmi->palette_colors[i - 1];
+ assert(delta > 0);
+ if (delta > max_d) max_d = delta;
+ }
+ return AOMMAX(av1_ceil_log2(max_d), *min_bits);
+}
+
+int av1_get_palette_delta_bits_u(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *min_bits) {
+ const int n = pmi->palette_size[1];
+ int max_d = 0, i;
+ *min_bits = bit_depth - 3;
+ for (i = 1; i < n; ++i) {
+ const int delta = pmi->palette_colors[PALETTE_MAX_SIZE + i] -
+ pmi->palette_colors[PALETTE_MAX_SIZE + i - 1];
+ assert(delta >= 0);
+ if (delta > max_d) max_d = delta;
+ }
+ return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits);
+}
+
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *zero_count,
+ int *min_bits) {
+ const int n = pmi->palette_size[1];
+ const int max_val = 1 << bit_depth;
+ int max_d = 0, i;
+ *min_bits = bit_depth - 4;
+ *zero_count = 0;
+ for (i = 1; i < n; ++i) {
+ const int delta = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] -
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1];
+ const int v = abs(delta);
+ const int d = AOMMIN(v, max_val - v);
+ if (d > max_d) max_d = d;
+ if (d == 0) ++(*zero_count);
+ }
+ return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits);
+}
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth) {
+ const int n = pmi->palette_size[0];
+#if CONFIG_PALETTE_DELTA_ENCODING
+ int min_bits = 0;
+ const int bits = av1_get_palette_delta_bits_y(pmi, bit_depth, &min_bits);
+ return av1_cost_bit(128, 0) * (2 + bit_depth + bits * (n - 1));
+#else
+ return bit_depth * n * av1_cost_bit(128, 0);
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+}
+
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth) {
+ const int n = pmi->palette_size[1];
+#if CONFIG_PALETTE_DELTA_ENCODING
+ int cost = 0;
+ // U channel palette color cost.
+ int min_bits_u = 0;
+ const int bits_u = av1_get_palette_delta_bits_u(pmi, bit_depth, &min_bits_u);
+ cost += av1_cost_bit(128, 0) * (2 + bit_depth + bits_u * (n - 1));
+ // V channel palette color cost.
+ int zero_count = 0, min_bits_v = 0;
+ const int bits_v =
+ av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
+ const int bits_using_delta =
+ 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
+ const int bits_using_raw = bit_depth * n;
+ cost += av1_cost_bit(128, 0) * (1 + AOMMIN(bits_using_delta, bits_using_raw));
+ return cost;
+#else
+ return 2 * bit_depth * n * av1_cost_bit(128, 0);
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+}
+
+#if CONFIG_HIGHBITDEPTH
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
+ int bit_depth) {
+ int n = 0, r, c, i;
+ uint16_t val;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ int val_count[1 << 12];
+
+ assert(bit_depth <= 12);
+ memset(val_count, 0, (1 << 12) * sizeof(val_count[0]));
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ val = src[r * stride + c];
+ ++val_count[val];
+ }
+ }
+
+ for (i = 0; i < (1 << bit_depth); ++i) {
+ if (val_count[i]) {
+ ++n;
+ }
+ }
+
+ return n;
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
new file mode 100644
index 000000000..5403ac5e6
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_PALETTE_H_
+#define AV1_ENCODER_PALETTE_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim',
+// calculate the centroid 'indices' for the data points.
+void av1_calc_indices(const float *data, const float *centroids,
+ uint8_t *indices, int n, int k, int dim);
+
+// Given 'n' 'data' points and an initial guess of 'k' 'centroids' each of
+// dimension 'dim', runs up to 'max_itr' iterations of the k-means algorithm
+// to get updated 'centroids' and the centroid 'indices' for elements in
+// 'data'. Note: the output centroids are rounded to the nearest integers.
+void av1_k_means(const float *data, float *centroids, uint8_t *indices, int n,
+ int k, int dim, int max_itr);
+
+// Given a list of centroids, returns the number of unique centroids 'k' and
+// puts these unique centroids in the first 'k' entries of the 'centroids'
+// array. Ideally, the centroids should be rounded to integers before calling
+// this method.
+int av1_remove_duplicates(float *centroids, int num_centroids);
+
+// Returns the number of colors in 'src'.
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols);
+#if CONFIG_HIGHBITDEPTH
+// Same as av1_count_colors(), but for high-bitdepth mode.
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
+ int bit_depth);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_PALETTE_DELTA_ENCODING
+// Return the number of bits used to transmit each luma palette color delta.
+int av1_get_palette_delta_bits_y(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *min_bits);
+
+// Return the number of bits used to transmit each U palette color delta.
+int av1_get_palette_delta_bits_u(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *min_bits);
+
+// Return the number of bits used to transmit each V palette color delta;
+// sets 'zero_count' to the number of deltas that are 0.
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *zero_count, int *min_bits);
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+
+// Return the rate cost for transmitting luma palette color values.
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, int bit_depth);
+
+// Return the rate cost for transmitting chroma palette color values.
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* AV1_ENCODER_PALETTE_H_ */
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
new file mode 100644
index 000000000..da64fb48d
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "./aom_scale_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/common/cdef.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/encoder.h"
+
+#define TOTAL_STRENGTHS (DERING_STRENGTHS * CLPF_STRENGTHS)
+
+/* Search for the best strength to add as an option, knowing we
+ already selected nb_strengths options. */
+static uint64_t search_one(int *lev, int nb_strengths,
+ uint64_t mse[][TOTAL_STRENGTHS], int sb_count) {
+ uint64_t tot_mse[TOTAL_STRENGTHS];
+ int i, j;
+ uint64_t best_tot_mse = (uint64_t)1 << 63;
+ int best_id = 0;
+ memset(tot_mse, 0, sizeof(tot_mse));
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ /* Find best mse among already selected options. */
+ for (gi = 0; gi < nb_strengths; gi++) {
+ if (mse[i][lev[gi]] < best_mse) {
+ best_mse = mse[i][lev[gi]];
+ }
+ }
+ /* Find best mse when adding each possible new option. */
+ for (j = 0; j < TOTAL_STRENGTHS; j++) {
+ uint64_t best = best_mse;
+ if (mse[i][j] < best) best = mse[i][j];
+ tot_mse[j] += best;
+ }
+ }
+ for (j = 0; j < TOTAL_STRENGTHS; j++) {
+ if (tot_mse[j] < best_tot_mse) {
+ best_tot_mse = tot_mse[j];
+ best_id = j;
+ }
+ }
+ lev[nb_strengths] = best_id;
+ return best_tot_mse;
+}
+
+/* Search for the best luma+chroma strength to add as an option, knowing we
+ already selected nb_strengths options. */
+static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
+ uint64_t (**mse)[TOTAL_STRENGTHS],
+ int sb_count) {
+ uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
+ int i, j;
+ uint64_t best_tot_mse = (uint64_t)1 << 63;
+ int best_id0 = 0;
+ int best_id1 = 0;
+ memset(tot_mse, 0, sizeof(tot_mse));
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ /* Find best mse among already selected options. */
+ for (gi = 0; gi < nb_strengths; gi++) {
+ uint64_t curr = mse[0][i][lev0[gi]];
+ curr += mse[1][i][lev1[gi]];
+ if (curr < best_mse) {
+ best_mse = curr;
+ }
+ }
+ /* Find best mse when adding each possible new option. */
+ for (j = 0; j < TOTAL_STRENGTHS; j++) {
+ int k;
+ for (k = 0; k < TOTAL_STRENGTHS; k++) {
+ uint64_t best = best_mse;
+ uint64_t curr = mse[0][i][j];
+ curr += mse[1][i][k];
+ if (curr < best) best = curr;
+ tot_mse[j][k] += best;
+ }
+ }
+ }
+ for (j = 0; j < TOTAL_STRENGTHS; j++) {
+ int k;
+ for (k = 0; k < TOTAL_STRENGTHS; k++) {
+ if (tot_mse[j][k] < best_tot_mse) {
+ best_tot_mse = tot_mse[j][k];
+ best_id0 = j;
+ best_id1 = k;
+ }
+ }
+ }
+ lev0[nb_strengths] = best_id0;
+ lev1[nb_strengths] = best_id1;
+ return best_tot_mse;
+}
+
+/* Search for the set of strengths that minimizes mse. */
+static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
+ uint64_t mse[][TOTAL_STRENGTHS],
+ int sb_count) {
+ uint64_t best_tot_mse;
+ int i;
+ best_tot_mse = (uint64_t)1 << 63;
+ /* Greedy search: add one strength option at a time. */
+ for (i = 0; i < nb_strengths; i++) {
+ best_tot_mse = search_one(best_lev, i, mse, sb_count);
+ }
+ /* Try to refine the greedy search by reconsidering each already-selected
+ option. */
+ for (i = 0; i < 4 * nb_strengths; i++) {
+ int j;
+ for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1];
+ best_tot_mse = search_one(best_lev, nb_strengths - 1, mse, sb_count);
+ }
+ return best_tot_mse;
+}
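+/* In the refinement loop above, each pass shifts out the oldest selected
+ strength and lets search_one() pick a replacement, so every slot is
+ revisited roughly four times. */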
+
+/* Search for the set of luma+chroma strengths that minimizes mse. */
+static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
+ int nb_strengths,
+ uint64_t (**mse)[TOTAL_STRENGTHS],
+ int sb_count) {
+ uint64_t best_tot_mse;
+ int i;
+ best_tot_mse = (uint64_t)1 << 63;
+ /* Greedy search: add one strength option at a time. */
+ for (i = 0; i < nb_strengths; i++) {
+ best_tot_mse = search_one_dual(best_lev0, best_lev1, i, mse, sb_count);
+ }
+ /* Try to refine the greedy search by reconsidering each already-selected
+ option. */
+ for (i = 0; i < 4 * nb_strengths; i++) {
+ int j;
+ for (j = 0; j < nb_strengths - 1; j++) {
+ best_lev0[j] = best_lev0[j + 1];
+ best_lev1[j] = best_lev1[j + 1];
+ }
+ best_tot_mse =
+ search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse, sb_count);
+ }
+ return best_tot_mse;
+}
+
+/* FIXME: SSE-optimize this. */
+static void copy_sb16_16(uint16_t *dst, int dstride, const uint16_t *src,
+ int src_voffset, int src_hoffset, int sstride,
+ int vsize, int hsize) {
+ int r, c;
+ const uint16_t *base = &src[src_voffset * sstride + src_hoffset];
+ for (r = 0; r < vsize; r++) {
+ for (c = 0; c < hsize; c++) {
+ dst[r * dstride + c] = base[r * sstride + c];
+ }
+ }
+}
+
+static INLINE uint64_t dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ int sstride, int coeff_shift) {
+ uint64_t svar = 0;
+ uint64_t dvar = 0;
+ uint64_t sum_s = 0;
+ uint64_t sum_d = 0;
+ uint64_t sum_s2 = 0;
+ uint64_t sum_d2 = 0;
+ uint64_t sum_sd = 0;
+ int i, j;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ sum_s += src[i * sstride + j];
+ sum_d += dst[i * dstride + j];
+ sum_s2 += src[i * sstride + j] * src[i * sstride + j];
+ sum_d2 += dst[i * dstride + j] * dst[i * dstride + j];
+ sum_sd += src[i * sstride + j] * dst[i * dstride + j];
+ }
+ }
+ /* Compute the variance -- the calculation cannot go negative. */
+ svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
+ dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
+ return (uint64_t)floor(
+ .5 +
+ (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
+ (svar + dvar + (400 << 2 * coeff_shift)) /
+ (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar)));
+}
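+/* In closed form, the distortion above is an SSIM-like weighted SSE:
+ dist = round(0.5 * SSE * (svar + dvar + C1) / sqrt(C2 + svar * dvar))
+ with SSE = sum_d2 + sum_s2 - 2 * sum_sd, C1 = 400 << (2 * coeff_shift),
+ and C2 = 20000 << (4 * coeff_shift). */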
+
+static INLINE uint64_t mse_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ int sstride) {
+ uint64_t sum = 0;
+ int i, j;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int e = dst[i * dstride + j] - src[i * sstride + j];
+ sum += e * e;
+ }
+ }
+ return sum;
+}
+
+static INLINE uint64_t mse_4x4_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ int sstride) {
+ uint64_t sum = 0;
+ int i, j;
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ int e = dst[i * dstride + j] - src[i * sstride + j];
+ sum += e * e;
+ }
+ }
+ return sum;
+}
+
+/* Compute MSE only on the blocks we filtered. */
+uint64_t compute_dering_dist(uint16_t *dst, int dstride, uint16_t *src,
+ dering_list *dlist, int dering_count,
+ BLOCK_SIZE bsize, int coeff_shift, int pli) {
+ uint64_t sum = 0;
+ int bi, bx, by;
+ if (bsize == BLOCK_8X8) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ if (pli == 0) {
+ sum += dist_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
+ &src[bi << (3 + 3)], 8, coeff_shift);
+ } else {
+ sum += mse_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
+ &src[bi << (3 + 3)], 8);
+ }
+ }
+ } else if (bsize == BLOCK_4X8) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ sum += mse_4x4_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
+ &src[bi << (3 + 2)], 4);
+ sum += mse_4x4_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)], dstride,
+ &src[(bi << (3 + 2)) + 4 * 4], 4);
+ }
+ } else if (bsize == BLOCK_8X4) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
+ &src[bi << (2 + 3)], 8);
+ sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4], dstride,
+ &src[(bi << (2 + 3)) + 4], 8);
+ }
+ } else {
+ assert(bsize == BLOCK_4X4);
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
+ &src[bi << (2 + 2)], 4);
+ }
+ }
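+ /* Scale the accumulated SSE back to an 8-bit reference: coeff_shift is
+ bit_depth - 8, so squared errors carry 2 * coeff_shift extra bits. */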
+ return sum >> 2 * coeff_shift;
+}
+
+void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+ AV1_COMMON *cm, MACROBLOCKD *xd) {
+ int r, c;
+ int sbr, sbc;
+ uint16_t *src[3];
+ uint16_t *ref_coeff[3];
+ dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+ int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+ int stride[3];
+ int bsize[3];
+ int mi_wide_l2[3];
+ int mi_high_l2[3];
+ int xdec[3];
+ int ydec[3];
+ int pli;
+ int dering_count;
+ int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+ uint64_t best_tot_mse = (uint64_t)1 << 63;
+ uint64_t tot_mse;
+ int sb_count;
+ int nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
+ int nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
+ int *sb_index = aom_malloc(nvsb * nhsb * sizeof(*sb_index));
+ int *selected_strength = aom_malloc(nvsb * nhsb * sizeof(*selected_strength));
+ uint64_t(*mse[2])[TOTAL_STRENGTHS];
+ int clpf_damping = 3 + (cm->base_qindex >> 6);
+ int dering_damping = 6;
+ int i;
+ int nb_strengths;
+ int nb_strength_bits;
+ int quantizer;
+ double lambda;
+ int nplanes = 3;
+ DECLARE_ALIGNED(32, uint16_t, inbuf[OD_DERING_INBUF_SIZE]);
+ uint16_t *in;
+ DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SQUARE]);
+ int chroma_dering =
+ xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
+ xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
+ quantizer =
+ av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
+ lambda = .12 * quantizer * quantizer / 256.;
+
+ av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0);
+ mse[0] = aom_malloc(sizeof(**mse) * nvsb * nhsb);
+ mse[1] = aom_malloc(sizeof(**mse) * nvsb * nhsb);
+ for (pli = 0; pli < nplanes; pli++) {
+ uint8_t *ref_buffer;
+ int ref_stride;
+ switch (pli) {
+ case 0:
+ ref_buffer = ref->y_buffer;
+ ref_stride = ref->y_stride;
+ break;
+ case 1:
+ ref_buffer = ref->u_buffer;
+ ref_stride = ref->uv_stride;
+ break;
+ case 2:
+ ref_buffer = ref->v_buffer;
+ ref_stride = ref->uv_stride;
+ break;
+ }
+ src[pli] = aom_memalign(
+ 32, sizeof(*src) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE);
+ ref_coeff[pli] = aom_memalign(
+ 32, sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE);
+ xdec[pli] = xd->plane[pli].subsampling_x;
+ ydec[pli] = xd->plane[pli].subsampling_y;
+ bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
+ : (xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
+ stride[pli] = cm->mi_cols << MI_SIZE_LOG2;
+ mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+ mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+
+ const int frame_height =
+ (cm->mi_rows * MI_SIZE) >> xd->plane[pli].subsampling_y;
+ const int frame_width =
+ (cm->mi_cols * MI_SIZE) >> xd->plane[pli].subsampling_x;
+
+ for (r = 0; r < frame_height; ++r) {
+ for (c = 0; c < frame_width; ++c) {
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(
+ xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
+ ref_coeff[pli][r * stride[pli] + c] =
+ CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c];
+ } else {
+#endif
+ src[pli][r * stride[pli] + c] =
+ xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
+ ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c];
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+ }
+ }
+ }
+ in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
+ sb_count = 0;
+ for (sbr = 0; sbr < nvsb; ++sbr) {
+ for (sbc = 0; sbc < nhsb; ++sbc) {
+ int nvb, nhb;
+ int gi;
+ int dirinit = 0;
+ nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
+ nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
+ cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
+ MAX_MIB_SIZE * sbc]
+ ->mbmi.cdef_strength = -1;
+ if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
+ dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE,
+ sbc * MAX_MIB_SIZE, dlist, 1);
+ for (pli = 0; pli < nplanes; pli++) {
+ for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
+ inbuf[i] = OD_DERING_VERY_LARGE;
+ for (gi = 0; gi < TOTAL_STRENGTHS; gi++) {
+ int threshold;
+ uint64_t curr_mse;
+ int clpf_strength;
+ threshold = gi / CLPF_STRENGTHS;
+ if (pli > 0 && !chroma_dering) threshold = 0;
+ /* We avoid filtering the pixels for which some of the pixels to
+ average are outside the frame. We could change the filter instead,
+ but it would add special cases for any future vectorization. */
+ int yoff = OD_FILT_VBORDER * (sbr != 0);
+ int xoff = OD_FILT_HBORDER * (sbc != 0);
+ int ysize = (nvb << mi_high_l2[pli]) +
+ OD_FILT_VBORDER * (sbr != nvsb - 1) + yoff;
+ int xsize = (nhb << mi_wide_l2[pli]) +
+ OD_FILT_HBORDER * (sbc != nhsb - 1) + xoff;
+ clpf_strength = gi % CLPF_STRENGTHS;
+ if (clpf_strength == 0)
+ copy_sb16_16(&in[(-yoff * OD_FILT_BSTRIDE - xoff)], OD_FILT_BSTRIDE,
+ src[pli],
+ (sbr * MAX_MIB_SIZE << mi_high_l2[pli]) - yoff,
+ (sbc * MAX_MIB_SIZE << mi_wide_l2[pli]) - xoff,
+ stride[pli], ysize, xsize);
+ od_dering(clpf_strength ? NULL : (uint8_t *)in, OD_FILT_BSTRIDE,
+ tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var, pli,
+ dlist, dering_count, threshold,
+ clpf_strength + (clpf_strength == 3), clpf_damping,
+ dering_damping, coeff_shift, clpf_strength != 0, 1);
+ curr_mse = compute_dering_dist(
+ ref_coeff[pli] +
+ (sbr * MAX_MIB_SIZE << mi_high_l2[pli]) * stride[pli] +
+ (sbc * MAX_MIB_SIZE << mi_wide_l2[pli]),
+ stride[pli], tmp_dst, dlist, dering_count, bsize[pli],
+ coeff_shift, pli);
+ if (pli < 2)
+ mse[pli][sb_count][gi] = curr_mse;
+ else
+ mse[1][sb_count][gi] += curr_mse;
+ sb_index[sb_count] =
+ MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc;
+ }
+ }
+ sb_count++;
+ }
+ }
+ nb_strength_bits = 0;
+ /* Search over different numbers of signalling bits. */
+ for (i = 0; i <= 3; i++) {
+ int j;
+ int best_lev0[CDEF_MAX_STRENGTHS];
+ int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
+ nb_strengths = 1 << i;
+ if (nplanes >= 3)
+ tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths,
+ mse, sb_count);
+ else
+ tot_mse =
+ joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count);
+ /* Count superblock signalling cost. */
+ tot_mse += (uint64_t)(sb_count * lambda * i);
+ /* Count header signalling cost. */
+ tot_mse += (uint64_t)(nb_strengths * lambda * CDEF_STRENGTH_BITS);
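+ /* i is the per-superblock index size in bits and CDEF_STRENGTH_BITS the
+ cost of each strength value in the header; lambda converts bits into the
+ MSE units accumulated above. */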
+ if (tot_mse < best_tot_mse) {
+ best_tot_mse = tot_mse;
+ nb_strength_bits = i;
+ for (j = 0; j < 1 << nb_strength_bits; j++) {
+ cm->cdef_strengths[j] = best_lev0[j];
+ cm->cdef_uv_strengths[j] = best_lev1[j];
+ }
+ }
+ }
+ nb_strengths = 1 << nb_strength_bits;
+
+ cm->cdef_bits = nb_strength_bits;
+ cm->nb_cdef_strengths = nb_strengths;
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ int best_gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ best_gi = 0;
+ for (gi = 0; gi < cm->nb_cdef_strengths; gi++) {
+ uint64_t curr = mse[0][i][cm->cdef_strengths[gi]];
+ if (nplanes >= 3) curr += mse[1][i][cm->cdef_uv_strengths[gi]];
+ if (curr < best_mse) {
+ best_gi = gi;
+ best_mse = curr;
+ }
+ }
+ selected_strength[i] = best_gi;
+ cm->mi_grid_visible[sb_index[i]]->mbmi.cdef_strength = best_gi;
+ }
+ cm->cdef_dering_damping = dering_damping;
+ cm->cdef_clpf_damping = clpf_damping;
+ aom_free(mse[0]);
+ aom_free(mse[1]);
+ for (pli = 0; pli < nplanes; pli++) {
+ aom_free(src[pli]);
+ aom_free(ref_coeff[pli]);
+ }
+ aom_free(sb_index);
+ aom_free(selected_strength);
+}
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
new file mode 100644
index 000000000..fc0ea485d
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "./aom_scale_rtcd.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+
+int av1_get_max_filter_level(const AV1_COMP *cpi) {
+ if (cpi->oxcf.pass == 2) {
+ return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
+ : MAX_LOOP_FILTER;
+ } else {
+ return MAX_LOOP_FILTER;
+ }
+}
+
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+ AV1_COMP *const cpi, int filt_level,
+ int partial_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ int64_t filt_err;
+
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
+ av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,
+ partial_frame);
+#else
+ if (cpi->num_workers > 1)
+ av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+ filt_level, 1, partial_frame, cpi->workers,
+ cpi->num_workers, &cpi->lf_row_sync);
+ else
+ av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+ 1, partial_frame);
+#endif
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ filt_err = aom_highbd_get_y_sse(sd, cm->frame_to_show);
+ } else {
+ filt_err = aom_get_y_sse(sd, cm->frame_to_show);
+ }
+#else
+ filt_err = aom_get_y_sse(sd, cm->frame_to_show);
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Re-instate the unfiltered frame
+ aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+
+ return filt_err;
+}
+
+int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ int partial_frame, double *best_cost_ret) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const struct loopfilter *const lf = &cm->lf;
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ int filt_direction = 0;
+ int64_t best_err;
+ int filt_best;
+ MACROBLOCK *x = &cpi->td.mb;
+
+ // Start the search at the previous frame filter level unless it is now out of
+ // range.
+ int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
+ int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
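+ // The step starts coarse (4, or filt_mid / 4 for large levels) and is
+ // halved whenever the midpoint wins, giving a coarse-to-fine search.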
+ // Sum squared error at each filter level
+ int64_t ss_err[MAX_LOOP_FILTER + 1];
+
+ // Set each entry to -1
+ memset(ss_err, 0xFF, sizeof(ss_err));
+
+ // Make a copy of the unfiltered / processed recon buffer
+ aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+
+ best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame);
+ filt_best = filt_mid;
+ ss_err[filt_mid] = best_err;
+
+ while (filter_step > 0) {
+ const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
+ const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
+
+ // Bias against raising loop filter in favor of lowering it.
+ int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+
+ if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
+ bias = (bias * cpi->twopass.section_intra_rating) / 20;
+
+ // Bias less for large transform block sizes.
+ if (cm->tx_mode != ONLY_4X4) bias >>= 1;
+
+ if (filt_direction <= 0 && filt_low != filt_mid) {
+ // Get Low filter error score
+ if (ss_err[filt_low] < 0) {
+ ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
+ }
+ // If the value is close to the best so far then bias towards the lower
+ // loop filter value.
+ if (ss_err[filt_low] < (best_err + bias)) {
+ // Was it actually better than the previous best?
+ if (ss_err[filt_low] < best_err) {
+ best_err = ss_err[filt_low];
+ }
+ filt_best = filt_low;
+ }
+ }
+
+ // Now look at filt_high
+ if (filt_direction >= 0 && filt_high != filt_mid) {
+ if (ss_err[filt_high] < 0) {
+ ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
+ }
+ // Raise the filter value only if it is significantly better than the
+ // previous best; the bias works against raising it.
+ if (ss_err[filt_high] < (best_err - bias)) {
+ best_err = ss_err[filt_high];
+ filt_best = filt_high;
+ }
+ }
+
+ // Halve the step distance if the best filter value was the same as last time.
+ if (filt_best == filt_mid) {
+ filter_step /= 2;
+ filt_direction = 0;
+ } else {
+ filt_direction = (filt_best < filt_mid) ? -1 : 1;
+ filt_mid = filt_best;
+ }
+ }
+
+ // Update best error
+ best_err = ss_err[filt_best];
+
+ if (best_cost_ret)
+ *best_cost_ret = RDCOST_DBL(x->rdmult, x->rddiv, 0, best_err);
+ return filt_best;
+}
+
+void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ LPF_PICK_METHOD method) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct loopfilter *const lf = &cm->lf;
+
+ lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness;
+
+ if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) {
+ lf->filter_level = 0;
+ } else if (method >= LPF_PICK_FROM_Q) {
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ const int q = av1_ac_quant(cm->base_qindex, 0, cm->bit_depth);
+// These values were determined by linearly fitting the searched filter level
+// as a function of q: filt_guess = q * 0.316206 + 3.87252
+#if CONFIG_HIGHBITDEPTH
+ int filt_guess;
+ switch (cm->bit_depth) {
+ case AOM_BITS_8:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+ break;
+ case AOM_BITS_10:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+ break;
+ case AOM_BITS_12:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+ break;
+ default:
+ assert(0 &&
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10 "
+ "or AOM_BITS_12");
+ return;
+ }
+#else
+ int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif // CONFIG_HIGHBITDEPTH
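+    // Worked example (8-bit, hypothetical q = 40):
+    //   filt_guess = ROUND_POWER_OF_TWO(40 * 20723 + 1015158, 18)
+    //              = (828920 + 1015158 + (1 << 17)) >> 18 = 7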
+ if (cm->frame_type == KEY_FRAME) filt_guess -= 4;
+ lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
+ } else {
+ lf->filter_level = av1_search_filter_level(
+ sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL);
+ }
+
+#if CONFIG_EXT_TILE
+ // TODO(any): 0 loopfilter level is only necessary if individual tile
+ // decoding is required. We need to communicate this requirement to this
+ // code and force loop filter level 0 only if required.
+ if (cm->tile_encoding_mode) lf->filter_level = 0;
+#endif // CONFIG_EXT_TILE
+}
diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h
new file mode 100644
index 000000000..3c0a83462
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_PICKLPF_H_
+#define AV1_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+int av1_get_max_filter_level(const AV1_COMP *cpi);
+int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ int partial_frame, double *err);
+void av1_pick_filter_level(const struct yv12_buffer_config *sd,
+ struct AV1_COMP *cpi, LPF_PICK_METHOD method);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_PICKLPF_H_
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
new file mode 100644
index 000000000..21410e0af
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -0,0 +1,1269 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#include "./aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+
+// When set to RESTORE_WIENER or RESTORE_SGRPROJ, only that restoration type
+// (plus RESTORE_NONE) is searched. When set to RESTORE_NONE (0), all types,
+// including switchable, are searched.
+const RestorationType force_restore_type = RESTORE_NONE;
+
+// Number of Wiener iterations
+#define NUM_WIENER_ITERS 10
+
+typedef double (*search_restore_type)(const YV12_BUFFER_CONFIG *src,
+ AV1_COMP *cpi, int partial_frame,
+ RestorationInfo *info,
+ RestorationType *rest_level,
+ double *best_tile_cost,
+ YV12_BUFFER_CONFIG *dst_frame);
+
+const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 };
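+// Each frame-level restoration type costs 2 bits to signal, since there are
+// RESTORE_TYPES == 4 possible values.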
+
+static int64_t sse_restoration_tile(const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *dst,
+ const AV1_COMMON *cm, int h_start,
+ int width, int v_start, int height,
+ int components_pattern) {
+ int64_t filt_err = 0;
+ (void)cm;
+ // Y and UV components cannot be mixed
+ assert(components_pattern == 1 || components_pattern == 2 ||
+ components_pattern == 4 || components_pattern == 6);
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ if ((components_pattern >> AOM_PLANE_Y) & 1) {
+ filt_err +=
+ aom_highbd_get_y_sse_part(src, dst, h_start, width, v_start, height);
+ }
+ if ((components_pattern >> AOM_PLANE_U) & 1) {
+ filt_err +=
+ aom_highbd_get_u_sse_part(src, dst, h_start, width, v_start, height);
+ }
+ if ((components_pattern >> AOM_PLANE_V) & 1) {
+ filt_err +=
+ aom_highbd_get_v_sse_part(src, dst, h_start, width, v_start, height);
+ }
+ return filt_err;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ if ((components_pattern >> AOM_PLANE_Y) & 1) {
+ filt_err += aom_get_y_sse_part(src, dst, h_start, width, v_start, height);
+ }
+ if ((components_pattern >> AOM_PLANE_U) & 1) {
+ filt_err += aom_get_u_sse_part(src, dst, h_start, width, v_start, height);
+ }
+ if ((components_pattern >> AOM_PLANE_V) & 1) {
+ filt_err += aom_get_v_sse_part(src, dst, h_start, width, v_start, height);
+ }
+ return filt_err;
+}
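+// Note: components_pattern is a plane bitmask, with bit AOM_PLANE_Y (value 1)
+// selecting luma and bits AOM_PLANE_U (2) and AOM_PLANE_V (4) selecting
+// chroma. For example, measuring SSE over both chroma planes uses
+// (1 << AOM_PLANE_U) | (1 << AOM_PLANE_V) == 6, which is why 6 is one of the
+// patterns accepted by the assert above.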
+
+static int64_t sse_restoration_frame(AV1_COMMON *const cm,
+ const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *dst,
+ int components_pattern) {
+ int64_t filt_err = 0;
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ if ((components_pattern >> AOM_PLANE_Y) & 1) {
+ filt_err += aom_highbd_get_y_sse(src, dst);
+ }
+ if ((components_pattern >> AOM_PLANE_U) & 1) {
+ filt_err += aom_highbd_get_u_sse(src, dst);
+ }
+ if ((components_pattern >> AOM_PLANE_V) & 1) {
+ filt_err += aom_highbd_get_v_sse(src, dst);
+ }
+ return filt_err;
+ }
+#else
+ (void)cm;
+#endif // CONFIG_HIGHBITDEPTH
+ if ((components_pattern >> AOM_PLANE_Y) & 1) {
+ filt_err = aom_get_y_sse(src, dst);
+ }
+ if ((components_pattern >> AOM_PLANE_U) & 1) {
+ filt_err += aom_get_u_sse(src, dst);
+ }
+ if ((components_pattern >> AOM_PLANE_V) & 1) {
+ filt_err += aom_get_v_sse(src, dst);
+ }
+ return filt_err;
+}
+
+static int64_t try_restoration_tile(const YV12_BUFFER_CONFIG *src,
+ AV1_COMP *const cpi, RestorationInfo *rsi,
+ int components_pattern, int partial_frame,
+ int tile_idx, int subtile_idx,
+ int subtile_bits,
+ YV12_BUFFER_CONFIG *dst_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ int64_t filt_err;
+ int tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ int ntiles, width, height;
+
+ // Y and UV components cannot be mixed
+ assert(components_pattern == 1 || components_pattern == 2 ||
+ components_pattern == 4 || components_pattern == 6);
+
+ if (components_pattern == 1) { // Y only
+ width = src->y_crop_width;
+ height = src->y_crop_height;
+ } else { // Color
+ width = src->uv_crop_width;
+ height = src->uv_crop_height;
+ }
+ ntiles = av1_get_rest_ntiles(
+ width, height, cm->rst_info[components_pattern > 1].restoration_tilesize,
+ &tile_width, &tile_height, &nhtiles, &nvtiles);
+ (void)ntiles;
+
+ av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern,
+ partial_frame, dst_frame);
+ av1_get_rest_tile_limits(tile_idx, subtile_idx, subtile_bits, nhtiles,
+ nvtiles, tile_width, tile_height, width, height, 0,
+ 0, &h_start, &h_end, &v_start, &v_end);
+ filt_err = sse_restoration_tile(src, dst_frame, cm, h_start, h_end - h_start,
+ v_start, v_end - v_start, components_pattern);
+
+ return filt_err;
+}
+
+static int64_t try_restoration_frame(const YV12_BUFFER_CONFIG *src,
+ AV1_COMP *const cpi, RestorationInfo *rsi,
+ int components_pattern, int partial_frame,
+ YV12_BUFFER_CONFIG *dst_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ int64_t filt_err;
+ av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern,
+ partial_frame, dst_frame);
+ filt_err = sse_restoration_frame(cm, src, dst_frame, components_pattern);
+ return filt_err;
+}
+
+static int64_t get_pixel_proj_error(uint8_t *src8, int width, int height,
+ int src_stride, uint8_t *dat8,
+ int dat_stride, int bit_depth,
+ int32_t *flt1, int flt1_stride,
+ int32_t *flt2, int flt2_stride, int *xqd) {
+ int i, j;
+ int64_t err = 0;
+ int xq[2];
+ decode_xq(xqd, xq);
+ if (bit_depth == 8) {
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t u =
+ (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u;
+ const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u;
+ const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
+ src[i * src_stride + j];
+ err += e * e;
+ }
+ }
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t u =
+ (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u;
+ const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u;
+ const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
+ src[i * src_stride + j];
+ err += e * e;
+ }
+ }
+ }
+ return err;
+}
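+// Note: for each pixel, the value being scored above is
+//   v = xq[0] * (flt1 - u) + xq[1] * (flt2 - u) + (u << SGRPROJ_PRJ_BITS),
+// i.e. the degraded sample u plus a linear projection of the two filtered
+// versions onto the residual space, held at
+// Q(SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) precision until the final rounding
+// back to pixel units.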
+
+static void get_proj_subspace(uint8_t *src8, int width, int height,
+ int src_stride, uint8_t *dat8, int dat_stride,
+ int bit_depth, int32_t *flt1, int flt1_stride,
+ int32_t *flt2, int flt2_stride, int *xq) {
+ int i, j;
+ double H[2][2] = { { 0, 0 }, { 0, 0 } };
+ double C[2] = { 0, 0 };
+ double Det;
+ double x[2];
+ const int size = width * height;
+
+ aom_clear_system_state();
+
+ // Default
+ xq[0] = 0;
+ xq[1] = 0;
+ if (bit_depth == 8) {
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const double s =
+ (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const double f1 = (double)flt1[i * flt1_stride + j] - u;
+ const double f2 = (double)flt2[i * flt2_stride + j] - u;
+ H[0][0] += f1 * f1;
+ H[1][1] += f2 * f2;
+ H[0][1] += f1 * f2;
+ C[0] += f1 * s;
+ C[1] += f2 * s;
+ }
+ }
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const double s =
+ (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const double f1 = (double)flt1[i * flt1_stride + j] - u;
+ const double f2 = (double)flt2[i * flt2_stride + j] - u;
+ H[0][0] += f1 * f1;
+ H[1][1] += f2 * f2;
+ H[0][1] += f1 * f2;
+ C[0] += f1 * s;
+ C[1] += f2 * s;
+ }
+ }
+ }
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+ Det = (H[0][0] * H[1][1] - H[0][1] * H[1][0]);
+ if (Det < 1e-8) return; // ill-posed, return default values
+ x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / Det;
+ x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / Det;
+ xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS));
+ xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS));
+}
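+// Note: the loops above accumulate the normal equations of the least-squares
+// problem min_x E[(s - x[0] * f1 - x[1] * f2)^2], with H = E[f f'] and
+// C = E[f s], so the optimum satisfies H x = C. Since H is only 2x2, the
+// system is solved in closed form via Cramer's rule instead of linsolve().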
+
+static void encode_xq(int *xq, int *xqd) {
+ xqd[0] = xq[0];
+ xqd[0] = clamp(xqd[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+ xqd[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1];
+ xqd[1] = clamp(xqd[1], SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
+}
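+// Note: only the clamped pair (xqd[0], xqd[1]) is coded; the decoder side
+// (decode_xq) recovers xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[0] - xqd[1], so
+// the weights applied to flt1, flt2 and the unfiltered sample always sum to
+// 1.0 in Q(SGRPROJ_PRJ_BITS).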
+
+static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
+ int dat_stride, uint8_t *src8,
+ int src_stride, int bit_depth,
+ int *eps, int *xqd, int32_t *rstbuf) {
+ int32_t *flt1 = rstbuf;
+ int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+ int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
+ int ep, bestep = 0;
+ int64_t err, besterr = -1;
+ int exqd[2], bestxqd[2] = { 0, 0 };
+
+ for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
+ int exq[2];
+#if CONFIG_HIGHBITDEPTH
+ if (bit_depth > 8) {
+ uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+#if USE_HIGHPASS_IN_SGRPROJ
+ av1_highpass_filter_highbd(dat, width, height, dat_stride, flt1, width,
+ sgr_params[ep].corner, sgr_params[ep].edge);
+#else
+ av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt1,
+ width, bit_depth, sgr_params[ep].r1,
+ sgr_params[ep].e1, tmpbuf2);
+#endif // USE_HIGHPASS_IN_SGRPROJ
+ av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt2,
+ width, bit_depth, sgr_params[ep].r2,
+ sgr_params[ep].e2, tmpbuf2);
+ } else {
+#endif
+#if USE_HIGHPASS_IN_SGRPROJ
+ av1_highpass_filter(dat8, width, height, dat_stride, flt1, width,
+ sgr_params[ep].corner, sgr_params[ep].edge);
+#else
+ av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, width,
+ sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
+#endif // USE_HIGHPASS_IN_SGRPROJ
+ av1_selfguided_restoration(dat8, width, height, dat_stride, flt2, width,
+ sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+ get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
+ bit_depth, flt1, width, flt2, width, exq);
+ encode_xq(exq, exqd);
+ err =
+ get_pixel_proj_error(src8, width, height, src_stride, dat8, dat_stride,
+ bit_depth, flt1, width, flt2, width, exqd);
+ if (besterr == -1 || err < besterr) {
+ bestep = ep;
+ besterr = err;
+ bestxqd[0] = exqd[0];
+ bestxqd[1] = exqd[1];
+ }
+ }
+ *eps = bestep;
+ xqd[0] = bestxqd[0];
+ xqd[1] = bestxqd[1];
+}
+
+static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
+ SgrprojInfo *ref_sgrproj_info) {
+ int bits = SGRPROJ_PARAMS_BITS;
+ bits += aom_count_primitive_refsubexpfin(
+ SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+ sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+ bits += aom_count_primitive_refsubexpfin(
+ SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+ sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+ return bits;
+}
+
+static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ int partial_frame, RestorationInfo *info,
+ RestorationType *type, double *best_tile_cost,
+ YV12_BUFFER_CONFIG *dst_frame) {
+ SgrprojInfo *sgrproj_info = info->sgrproj_info;
+ double err, cost_norestore, cost_sgrproj;
+ int bits;
+ MACROBLOCK *x = &cpi->td.mb;
+ AV1_COMMON *const cm = &cpi->common;
+ const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+ RestorationInfo *rsi = &cpi->rst_search[0];
+ int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ // Allocate for the src buffer at high precision
+ const int ntiles = av1_get_rest_ntiles(
+ cm->width, cm->height, cm->rst_info[0].restoration_tilesize, &tile_width,
+ &tile_height, &nhtiles, &nvtiles);
+ SgrprojInfo ref_sgrproj_info;
+ set_default_sgrproj(&ref_sgrproj_info);
+
+ rsi->frame_restoration_type = RESTORE_SGRPROJ;
+
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ rsi->restoration_type[tile_idx] = RESTORE_NONE;
+ }
+ // Compute best Sgrproj filters for each tile
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, cm->width, cm->height, 0, 0, &h_start,
+ &h_end, &v_start, &v_end);
+ err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start,
+ h_end - h_start, v_start, v_end - v_start, 1);
+ // #bits when a tile is not restored
+ bits = av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 0);
+ cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ best_tile_cost[tile_idx] = DBL_MAX;
+ search_selfguided_restoration(
+ dgd->y_buffer + v_start * dgd->y_stride + h_start, h_end - h_start,
+ v_end - v_start, dgd->y_stride,
+ src->y_buffer + v_start * src->y_stride + h_start, src->y_stride,
+#if CONFIG_HIGHBITDEPTH
+ cm->bit_depth,
+#else
+ 8,
+#endif // CONFIG_HIGHBITDEPTH
+ &rsi->sgrproj_info[tile_idx].ep, rsi->sgrproj_info[tile_idx].xqd,
+ cm->rst_internal.tmpbuf);
+ rsi->restoration_type[tile_idx] = RESTORE_SGRPROJ;
+ err = try_restoration_tile(src, cpi, rsi, 1, partial_frame, tile_idx, 0, 0,
+ dst_frame);
+ bits = count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info)
+ << AV1_PROB_COST_SHIFT;
+ bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 1);
+ cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ if (cost_sgrproj >= cost_norestore) {
+ type[tile_idx] = RESTORE_NONE;
+ } else {
+ type[tile_idx] = RESTORE_SGRPROJ;
+ memcpy(&sgrproj_info[tile_idx], &rsi->sgrproj_info[tile_idx],
+ sizeof(sgrproj_info[tile_idx]));
+ bits = count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info)
+ << AV1_PROB_COST_SHIFT;
+ memcpy(&ref_sgrproj_info, &sgrproj_info[tile_idx],
+ sizeof(ref_sgrproj_info));
+ best_tile_cost[tile_idx] = err;
+ }
+ rsi->restoration_type[tile_idx] = RESTORE_NONE;
+ }
+ // Cost for Sgrproj filtering
+ set_default_sgrproj(&ref_sgrproj_info);
+ bits = frame_level_restore_bits[rsi->frame_restoration_type]
+ << AV1_PROB_COST_SHIFT;
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ bits +=
+ av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, type[tile_idx] != RESTORE_NONE);
+ memcpy(&rsi->sgrproj_info[tile_idx], &sgrproj_info[tile_idx],
+ sizeof(sgrproj_info[tile_idx]));
+ if (type[tile_idx] == RESTORE_SGRPROJ) {
+ bits +=
+ count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info)
+ << AV1_PROB_COST_SHIFT;
+ memcpy(&ref_sgrproj_info, &rsi->sgrproj_info[tile_idx],
+ sizeof(ref_sgrproj_info));
+ }
+ rsi->restoration_type[tile_idx] = type[tile_idx];
+ }
+ err = try_restoration_frame(src, cpi, rsi, 1, partial_frame, dst_frame);
+ cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+
+ return cost_sgrproj;
+}
+
+static double find_average(uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int stride) {
+ uint64_t sum = 0;
+ double avg = 0;
+ int i, j;
+ aom_clear_system_state();
+ for (i = v_start; i < v_end; i++)
+ for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
+ avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
+ return avg;
+}
+
+static void compute_stats(uint8_t *dgd, uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H) {
+ int i, j, k, l;
+ double Y[WIENER_WIN2];
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ memset(M, 0, sizeof(*M) * WIENER_WIN2);
+ memset(H, 0, sizeof(*H) * WIENER_WIN2 * WIENER_WIN2);
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j++) {
+ const double X = (double)src[i * src_stride + j] - avg;
+ int idx = 0;
+ for (k = -WIENER_HALFWIN; k <= WIENER_HALFWIN; k++) {
+ for (l = -WIENER_HALFWIN; l <= WIENER_HALFWIN; l++) {
+ Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
+ idx++;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ M[k] += Y[k] * X;
+ H[k * WIENER_WIN2 + k] += Y[k] * Y[k];
+ for (l = k + 1; l < WIENER_WIN2; ++l) {
+ // H is a symmetric matrix, so we only need to fill out the upper
+ // triangle here. We can copy it down to the lower triangle outside
+ // the (i, j) loops.
+ H[k * WIENER_WIN2 + l] += Y[k] * Y[l];
+ }
+ }
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = k + 1; l < WIENER_WIN2; ++l) {
+ H[l * WIENER_WIN2 + k] = H[k * WIENER_WIN2 + l];
+ }
+ }
+}
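+// Note: M and H are the classic Wiener-filter statistics: M holds the
+// cross-correlation between each mean-removed degraded sample in the window
+// and the mean-removed source sample, and H holds the autocorrelation of the
+// window. The filter search below works from the solution of H * f = M, the
+// linear MMSE estimate of the source from the degraded neighborhood.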
+
+#if CONFIG_HIGHBITDEPTH
+static double find_average_highbd(uint16_t *src, int h_start, int h_end,
+ int v_start, int v_end, int stride) {
+ uint64_t sum = 0;
+ double avg = 0;
+ int i, j;
+ aom_clear_system_state();
+ for (i = v_start; i < v_end; i++)
+ for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
+ avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
+ return avg;
+}
+
+static void compute_stats_highbd(uint8_t *dgd8, uint8_t *src8, int h_start,
+ int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M,
+ double *H) {
+ int i, j, k, l;
+ double Y[WIENER_WIN2];
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const double avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ memset(M, 0, sizeof(*M) * WIENER_WIN2);
+ memset(H, 0, sizeof(*H) * WIENER_WIN2 * WIENER_WIN2);
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j++) {
+ const double X = (double)src[i * src_stride + j] - avg;
+ int idx = 0;
+ for (k = -WIENER_HALFWIN; k <= WIENER_HALFWIN; k++) {
+ for (l = -WIENER_HALFWIN; l <= WIENER_HALFWIN; l++) {
+ Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
+ idx++;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ M[k] += Y[k] * X;
+ H[k * WIENER_WIN2 + k] += Y[k] * Y[k];
+ for (l = k + 1; l < WIENER_WIN2; ++l) {
+ // H is a symmetric matrix, so we only need to fill out the upper
+ // triangle here. We can copy it down to the lower triangle outside
+ // the (i, j) loops.
+ H[k * WIENER_WIN2 + l] += Y[k] * Y[l];
+ }
+ }
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = k + 1; l < WIENER_WIN2; ++l) {
+ H[l * WIENER_WIN2 + k] = H[k * WIENER_WIN2 + l];
+ }
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+// Solves Ax = b, where x and b are column vectors
+static int linsolve(int n, double *A, int stride, double *b, double *x) {
+ int i, j, k;
+ double c;
+
+ aom_clear_system_state();
+
+ // Forward elimination
+ for (k = 0; k < n - 1; k++) {
+ // Bring the element with the largest magnitude to the diagonal position
+ for (i = n - 1; i > k; i--) {
+ if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) {
+ for (j = 0; j < n; j++) {
+ c = A[i * stride + j];
+ A[i * stride + j] = A[(i - 1) * stride + j];
+ A[(i - 1) * stride + j] = c;
+ }
+ c = b[i];
+ b[i] = b[i - 1];
+ b[i - 1] = c;
+ }
+ }
+ for (i = k; i < n - 1; i++) {
+ if (fabs(A[k * stride + k]) < 1e-10) return 0;
+ c = A[(i + 1) * stride + k] / A[k * stride + k];
+ for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j];
+ b[i + 1] -= c * b[k];
+ }
+ }
+ // Backward substitution
+ for (i = n - 1; i >= 0; i--) {
+ if (fabs(A[i * stride + i]) < 1e-10) return 0;
+ c = 0;
+ for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j];
+ x[i] = (b[i] - c) / A[i * stride + i];
+ }
+
+ return 1;
+}
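+// Worked example (illustrative): with n = 2, stride = 2, A = {2, 1, 1, 3}
+// and b = {3, 4}, elimination and back substitution give x = {1, 1}, since
+// 2*1 + 1*1 = 3 and 1*1 + 3*1 = 4. A return value of 0 instead signals a
+// (near-)singular system, detected by a pivot smaller than 1e-10.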
+
+static INLINE int wrap_index(int i) {
+ return (i >= WIENER_HALFWIN1 ? WIENER_WIN - 1 - i : i);
+}
+
+// Fix vector b, update vector a
+static void update_a_sep_sym(double **Mc, double **Hc, double *a, double *b) {
+ int i, j;
+ double S[WIENER_WIN];
+ double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ int w, w2;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < WIENER_WIN; i++) {
+ for (j = 0; j < WIENER_WIN; ++j) {
+ const int jj = wrap_index(j);
+ A[jj] += Mc[i][j] * b[i];
+ }
+ }
+ for (i = 0; i < WIENER_WIN; i++) {
+ for (j = 0; j < WIENER_WIN; j++) {
+ int k, l;
+ for (k = 0; k < WIENER_WIN; ++k)
+ for (l = 0; l < WIENER_WIN; ++l) {
+ const int kk = wrap_index(k);
+ const int ll = wrap_index(l);
+ B[ll * WIENER_HALFWIN1 + kk] +=
+ Hc[j * WIENER_WIN + i][k * WIENER_WIN2 + l] * b[i] * b[j];
+ }
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ w = WIENER_WIN;
+ w2 = (w >> 1) + 1;
+ for (i = 0; i < w2 - 1; ++i)
+ A[i] -=
+ A[w2 - 1] * 2 + B[i * w2 + w2 - 1] - 2 * B[(w2 - 1) * w2 + (w2 - 1)];
+ for (i = 0; i < w2 - 1; ++i)
+ for (j = 0; j < w2 - 1; ++j)
+ B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] -
+ 2 * B[(w2 - 1) * w2 + (w2 - 1)]);
+ if (linsolve(w2 - 1, B, w2, A, S)) {
+ S[w2 - 1] = 1.0;
+ for (i = w2; i < w; ++i) {
+ S[i] = S[w - 1 - i];
+ S[w2 - 1] -= 2 * S[i];
+ }
+ memcpy(a, S, w * sizeof(*a));
+ }
+}
+
+// Fix vector a, update vector b
+static void update_b_sep_sym(double **Mc, double **Hc, double *a, double *b) {
+ int i, j;
+ double S[WIENER_WIN];
+ double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ int w, w2;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < WIENER_WIN; i++) {
+ const int ii = wrap_index(i);
+ for (j = 0; j < WIENER_WIN; j++) A[ii] += Mc[i][j] * a[j];
+ }
+
+ for (i = 0; i < WIENER_WIN; i++) {
+ for (j = 0; j < WIENER_WIN; j++) {
+ const int ii = wrap_index(i);
+ const int jj = wrap_index(j);
+ int k, l;
+ for (k = 0; k < WIENER_WIN; ++k)
+ for (l = 0; l < WIENER_WIN; ++l)
+ B[jj * WIENER_HALFWIN1 + ii] +=
+ Hc[i * WIENER_WIN + j][k * WIENER_WIN2 + l] * a[k] * a[l];
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ w = WIENER_WIN;
+ w2 = WIENER_HALFWIN1;
+ for (i = 0; i < w2 - 1; ++i)
+ A[i] -=
+ A[w2 - 1] * 2 + B[i * w2 + w2 - 1] - 2 * B[(w2 - 1) * w2 + (w2 - 1)];
+ for (i = 0; i < w2 - 1; ++i)
+ for (j = 0; j < w2 - 1; ++j)
+ B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] -
+ 2 * B[(w2 - 1) * w2 + (w2 - 1)]);
+ if (linsolve(w2 - 1, B, w2, A, S)) {
+ S[w2 - 1] = 1.0;
+ for (i = w2; i < w; ++i) {
+ S[i] = S[w - 1 - i];
+ S[w2 - 1] -= 2 * S[i];
+ }
+ memcpy(b, S, w * sizeof(*b));
+ }
+}
+
+static int wiener_decompose_sep_sym(double *M, double *H, double *a,
+ double *b) {
+ static const double init_filt[WIENER_WIN] = {
+ 0.035623, -0.127154, 0.211436, 0.760190, 0.211436, -0.127154, 0.035623,
+ };
+ int i, j, iter;
+ double *Hc[WIENER_WIN2];
+ double *Mc[WIENER_WIN];
+ for (i = 0; i < WIENER_WIN; i++) {
+ Mc[i] = M + i * WIENER_WIN;
+ for (j = 0; j < WIENER_WIN; j++) {
+ Hc[i * WIENER_WIN + j] =
+ H + i * WIENER_WIN * WIENER_WIN2 + j * WIENER_WIN;
+ }
+ }
+ memcpy(a, init_filt, sizeof(*a) * WIENER_WIN);
+ memcpy(b, init_filt, sizeof(*b) * WIENER_WIN);
+
+ iter = 1;
+ while (iter < NUM_WIENER_ITERS) {
+ update_a_sep_sym(Mc, Hc, a, b);
+ update_b_sep_sym(Mc, Hc, a, b);
+ iter++;
+ }
+ return 1;
+}
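+// Note: the decomposition alternates between the two 1-D filters. With the
+// horizontal filter b held fixed, the best vertical filter a solves a reduced
+// linear system (update_a_sep_sym), and vice versa; neither step should
+// increase the error, so a fixed budget of NUM_WIENER_ITERS passes from a
+// reasonable initial filter is used instead of an explicit convergence test.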
+
+// Computes the function x'*H*x - 2*x'*M for the learned 2D filter x and
+// compares it against the identity filter; the final score is the difference
+// between the two function values.
+static double compute_score(double *M, double *H, InterpKernel vfilt,
+ InterpKernel hfilt) {
+ double ab[WIENER_WIN * WIENER_WIN];
+ int i, k, l;
+ double P = 0, Q = 0;
+ double iP = 0, iQ = 0;
+ double Score, iScore;
+ double a[WIENER_WIN], b[WIENER_WIN];
+
+ aom_clear_system_state();
+
+ a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = 1.0;
+ for (i = 0; i < WIENER_HALFWIN; ++i) {
+ a[i] = a[WIENER_WIN - i - 1] = (double)vfilt[i] / WIENER_FILT_STEP;
+ b[i] = b[WIENER_WIN - i - 1] = (double)hfilt[i] / WIENER_FILT_STEP;
+ a[WIENER_HALFWIN] -= 2 * a[i];
+ b[WIENER_HALFWIN] -= 2 * b[i];
+ }
+ for (k = 0; k < WIENER_WIN; ++k) {
+ for (l = 0; l < WIENER_WIN; ++l) ab[k * WIENER_WIN + l] = a[l] * b[k];
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ P += ab[k] * M[k];
+ for (l = 0; l < WIENER_WIN2; ++l)
+ Q += ab[k] * H[k * WIENER_WIN2 + l] * ab[l];
+ }
+ Score = Q - 2 * P;
+
+ iP = M[WIENER_WIN2 >> 1];
+ iQ = H[(WIENER_WIN2 >> 1) * WIENER_WIN2 + (WIENER_WIN2 >> 1)];
+ iScore = iQ - 2 * iP;
+
+ return Score - iScore;
+}
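+// Note: a negative return value means the learned separable filter achieves
+// a lower value of x'*H*x - 2*x'*M than the identity filter, i.e. a lower
+// expected SSE; callers keep the filter only in that case and otherwise fall
+// back to RESTORE_NONE.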
+
+static void quantize_sym_filter(double *f, InterpKernel fi) {
+ int i;
+ for (i = 0; i < WIENER_HALFWIN; ++i) {
+ fi[i] = RINT(f[i] * WIENER_FILT_STEP);
+ }
+ // Specialize for 7-tap filter
+ fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+ fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+ fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+ // Satisfy filter constraints
+ fi[WIENER_WIN - 1] = fi[0];
+ fi[WIENER_WIN - 2] = fi[1];
+ fi[WIENER_WIN - 3] = fi[2];
+ // The central element has an implicit +WIENER_FILT_STEP
+ fi[3] = -2 * (fi[0] + fi[1] + fi[2]);
+}
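+// Worked example (illustrative), assuming WIENER_FILT_STEP == 128 (7-bit
+// filter precision): quantized taps fi[0..2] = {3, -7, 17} give
+// fi[3] = -2 * (3 - 7 + 17) = -26, and with the implicit +WIENER_FILT_STEP
+// on the central tap the effective kernel {3, -7, 17, 102, 17, -7, 3} sums
+// to 128, i.e. the filter preserves DC.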
+
+static int count_wiener_bits(WienerInfo *wiener_info,
+ WienerInfo *ref_wiener_info) {
+ int bits = 0;
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+ return bits;
+}
+
+static double search_wiener_uv(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ int partial_frame, int plane,
+ RestorationInfo *info, RestorationType *type,
+ YV12_BUFFER_CONFIG *dst_frame) {
+ WienerInfo *wiener_info = info->wiener_info;
+ AV1_COMMON *const cm = &cpi->common;
+ RestorationInfo *rsi = cpi->rst_search;
+ int64_t err;
+ int bits;
+ double cost_wiener, cost_norestore, cost_wiener_frame, cost_norestore_frame;
+ MACROBLOCK *x = &cpi->td.mb;
+ double M[WIENER_WIN2];
+ double H[WIENER_WIN2 * WIENER_WIN2];
+ double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN];
+ const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+ const int width = src->uv_crop_width;
+ const int height = src->uv_crop_height;
+ const int src_stride = src->uv_stride;
+ const int dgd_stride = dgd->uv_stride;
+ double score;
+ int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ const int ntiles =
+ av1_get_rest_ntiles(width, height, cm->rst_info[1].restoration_tilesize,
+ &tile_width, &tile_height, &nhtiles, &nvtiles);
+ WienerInfo ref_wiener_info;
+ set_default_wiener(&ref_wiener_info);
+ assert(width == dgd->uv_crop_width);
+ assert(height == dgd->uv_crop_height);
+
+ rsi[plane].frame_restoration_type = RESTORE_NONE;
+ err = sse_restoration_frame(cm, src, cm->frame_to_show, (1 << plane));
+ bits = 0;
+ cost_norestore_frame = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+
+ rsi[plane].frame_restoration_type = RESTORE_WIENER;
+
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ rsi[plane].restoration_type[tile_idx] = RESTORE_NONE;
+ }
+
+ // Compute best Wiener filters for each tile
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, width, height, 0, 0, &h_start, &h_end,
+ &v_start, &v_end);
+ err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start,
+ h_end - h_start, v_start, v_end - v_start,
+ 1 << plane);
+ // #bits when a tile is not restored
+ bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0);
+ cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ // best_tile_cost[tile_idx] = DBL_MAX;
+
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, width, height, WIENER_HALFWIN,
+ WIENER_HALFWIN, &h_start, &h_end, &v_start,
+ &v_end);
+ if (plane == AOM_PLANE_U) {
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ compute_stats_highbd(dgd->u_buffer, src->u_buffer, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ compute_stats(dgd->u_buffer, src->u_buffer, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H);
+ } else if (plane == AOM_PLANE_V) {
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ compute_stats_highbd(dgd->v_buffer, src->v_buffer, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ compute_stats(dgd->v_buffer, src->v_buffer, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H);
+ } else {
+ assert(0);
+ }
+
+ type[tile_idx] = RESTORE_WIENER;
+
+ if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) {
+ type[tile_idx] = RESTORE_NONE;
+ continue;
+ }
+ quantize_sym_filter(vfilterd, rsi[plane].wiener_info[tile_idx].vfilter);
+ quantize_sym_filter(hfilterd, rsi[plane].wiener_info[tile_idx].hfilter);
+
+ // The filter score computes the value of the function x'*H*x - 2*x'*M for
+ // the learned filter and compares it against the identity filter. If there
+ // is no reduction in the function value, the filter is reverted to identity.
+ score = compute_score(M, H, rsi[plane].wiener_info[tile_idx].vfilter,
+ rsi[plane].wiener_info[tile_idx].hfilter);
+ if (score > 0.0) {
+ type[tile_idx] = RESTORE_NONE;
+ continue;
+ }
+
+ rsi[plane].restoration_type[tile_idx] = RESTORE_WIENER;
+ err = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
+ tile_idx, 0, 0, dst_frame);
+ bits =
+ count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info)
+ << AV1_PROB_COST_SHIFT;
+ // bits = WIENER_FILT_BITS << AV1_PROB_COST_SHIFT;
+ bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1);
+ cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ if (cost_wiener >= cost_norestore) {
+ type[tile_idx] = RESTORE_NONE;
+ } else {
+ type[tile_idx] = RESTORE_WIENER;
+ memcpy(&wiener_info[tile_idx], &rsi[plane].wiener_info[tile_idx],
+ sizeof(wiener_info[tile_idx]));
+ memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx],
+ sizeof(ref_wiener_info));
+ }
+ rsi[plane].restoration_type[tile_idx] = RESTORE_NONE;
+ }
+ // Cost for Wiener filtering
+ set_default_wiener(&ref_wiener_info);
+ bits = 0;
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ bits +=
+ av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE);
+ memcpy(&rsi[plane].wiener_info[tile_idx], &wiener_info[tile_idx],
+ sizeof(wiener_info[tile_idx]));
+ if (type[tile_idx] == RESTORE_WIENER) {
+ bits +=
+ count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info)
+ << AV1_PROB_COST_SHIFT;
+ memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx],
+ sizeof(ref_wiener_info));
+ }
+ rsi[plane].restoration_type[tile_idx] = type[tile_idx];
+ }
+ err = try_restoration_frame(src, cpi, rsi, 1 << plane, partial_frame,
+ dst_frame);
+ cost_wiener_frame = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+
+ if (cost_wiener_frame < cost_norestore_frame) {
+ info->frame_restoration_type = RESTORE_WIENER;
+ } else {
+ info->frame_restoration_type = RESTORE_NONE;
+ }
+
+ return info->frame_restoration_type == RESTORE_WIENER ? cost_wiener_frame
+ : cost_norestore_frame;
+}
+
+static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ int partial_frame, RestorationInfo *info,
+ RestorationType *type, double *best_tile_cost,
+ YV12_BUFFER_CONFIG *dst_frame) {
+ WienerInfo *wiener_info = info->wiener_info;
+ AV1_COMMON *const cm = &cpi->common;
+ RestorationInfo *rsi = cpi->rst_search;
+ int64_t err;
+ int bits;
+ double cost_wiener, cost_norestore;
+ MACROBLOCK *x = &cpi->td.mb;
+ double M[WIENER_WIN2];
+ double H[WIENER_WIN2 * WIENER_WIN2];
+ double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN];
+ const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+ const int width = cm->width;
+ const int height = cm->height;
+ const int src_stride = src->y_stride;
+ const int dgd_stride = dgd->y_stride;
+ double score;
+ int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ const int ntiles =
+ av1_get_rest_ntiles(width, height, cm->rst_info[0].restoration_tilesize,
+ &tile_width, &tile_height, &nhtiles, &nvtiles);
+ WienerInfo ref_wiener_info;
+ set_default_wiener(&ref_wiener_info);
+
+ assert(width == dgd->y_crop_width);
+ assert(height == dgd->y_crop_height);
+ assert(width == src->y_crop_width);
+ assert(height == src->y_crop_height);
+
+ rsi->frame_restoration_type = RESTORE_WIENER;
+
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ rsi->restoration_type[tile_idx] = RESTORE_NONE;
+ }
+
+// Construct a (WIENER_HALFWIN)-pixel border around the frame
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ extend_frame_highbd(CONVERT_TO_SHORTPTR(dgd->y_buffer), width, height,
+ dgd_stride);
+ else
+#endif
+ extend_frame(dgd->y_buffer, width, height, dgd_stride);
+
+ // Compute best Wiener filters for each tile
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, width, height, 0, 0, &h_start, &h_end,
+ &v_start, &v_end);
+ err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start,
+ h_end - h_start, v_start, v_end - v_start, 1);
+ // #bits when a tile is not restored
+ bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0);
+ cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ best_tile_cost[tile_idx] = DBL_MAX;
+
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, width, height, 0, 0, &h_start, &h_end,
+ &v_start, &v_end);
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ compute_stats_highbd(dgd->y_buffer, src->y_buffer, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ compute_stats(dgd->y_buffer, src->y_buffer, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H);
+
+ type[tile_idx] = RESTORE_WIENER;
+
+ if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) {
+ type[tile_idx] = RESTORE_NONE;
+ continue;
+ }
+ quantize_sym_filter(vfilterd, rsi->wiener_info[tile_idx].vfilter);
+ quantize_sym_filter(hfilterd, rsi->wiener_info[tile_idx].hfilter);
+
+ // The filter score computes the value of the function x'*H*x - 2*x'*M for
+ // the learned filter and compares it against the identity filter. If there
+ // is no reduction in the function value, the filter is reverted to identity.
+ score = compute_score(M, H, rsi->wiener_info[tile_idx].vfilter,
+ rsi->wiener_info[tile_idx].hfilter);
+ if (score > 0.0) {
+ type[tile_idx] = RESTORE_NONE;
+ continue;
+ }
+
+ rsi->restoration_type[tile_idx] = RESTORE_WIENER;
+ err = try_restoration_tile(src, cpi, rsi, 1, partial_frame, tile_idx, 0, 0,
+ dst_frame);
+ bits = count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info)
+ << AV1_PROB_COST_SHIFT;
+ bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1);
+ cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ if (cost_wiener >= cost_norestore) {
+ type[tile_idx] = RESTORE_NONE;
+ } else {
+ type[tile_idx] = RESTORE_WIENER;
+ memcpy(&wiener_info[tile_idx], &rsi->wiener_info[tile_idx],
+ sizeof(wiener_info[tile_idx]));
+ memcpy(&ref_wiener_info, &rsi->wiener_info[tile_idx],
+ sizeof(ref_wiener_info));
+ bits = count_wiener_bits(&wiener_info[tile_idx], &ref_wiener_info)
+ << AV1_PROB_COST_SHIFT;
+ best_tile_cost[tile_idx] = err;
+ }
+ rsi->restoration_type[tile_idx] = RESTORE_NONE;
+ }
+ // Cost for Wiener filtering
+ set_default_wiener(&ref_wiener_info);
+ bits = frame_level_restore_bits[rsi->frame_restoration_type]
+ << AV1_PROB_COST_SHIFT;
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ bits +=
+ av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE);
+ memcpy(&rsi->wiener_info[tile_idx], &wiener_info[tile_idx],
+ sizeof(wiener_info[tile_idx]));
+ if (type[tile_idx] == RESTORE_WIENER) {
+ bits += count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info)
+ << AV1_PROB_COST_SHIFT;
+ memcpy(&ref_wiener_info, &rsi->wiener_info[tile_idx],
+ sizeof(ref_wiener_info));
+ }
+ rsi->restoration_type[tile_idx] = type[tile_idx];
+ }
+ err = try_restoration_frame(src, cpi, rsi, 1, partial_frame, dst_frame);
+ cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+
+ return cost_wiener;
+}
+
+static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ int partial_frame, RestorationInfo *info,
+ RestorationType *type, double *best_tile_cost,
+ YV12_BUFFER_CONFIG *dst_frame) {
+ double err, cost_norestore;
+ int bits;
+ MACROBLOCK *x = &cpi->td.mb;
+ AV1_COMMON *const cm = &cpi->common;
+ int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ const int ntiles = av1_get_rest_ntiles(
+ cm->width, cm->height, cm->rst_info[0].restoration_tilesize, &tile_width,
+ &tile_height, &nhtiles, &nvtiles);
+ (void)info;
+ (void)dst_frame;
+ (void)partial_frame;
+
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, cm->width, cm->height, 0, 0, &h_start,
+ &h_end, &v_start, &v_end);
+ err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start,
+ h_end - h_start, v_start, v_end - v_start, 1);
+ type[tile_idx] = RESTORE_NONE;
+ best_tile_cost[tile_idx] = err;
+ }
+ // RD cost associated with no restoration
+ err = sse_restoration_tile(src, cm->frame_to_show, cm, 0, cm->width, 0,
+ cm->height, 1);
+ bits = frame_level_restore_bits[RESTORE_NONE] << AV1_PROB_COST_SHIFT;
+ cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ return cost_norestore;
+}
+
+static double search_switchable_restoration(
+ AV1_COMP *cpi, int partial_frame, RestorationInfo *rsi,
+ double *tile_cost[RESTORE_SWITCHABLE_TYPES]) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *x = &cpi->td.mb;
+ double cost_switchable = 0;
+ int bits, tile_idx;
+ RestorationType r;
+ const int ntiles = av1_get_rest_ntiles(cm->width, cm->height,
+ cm->rst_info[0].restoration_tilesize,
+ NULL, NULL, NULL, NULL);
+ SgrprojInfo ref_sgrproj_info;
+ set_default_sgrproj(&ref_sgrproj_info);
+ WienerInfo ref_wiener_info;
+ set_default_wiener(&ref_wiener_info);
+ (void)partial_frame;
+
+ rsi->frame_restoration_type = RESTORE_SWITCHABLE;
+ bits = frame_level_restore_bits[rsi->frame_restoration_type]
+ << AV1_PROB_COST_SHIFT;
+ cost_switchable = RDCOST_DBL(x->rdmult, x->rddiv, bits >> 4, 0);
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ double best_cost = RDCOST_DBL(
+ x->rdmult, x->rddiv, (cpi->switchable_restore_cost[RESTORE_NONE] >> 4),
+ tile_cost[RESTORE_NONE][tile_idx]);
+ rsi->restoration_type[tile_idx] = RESTORE_NONE;
+ for (r = 1; r < RESTORE_SWITCHABLE_TYPES; r++) {
+ if (force_restore_type != 0 && r != force_restore_type) continue;
+ int tilebits = 0;
+ if (r == RESTORE_WIENER)
+ tilebits +=
+ count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info);
+ else if (r == RESTORE_SGRPROJ)
+ tilebits +=
+ count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info);
+ tilebits <<= AV1_PROB_COST_SHIFT;
+ tilebits += cpi->switchable_restore_cost[r];
+ double cost = RDCOST_DBL(x->rdmult, x->rddiv, tilebits >> 4,
+ tile_cost[r][tile_idx]);
+
+ if (cost < best_cost) {
+ rsi->restoration_type[tile_idx] = r;
+ best_cost = cost;
+ }
+ }
+ if (rsi->restoration_type[tile_idx] == RESTORE_WIENER)
+ memcpy(&ref_wiener_info, &rsi->wiener_info[tile_idx],
+ sizeof(ref_wiener_info));
+ else if (rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ)
+ memcpy(&ref_sgrproj_info, &rsi->sgrproj_info[tile_idx],
+ sizeof(ref_sgrproj_info));
+ if (force_restore_type != 0)
+ assert(rsi->restoration_type[tile_idx] == force_restore_type ||
+ rsi->restoration_type[tile_idx] == RESTORE_NONE);
+ cost_switchable += best_cost;
+ }
+ return cost_switchable;
+}
+
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ LPF_PICK_METHOD method) {
+ static search_restore_type search_restore_fun[RESTORE_SWITCHABLE_TYPES] = {
+ search_norestore, search_wiener, search_sgrproj,
+ };
+ AV1_COMMON *const cm = &cpi->common;
+ double cost_restore[RESTORE_TYPES];
+ double *tile_cost[RESTORE_SWITCHABLE_TYPES];
+ RestorationType *restore_types[RESTORE_SWITCHABLE_TYPES];
+ double best_cost_restore;
+ RestorationType r, best_restore;
+
+ const int ntiles = av1_get_rest_ntiles(cm->width, cm->height,
+ cm->rst_info[0].restoration_tilesize,
+ NULL, NULL, NULL, NULL);
+
+ for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) {
+ tile_cost[r] = (double *)aom_malloc(sizeof(*tile_cost[0]) * ntiles);
+ restore_types[r] =
+ (RestorationType *)aom_malloc(sizeof(*restore_types[0]) * ntiles);
+ }
+
+ for (r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
+ if (force_restore_type != 0 && r != RESTORE_NONE &&
+ r != force_restore_type)
+ continue;
+ cost_restore[r] = search_restore_fun[r](
+ src, cpi, method == LPF_PICK_FROM_SUBIMAGE, &cm->rst_info[0],
+ restore_types[r], tile_cost[r], &cpi->trial_frame_rst);
+ }
+ cost_restore[RESTORE_SWITCHABLE] = search_switchable_restoration(
+ cpi, method == LPF_PICK_FROM_SUBIMAGE, &cm->rst_info[0], tile_cost);
+
+ best_cost_restore = DBL_MAX;
+ best_restore = RESTORE_NONE;
+ for (r = 0; r < RESTORE_TYPES; ++r) {
+ if (force_restore_type != 0 && r != RESTORE_NONE &&
+ r != force_restore_type)
+ continue;
+ if (cost_restore[r] < best_cost_restore) {
+ best_restore = r;
+ best_cost_restore = cost_restore[r];
+ }
+ }
+ cm->rst_info[0].frame_restoration_type = best_restore;
+ if (force_restore_type != 0)
+ assert(best_restore == force_restore_type || best_restore == RESTORE_NONE);
+ if (best_restore != RESTORE_SWITCHABLE) {
+ memcpy(cm->rst_info[0].restoration_type, restore_types[best_restore],
+ ntiles * sizeof(restore_types[best_restore][0]));
+ }
+
+ // Color components
+ search_wiener_uv(src, cpi, method == LPF_PICK_FROM_SUBIMAGE, AOM_PLANE_U,
+ &cm->rst_info[AOM_PLANE_U],
+ cm->rst_info[AOM_PLANE_U].restoration_type,
+ &cpi->trial_frame_rst);
+ search_wiener_uv(src, cpi, method == LPF_PICK_FROM_SUBIMAGE, AOM_PLANE_V,
+ &cm->rst_info[AOM_PLANE_V],
+ cm->rst_info[AOM_PLANE_V].restoration_type,
+ &cpi->trial_frame_rst);
+ /*
+ printf("Frame %d/%d restore types: %d %d %d\n",
+ cm->current_video_frame, cm->show_frame,
+ cm->rst_info[0].frame_restoration_type,
+ cm->rst_info[1].frame_restoration_type,
+ cm->rst_info[2].frame_restoration_type);
+ printf("Frame %d/%d frame_restore_type %d : %f %f %f %f\n",
+ cm->current_video_frame, cm->show_frame,
+ cm->rst_info[0].frame_restoration_type, cost_restore[0],
+ cost_restore[1], cost_restore[2], cost_restore[3]);
+ */
+
+ for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) {
+ aom_free(tile_cost[r]);
+ aom_free(restore_types[r]);
+ }
+}
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
new file mode 100644
index 000000000..f6096ed1d
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_ENCODER_PICKRST_H_
+#define AV1_ENCODER_PICKRST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ LPF_PICK_METHOD method);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_PICKRST_H_
diff --git a/third_party/aom/av1/encoder/pvq_encoder.c b/third_party/aom/av1/encoder/pvq_encoder.c
new file mode 100644
index 000000000..ab63f1b7d
--- /dev/null
+++ b/third_party/aom/av1/encoder/pvq_encoder.c
@@ -0,0 +1,988 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "aom_dsp/entcode.h"
+#include "aom_dsp/entenc.h"
+#include "av1/common/blockd.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/partition.h"
+#include "av1/common/pvq_state.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/pvq_encoder.h"
+#include "aom_ports/system_state.h"
+
+/*Shift to ensure that the upper bound (i.e. for the max blocksize) of the
+ dot-product of the 1st band of chroma with the luma ref doesn't overflow.*/
+#define OD_CFL_FLIP_SHIFT (OD_LIMIT_BSIZE_MAX + 0)
+
+void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf,
+ int nsymbs) {
+ if (cdf[0] == 0)
+ aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs));
+ aom_write_symbol(w, symb, cdf, nsymbs);
+}
+
+static void aom_encode_pvq_codeword(aom_writer *w, od_pvq_codeword_ctx *adapt,
+ const od_coeff *in, int n, int k) {
+ int i;
+ aom_encode_band_pvq_splits(w, adapt, in, n, k, 0);
+ for (i = 0; i < n; i++) if (in[i]) aom_write_bit(w, in[i] < 0);
+}
+
+/* Computes 1/sqrt(i) using a table for small values. */
+static double od_rsqrt_table(int i) {
+ static double table[16] = {
+ 1.000000, 0.707107, 0.577350, 0.500000,
+ 0.447214, 0.408248, 0.377964, 0.353553,
+ 0.333333, 0.316228, 0.301511, 0.288675,
+ 0.277350, 0.267261, 0.258199, 0.250000};
+ if (i <= 16) return table[i-1];
+ else return 1./sqrt(i);
+}
+
+/*Computes 1/sqrt(start+2*i+1) using a lookup table containing the results
+ where 0 <= i < table_size.*/
+static double od_custom_rsqrt_dynamic_table(const double* table,
+ const int table_size, const double start, const int i) {
+ if (i < table_size) return table[i];
+ else return od_rsqrt_table((int)(start + 2*i + 1));
+}
+
+/*Fills tables used in od_custom_rsqrt_dynamic_table for a given start.*/
+static void od_fill_dynamic_rsqrt_table(double *table, const int table_size,
+ const double start) {
+ int i;
+ for (i = 0; i < table_size; i++)
+ table[i] = od_rsqrt_table((int)(start + 2*i + 1));
+}
+
+/** Find the codepoint on the given PSphere closest to the desired
+ * vector. Double-precision PVQ search just to make sure our tests
+ * aren't limited by numerical accuracy.
+ *
+ * @param [in] xcoeff input vector to quantize (x in the math doc)
+ * @param [in] n number of dimensions
+ * @param [in] k number of pulses
+ * @param [out] ypulse optimal codevector found (y in the math doc)
+ * @param [out] g2 multiplier for the distortion (typically squared
+ * gain units)
+ * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
+ * @param [in] prev_k number of pulses already in ypulse that we should
+ * reuse for the search (or 0 for a new search)
+ * @return cosine distance between x and y (between 0 and 1)
+ */
+double pvq_search_rdo_double_c(const od_val16 *xcoeff, int n, int k,
+ od_coeff *ypulse, double g2, double pvq_norm_lambda, int prev_k) {
+ int i, j;
+ double xy;
+ double yy;
+ /* TODO - This blows our 8kB stack space budget and should be fixed when
+ converting PVQ to fixed point. */
+ double x[MAXN];
+ double xx;
+ double lambda;
+ double norm_1;
+ int rdo_pulses;
+ double delta_rate;
+ xx = xy = yy = 0;
+ for (j = 0; j < n; j++) {
+ x[j] = fabs((float)xcoeff[j]);
+ xx += x[j]*x[j];
+ }
+ norm_1 = 1./sqrt(1e-30 + xx);
+ lambda = pvq_norm_lambda/(1e-30 + g2);
+ i = 0;
+ if (prev_k > 0 && prev_k <= k) {
+ /* We reuse pulses from a previous search so we don't have to search them
+ again. */
+ for (j = 0; j < n; j++) {
+ ypulse[j] = abs(ypulse[j]);
+ xy += x[j]*ypulse[j];
+ yy += ypulse[j]*ypulse[j];
+ i += ypulse[j];
+ }
+ }
+ else if (k > 2) {
+ double l1_norm;
+ double l1_inv;
+ l1_norm = 0;
+ for (j = 0; j < n; j++) l1_norm += x[j];
+ l1_inv = 1./OD_MAXF(l1_norm, 1e-100);
+ for (j = 0; j < n; j++) {
+ double tmp;
+ tmp = k*x[j]*l1_inv;
+ ypulse[j] = OD_MAXI(0, (int)floor(tmp));
+ xy += x[j]*ypulse[j];
+ yy += ypulse[j]*ypulse[j];
+ i += ypulse[j];
+ }
+ }
+ else OD_CLEAR(ypulse, n);
+
+ /* Only use RDO on the last few pulses. This not only saves CPU, but using
+ RDO on all pulses actually makes the results worse for reasons I don't
+ fully understand. */
+ rdo_pulses = 1 + k/4;
+ /* Rough assumption for now, the last position costs about 3 bits more than
+ the first. */
+ delta_rate = 3./n;
+ /* Search one pulse at a time */
+ for (; i < k - rdo_pulses; i++) {
+ int pos;
+ double best_xy;
+ double best_yy;
+ pos = 0;
+ best_xy = -10;
+ best_yy = 1;
+ for (j = 0; j < n; j++) {
+ double tmp_xy;
+ double tmp_yy;
+ tmp_xy = xy + x[j];
+ tmp_yy = yy + 2*ypulse[j] + 1;
+ tmp_xy *= tmp_xy;
+ if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) {
+ best_xy = tmp_xy;
+ best_yy = tmp_yy;
+ pos = j;
+ }
+ }
+ xy = xy + x[pos];
+ yy = yy + 2*ypulse[pos] + 1;
+ ypulse[pos]++;
+ }
+ /* Search the last pulses with RDO. Distortion is D = (x-y)^2 =
+ x^2 - 2*x*y + y^2; for normalized x and y this reduces to maximizing
+ x*y, combined here with a lambda*rate term. Since x and y aren't
+ normalized, we divide by sqrt(x^2)*sqrt(y^2). */
+ for (; i < k; i++) {
+ double rsqrt_table[4];
+ int rsqrt_table_size = 4;
+ int pos;
+ double best_cost;
+ pos = 0;
+ best_cost = -1e5;
+ /*Fill the small rsqrt lookup table with inputs relative to yy.
+ Specifically, the table of n values is filled with
+ rsqrt(yy + 1), rsqrt(yy + 2 + 1) .. rsqrt(yy + 2*(n-1) + 1).*/
+ od_fill_dynamic_rsqrt_table(rsqrt_table, rsqrt_table_size, yy);
+ for (j = 0; j < n; j++) {
+ double tmp_xy;
+ double tmp_yy;
+ tmp_xy = xy + x[j];
+ /*Calculate rsqrt(yy + 2*ypulse[j] + 1) using an optimized method.*/
+ tmp_yy = od_custom_rsqrt_dynamic_table(rsqrt_table, rsqrt_table_size,
+ yy, ypulse[j]);
+ tmp_xy = 2*tmp_xy*norm_1*tmp_yy - lambda*j*delta_rate;
+ if (j == 0 || tmp_xy > best_cost) {
+ best_cost = tmp_xy;
+ pos = j;
+ }
+ }
+ xy = xy + x[pos];
+ yy = yy + 2*ypulse[pos] + 1;
+ ypulse[pos]++;
+ }
+ for (i = 0; i < n; i++) {
+ if (xcoeff[i] < 0) ypulse[i] = -ypulse[i];
+ }
+ return xy/(1e-100 + sqrt(xx*yy));
+}
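+/* Note on the greedy loop above: adding a pulse at position j changes the
+   correlation to xy + x[j] and the energy to yy + 2*ypulse[j] + 1, so
+   choosing the j that maximizes (xy + x[j])^2 / (yy + 2*ypulse[j] + 1)
+   greedily maximizes the squared cosine similarity after each pulse. The
+   RDO loop then repeats the same search with a lambda*rate penalty added. */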
+
+/** Encodes the gain so that the return value increases with the
+ * distance |x-ref|, so that we can encode a zero when x=ref. The
+ * value x=0 is not covered because it is only allowed in the noref
+ * case.
+ *
+ * @param [in] x quantized gain to encode
+ * @param [in] ref quantized gain of the reference
+ * @return interleave-encoded quantized gain value
+ */
+static int neg_interleave(int x, int ref) {
+ if (x < ref) return -2*(x - ref) - 1;
+ else if (x < 2*ref) return 2*(x - ref);
+ else return x-1;
+}
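+/* Worked example (illustrative): with ref = 3 the mapping is 3 -> 0, 2 -> 1,
+   4 -> 2, 1 -> 3, 5 -> 4, and any x >= 2*ref maps to x - 1, so return values
+   grow with |x - ref| and x == ref codes as 0. */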
+
+int od_vector_is_null(const od_coeff *x, int len) {
+ int i;
+ for (i = 0; i < len; i++) if (x[i]) return 0;
+ return 1;
+}
+
+static double od_pvq_rate(int qg, int icgr, int theta, int ts,
+ const od_adapt_ctx *adapt, const od_coeff *y0, int k, int n, int speed) {
+ double rate;
+ if (k == 0) rate = 0;
+ else if (speed > 0) {
+ int i;
+ int sum;
+ double f;
+ /* Compute "center of mass" of the pulse vector. */
+ sum = 0;
+ for (i = 0; i < n - (theta != -1); i++) sum += i*abs(y0[i]);
+ f = sum/(double)(k*n);
+ /* Estimates the number of bits it will cost to encode K pulses in
+ N dimensions, based on a hand-tuned fit for bitrate vs K, N and the
+ "center of mass". */
+ rate = (1 + .4*f)*n*OD_LOG2(1 + OD_MAXF(0, log(n*2*(1*f + .025))*k/n)) + 3;
+ }
+ else {
+ aom_writer w;
+ od_pvq_codeword_ctx cd;
+ int tell;
+#if CONFIG_DAALA_EC
+ od_ec_enc_init(&w.ec, 1000);
+#else
+# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ OD_COPY(&cd, &adapt->pvq.pvq_codeword_ctx, 1);
+#if CONFIG_DAALA_EC
+ tell = od_ec_enc_tell_frac(&w.ec);
+#else
+# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ aom_encode_pvq_codeword(&w, &cd, y0, n - (theta != -1), k);
+#if CONFIG_DAALA_EC
+ rate = (od_ec_enc_tell_frac(&w.ec)-tell)/8.;
+ od_ec_enc_clear(&w.ec);
+#else
+# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ }
+ if (qg > 0 && theta >= 0) {
+ /* Approximate cost of entropy-coding theta */
+ rate += .9*OD_LOG2(ts);
+ if (qg == icgr) rate -= .5;
+ }
+ return rate;
+}
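+/* Note on the exact (speed <= 0) path above: the codeword is written into a
+   scratch entropy coder and the rate is measured as the change in
+   od_ec_enc_tell_frac(), which reports the coder position in 1/8-bit units;
+   dividing by 8. converts the difference to bits. The speed > 0 path trades
+   this accuracy for a cheap closed-form estimate. */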
+
+#define MAX_PVQ_ITEMS (20)
+/* This stores the information about a PVQ search candidate, so we can sort
+ based on K. */
+typedef struct {
+ int gain;
+ int k;
+ od_val32 qtheta;
+ int theta;
+ int ts;
+ od_val32 qcg;
+} pvq_search_item;
+
+int items_compare(pvq_search_item *a, pvq_search_item *b) {
+ /* Break ties in K with gain to ensure a stable sort.
+ Otherwise, the order depends on qsort implementation. */
+ return a->k == b->k ? a->gain - b->gain : a->k - b->k;
+}
+
+/** Perform PVQ quantization with prediction, trying several
+ * possible gains and angles. See draft-valin-videocodec-pvq and
+ * http://jmvalin.ca/slides/pvq.pdf for more details.
+ *
+ * @param [out] out coefficients after quantization
+ * @param [in] x0 coefficients before quantization
+ * @param [in] r0 reference, aka predicted coefficients
+ * @param [in] n number of dimensions
+ * @param [in] q0 quantization step size
+ * @param [out] y pulse vector (i.e. selected PVQ codevector)
+ * @param [out] itheta angle between input and reference (-1 if noref)
+ * @param [out] vk total number of pulses
+ * @param [in] beta per-band activity masking beta param
+ * @param [out] skip_diff distortion cost of skipping this block
+ * (accumulated)
+ * @param [in] is_keyframe whether we're encoding a keyframe
+ * @param [in] pli plane index
+ * @param [in] adapt probability adaptation context
+ * @param [in] qm QM with magnitude compensation
+ * @param [in] qm_inv Inverse of QM with magnitude compensation
+ * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
+ * @param [in] speed Make search faster by making approximations
+ * @return gain index of the quantized gain
+*/
+static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0,
+ int n, int q0, od_coeff *y, int *itheta, int *vk,
+ od_val16 beta, double *skip_diff, int is_keyframe, int pli,
+ const od_adapt_ctx *adapt, const int16_t *qm, const int16_t *qm_inv,
+ double pvq_norm_lambda, int speed) {
+ od_val32 g;
+ od_val32 gr;
+ od_coeff y_tmp[MAXN + 3];
+ int i;
+ /* Number of pulses. */
+ int k;
+ /* Companded gain of x and reference, normalized to q. */
+ od_val32 cg;
+ od_val32 cgr;
+ int icgr;
+ int qg;
+ /* Best RDO cost (D + lamdba*R) so far. */
+ double best_cost;
+ double dist0;
+ /* Distortion (D) that corresponds to the best RDO cost. */
+ double best_dist;
+ double dist;
+ /* Sign of Householder reflection. */
+ int s;
+ /* Dimension on which Householder reflects. */
+ int m;
+ od_val32 theta;
+ double corr;
+ int best_k;
+ od_val32 best_qtheta;
+ od_val32 gain_offset;
+ int noref;
+ double skip_dist;
+ int cfl_enabled;
+ int skip;
+ double gain_weight;
+ od_val16 x16[MAXN];
+ od_val16 r16[MAXN];
+ int xshift;
+ int rshift;
+ /* Give more weight to gain error when calculating the total distortion. */
+ gain_weight = 1.0;
+ OD_ASSERT(n > 1);
+ corr = 0;
+#if !defined(OD_FLOAT_PVQ)
+ /* Shift needed to make x fit in 16 bits even after rotation.
+ This shift value is not normative (it can be changed without breaking
+ the bitstream) */
+ xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15);
+ /* Shift needed to make the reference fit in 15 bits, so that the Householder
+ vector can fit in 16 bits.
+ This shift value *is* normative, and has to match the decoder. */
+ rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14);
+#else
+ xshift = 0;
+ rshift = 0;
+#endif
+ for (i = 0; i < n; i++) {
+#if defined(OD_FLOAT_PVQ)
+ /*This is slightly different from the original float PVQ code,
+ where the qm was applied in the accumulation in od_pvq_compute_gain and
+ the vectors were od_coeffs, not od_val16 (i.e. double).*/
+ x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1;
+ r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1;
+#else
+ x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift);
+ r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift);
+#endif
+ corr += OD_MULT16_16(x16[i], r16[i]);
+ }
+ cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL;
+ cg = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift);
+ cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift);
+ if (cfl_enabled) cgr = OD_CGAIN_SCALE;
+  /* gain_offset is meant to ensure that one of the quantized gains is
+     exactly equal to the reference gain. */
+#if defined(OD_FLOAT_PVQ)
+ icgr = (int)floor(.5 + cgr);
+#else
+ icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
+#endif
+ gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
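+  /* For example, if cgr = 2.3 (in unscaled units) then icgr = 2 and
+     gain_offset = 0.3, so the candidate quantized gains 1.3, 2.3, 3.3, ...
+     hit the reference gain exactly at i = icgr. */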
+ /* Start search with null case: gain=0, no pulse. */
+ qg = 0;
+ dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
+ best_dist = dist;
+ best_cost = dist + pvq_norm_lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0,
+ n, speed);
+ noref = 1;
+ best_k = 0;
+ *itheta = -1;
+ OD_CLEAR(y, n);
+ best_qtheta = 0;
+ m = 0;
+ s = 1;
+ corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift));
+ corr = OD_MAXF(OD_MINF(corr, 1.), -1.);
+ if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
+ else {
+ skip_dist = gain_weight*(cg - cgr)*(cg - cgr)
+ + cgr*(double)cg*(2 - 2*corr);
+ skip_dist *= OD_CGAIN_SCALE_2;
+ }
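+  /* The inter-frame expression is the usual PVQ distance decomposition:
+     for gains g1, g2 with angle theta between the vectors,
+     |x - r|^2 = g1^2 + g2^2 - 2*g1*g2*cos(theta)
+               = (g1 - g2)^2 + g1*g2*(2 - 2*cos(theta)),
+     with corr standing in for cos(theta) here. */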
+ if (!is_keyframe) {
+ /* noref, gain=0 isn't allowed, but skip is allowed. */
+ od_val32 scgr;
+    scgr = OD_MAXF(0, gain_offset);
+ if (icgr == 0) {
+ best_dist = gain_weight*(cg - scgr)*(cg - scgr)
+ + scgr*(double)cg*(2 - 2*corr);
+ best_dist *= OD_CGAIN_SCALE_2;
+ }
+ best_cost = best_dist + pvq_norm_lambda*od_pvq_rate(0, icgr, 0, 0, adapt,
+ NULL, 0, n, speed);
+ best_qtheta = 0;
+ *itheta = 0;
+ noref = 0;
+ }
+ dist0 = best_dist;
+ if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) {
+ od_val16 xr[MAXN];
+ int gain_bound;
+ int prev_k;
+ pvq_search_item items[MAX_PVQ_ITEMS];
+ int idx;
+ int nitems;
+ double cos_dist;
+ idx = 0;
+ gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT);
+ /* Perform theta search only if prediction is useful. */
+ theta = OD_ROUND32(OD_THETA_SCALE*acos(corr));
+ m = od_compute_householder(r16, n, gr, &s, rshift);
+ od_apply_householder(xr, x16, r16, n);
+ prev_k = 0;
+ for (i = m; i < n - 1; i++) xr[i] = xr[i + 1];
+ /* Compute all candidate PVQ searches within a reasonable range of gain
+ and theta. */
+ for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) {
+ int j;
+ od_val32 qcg;
+ int ts;
+ int theta_lower;
+ int theta_upper;
+ /* Quantized companded gain */
+ qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset;
+      /* Set the angular resolution to match the encoded gain */
+ ts = od_pvq_compute_max_theta(qcg, beta);
+ theta_lower = OD_MAXI(0, (int)floor(.5 +
+ theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2);
+ theta_upper = OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts));
+ /* Include the angles within a reasonable range. */
+ for (j = theta_lower; j <= theta_upper; j++) {
+ od_val32 qtheta;
+ qtheta = od_pvq_compute_theta(j, ts);
+ k = od_pvq_compute_k(qcg, j, 0, n, beta);
+ items[idx].gain = i;
+ items[idx].theta = j;
+ items[idx].k = k;
+ items[idx].qcg = qcg;
+ items[idx].qtheta = qtheta;
+ items[idx].ts = ts;
+ idx++;
+ OD_ASSERT(idx < MAX_PVQ_ITEMS);
+ }
+ }
+ nitems = idx;
+ cos_dist = 0;
+ /* Sort PVQ search candidates in ascending order of pulses K so that
+ we can reuse all the previously searched pulses across searches. */
+ qsort(items, nitems, sizeof(items[0]),
+ (int (*)(const void *, const void *))items_compare);
+ /* Search for the best gain/theta in order. */
+ for (idx = 0; idx < nitems; idx++) {
+ int j;
+ od_val32 qcg;
+ int ts;
+ double cost;
+ double dist_theta;
+ double sin_prod;
+ od_val32 qtheta;
+ /* Quantized companded gain */
+ qcg = items[idx].qcg;
+ i = items[idx].gain;
+ j = items[idx].theta;
+      /* Set the angular resolution to match the encoded gain */
+ ts = items[idx].ts;
+ /* Search for the best angle within a reasonable range. */
+ qtheta = items[idx].qtheta;
+ k = items[idx].k;
+      /* Compute a lower bound on the distortion by leaving the PVQ
+         cos_dist term out. */
+ dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1;
+ dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
+ dist *= OD_CGAIN_SCALE_2;
+ /* If we have no hope of beating skip (including a 1-bit worst-case
+ penalty), stop now. */
+ if (dist > dist0 + 1.0*pvq_norm_lambda && k != 0) continue;
+ sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)*
+ OD_TRIG_SCALE_1;
+ /* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since
+ that's the factor by which cos_dist is multiplied to get the
+ distortion metric. */
+ if (k == 0) {
+ cos_dist = 0;
+ OD_CLEAR(y_tmp, n-1);
+ }
+ else if (k != prev_k) {
+ cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp,
+ qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
+ }
+ prev_k = k;
+ /* See Jmspeex' Journal of Dubious Theoretical Results. */
+ dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1
+ + sin_prod*(2 - 2*cos_dist);
+ dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
+ dist *= OD_CGAIN_SCALE_2;
+ /* Do approximate RDO. */
+ cost = dist + pvq_norm_lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp,
+ k, n, speed);
+ if (cost < best_cost) {
+ best_cost = cost;
+ best_dist = dist;
+ qg = i;
+ best_k = k;
+ best_qtheta = qtheta;
+ *itheta = j;
+ noref = 0;
+ OD_COPY(y, y_tmp, n - 1);
+ }
+ }
+ }
+ /* Don't bother with no-reference version if there's a reasonable
+ correlation. */
+ if (n <= OD_MAX_PVQ_SIZE && (corr < .5
+ || cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) {
+ int gain_bound;
+ int prev_k;
+ gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT);
+ prev_k = 0;
+ /* Search for the best gain (haven't determined reasonable range yet). */
+ for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) {
+ double cos_dist;
+ double cost;
+ od_val32 qcg;
+ qcg = OD_SHL(i, OD_CGAIN_SHIFT);
+ k = od_pvq_compute_k(qcg, -1, 1, n, beta);
+      /* Compute a lower bound on the distortion by leaving the PVQ
+         cos_dist term out. */
+ dist = gain_weight*(qcg - cg)*(qcg - cg);
+ dist *= OD_CGAIN_SCALE_2;
+ if (dist > dist0 && k != 0) continue;
+ cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp,
+ qcg*(double)cg*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
+ prev_k = k;
+ /* See Jmspeex' Journal of Dubious Theoretical Results. */
+ dist = gain_weight*(qcg - cg)*(qcg - cg)
+ + qcg*(double)cg*(2 - 2*cos_dist);
+ dist *= OD_CGAIN_SCALE_2;
+ /* Do approximate RDO. */
+ cost = dist + pvq_norm_lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k,
+ n, speed);
+ if (cost <= best_cost) {
+ best_cost = cost;
+ best_dist = dist;
+ qg = i;
+ noref = 1;
+ best_k = k;
+ *itheta = -1;
+ OD_COPY(y, y_tmp, n);
+ }
+ }
+ }
+ k = best_k;
+ theta = best_qtheta;
+ skip = 0;
+ if (noref) {
+ if (qg == 0) skip = OD_PVQ_SKIP_ZERO;
+ }
+ else {
+ if (!is_keyframe && qg == 0) {
+ skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
+ }
+ if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY;
+ }
+ /* Synthesize like the decoder would. */
+ if (skip) {
+ if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n);
+ else OD_CLEAR(out, n);
+ }
+ else {
+ if (noref) gain_offset = 0;
+ g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta);
+ od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s,
+ qm_inv);
+ }
+ *vk = k;
+ *skip_diff += skip_dist - best_dist;
+ /* Encode gain differently depending on whether we use prediction or not.
+ Special encoding on inter frames where qg=0 is allowed for noref=0
+ but not noref=1.*/
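+  /* E.g. on an inter frame with a reference, qg is shifted up by one before
+     interleaving so that qg=0 still gets the shortest code: with icgr=0,
+     neg_interleave(qg + 1, 1) maps qg=0 -> 0, qg=1 -> 1, qg=2 -> 2. In the
+     noref case qg >= 1, so qg - 1 keeps the codes starting at 0. */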
+ if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr);
+ else return noref ? qg - 1 : neg_interleave(qg + 1, icgr + 1);
+}
+
+/** Encodes a single vector of integers (e.g., a partition within a
+ * coefficient block) using PVQ
+ *
+ * @param [in,out] w multi-symbol entropy encoder
+ * @param [in] qg quantized gain
+ * @param [in] theta quantized post-prediction theta
+ * @param [in] in coefficient vector to code
+ * @param [in] n number of coefficients in partition
+ * @param [in] k number of pulses in partition
+ * @param [in,out] model entropy encoder state
+ * @param [in,out] adapt adaptation context
+ * @param [in,out] exg ExQ16 expectation of gain value
+ * @param [in,out] ext ExQ16 expectation of theta value
+ * @param [in] cdf_ctx selects which cdf context to use
+ * @param [in] is_keyframe whether we're encoding a keyframe
+ * @param [in] code_skip whether the "skip rest" flag is allowed
+ * @param [in] skip_rest when set, we skip all higher bands
+ * @param [in] encode_flip whether we need to encode the CfL flip flag now
+ * @param [in] flip value of the CfL flip flag
+ */
+void pvq_encode_partition(aom_writer *w,
+ int qg,
+ int theta,
+ const od_coeff *in,
+ int n,
+ int k,
+ generic_encoder model[3],
+ od_adapt_ctx *adapt,
+ int *exg,
+ int *ext,
+ int cdf_ctx,
+ int is_keyframe,
+ int code_skip,
+ int skip_rest,
+ int encode_flip,
+ int flip) {
+ int noref;
+ int id;
+ noref = (theta == -1);
+  id = (qg > 0) + 2*OD_MINI(theta + 1, 3) + 8*code_skip*skip_rest;
+ if (is_keyframe) {
+ OD_ASSERT(id != 8);
+ if (id >= 8) id--;
+ }
+ else {
+ OD_ASSERT(id != 10);
+ if (id >= 10) id--;
+ }
+ /* Jointly code gain, theta and noref for small values. Then we handle
+ larger gain and theta values. For noref, theta = -1. */
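+  /* For illustration: theta=-1 (noref) contributes 0 to id, theta=0
+     contributes 2, theta=1 contributes 4 and theta>=2 contributes 6, so ids
+     0..7 cover (qg > 0) x {noref, theta=0, theta=1, theta>=2}. Setting
+     "skip rest" adds 8; one impossible combination is squeezed out just
+     below, leaving the 8 + 7*code_skip symbol alphabet used here. */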
+ aom_write_symbol_pvq(w, id, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
+ 8 + 7*code_skip);
+ if (encode_flip) {
+ /* We could eventually do some smarter entropy coding here, but it would
+ have to be good enough to overcome the overhead of the entropy coder.
+       An early attempt using a "toggle" flag with simple adaptation wasn't
+ worth the trouble. */
+ aom_write_bit(w, flip);
+ }
+ if (qg > 0) {
+ int tmp;
+ tmp = *exg;
+ generic_encode(w, &model[!noref], qg - 1, &tmp, 2);
+ OD_IIR_DIADIC(*exg, qg << 16, 2);
+ }
+ if (theta > 1) {
+ int tmp;
+ tmp = *ext;
+ generic_encode(w, &model[2], theta - 2, &tmp, 2);
+ OD_IIR_DIADIC(*ext, theta << 16, 2);
+ }
+ aom_encode_pvq_codeword(w, &adapt->pvq.pvq_codeword_ctx, in,
+ n - (theta != -1), k);
+}
+
+/** Quantizes a scalar with rate-distortion optimization (RDO)
+ * @param [in] x unquantized value
+ * @param [in] q quantization step size
+ * @param [in] delta0 rate increase for encoding a 1 instead of a 0
+ * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
+ * @retval quantized value
+ */
+int od_rdo_quant(od_coeff x, int q, double delta0, double pvq_norm_lambda) {
+ int n;
+ /* Optimal quantization threshold is 1/2 + lambda*delta_rate/2. See
+ Jmspeex' Journal of Dubious Theoretical Results for details. */
+ n = OD_DIV_R0(abs(x), q);
+ if ((double)abs(x)/q < (double)n/2 + pvq_norm_lambda*delta0/(2*n)) {
+ return 0;
+ }
+ else {
+ return OD_DIV_R0(x, q);
+ }
+}
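+
+/* Worked example: with n = 1 (|x| rounds to one quantization step) and
+   pvq_norm_lambda*delta0 = 0.5, the threshold becomes 1/2 + 0.5/2 = 0.75,
+   i.e. values up to 0.75*q are quantized to zero instead of the usual
+   0.5*q, widening the dead zone in proportion to the rate cost. */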
+
+/** Encode a coefficient block (excluding DC) using PVQ
+ *
+ * @param [in,out] enc daala encoder context
+ * @param [in] ref 'reference' (prediction) vector
+ * @param [in] in coefficient block to quantize and encode
+ * @param [out] out quantized coefficient block
+ * @param [in]    q_dc    scale/quantizer for DC
+ * @param [in]    q_ac    scale/quantizer for AC
+ * @param [in] pli plane index
+ * @param [in] bs log of the block size minus two
+ * @param [in] beta per-band activity masking beta param
+ * @param [in] is_keyframe whether we're encoding a keyframe
+ * @param [in] qm QM with magnitude compensation
+ * @param [in] qm_inv Inverse of QM with magnitude compensation
+ * @param [in] speed Make search faster by making approximations
+ * @param [in]    pvq_info  If null, we are in RDO search mode
+ * @return Block skip info indicating whether DC/AC are coded.
+ * bit0: DC is coded, bit1: AC is coded (1 means coded)
+ *
+ */
+PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc,
+ od_coeff *ref,
+ const od_coeff *in,
+ od_coeff *out,
+ int q_dc,
+ int q_ac,
+ int pli,
+ int bs,
+ const od_val16 *beta,
+ int is_keyframe,
+ const int16_t *qm,
+ const int16_t *qm_inv,
+ int speed,
+ PVQ_INFO *pvq_info){
+ int theta[PVQ_MAX_PARTITIONS];
+ int qg[PVQ_MAX_PARTITIONS];
+ int k[PVQ_MAX_PARTITIONS];
+ od_coeff y[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
+ int *exg;
+ int *ext;
+ int nb_bands;
+ int i;
+ const int *off;
+ int size[PVQ_MAX_PARTITIONS];
+ generic_encoder *model;
+ double skip_diff;
+ int tell;
+ uint16_t *skip_cdf;
+ od_rollback_buffer buf;
+ int dc_quant;
+ int flip;
+ int cfl_encoded;
+ int skip_rest;
+ int skip_dir;
+ int skip_theta_value;
+ const unsigned char *pvq_qm;
+ double dc_rate;
+ int use_masking;
+ PVQ_SKIP_TYPE ac_dc_coded;
+
+ aom_clear_system_state();
+
+ use_masking = enc->use_activity_masking;
+
+ if (use_masking)
+ pvq_qm = &enc->state.pvq_qm_q4[pli][0];
+ else
+ pvq_qm = 0;
+
+ exg = &enc->state.adapt->pvq.pvq_exg[pli][bs][0];
+ ext = enc->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS;
+ skip_cdf = enc->state.adapt->skip_cdf[2*bs + (pli != 0)];
+ model = enc->state.adapt->pvq.pvq_param_model;
+ nb_bands = OD_BAND_OFFSETS[bs][0];
+ off = &OD_BAND_OFFSETS[bs][1];
+
+ if (use_masking)
+ dc_quant = OD_MAXI(1, q_dc * pvq_qm[od_qm_get_index(bs, 0)] >> 4);
+ else
+ dc_quant = OD_MAXI(1, q_dc);
+
+ tell = 0;
+ for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i];
+ skip_diff = 0;
+ flip = 0;
+ /*If we are coding a chroma block of a keyframe, we are doing CfL.*/
+ if (pli != 0 && is_keyframe) {
+ od_val32 xy;
+ xy = 0;
+ /*Compute the dot-product of the first band of chroma with the luma ref.*/
+ for (i = off[0]; i < off[1]; i++) {
+#if defined(OD_FLOAT_PVQ)
+ xy += ref[i]*(double)qm[i]*OD_QM_SCALE_1*
+ (double)in[i]*(double)qm[i]*OD_QM_SCALE_1;
+#else
+ od_val32 rq;
+ od_val32 inq;
+ rq = ref[i]*qm[i];
+ inq = in[i]*qm[i];
+ xy += OD_SHR(rq*(int64_t)inq, OD_SHL(OD_QM_SHIFT + OD_CFL_FLIP_SHIFT,
+ 1));
+#endif
+ }
+ /*If cos(theta) < 0, then |theta| > pi/2 and we should negate the ref.*/
+ if (xy < 0) {
+ flip = 1;
+      for (i = off[0]; i < off[nb_bands]; i++) ref[i] = -ref[i];
+ }
+ }
+ for (i = 0; i < nb_bands; i++) {
+ int q;
+
+ if (use_masking)
+ q = OD_MAXI(1, q_ac * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4);
+ else
+ q = OD_MAXI(1, q_ac);
+
+ qg[i] = pvq_theta(out + off[i], in + off[i], ref + off[i], size[i],
+ q, y + off[i], &theta[i], &k[i], beta[i], &skip_diff, is_keyframe,
+ pli, enc->state.adapt, qm + off[i], qm_inv + off[i],
+ enc->pvq_norm_lambda, speed);
+ }
+ od_encode_checkpoint(enc, &buf);
+ if (is_keyframe) out[0] = 0;
+ else {
+ int n;
+ n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
+ if (n == 0) {
+ out[0] = 0;
+ } else {
+ int tell2;
+ od_rollback_buffer dc_buf;
+
+ dc_rate = -OD_LOG2((double)(skip_cdf[3] - skip_cdf[2])/
+ (double)(skip_cdf[2] - skip_cdf[1]));
+ dc_rate += 1;
+
+#if CONFIG_DAALA_EC
+ tell2 = od_ec_enc_tell_frac(&enc->w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ od_encode_checkpoint(enc, &dc_buf);
+ generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
+ n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
+#if CONFIG_DAALA_EC
+ tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ dc_rate += tell2/8.0;
+ od_encode_rollback(enc, &dc_buf);
+
+ out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
+ enc->pvq_norm_lambda);
+ }
+ }
+#if CONFIG_DAALA_EC
+ tell = od_ec_enc_tell_frac(&enc->w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ /* Code as if we're not skipping. */
+ aom_write_symbol(&enc->w, 2 + (out[0] != 0), skip_cdf, 4);
+ ac_dc_coded = AC_CODED + (out[0] != 0);
+ cfl_encoded = 0;
+ skip_rest = 1;
+ skip_theta_value = is_keyframe ? -1 : 0;
+ for (i = 1; i < nb_bands; i++) {
+ if (theta[i] != skip_theta_value || qg[i]) skip_rest = 0;
+ }
+ skip_dir = 0;
+ if (nb_bands > 1) {
+ for (i = 0; i < 3; i++) {
+ int j;
+ int tmp;
+ tmp = 1;
+      // TODO(yaowu): figure out a better stop condition without a gcc warning.
+ for (j = i + 1; j < nb_bands && j < PVQ_MAX_PARTITIONS; j += 3) {
+ if (theta[j] != skip_theta_value || qg[j]) tmp = 0;
+ }
+ skip_dir |= tmp << i;
+ }
+ }
+ if (theta[0] == skip_theta_value && qg[0] == 0 && skip_rest) nb_bands = 0;
+
+  /* NOTE: There was no better place to put this function. */
+ if (pvq_info)
+ av1_store_pvq_enc_info(pvq_info, qg, theta, k, y, nb_bands, off, size,
+ skip_rest, skip_dir, bs);
+
+ for (i = 0; i < nb_bands; i++) {
+ int encode_flip;
+ /* Encode CFL flip bit just after the first time it's used. */
+ encode_flip = pli != 0 && is_keyframe && theta[i] != -1 && !cfl_encoded;
+ if (i == 0 || (!skip_rest && !(skip_dir & (1 << ((i - 1)%3))))) {
+ pvq_encode_partition(&enc->w, qg[i], theta[i], y + off[i],
+ size[i], k[i], model, enc->state.adapt, exg + i, ext + i,
+ (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i,
+ is_keyframe, i == 0 && (i < nb_bands - 1), skip_rest, encode_flip, flip);
+ }
+ if (i == 0 && !skip_rest && bs > 0) {
+ aom_write_symbol(&enc->w, skip_dir,
+ &enc->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7);
+ }
+ if (encode_flip) cfl_encoded = 1;
+ }
+#if CONFIG_DAALA_EC
+ tell = od_ec_enc_tell_frac(&enc->w.ec) - tell;
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ /* Account for the rate of skipping the AC, based on the same DC decision
+ we made when trying to not skip AC. */
+ {
+ double skip_rate;
+ if (out[0] != 0) {
+ skip_rate = -OD_LOG2((skip_cdf[1] - skip_cdf[0])/
+ (double)skip_cdf[3]);
+ }
+ else {
+ skip_rate = -OD_LOG2(skip_cdf[0]/
+ (double)skip_cdf[3]);
+ }
+ tell -= (int)floor(.5+8*skip_rate);
+ }
+ if (nb_bands == 0 || skip_diff <= enc->pvq_norm_lambda/8*tell) {
+ if (is_keyframe) out[0] = 0;
+ else {
+ int n;
+ n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
+ if (n == 0) {
+ out[0] = 0;
+ } else {
+ int tell2;
+ od_rollback_buffer dc_buf;
+
+ dc_rate = -OD_LOG2((double)(skip_cdf[1] - skip_cdf[0])/
+ (double)skip_cdf[0]);
+ dc_rate += 1;
+
+#if CONFIG_DAALA_EC
+ tell2 = od_ec_enc_tell_frac(&enc->w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ od_encode_checkpoint(enc, &dc_buf);
+ generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
+ n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
+#if CONFIG_DAALA_EC
+ tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ dc_rate += tell2/8.0;
+ od_encode_rollback(enc, &dc_buf);
+
+ out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
+ enc->pvq_norm_lambda);
+ }
+ }
+    /* We decided to skip; roll back everything to the way it was before. */
+ od_encode_rollback(enc, &buf);
+ aom_write_symbol(&enc->w, out[0] != 0, skip_cdf, 4);
+ ac_dc_coded = (out[0] != 0);
+ if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;
+ else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i];
+ }
+ if (pvq_info)
+ pvq_info->ac_dc_coded = ac_dc_coded;
+ return ac_dc_coded;
+}
diff --git a/third_party/aom/av1/encoder/pvq_encoder.h b/third_party/aom/av1/encoder/pvq_encoder.h
new file mode 100644
index 000000000..b84c8961b
--- /dev/null
+++ b/third_party/aom/av1/encoder/pvq_encoder.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#if !defined(_pvq_encoder_H)
+# define _pvq_encoder_H (1)
+# include "aom_dsp/bitwriter.h"
+# include "aom_dsp/entenc.h"
+# include "av1/common/blockd.h"
+# include "av1/common/pvq.h"
+# include "av1/encoder/encint.h"
+
+void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf,
+ int nsymbs);
+
+void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt,
+ const int *y, int n, int k, int level);
+
+void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay);
+
+void pvq_encode_partition(aom_writer *w,
+ int qg,
+ int theta,
+ const od_coeff *in,
+ int n,
+ int k,
+ generic_encoder model[3],
+ od_adapt_ctx *adapt,
+ int *exg,
+ int *ext,
+ int cdf_ctx,
+ int is_keyframe,
+ int code_skip,
+ int skip_rest,
+ int encode_flip,
+ int flip);
+
+PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, od_coeff *ref,
+ const od_coeff *in, od_coeff *out, int q_dc, int q_ac, int pli, int bs,
+ const od_val16 *beta, int is_keyframe,
+ const int16_t *qm, const int16_t *qm_inv, int speed,
+ PVQ_INFO *pvq_info);
+
+#endif
diff --git a/third_party/aom/av1/encoder/ransac.c b/third_party/aom/av1/encoder/ransac.c
new file mode 100644
index 000000000..5d5dd7572
--- /dev/null
+++ b/third_party/aom/av1/encoder/ransac.c
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#define _POSIX_C_SOURCE 200112L // rand_r()
+#include <memory.h>
+#include <math.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "av1/encoder/ransac.h"
+
+#define MAX_MINPTS 4
+#define MAX_DEGENERATE_ITER 10
+#define MINPTS_MULTIPLIER 5
+
+#define INLIER_THRESHOLD 1.0
+#define MIN_TRIALS 20
+
+////////////////////////////////////////////////////////////////////////////////
+// ransac
+typedef int (*IsDegenerateFunc)(double *p);
+typedef void (*NormalizeFunc)(double *p, int np, double *T);
+typedef void (*DenormalizeFunc)(double *params, double *T1, double *T2);
+typedef int (*FindTransformationFunc)(int points, double *points1,
+ double *points2, double *params);
+typedef void (*ProjectPointsDoubleFunc)(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj);
+
+static void project_points_double_translation(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const double x = *(points++), y = *(points++);
+ *(proj++) = x + mat[0];
+ *(proj++) = y + mat[1];
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+static void project_points_double_rotzoom(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const double x = *(points++), y = *(points++);
+ *(proj++) = mat[2] * x + mat[3] * y + mat[0];
+ *(proj++) = -mat[3] * x + mat[2] * y + mat[1];
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
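+
+/* The rotzoom model above is a 4-parameter similarity transform: mat[2]
+   and mat[3] hold scale*cos(angle) and scale*sin(angle), mat[0] and mat[1]
+   the translation, so the 2x2 linear part has the form [a b; -b a]. */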
+
+static void project_points_double_affine(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const double x = *(points++), y = *(points++);
+ *(proj++) = mat[2] * x + mat[3] * y + mat[0];
+ *(proj++) = mat[4] * x + mat[5] * y + mat[1];
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+static void project_points_double_hortrapezoid(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ double x, y, Z, Z_inv;
+ for (i = 0; i < n; ++i) {
+ x = *(points++), y = *(points++);
+ Z_inv = mat[7] * y + 1;
+ assert(fabs(Z_inv) > 0.000001);
+ Z = 1. / Z_inv;
+ *(proj++) = (mat[2] * x + mat[3] * y + mat[0]) * Z;
+ *(proj++) = (mat[5] * y + mat[1]) * Z;
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+static void project_points_double_vertrapezoid(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ double x, y, Z, Z_inv;
+ for (i = 0; i < n; ++i) {
+ x = *(points++), y = *(points++);
+ Z_inv = mat[6] * x + 1;
+ assert(fabs(Z_inv) > 0.000001);
+ Z = 1. / Z_inv;
+ *(proj++) = (mat[2] * x + mat[0]) * Z;
+ *(proj++) = (mat[4] * x + mat[5] * y + mat[1]) * Z;
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+static void project_points_double_homography(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ double x, y, Z, Z_inv;
+ for (i = 0; i < n; ++i) {
+ x = *(points++), y = *(points++);
+ Z_inv = mat[6] * x + mat[7] * y + 1;
+ assert(fabs(Z_inv) > 0.000001);
+ Z = 1. / Z_inv;
+ *(proj++) = (mat[2] * x + mat[3] * y + mat[0]) * Z;
+ *(proj++) = (mat[4] * x + mat[5] * y + mat[1]) * Z;
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// svdcmp
+// Adapted from Numerical Recipes in C
+
+static const double TINY_NEAR_ZERO = 1.0E-12;
+
+static INLINE double sign(double a, double b) {
+ return ((b) >= 0 ? fabs(a) : -fabs(a));
+}
+
+static INLINE double pythag(double a, double b) {
+ double ct;
+ const double absa = fabs(a);
+ const double absb = fabs(b);
+
+ if (absa > absb) {
+ ct = absb / absa;
+ return absa * sqrt(1.0 + ct * ct);
+ } else {
+ ct = absa / absb;
+ return (absb == 0) ? 0 : absb * sqrt(1.0 + ct * ct);
+ }
+}
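+
+/* pythag() computes sqrt(a*a + b*b) as |a|*sqrt(1 + (b/a)^2) (or the
+   symmetric form), avoiding overflow/underflow when a*a or b*b falls
+   outside the double range even though the result is representable. */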
+
+static void multiply_mat(const double *m1, const double *m2, double *res,
+ const int m1_rows, const int inner_dim,
+ const int m2_cols) {
+ double sum;
+
+ int row, col, inner;
+ for (row = 0; row < m1_rows; ++row) {
+ for (col = 0; col < m2_cols; ++col) {
+ sum = 0;
+ for (inner = 0; inner < inner_dim; ++inner)
+ sum += m1[row * inner_dim + inner] * m2[inner * m2_cols + col];
+ *(res++) = sum;
+ }
+ }
+}
+
+static int svdcmp(double **u, int m, int n, double w[], double **v) {
+ const int max_its = 30;
+ int flag, i, its, j, jj, k, l, nm;
+ double anorm, c, f, g, h, s, scale, x, y, z;
+  double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1));
+  if (!rv1) return 1;
+  g = scale = anorm = 0.0;
+ for (i = 0; i < n; i++) {
+ l = i + 1;
+ rv1[i] = scale * g;
+ g = s = scale = 0.0;
+ if (i < m) {
+ for (k = i; k < m; k++) scale += fabs(u[k][i]);
+ if (scale != 0.) {
+ for (k = i; k < m; k++) {
+ u[k][i] /= scale;
+ s += u[k][i] * u[k][i];
+ }
+ f = u[i][i];
+ g = -sign(sqrt(s), f);
+ h = f * g - s;
+ u[i][i] = f - g;
+ for (j = l; j < n; j++) {
+ for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j];
+ f = s / h;
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+ }
+ for (k = i; k < m; k++) u[k][i] *= scale;
+ }
+ }
+ w[i] = scale * g;
+ g = s = scale = 0.0;
+ if (i < m && i != n - 1) {
+ for (k = l; k < n; k++) scale += fabs(u[i][k]);
+ if (scale != 0.) {
+ for (k = l; k < n; k++) {
+ u[i][k] /= scale;
+ s += u[i][k] * u[i][k];
+ }
+ f = u[i][l];
+ g = -sign(sqrt(s), f);
+ h = f * g - s;
+ u[i][l] = f - g;
+ for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
+ for (j = l; j < m; j++) {
+ for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k];
+ for (k = l; k < n; k++) u[j][k] += s * rv1[k];
+ }
+ for (k = l; k < n; k++) u[i][k] *= scale;
+ }
+ }
+ anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i])));
+ }
+
+ for (i = n - 1; i >= 0; i--) {
+ if (i < n - 1) {
+ if (g != 0.) {
+ for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
+ for (j = l; j < n; j++) {
+ for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
+ for (k = l; k < n; k++) v[k][j] += s * v[k][i];
+ }
+ }
+ for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0;
+ }
+ v[i][i] = 1.0;
+ g = rv1[i];
+ l = i;
+ }
+ for (i = AOMMIN(m, n) - 1; i >= 0; i--) {
+ l = i + 1;
+ g = w[i];
+ for (j = l; j < n; j++) u[i][j] = 0.0;
+ if (g != 0.) {
+ g = 1.0 / g;
+ for (j = l; j < n; j++) {
+ for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j];
+ f = (s / u[i][i]) * g;
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+ }
+ for (j = i; j < m; j++) u[j][i] *= g;
+ } else {
+ for (j = i; j < m; j++) u[j][i] = 0.0;
+ }
+ ++u[i][i];
+ }
+ for (k = n - 1; k >= 0; k--) {
+ for (its = 0; its < max_its; its++) {
+ flag = 1;
+ for (l = k; l >= 0; l--) {
+ nm = l - 1;
+ if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) {
+ flag = 0;
+ break;
+ }
+ if ((double)(fabs(w[nm]) + anorm) == anorm) break;
+ }
+ if (flag) {
+ c = 0.0;
+ s = 1.0;
+ for (i = l; i <= k; i++) {
+ f = s * rv1[i];
+ rv1[i] = c * rv1[i];
+ if ((double)(fabs(f) + anorm) == anorm) break;
+ g = w[i];
+ h = pythag(f, g);
+ w[i] = h;
+ h = 1.0 / h;
+ c = g * h;
+ s = -f * h;
+ for (j = 0; j < m; j++) {
+ y = u[j][nm];
+ z = u[j][i];
+ u[j][nm] = y * c + z * s;
+ u[j][i] = z * c - y * s;
+ }
+ }
+ }
+ z = w[k];
+ if (l == k) {
+ if (z < 0.0) {
+ w[k] = -z;
+ for (j = 0; j < n; j++) v[j][k] = -v[j][k];
+ }
+ break;
+ }
+ if (its == max_its - 1) {
+ aom_free(rv1);
+ return 1;
+ }
+ assert(k > 0);
+ x = w[l];
+ nm = k - 1;
+ y = w[nm];
+ g = rv1[nm];
+ h = rv1[k];
+ f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
+ g = pythag(f, 1.0);
+ f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x;
+ c = s = 1.0;
+ for (j = l; j <= nm; j++) {
+ i = j + 1;
+ g = rv1[i];
+ y = w[i];
+ h = s * g;
+ g = c * g;
+ z = pythag(f, h);
+ rv1[j] = z;
+ c = f / z;
+ s = h / z;
+ f = x * c + g * s;
+ g = g * c - x * s;
+ h = y * s;
+ y *= c;
+ for (jj = 0; jj < n; jj++) {
+ x = v[jj][j];
+ z = v[jj][i];
+ v[jj][j] = x * c + z * s;
+ v[jj][i] = z * c - x * s;
+ }
+ z = pythag(f, h);
+ w[j] = z;
+ if (z != 0.) {
+ z = 1.0 / z;
+ c = f * z;
+ s = h * z;
+ }
+ f = c * g + s * y;
+ x = c * y - s * g;
+ for (jj = 0; jj < m; jj++) {
+ y = u[jj][j];
+ z = u[jj][i];
+ u[jj][j] = y * c + z * s;
+ u[jj][i] = z * c - y * s;
+ }
+ }
+ rv1[l] = 0.0;
+ rv1[k] = f;
+ w[k] = x;
+ }
+ }
+ aom_free(rv1);
+ return 0;
+}
+
+static int SVD(double *U, double *W, double *V, double *matx, int M, int N) {
+ // Assumes allocation for U is MxN
+ double **nrU = (double **)aom_malloc((M) * sizeof(*nrU));
+ double **nrV = (double **)aom_malloc((N) * sizeof(*nrV));
+ int problem, i;
+
+ problem = !(nrU && nrV);
+ if (!problem) {
+ for (i = 0; i < M; i++) {
+ nrU[i] = &U[i * N];
+ }
+ for (i = 0; i < N; i++) {
+ nrV[i] = &V[i * N];
+ }
+ } else {
+ if (nrU) aom_free(nrU);
+ if (nrV) aom_free(nrV);
+ return 1;
+ }
+
+ /* copy from given matx into nrU */
+ for (i = 0; i < M; i++) {
+ memcpy(&(nrU[i][0]), matx + N * i, N * sizeof(*matx));
+ }
+
+  /* Perform the SVD. */
+ if (svdcmp(nrU, M, N, W, nrV)) {
+ aom_free(nrU);
+ aom_free(nrV);
+ return 1;
+ }
+
+  /* Free the Numerical Recipes-style arrays. */
+ aom_free(nrU);
+ aom_free(nrV);
+
+ return 0;
+}
+
+int pseudo_inverse(double *inv, double *matx, const int M, const int N) {
+ double ans;
+ int i, j, k;
+ double *const U = (double *)aom_malloc(M * N * sizeof(*matx));
+ double *const W = (double *)aom_malloc(N * sizeof(*matx));
+ double *const V = (double *)aom_malloc(N * N * sizeof(*matx));
+
+  if (!(U && W && V)) {
+    aom_free(U);
+    aom_free(W);
+    aom_free(V);
+    return 1;
+  }
+ if (SVD(U, W, V, matx, M, N)) {
+ aom_free(U);
+ aom_free(W);
+ aom_free(V);
+ return 1;
+ }
+ for (i = 0; i < N; i++) {
+ if (fabs(W[i]) < TINY_NEAR_ZERO) {
+ aom_free(U);
+ aom_free(W);
+ aom_free(V);
+ return 1;
+ }
+ }
+
+ for (i = 0; i < N; i++) {
+ for (j = 0; j < M; j++) {
+ ans = 0;
+ for (k = 0; k < N; k++) {
+ ans += V[k + N * i] * U[k + N * j] / W[k];
+ }
+ inv[j + M * i] = ans;
+ }
+ }
+ aom_free(U);
+ aom_free(W);
+ aom_free(V);
+ return 0;
+}
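+
+/* pseudo_inverse() is the Moore-Penrose pseudo-inverse computed from the
+   SVD A = U*diag(W)*V^T as A+ = V*diag(1/W)*U^T. It bails out (returns 1)
+   when any singular value is within TINY_NEAR_ZERO of zero, since 1/W[k]
+   would then blow up. */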
+
+static void normalize_homography(double *pts, int n, double *T) {
+ double *p = pts;
+ double mean[2] = { 0, 0 };
+ double msqe = 0;
+ double scale;
+ int i;
+ for (i = 0; i < n; ++i, p += 2) {
+ mean[0] += p[0];
+ mean[1] += p[1];
+ }
+ mean[0] /= n;
+ mean[1] /= n;
+ for (p = pts, i = 0; i < n; ++i, p += 2) {
+ p[0] -= mean[0];
+ p[1] -= mean[1];
+ msqe += sqrt(p[0] * p[0] + p[1] * p[1]);
+ }
+ msqe /= n;
+ scale = (msqe == 0 ? 1.0 : sqrt(2) / msqe);
+ T[0] = scale;
+ T[1] = 0;
+ T[2] = -scale * mean[0];
+ T[3] = 0;
+ T[4] = scale;
+ T[5] = -scale * mean[1];
+ T[6] = 0;
+ T[7] = 0;
+ T[8] = 1;
+ for (p = pts, i = 0; i < n; ++i, p += 2) {
+ p[0] *= scale;
+ p[1] *= scale;
+ }
+}
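+
+/* This is the standard Hartley normalization: translate the points so
+   their centroid is at the origin, then scale so the mean distance from
+   the origin is sqrt(2). T is the 3x3 matrix applying that transform,
+   which the denormalize_* functions below undo after fitting. (Note that
+   msqe above actually accumulates the mean distance, not a squared error,
+   despite its name.) */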
+
+static void invnormalize_mat(double *T, double *iT) {
+ double is = 1.0 / T[0];
+ double m0 = -T[2] * is;
+ double m1 = -T[5] * is;
+ iT[0] = is;
+ iT[1] = 0;
+ iT[2] = m0;
+ iT[3] = 0;
+ iT[4] = is;
+ iT[5] = m1;
+ iT[6] = 0;
+ iT[7] = 0;
+ iT[8] = 1;
+}
+
+static void denormalize_homography(double *params, double *T1, double *T2) {
+ double iT2[9];
+ double params2[9];
+ invnormalize_mat(T2, iT2);
+ multiply_mat(params, T1, params2, 3, 3, 3);
+ multiply_mat(iT2, params2, params, 3, 3, 3);
+}
+
+static void denormalize_homography_reorder(double *params, double *T1,
+ double *T2) {
+ double params_denorm[MAX_PARAMDIM];
+ memcpy(params_denorm, params, sizeof(*params) * 8);
+ params_denorm[8] = 1.0;
+ denormalize_homography(params_denorm, T1, T2);
+ params[0] = params_denorm[2];
+ params[1] = params_denorm[5];
+ params[2] = params_denorm[0];
+ params[3] = params_denorm[1];
+ params[4] = params_denorm[3];
+ params[5] = params_denorm[4];
+ params[6] = params_denorm[6];
+ params[7] = params_denorm[7];
+}
+
+static void denormalize_affine_reorder(double *params, double *T1, double *T2) {
+ double params_denorm[MAX_PARAMDIM];
+ params_denorm[0] = params[0];
+ params_denorm[1] = params[1];
+ params_denorm[2] = params[4];
+ params_denorm[3] = params[2];
+ params_denorm[4] = params[3];
+ params_denorm[5] = params[5];
+ params_denorm[6] = params_denorm[7] = 0;
+ params_denorm[8] = 1;
+ denormalize_homography(params_denorm, T1, T2);
+ params[0] = params_denorm[2];
+ params[1] = params_denorm[5];
+ params[2] = params_denorm[0];
+ params[3] = params_denorm[1];
+ params[4] = params_denorm[3];
+ params[5] = params_denorm[4];
+ params[6] = params[7] = 0;
+}
+
+static void denormalize_rotzoom_reorder(double *params, double *T1,
+ double *T2) {
+ double params_denorm[MAX_PARAMDIM];
+ params_denorm[0] = params[0];
+ params_denorm[1] = params[1];
+ params_denorm[2] = params[2];
+ params_denorm[3] = -params[1];
+ params_denorm[4] = params[0];
+ params_denorm[5] = params[3];
+ params_denorm[6] = params_denorm[7] = 0;
+ params_denorm[8] = 1;
+ denormalize_homography(params_denorm, T1, T2);
+ params[0] = params_denorm[2];
+ params[1] = params_denorm[5];
+ params[2] = params_denorm[0];
+ params[3] = params_denorm[1];
+ params[4] = -params[3];
+ params[5] = params[2];
+ params[6] = params[7] = 0;
+}
+
+static void denormalize_translation_reorder(double *params, double *T1,
+ double *T2) {
+ double params_denorm[MAX_PARAMDIM];
+ params_denorm[0] = 1;
+ params_denorm[1] = 0;
+ params_denorm[2] = params[0];
+ params_denorm[3] = 0;
+ params_denorm[4] = 1;
+ params_denorm[5] = params[1];
+ params_denorm[6] = params_denorm[7] = 0;
+ params_denorm[8] = 1;
+ denormalize_homography(params_denorm, T1, T2);
+ params[0] = params_denorm[2];
+ params[1] = params_denorm[5];
+ params[2] = params[5] = 1;
+ params[3] = params[4] = 0;
+ params[6] = params[7] = 0;
+}
+
+static int find_translation(int np, double *pts1, double *pts2, double *mat) {
+ int i;
+ double sx, sy, dx, dy;
+ double sumx, sumy;
+
+ double T1[9], T2[9];
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ sumx = 0;
+ sumy = 0;
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ sumx += dx - sx;
+ sumy += dy - sy;
+ }
+ mat[0] = sumx / np;
+ mat[1] = sumy / np;
+ denormalize_translation_reorder(mat, T1, T2);
+ return 0;
+}
+
+static int find_rotzoom(int np, double *pts1, double *pts2, double *mat) {
+ const int np2 = np * 2;
+ double *a = (double *)aom_malloc(sizeof(*a) * np2 * 9);
+ double *b = a + np2 * 4;
+ double *temp = b + np2;
+ int i;
+ double sx, sy, dx, dy;
+
+ double T1[9], T2[9];
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ a[i * 2 * 4 + 0] = sx;
+ a[i * 2 * 4 + 1] = sy;
+ a[i * 2 * 4 + 2] = 1;
+ a[i * 2 * 4 + 3] = 0;
+ a[(i * 2 + 1) * 4 + 0] = sy;
+ a[(i * 2 + 1) * 4 + 1] = -sx;
+ a[(i * 2 + 1) * 4 + 2] = 0;
+ a[(i * 2 + 1) * 4 + 3] = 1;
+
+ b[2 * i] = dx;
+ b[2 * i + 1] = dy;
+ }
+ if (pseudo_inverse(temp, a, np2, 4)) {
+ aom_free(a);
+ return 1;
+ }
+ multiply_mat(temp, b, mat, 4, np2, 1);
+ denormalize_rotzoom_reorder(mat, T1, T2);
+ aom_free(a);
+ return 0;
+}
+
+static int find_affine(int np, double *pts1, double *pts2, double *mat) {
+ const int np2 = np * 2;
+ double *a = (double *)aom_malloc(sizeof(*a) * np2 * 13);
+ double *b = a + np2 * 6;
+ double *temp = b + np2;
+ int i;
+ double sx, sy, dx, dy;
+
+ double T1[9], T2[9];
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ a[i * 2 * 6 + 0] = sx;
+ a[i * 2 * 6 + 1] = sy;
+ a[i * 2 * 6 + 2] = 0;
+ a[i * 2 * 6 + 3] = 0;
+ a[i * 2 * 6 + 4] = 1;
+ a[i * 2 * 6 + 5] = 0;
+ a[(i * 2 + 1) * 6 + 0] = 0;
+ a[(i * 2 + 1) * 6 + 1] = 0;
+ a[(i * 2 + 1) * 6 + 2] = sx;
+ a[(i * 2 + 1) * 6 + 3] = sy;
+ a[(i * 2 + 1) * 6 + 4] = 0;
+ a[(i * 2 + 1) * 6 + 5] = 1;
+
+ b[2 * i] = dx;
+ b[2 * i + 1] = dy;
+ }
+ if (pseudo_inverse(temp, a, np2, 6)) {
+ aom_free(a);
+ return 1;
+ }
+ multiply_mat(temp, b, mat, 6, np2, 1);
+ denormalize_affine_reorder(mat, T1, T2);
+ aom_free(a);
+ return 0;
+}
+
+static int find_vertrapezoid(int np, double *pts1, double *pts2, double *mat) {
+ const int np3 = np * 3;
+ double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14);
+ double *U = a + np3 * 7;
+ double S[7], V[7 * 7], H[9];
+ int i, mini;
+ double sx, sy, dx, dy;
+ double T1[9], T2[9];
+
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ a[i * 3 * 7 + 0] = a[i * 3 * 7 + 1] = 0;
+ a[i * 3 * 7 + 2] = -sx;
+ a[i * 3 * 7 + 3] = -sy;
+ a[i * 3 * 7 + 4] = -1;
+ a[i * 3 * 7 + 5] = dy * sx;
+ a[i * 3 * 7 + 6] = dy;
+
+ a[(i * 3 + 1) * 7 + 0] = sx;
+ a[(i * 3 + 1) * 7 + 1] = 1;
+ a[(i * 3 + 1) * 7 + 2] = a[(i * 3 + 1) * 7 + 3] = a[(i * 3 + 1) * 7 + 4] =
+ 0;
+ a[(i * 3 + 1) * 7 + 5] = -dx * sx;
+ a[(i * 3 + 1) * 7 + 6] = -dx;
+
+ a[(i * 3 + 2) * 7 + 0] = -dy * sx;
+ a[(i * 3 + 2) * 7 + 1] = -dy;
+ a[(i * 3 + 2) * 7 + 2] = dx * sx;
+ a[(i * 3 + 2) * 7 + 3] = dx * sy;
+ a[(i * 3 + 2) * 7 + 4] = dx;
+ a[(i * 3 + 2) * 7 + 5] = a[(i * 3 + 2) * 7 + 6] = 0;
+ }
+ if (SVD(U, S, V, a, np3, 7)) {
+ aom_free(a);
+ return 1;
+ } else {
+ double minS = 1e12;
+ mini = -1;
+ for (i = 0; i < 7; ++i) {
+ if (S[i] < minS) {
+ minS = S[i];
+ mini = i;
+ }
+ }
+ }
+ H[1] = H[7] = 0;
+ for (i = 0; i < 1; i++) H[i] = V[i * 7 + mini];
+ for (; i < 6; i++) H[i + 1] = V[i * 7 + mini];
+ for (; i < 7; i++) H[i + 2] = V[i * 7 + mini];
+
+ denormalize_homography_reorder(H, T1, T2);
+ aom_free(a);
+ if (H[8] == 0.0) {
+ return 1;
+ } else {
+ // normalize
+ double f = 1.0 / H[8];
+ for (i = 0; i < 8; i++) mat[i] = f * H[i];
+ }
+ return 0;
+}
+
+static int find_hortrapezoid(int np, double *pts1, double *pts2, double *mat) {
+ const int np3 = np * 3;
+ double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14);
+ double *U = a + np3 * 7;
+ double S[7], V[7 * 7], H[9];
+ int i, mini;
+ double sx, sy, dx, dy;
+ double T1[9], T2[9];
+
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ a[i * 3 * 7 + 0] = a[i * 3 * 7 + 1] = a[i * 3 * 7 + 2] = 0;
+ a[i * 3 * 7 + 3] = -sy;
+ a[i * 3 * 7 + 4] = -1;
+ a[i * 3 * 7 + 5] = dy * sy;
+ a[i * 3 * 7 + 6] = dy;
+
+ a[(i * 3 + 1) * 7 + 0] = sx;
+ a[(i * 3 + 1) * 7 + 1] = sy;
+ a[(i * 3 + 1) * 7 + 2] = 1;
+ a[(i * 3 + 1) * 7 + 3] = a[(i * 3 + 1) * 7 + 4] = 0;
+ a[(i * 3 + 1) * 7 + 5] = -dx * sy;
+ a[(i * 3 + 1) * 7 + 6] = -dx;
+
+ a[(i * 3 + 2) * 7 + 0] = -dy * sx;
+ a[(i * 3 + 2) * 7 + 1] = -dy * sy;
+ a[(i * 3 + 2) * 7 + 2] = -dy;
+ a[(i * 3 + 2) * 7 + 3] = dx * sy;
+ a[(i * 3 + 2) * 7 + 4] = dx;
+ a[(i * 3 + 2) * 7 + 5] = a[(i * 3 + 2) * 7 + 6] = 0;
+ }
+
+ if (SVD(U, S, V, a, np3, 7)) {
+ aom_free(a);
+ return 1;
+ } else {
+ double minS = 1e12;
+ mini = -1;
+ for (i = 0; i < 7; ++i) {
+ if (S[i] < minS) {
+ minS = S[i];
+ mini = i;
+ }
+ }
+ }
+ H[3] = H[6] = 0;
+ for (i = 0; i < 3; i++) H[i] = V[i * 7 + mini];
+ for (; i < 5; i++) H[i + 1] = V[i * 7 + mini];
+ for (; i < 7; i++) H[i + 2] = V[i * 7 + mini];
+
+ denormalize_homography_reorder(H, T1, T2);
+ aom_free(a);
+ if (H[8] == 0.0) {
+ return 1;
+ } else {
+ // normalize
+ double f = 1.0 / H[8];
+ for (i = 0; i < 8; i++) mat[i] = f * H[i];
+ }
+ return 0;
+}
+
+static int find_homography(int np, double *pts1, double *pts2, double *mat) {
+  // Based on Peter Kovesi's normalized implementation.
+ const int np3 = np * 3;
+ double *a = (double *)aom_malloc(sizeof(*a) * np3 * 18);
+ double *U = a + np3 * 9;
+ double S[9], V[9 * 9], H[9];
+ int i, mini;
+ double sx, sy, dx, dy;
+ double T1[9], T2[9];
+
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ a[i * 3 * 9 + 0] = a[i * 3 * 9 + 1] = a[i * 3 * 9 + 2] = 0;
+ a[i * 3 * 9 + 3] = -sx;
+ a[i * 3 * 9 + 4] = -sy;
+ a[i * 3 * 9 + 5] = -1;
+ a[i * 3 * 9 + 6] = dy * sx;
+ a[i * 3 * 9 + 7] = dy * sy;
+ a[i * 3 * 9 + 8] = dy;
+
+ a[(i * 3 + 1) * 9 + 0] = sx;
+ a[(i * 3 + 1) * 9 + 1] = sy;
+ a[(i * 3 + 1) * 9 + 2] = 1;
+ a[(i * 3 + 1) * 9 + 3] = a[(i * 3 + 1) * 9 + 4] = a[(i * 3 + 1) * 9 + 5] =
+ 0;
+ a[(i * 3 + 1) * 9 + 6] = -dx * sx;
+ a[(i * 3 + 1) * 9 + 7] = -dx * sy;
+ a[(i * 3 + 1) * 9 + 8] = -dx;
+
+ a[(i * 3 + 2) * 9 + 0] = -dy * sx;
+ a[(i * 3 + 2) * 9 + 1] = -dy * sy;
+ a[(i * 3 + 2) * 9 + 2] = -dy;
+ a[(i * 3 + 2) * 9 + 3] = dx * sx;
+ a[(i * 3 + 2) * 9 + 4] = dx * sy;
+ a[(i * 3 + 2) * 9 + 5] = dx;
+ a[(i * 3 + 2) * 9 + 6] = a[(i * 3 + 2) * 9 + 7] = a[(i * 3 + 2) * 9 + 8] =
+ 0;
+ }
+
+ if (SVD(U, S, V, a, np3, 9)) {
+ aom_free(a);
+ return 1;
+ } else {
+ double minS = 1e12;
+ mini = -1;
+ for (i = 0; i < 9; ++i) {
+ if (S[i] < minS) {
+ minS = S[i];
+ mini = i;
+ }
+ }
+ }
+
+ for (i = 0; i < 9; i++) H[i] = V[i * 9 + mini];
+ denormalize_homography_reorder(H, T1, T2);
+ aom_free(a);
+ if (H[8] == 0.0) {
+ return 1;
+ } else {
+ // normalize
+ double f = 1.0 / H[8];
+ for (i = 0; i < 8; i++) mat[i] = f * H[i];
+ }
+ return 0;
+}
+
+static int get_rand_indices(int npoints, int minpts, int *indices,
+ unsigned int *seed) {
+ int i, j;
+ int ptr = rand_r(seed) % npoints;
+ if (minpts > npoints) return 0;
+ indices[0] = ptr;
+ ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
+ i = 1;
+ while (i < minpts) {
+ int index = rand_r(seed) % npoints;
+ while (index) {
+ ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
+ for (j = 0; j < i; ++j) {
+ if (indices[j] == ptr) break;
+ }
+ if (j == i) index--;
+ }
+ indices[i++] = ptr;
+ }
+ return 1;
+}
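+
+/* get_rand_indices() draws minpts distinct indices in [0, npoints):
+   starting from a random position, each subsequent index is found by
+   stepping a cyclic pointer past a random number of not-yet-chosen
+   positions, which guarantees distinctness. */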
+
+typedef struct {
+ int num_inliers;
+ double variance;
+ int *inlier_indices;
+} RANSAC_MOTION;
+
+// Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise.
+static int compare_motions(const void *arg_a, const void *arg_b) {
+ const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a;
+ const RANSAC_MOTION *motion_b = (RANSAC_MOTION *)arg_b;
+
+ if (motion_a->num_inliers > motion_b->num_inliers) return -1;
+ if (motion_a->num_inliers < motion_b->num_inliers) return 1;
+ if (motion_a->variance < motion_b->variance) return -1;
+ if (motion_a->variance > motion_b->variance) return 1;
+ return 0;
+}
+
+static int is_better_motion(const RANSAC_MOTION *motion_a,
+ const RANSAC_MOTION *motion_b) {
+ return compare_motions(motion_a, motion_b) < 0;
+}
+
+static void copy_points_at_indices(double *dest, const double *src,
+ const int *indices, int num_points) {
+ for (int i = 0; i < num_points; ++i) {
+ const int index = indices[i];
+ dest[i * 2] = src[index * 2];
+ dest[i * 2 + 1] = src[index * 2 + 1];
+ }
+}
+
+static const double kInfiniteVariance = 1e12;
+
+static void clear_motion(RANSAC_MOTION *motion, int num_points) {
+ motion->num_inliers = 0;
+ motion->variance = kInfiniteVariance;
+  memset(motion->inlier_indices, 0,
+         sizeof(*motion->inlier_indices) * num_points);
+}
+
+static int ransac(const int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_desired_motions, const int minpts,
+ IsDegenerateFunc is_degenerate,
+ FindTransformationFunc find_transformation,
+ ProjectPointsDoubleFunc projectpoints) {
+ static const double PROBABILITY_REQUIRED = 0.9;
+ static const double EPS = 1e-12;
+
+ int N = 10000, trial_count = 0;
+ int i = 0;
+ int ret_val = 0;
+
+ unsigned int seed = (unsigned int)npoints;
+
+ int indices[MAX_MINPTS] = { 0 };
+
+ double *points1, *points2;
+ double *corners1, *corners2;
+ double *image1_coord;
+
+ // Store information for the num_desired_motions best transformations found
+ // and the worst motion among them, as well as the motion currently under
+ // consideration.
+ RANSAC_MOTION *motions, *worst_kept_motion = NULL;
+ RANSAC_MOTION current_motion;
+
+ // Store the parameters and the indices of the inlier points for the motion
+ // currently under consideration.
+ double params_this_motion[MAX_PARAMDIM];
+
+ double *cnp1, *cnp2;
+
+ if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) {
+ return 1;
+ }
+
+ points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2);
+ points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2);
+ corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2);
+ corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2);
+ image1_coord = (double *)aom_malloc(sizeof(*image1_coord) * npoints * 2);
+
+ motions =
+ (RANSAC_MOTION *)aom_malloc(sizeof(RANSAC_MOTION) * num_desired_motions);
+ for (i = 0; i < num_desired_motions; ++i) {
+ motions[i].inlier_indices =
+ (int *)aom_malloc(sizeof(*motions->inlier_indices) * npoints);
+ clear_motion(motions + i, npoints);
+ }
+ current_motion.inlier_indices =
+ (int *)aom_malloc(sizeof(*current_motion.inlier_indices) * npoints);
+ clear_motion(&current_motion, npoints);
+
+ worst_kept_motion = motions;
+
+ if (!(points1 && points2 && corners1 && corners2 && image1_coord && motions &&
+ current_motion.inlier_indices)) {
+ ret_val = 1;
+ goto finish_ransac;
+ }
+
+ cnp1 = corners1;
+ cnp2 = corners2;
+ for (i = 0; i < npoints; ++i) {
+ *(cnp1++) = *(matched_points++);
+ *(cnp1++) = *(matched_points++);
+ *(cnp2++) = *(matched_points++);
+ *(cnp2++) = *(matched_points++);
+ }
+
+ while (N > trial_count) {
+ double sum_distance = 0.0;
+ double sum_distance_squared = 0.0;
+
+ clear_motion(&current_motion, npoints);
+
+ int degenerate = 1;
+ int num_degenerate_iter = 0;
+
+ while (degenerate) {
+ num_degenerate_iter++;
+ if (!get_rand_indices(npoints, minpts, indices, &seed)) {
+ ret_val = 1;
+ goto finish_ransac;
+ }
+
+ copy_points_at_indices(points1, corners1, indices, minpts);
+ copy_points_at_indices(points2, corners2, indices, minpts);
+
+ degenerate = is_degenerate(points1);
+ if (num_degenerate_iter > MAX_DEGENERATE_ITER) {
+ ret_val = 1;
+ goto finish_ransac;
+ }
+ }
+
+ if (find_transformation(minpts, points1, points2, params_this_motion)) {
+ trial_count++;
+ continue;
+ }
+
+ projectpoints(params_this_motion, corners1, image1_coord, npoints, 2, 2);
+
+ for (i = 0; i < npoints; ++i) {
+ double dx = image1_coord[i * 2] - corners2[i * 2];
+ double dy = image1_coord[i * 2 + 1] - corners2[i * 2 + 1];
+ double distance = sqrt(dx * dx + dy * dy);
+
+ if (distance < INLIER_THRESHOLD) {
+ current_motion.inlier_indices[current_motion.num_inliers++] = i;
+ sum_distance += distance;
+ sum_distance_squared += distance * distance;
+ }
+ }
+
+ if (current_motion.num_inliers >= worst_kept_motion->num_inliers &&
+ current_motion.num_inliers > 1) {
+ int temp;
+ double fracinliers, pNoOutliers, mean_distance;
+ mean_distance = sum_distance / ((double)current_motion.num_inliers);
+ current_motion.variance =
+ sum_distance_squared / ((double)current_motion.num_inliers - 1.0) -
+ mean_distance * mean_distance * ((double)current_motion.num_inliers) /
+ ((double)current_motion.num_inliers - 1.0);
+ if (is_better_motion(&current_motion, worst_kept_motion)) {
+ // This motion is better than the worst currently kept motion. Remember
+ // the inlier points and variance. The parameters for each kept motion
+ // will be recomputed later using only the inliers.
+ worst_kept_motion->num_inliers = current_motion.num_inliers;
+ worst_kept_motion->variance = current_motion.variance;
+ memcpy(worst_kept_motion->inlier_indices, current_motion.inlier_indices,
+ sizeof(*current_motion.inlier_indices) * npoints);
+
+ assert(npoints > 0);
+ fracinliers = (double)current_motion.num_inliers / (double)npoints;
+ pNoOutliers = 1 - pow(fracinliers, minpts);
+ pNoOutliers = fmax(EPS, pNoOutliers);
+ pNoOutliers = fmin(1 - EPS, pNoOutliers);
+ temp = (int)(log(1.0 - PROBABILITY_REQUIRED) / log(pNoOutliers));
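+        /* Standard adaptive RANSAC trial count: N = log(1-p)/log(1 - w^m),
+           with p = PROBABILITY_REQUIRED, w the inlier fraction and
+           m = minpts. E.g. w = 0.5, m = 3 gives w^m = 0.125 and
+           log(0.1)/log(0.875) ~= 17 trials, clamped below by MIN_TRIALS. */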
+
+ if (temp > 0 && temp < N) {
+ N = AOMMAX(temp, MIN_TRIALS);
+ }
+
+ // Determine the new worst kept motion and its num_inliers and variance.
+ for (i = 0; i < num_desired_motions; ++i) {
+ if (is_better_motion(worst_kept_motion, &motions[i])) {
+ worst_kept_motion = &motions[i];
+ }
+ }
+ }
+ }
+ trial_count++;
+ }
+
+ // Sort the motions, best first.
+ qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions);
+
+ // Recompute the motions using only the inliers.
+ for (i = 0; i < num_desired_motions; ++i) {
+ copy_points_at_indices(points1, corners1, motions[i].inlier_indices,
+ motions[i].num_inliers);
+ copy_points_at_indices(points2, corners2, motions[i].inlier_indices,
+ motions[i].num_inliers);
+
+ find_transformation(motions[i].num_inliers, points1, points2,
+ params_by_motion + (MAX_PARAMDIM - 1) * i);
+ num_inliers_by_motion[i] = motions[i].num_inliers;
+ }
+
+finish_ransac:
+ aom_free(points1);
+ aom_free(points2);
+ aom_free(corners1);
+ aom_free(corners2);
+ aom_free(image1_coord);
+ aom_free(current_motion.inlier_indices);
+ for (i = 0; i < num_desired_motions; ++i) {
+ aom_free(motions[i].inlier_indices);
+ }
+ aom_free(motions);
+
+ return ret_val;
+}
+
+static int is_collinear3(double *p1, double *p2, double *p3) {
+ static const double collinear_eps = 1e-3;
+ const double v =
+ (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]);
+ return fabs(v) < collinear_eps;
+}
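+
+/* The quantity v above is the z component of the cross product
+   (p2 - p1) x (p3 - p1); it is zero exactly when the three points are
+   collinear, e.g. p1=(0,0), p2=(1,1), p3=(2,2) gives v = 1*2 - 1*2 = 0. */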
+
+static int is_degenerate_translation(double *p) {
+ return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2;
+}
+
+static int is_degenerate_affine(double *p) {
+ return is_collinear3(p, p + 2, p + 4);
+}
+
+static int is_degenerate_homography(double *p) {
+ return is_collinear3(p, p + 2, p + 4) || is_collinear3(p, p + 2, p + 6) ||
+ is_collinear3(p, p + 4, p + 6) || is_collinear3(p + 2, p + 4, p + 6);
+}
+
+int ransac_translation(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 3,
+ is_degenerate_translation, find_translation,
+ project_points_double_translation);
+}
+
+int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 3, is_degenerate_affine,
+ find_rotzoom, project_points_double_rotzoom);
+}
+
+int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 3, is_degenerate_affine,
+ find_affine, project_points_double_affine);
+}
+
+int ransac_homography(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 4,
+ is_degenerate_homography, find_homography,
+ project_points_double_homography);
+}
+
+int ransac_hortrapezoid(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 4,
+ is_degenerate_homography, find_hortrapezoid,
+ project_points_double_hortrapezoid);
+}
+
+int ransac_vertrapezoid(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 4,
+ is_degenerate_homography, find_vertrapezoid,
+ project_points_double_vertrapezoid);
+}
diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h
new file mode 100644
index 000000000..f611add36
--- /dev/null
+++ b/third_party/aom/av1/encoder/ransac.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RANSAC_H_
+#define AV1_ENCODER_RANSAC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <memory.h>
+
+#include "av1/common/warped_motion.h"
+
+typedef int (*RansacFunc)(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+
+/* Each of these functions fits a motion model from a set of
+   corresponding points in two frames using RANSAC. */
+int ransac_homography(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_motions);
+int ransac_hortrapezoid(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+int ransac_vertrapezoid(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_motions);
+int ransac_translation(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
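+
+/* Illustrative usage sketch (not part of the library; values are
+   hypothetical). Each candidate motion gets its own slot in the output
+   arrays; MAX_PARAMDIM from warped_motion.h is assumed to bound the
+   per-model parameter count:
+
+     int inliers[2];
+     double params[2 * MAX_PARAMDIM];
+     int ok = ransac_affine(matched_points, npoints, inliers, params, 2);
+
+   On completion the best two affine models are written to params and the
+   inlier count for each is written to inliers. */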
+#endif // AV1_ENCODER_RANSAC_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
new file mode 100644
index 000000000..1f2ea3606
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -0,0 +1,1759 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/ratectrl.h"
+
+// Max rate target for 1080P and below encodes under normal circumstances
+// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
+#define DEFAULT_KF_BOOST 2000
+#define DEFAULT_GF_BOOST 2000
+
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+#define FRAME_OVERHEAD_BITS 200
+#if CONFIG_HIGHBITDEPTH
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ switch (bit_depth) { \
+ case AOM_BITS_8: name = name##_8; break; \
+ case AOM_BITS_10: name = name##_10; break; \
+ case AOM_BITS_12: name = name##_12; break; \
+ default: \
+ assert(0 && \
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10" \
+ " or AOM_BITS_12"); \
+ name = NULL; \
+ } \
+ } while (0)
+#else
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ (void)bit_depth; \
+ name = name##_8; \
+ } while (0)
+#endif
+
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+#if CONFIG_HIGHBITDEPTH
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+#endif
+
+static int gf_high = 2000;
+static int gf_low = 400;
+static int kf_high = 5000;
+static int kf_low = 400;
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+ aom_bit_depth_t bit_depth) {
+ int i;
+ const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
+
+  // Special case handling to deal with the step from q 2.0
+  // down to lossless mode, represented by q 1.0.
+ if (minqtarget <= 2.0) return 0;
+
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ if (minqtarget <= av1_convert_qindex_to_q(i, bit_depth)) return i;
+ }
+
+ return QINDEX_RANGE - 1;
+}
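+
+// Worked example (illustrative only): with the inter coefficients below
+// (x3 = 0.00000271, x2 = -0.00113, x1 = 0.90) and maxq = 56.0,
+//   minqtarget = ((0.00000271 * 56 - 0.00113) * 56 + 0.90) * 56 ~= 47.3,
+// so the returned index is the first qindex whose real Q is >= 47.3.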
+
+static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
+ int *arfgf_high, int *inter, int *rtc,
+ aom_bit_depth_t bit_depth) {
+ int i;
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ const double maxq = av1_convert_qindex_to_q(i, bit_depth);
+ kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
+ kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+ arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+ arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+ inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+ rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+ }
+}
+
+void av1_rc_init_minq_luts(void) {
+ init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+ arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+ inter_minq_8, rtc_minq_8, AOM_BITS_8);
+#if CONFIG_HIGHBITDEPTH
+ init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+ arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+ inter_minq_10, rtc_minq_10, AOM_BITS_10);
+ init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+ arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+ inter_minq_12, rtc_minq_12, AOM_BITS_12);
+#endif
+}
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary, they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream.
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
+// Convert the index to a real Q value (scaled down to match old Q values)
+#if CONFIG_HIGHBITDEPTH
+ switch (bit_depth) {
+ case AOM_BITS_8: return av1_ac_quant(qindex, 0, bit_depth) / 4.0;
+ case AOM_BITS_10: return av1_ac_quant(qindex, 0, bit_depth) / 16.0;
+ case AOM_BITS_12: return av1_ac_quant(qindex, 0, bit_depth) / 64.0;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1.0;
+ }
+#else
+ return av1_ac_quant(qindex, 0, bit_depth) / 4.0;
+#endif
+}
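+
+// Example (assuming the standard AV1 quant tables, where the 8-bit AC
+// quantizer at qindex 0 is 4): av1_convert_qindex_to_q(0, AOM_BITS_8)
+// returns 4 / 4.0 = 1.0, the lossless point referenced in
+// get_minq_index() above.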
+
+int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor, aom_bit_depth_t bit_depth) {
+ const double q = av1_convert_qindex_to_q(qindex, bit_depth);
+ int enumerator = frame_type == KEY_FRAME ? 2700000 : 1800000;
+
+ assert(correction_factor <= MAX_BPB_FACTOR &&
+ correction_factor >= MIN_BPB_FACTOR);
+
+  // Q-based adjustment to the baseline enumerator.
+ enumerator += (int)(enumerator * q) >> 12;
+ return (int)(enumerator * correction_factor / q);
+}
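+
+// Worked example (illustrative only): for an inter frame at q = 40.0 with
+// correction_factor = 1.0, enumerator = 1800000 + ((int)(1800000 * 40.0)
+// >> 12) = 1800000 + 17578 = 1817578, giving 1817578 / 40.0 ~= 45439
+// bits per MB.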
+
+int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+ double correction_factor,
+ aom_bit_depth_t bit_depth) {
+ const int bpm =
+ (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
+ return AOMMAX(FRAME_OVERHEAD_BITS,
+ (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+}
+
+int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const int min_frame_target =
+ AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
+// Clip the frame target to the minimum setup value.
+#if CONFIG_EXT_REFS
+ if (cpi->rc.is_src_frame_alt_ref) {
+#else
+ if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
+#endif // CONFIG_EXT_REFS
+ // If there is an active ARF at this location use the minimum
+ // bits on this frame even if it is a constructed arf.
+    // The active maximum quantizer ensures that an appropriate
+ // number of bits will be spent if needed for constructed ARFs.
+ target = min_frame_target;
+ } else if (target < min_frame_target) {
+ target = min_frame_target;
+ }
+
+ // Clip the frame target to the maximum allowed value.
+ if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+ if (oxcf->rc_max_inter_bitrate_pct) {
+ const int max_rate =
+ rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+
+ return target;
+}
+
+int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int target) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ if (oxcf->rc_max_intra_bitrate_pct) {
+ const int max_rate =
+ rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+ if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+ return target;
+}
+
+// Update the buffer level: leaky bucket model.
+static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+// Non-viewable frames are a special case and are treated as pure overhead.
+#if CONFIG_EXT_REFS
+ // TODO(zoeliu): To further explore whether we should treat BWDREF_FRAME
+ // differently, since it is a no-show frame.
+ if (!cm->show_frame && !rc->is_bwd_ref_frame)
+#else
+ if (!cm->show_frame)
+#endif // CONFIG_EXT_REFS
+ rc->bits_off_target -= encoded_frame_size;
+ else
+ rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
+
+ // Clip the buffer level to the maximum specified buffer size.
+ rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = rc->bits_off_target;
+}
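+
+// Illustrative numbers for the leaky bucket (not from the library): with
+// avg_frame_bandwidth = 100000 bits, a shown frame encoded at 120000 bits
+// drains bits_off_target by 20000, while one encoded at 80000 bits refills
+// it by 20000, capped at maximum_buffer_size.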
+
+int av1_rc_get_default_min_gf_interval(int width, int height,
+ double framerate) {
+ // Assume we do not need any constraint lower than 4K 20 fps
+ static const double factor_safe = 3840 * 2160 * 20.0;
+ const double factor = width * height * framerate;
+ const int default_interval =
+ clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+
+ if (factor <= factor_safe)
+ return default_interval;
+ else
+ return AOMMAX(default_interval,
+ (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
+ // Note this logic makes:
+ // 4K24: 5
+ // 4K30: 6
+ // 4K60: 12
+}
+
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+ int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+ interval += (interval & 0x01); // Round to even value
+ return AOMMAX(interval, min_gf_interval);
+}
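+
+// Example (assuming MAX_GF_INTERVAL is 16): at 30 fps the raw interval is
+// (int)(30 * 0.75) = 22, clamped to 16 and already even, so the default
+// max GF interval is 16 (or min_gf_interval if that is larger).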
+
+void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+ int i;
+
+ if (pass == 0 && oxcf->rc_mode == AOM_CBR) {
+ rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
+ rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+ } else {
+ rc->avg_frame_qindex[KEY_FRAME] =
+ (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+ }
+
+ rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
+ rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
+
+ rc->buffer_level = rc->starting_buffer_level;
+ rc->bits_off_target = rc->starting_buffer_level;
+
+ rc->rolling_target_bits = rc->avg_frame_bandwidth;
+ rc->rolling_actual_bits = rc->avg_frame_bandwidth;
+ rc->long_rolling_target_bits = rc->avg_frame_bandwidth;
+ rc->long_rolling_actual_bits = rc->avg_frame_bandwidth;
+
+ rc->total_actual_bits = 0;
+ rc->total_target_bits = 0;
+ rc->total_target_vs_actual = 0;
+
+ rc->frames_since_key = 8; // Sensible default for first frame.
+ rc->this_key_frame_forced = 0;
+ rc->next_key_frame_forced = 0;
+ rc->source_alt_ref_pending = 0;
+ rc->source_alt_ref_active = 0;
+
+ rc->frames_till_gf_update_due = 0;
+ rc->ni_av_qi = oxcf->worst_allowed_q;
+ rc->ni_tot_qi = 0;
+ rc->ni_frames = 0;
+
+ rc->tot_q = 0.0;
+ rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
+
+ for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+ rc->rate_correction_factors[i] = 1.0;
+ }
+
+ rc->min_gf_interval = oxcf->min_gf_interval;
+ rc->max_gf_interval = oxcf->max_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->width, oxcf->height, oxcf->init_framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ oxcf->init_framerate, rc->min_gf_interval);
+ rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+}
+
+int av1_rc_drop_frame(AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ if (!oxcf->drop_frames_water_mark) {
+ return 0;
+ } else {
+ if (rc->buffer_level < 0) {
+ // Always drop if buffer is below 0.
+ return 1;
+ } else {
+ // If buffer is below drop_mark, for now just drop every other frame
+ // (starting with the next frame) until it increases back over drop_mark.
+ int drop_mark =
+ (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100);
+ if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) {
+ --rc->decimation_factor;
+ } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) {
+ rc->decimation_factor = 1;
+ }
+ if (rc->decimation_factor > 0) {
+ if (rc->decimation_count > 0) {
+ --rc->decimation_count;
+ return 1;
+ } else {
+ rc->decimation_count = rc->decimation_factor;
+ return 0;
+ }
+ } else {
+ rc->decimation_count = 0;
+ return 0;
+ }
+ }
+ }
+}
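+
+// Illustrative trace (not from the library): once the buffer falls to
+// drop_mark, decimation_factor becomes 1 and frames alternate between
+// dropped (decimation_count 1 -> 0) and coded (count reset to 1) until
+// the buffer climbs back above drop_mark, which decrements the factor.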
+
+static double get_rate_correction_factor(const AV1_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ double rcf;
+
+ if (cpi->common.frame_type == KEY_FRAME) {
+ rcf = rc->rate_correction_factors[KF_STD];
+ } else if (cpi->oxcf.pass == 2) {
+ RATE_FACTOR_LEVEL rf_lvl =
+ cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+ rcf = rc->rate_correction_factors[rf_lvl];
+ } else {
+ if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+ !rc->is_src_frame_alt_ref &&
+ (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
+ rcf = rc->rate_correction_factors[GF_ARF_STD];
+ else
+ rcf = rc->rate_correction_factors[INTER_NORMAL];
+ }
+ rcf *= rcf_mult[rc->frame_size_selector];
+ return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+static void set_rate_correction_factor(AV1_COMP *cpi, double factor) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ // Normalize RCF to account for the size-dependent scaling factor.
+ factor /= rcf_mult[cpi->rc.frame_size_selector];
+
+ factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
+ if (cpi->common.frame_type == KEY_FRAME) {
+ rc->rate_correction_factors[KF_STD] = factor;
+ } else if (cpi->oxcf.pass == 2) {
+ RATE_FACTOR_LEVEL rf_lvl =
+ cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+ rc->rate_correction_factors[rf_lvl] = factor;
+ } else {
+ if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+ !rc->is_src_frame_alt_ref &&
+ (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
+ rc->rate_correction_factors[GF_ARF_STD] = factor;
+ else
+ rc->rate_correction_factors[INTER_NORMAL] = factor;
+ }
+}
+
+void av1_rc_update_rate_correction_factors(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int correction_factor = 100;
+ double rate_correction_factor = get_rate_correction_factor(cpi);
+ double adjustment_limit;
+
+ int projected_size_based_on_q = 0;
+
+ // Do not update the rate factors for arf overlay frames.
+ if (cpi->rc.is_src_frame_alt_ref) return;
+
+ // Clear down mmx registers to allow floating point in what follows
+ aom_clear_system_state();
+
+ // Work out how big we would have expected the frame to be at this Q given
+ // the current correction factor.
+ // Stay in double to avoid int overflow when values are large
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+ projected_size_based_on_q =
+ av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+ } else {
+ projected_size_based_on_q =
+ av1_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, cm->MBs,
+ rate_correction_factor, cm->bit_depth);
+ }
+ // Work out a size correction factor.
+ if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+ correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
+ projected_size_based_on_q);
+
+  // A more heavily damped adjustment is used if we have been oscillating
+  // on either side of the target.
+ if (correction_factor > 0) {
+ adjustment_limit =
+ 0.25 + 0.5 * AOMMIN(1, fabs(log10(0.01 * correction_factor)));
+ } else {
+ adjustment_limit = 0.75;
+ }
+
+ cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+ cpi->rc.q_1_frame = cm->base_qindex;
+ cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+ if (correction_factor > 110)
+ cpi->rc.rc_1_frame = -1;
+ else if (correction_factor < 90)
+ cpi->rc.rc_1_frame = 1;
+ else
+ cpi->rc.rc_1_frame = 0;
+
+ if (correction_factor > 102) {
+ // We are not already at the worst allowable quality
+ correction_factor =
+ (int)(100 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ } else if (correction_factor < 99) {
+ // We are not already at the best allowable quality
+ correction_factor =
+ (int)(100 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ set_rate_correction_factor(cpi, rate_correction_factor);
+}
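+
+// Worked example (illustrative only): if the projected size at this Q is
+// 100000 bits but the frame actually used 120000, correction_factor = 120,
+// adjustment_limit = 0.25 + 0.5 * |log10(1.2)| ~= 0.29, rc_1_frame is set
+// to -1 (overshoot), and the damped update scales the rate correction
+// factor by (int)(100 + 20 * 0.29) / 100, i.e. up by about 5%.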
+
+int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int q = active_worst_quality;
+ int last_error = INT_MAX;
+ int i, target_bits_per_mb, bits_per_mb_at_this_q;
+ const double correction_factor = get_rate_correction_factor(cpi);
+
+ // Calculate required scaling factor based on target frame size and size of
+ // frame produced using previous Q.
+ target_bits_per_mb =
+ (int)((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs;
+
+ i = active_best_quality;
+
+ do {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ bits_per_mb_at_this_q =
+ (int)av1_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
+ } else {
+ bits_per_mb_at_this_q = (int)av1_rc_bits_per_mb(
+ cm->frame_type, i, correction_factor, cm->bit_depth);
+ }
+
+ if (bits_per_mb_at_this_q <= target_bits_per_mb) {
+ if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+ q = i;
+ else
+ q = i - 1;
+
+ break;
+ } else {
+ last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+ }
+ } while (++i <= active_worst_quality);
+
+ // In CBR mode, this makes sure q is between oscillating Qs to prevent
+ // resonance.
+ if (cpi->oxcf.rc_mode == AOM_CBR &&
+ (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+ cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+ q = clamp(q, AOMMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+ AOMMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+ }
+ return q;
+}
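+
+// Illustrative trace (hypothetical numbers): with active_best = 30,
+// active_worst = 50 and a target of 45000 scaled bits per MB, the loop
+// walks i upward until bits_per_mb first drops to or below the target,
+// say at i = 38; it then returns 38 unless the overshoot recorded at 37
+// was smaller than the undershoot at 38, in which case it returns 37.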
+
+static int get_active_quality(int q, int gfu_boost, int low, int high,
+ int *low_motion_minq, int *high_motion_minq) {
+ if (gfu_boost > high) {
+ return low_motion_minq[q];
+ } else if (gfu_boost < low) {
+ return high_motion_minq[q];
+ } else {
+ const int gap = high - low;
+ const int offset = high - gfu_boost;
+ const int qdiff = high_motion_minq[q] - low_motion_minq[q];
+ const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+ return low_motion_minq[q] + adjustment;
+ }
+}
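+
+// Worked example (illustrative only): for the GF tables with low = 400,
+// high = 2000, gfu_boost = 1200, low_motion_minq[q] = 30 and
+// high_motion_minq[q] = 40: gap = 1600, offset = 800, adjustment =
+// (800 * 10 + 800) / 1600 = 5, so the active quality is 30 + 5 = 35.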
+
+static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
+ aom_bit_depth_t bit_depth) {
+ int *kf_low_motion_minq;
+ int *kf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
+ return get_active_quality(q, rc->kf_boost, kf_low, kf_high,
+ kf_low_motion_minq, kf_high_motion_minq);
+}
+
+static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
+ aom_bit_depth_t bit_depth) {
+ int *arfgf_low_motion_minq;
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
+ arfgf_low_motion_minq, arfgf_high_motion_minq);
+}
+
+static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const unsigned int curr_frame = cpi->common.current_video_frame;
+ int active_worst_quality;
+
+ if (cpi->common.frame_type == KEY_FRAME) {
+ active_worst_quality =
+ curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] * 2;
+ } else {
+ if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4
+ : rc->last_q[INTER_FRAME];
+ } else {
+ active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 2
+ : rc->last_q[INTER_FRAME] * 2;
+ }
+ }
+ return AOMMIN(active_worst_quality, rc->worst_quality);
+}
+
+// Adjust active_worst_quality level based on buffer level.
+static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) {
+ // Adjust active_worst_quality: If buffer is above the optimal/target level,
+ // bring active_worst_quality down depending on fullness of buffer.
+ // If buffer is below the optimal level, let the active_worst_quality go from
+ // ambient Q (at buffer = optimal level) to worst_quality level
+ // (at buffer = critical level).
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *rc = &cpi->rc;
+ // Buffer level below which we push active_worst to worst_quality.
+ int64_t critical_level = rc->optimal_buffer_level >> 3;
+ int64_t buff_lvl_step = 0;
+ int adjustment = 0;
+ int active_worst_quality;
+ int ambient_qp;
+ if (cm->frame_type == KEY_FRAME) return rc->worst_quality;
+  // For ambient_qp we use the minimum of avg_frame_qindex[KEY_FRAME] and
+  // avg_frame_qindex[INTER_FRAME] for the first few frames following a key
+  // frame. These are both initialized to worst_quality and updated with a
+  // (3/4, 1/4) average in postencode_update, so for the first few frames
+  // following a key frame, the qp of that key frame is weighted into the
+  // active_worst_quality setting.
+ ambient_qp = (cm->current_video_frame < 5)
+ ? AOMMIN(rc->avg_frame_qindex[INTER_FRAME],
+ rc->avg_frame_qindex[KEY_FRAME])
+ : rc->avg_frame_qindex[INTER_FRAME];
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4);
+ if (rc->buffer_level > rc->optimal_buffer_level) {
+ // Adjust down.
+ // Maximum limit for down adjustment, ~30%.
+ int max_adjustment_down = active_worst_quality / 3;
+ if (max_adjustment_down) {
+ buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) /
+ max_adjustment_down);
+ if (buff_lvl_step)
+ adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) /
+ buff_lvl_step);
+ active_worst_quality -= adjustment;
+ }
+ } else if (rc->buffer_level > critical_level) {
+ // Adjust up from ambient Q.
+ if (critical_level) {
+ buff_lvl_step = (rc->optimal_buffer_level - critical_level);
+ if (buff_lvl_step) {
+ adjustment = (int)((rc->worst_quality - ambient_qp) *
+ (rc->optimal_buffer_level - rc->buffer_level) /
+ buff_lvl_step);
+ }
+ active_worst_quality = ambient_qp + adjustment;
+ }
+ } else {
+ // Set to worst_quality if buffer is below critical level.
+ active_worst_quality = rc->worst_quality;
+ }
+ return active_worst_quality;
+}
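+
+// Illustrative numbers (not from the library): with worst_quality = 63 and
+// ambient_qp = 40, active_worst starts at min(63, 50) = 50. If the buffer
+// sits halfway between the optimal and maximum levels, max_adjustment_down
+// = 50 / 3 = 16 and adjustment ~= 8, leaving active_worst around 42.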
+
+static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi,
+ int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+ int q;
+ int *rtc_minq;
+ ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq);
+
+ if (frame_is_intra_only(cm)) {
+ active_best_quality = rc->best_quality;
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ if (rc->this_key_frame_forced) {
+ int qindex = rc->last_boosted_qindex;
+ double last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, (last_boosted_q * 0.75), cm->bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (cm->current_video_frame > 0) {
+      // Not the first frame of one pass, and kf_boost is set.
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ active_best_quality = get_kf_active_quality(
+ rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((cm->width * cm->height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
+ } else {
+ q = active_worst_quality;
+ }
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ if (cm->current_video_frame > 1) {
+ if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ } else {
+ if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality)
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
+ !(cm->current_video_frame == 0)) {
+ int qdelta = 0;
+ aom_clear_system_state();
+ qdelta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);
+ }
+
+ // Special case code to try and match quality with forced key frames
+ if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
+ q = rc->last_boosted_qindex;
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+static int get_active_cq_level(const RATE_CONTROL *rc,
+ const AV1EncoderConfig *const oxcf) {
+ static const double cq_adjust_threshold = 0.1;
+ int active_cq_level = oxcf->cq_level;
+ if (oxcf->rc_mode == AOM_CQ && rc->total_target_bits > 0) {
+ const double x = (double)rc->total_actual_bits / rc->total_target_bits;
+ if (x < cq_adjust_threshold) {
+ active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold);
+ }
+ }
+ return active_cq_level;
+}
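+
+// Example (illustrative only): with cq_level = 32 and actual bits running
+// at 5% of target (x = 0.05 < 0.1), the active level is scaled down to
+// (int)(32 * 0.05 / 0.1) = 16, allowing a lower (higher-quality) Q floor
+// so that more bits get spent.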
+
+static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi,
+ int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level = get_active_cq_level(rc, oxcf);
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+ int q;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+
+ if (frame_is_intra_only(cm)) {
+ if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val, q_val * 0.25, cm->bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (rc->this_key_frame_forced) {
+ const int qindex = rc->last_boosted_qindex;
+ const double last_boosted_q =
+ av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.75, cm->bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+    } else {  // Not the first frame of one pass, and kf_boost is set.
+ double q_adj_factor = 1.0;
+
+ active_best_quality = get_kf_active_quality(
+ rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((cm->width * cm->height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta on active_best_quality.
+ {
+ const double q_val =
+ av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+ }
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ q = (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ ? rc->avg_frame_qindex[INTER_FRAME]
+ : rc->avg_frame_qindex[KEY_FRAME];
+    // For constrained quality, don't allow Q less than the cq level.
+ if (oxcf->rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+      // Constrained quality uses a slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+ } else if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const int delta_qindex =
+ (cpi->refresh_alt_ref_frame)
+ ? av1_compute_qdelta(rc, q_val, q_val * 0.40, cm->bit_depth)
+ : av1_compute_qdelta(rc, q_val, q_val * 0.50, cm->bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ }
+ } else {
+ if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
+ 0.70, 1.0, 0.85, 1.0 };
+ const int delta_qindex = av1_compute_qdelta(
+ rc, q_val,
+ q_val * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
+ cm->bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ active_best_quality = (cm->current_video_frame > 1)
+ ? inter_minq[rc->avg_frame_qindex[INTER_FRAME]]
+ : inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ {
+ int qdelta = 0;
+ aom_clear_system_state();
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
+ !(cm->current_video_frame == 0)) {
+ qdelta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ qdelta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth);
+ }
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);
+ }
+
+ if (oxcf->rc_mode == AOM_Q) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames
+ } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+ q = rc->last_boosted_qindex;
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
+ static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+ 1.00, // INTER_NORMAL
+#if CONFIG_EXT_REFS
+ 0.80, // INTER_LOW
+ 1.50, // INTER_HIGH
+ 1.25, // GF_ARF_LOW
+#else
+ 1.00, // INTER_HIGH
+ 1.50, // GF_ARF_LOW
+#endif // CONFIG_EXT_REFS
+ 2.00, // GF_ARF_STD
+ 2.00, // KF_STD
+ };
+ static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] =
+#if CONFIG_EXT_REFS
+ { INTER_FRAME, INTER_FRAME, INTER_FRAME,
+ INTER_FRAME, INTER_FRAME, KEY_FRAME };
+#else
+ { INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME };
+#endif // CONFIG_EXT_REFS
+ const AV1_COMMON *const cm = &cpi->common;
+ int qdelta =
+ av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
+ rate_factor_deltas[rf_level], cm->bit_depth);
+ return qdelta;
+}
+
+#define STATIC_MOTION_THRESH 95
+static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ const int cq_level = get_active_cq_level(rc, oxcf);
+ int active_best_quality;
+ int active_worst_quality = cpi->twopass.active_worst_quality;
+ int q;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+
+ if (frame_is_intra_only(cm)) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ if (rc->this_key_frame_forced) {
+ double last_boosted_q;
+ int delta_qindex;
+ int qindex;
+
+ if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+ active_best_quality = qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 1.25, cm->bit_depth);
+ active_worst_quality =
+ AOMMIN(qindex + delta_qindex, active_worst_quality);
+ } else {
+ qindex = rc->last_boosted_qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 0.75, cm->bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ }
+ } else {
+ // Not forced keyframe.
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ // Baseline value derived from cpi->active_worst_quality and kf boost.
+ active_best_quality =
+ get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((cm->width * cm->height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Make a further adjustment based on the kf zero motion measure.
+ q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
+ } else {
+ q = active_worst_quality;
+ }
+    // For constrained quality, don't allow Q less than the cq level.
+ if (oxcf->rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+      // Constrained quality uses a slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+
+ } else if (oxcf->rc_mode == AOM_Q) {
+ if (!cpi->refresh_alt_ref_frame) {
+ active_best_quality = cq_level;
+ } else {
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+ // Modify best quality for second level arfs. For mode AOM_Q this
+ // becomes the baseline frame q.
+ if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+ active_best_quality = (active_best_quality + cq_level + 1) / 2;
+ }
+ } else {
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ }
+ } else {
+ if (oxcf->rc_mode == AOM_Q) {
+ active_best_quality = cq_level;
+ } else {
+ active_best_quality = inter_minq[active_worst_quality];
+
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ }
+ }
+
+ // Extension to max or min Q if undershoot or overshoot is outside
+ // the permitted range.
+ if ((cpi->oxcf.rc_mode != AOM_Q) &&
+ (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+ if (frame_is_intra_only(cm) ||
+ (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
+ active_best_quality -=
+ (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+ active_worst_quality += (cpi->twopass.extend_maxq / 2);
+ } else {
+ active_best_quality -=
+ (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+ active_worst_quality += cpi->twopass.extend_maxq;
+ }
+ }
+
+ aom_clear_system_state();
+ // Static forced key frames Q restrictions dealt with elsewhere.
+ if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
+ (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+ int qdelta = av1_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
+ active_worst_quality);
+ active_worst_quality =
+ AOMMAX(active_worst_quality + qdelta, active_best_quality);
+ }
+
+ // Modify active_best_quality for downscaled normal frames.
+ if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) {
+ int qdelta = av1_compute_qdelta_by_rate(
+ rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth);
+ active_best_quality =
+ AOMMAX(active_best_quality + qdelta, rc->best_quality);
+ }
+
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ if (oxcf->rc_mode == AOM_Q) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames.
+ } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
+ // If static since last kf use better of last boosted and last kf q.
+ if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+ } else {
+ q = rc->last_boosted_qindex;
+ }
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality);
+ if (q > active_worst_quality) {
+ // Special case when we are targeting the max allowed rate.
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ active_worst_quality = q;
+ else
+ q = active_worst_quality;
+ }
+ }
+  q = clamp(q, active_best_quality, active_worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int *bottom_index,
+ int *top_index) {
+ int q;
+ if (cpi->oxcf.pass == 0) {
+ if (cpi->oxcf.rc_mode == AOM_CBR)
+ q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index);
+ else
+ q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
+ } else {
+ q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
+ }
+
+ return q;
+}
+
+void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
+ if (cpi->oxcf.rc_mode == AOM_Q) {
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ } else {
+    // For very small rate targets where the fractional adjustment
+    // may be tiny, make sure there is at least a minimum range.
+ const int tolerance = (cpi->sf.recode_tolerance * frame_target) / 100;
+ *frame_under_shoot_limit = AOMMAX(frame_target - tolerance - 200, 0);
+ *frame_over_shoot_limit =
+ AOMMIN(frame_target + tolerance + 200, cpi->rc.max_frame_bandwidth);
+ }
+}
+
+void av1_rc_set_frame_target(AV1_COMP *cpi, int target) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ rc->this_frame_target = target;
+
+ // Modify frame size target when down-scaling.
+ if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+ rc->frame_size_selector != UNSCALED)
+ rc->this_frame_target = (int)(rc->this_frame_target *
+ rate_thresh_mult[rc->frame_size_selector]);
+
+  // Target rate per SB64 (including partial SB64s).
+ rc->sb64_target_rate = (int)((int64_t)rc->this_frame_target * 64 * 64) /
+ (cm->width * cm->height);
+}
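+
+// Worked example (illustrative only): with this_frame_target = 200000 bits
+// at 1920x1080, sb64_target_rate = 200000 * 4096 / 2073600 ~= 395 bits
+// per 64x64 superblock.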
+
+static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
+  // This frame refreshing means the following frames don't, unless the
+  // user specifies otherwise.
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->frames_since_golden = 0;
+
+ // Mark the alt ref as done (setting to 0 means no further alt refs pending).
+ rc->source_alt_ref_pending = 0;
+
+ // Set the alternate reference frame active flag
+ rc->source_alt_ref_active = 1;
+}
+
+static void update_golden_frame_stats(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+#if CONFIG_EXT_REFS
+ // Update the Golden frame usage counts.
+ // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
+ // only the virtual indices for the reference frame will be
+ // updated and cpi->refresh_golden_frame will still be zero.
+ if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
+#else
+ // Update the Golden frame usage counts.
+ if (cpi->refresh_golden_frame) {
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ // We will not use internal overlay frames to replace the golden frame
+ if (!rc->is_src_frame_ext_arf)
+#endif // CONFIG_EXT_REFS
+      // This frame refreshing means the following frames don't, unless the
+      // user specifies otherwise.
+ rc->frames_since_golden = 0;
+
+  // If we are not using an alt ref in the upcoming group, clear the arf
+  // active flag. In the multi-arf group case, if the index is not 0 then
+  // we are overlaying a mid-group arf, so we should not reset the flag.
+ if (cpi->oxcf.pass == 2) {
+ if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
+ rc->source_alt_ref_active = 0;
+ } else if (!rc->source_alt_ref_pending) {
+ rc->source_alt_ref_active = 0;
+ }
+
+ // Decrement count down till next gf
+ if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
+
+ } else if (!cpi->refresh_alt_ref_frame) {
+ // Decrement count down till next gf
+ if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
+
+ rc->frames_since_golden++;
+ }
+}
+
+void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int qindex = cm->base_qindex;
+
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ av1_cyclic_refresh_postencode(cpi);
+ }
+
+ // Update rate control heuristics
+ rc->projected_frame_size = (int)(bytes_used << 3);
+
+ // Post encode loop adjustment of Q prediction.
+ av1_rc_update_rate_correction_factors(cpi);
+
+ // Keep a record of last Q and ambient average Q.
+ if (cm->frame_type == KEY_FRAME) {
+ rc->last_q[KEY_FRAME] = qindex;
+ rc->avg_frame_qindex[KEY_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+ } else {
+ if (!rc->is_src_frame_alt_ref &&
+ !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ rc->last_q[INTER_FRAME] = qindex;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+ rc->ni_frames++;
+ rc->tot_q += av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ rc->avg_q = rc->tot_q / rc->ni_frames;
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ rc->ni_tot_qi += qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+ }
+ }
+
+ // Keep record of last boosted (KF/GF/ARF) Q value.
+  // If the current frame is coded at a lower Q, we also update it.
+  // If all mbs in this group are skipped, only update if the Q value is
+  // better than that already stored.
+  // This is used to help set quality in forced key frames to reduce popping.
+ if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) ||
+ (!rc->constrained_gf_group &&
+ (cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+ rc->last_boosted_qindex = qindex;
+ }
+ if (cm->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
+
+ update_buffer_level(cpi, rc->projected_frame_size);
+
+  // Rolling monitors of whether we are over- or under-spending, used to
+  // help regulate min and max Q in two-pass mode.
+ if (cm->frame_type != KEY_FRAME) {
+ rc->rolling_target_bits = ROUND_POWER_OF_TWO(
+ rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+ rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
+ rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
+ rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+ rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
+ rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
+ }
+
+ // Actual bits spent
+ rc->total_actual_bits += rc->projected_frame_size;
+#if CONFIG_EXT_REFS
+ rc->total_target_bits +=
+ (cm->show_frame || rc->is_bwd_ref_frame) ? rc->avg_frame_bandwidth : 0;
+#else
+ rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+#endif // CONFIG_EXT_REFS
+
+ rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
+
+ if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
+ (cm->frame_type != KEY_FRAME))
+ // Update the alternate reference frame stats as appropriate.
+ update_alt_ref_frame_stats(cpi);
+ else
+ // Update the Golden frame stats as appropriate.
+ update_golden_frame_stats(cpi);
+
+ if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+
+#if CONFIG_EXT_REFS
+ if (cm->show_frame || rc->is_bwd_ref_frame) {
+#else
+ if (cm->show_frame) {
+#endif // CONFIG_EXT_REFS
+ rc->frames_since_key++;
+ rc->frames_to_key--;
+ }
+
+ // Trigger the resizing of the next frame if it is scaled.
+ if (oxcf->pass != 0) {
+ cpi->resize_pending =
+ rc->next_frame_size_selector != rc->frame_size_selector;
+ rc->frame_size_selector = rc->next_frame_size_selector;
+ }
+}
+
+void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
+ // Update buffer level with zero size, update frame counters, and return.
+ update_buffer_level(cpi, 0);
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
+}
+
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS 1
+
+static int calc_pframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+ static const int af_ratio = 10;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int target;
+#if USE_ALTREF_FOR_ONE_PASS
+ target =
+ (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))
+ ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) /
+ (rc->baseline_gf_interval + af_ratio - 1)
+ : (rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
+ (rc->baseline_gf_interval + af_ratio - 1);
+#else
+ target = rc->avg_frame_bandwidth;
+#endif
+ return av1_rc_clamp_pframe_target_size(cpi, target);
+}
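+
+// Worked example (illustrative only): with avg_frame_bandwidth = 100000,
+// baseline_gf_interval = 8 and af_ratio = 10, a GF/ARF frame targets
+// 100000 * 8 * 10 / 17 ~= 470588 bits while the remaining frames target
+// 100000 * 8 / 17 ~= 47059 bits, a roughly 10:1 boost.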
+
+static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+ static const int kf_ratio = 25;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const int target = rc->avg_frame_bandwidth * kf_ratio;
+ return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target;
+ // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+ if (!cpi->refresh_alt_ref_frame &&
+ (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+ rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+ cm->frame_type = KEY_FRAME;
+ rc->this_key_frame_forced =
+ cm->current_video_frame != 0 && rc->frames_to_key == 0;
+ rc->frames_to_key = cpi->oxcf.key_freq;
+ rc->kf_boost = DEFAULT_KF_BOOST;
+ rc->source_alt_ref_active = 0;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ if (rc->frames_till_gf_update_due == 0) {
+ rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+ if (rc->frames_till_gf_update_due > rc->frames_to_key) {
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ rc->constrained_gf_group = 1;
+ } else {
+ rc->constrained_gf_group = 0;
+ }
+ cpi->refresh_golden_frame = 1;
+ rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ }
+ if (cm->frame_type == KEY_FRAME)
+ target = calc_iframe_target_size_one_pass_vbr(cpi);
+ else
+ target = calc_pframe_target_size_one_pass_vbr(cpi);
+ av1_rc_set_frame_target(cpi, target);
+}
+
+static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+ const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+ int min_frame_target =
+ AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+ int target;
+
+ if (oxcf->gf_cbr_boost_pct) {
+ const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+ target = cpi->refresh_golden_frame
+ ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval *
+ af_ratio_pct) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100)
+ : (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ } else {
+ target = rc->avg_frame_bandwidth;
+ }
+
+ if (diff > 0) {
+ // Lower the target bandwidth for this frame.
+ const int pct_low = (int)AOMMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+ target -= (target * pct_low) / 200;
+ } else if (diff < 0) {
+ // Increase the target bandwidth for this frame.
+ const int pct_high =
+ (int)AOMMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+ target += (target * pct_high) / 200;
+ }
+ if (oxcf->rc_max_inter_bitrate_pct) {
+ const int max_rate =
+ rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+ return AOMMAX(min_frame_target, target);
+}
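+
+// Illustrative numbers (not from the library): if the buffer is 20
+// one-percent steps below optimal and under_shoot_pct = 50, pct_low =
+// min(20, 50) = 20 and the target drops by 20/200, i.e. 10%; a buffer the
+// same distance above optimal raises it by the symmetric amount.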
+
+static int calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ int target;
+ if (cpi->common.current_video_frame == 0) {
+ target = ((rc->starting_buffer_level / 2) > INT_MAX)
+ ? INT_MAX
+ : (int)(rc->starting_buffer_level / 2);
+ } else {
+ int kf_boost = 32;
+ double framerate = cpi->framerate;
+
+ kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
+ if (rc->frames_since_key < framerate / 2) {
+ kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+ }
+ target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+ }
+ return av1_rc_clamp_iframe_target_size(cpi, target);
+}
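+
+// Worked example (illustrative only): at 30 fps, kf_boost = max(32,
+// (int)(2 * 30 - 16)) = 44 once at least framerate / 2 = 15 frames have
+// passed since the last key frame, so the key frame targets
+// ((16 + 44) * avg_frame_bandwidth) >> 4, i.e. 3.75x the average frame.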
+
+void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target;
+ // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+ if ((cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+ rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+ cm->frame_type = KEY_FRAME;
+ rc->this_key_frame_forced =
+ cm->current_video_frame != 0 && rc->frames_to_key == 0;
+ rc->frames_to_key = cpi->oxcf.key_freq;
+ rc->kf_boost = DEFAULT_KF_BOOST;
+ rc->source_alt_ref_active = 0;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ if (rc->frames_till_gf_update_due == 0) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_cyclic_refresh_set_golden_update(cpi);
+ else
+ rc->baseline_gf_interval =
+ (rc->min_gf_interval + rc->max_gf_interval) / 2;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ }
+
+ // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+ // should be done here, before the frame qp is selected.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_cyclic_refresh_update_parameters(cpi);
+
+ if (cm->frame_type == KEY_FRAME)
+ target = calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = calc_pframe_target_size_one_pass_cbr(cpi);
+
+ av1_rc_set_frame_target(cpi, target);
+ if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
+ cpi->resize_pending = av1_resize_one_pass_cbr(cpi);
+ else
+ cpi->resize_pending = 0;
+}
+
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ aom_bit_depth_t bit_depth) {
+ int start_index = rc->worst_quality;
+ int target_index = rc->worst_quality;
+ int i;
+
+ // Convert the average q value to an index.
+ for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+ start_index = i;
+ if (av1_convert_qindex_to_q(i, bit_depth) >= qstart) break;
+ }
+
+ // Convert the q target to an index
+ for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+ target_index = i;
+ if (av1_convert_qindex_to_q(i, bit_depth) >= qtarget) break;
+ }
+
+ return target_index - start_index;
+}
+
+int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+ int qindex, double rate_target_ratio,
+ aom_bit_depth_t bit_depth) {
+ int target_index = rc->worst_quality;
+ int i;
+
+ // Look up the current projected bits per block for the base index
+ const int base_bits_per_mb =
+ av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth);
+
+ // Find the target bits per mb based on the base value and given ratio.
+ const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+ // Convert the q target to an index
+ for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+ if (av1_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <=
+ target_bits_per_mb) {
+ target_index = i;
+ break;
+ }
+ }
+ return target_index - qindex;
+}
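+
+// Example (illustrative only): rate_target_ratio = 2.0 doubles the target
+// bits per MB; since bits per MB fall as qindex rises, the first index
+// meeting the target lies below the base qindex, so the returned delta is
+// negative (a better-quality Q), while a ratio of 0.5 yields a positive
+// delta.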
+
+void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
+ RATE_CONTROL *const rc) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Special case code for 1 pass fixed Q mode tests
+ if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
+ rc->max_gf_interval = FIXED_GF_INTERVAL;
+ rc->min_gf_interval = FIXED_GF_INTERVAL;
+ rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ // Set Maximum gf/arf interval
+ rc->max_gf_interval = oxcf->max_gf_interval;
+ rc->min_gf_interval = oxcf->min_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->width, oxcf->height, cpi->framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ cpi->framerate, rc->min_gf_interval);
+
+ // Extended interval for genuinely static scenes
+ rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+
+ if (is_altref_enabled(cpi)) {
+ if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
+ rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+ }
+
+ if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+ rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+ // Clamp min to max
+ rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
+ }
+}
+
+void av1_rc_update_framerate(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int vbr_max_bits;
+
+ rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+ rc->min_frame_bandwidth =
+ (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
+
+ rc->min_frame_bandwidth =
+ AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+ // A maximum bitrate for a frame is defined.
+ // The baseline for this aligns with HW implementations that
+ // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+  // per 16x16 MB (averaged over a frame). However, this limit is extended if
+  // a very high rate is given on the command line or the rate cannot be
+  // achieved because of a user-specified max q (e.g. when the user requests
+  // lossless encoding).
+ vbr_max_bits =
+ (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
+ 100);
+ rc->max_frame_bandwidth =
+ AOMMAX(AOMMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+ av1_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR...adjustment to the frame target based on error from previous frames
+static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
+ int max_delta;
+ double position_factor = 1.0;
+
+  // How far through the clip are we?
+  // This number is used to damp the per-frame rate correction.
+  // Range: 0 - 1.0.
+ if (cpi->twopass.total_stats.count != 0.) {
+ position_factor = sqrt((double)cpi->common.current_video_frame /
+ cpi->twopass.total_stats.count);
+ }
+ max_delta = (int)(position_factor *
+ ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+ // vbr_bits_off_target > 0 means we have extra bits to spend
+ if (vbr_bits_off_target > 0) {
+ *this_frame_target += (vbr_bits_off_target > max_delta)
+ ? max_delta
+ : (int)vbr_bits_off_target;
+ } else {
+ *this_frame_target -= (vbr_bits_off_target < -max_delta)
+ ? max_delta
+ : (int)-vbr_bits_off_target;
+ }
+
+ // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf, or overlay frames.
+ if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
+ rc->vbr_bits_off_target_fast) {
+ int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
+ int fast_extra_bits;
+ fast_extra_bits = (int)AOMMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+ fast_extra_bits = (int)AOMMIN(
+ fast_extra_bits,
+ AOMMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+ *this_frame_target += (int)fast_extra_bits;
+ rc->vbr_bits_off_target_fast -= fast_extra_bits;
+ }
+}
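+
+// Worked example of the cap above (hypothetical values): halfway through a
+// clip position_factor = sqrt(0.5) ~= 0.71, so a 100000-bit frame target
+// with VBR_PCT_ADJUSTMENT_LIMIT == 50 allows a correction of at most
+// ~0.71 * 50000 = ~35000 bits in either direction.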
+
+void av1_set_target_rate(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target_rate = rc->base_frame_target;
+
+ // Correction to rate target based on prior over or under shoot.
+ if (cpi->oxcf.rc_mode == AOM_VBR || cpi->oxcf.rc_mode == AOM_CQ)
+ vbr_rate_correction(cpi, &target_rate);
+ av1_rc_set_frame_target(cpi, target_rate);
+}
+
+// Check if we should resize, based on the average QP over a window of past
+// frames. For now, allow at most one scale-down step; the scaling factor is 2.
+int av1_resize_one_pass_cbr(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int resize_now = 0;
+ cpi->resize_scale_num = 1;
+ cpi->resize_scale_den = 1;
+ // Don't resize on key frame; reset the counters on key frame.
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->resize_avg_qp = 0;
+ cpi->resize_count = 0;
+ return 0;
+ }
+ // Resize based on average buffer underflow and QP over some window.
+ // Ignore samples close to key frame, since QP is usually high after key.
+ if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
+ const int window = (int)(5 * cpi->framerate);
+ cpi->resize_avg_qp += cm->base_qindex;
+ if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
+ ++cpi->resize_buffer_underflow;
+ ++cpi->resize_count;
+ // Check for resize action every "window" frames.
+ if (cpi->resize_count >= window) {
+ int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
+      // Resize down if the buffer level has underflowed by a sufficient
+      // amount in the past window and we are at the original resolution.
+      // Resize back up if the average QP is low and we are currently in a
+      // resized-down state.
+ if (cpi->resize_state == 0 &&
+ cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) {
+ resize_now = 1;
+ cpi->resize_state = 1;
+ } else if (cpi->resize_state == 1 &&
+ avg_qp < 40 * cpi->rc.worst_quality / 100) {
+ resize_now = -1;
+ cpi->resize_state = 0;
+ }
+ // Reset for next window measurement.
+ cpi->resize_avg_qp = 0;
+ cpi->resize_count = 0;
+ cpi->resize_buffer_underflow = 0;
+ }
+ }
+  // If the decision is to resize, reset some quantities and check if we
+  // should reduce the rate correction factor.
+ if (resize_now != 0) {
+ int target_bits_per_frame;
+ int active_worst_quality;
+ int qindex;
+ int tot_scale_change;
+ // For now, resize is by 1/2 x 1/2.
+ cpi->resize_scale_num = 1;
+ cpi->resize_scale_den = 2;
+ tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) /
+ (cpi->resize_scale_num * cpi->resize_scale_num);
+ // Reset buffer level to optimal, update target size.
+ rc->buffer_level = rc->optimal_buffer_level;
+ rc->bits_off_target = rc->optimal_buffer_level;
+ rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi);
+ // Reset cyclic refresh parameters.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
+ av1_cyclic_refresh_reset_resize(cpi);
+ // Get the projected qindex, based on the scaled target frame size (scaled
+    // so target_bits_per_mb in av1_rc_regulate_q will be the correct target).
+ target_bits_per_frame = (resize_now == 1)
+ ? rc->this_frame_target * tot_scale_change
+ : rc->this_frame_target / tot_scale_change;
+ active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+ qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality,
+ active_worst_quality);
+    // If resize is down, check if the projected q index is close to
+    // worst_quality, and if so, reduce the rate correction factor (since we
+    // can likely afford a lower q for the resized frame).
+ if (resize_now == 1 && qindex > 90 * cpi->rc.worst_quality / 100) {
+ rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+ }
+    // If resize is back up, check if the projected q index is too far above
+    // the current base_qindex, and if so, reduce the rate correction factor
+    // (since we prefer to keep the q for the resized frame close to the
+    // previous q).
+ if (resize_now == -1 && qindex > 130 * cm->base_qindex / 100) {
+ rc->rate_correction_factors[INTER_NORMAL] *= 0.9;
+ }
+ }
+ return resize_now;
+}
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
new file mode 100644
index 000000000..93a9b4939
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RATECTRL_H_
+#define AV1_ENCODER_RATECTRL_H_
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS 9
+
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 16
+#define FIXED_GF_INTERVAL 8 // Used in some testing modes only
+
+#if CONFIG_EXT_REFS
+typedef enum {
+ INTER_NORMAL = 0,
+ INTER_LOW = 1,
+ INTER_HIGH = 2,
+ GF_ARF_LOW = 3,
+ GF_ARF_STD = 4,
+ KF_STD = 5,
+ RATE_FACTOR_LEVELS = 6
+} RATE_FACTOR_LEVEL;
+#else
+typedef enum {
+ INTER_NORMAL = 0,
+ INTER_HIGH = 1,
+ GF_ARF_LOW = 2,
+ GF_ARF_STD = 3,
+ KF_STD = 4,
+ RATE_FACTOR_LEVELS = 5
+} RATE_FACTOR_LEVEL;
+#endif // CONFIG_EXT_REFS
+
+// Internal frame scaling level.
+typedef enum {
+ UNSCALED = 0, // Frame is unscaled.
+ SCALE_STEP1 = 1, // First-level down-scaling.
+ FRAME_SCALE_STEPS
+} FRAME_SCALE_LEVEL;
+
+// Frame dimensions multiplier wrt the native frame size, in 1/16ths,
+// specified for the scale-up case.
+// e.g. 24 => 16/24 = 2/3 of native size. The restriction to 1/16th is
+// intended to match the capabilities of the normative scaling filters,
+// giving precedence to the up-scaling accuracy.
+static const int frame_scale_factor[FRAME_SCALE_STEPS] = { 16, 24 };
+
+// Multiplier of the target rate to be used as threshold for triggering scaling.
+static const double rate_thresh_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 };
+
+// Scale dependent Rate Correction Factor multipliers. Compensates for the
+// greater number of bits per pixel generated in down-scaled frames.
+static const double rcf_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 };
+
+typedef struct {
+  // Rate targeting variables
+ int base_frame_target; // A baseline frame target before adjustment
+ // for previous under or over shoot.
+ int this_frame_target; // Actual frame target after rc adjustment.
+ int projected_frame_size;
+ int sb64_target_rate;
+ int last_q[FRAME_TYPES]; // Separate values for Intra/Inter
+ int last_boosted_qindex; // Last boosted GF/KF/ARF q
+ int last_kf_qindex; // Q index of the last key frame coded.
+
+ int gfu_boost;
+ int last_boost;
+ int kf_boost;
+
+ double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ int frames_since_golden;
+ int frames_till_gf_update_due;
+ int min_gf_interval;
+ int max_gf_interval;
+ int static_scene_max_gf_interval;
+ int baseline_gf_interval;
+ int constrained_gf_group;
+ int frames_to_key;
+ int frames_since_key;
+ int this_key_frame_forced;
+ int next_key_frame_forced;
+ int source_alt_ref_pending;
+ int source_alt_ref_active;
+ int is_src_frame_alt_ref;
+
+#if CONFIG_EXT_REFS
+ // Length of the bi-predictive frame group interval
+ int bipred_group_interval;
+
+ // NOTE: Different types of frames may have different bits allocated
+ // accordingly, aiming to achieve the overall optimal RD performance.
+ int is_bwd_ref_frame;
+ int is_last_bipred_frame;
+ int is_bipred_frame;
+ int is_src_frame_ext_arf;
+#endif // CONFIG_EXT_REFS
+
+ int avg_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
+ int max_frame_bandwidth; // Maximum burst rate allowed for a frame.
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex[FRAME_TYPES];
+ double tot_q;
+ double avg_q;
+
+ int64_t buffer_level;
+ int64_t bits_off_target;
+ int64_t vbr_bits_off_target;
+ int64_t vbr_bits_off_target_fast;
+
+ int decimation_factor;
+ int decimation_count;
+
+ int rolling_target_bits;
+ int rolling_actual_bits;
+
+ int long_rolling_target_bits;
+ int long_rolling_actual_bits;
+
+ int rate_error_estimate;
+
+ int64_t total_actual_bits;
+ int64_t total_target_bits;
+ int64_t total_target_vs_actual;
+
+ int worst_quality;
+ int best_quality;
+
+ int64_t starting_buffer_level;
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
+
+ // rate control history for last frame(1) and the frame before(2).
+ // -1: undershot
+ // 1: overshoot
+ // 0: not initialized.
+ int rc_1_frame;
+ int rc_2_frame;
+ int q_1_frame;
+ int q_2_frame;
+
+ // Auto frame-scaling variables.
+ FRAME_SCALE_LEVEL frame_size_selector;
+ FRAME_SCALE_LEVEL next_frame_size_selector;
+ int frame_width[FRAME_SCALE_STEPS];
+ int frame_height[FRAME_SCALE_STEPS];
+ int rf_level_maxq[RATE_FACTOR_LEVELS];
+} RATE_CONTROL;
+
+struct AV1_COMP;
+struct AV1EncoderConfig;
+
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, int pass,
+ RATE_CONTROL *rc);
+
+int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+ double correction_factor, aom_bit_depth_t bit_depth);
+
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
+
+void av1_rc_init_minq_luts(void);
+
+int av1_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as bis
+// as that.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+// av1_rc_get_one_pass_vbr_params()
+// av1_rc_get_one_pass_cbr_params()
+// av1_rc_get_first_pass_params()
+// av1_rc_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by one of:
+// av1_rc_postencode_update()
+// av1_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the av1_rc_get_..._params() functions and
+// updated during the av1_rc_postencode_update...() functions.
+// The only exceptions are the av1_rc_drop_frame() and
+// av1_rc_update_rate_correction_factors() functions.
+
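+// A minimal sketch of a one-pass CBR driver following that flow
+// (frame_was_dropped is a hypothetical flag, not part of this header):
+//   av1_rc_get_one_pass_cbr_params(cpi);  // choose frame type & target size
+//   encode_frame_to_data_rate(cpi, ...);  // q via av1_rc_pick_q_and_bounds()
+//   if (frame_was_dropped)
+//     av1_rc_postencode_update_drop_frame(cpi);
+//   else
+//     av1_rc_postencode_update(cpi, bytes_used);
+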
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+void av1_rc_get_one_pass_vbr_params(struct AV1_COMP *cpi);
+void av1_rc_get_one_pass_cbr_params(struct AV1_COMP *cpi);
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
+
+// Updates rate correction factors
+// Changes only the rate correction factors in the rate control structure.
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi);
+
+// Decide if we should drop this frame: for 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int av1_rc_drop_frame(struct AV1_COMP *cpi);
+
+// Computes frame size bounds.
+void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
+ int this_frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit);
+
+// Picks q and q bounds given the target for bits
+int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int *bottom_index,
+ int *top_index);
+
+// Estimates q to achieve a target bits per frame
+int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor, aom_bit_depth_t bit_depth);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
+ int target);
+int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
+ int target);
+// Utility to set frame_target into the RATE_CONTROL structure
+// This function is called only from the av1_rc_get_..._params() functions.
+void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ aom_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+ int qindex, double rate_target_ratio,
+ aom_bit_depth_t bit_depth);
+
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int rf_level, int q);
+
+void av1_rc_update_framerate(struct AV1_COMP *cpi);
+
+void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi,
+ RATE_CONTROL *const rc);
+
+void av1_set_target_rate(struct AV1_COMP *cpi);
+
+int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_RATECTRL_H_
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.c b/third_party/aom/av1/encoder/ratectrl_xiph.c
new file mode 100644
index 000000000..b9f827528
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl_xiph.c
@@ -0,0 +1,1244 @@
+/*
+ * Copyright (c) 2001-2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "av1/common/odintrin.h"
+#include "av1/encoder/ratectrl_xiph.h"
+
+#define OD_Q57(v) ((int64_t)((uint64_t)(v) << 57))
+#define OD_F_Q45(v) ((int64_t)(((v) * ((int64_t)1 << 45))))
+#define OD_F_Q12(v) ((int32_t)(((v) * ((int32_t)1 << 12))))
+
+/*A rough lookup table for tan(x), 0 <= x < pi/2.
+ The values are Q12 fixed-point and spaced at 5 degree intervals.
+ These decisions are somewhat arbitrary, but sufficient for the 2nd order
+ Bessel follower below.
+ Values of x larger than 85 degrees are extrapolated from the last interval,
+ which is way off, but "good enough".*/
+static uint16_t OD_ROUGH_TAN_LOOKUP[18] = { 0, 358, 722, 1098, 1491,
+ 1910, 2365, 2868, 3437, 4096,
+ 4881, 5850, 7094, 8784, 11254,
+ 15286, 23230, 46817 };
+
+/*alpha is Q24 in the range [0,0.5).
+  The return value is 5.12.*/
+static int od_warp_alpha(int alpha) {
+ int i;
+ int d;
+ int t0;
+ int t1;
+ i = alpha * 36 >> 24;
+ if (i >= 17) i = 16;
+ t0 = OD_ROUGH_TAN_LOOKUP[i];
+ t1 = OD_ROUGH_TAN_LOOKUP[i + 1];
+ d = alpha * 36 - (i << 24);
+ return (int)((((int64_t)t0 << 32) + ((t1 - t0) << 8) * (int64_t)d) >> 32);
+}
+
+static const int64_t OD_ATANH_LOG2[32] = {
+ 0x32B803473F7AD0F4LL, 0x2F2A71BD4E25E916LL, 0x2E68B244BB93BA06LL,
+ 0x2E39FB9198CE62E4LL, 0x2E2E683F68565C8FLL, 0x2E2B850BE2077FC1LL,
+ 0x2E2ACC58FE7B78DBLL, 0x2E2A9E2DE52FD5F2LL, 0x2E2A92A338D53EECLL,
+ 0x2E2A8FC08F5E19B6LL, 0x2E2A8F07E51A485ELL, 0x2E2A8ED9BA8AF388LL,
+ 0x2E2A8ECE2FE7384ALL, 0x2E2A8ECB4D3E4B1ALL, 0x2E2A8ECA94940FE8LL,
+ 0x2E2A8ECA6669811DLL, 0x2E2A8ECA5ADEDD6ALL, 0x2E2A8ECA57FC347ELL,
+ 0x2E2A8ECA57438A43LL, 0x2E2A8ECA57155FB4LL, 0x2E2A8ECA5709D510LL,
+ 0x2E2A8ECA5706F267LL, 0x2E2A8ECA570639BDLL, 0x2E2A8ECA57060B92LL,
+ 0x2E2A8ECA57060008LL, 0x2E2A8ECA5705FD25LL, 0x2E2A8ECA5705FC6CLL,
+ 0x2E2A8ECA5705FC3ELL, 0x2E2A8ECA5705FC33LL, 0x2E2A8ECA5705FC30LL,
+ 0x2E2A8ECA5705FC2FLL, 0x2E2A8ECA5705FC2FLL
+};
+
+static int od_ilog64(int64_t v) {
+ static const unsigned char OD_DEBRUIJN_IDX64[64] = {
+ 0, 1, 2, 7, 3, 13, 8, 19, 4, 25, 14, 28, 9, 34, 20, 40,
+ 5, 17, 26, 38, 15, 46, 29, 48, 10, 31, 35, 54, 21, 50, 41, 57,
+ 63, 6, 12, 18, 24, 27, 33, 39, 16, 37, 45, 47, 30, 53, 49, 56,
+ 62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58
+ };
+ int ret;
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ v |= v >> 32;
+ ret = (int)v & 1;
+ v = (v >> 1) + 1;
+ ret += OD_DEBRUIJN_IDX64[v * UINT64_C(0x218A392CD3D5DBF) >> 58 & 0x3F];
+ return ret;
+}
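+
+/*E.g. od_ilog64(1) == 1 and od_ilog64(8) == 4: the result is the index of
+  the highest set bit plus one, i.e. floor(log2(v)) + 1 for v > 0.*/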
+
+/*Computes the binary exponential of logq57.
+ input: a log base 2 in Q57 format
+ output: a 64 bit integer in Q0 (no fraction) */
+static int64_t od_bexp64(int64_t logq57) {
+ int64_t w;
+ int64_t z;
+ int ipart;
+ ipart = (int)(logq57 >> 57);
+ if (ipart < 0) return 0;
+ if (ipart >= 63) return 0x7FFFFFFFFFFFFFFFLL;
+ z = logq57 - OD_Q57(ipart);
+ if (z) {
+ int64_t mask;
+ int64_t wlo;
+ int i;
+ /*C doesn't give us 64x64->128 muls, so we use CORDIC.
+ This is not particularly fast, but it's not being used in time-critical
+ code; it is very accurate.*/
+ /*z is the fractional part of the log in Q62 format.
+ We need 1 bit of headroom since the magnitude can get larger than 1
+ during the iteration, and a sign bit.*/
+ z <<= 5;
+ /*w is the exponential in Q61 format (since it also needs headroom and can
+ get as large as 2.0); we could get another bit if we dropped the sign,
+ but we'll recover that bit later anyway.
+ Ideally this should start out as
+ \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}}
+ but in order to guarantee convergence we have to repeat iterations 4,
+ 13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/
+ w = 0x26A3D0E401DD846DLL;
+ for (i = 0;; i++) {
+ mask = -(z < 0);
+ w += ((w >> (i + 1)) + mask) ^ mask;
+ z -= (OD_ATANH_LOG2[i] + mask) ^ mask;
+ /*Repeat iteration 4.*/
+ if (i >= 3) break;
+ z *= 2;
+ }
+ for (;; i++) {
+ mask = -(z < 0);
+ w += ((w >> (i + 1)) + mask) ^ mask;
+ z -= (OD_ATANH_LOG2[i] + mask) ^ mask;
+ /*Repeat iteration 13.*/
+ if (i >= 12) break;
+ z *= 2;
+ }
+ for (; i < 32; i++) {
+ mask = -(z < 0);
+ w += ((w >> (i + 1)) + mask) ^ mask;
+ z = (z - ((OD_ATANH_LOG2[i] + mask) ^ mask)) * 2;
+ }
+ wlo = 0;
+ /*Skip the remaining iterations unless we really require that much
+ precision.
+ We could have bailed out earlier for smaller iparts, but that would
+ require initializing w from a table, as the limit doesn't converge to
+ 61-bit precision until n=30.*/
+ if (ipart > 30) {
+ /*For these iterations, we just update the low bits, as the high bits
+ can't possibly be affected.
+ OD_ATANH_LOG2 has also converged (it actually did so one iteration
+ earlier, but that's no reason for an extra special case).*/
+ for (;; i++) {
+ mask = -(z < 0);
+ wlo += ((w >> i) + mask) ^ mask;
+ z -= (OD_ATANH_LOG2[31] + mask) ^ mask;
+ /*Repeat iteration 40.*/
+ if (i >= 39) break;
+ z <<= 1;
+ }
+ for (; i < 61; i++) {
+ mask = -(z < 0);
+ wlo += ((w >> i) + mask) ^ mask;
+ z = (z - ((OD_ATANH_LOG2[31] + mask) ^ mask)) << 1;
+ }
+ }
+ w = (w << 1) + wlo;
+ } else {
+ w = (int64_t)1 << 62;
+ }
+ if (ipart < 62) {
+ w = ((w >> (61 - ipart)) + 1) >> 1;
+ }
+ return w;
+}
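+
+/*Example: od_bexp64(OD_Q57(3)) == 8. With no fractional part, the CORDIC
+  loops are skipped and only the final rounding shift applies.*/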
+
+/*Computes the binary log of w
+ input: a 64-bit integer in Q0 (no fraction)
+ output: a 64-bit log in Q57 */
+static int64_t od_blog64(int64_t w) {
+ int64_t z;
+ int ipart;
+ if (w <= 0) return -1;
+ ipart = od_ilog64(w) - 1;
+ if (ipart > 61) {
+ w >>= ipart - 61;
+ } else {
+ w <<= 61 - ipart;
+ }
+ z = 0;
+ if (w & (w - 1)) {
+ int64_t x;
+ int64_t y;
+ int64_t u;
+ int64_t mask;
+ int i;
+ /*C doesn't give us 64x64->128 muls, so we use CORDIC.
+ This is not particularly fast, but it's not being used in time-critical
+ code; it is very accurate.*/
+ /*z is the fractional part of the log in Q61 format.*/
+ /*x and y are the cosh() and sinh(), respectively, in Q61 format.
+ We are computing z = 2*atanh(y/x) = 2*atanh((w - 1)/(w + 1)).*/
+ x = w + ((int64_t)1 << 61);
+ y = w - ((int64_t)1 << 61);
+ for (i = 0; i < 4; i++) {
+ mask = -(y < 0);
+ z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask;
+ u = x >> (i + 1);
+ x -= ((y >> (i + 1)) + mask) ^ mask;
+ y -= (u + mask) ^ mask;
+ }
+ /*Repeat iteration 4.*/
+ for (i--; i < 13; i++) {
+ mask = -(y < 0);
+ z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask;
+ u = x >> (i + 1);
+ x -= ((y >> (i + 1)) + mask) ^ mask;
+ y -= (u + mask) ^ mask;
+ }
+ /*Repeat iteration 13.*/
+ for (i--; i < 32; i++) {
+ mask = -(y < 0);
+ z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask;
+ u = x >> (i + 1);
+ x -= ((y >> (i + 1)) + mask) ^ mask;
+ y -= (u + mask) ^ mask;
+ }
+ /*OD_ATANH_LOG2 has converged.*/
+ for (; i < 40; i++) {
+ mask = -(y < 0);
+ z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask;
+ u = x >> (i + 1);
+ x -= ((y >> (i + 1)) + mask) ^ mask;
+ y -= (u + mask) ^ mask;
+ }
+ /*Repeat iteration 40.*/
+ for (i--; i < 62; i++) {
+ mask = -(y < 0);
+ z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask;
+ u = x >> (i + 1);
+ x -= ((y >> (i + 1)) + mask) ^ mask;
+ y -= (u + mask) ^ mask;
+ }
+ z = (z + 8) >> 4;
+ }
+ return OD_Q57(ipart) + z;
+}
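+
+/*Example: od_blog64(8) == OD_Q57(3); for exact powers of two the fractional
+  CORDIC refinement is skipped entirely.*/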
+
+/*Convenience function that converts a Q57 value to a clamped 32-bit Q24 value.
+  in: input in Q57 format.
+  Return: the same number in Q24.*/
+static int32_t od_q57_to_q24(int64_t in) {
+ int64_t ret;
+ ret = (in + ((int64_t)1 << 32)) >> 33;
+ /*0x80000000 is automatically converted to unsigned on 32-bit systems.
+ -0x7FFFFFFF-1 is needed to avoid "promoting" the whole expression to
+ unsigned.*/
+ return (int32_t)OD_CLAMPI(-0x7FFFFFFF - 1, ret, 0x7FFFFFFF);
+}
+
+/*Binary exponential of log_scale with 24-bit fractional precision and
+ saturation.
+ log_scale: A binary logarithm in Q57 format.
+ Return: The binary exponential in Q24 format, saturated to 2**31-1 if
+ log_scale was too large.*/
+static int32_t od_bexp64_q24(int64_t log_scale) {
+ if (log_scale < OD_Q57(8)) {
+ int64_t ret;
+ ret = od_bexp64(log_scale + OD_Q57(24));
+ return ret < 0x7FFFFFFF ? (int32_t)ret : 0x7FFFFFFF;
+ }
+ return 0x7FFFFFFF;
+}
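+
+/*Example: od_bexp64_q24(0) == 1 << 24, i.e. 2**0 == 1.0 in Q24.*/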
+
+/*Re-initialize Bessel filter coefficients with the specified delay.
+ This does not alter the x/y state, but changes the reaction time of the
+ filter.
+  Altering the time constant of a reactive filter without altering internal
+  state is something that has to be done carefully, but our design operates at
+ high enough delays and with small enough time constant changes to make it
+ safe.*/
+static void od_iir_bessel2_reinit(od_iir_bessel2 *f, int delay) {
+ int alpha;
+ int64_t one48;
+ int64_t warp;
+ int64_t k1;
+ int64_t k2;
+ int64_t d;
+ int64_t a;
+ int64_t ik2;
+ int64_t b1;
+ int64_t b2;
+ /*This borrows some code from an unreleased version of Postfish.
+ See the recipe at http://unicorn.us.com/alex/2polefilters.html for details
+ on deriving the filter coefficients.*/
+ /*alpha is Q24*/
+ alpha = (1 << 24) / delay;
+ one48 = (int64_t)1 << 48;
+ /*warp is 7.12*/
+ warp = OD_MAXI(od_warp_alpha(alpha), 1);
+ /*k1 is 9.12*/
+ k1 = 3 * warp;
+ /*k2 is 16.24.*/
+ k2 = k1 * warp;
+ /*d is 16.15.*/
+ d = ((((1 << 12) + k1) << 12) + k2 + 256) >> 9;
+ /*a is 0.32, since d is larger than both 1.0 and k2.*/
+ a = (k2 << 23) / d;
+ /*ik2 is 25.24.*/
+ ik2 = one48 / k2;
+ /*b1 is Q56; in practice, the integer ranges between -2 and 2.*/
+ b1 = 2 * a * (ik2 - (1 << 24));
+ /*b2 is Q56; in practice, the integer ranges between -2 and 2.*/
+ b2 = (one48 << 8) - ((4 * a) << 24) - b1;
+ /*All of the filter parameters are Q24.*/
+ f->c[0] = (int32_t)((b1 + ((int64_t)1 << 31)) >> 32);
+ f->c[1] = (int32_t)((b2 + ((int64_t)1 << 31)) >> 32);
+ f->g = (int32_t)((a + 128) >> 8);
+}
+
+/*Initialize a 2nd order low-pass Bessel filter with the corresponding delay
+ and initial value.
+ value is Q24.*/
+static void od_iir_bessel2_init(od_iir_bessel2 *f, int delay, int32_t value) {
+ od_iir_bessel2_reinit(f, delay);
+ f->y[1] = f->y[0] = f->x[1] = f->x[0] = value;
+}
+
+static int64_t od_iir_bessel2_update(od_iir_bessel2 *f, int32_t x) {
+ int64_t c0;
+ int64_t c1;
+ int64_t g;
+ int64_t x0;
+ int64_t x1;
+ int64_t y0;
+ int64_t y1;
+ int64_t ya;
+ c0 = f->c[0];
+ c1 = f->c[1];
+ g = f->g;
+ x0 = f->x[0];
+ x1 = f->x[1];
+ y0 = f->y[0];
+ y1 = f->y[1];
+ ya = ((x + x0 * 2 + x1) * g + y0 * c0 + y1 * c1 + (1 << 23)) >> 24;
+ f->x[1] = (int32_t)x0;
+ f->x[0] = x;
+ f->y[1] = (int32_t)y0;
+ f->y[0] = (int32_t)ya;
+ return ya;
+}
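+
+/*In difference-equation form (all terms Q24, rounded to nearest):
+    y[n] = (g*(x[n] + 2*x[n-1] + x[n-2]) + c0*y[n-1] + c1*y[n-2]) >> 24
+  i.e. the standard direct realization of the 2nd order Bessel low-pass
+  designed in od_iir_bessel2_reinit().*/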
+
+static void od_enc_rc_reset(od_rc_state *rc) {
+ int64_t npixels;
+ int64_t ibpp;
+ rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate);
+ /*Insane framerates or frame sizes mean insane bitrates.
+ Let's not get carried away.*/
+ if (rc->bits_per_frame > 0x400000000000LL) {
+ rc->bits_per_frame = (int64_t)0x400000000000LL;
+ } else {
+ if (rc->bits_per_frame < 32) {
+ rc->bits_per_frame = 32;
+ }
+ }
+ rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12);
+ rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay;
+ /*Start with a buffer fullness and fullness target of 50% */
+ rc->reservoir_target = (rc->reservoir_max + 1) >> 1;
+ rc->reservoir_fullness = rc->reservoir_target;
+ /*Pick exponents and initial scales for quantizer selection.*/
+ npixels = rc->frame_width * (int64_t)rc->frame_height;
+ rc->log_npixels = od_blog64(npixels);
+ ibpp = npixels / rc->bits_per_frame;
+ /*All of these initial scale/exp values are from Theora, and have not yet
+ been adapted to Daala, so they're certainly wrong.
+ The B-frame values especially are simply copies of the P-frame values.*/
+ if (ibpp < 1) {
+ rc->exp[OD_I_FRAME] = 59;
+ rc->log_scale[OD_I_FRAME] = od_blog64(1997) - OD_Q57(OD_COEFF_SHIFT);
+ } else if (ibpp < 2) {
+ rc->exp[OD_I_FRAME] = 55;
+ rc->log_scale[OD_I_FRAME] = od_blog64(1604) - OD_Q57(OD_COEFF_SHIFT);
+ } else {
+ rc->exp[OD_I_FRAME] = 48;
+ rc->log_scale[OD_I_FRAME] = od_blog64(834) - OD_Q57(OD_COEFF_SHIFT);
+ }
+ if (ibpp < 4) {
+ rc->exp[OD_P_FRAME] = 100;
+ rc->log_scale[OD_P_FRAME] = od_blog64(2249) - OD_Q57(OD_COEFF_SHIFT);
+ } else if (ibpp < 8) {
+ rc->exp[OD_P_FRAME] = 95;
+ rc->log_scale[OD_P_FRAME] = od_blog64(1751) - OD_Q57(OD_COEFF_SHIFT);
+ } else {
+ rc->exp[OD_P_FRAME] = 73;
+ rc->log_scale[OD_P_FRAME] = od_blog64(1260) - OD_Q57(OD_COEFF_SHIFT);
+ }
+  /*Golden and altref P-frames both use the same log_scale and exp modeling
+    values as regular P-frames, and the same scale follower.
+    For convenience in the rate calculation code, we maintain a copy of
+    the scale and exp values in OD_GOLDEN_P_FRAME and OD_ALTREF_P_FRAME.*/
+ rc->exp[OD_GOLDEN_P_FRAME] = rc->exp[OD_P_FRAME];
+ rc->log_scale[OD_GOLDEN_P_FRAME] = rc->log_scale[OD_P_FRAME];
+ rc->exp[OD_ALTREF_P_FRAME] = rc->exp[OD_P_FRAME];
+ rc->log_scale[OD_ALTREF_P_FRAME] = rc->log_scale[OD_P_FRAME];
+ /*We clamp the actual I and B frame delays to a minimum of 10 to work within
+ the range of values where later incrementing the delay works as designed.
+ 10 is not an exact choice, but rather a good working trade-off.*/
+ rc->inter_p_delay = 10;
+ rc->inter_delay_target = rc->reservoir_frame_delay >> 1;
+ memset(rc->frame_count, 0, sizeof(rc->frame_count));
+ /*Drop-frame tracking is concerned with more than just the basic three frame
+ types.
+ It needs to track boosted and cut subtypes (of which there is only one
+ right now, OD_GOLDEN_P_FRAME). */
+ rc->prev_drop_count[OD_I_FRAME] = 0;
+ rc->log_drop_scale[OD_I_FRAME] = OD_Q57(0);
+ rc->prev_drop_count[OD_P_FRAME] = 0;
+ rc->log_drop_scale[OD_P_FRAME] = OD_Q57(0);
+ rc->prev_drop_count[OD_GOLDEN_P_FRAME] = 0;
+ rc->log_drop_scale[OD_GOLDEN_P_FRAME] = OD_Q57(0);
+ rc->prev_drop_count[OD_ALTREF_P_FRAME] = 0;
+ rc->log_drop_scale[OD_ALTREF_P_FRAME] = OD_Q57(0);
+ /*Set up second order followers, initialized according to corresponding
+ time constants.*/
+ od_iir_bessel2_init(&rc->scalefilter[OD_I_FRAME], 4,
+ od_q57_to_q24(rc->log_scale[OD_I_FRAME]));
+ od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], rc->inter_p_delay,
+ od_q57_to_q24(rc->log_scale[OD_P_FRAME]));
+ od_iir_bessel2_init(&rc->vfrfilter[OD_I_FRAME], 4,
+ od_bexp64_q24(rc->log_drop_scale[OD_I_FRAME]));
+ od_iir_bessel2_init(&rc->vfrfilter[OD_P_FRAME], 4,
+ od_bexp64_q24(rc->log_drop_scale[OD_P_FRAME]));
+ od_iir_bessel2_init(&rc->vfrfilter[OD_GOLDEN_P_FRAME], 4,
+ od_bexp64_q24(rc->log_drop_scale[OD_GOLDEN_P_FRAME]));
+ od_iir_bessel2_init(&rc->vfrfilter[OD_ALTREF_P_FRAME], 4,
+ od_bexp64_q24(rc->log_drop_scale[OD_ALTREF_P_FRAME]));
+}
+
+int od_enc_rc_resize(od_rc_state *rc) {
+ /*If encoding has not yet begun, reset the buffer state.*/
+ if (rc->cur_frame == 0) {
+ od_enc_rc_reset(rc);
+ } else {
+ int idt;
+ /*Otherwise, update the bounds on the buffer, but not the current
+ fullness.*/
+ rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate);
+ /*Insane framerates or frame sizes mean insane bitrates.
+ Let's not get carried away.*/
+ if (rc->bits_per_frame > 0x400000000000LL) {
+ rc->bits_per_frame = (int64_t)0x400000000000LL;
+ } else {
+ if (rc->bits_per_frame < 32) {
+ rc->bits_per_frame = 32;
+ }
+ }
+ rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12);
+ rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay;
+ rc->reservoir_target =
+ ((rc->reservoir_max + 1) >> 1) +
+ ((rc->bits_per_frame + 2) >> 2) *
+ OD_MINI(rc->keyframe_rate, rc->reservoir_frame_delay);
+ /*Update the INTER-frame scale filter delay.
+ We jump to it immediately if we've already seen enough frames; otherwise
+ it is simply set as the new target.*/
+ rc->inter_delay_target = idt = OD_MAXI(rc->reservoir_frame_delay >> 1, 10);
+ if (idt < OD_MINI(rc->inter_p_delay, rc->frame_count[OD_P_FRAME])) {
+ od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], idt,
+ rc->scalefilter[OD_P_FRAME].y[0]);
+ rc->inter_p_delay = idt;
+ }
+ }
+ return 0;
+}
+
+int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms) {
+ if (rc->framerate <= 0) return 1;
+ if (rc->target_bitrate > 0) {
+ /*State has already been initialized; rather than reinitialize,
+ adjust the buffering for the new target rate. */
+ rc->target_bitrate = bitrate;
+ return od_enc_rc_resize(rc);
+ }
+ rc->target_quantizer = 0;
+ rc->target_bitrate = bitrate;
+ rc->rate_bias = 0;
+ if (bitrate > 0) {
+    /* The buffer size is clamped to [12, 256]; this interval is short enough
+       to allow reaction, but long enough to allow looking into the next GOP
+       (avoiding the case where the last frames before an I-frame get
+       starved). The 12-frame minimum gives us some chance to distribute bit
+       estimation errors in the worst case. The 256-frame maximum means we'll
+       require 8-10 seconds of pre-buffering at 24-30 fps, which is not
+       unreasonable.*/
+ rc->reservoir_frame_delay =
+ (int)OD_MINI((delay_ms / 1000) * rc->framerate, 256);
+ rc->drop_frames = 1;
+ rc->cap_overflow = 1;
+ rc->cap_underflow = 0;
+ rc->twopass_state = 0;
+ od_enc_rc_reset(rc);
+ }
+ return 0;
+}
+
+/*Scale the number of frames by the number of expected drops/duplicates.*/
+static int od_rc_scale_drop(od_rc_state *rc, int frame_type, int nframes) {
+ if (rc->prev_drop_count[frame_type] > 0 ||
+ rc->log_drop_scale[frame_type] > OD_Q57(0)) {
+ int64_t dup_scale;
+ dup_scale = od_bexp64(((rc->log_drop_scale[frame_type] +
+ od_blog64(rc->prev_drop_count[frame_type] + 1)) >>
+ 1) +
+ OD_Q57(8));
+ if (dup_scale < nframes << 8) {
+ int dup_scalei;
+ dup_scalei = (int)dup_scale;
+ if (dup_scalei > 0) {
+ nframes = ((nframes << 8) + dup_scalei - 1) / dup_scalei;
+ }
+ } else {
+ nframes = !!nframes;
+ }
+ }
+ return nframes;
+}
+
+/*Closed form version of frame determination code.
+ Used by rate control to predict frame types and subtypes into the future.
+  No side effects; may be called any number of times.
+ Note that it ignores end-of-file conditions; one-pass planning *should*
+ ignore end-of-file. */
+int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden,
+ int *is_altref, int64_t *ip_count) {
+ int frame_type;
+ if (coding_frame_count == 0) {
+ *is_golden = 1;
+ *is_altref = 1;
+ *ip_count = 0;
+ frame_type = OD_I_FRAME;
+ } else {
+ int keyrate = rc->keyframe_rate;
+ if (rc->closed_gop) {
+ int ip_per_gop;
+ int gop_n;
+ int gop_i;
+ ip_per_gop = (keyrate - 1) / 2;
+ gop_n = coding_frame_count / keyrate;
+ gop_i = coding_frame_count - gop_n * keyrate;
+ *ip_count = gop_n * ip_per_gop + (gop_i > 0) + (gop_i - 1);
+ frame_type = gop_i == 0 ? OD_I_FRAME : OD_P_FRAME;
+ } else {
+ int ip_per_gop;
+ int gop_n;
+ int gop_i;
+ ip_per_gop = (keyrate);
+ gop_n = (coding_frame_count - 1) / keyrate;
+ gop_i = coding_frame_count - gop_n * keyrate - 1;
+ *ip_count = (coding_frame_count > 0) + gop_n * ip_per_gop + (gop_i);
+      frame_type = gop_i < ip_per_gop - 1 ? OD_P_FRAME : OD_I_FRAME;
+ }
+ }
+ *is_golden =
+ (*ip_count % rc->goldenframe_rate) == 0 || frame_type == OD_I_FRAME;
+ *is_altref = (*ip_count % rc->altref_rate) == 0 || frame_type == OD_I_FRAME;
+ return frame_type;
+}
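+
+/*For instance, with keyframe_rate == 8 (a hypothetical setting) and a closed
+  GOP, coding frames 0, 8, 16, ... come back as OD_I_FRAME and the rest as
+  OD_P_FRAME; the golden/altref flags are then overlaid on P-frames whenever
+  ip_count lands on a multiple of goldenframe_rate or altref_rate.*/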
+
+/*Count frame types forward from the current frame up to but not including
+ the last I-frame in reservoir_frame_delay.
+ If reservoir_frame_delay contains no I-frames (or the current frame is the
+ only I-frame), count all reservoir_frame_delay frames.
+ Returns the number of frames counted.
+ Right now, this implementation is simple, brute-force, and expensive.
+ It is also easy to understand and debug.
+  TODO: replace with a virtual FIFO that keeps running totals, as repeating
+  the counting over and over will have a performance impact on whole-file
+  2-pass usage.*/
+static int frame_type_count(od_rc_state *rc, int nframes[OD_FRAME_NSUBTYPES]) {
+ int i;
+ int j;
+ int acc[OD_FRAME_NSUBTYPES];
+ int count;
+ int reservoir_frames;
+ int reservoir_frame_delay;
+ memset(nframes, 0, OD_FRAME_NSUBTYPES * sizeof(*nframes));
+ memset(acc, 0, sizeof(acc));
+ count = 0;
+ reservoir_frames = 0;
+#if 1
+ /*Go ahead and count past end-of-stream.
+ We won't nail the exact bitrate on short files that end with a partial
+ GOP, but we also won't [potentially] destroy the quality of the last few
+ frames in that same case when we suddenly find out the stream is ending
+ before the original planning horizon.*/
+ reservoir_frame_delay = rc->reservoir_frame_delay;
+#else
+ /*Don't count past the end of the stream (once we know where end-of-stream
+ is).*/
+ reservoir_frame_delay =
+ rc->end_of_input ? rc->input_size + 1 : rc->reservoir_frame_delay;
+#endif
+ for (i = 0; i < reservoir_frame_delay; i++) {
+ int frame_type;
+ int is_golden;
+ int is_altref;
+ int64_t dummy;
+ frame_type =
+ od_frame_type(rc, rc->cur_frame + i, &is_golden, &is_altref, &dummy);
+ switch (frame_type) {
+ case OD_I_FRAME: {
+ for (j = 0; j < OD_FRAME_NSUBTYPES; j++) nframes[j] += acc[j];
+ reservoir_frames += count;
+ memset(acc, 0, sizeof(acc));
+ acc[OD_I_FRAME] = 1;
+ count = 1;
+ break;
+ }
+ case OD_P_FRAME: {
+ if (is_golden) {
+ ++acc[OD_GOLDEN_P_FRAME];
+ ++count;
+ } else if (is_altref) {
+ ++acc[OD_ALTREF_P_FRAME];
+ ++count;
+ } else {
+ ++acc[OD_P_FRAME];
+ ++count;
+ }
+ break;
+ }
+ }
+ }
+ /*If there were no I-frames at all, or only the first frame was an I-frame,
+ the accumulators never flushed and still contain the counts for the
+ entire buffer.
+ In both these cases, we return these counts.
+ Otherwise, we discard what remains in the accumulators as they contain
+ the counts from and past the last I-frame.*/
+ if (reservoir_frames == 0) {
+ for (i = 0; i < OD_FRAME_NSUBTYPES; i++) nframes[i] = acc[i];
+ reservoir_frames += count;
+ }
+ return reservoir_frames;
+}
+
+static int convert_to_ac_quant(int q, int bit_depth) {
+ return lrint(av1_convert_qindex_to_q(q, bit_depth));
+}
+
+int od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc,
+ int is_golden_frame,
+ int is_altref_frame, int frame_type,
+ int *bottom_idx, int *top_idx) {
+ int frame_subtype;
+ int64_t log_cur_scale;
+ int lossy_quantizer_min;
+ int lossy_quantizer_max;
+ double mqp_i = OD_MQP_I;
+ double mqp_p = OD_MQP_P;
+ double mqp_gp = OD_MQP_GP;
+ double mqp_ap = OD_MQP_AP;
+ int reservoir_frames;
+ int nframes[OD_FRAME_NSUBTYPES];
+ int32_t mqp_Q12[OD_FRAME_NSUBTYPES];
+ int64_t dqp_Q45[OD_FRAME_NSUBTYPES];
+ /*Verify the closed-form frame type determination code matches what the
+ input queue set.*/
+ /*One pseudo-non-closed-form caveat:
+ Once we've seen end-of-input, the batched frame determination code
+ suppresses the last open-GOP's I-frame (since it would only be
+ useful for the next GOP, which doesn't exist).
+    Thus, don't check once the input queue is drained.*/
+ if (!rc->end_of_input) {
+ int closed_form_type;
+ int closed_form_golden;
+ int closed_form_altref;
+ int64_t closed_form_cur_frame;
+ closed_form_type =
+ od_frame_type(rc, rc->cur_frame, &closed_form_golden,
+ &closed_form_altref, &closed_form_cur_frame);
+ OD_UNUSED(closed_form_type);
+ OD_UNUSED(is_altref_frame);
+ assert(closed_form_type == frame_type);
+ assert(closed_form_cur_frame == rc->cur_frame);
+ assert(closed_form_altref == is_altref_frame);
+ assert(closed_form_golden == is_golden_frame);
+ }
+
+ log_cur_scale = (int64_t)rc->scalefilter[frame_type].y[0] << 33;
+
+ /*Count the various types and classes of frames.*/
+ reservoir_frames = frame_type_count(rc, nframes);
+ nframes[OD_I_FRAME] = od_rc_scale_drop(rc, OD_I_FRAME, nframes[OD_I_FRAME]);
+ nframes[OD_P_FRAME] = od_rc_scale_drop(rc, OD_P_FRAME, nframes[OD_P_FRAME]);
+ nframes[OD_GOLDEN_P_FRAME] =
+ od_rc_scale_drop(rc, OD_GOLDEN_P_FRAME, nframes[OD_GOLDEN_P_FRAME]);
+ nframes[OD_ALTREF_P_FRAME] =
+ od_rc_scale_drop(rc, OD_ALTREF_P_FRAME, nframes[OD_ALTREF_P_FRAME]);
+
+ switch (rc->twopass_state) {
+ default: break;
+ case 1: {
+ /*Pass 1 mode: use a fixed qi value.*/
+ return rc->firstpass_quant;
+ } break;
+ case 2: {
+ int i;
+ int64_t scale_sum[OD_FRAME_NSUBTYPES];
+ int qti;
+ /*Pass 2 mode: we know exactly how much of each frame type there is in
+ the current buffer window, and have estimates for the scales.*/
+ for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
+        nframes[i] = rc->nframes[i];
+ scale_sum[i] = rc->scale_sum[i];
+ }
+ /*If we're not using the same frame type as in pass 1 (because someone
+ changed the keyframe interval), remove that scale estimate.
+ We'll add in a replacement for the correct frame type below.*/
+ qti = rc->cur_metrics.frame_type;
+ if (qti != frame_type) {
+ nframes[qti]--;
+ scale_sum[qti] -= od_bexp64_q24(rc->cur_metrics.log_scale);
+ }
+ /*Compute log_scale estimates for each frame type from the pass-1 scales
+ we measured in the current window.*/
+ for (qti = 0; qti < OD_FRAME_NSUBTYPES; qti++) {
+ rc->log_scale[qti] = nframes[qti] > 0
+ ? od_blog64(scale_sum[qti]) -
+ od_blog64(nframes[qti]) - OD_Q57(24)
+ : -rc->log_npixels;
+ }
+ /*If we're not using the same frame type as in pass 1, add a scale
+ estimate for the corresponding frame using the current low-pass
+ filter value.
+ This is mostly to ensure we have a valid estimate even when pass 1 had
+ no frames of this type in the buffer window.
+ TODO: We could also plan ahead and figure out how many keyframes we'll
+ be forced to add in the current buffer window.*/
+ qti = rc->cur_metrics.frame_type;
+ if (qti != frame_type) {
+ int64_t scale;
+ scale = rc->log_scale[frame_type] < OD_Q57(23)
+ ? od_bexp64(rc->log_scale[frame_type] + OD_Q57(24))
+ : 0x7FFFFFFFFFFFLL;
+ scale *= nframes[frame_type];
+ nframes[frame_type]++;
+ scale += od_bexp64_q24(log_cur_scale >> 33);
+ rc->log_scale[frame_type] =
+ od_blog64(scale) - od_blog64(nframes[qti]) - OD_Q57(24);
+ } else {
+ log_cur_scale = (int64_t)rc->cur_metrics.log_scale << 33;
+ }
+ } break;
+ }
+
+ /*Quantizer selection sticks to the codable, lossy portion of the quantizer
+ range.*/
+ lossy_quantizer_min = convert_to_ac_quant(rc->minq, rc->bit_depth);
+ lossy_quantizer_max = convert_to_ac_quant(rc->maxq, rc->bit_depth);
+ frame_subtype = frame_type;
+ /*Stash quantizer modulation by frame type.*/
+ mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i);
+ mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p);
+ mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp);
+ mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap);
+ dqp_Q45[OD_I_FRAME] = OD_F_Q45(OD_DQP_I);
+ dqp_Q45[OD_P_FRAME] = OD_F_Q45(OD_DQP_P);
+ dqp_Q45[OD_GOLDEN_P_FRAME] = OD_F_Q45(OD_DQP_GP);
+ dqp_Q45[OD_ALTREF_P_FRAME] = OD_F_Q45(OD_DQP_AP);
+ /*Is rate control active?*/
+ if (rc->target_bitrate <= 0) {
+ /*Rate control is not active; derive quantizer directly from
+ quality parameter and frame type. */
+ /*Can't use the OD_LOSSLESS macro, as it uses state.quantizer to intuit,
+ and we've not set it yet.*/
+ if (rc->quality == 0) {
+ /*Lossless coding requested.*/
+ rc->base_quantizer = 0;
+ rc->target_quantizer = 0;
+ } else {
+ int64_t log_quantizer;
+
+ /* Adjust the modulation constants using the last frame's quantizer. */
+ double mqp_delta = (255 - rc->target_quantizer) / 2000.0f;
+ mqp_i -= mqp_delta;
+ mqp_p += mqp_delta;
+ mqp_gp -= mqp_delta;
+ mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i);
+ mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p);
+ mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp);
+ mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap);
+
+ if (rc->quality == -1) {
+ /*A quality of -1 means quality was unset; use a default.*/
+ rc->base_quantizer = convert_to_ac_quant(10, rc->bit_depth);
+ } else {
+ rc->base_quantizer = convert_to_ac_quant(rc->quality, rc->bit_depth);
+ }
+
+ if (rc->periodic_boosts && !is_golden_frame) {
+ int pattern_rate = (rc->goldenframe_rate >> 1);
+ int dist_to_golden = rc->cur_frame % pattern_rate;
+ int dist_away_golden = pattern_rate - dist_to_golden;
+ int boost = dist_to_golden;
+ if (dist_away_golden > dist_to_golden) boost = dist_away_golden;
+ boost -= pattern_rate;
+ boost *= (rc->base_quantizer) / OD_PERIODIC_BOOST_DIV;
+ rc->base_quantizer = rc->base_quantizer + boost;
+ }
+
+ /*As originally written, qp modulation is applied to the coded quantizer.
+        Because we now have and use a more precise target quantizer for
+        various calculations, that needs to be modulated as well.
+ Calculate what is, effectively, a fractional coded quantizer. */
+ /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/
+ log_quantizer = od_blog64(rc->base_quantizer) - OD_Q57(OD_COEFF_SHIFT);
+ /*log_quantizer to Q21.*/
+ log_quantizer >>= 36;
+ /*scale log quantizer, result is Q33.*/
+ log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12;
+ /*Add Q33 offset to Q33 log_quantizer.*/
+ log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12;
+ /*Modulate quantizer according to frame type; result is Q45.*/
+ log_quantizer *= mqp_Q12[frame_subtype];
+ /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/
+ log_quantizer += dqp_Q45[frame_subtype];
+ /*Back to log2 quantizer in Q57.*/
+ log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) *
+ OD_LOG_QUANTIZER_EXP_Q12 +
+ OD_Q57(OD_COEFF_SHIFT);
+ /*Convert Q57 log2 quantizer to unclamped linear target quantizer value.*/
+ rc->target_quantizer = od_bexp64(log_quantizer);
+ }
+ } else {
+ int clamp;
+ int64_t rate_bias;
+ int64_t rate_total;
+ int base_quantizer;
+ int64_t log_quantizer;
+ int qlo;
+ int qhi;
+ int i;
+ /*We clamp the allowed amount of qi change (after initialization).*/
+ clamp = rc->cur_frame > 0;
+ /*Figure out how to re-distribute bits so that we hit our fullness target
+ before the last keyframe in our current buffer window (after the current
+ frame), or the end of the buffer window, whichever comes first.*/
+ /*Single pass only right now.*/
+ /*If we've been missing our target, add a penalty term.*/
+ rate_bias = (rc->rate_bias / (rc->cur_frame + 1000)) * reservoir_frames;
+ /*rate_total is the total bits available over the next
+ reservoir_frames frames.*/
+ rate_total = rc->reservoir_fullness - rc->reservoir_target + rate_bias +
+ reservoir_frames * rc->bits_per_frame;
+ /*Find a target quantizer that meets our rate target for the specific mix
+ of frame types we'll have over the next frame_delay frames.
+ We model the rate<->quantizer relationship as:
+ rate = scale*(quantizer**-exp)
+ In this case, we have our desired rate, an exponent selected in setup,
+ and a scale that's been measured over our frame history, so we're
+ solving for the quantizer.
+ Exponentiation with arbitrary exponents is expensive, so we work in
+ the binary log domain (binary exp and log aren't too bad):
+ rate = e2(log2_scale - log2_quantizer * exp)
+ There's no easy closed form solution, so we bisection search for it.*/
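+    /*For a single frame type the model inverts directly:
+        log2_quantizer = (log2_scale + log2_npixels - log2_rate) / exp
+      but summing several per-type-modulated contributions (below) has no
+      such closed form, hence the bisection over base_quantizer.*/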
+ /*We do not currently allow rate control to select lossless encoding.*/
+ qlo = 1;
+ /*If there's a quality specified, it's used to select the
+ coarsest base quantizer we can select.
+ Otherwise we can use up to and including the coarsest codable
+ quantizer.*/
+ if (rc->quality > 0)
+ qhi = convert_to_ac_quant(rc->quality, rc->bit_depth);
+ else
+ qhi = lossy_quantizer_max;
+ base_quantizer = (qlo + qhi) >> 1;
+ while (qlo < qhi) {
+ volatile int64_t log_base_quantizer;
+ int64_t diff;
+ int64_t bits;
+ /*Count bits contributed by each frame type using the model.*/
+ bits = 0;
+ log_base_quantizer = od_blog64(base_quantizer);
+ for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
+ /*Modulate base quantizer by frame type.*/
+ /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/
+ log_quantizer = log_base_quantizer - OD_Q57(OD_COEFF_SHIFT);
+ /*log_quantizer to Q21.*/
+ log_quantizer >>= 36;
+ /*scale log quantizer, result is Q33.*/
+ log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12;
+ /*Add Q33 offset to Q33 log_quantizer.*/
+ log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12;
+ /*Modulate quantizer according to frame type; result is Q45.*/
+ log_quantizer *= mqp_Q12[i];
+ /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/
+ log_quantizer += dqp_Q45[i];
+ /*Back to log2 quantizer in Q57.*/
+ log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) *
+ OD_LOG_QUANTIZER_EXP_Q12 +
+ OD_Q57(OD_COEFF_SHIFT);
+ /*Clamp modulated quantizer values.*/
+ log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer,
+ od_blog64(lossy_quantizer_max));
+ /* All the fields here are Q57 except for the exponent which is Q6.*/
+ bits += nframes[i] * od_bexp64(rc->log_scale[i] + rc->log_npixels -
+ (log_quantizer >> 6) * rc->exp[i]);
+ }
+ diff = bits - rate_total;
+ if (diff > 0) {
+ qlo = base_quantizer + 1;
+ } else if (diff < 0) {
+ qhi = base_quantizer - 1;
+ } else {
+ break;
+ }
+ base_quantizer = (qlo + qhi) >> 1;
+ }
+ /*If this was not one of the initial frames, limit the change in base
+ quantizer to within [0.8*Q,1.2*Q], where Q is the previous frame's
+ base quantizer.*/
+ if (clamp) {
+ base_quantizer = OD_CLAMPI((rc->base_quantizer * 0x0CCCD + 0x8000) >> 16,
+ base_quantizer,
+ (rc->base_quantizer * 0x13333 + 0x8000) >> 16);
+ }
+ /*Modulate chosen base quantizer to produce target quantizer.*/
+ log_quantizer = od_blog64(base_quantizer);
+ /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/
+ log_quantizer -= OD_Q57(OD_COEFF_SHIFT);
+ /*log_quantizer to Q21.*/
+ log_quantizer >>= 36;
+ /*scale log quantizer, result is Q33.*/
+ log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12;
+ /*Add Q33 offset to Q33 log_quantizer.*/
+ log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12;
+ /*Modulate quantizer according to frame type; result is Q45.*/
+ log_quantizer *= mqp_Q12[frame_subtype];
+ /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/
+ log_quantizer += dqp_Q45[frame_subtype];
+ /*Back to log2 quantizer in Q57.*/
+ log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) *
+ OD_LOG_QUANTIZER_EXP_Q12 +
+ OD_Q57(OD_COEFF_SHIFT);
+ /*Clamp modulated quantizer values.*/
+ log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer,
+ od_blog64(lossy_quantizer_max));
+ /*The above allocation looks only at the total rate we'll accumulate in
+ the next reservoir_frame_delay frames.
+ However we could overflow the bit reservoir on the very next frame, so
+ check for that here if we're not using a soft target.*/
+ if (rc->cap_overflow) {
+ int64_t margin;
+ int64_t soft_limit;
+ int64_t log_soft_limit;
+ int64_t log_scale_pixels;
+ int64_t exp;
+ int64_t log_qexp;
+ /*Allow 3% of the buffer for prediction error.
+ This should be plenty, and we don't mind if we go a bit over; we only
+ want to keep these bits from being completely wasted.*/
+ margin = (rc->reservoir_max + 31) >> 5;
+ /*We want to use at least this many bits next frame.*/
+ soft_limit = rc->reservoir_fullness + rc->bits_per_frame -
+ (rc->reservoir_max - margin);
+ log_soft_limit = od_blog64(soft_limit);
+ /*If we're predicting we won't use that many bits...*/
+ log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels;
+ exp = rc->exp[frame_subtype];
+ log_qexp = (log_quantizer >> 6) * exp;
+ if (log_scale_pixels - log_qexp < log_soft_limit) {
+ /*Scale the adjustment based on how far into the margin we are.*/
+ log_qexp += ((log_scale_pixels - log_soft_limit - log_qexp) >> 32) *
+ (OD_MINI(margin, soft_limit) << 32) / margin;
+ log_quantizer = (((log_qexp + (exp >> 1)) / exp) << 6);
+ }
+ }
+    /*We just checked that we don't overflow the reservoir on the next frame;
+      now check that we don't underflow and bust the budget (when not using a
+      soft target).
+      This is disabled when a quality bound is set: if we saturate the
+      quantizer to the maximum possible value while a limiting max quality is
+      in effect, the resulting lambda can cause strange behavior.*/
+ if (rc->quality == -1) {
+ int64_t exp;
+ int64_t log_qexp;
+ int64_t log_scale_pixels;
+ int64_t log_hard_limit;
+ /*Compute the maximum number of bits we can use in the next frame.
+ Allow 50% of the rate for a single frame for prediction error.
+ This may not be enough for keyframes or sudden changes in
+ complexity.*/
+ log_hard_limit =
+ od_blog64(rc->reservoir_fullness + (rc->bits_per_frame >> 1));
+ /*If we're predicting we'll use more than this...*/
+ log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels;
+ exp = rc->exp[frame_subtype];
+ log_qexp = (log_quantizer >> 6) * exp;
+ if (log_scale_pixels - log_qexp > log_hard_limit) {
+ /*Force the target to hit our limit exactly.*/
+ log_qexp = log_scale_pixels - log_hard_limit;
+ log_quantizer = (log_qexp + (exp >> 1)) / exp << 6;
+ /*If that target is unreasonable, oh well; we'll have to drop.*/
+ log_quantizer = OD_MAXI(log_quantizer, od_blog64(lossy_quantizer_max));
+ }
+ }
+ /*Compute a final estimate of the number of bits we plan to use, update
+ the running rate bias measurement.*/
+ {
+ int64_t log_qexp;
+ int64_t log_scale_pixels;
+ log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels;
+ log_qexp = (log_quantizer >> 6) * rc->exp[frame_subtype];
+ rc->rate_bias += od_bexp64(log_scale_pixels - log_qexp);
+ }
+ rc->target_quantizer = od_bexp64(log_quantizer);
+ /*The various cappings and adjustments may have altered the log_quantizer
+ target significantly.
+ We can either update the base quantizer to be consistent with the
+ target or let it track separately.
+ Theora behavior effectively keeps them consistent, as it regenerates
+ the effective base quantizer from the target each frame rather than
+ saving both.
+ For Daala, it's easier to allow them to track separately.
+ For now, allow them to track separately and see how it behaves.*/
+ rc->base_quantizer = base_quantizer;
+ }
+ *bottom_idx = lossy_quantizer_min;
+ *top_idx = lossy_quantizer_max;
+ rc->target_quantizer = av1_qindex_from_ac(
+ OD_CLAMPI(lossy_quantizer_min, rc->target_quantizer, lossy_quantizer_max),
+ rc->bit_depth);
+ return rc->target_quantizer;
+}
+
+int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame,
+ int is_altref_frame, int frame_type, int droppable) {
+ int dropped;
+ dropped = 0;
+ /*Update rate control only if rate control is active.*/
+ if (rc->target_bitrate > 0) {
+ int64_t log_scale;
+ int frame_subtype;
+ frame_subtype = frame_type;
+ /*Track non-golden and golden P frame drops separately.*/
+ if (is_golden_frame && frame_type == OD_P_FRAME)
+ frame_subtype = OD_GOLDEN_P_FRAME;
+ else if (is_altref_frame && frame_type == OD_P_FRAME)
+ frame_subtype = OD_ALTREF_P_FRAME;
+ if (bits <= 0) {
+ /*We didn't code any blocks in this frame.*/
+ log_scale = OD_Q57(-64);
+ bits = 0;
+ ++rc->prev_drop_count[frame_subtype];
+ } else {
+ int64_t log_bits;
+ int64_t log_qexp;
+ /*Compute the estimated scale factor for this frame type.*/
+ log_bits = od_blog64(bits);
+ log_qexp = od_blog64(rc->target_quantizer);
+ log_qexp = (log_qexp >> 6) * (rc->exp[frame_type]);
+ log_scale = OD_MINI(log_bits - rc->log_npixels + log_qexp, OD_Q57(16));
+ }
+
+ switch (rc->twopass_state) {
+ case 1: {
+ int golden, altref;
+ int64_t ipc;
+ rc->cur_metrics.frame_type =
+ od_frame_type(rc, rc->cur_frame, &golden, &altref, &ipc);
+ /*Pass 1 mode: save the metrics for this frame.*/
+ rc->cur_metrics.log_scale = od_q57_to_q24(log_scale);
+ } break;
+ case 2: {
+ /*Pass 2 mode:*/
+ int m_frame_type = rc->cur_metrics.frame_type;
+ rc->nframes[m_frame_type]--;
+ rc->scale_sum[m_frame_type] -= od_bexp64_q24(rc->cur_metrics.log_scale);
+ } break;
+ }
+
+ if (bits > 0) {
+ od_iir_bessel2 *f;
+ /*If this is the first example of the given frame type we've
+ seen, we immediately replace the default scale factor guess
+ with the estimate we just computed using the first frame.*/
+ if (rc->frame_count[frame_type] == 0) {
+ f = rc->scalefilter + frame_type;
+ f->y[1] = f->y[0] = f->x[1] = f->x[0] = od_q57_to_q24(log_scale);
+ rc->log_scale[frame_type] = log_scale;
+ } else {
+ /*Lengthen the time constant for the inter filters as we collect more
+ frame statistics, until we reach our target.*/
+ if (frame_type != OD_I_FRAME &&
+ rc->inter_p_delay < rc->inter_delay_target &&
+ rc->frame_count[frame_type] >= rc->inter_p_delay) {
+ od_iir_bessel2_reinit(&rc->scalefilter[frame_type],
+ ++rc->inter_p_delay);
+ }
+ /*Update the low-pass scale filter for this frame type
+ regardless of whether or not we drop this frame.*/
+ rc->log_scale[frame_type] =
+ od_iir_bessel2_update(rc->scalefilter + frame_type,
+ od_q57_to_q24(log_scale))
+ << 33;
+ }
+ /*If this frame busts our budget, it must be dropped.*/
+ if (droppable && rc->reservoir_fullness + rc->bits_per_frame < bits) {
+ ++rc->prev_drop_count[frame_subtype];
+ bits = 0;
+ dropped = 1;
+ } else {
+ uint32_t drop_count;
+ /*Update a low-pass filter to estimate the "real" frame rate taking
+ drops into account.
+ This is only done if the frame is coded, as it needs the final
+ count of dropped frames.*/
+ drop_count = rc->prev_drop_count[frame_subtype] + 1;
+ if (drop_count > 0x7F) {
+ drop_count = 0x7FFFFFFF;
+ } else {
+ drop_count <<= 24;
+ }
+ rc->log_drop_scale[frame_subtype] =
+ od_blog64(od_iir_bessel2_update(rc->vfrfilter + frame_subtype,
+ drop_count)) -
+ OD_Q57(24);
+ /*Zero the drop count for this frame.
+ It will be increased if we drop frames.*/
+ rc->prev_drop_count[frame_subtype] = 0;
+ }
+ /*Increment the frame count for filter adaptation purposes.*/
+ if (!rc->twopass_state) rc->frame_count[frame_type]++;
+ }
+ rc->reservoir_fullness += rc->bits_per_frame - bits;
+ /*If we fill the buffer too quickly and overflow is capped,
+ that rate is lost forever.*/
+ if (rc->cap_overflow && rc->reservoir_fullness > rc->reservoir_max) {
+ rc->reservoir_fullness = rc->reservoir_max;
+ }
+ /*If we drain the buffer too quickly and underflow is capped,
+ don't try to make up that rate later.*/
+ if (rc->cap_underflow && rc->reservoir_fullness < 0) {
+ rc->reservoir_fullness = 0;
+ }
+ /*Adjust the bias for the real bits we've used.*/
+ rc->rate_bias -= bits;
+ }
+ return dropped;
+}
+
+static INLINE void od_rc_buffer_val(od_rc_state *rc, int64_t val, int bytes) {
+ while (bytes-- > 0) {
+ rc->twopass_buffer[rc->twopass_buffer_bytes++] = (uint8_t)(val & 0xFF);
+ val >>= 8;
+ }
+}
+
+static INLINE int64_t od_rc_unbuffer_val(od_rc_state *rc, int bytes) {
+ int64_t ret = 0;
+ int shift = 0;
+ while (bytes-- > 0) {
+ ret |= ((int64_t)rc->twopass_buffer[rc->twopass_buffer_bytes++]) << shift;
+ shift += 8;
+ }
+ return ret;
+}
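+
+/*These two helpers (de)serialize values little-endian: buffering
+ 0x12345678 with bytes = 4 writes 0x78, 0x56, 0x34, 0x12 in that order,
+ and unbuffering 4 bytes from the same offset returns 0x12345678.
+ A hypothetical round-trip sketch (both helpers advance
+ twopass_buffer_bytes, so it is rewound between the two calls):
+   rc->twopass_buffer_bytes = 0;
+   od_rc_buffer_val(rc, 0x12345678, 4);
+   rc->twopass_buffer_bytes = 0;
+   assert(od_rc_unbuffer_val(rc, 4) == 0x12345678);*/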
+
+int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list,
+ int summary) {
+ int i;
+ struct aom_codec_cx_pkt pkt;
+ rc->twopass_buffer = rc->firstpass_buffer;
+ rc->twopass_buffer_bytes = 0;
+ if (!rc->twopass_state) {
+ rc->twopass_state = 1;
+ for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
+ rc->frame_count[i] = 0;
+ rc->exp[i] = 0;
+ rc->scale_sum[i] = 0;
+ }
+ }
+ if (summary) {
+ od_rc_buffer_val(rc, OD_RC_2PASS_MAGIC, 4);
+ od_rc_buffer_val(rc, OD_RC_2PASS_VERSION, 1);
+ for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
+ od_rc_buffer_val(rc, rc->frame_count[i], 4);
+ od_rc_buffer_val(rc, rc->exp[i], 4);
+ od_rc_buffer_val(rc, rc->scale_sum[i], 8);
+ }
+ } else {
+ int frame_type = rc->cur_metrics.frame_type;
+ rc->scale_sum[frame_type] += od_bexp64_q24(rc->cur_metrics.log_scale);
+ rc->frame_count[frame_type]++;
+ od_rc_buffer_val(rc, rc->cur_metrics.frame_type, 1);
+ od_rc_buffer_val(rc, rc->cur_metrics.log_scale, 4);
+ }
+ pkt.data.twopass_stats.buf = rc->firstpass_buffer;
+ pkt.data.twopass_stats.sz = rc->twopass_buffer_bytes;
+ pkt.kind = AOM_CODEC_STATS_PKT;
+ aom_codec_pkt_list_add(pkt_list, &pkt);
+ return 0;
+}
+
+int od_enc_rc_2pass_in(od_rc_state *rc) {
+ /* Enable pass 2 mode if this is the first call. */
+ if (rc->twopass_state == 0) {
+ uint32_t i, total_frames = 0;
+
+ if (!rc->twopass_allframes_buf ||
+ rc->twopass_allframes_buf_size < OD_RC_2PASS_MIN)
+ return -1;
+
+ /* Find summary packet at the end */
+ rc->twopass_buffer = rc->twopass_allframes_buf;
+ rc->twopass_buffer +=
+ rc->twopass_allframes_buf_size - OD_RC_2PASS_SUMMARY_SZ;
+ rc->twopass_buffer_bytes = 0;
+
+ if (od_rc_unbuffer_val(rc, 4) != OD_RC_2PASS_MAGIC) return -1;
+ if (od_rc_unbuffer_val(rc, 1) != OD_RC_2PASS_VERSION) return -1;
+
+ for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
+ rc->frame_count[i] = od_rc_unbuffer_val(rc, 4);
+ rc->exp[i] = od_rc_unbuffer_val(rc, 4);
+ rc->scale_sum[i] = od_rc_unbuffer_val(rc, 8);
+ rc->nframes[i] = rc->frame_count[i];
+ total_frames += rc->frame_count[i];
+ }
+
+ if (total_frames < 1) return -1;
+
+ if (total_frames * OD_RC_2PASS_PACKET_SZ > rc->twopass_allframes_buf_size)
+ return -1;
+
+ od_enc_rc_reset(rc);
+
+ /* Everything looks ok */
+ rc->twopass_buffer = rc->twopass_allframes_buf;
+ rc->twopass_state = 2;
+ rc->twopass_buffer_bytes = 0;
+ }
+
+ rc->cur_metrics.frame_type = od_rc_unbuffer_val(rc, 1);
+ rc->cur_metrics.log_scale = od_rc_unbuffer_val(rc, 4);
+
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.h b/third_party/aom/av1/encoder/ratectrl_xiph.h
new file mode 100644
index 000000000..a4a9052fa
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl_xiph.h
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2001-2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if !defined(_ratectrl_xiph_H)
+#define _ratectrl_xiph_H (1)
+
+#include "av1/encoder/ratectrl.h"
+#include "aom/internal/aom_codec_internal.h"
+
+/*Frame types.*/
+#define OD_I_FRAME (0)
+#define OD_P_FRAME (1)
+#define OD_GOLDEN_P_FRAME (2)
+#define OD_ALTREF_P_FRAME (3)
+
+#define OD_FRAME_NSUBTYPES (OD_ALTREF_P_FRAME + 1)
+
+/* Periodic boost (in between golden frames) strength divisor; lower
+ * values give a stronger boost */
+#define OD_PERIODIC_BOOST_DIV (10)
+
+/* Constants for frame QP modulation (tunable).
+ * These adjust how the rate control system chooses the quantizer for each
+ * frame (sub)type. */
+#define OD_MQP_I (0.98)
+#define OD_MQP_P (1.06)
+#define OD_MQP_GP (0.99)
+#define OD_MQP_AP (0.92)
+#define OD_DQP_I (-2)
+#define OD_DQP_P (0)
+#define OD_DQP_GP (-2)
+#define OD_DQP_AP (-2)
+
+/*Fractional_coded_quantizer ~=
+ log2(quantizer / (1 << OD_COEFF_SHIFT))*6.307 + 6.235*/
+/*Base/scale factor for linear quantizer to fractional coded quantizer
+ conversion (6.307 * 2^12) */
+#define OD_LOG_QUANTIZER_BASE_Q12 (0x0064EB)
+/*Inverse of above scale factor.*/
+#define OD_LOG_QUANTIZER_EXP_Q12 (0x000289)
+/*Offset for linear quantizer to fractional coded quantizer
+ conversion (6.235 * 2^45) */
+#define OD_LOG_QUANTIZER_OFFSET_Q45 (0x0000C7851EB851ECLL)
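+
+/*Worked example of the conversion above: for quantizer = 4 << OD_COEFF_SHIFT,
+ log2(quantizer/(1 << OD_COEFF_SHIFT)) = 2, so the fractional coded
+ quantizer is approximately 2*6.307 + 6.235 = 18.849; the Q12 and Q45
+ constants encode the same scale and offset in fixed point.*/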
+
+#define OD_RC_2PASS_MAGIC (0x53015641) /* [A, V, 1, S] in little endian */
+#define OD_RC_2PASS_SUMMARY_SZ (4 + 1 + (4 + 4 + 8) * OD_FRAME_NSUBTYPES)
+#define OD_RC_2PASS_PACKET_SZ (1 + 4)
+#define OD_RC_2PASS_MIN (OD_RC_2PASS_PACKET_SZ + OD_RC_2PASS_SUMMARY_SZ)
+#define OD_RC_2PASS_VERSION (1)
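+
+/*Pass-1 output layout implied by the sizes above: one
+ OD_RC_2PASS_PACKET_SZ record per frame (a 1-byte frame type followed by
+ a 4-byte Q24 log scale), with a single OD_RC_2PASS_SUMMARY_SZ block at
+ the end of the stream (4-byte magic, 1-byte version, then the frame
+ count, exponent and scale sum, as 4-, 4- and 8-byte fields, for each of
+ the OD_FRAME_NSUBTYPES subtypes).*/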
+
+/*A 2nd order low-pass Bessel follower.
+ We use this for rate control because it has fast reaction time, but is
+ critically damped.*/
+typedef struct od_iir_bessel2 {
+ int32_t c[2];
+ int64_t g;
+ int32_t x[2];
+ int32_t y[2];
+} od_iir_bessel2;
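+
+/*The state above supports a conventional direct-form biquad update,
+ roughly y[n] = (g*(x[n] + 2*x[n-1] + x[n-2]) + c[0]*y[n-1] + c[1]*y[n-2])
+ scaled back down to the input precision; c[] and g are chosen so the
+ filter is critically damped (Bessel) for a given time constant.*/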
+
+/* The 2-pass metrics associated with a single frame. */
+typedef struct od_frame_metrics {
+ /*The log base 2 of the scale factor for this frame in Q24 format.*/
+ int64_t log_scale;
+ /*The frame type from pass 1.*/
+ unsigned frame_type : 1;
+} od_frame_metrics;
+
+/*Rate control setup and working state information.*/
+typedef struct od_rc_state {
+ /* Image format */
+ int frame_width;
+ int frame_height;
+ int bit_depth;
+
+ /* Framerate */
+ double framerate;
+ /* Keyframe rate */
+ int keyframe_rate;
+ /* Golden frame period */
+ int goldenframe_rate;
+ /* Altref frame period */
+ int altref_rate;
+ /*The target bit-rate in bits per second.*/
+ int64_t target_bitrate;
+ /* Quality level for non-bitrate-targeting */
+ int quality;
+ /* Copied from oxcf->frame_periodic_boost */
+ int periodic_boosts;
+ /* Max Q */
+ int maxq;
+ /* Min Q */
+ int minq;
+ /* Quantizer to use for the first pass */
+ int firstpass_quant;
+
+ /* 2-pass metrics */
+ od_frame_metrics cur_metrics;
+
+ /* 2-pass state */
+ int64_t scale_sum[OD_FRAME_NSUBTYPES];
+ int nframes[OD_FRAME_NSUBTYPES];
+
+ /* 2-pass bytestream reader/writer context */
+ uint8_t *twopass_buffer;
+ int twopass_buffer_bytes;
+
+ /* Pass 1 stats packet storage */
+ uint8_t firstpass_buffer[OD_RC_2PASS_SUMMARY_SZ];
+
+ /* Every state packet from the first pass in a single buffer */
+ uint8_t *twopass_allframes_buf;
+ size_t twopass_allframes_buf_size;
+
+ /* Actual returned quantizer */
+ int target_quantizer;
+ /*The full-precision, unmodulated quantizer upon which
+ our modulated quantizers are based.*/
+ int base_quantizer;
+
+ /* Increments by 1 for each frame. */
+ int64_t cur_frame;
+
+ /* End of input flag */
+ int end_of_input;
+ /* Closed GOP flag */
+ int closed_gop;
+ /*The number of frames over which to distribute the reservoir usage.*/
+ int reservoir_frame_delay;
+ /*Will we drop frames to meet bitrate target?*/
+ unsigned char drop_frames;
+ /*Do we respect the maximum reservoir fullness?*/
+ unsigned char cap_overflow;
+ /*Can the reservoir go negative?*/
+ unsigned char cap_underflow;
+ /*Two-pass mode state.
+ 0 => 1-pass encoding.
+ 1 => 1st pass of 2-pass encoding.
+ 2 => 2nd pass of 2-pass encoding.*/
+ int twopass_state;
+ /*The log of the number of pixels in a frame in Q57 format.*/
+ int64_t log_npixels;
+ /*The target average bits per frame.*/
+ int64_t bits_per_frame;
+ /*The current bit reservoir fullness (bits available to be used).*/
+ int64_t reservoir_fullness;
+ /*The target buffer fullness.
+ This is where we'd like to be by the last keyframe that appears in the
+ next buf_delay frames.*/
+ int64_t reservoir_target;
+ /*The maximum buffer fullness (total size of the buffer).*/
+ int64_t reservoir_max;
+ /*The log of estimated scale factor for the rate model in Q57 format.*/
+ int64_t log_scale[OD_FRAME_NSUBTYPES];
+ /*The exponent used in the rate model in Q8 format.*/
+ unsigned exp[OD_FRAME_NSUBTYPES];
+ /*The log of an estimated scale factor used to obtain the real framerate, for
+ VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
+ int64_t log_drop_scale[OD_FRAME_NSUBTYPES];
+ /*The total drop count from the previous frame.*/
+ uint32_t prev_drop_count[OD_FRAME_NSUBTYPES];
+ /*Second-order lowpass filters to track scale and VFR/drops.*/
+ od_iir_bessel2 scalefilter[OD_FRAME_NSUBTYPES];
+ od_iir_bessel2 vfrfilter[OD_FRAME_NSUBTYPES];
+ int frame_count[OD_FRAME_NSUBTYPES];
+ int inter_p_delay;
+ int inter_delay_target;
+ /*The total accumulated estimation bias.*/
+ int64_t rate_bias;
+} od_rc_state;
+
+int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms);
+
+int od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc,
+ int is_golden_frame,
+ int is_altref_frame, int frame_type,
+ int *bottom_idx, int *top_idx);
+
+/* Returns 1 if the frame should be dropped */
+int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame,
+ int is_altref_frame, int frame_type, int droppable);
+
+int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden,
+ int *is_altref, int64_t *ip_count);
+
+int od_enc_rc_resize(od_rc_state *rc);
+
+int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list,
+ int summary);
+
+int od_enc_rc_2pass_in(od_rc_state *rc);
+
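+/*Typical one-pass usage sketch; the names follow the declarations above,
+ and the encode step itself is outside this API:
+   od_rc_state rc;
+   od_enc_rc_init(&rc, 1000000, 3000);
+   for (;;) {
+     int golden, altref, bottom, top;
+     int64_t ipc;
+     int ft = od_frame_type(&rc, rc.cur_frame, &golden, &altref, &ipc);
+     od_enc_rc_select_quantizers_and_lambdas(&rc, golden, altref, ft,
+                                             &bottom, &top);
+     ...encode the frame, producing `bits`...
+     od_enc_rc_update_state(&rc, bits, golden, altref, ft, 1);
+   }*/
+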
+#endif
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
new file mode 100644
index 000000000..f06e569e7
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.c
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/tokenize.h"
+
+#define RD_THRESH_POW 1.25
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are stored <<2 (so 2 = x0.5, 4 = x1.0, 32 = x8, etc.).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 2, 2, 2,
+#endif
+ 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32,
+#if CONFIG_EXT_PARTITION
+ 48, 48, 64
+#endif // CONFIG_EXT_PARTITION
+};
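+
+/*Worked example of the <<2 scaling above: the 8x8 baseline factor is 4
+ (x1.0), a 16x16 block uses 8 (x2.0) and a 64x64 block uses 32 (x8.0);
+ set_block_thresholds() below multiplies these factors into the per-mode
+ thresholds.*/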
+
+static void fill_mode_costs(AV1_COMP *cpi) {
+ const FRAME_CONTEXT *const fc = cpi->common.fc;
+ int i, j;
+
+ for (i = 0; i < INTRA_MODES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_cost_tokens(cpi->y_mode_costs[i][j], av1_kf_y_mode_prob[i][j],
+ av1_intra_mode_tree);
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_cost_tokens(cpi->mbmode_cost[i], fc->y_mode_prob[i],
+ av1_intra_mode_tree);
+
+ for (i = 0; i < INTRA_MODES; ++i)
+ av1_cost_tokens(cpi->intra_uv_mode_cost[i], fc->uv_mode_prob[i],
+ av1_intra_mode_tree);
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ av1_cost_tokens(cpi->switchable_interp_costs[i],
+ fc->switchable_interp_prob[i], av1_switchable_interp_tree);
+
+#if CONFIG_PALETTE
+ for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) {
+ av1_cost_tokens(cpi->palette_y_size_cost[i],
+ av1_default_palette_y_size_prob[i], av1_palette_size_tree);
+ av1_cost_tokens(cpi->palette_uv_size_cost[i],
+ av1_default_palette_uv_size_prob[i], av1_palette_size_tree);
+ }
+
+ for (i = 0; i < PALETTE_SIZES; ++i) {
+ for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
+ av1_cost_tokens(cpi->palette_y_color_cost[i][j],
+ av1_default_palette_y_color_index_prob[i][j],
+ av1_palette_color_index_tree[i]);
+ av1_cost_tokens(cpi->palette_uv_color_cost[i][j],
+ av1_default_palette_uv_color_index_prob[i][j],
+ av1_palette_color_index_tree[i]);
+ }
+ }
+#endif // CONFIG_PALETTE
+
+ for (i = 0; i < MAX_TX_DEPTH; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ av1_cost_tokens(cpi->tx_size_cost[i][j], fc->tx_size_probs[i][j],
+ av1_tx_size_tree[i]);
+
+#if CONFIG_EXT_TX
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ if (use_inter_ext_tx_for_txsize[s][i]) {
+ av1_cost_tokens(cpi->inter_tx_type_costs[s][i],
+ fc->inter_ext_tx_prob[s][i], av1_ext_tx_inter_tree[s]);
+ }
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ if (use_intra_ext_tx_for_txsize[s][i]) {
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_cost_tokens(cpi->intra_tx_type_costs[s][i][j],
+ fc->intra_ext_tx_prob[s][i][j],
+ av1_ext_tx_intra_tree[s]);
+ }
+ }
+ }
+#else
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ for (j = 0; j < TX_TYPES; ++j)
+ av1_cost_tokens(cpi->intra_tx_type_costs[i][j],
+ fc->intra_ext_tx_prob[i][j], av1_ext_tx_tree);
+ }
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ av1_cost_tokens(cpi->inter_tx_type_costs[i], fc->inter_ext_tx_prob[i],
+ av1_ext_tx_tree);
+ }
+#endif // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+ for (i = 0; i < INTRA_FILTERS + 1; ++i)
+ av1_cost_tokens(cpi->intra_filter_cost[i], fc->intra_filter_probs[i],
+ av1_intra_filter_tree);
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_LOOP_RESTORATION
+ av1_cost_tokens(cpi->switchable_restore_cost, fc->switchable_restore_prob,
+ av1_switchable_restore_tree);
+#endif // CONFIG_LOOP_RESTORATION
+#if CONFIG_GLOBAL_MOTION
+ av1_cost_tokens(cpi->gmtype_cost, fc->global_motion_types_prob,
+ av1_global_motion_types_tree);
+#endif // CONFIG_GLOBAL_MOTION
+}
+
+void av1_fill_token_costs(av1_coeff_cost *c,
+ av1_coeff_probs_model (*p)[PLANE_TYPES]) {
+ int i, j, k, l;
+ TX_SIZE t;
+ for (t = 0; t < TX_SIZES; ++t)
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ aom_prob probs[ENTROPY_NODES];
+ av1_model_to_full_probs(p[t][i][j][k][l], probs);
+ av1_cost_tokens((int *)c[t][i][j][k][0][l], probs, av1_coef_tree);
+ av1_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
+ av1_coef_tree);
+ assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
+ c[t][i][j][k][1][l][EOB_TOKEN]);
+ }
+}
+
+// Values are correlated to the quantizer.
+static int sad_per_bit16lut_8[QINDEX_RANGE];
+static int sad_per_bit4lut_8[QINDEX_RANGE];
+
+#if CONFIG_HIGHBITDEPTH
+static int sad_per_bit16lut_10[QINDEX_RANGE];
+static int sad_per_bit4lut_10[QINDEX_RANGE];
+static int sad_per_bit16lut_12[QINDEX_RANGE];
+static int sad_per_bit4lut_12[QINDEX_RANGE];
+#endif
+
+static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
+ aom_bit_depth_t bit_depth) {
+ int i;
+ // Initialize the sad lut tables using a formulaic calculation for now.
+ // This is to make it easier to resolve the impact of experimental changes
+ // to the quantizer tables.
+ for (i = 0; i < range; i++) {
+ const double q = av1_convert_qindex_to_q(i, bit_depth);
+ bit16lut[i] = (int)(0.0418 * q + 2.4107);
+ bit4lut[i] = (int)(0.063 * q + 2.742);
+ }
+}
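+
+/*Worked example: at a qindex where av1_convert_qindex_to_q() returns
+ q = 100.0, the formulas above yield bit16lut = (int)(0.0418*100 + 2.4107)
+ = 6 and bit4lut = (int)(0.063*100 + 2.742) = 9.*/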
+
+void av1_init_me_luts(void) {
+ init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
+ AOM_BITS_8);
+#if CONFIG_HIGHBITDEPTH
+ init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
+ AOM_BITS_10);
+ init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
+ AOM_BITS_12);
+#endif
+}
+
+static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
+ 8, 8, 4, 4, 2, 2, 1, 0 };
+static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
+ 128, 144, 128, 128, 144,
+#if CONFIG_EXT_REFS
+ // TODO(zoeliu): Adjust the following factor values further.
+ 128, 128, 128
+ // TODO(weitinglin): We should investigate whether these values should be
+ // the same as the value used by the OVERLAY frame
+ ,
+ 144
+#endif // CONFIG_EXT_REFS
+};
+
+int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
+ const int64_t q = av1_dc_quant(qindex, 0, cpi->common.bit_depth);
+#if CONFIG_HIGHBITDEPTH
+ int64_t rdmult = 0;
+ switch (cpi->common.bit_depth) {
+ case AOM_BITS_8: rdmult = 88 * q * q / 24; break;
+ case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break;
+ case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+#else
+ int64_t rdmult = 88 * q * q / 24;
+#endif // CONFIG_HIGHBITDEPTH
+ if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
+ const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100));
+
+ rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
+ rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
+ }
+ if (rdmult < 1) rdmult = 1;
+ return (int)rdmult;
+}
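+
+/*Worked example (8-bit): for a qindex where av1_dc_quant() returns q = 32,
+ the base multiplier is 88*32*32/24 = 3754. In a two-pass inter frame with
+ rd_frame_type_factor 144 and a gfu_boost large enough to clamp
+ boost_index to 15 (rd_boost_factor 0), this becomes
+ (3754*144) >> 7 = 4223, with no additional boost term.*/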
+
+static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
+ double q;
+#if CONFIG_HIGHBITDEPTH
+ switch (bit_depth) {
+ case AOM_BITS_8: q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0; break;
+ case AOM_BITS_10: q = av1_dc_quant(qindex, 0, AOM_BITS_10) / 16.0; break;
+ case AOM_BITS_12: q = av1_dc_quant(qindex, 0, AOM_BITS_12) / 64.0; break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+#else
+ (void)bit_depth;
+ q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0;
+#endif // CONFIG_HIGHBITDEPTH
+ // TODO(debargha): Adjust the function below.
+ return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
+}
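+
+/*Worked example: for q = 16.0 the factor is pow(16, 1.25)*5.12 =
+ 32*5.12 = 163.84, truncated to 163 (and floored at 8 for very small q).*/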
+
+void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) {
+#if CONFIG_HIGHBITDEPTH
+ switch (cpi->common.bit_depth) {
+ case AOM_BITS_8:
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
+ break;
+ case AOM_BITS_10:
+ x->sadperbit16 = sad_per_bit16lut_10[qindex];
+ x->sadperbit4 = sad_per_bit4lut_10[qindex];
+ break;
+ case AOM_BITS_12:
+ x->sadperbit16 = sad_per_bit16lut_12[qindex];
+ x->sadperbit4 = sad_per_bit4lut_12[qindex];
+ break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ }
+#else
+ (void)cpi;
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
+#endif // CONFIG_HIGHBITDEPTH
+}
+
+static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) {
+ int i, bsize, segment_id;
+
+ for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
+ const int qindex =
+ clamp(av1_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
+ cm->y_dc_delta_q,
+ 0, MAXQ);
+ const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);
+
+ for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
+ // Threshold here seems unnecessarily harsh but fine given actual
+ // range of values used for cpi->sf.thresh_mult[].
+ const int t = q * rd_thresh_block_size_factor[bsize];
+ const int thresh_max = INT_MAX / t;
+
+#if CONFIG_CB4X4
+ for (i = 0; i < MAX_MODES; ++i)
+ rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
+ ? rd->thresh_mult[i] * t / 4
+ : INT_MAX;
+#else
+ if (bsize >= BLOCK_8X8) {
+ for (i = 0; i < MAX_MODES; ++i)
+ rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
+ ? rd->thresh_mult[i] * t / 4
+ : INT_MAX;
+ } else {
+ for (i = 0; i < MAX_REFS; ++i)
+ rd->threshes[segment_id][bsize][i] =
+ rd->thresh_mult_sub8x8[i] < thresh_max
+ ? rd->thresh_mult_sub8x8[i] * t / 4
+ : INT_MAX;
+ }
+#endif
+ }
+ }
+}
+
+#if CONFIG_REF_MV
+void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref,
+ int ref_mv_idx) {
+ MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+ int8_t rf_type = av1_ref_frame_type(x->e_mbd.mi[0]->mbmi.ref_frame);
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], ref, ref_mv_idx);
+ (void)ref_frame;
+ x->mvcost = x->mv_cost_stack[nmv_ctx];
+ x->nmvjointcost = x->nmv_vec_cost[nmv_ctx];
+ x->mvsadcost = x->mvcost;
+ x->nmvjointsadcost = x->nmvjointcost;
+}
+#endif
+
+void av1_initialize_rd_consts(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ RD_OPT *const rd = &cpi->rd;
+ int i;
+#if CONFIG_REF_MV
+ int nmv_ctx;
+#endif
+
+ aom_clear_system_state();
+
+ rd->RDDIV = RDDIV_BITS; // In bits (to multiply D by 128).
+ rd->RDMULT = av1_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
+
+ set_error_per_bit(x, rd->RDMULT);
+
+ set_block_thresholds(cm, rd);
+
+#if CONFIG_REF_MV
+ for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
+ av1_build_nmv_cost_table(
+ x->nmv_vec_cost[nmv_ctx],
+ cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx]
+ : x->nmvcost[nmv_ctx],
+ &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv);
+ }
+ x->mvcost = x->mv_cost_stack[0];
+ x->nmvjointcost = x->nmv_vec_cost[0];
+ x->mvsadcost = x->mvcost;
+ x->nmvjointsadcost = x->nmvjointcost;
+#else
+ av1_build_nmv_cost_table(
+ x->nmvjointcost, cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
+ &cm->fc->nmvc, cm->allow_high_precision_mv);
+#endif
+
+ if (cpi->oxcf.pass != 1) {
+ av1_fill_token_costs(x->token_costs, cm->fc->coef_probs);
+
+ if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+ cm->frame_type == KEY_FRAME) {
+#if CONFIG_EXT_PARTITION_TYPES
+ for (i = 0; i < PARTITION_PLOFFSET; ++i)
+ av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+ av1_partition_tree);
+ for (; i < PARTITION_CONTEXTS_PRIMARY; ++i)
+ av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+ av1_ext_partition_tree);
+#else
+ for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i)
+ av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+ av1_partition_tree);
+#endif // CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_UNPOISON_PARTITION_CTX
+ for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) {
+ aom_prob p = cm->fc->partition_prob[i][PARTITION_VERT];
+ assert(p > 0);
+ cpi->partition_cost[i][PARTITION_NONE] = INT_MAX;
+ cpi->partition_cost[i][PARTITION_HORZ] = INT_MAX;
+ cpi->partition_cost[i][PARTITION_VERT] = av1_cost_bit(p, 0);
+ cpi->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1);
+ }
+ for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) {
+ aom_prob p = cm->fc->partition_prob[i][PARTITION_HORZ];
+ assert(p > 0);
+ cpi->partition_cost[i][PARTITION_NONE] = INT_MAX;
+ cpi->partition_cost[i][PARTITION_HORZ] = av1_cost_bit(p, 0);
+ cpi->partition_cost[i][PARTITION_VERT] = INT_MAX;
+ cpi->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1);
+ }
+ cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_NONE] = INT_MAX;
+ cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_HORZ] = INT_MAX;
+ cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_VERT] = INT_MAX;
+ cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_SPLIT] = 0;
+#endif // CONFIG_UNPOISON_PARTITION_CTX
+ }
+
+ fill_mode_costs(cpi);
+
+ if (!frame_is_intra_only(cm)) {
+#if CONFIG_REF_MV
+ for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
+ cpi->newmv_mode_cost[i][0] = av1_cost_bit(cm->fc->newmv_prob[i], 0);
+ cpi->newmv_mode_cost[i][1] = av1_cost_bit(cm->fc->newmv_prob[i], 1);
+ }
+
+ for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) {
+ cpi->zeromv_mode_cost[i][0] = av1_cost_bit(cm->fc->zeromv_prob[i], 0);
+ cpi->zeromv_mode_cost[i][1] = av1_cost_bit(cm->fc->zeromv_prob[i], 1);
+ }
+
+ for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
+ cpi->refmv_mode_cost[i][0] = av1_cost_bit(cm->fc->refmv_prob[i], 0);
+ cpi->refmv_mode_cost[i][1] = av1_cost_bit(cm->fc->refmv_prob[i], 1);
+ }
+
+ for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
+ cpi->drl_mode_cost0[i][0] = av1_cost_bit(cm->fc->drl_prob[i], 0);
+ cpi->drl_mode_cost0[i][1] = av1_cost_bit(cm->fc->drl_prob[i], 1);
+ }
+#else
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_cost_tokens((int *)cpi->inter_mode_cost[i],
+ cm->fc->inter_mode_probs[i], av1_inter_mode_tree);
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_cost_tokens((int *)cpi->inter_compound_mode_cost[i],
+ cm->fc->inter_compound_mode_probs[i],
+ av1_inter_compound_mode_tree);
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_cost_tokens((int *)cpi->interintra_mode_cost[i],
+ cm->fc->interintra_mode_prob[i],
+ av1_interintra_mode_tree);
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) {
+ av1_cost_tokens((int *)cpi->motion_mode_cost[i],
+ cm->fc->motion_mode_prob[i], av1_motion_mode_tree);
+ }
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) {
+ cpi->motion_mode_cost1[i][0] = av1_cost_bit(cm->fc->obmc_prob[i], 0);
+ cpi->motion_mode_cost1[i][1] = av1_cost_bit(cm->fc->obmc_prob[i], 1);
+ }
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+ }
+}
+
+static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
+ // NOTE: The tables below must be of the same size.
+
+ // The functions described below are sampled at the four most significant
+ // bits of (x^2 + 8/256).
+
+ // Normalized rate:
+ // This table models the rate for a Laplacian source with given variance
+ // when quantized with a uniform quantizer with given stepsize. The
+ // closed form expression is:
+ // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+ // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+ // and H(x) is the binary entropy function.
+ static const int rate_tab_q10[] = {
+ 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142,
+ 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186,
+ 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353,
+ 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651,
+ 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963,
+ 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424,
+ 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87,
+ 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6,
+ 5, 3, 2, 1, 1, 1, 0, 0,
+ };
+ // Normalized distortion:
+ // This table models the normalized distortion for a Laplacian source
+ // with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expression is:
+ // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+ // where x = qpstep / sqrt(variance).
+ // Note the actual distortion is Dn * variance.
+ static const int dist_tab_q10[] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5,
+ 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17,
+ 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54,
+ 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142,
+ 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351,
+ 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659,
+ 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936,
+ 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017,
+ 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
+ };
+ static const int xsq_iq_q10[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32,
+ 40, 48, 56, 64, 72, 80, 88, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 256, 288,
+ 320, 352, 384, 416, 448, 480, 544, 608, 672,
+ 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504,
+ 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296,
+ 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136,
+ 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328,
+ 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736,
+ 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696,
+ 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808,
+ 180192, 196576, 212960, 229344, 245728,
+ };
+ const int tmp = (xsq_q10 >> 2) + 8;
+ const int k = get_msb(tmp) - 3;
+ const int xq = (k << 3) + ((tmp >> k) & 0x7);
+ const int one_q10 = 1 << 10;
+ const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
+ const int b_q10 = one_q10 - a_q10;
+ *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
+ *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
+}
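+
+/*Worked example of the interpolation above: for xsq_q10 = 1024
+ (x^2 = 1.0), tmp = 264, k = 5 and xq = 40, giving a_q10 = 256 and
+ b_q10 = 768; the outputs are then
+ *r_q10 = (2084*768 + 2001*256) >> 10 = 2063 and
+ *d_q10 = (78*768 + 88*256) >> 10 = 80.*/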
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
+ unsigned int qstep, int *rate,
+ int64_t *dist) {
+ // This function models the rate and distortion for a Laplacian
+ // source with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expressions are in:
+ // Hang and Chen, "Source Model for transform video coder and its
+ // application - Part I: Fundamental Theory", IEEE Trans. Circ.
+ // Sys. for Video Tech., April 1997.
+ if (var == 0) {
+ *rate = 0;
+ *dist = 0;
+ } else {
+ int d_q10, r_q10;
+ static const uint32_t MAX_XSQ_Q10 = 245727;
+ const uint64_t xsq_q10_64 =
+ (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
+ const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10);
+ model_rd_norm(xsq_q10, &r_q10, &d_q10);
+ *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT);
+ *dist = (var * (int64_t)d_q10 + 512) >> 10;
+ }
+}
+
+static void get_entropy_contexts_plane(
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
+ const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ const ENTROPY_CONTEXT *const above = pd->above_context;
+ const ENTROPY_CONTEXT *const left = pd->left_context;
+
+ int i;
+
+#if CONFIG_CB4X4
+ switch (tx_size) {
+ case TX_2X2:
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+ break;
+ case TX_4X4:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_8X8:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ case TX_16X16:
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+ case TX_32X32:
+ for (i = 0; i < num_4x4_w; i += 16)
+ t_above[i] =
+ !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
+ for (i = 0; i < num_4x4_h; i += 16)
+ t_left[i] =
+ !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
+ break;
+ case TX_4X8:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ case TX_8X4:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_8X16:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+ case TX_16X8:
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ case TX_16X32:
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 16)
+ t_left[i] =
+ !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
+ break;
+ case TX_32X16:
+ for (i = 0; i < num_4x4_w; i += 16)
+ t_above[i] =
+ !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+
+ default: assert(0 && "Invalid transform size."); break;
+ }
+ return;
+#endif
+
+ switch (tx_size) {
+ case TX_4X4:
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+ break;
+ case TX_8X8:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_16X16:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ case TX_32X32:
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+#if CONFIG_TX64X64
+ case TX_64X64:
+ for (i = 0; i < num_4x4_w; i += 16)
+ t_above[i] =
+ !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
+ for (i = 0; i < num_4x4_h; i += 16)
+ t_left[i] =
+ !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
+ break;
+#endif // CONFIG_TX64X64
+ case TX_4X8:
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_8X4:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+ break;
+ case TX_8X16:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ case TX_16X8:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_16X32:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+ case TX_32X16:
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ default: assert(0 && "Invalid transform size."); break;
+ }
+}
+
+void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left);
+}
+
+void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
+ int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
+ int i;
+ int zero_seen = 0;
+ int best_index = 0;
+ int best_sad = INT_MAX;
+ int this_sad = INT_MAX;
+ int max_mv = 0;
+ int near_same_nearest;
+ uint8_t *src_y_ptr = x->plane[0].src.buf;
+ uint8_t *ref_y_ptr;
+ const int num_mv_refs =
+ MAX_MV_REF_CANDIDATES +
+ (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size);
+
+ MV pred_mv[3];
+ pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
+ pred_mv[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv;
+ pred_mv[2] = x->pred_mv[ref_frame];
+ assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
+
+ near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int ==
+ x->mbmi_ext->ref_mvs[ref_frame][1].as_int;
+ // Get the sad for each candidate reference mv.
+ for (i = 0; i < num_mv_refs; ++i) {
+ const MV *this_mv = &pred_mv[i];
+ int fp_row, fp_col;
+
+ if (i == 1 && near_same_nearest) continue;
+ fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
+ fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
+ max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
+
+ if (fp_row == 0 && fp_col == 0 && zero_seen) continue;
+ zero_seen |= (fp_row == 0 && fp_col == 0);
+
+ ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col];
+ // Find sad for current vector.
+ this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
+ ref_y_ptr, ref_y_stride);
+ // Note if it is the best so far.
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ best_index = i;
+ }
+ }
+
+ // Note the index of the mv that worked best in the reference list.
+ x->mv_best_ref_index[ref_frame] = best_index;
+ x->max_mv_context[ref_frame] = max_mv;
+ x->pred_mv_sad[ref_frame] = best_sad;
+}
+
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv) {
+ int i;
+
+ dst[0].buf = src->y_buffer;
+ dst[0].stride = src->y_stride;
+ dst[1].buf = src->u_buffer;
+ dst[2].buf = src->v_buffer;
+ dst[1].stride = dst[2].stride = src->uv_stride;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ setup_pred_plane(dst + i, xd->mi[0]->mbmi.sb_type, dst[i].buf,
+ i ? src->uv_crop_width : src->y_crop_width,
+ i ? src->uv_crop_height : src->y_crop_height,
+ dst[i].stride, mi_row, mi_col, i ? scale_uv : scale,
+ xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
+ }
+}
+
+int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
+ int stride) {
+ const int bw = b_width_log2_lookup[plane_bsize];
+ const int y = 4 * (raster_block >> bw);
+ const int x = 4 * (raster_block & ((1 << bw) - 1));
+ return y * stride + x;
+}
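+
+/*Worked example: for a 16x16 plane block (bw = 2), raster_block 5 is the
+ second 4x4 unit of the second row, giving y = 4*(5 >> 2) = 4 and
+ x = 4*(5 & 3) = 4, i.e. an offset of 4*stride + 4.*/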
+
+int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
+ int16_t *base) {
+ const int stride = block_size_wide[plane_bsize];
+ return base + av1_raster_block_offset(plane_bsize, raster_block, stride);
+}
+
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
+ int ref_frame) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
+ const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX)
+ ? &cm->buffer_pool->frame_bufs[scaled_idx].buf
+ : NULL;
+}
+
+#if CONFIG_DUAL_FILTER
+int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (cm->interp_filter == SWITCHABLE) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int inter_filter_cost = 0;
+ int dir;
+
+ for (dir = 0; dir < 2; ++dir) {
+ if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+ (mbmi->ref_frame[1] > INTRA_FRAME &&
+ has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ inter_filter_cost +=
+ cpi->switchable_interp_costs[ctx][mbmi->interp_filter[dir]];
+ }
+ }
+ return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+ } else {
+ return 0;
+ }
+}
+#else
+int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (cm->interp_filter == SWITCHABLE) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int ctx = av1_get_pred_context_switchable_interp(xd);
+ return SWITCHABLE_INTERP_RATE_FACTOR *
+ cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
+ }
+ return 0;
+}
+#endif
+
+void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
+ int i;
+ RD_OPT *const rd = &cpi->rd;
+ SPEED_FEATURES *const sf = &cpi->sf;
+
+ // Set baseline threshold values.
+ for (i = 0; i < MAX_MODES; ++i) rd->thresh_mult[i] = cpi->oxcf.mode == 0;
+
+ if (sf->adaptive_rd_thresh) {
+ rd->thresh_mult[THR_NEARESTMV] = 300;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_NEARESTL2] = 300;
+ rd->thresh_mult[THR_NEARESTL3] = 300;
+ rd->thresh_mult[THR_NEARESTB] = 300;
+#endif // CONFIG_EXT_REFS
+ rd->thresh_mult[THR_NEARESTA] = 300;
+ rd->thresh_mult[THR_NEARESTG] = 300;
+ } else {
+ rd->thresh_mult[THR_NEARESTMV] = 0;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_NEARESTL2] = 0;
+ rd->thresh_mult[THR_NEARESTL3] = 0;
+ rd->thresh_mult[THR_NEARESTB] = 0;
+#endif // CONFIG_EXT_REFS
+ rd->thresh_mult[THR_NEARESTA] = 0;
+ rd->thresh_mult[THR_NEARESTG] = 0;
+ }
+
+ rd->thresh_mult[THR_DC] += 1000;
+
+ rd->thresh_mult[THR_NEWMV] += 1000;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_NEWL2] += 1000;
+ rd->thresh_mult[THR_NEWL3] += 1000;
+ rd->thresh_mult[THR_NEWB] += 1000;
+#endif // CONFIG_EXT_REFS
+ rd->thresh_mult[THR_NEWA] += 1000;
+ rd->thresh_mult[THR_NEWG] += 1000;
+
+ rd->thresh_mult[THR_NEARMV] += 1000;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_NEARL2] += 1000;
+ rd->thresh_mult[THR_NEARL3] += 1000;
+ rd->thresh_mult[THR_NEARB] += 1000;
+#endif // CONFIG_EXT_REFS
+ rd->thresh_mult[THR_NEARA] += 1000;
+ rd->thresh_mult[THR_NEARG] += 1000;
+
+ rd->thresh_mult[THR_ZEROMV] += 2000;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_ZEROL2] += 2000;
+ rd->thresh_mult[THR_ZEROL3] += 2000;
+ rd->thresh_mult[THR_ZEROB] += 2000;
+#endif // CONFIG_EXT_REFS
+ rd->thresh_mult[THR_ZEROG] += 2000;
+ rd->thresh_mult[THR_ZEROA] += 2000;
+
+ rd->thresh_mult[THR_TM] += 1000;
+
+#if CONFIG_EXT_INTER
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
+#endif // CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] += 1000;
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
+ rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEARESTL2A] += 1000;
+ rd->thresh_mult[THR_COMP_NEARESTL3A] += 1000;
+#endif // CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEARESTLB] += 1000;
+ rd->thresh_mult[THR_COMP_NEARESTL2B] += 1000;
+ rd->thresh_mult[THR_COMP_NEARESTL3B] += 1000;
+ rd->thresh_mult[THR_COMP_NEARESTGB] += 1000;
+#endif // CONFIG_EXT_REFS
+
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_EXT_INTER
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARLA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTLA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROLA] += 2500;
+
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEAREST_NEARL2A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTL2A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROL2A] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARL3A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTL3A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROL3A] += 2500;
+#endif // CONFIG_EXT_REFS
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARGA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTGA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROGA] += 2500;
+
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEAREST_NEARLB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTLB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROLB] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARL2B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTL2B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROL2B] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARL3B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTL3B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROL3B] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARGB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTGB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGB] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROGB] += 2500;
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
+ rd->thresh_mult[THR_COMP_NEARLA] += 1500;
+ rd->thresh_mult[THR_COMP_NEWLA] += 2000;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEARL2A] += 1500;
+ rd->thresh_mult[THR_COMP_NEWL2A] += 2000;
+ rd->thresh_mult[THR_COMP_NEARL3A] += 1500;
+ rd->thresh_mult[THR_COMP_NEWL3A] += 2000;
+#endif // CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEARGA] += 1500;
+ rd->thresh_mult[THR_COMP_NEWGA] += 2000;
+
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEARLB] += 1500;
+ rd->thresh_mult[THR_COMP_NEWLB] += 2000;
+ rd->thresh_mult[THR_COMP_NEARL2B] += 1500;
+ rd->thresh_mult[THR_COMP_NEWL2B] += 2000;
+ rd->thresh_mult[THR_COMP_NEARL3B] += 1500;
+ rd->thresh_mult[THR_COMP_NEWL3B] += 2000;
+ rd->thresh_mult[THR_COMP_NEARGB] += 1500;
+ rd->thresh_mult[THR_COMP_NEWGB] += 2000;
+#endif // CONFIG_EXT_REFS
+
+ rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_ZEROL2A] += 2500;
+ rd->thresh_mult[THR_COMP_ZEROL3A] += 2500;
+#endif // CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
+
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_ZEROLB] += 2500;
+ rd->thresh_mult[THR_COMP_ZEROL2B] += 2500;
+ rd->thresh_mult[THR_COMP_ZEROL3B] += 2500;
+ rd->thresh_mult[THR_COMP_ZEROGB] += 2500;
+#endif // CONFIG_EXT_REFS
+
+#endif // CONFIG_EXT_INTER
+
+ rd->thresh_mult[THR_H_PRED] += 2000;
+ rd->thresh_mult[THR_V_PRED] += 2000;
+ rd->thresh_mult[THR_D135_PRED] += 2500;
+ rd->thresh_mult[THR_D207_PRED] += 2500;
+ rd->thresh_mult[THR_D153_PRED] += 2500;
+ rd->thresh_mult[THR_D63_PRED] += 2500;
+ rd->thresh_mult[THR_D117_PRED] += 2500;
+ rd->thresh_mult[THR_D45_PRED] += 2500;
+
+#if CONFIG_EXT_INTER
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARL] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWL] += 2000;
+
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL2] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL2] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARL2] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWL2] += 2000;
+
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL3] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL3] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARL3] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWL3] += 2000;
+#endif // CONFIG_EXT_REFS
+
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROG] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARG] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWG] += 2000;
+
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROB] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTB] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARB] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWB] += 2000;
+#endif // CONFIG_EXT_REFS
+
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARA] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWA] += 2000;
+#endif // CONFIG_EXT_INTER
+}
+
+void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
+ static const int thresh_mult[MAX_REFS] = {
+#if CONFIG_EXT_REFS
+ 2500,
+ 2500,
+ 2500,
+ 2500,
+ 2500,
+ 2500,
+ 4500,
+ 4500,
+ 4500,
+ 4500,
+ 4500,
+ 4500,
+ 4500,
+ 4500,
+ 2500
+#else
+ 2500,
+ 2500,
+ 2500,
+ 4500,
+ 4500,
+ 2500
+#endif // CONFIG_EXT_REFS
+ };
+ RD_OPT *const rd = &cpi->rd;
+ memcpy(rd->thresh_mult_sub8x8, thresh_mult, sizeof(thresh_mult));
+}
+
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+ int (*factor_buf)[MAX_MODES], int rd_thresh,
+ int bsize, int best_mode_index) {
+ if (rd_thresh > 0) {
+#if CONFIG_CB4X4
+ const int top_mode = MAX_MODES;
+#else
+ const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
+#endif
+ int mode;
+ for (mode = 0; mode < top_mode; ++mode) {
+ const BLOCK_SIZE min_size = AOMMAX(bsize - 1, BLOCK_4X4);
+ const BLOCK_SIZE max_size = AOMMIN(bsize + 2, (int)cm->sb_size);
+ BLOCK_SIZE bs;
+ for (bs = min_size; bs <= max_size; ++bs) {
+ int *const fact = &factor_buf[bs][mode];
+ if (mode == best_mode_index) {
+ *fact -= (*fact >> 4);
+ } else {
+ *fact = AOMMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
+ }
+ }
+ }
+ }
+}
+
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+ aom_bit_depth_t bit_depth) {
+ const int q = av1_dc_quant(qindex, qdelta, bit_depth);
+#if CONFIG_HIGHBITDEPTH
+ switch (bit_depth) {
+ case AOM_BITS_8: return 20 * q;
+ case AOM_BITS_10: return 5 * q;
+ case AOM_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2);
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+#else
+ return 20 * q;
+#endif // CONFIG_HIGHBITDEPTH
+}
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
new file mode 100644
index 000000000..c0ac1f7e7
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.h
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RD_H_
+#define AV1_ENCODER_RD_H_
+
+#include <limits.h>
+
+#if CONFIG_ANS
+#include "aom_dsp/ans.h"
+#endif // CONFIG_ANS
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/cost.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS 7
+#define RD_EPB_SHIFT 6
+
+#define RDCOST(RM, DM, R, D) \
+ (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), AV1_PROB_COST_SHIFT) + (D << DM))
+
+#define RDCOST_DBL(RM, DM, R, D) \
+ (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
+ ((double)(D) * (1 << (DM))))
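+
+/*Worked example, taking AV1_PROB_COST_SHIFT as 9 (see av1/encoder/cost.h):
+ RDCOST(4096, RDDIV_BITS, 100, 50) =
+ ROUND_POWER_OF_TWO(100 * 4096, 9) + (50 << 7) = 800 + 6400 = 7200.*/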
+
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+#define INVALID_MV 0x80008000
+
+#if CONFIG_EXT_REFS
+#define MAX_REFS 15
+#else
+#define MAX_REFS 6
+#endif // CONFIG_EXT_REFS
+
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC 1
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code.
+typedef enum {
+ THR_NEARESTMV,
+#if CONFIG_EXT_REFS
+ THR_NEARESTL2,
+ THR_NEARESTL3,
+ THR_NEARESTB,
+#endif // CONFIG_EXT_REFS
+ THR_NEARESTA,
+ THR_NEARESTG,
+
+ THR_DC,
+
+ THR_NEWMV,
+#if CONFIG_EXT_REFS
+ THR_NEWL2,
+ THR_NEWL3,
+ THR_NEWB,
+#endif // CONFIG_EXT_REFS
+ THR_NEWA,
+ THR_NEWG,
+
+ THR_NEARMV,
+#if CONFIG_EXT_REFS
+ THR_NEARL2,
+ THR_NEARL3,
+ THR_NEARB,
+#endif // CONFIG_EXT_REFS
+ THR_NEARA,
+ THR_NEARG,
+
+ THR_ZEROMV,
+#if CONFIG_EXT_REFS
+ THR_ZEROL2,
+ THR_ZEROL3,
+ THR_ZEROB,
+#endif // CONFIG_EXT_REFS
+ THR_ZEROG,
+ THR_ZEROA,
+
+#if CONFIG_EXT_INTER
+
+ THR_COMP_NEAREST_NEARESTLA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAREST_NEARESTL2A,
+ THR_COMP_NEAREST_NEARESTL3A,
+#endif // CONFIG_EXT_REFS
+ THR_COMP_NEAREST_NEARESTGA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTGB,
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
+ THR_COMP_NEARESTLA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEARESTL2A,
+ THR_COMP_NEARESTL3A,
+#endif // CONFIG_EXT_REFS
+ THR_COMP_NEARESTGA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEARESTLB,
+ THR_COMP_NEARESTL2B,
+ THR_COMP_NEARESTL3B,
+ THR_COMP_NEARESTGB,
+#endif // CONFIG_EXT_REFS
+
+#endif // CONFIG_EXT_INTER
+
+ THR_TM,
+
+#if CONFIG_ALT_INTRA
+ THR_SMOOTH,
+#endif // CONFIG_ALT_INTRA
+
+#if CONFIG_EXT_INTER
+
+ THR_COMP_NEAR_NEARESTLA,
+ THR_COMP_NEAREST_NEARLA,
+ THR_COMP_NEAR_NEARLA,
+ THR_COMP_NEW_NEARESTLA,
+ THR_COMP_NEAREST_NEWLA,
+ THR_COMP_NEW_NEARLA,
+ THR_COMP_NEAR_NEWLA,
+ THR_COMP_NEW_NEWLA,
+ THR_COMP_ZERO_ZEROLA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAR_NEARESTL2A,
+ THR_COMP_NEAREST_NEARL2A,
+ THR_COMP_NEAR_NEARL2A,
+ THR_COMP_NEW_NEARESTL2A,
+ THR_COMP_NEAREST_NEWL2A,
+ THR_COMP_NEW_NEARL2A,
+ THR_COMP_NEAR_NEWL2A,
+ THR_COMP_NEW_NEWL2A,
+ THR_COMP_ZERO_ZEROL2A,
+
+ THR_COMP_NEAR_NEARESTL3A,
+ THR_COMP_NEAREST_NEARL3A,
+ THR_COMP_NEAR_NEARL3A,
+ THR_COMP_NEW_NEARESTL3A,
+ THR_COMP_NEAREST_NEWL3A,
+ THR_COMP_NEW_NEARL3A,
+ THR_COMP_NEAR_NEWL3A,
+ THR_COMP_NEW_NEWL3A,
+ THR_COMP_ZERO_ZEROL3A,
+#endif // CONFIG_EXT_REFS
+
+ THR_COMP_NEAR_NEARESTGA,
+ THR_COMP_NEAREST_NEARGA,
+ THR_COMP_NEAR_NEARGA,
+ THR_COMP_NEW_NEARESTGA,
+ THR_COMP_NEAREST_NEWGA,
+ THR_COMP_NEW_NEARGA,
+ THR_COMP_NEAR_NEWGA,
+ THR_COMP_NEW_NEWGA,
+ THR_COMP_ZERO_ZEROGA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAR_NEARESTLB,
+ THR_COMP_NEAREST_NEARLB,
+ THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEW_NEARLB,
+ THR_COMP_NEAR_NEWLB,
+ THR_COMP_NEW_NEWLB,
+ THR_COMP_ZERO_ZEROLB,
+
+ THR_COMP_NEAR_NEARESTL2B,
+ THR_COMP_NEAREST_NEARL2B,
+ THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEAR_NEWL2B,
+ THR_COMP_NEW_NEWL2B,
+ THR_COMP_ZERO_ZEROL2B,
+
+ THR_COMP_NEAR_NEARESTL3B,
+ THR_COMP_NEAREST_NEARL3B,
+ THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEAR_NEWL3B,
+ THR_COMP_NEW_NEWL3B,
+ THR_COMP_ZERO_ZEROL3B,
+
+ THR_COMP_NEAR_NEARESTGB,
+ THR_COMP_NEAREST_NEARGB,
+ THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEW_NEARGB,
+ THR_COMP_NEAR_NEWGB,
+ THR_COMP_NEW_NEWGB,
+ THR_COMP_ZERO_ZEROGB,
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
+ THR_COMP_NEARLA,
+ THR_COMP_NEWLA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEARL2A,
+ THR_COMP_NEWL2A,
+ THR_COMP_NEARL3A,
+ THR_COMP_NEWL3A,
+#endif // CONFIG_EXT_REFS
+ THR_COMP_NEARGA,
+ THR_COMP_NEWGA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_NEARLB,
+ THR_COMP_NEWLB,
+ THR_COMP_NEARL2B,
+ THR_COMP_NEWL2B,
+ THR_COMP_NEARL3B,
+ THR_COMP_NEWL3B,
+ THR_COMP_NEARGB,
+ THR_COMP_NEWGB,
+#endif // CONFIG_EXT_REFS
+
+ THR_COMP_ZEROLA,
+#if CONFIG_EXT_REFS
+ THR_COMP_ZEROL2A,
+ THR_COMP_ZEROL3A,
+#endif // CONFIG_EXT_REFS
+ THR_COMP_ZEROGA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_ZEROLB,
+ THR_COMP_ZEROL2B,
+ THR_COMP_ZEROL3B,
+ THR_COMP_ZEROGB,
+#endif // CONFIG_EXT_REFS
+
+#endif // CONFIG_EXT_INTER
+
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D207_PRED,
+ THR_D153_PRED,
+ THR_D63_PRED,
+ THR_D117_PRED,
+ THR_D45_PRED,
+
+#if CONFIG_EXT_INTER
+ THR_COMP_INTERINTRA_ZEROL,
+ THR_COMP_INTERINTRA_NEARESTL,
+ THR_COMP_INTERINTRA_NEARL,
+ THR_COMP_INTERINTRA_NEWL,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_INTERINTRA_ZEROL2,
+ THR_COMP_INTERINTRA_NEARESTL2,
+ THR_COMP_INTERINTRA_NEARL2,
+ THR_COMP_INTERINTRA_NEWL2,
+
+ THR_COMP_INTERINTRA_ZEROL3,
+ THR_COMP_INTERINTRA_NEARESTL3,
+ THR_COMP_INTERINTRA_NEARL3,
+ THR_COMP_INTERINTRA_NEWL3,
+#endif // CONFIG_EXT_REFS
+
+ THR_COMP_INTERINTRA_ZEROG,
+ THR_COMP_INTERINTRA_NEARESTG,
+ THR_COMP_INTERINTRA_NEARG,
+ THR_COMP_INTERINTRA_NEWG,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_INTERINTRA_ZEROB,
+ THR_COMP_INTERINTRA_NEARESTB,
+ THR_COMP_INTERINTRA_NEARB,
+ THR_COMP_INTERINTRA_NEWB,
+#endif // CONFIG_EXT_REFS
+
+ THR_COMP_INTERINTRA_ZEROA,
+ THR_COMP_INTERINTRA_NEARESTA,
+ THR_COMP_INTERINTRA_NEARA,
+ THR_COMP_INTERINTRA_NEWA,
+#endif // CONFIG_EXT_INTER
+ MAX_MODES
+} THR_MODES;
+
+typedef enum {
+ THR_LAST,
+#if CONFIG_EXT_REFS
+ THR_LAST2,
+ THR_LAST3,
+ THR_BWDR,
+#endif // CONFIG_EXT_REFS
+ THR_GOLD,
+ THR_ALTR,
+
+ THR_COMP_LA,
+#if CONFIG_EXT_REFS
+ THR_COMP_L2A,
+ THR_COMP_L3A,
+#endif // CONFIG_EXT_REFS
+ THR_COMP_GA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_LB,
+ THR_COMP_L2B,
+ THR_COMP_L3B,
+ THR_COMP_GB,
+#endif // CONFIG_EXT_REFS
+
+ THR_INTRA,
+} THR_MODES_SUB8X8;
+
+typedef struct RD_OPT {
+ // Thresh_mult is used to set a threshold for the rd score. A higher value
+ // means that we will accept the best mode so far more often. This number
+ // is used in combination with the current block size, and thresh_freq_fact
+ // to pick a threshold.
+ int thresh_mult[MAX_MODES];
+ int thresh_mult_sub8x8[MAX_REFS];
+
+ int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
+
+ int64_t prediction_type_threshes[TOTAL_REFS_PER_FRAME][REFERENCE_MODES];
+
+ int RDMULT;
+ int RDDIV;
+} RD_OPT;
+
+static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->rdcost = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip = 1;
+#if CONFIG_RD_DEBUG
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats->txb_coeff_cost[plane] = 0;
+#if CONFIG_VAR_TX
+ {
+ int r, c;
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c)
+ rd_stats->txb_coeff_cost_map[plane][r][c] = 0;
+ }
+#endif
+ }
+#endif
+}
+
+static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats->rate = INT_MAX;
+ rd_stats->dist = INT64_MAX;
+ rd_stats->rdcost = INT64_MAX;
+ rd_stats->sse = INT64_MAX;
+ rd_stats->skip = 0;
+#if CONFIG_RD_DEBUG
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats->txb_coeff_cost[plane] = INT_MAX;
+#if CONFIG_VAR_TX
+ {
+ int r, c;
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c)
+ rd_stats->txb_coeff_cost_map[plane][r][c] = INT_MAX;
+ }
+#endif
+ }
+#endif
+}
+
+static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
+ const RD_STATS *rd_stats_src) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats_dst->rate += rd_stats_src->rate;
+ rd_stats_dst->dist += rd_stats_src->dist;
+ rd_stats_dst->sse += rd_stats_src->sse;
+ rd_stats_dst->skip &= rd_stats_src->skip;
+#if CONFIG_RD_DEBUG
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
+#if CONFIG_VAR_TX
+ {
+ // TODO(angiebird): optimize this part
+ int r, c;
+ int ref_txb_coeff_cost = 0;
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+ rd_stats_dst->txb_coeff_cost_map[plane][r][c] +=
+ rd_stats_src->txb_coeff_cost_map[plane][r][c];
+ ref_txb_coeff_cost += rd_stats_dst->txb_coeff_cost_map[plane][r][c];
+ }
+ assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]);
+ }
+#endif
+ }
+#endif
+}
+
+struct TileInfo;
+struct TileDataEnc;
+struct AV1_COMP;
+struct macroblock;
+
+int av1_compute_rd_mult(const struct AV1_COMP *cpi, int qindex);
+
+void av1_initialize_rd_consts(struct AV1_COMP *cpi);
+
+void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int qindex);
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
+ unsigned int qstep, int *rate, int64_t *dist);
+
+int av1_get_switchable_rate(const struct AV1_COMP *cpi, const MACROBLOCKD *xd);
+
+int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
+ int stride);
+
+int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
+ int16_t *base);
+
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi,
+ int ref_frame);
+
+void av1_init_me_luts(void);
+
+#if CONFIG_REF_MV
+void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref,
+ int ref_mv_idx);
+#endif
+
+void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]);
+
+void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
+
+void av1_set_rd_speed_thresholds_sub8x8(struct AV1_COMP *cpi);
+
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+ int (*fact)[MAX_MODES], int rd_thresh, int bsize,
+ int best_mode_index);
+
+void av1_fill_token_costs(av1_coeff_cost *c,
+ av1_coeff_probs_model (*p)[PLANE_TYPES]);
+
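+// Returns 1 when best_rd already beats thresh scaled by thresh_fact / 32
+// (a factor of 32 leaves thresh unscaled, since the product is shifted
+// right by 5), or when the mode is disabled outright with
+// thresh == INT_MAX; the mode search then skips the mode.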
+static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+ int thresh_fact) {
+ return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
+}
+
+void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame,
+ BLOCK_SIZE block_size);
+
+static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
+ x->errorperbit = rdmult >> RD_EPB_SHIFT;
+ x->errorperbit += (x->errorperbit == 0);
+}
+
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv);
+
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+ aom_bit_depth_t bit_depth);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_RD_H_
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
new file mode 100644
index 000000000..a1096f782
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -0,0 +1,12713 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+#if CONFIG_LV_MAP
+#include "av1/common/txb_common.h"
+#endif
+#if CONFIG_WARPED_MOTION
+#include "av1/common/warped_motion.h"
+#endif // CONFIG_WARPED_MOTION
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#if CONFIG_LV_MAP
+#include "av1/encoder/encodetxb.h"
+#endif
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/mcomp.h"
+#if CONFIG_PALETTE
+#include "av1/encoder/palette.h"
+#endif // CONFIG_PALETTE
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+#if CONFIG_PVQ
+#include "av1/encoder/pvq_encoder.h"
+#endif // CONFIG_PVQ
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+#include "av1/common/pvq.h"
+#endif // CONFIG_PVQ || CONFIG_DAALA_DIST
+#if CONFIG_DUAL_FILTER
+#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
+static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
+ { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, { 1, 0 }, { 1, 1 },
+ { 1, 2 }, { 1, 3 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
+ { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 },
+};
+#endif // CONFIG_DUAL_FILTER
+
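+// Each <REF>_FRAME_MODE_MASK below has a bit set for every reference frame
+// except <REF> itself (INTRA_FRAME included).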
+#if CONFIG_EXT_REFS
+
+#define LAST_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \
+ (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define LAST2_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) | \
+ (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define LAST3_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
+ (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define GOLDEN_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
+ (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define BWDREF_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
+ (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME))
+#define ALTREF_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
+ (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME))
+
+#else
+
+#define LAST_FRAME_MODE_MASK \
+ ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
+#define GOLDEN_FRAME_MODE_MASK \
+ ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
+#define ALTREF_FRAME_MODE_MASK \
+ ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | (1 << INTRA_FRAME))
+
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | 0x01)
+#else
+#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
+#endif // CONFIG_EXT_REFS
+
+#define MIN_EARLY_TERM_INDEX 3
+#define NEW_MV_DISCOUNT_FACTOR 8
+
+#if CONFIG_EXT_INTRA
+#define ANGLE_SKIP_THRESH 10
+#define FILTER_FAST_SEARCH 1
+#endif // CONFIG_EXT_INTRA
+
+const double ADST_FLIP_SVM[8] = { -6.6623, -2.8062, -3.2531, 3.1671, // vert
+ -7.7051, -3.2234, -3.6193, 3.4533 }; // horz
+
+typedef struct {
+ PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;
+
+struct rdcost_block_args {
+ const AV1_COMP *cpi;
+ MACROBLOCK *x;
+ ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE];
+ RD_STATS rd_stats;
+ int64_t this_rd;
+ int64_t best_rd;
+ int exit_early;
+ int use_fast_coef_costing;
+};
+
+#define LAST_NEW_MV_INDEX 6
+static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
+ { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
+
+ { NEWMV, { LAST_FRAME, NONE_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEWMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { NEARMV, { LAST_FRAME, NONE_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEARMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { ZEROMV, { LAST_FRAME, NONE_FRAME } },
+#if CONFIG_EXT_REFS
+ { ZEROMV, { LAST2_FRAME, NONE_FRAME } },
+ { ZEROMV, { LAST3_FRAME, NONE_FRAME } },
+ { ZEROMV, { BWDREF_FRAME, NONE_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { ZEROMV, { GOLDEN_FRAME, NONE_FRAME } },
+ { ZEROMV, { ALTREF_FRAME, NONE_FRAME } },
+
+// TODO(zoeliu): May need to reconsider the order of the modes to check
+
+#if CONFIG_EXT_INTER
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
+ { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+#endif // CONFIG_EXT_INTER
+
+ { TM_PRED, { INTRA_FRAME, NONE_FRAME } },
+
+#if CONFIG_ALT_INTRA
+ { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
+#endif // CONFIG_ALT_INTRA
+
+#if CONFIG_EXT_INTER
+ { NEAR_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { NEAR_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { LAST2_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { NEAR_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { NEAR_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { LAST_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { LAST3_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
+ { NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { ZEROMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { ZEROMV, { LAST_FRAME, BWDREF_FRAME } },
+ { ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { ZEROMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+#endif // CONFIG_EXT_INTER
+
+ { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D207_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D153_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D63_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D117_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
+
+#if CONFIG_EXT_INTER
+ { ZEROMV, { LAST_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { LAST_FRAME, INTRA_FRAME } },
+ { NEARMV, { LAST_FRAME, INTRA_FRAME } },
+ { NEWMV, { LAST_FRAME, INTRA_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { ZEROMV, { LAST2_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { LAST2_FRAME, INTRA_FRAME } },
+ { NEARMV, { LAST2_FRAME, INTRA_FRAME } },
+ { NEWMV, { LAST2_FRAME, INTRA_FRAME } },
+
+ { ZEROMV, { LAST3_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, INTRA_FRAME } },
+ { NEARMV, { LAST3_FRAME, INTRA_FRAME } },
+ { NEWMV, { LAST3_FRAME, INTRA_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { ZEROMV, { GOLDEN_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, INTRA_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, INTRA_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, INTRA_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { ZEROMV, { BWDREF_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { BWDREF_FRAME, INTRA_FRAME } },
+ { NEARMV, { BWDREF_FRAME, INTRA_FRAME } },
+ { NEWMV, { BWDREF_FRAME, INTRA_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { ZEROMV, { ALTREF_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { ALTREF_FRAME, INTRA_FRAME } },
+ { NEARMV, { ALTREF_FRAME, INTRA_FRAME } },
+ { NEWMV, { ALTREF_FRAME, INTRA_FRAME } },
+#endif // CONFIG_EXT_INTER
+};
+
+static const REF_DEFINITION av1_ref_order[MAX_REFS] = {
+ { { LAST_FRAME, NONE_FRAME } },
+#if CONFIG_EXT_REFS
+ { { LAST2_FRAME, NONE_FRAME } }, { { LAST3_FRAME, NONE_FRAME } },
+ { { BWDREF_FRAME, NONE_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { { GOLDEN_FRAME, NONE_FRAME } }, { { ALTREF_FRAME, NONE_FRAME } },
+
+ { { LAST_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { { LAST2_FRAME, ALTREF_FRAME } }, { { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { { GOLDEN_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { { LAST_FRAME, BWDREF_FRAME } }, { { LAST2_FRAME, BWDREF_FRAME } },
+ { { LAST3_FRAME, BWDREF_FRAME } }, { { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { { INTRA_FRAME, NONE_FRAME } },
+};
+
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
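+// Cost, in units of av1_cost_bit(128, 0), of coding v in [0, n) with a
+// truncated binary code: the first m symbols take l - 1 bits, the rest l.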
+static INLINE int write_uniform_cost(int n, int v) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (l == 0) return 0;
+ if (v < m)
+ return (l - 1) * av1_cost_bit(128, 0);
+ else
+ return l * av1_cost_bit(128, 0);
+}
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+
+// Constants for the prune 1 and prune 2 decision boundaries
+#define FAST_EXT_TX_CORR_MID 0.0
+#define FAST_EXT_TX_EDST_MID 0.1
+#define FAST_EXT_TX_CORR_MARGIN 0.5
+#define FAST_EXT_TX_EDST_MARGIN 0.3
+
+static const TX_TYPE_1D vtx_tab[TX_TYPES] = {
+ DCT_1D, ADST_1D, DCT_1D, ADST_1D,
+#if CONFIG_EXT_TX
+ FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D,
+ DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D,
+#endif // CONFIG_EXT_TX
+};
+
+static const TX_TYPE_1D htx_tab[TX_TYPES] = {
+ DCT_1D, DCT_1D, ADST_1D, ADST_1D,
+#if CONFIG_EXT_TX
+ DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D,
+ IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D,
+#endif // CONFIG_EXT_TX
+};
+
+#if CONFIG_DAALA_DIST
+static int od_compute_var_4x4(od_coeff *x, int stride) {
+ int sum;
+ int s2;
+ int i;
+ sum = 0;
+ s2 = 0;
+ for (i = 0; i < 4; i++) {
+ int j;
+ for (j = 0; j < 4; j++) {
+ int t;
+
+ t = x[i * stride + j];
+ sum += t;
+ s2 += t * t;
+ }
+ }
+ // TODO(yushin): Check whether any changes are required for high bit depth.
+ return (s2 - (sum * sum >> 4)) >> 4;
+}
+
+/* OD_DIST_LP_MID controls the frequency weighting filter used for computing
+ the distortion. For a value X, the filter is [1 X 1]/(X + 2) and
+ is applied both horizontally and vertically. For X=5, the filter is
+ a good approximation for the OD_QM8_Q4_HVS quantization matrix. */
+#define OD_DIST_LP_MID (5)
+#define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2)
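+/* For example, with OD_DIST_LP_MID = 5 the filter is [1 5 1]/7 along each
+   axis. */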
+
+static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x,
+ od_coeff *y, od_coeff *e_lp, int stride) {
+ double sum;
+ int min_var;
+ double mean_var;
+ double var_stat;
+ double activity;
+ double calibration;
+ int i;
+ int j;
+ double vardist;
+
+ vardist = 0;
+ OD_ASSERT(qm != OD_FLAT_QM);
+ (void)qm;
+#if 1
+ min_var = INT_MAX;
+ mean_var = 0;
+ for (i = 0; i < 3; i++) {
+ for (j = 0; j < 3; j++) {
+ int varx;
+ int vary;
+ varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride);
+ vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride);
+ min_var = OD_MINI(min_var, varx);
+ mean_var += 1. / (1 + varx);
+ /* The cast to (double) is to avoid an overflow before the sqrt.*/
+ vardist += varx - 2 * sqrt(varx * (double)vary) + vary;
+ }
+ }
+ /* We use a different variance statistic depending on whether activity
+ masking is used, since the harmonic mean appeared slightly worse with
+ masking off. The calibration constant just ensures that we preserve the
+ rate compared to activity=1. */
+ if (use_activity_masking) {
+ calibration = 1.95;
+ var_stat = 9. / mean_var;
+ } else {
+ calibration = 1.62;
+ var_stat = min_var;
+ }
+ /* 1.62 is a calibration constant, 0.25 is a noise floor and 1/6 is the
+ activity masking constant. */
+ activity = calibration * pow(.25 + var_stat, -1. / 6);
+#else
+ activity = 1;
+#endif // 1
+ sum = 0;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++)
+ sum += e_lp[i * stride + j] * (double)e_lp[i * stride + j];
+ }
+ /* Normalize the filter to unit DC response. */
+ sum *= 1. / (OD_DIST_LP_NORM * OD_DIST_LP_NORM * OD_DIST_LP_NORM *
+ OD_DIST_LP_NORM);
+ return activity * activity * (sum + vardist);
+}
+
+// Note: Inputs x and y are in the pixel domain
+static double od_compute_dist(int qm, int activity_masking, od_coeff *x,
+ od_coeff *y, int bsize_w, int bsize_h,
+ int qindex) {
+ int i;
+ double sum;
+ sum = 0;
+
+ assert(bsize_w >= 8 && bsize_h >= 8);
+
+ if (qm == OD_FLAT_QM) {
+ for (i = 0; i < bsize_w * bsize_h; i++) {
+ double tmp;
+ tmp = x[i] - y[i];
+ sum += tmp * tmp;
+ }
+ } else {
+ int j;
+ DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
+ int mid = OD_DIST_LP_MID;
+ for (i = 0; i < bsize_h; i++) {
+ for (j = 0; j < bsize_w; j++) {
+ e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
+ }
+ }
+ for (i = 0; i < bsize_h; i++) {
+ tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
+ tmp[i * bsize_w + bsize_w - 1] =
+ mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
+ for (j = 1; j < bsize_w - 1; j++) {
+ tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] +
+ e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1];
+ }
+ }
+ for (j = 0; j < bsize_w; j++) {
+ e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j];
+ e_lp[(bsize_h - 1) * bsize_w + j] =
+ mid * tmp[(bsize_h - 1) * bsize_w + j] +
+ 2 * tmp[(bsize_h - 2) * bsize_w + j];
+ }
+ for (i = 1; i < bsize_h - 1; i++) {
+ for (j = 0; j < bsize_w; j++) {
+ e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] +
+ tmp[(i - 1) * bsize_w + j] +
+ tmp[(i + 1) * bsize_w + j];
+ }
+ }
+ for (i = 0; i < bsize_h; i += 8) {
+ for (j = 0; j < bsize_w; j += 8) {
+ sum += od_compute_dist_8x8(qm, activity_masking, &x[i * bsize_w + j],
+ &y[i * bsize_w + j], &e_lp[i * bsize_w + j],
+ bsize_w);
+ }
+ }
+ /* Scale according to linear regression against SSE, for 8x8 blocks. */
+ if (activity_masking) {
+ sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) +
+ (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0);
+ } else {
+ sum *= qindex >= 128
+ ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128)
+ : qindex <= 43
+ ? 1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43)
+ : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43);
+ }
+ }
+ return sum;
+}
+
+static int64_t av1_daala_dist(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride, int bsw,
+ int bsh, int qm, int use_activity_masking,
+ int qindex) {
+ int i, j;
+ int64_t d;
+ DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, rec[MAX_TX_SQUARE]);
+
+ assert(qm == OD_HVS_QM);
+
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
+
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
+
+ d = (int64_t)od_compute_dist(qm, use_activity_masking, orig, rec, bsw, bsh,
+ qindex);
+ return d;
+}
+#endif // CONFIG_DAALA_DIST
+
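+// Accumulates the prediction error energy over a 4x4 grid of sub-blocks and
+// returns the normalized horizontal and vertical marginal distributions
+// (first three bins of each).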
+static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ double *hordist, double *verdist) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+ const int f_index = bsize - BLOCK_16X16;
+ if (f_index < 0) {
+ const int w_shift = bw == 8 ? 1 : 2;
+ const int h_shift = bh == 8 ? 1 : 2;
+#if CONFIG_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth) {
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] +=
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
+ }
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
+ (src[j + i * src_stride] - dst[j + i * dst_stride]);
+ }
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ } else {
+ cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[0]);
+ cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[1]);
+ cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[2]);
+ cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[3]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[4]);
+ cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[5]);
+ cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[6]);
+ cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[7]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[8]);
+ cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[9]);
+ cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[10]);
+ cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[11]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[12]);
+ cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[13]);
+ cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[14]);
+ cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[15]);
+ }
+
+ double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
+ esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] +
+ esq[12] + esq[13] + esq[14] + esq[15];
+ if (total > 0) {
+ const double e_recip = 1.0 / total;
+ hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;
+ hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
+ hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
+ verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;
+ verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
+ verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
+ } else {
+ hordist[0] = verdist[0] = 0.25;
+ hordist[1] = verdist[1] = 0.25;
+ hordist[2] = verdist[2] = 0.25;
+ }
+}
+
+static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride) {
+ int prune_bitmask = 0;
+ double svm_proj_h = 0, svm_proj_v = 0;
+ double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 };
+ get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride,
+ hdist, vdist);
+
+ svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] +
+ vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3];
+ svm_proj_h = hdist[0] * ADST_FLIP_SVM[4] + hdist[1] * ADST_FLIP_SVM[5] +
+ hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7];
+ if (svm_proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << FLIPADST_1D;
+ else if (svm_proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << ADST_1D;
+
+ if (svm_proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << (FLIPADST_1D + 8);
+ else if (svm_proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << (ADST_1D + 8);
+
+ return prune_bitmask;
+}
+
+#if CONFIG_EXT_TX
+static void get_horver_correlation(const int16_t *diff, int stride, int w,
+ int h, double *hcorr, double *vcorr) {
+ // Returns the correlation of each residual pixel with its left (hcorr) and
+ // above (vcorr) neighbor, clamped to be non-negative.
+ const int num = (h - 1) * (w - 1);
+ double num_r;
+ int i, j;
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t x_sum = 0, y_sum = 0, z_sum = 0;
+ int64_t x2_sum = 0, y2_sum = 0, z2_sum = 0;
+ double x_var_n, y_var_n, z_var_n, xy_var_n, xz_var_n;
+ *hcorr = *vcorr = 1;
+
+ assert(num > 0);
+ num_r = 1.0 / num;
+ for (i = 1; i < h; ++i) {
+ for (j = 1; j < w; ++j) {
+ const int16_t x = diff[i * stride + j];
+ const int16_t y = diff[i * stride + j - 1];
+ const int16_t z = diff[(i - 1) * stride + j];
+ xy_sum += x * y;
+ xz_sum += x * z;
+ x_sum += x;
+ y_sum += y;
+ z_sum += z;
+ x2_sum += x * x;
+ y2_sum += y * y;
+ z2_sum += z * z;
+ }
+ }
+ x_var_n = x2_sum - (x_sum * x_sum) * num_r;
+ y_var_n = y2_sum - (y_sum * y_sum) * num_r;
+ z_var_n = z2_sum - (z_sum * z_sum) * num_r;
+ xy_var_n = xy_sum - (x_sum * y_sum) * num_r;
+ xz_var_n = xz_sum - (x_sum * z_sum) * num_r;
+ if (x_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrt(x_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ }
+ if (x_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrt(x_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ }
+}
+
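+// Prunes DCT or IDTX per direction: strong correlation between neighboring
+// residual pixels along a direction favors DCT there, while weak correlation
+// favors the identity transform.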
+int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) {
+ double hcorr, vcorr;
+ int prune_bitmask = 0;
+ get_horver_correlation(diff, stride, w, h, &hcorr, &vcorr);
+
+ if (vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << IDTX_1D;
+ else if (vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << DCT_1D;
+
+ if (hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << (IDTX_1D + 8);
+ else if (hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << (DCT_1D + 8);
+ return prune_bitmask;
+}
+
+// Performance drop: 0.5%, Speed improvement: 24%
+static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, const MACROBLOCKD *xd,
+ int adst_flipadst, int dct_idtx) {
+ int prune = 0;
+
+ if (adst_flipadst) {
+ const struct macroblock_plane *const p = &x->plane[0];
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ prune |= adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+ }
+ if (dct_idtx) {
+ av1_subtract_plane(x, bsize, 0);
+ const struct macroblock_plane *const p = &x->plane[0];
+ const int bw = 4 << (b_width_log2_lookup[bsize]);
+ const int bh = 4 << (b_height_log2_lookup[bsize]);
+ prune |= dct_vs_idtx(p->src_diff, bw, bw, bh);
+ }
+
+ return prune;
+}
+#endif // CONFIG_EXT_TX
+
+// Performance drop: 0.3%, Speed improvement: 5%
+static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ const MACROBLOCK *x, const MACROBLOCKD *xd) {
+ const struct macroblock_plane *const p = &x->plane[0];
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride);
+}
+
+static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
+ const MACROBLOCKD *const xd, int tx_set) {
+#if CONFIG_EXT_TX
+ const int *tx_set_1D = ext_tx_used_inter_1D[tx_set];
+#else
+ const int tx_set_1D[TX_TYPES_1D] = { 0 };
+#endif // CONFIG_EXT_TX
+
+ switch (cpi->sf.tx_type_search.prune_mode) {
+ case NO_PRUNE: return 0; break;
+ case PRUNE_ONE:
+ if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D]))
+ return 0;
+ return prune_one_for_sby(cpi, bsize, x, xd);
+ break;
+#if CONFIG_EXT_TX
+ case PRUNE_TWO:
+ if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
+ if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return 0;
+ return prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
+ }
+ if ((tx_set >= 0) && !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D]))
+ return prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
+ return prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
+ break;
+#endif // CONFIG_EXT_TX
+ }
+ assert(0);
+ return 0;
+}
+
+static int do_tx_type_search(TX_TYPE tx_type, int prune) {
+// TODO(sarahparker): Implement for the non-EXT_TX case.
+#if CONFIG_EXT_TX
+ return !(((prune >> vtx_tab[tx_type]) & 1) |
+ ((prune >> (htx_tab[tx_type] + 8)) & 1));
+#else
+ // temporary to avoid compiler warnings
+ (void)vtx_tab;
+ (void)htx_tab;
+ (void)tx_type;
+ (void)prune;
+ return 1;
+#endif // CONFIG_EXT_TX
+}
+
+static void model_rd_from_sse(const AV1_COMP *const cpi,
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
+ int plane, int64_t sse, int *rate,
+ int64_t *dist) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
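+ // For 8-bit content the dequant values are 8x the effective quantizer,
+ // hence the shift by 3; higher bit depths shift by bd - 5 to also remove
+ // the extra scaling of their quantizer range.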
+ const int dequant_shift =
+#if CONFIG_HIGHBITDEPTH
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
+#endif // CONFIG_HIGHBITDEPTH
+ 3;
+
+ // Fast approximation of the modelling function.
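+ // The simple model is linear in the quantizer: the rate term falls off as
+ // the quantizer grows (clamped to zero from 120 up) and the distortion
+ // term grows as quantizer * SSE.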
+ if (cpi->sf.simple_model_rd_from_var) {
+ const int64_t square_error = sse;
+ int quantizer = (pd->dequant[1] >> dequant_shift);
+
+ if (quantizer < 120)
+ *rate = (int)((square_error * (280 - quantizer)) >>
+ (16 - AV1_PROB_COST_SHIFT));
+ else
+ *rate = 0;
+ *dist = (square_error * quantizer) >> 8;
+ } else {
+ av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize],
+ pd->dequant[1] >> dequant_shift, rate, dist);
+ }
+
+ *dist <<= 4;
+}
+
+static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+ int plane_to, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb,
+ int64_t *skip_sse_sb) {
+ // Note: our transform coefficients are 8 times those of an orthogonal
+ // transform, so the quantizer step is also 8 times larger. To get the
+ // effective quantizer we need to divide by 8 before sending it to the
+ // modeling function.
+ int plane;
+ const int ref = xd->mi[0]->mbmi.ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ x->pred_sse[ref] = 0;
+
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+ const BLOCK_SIZE bs = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#else
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+#endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+
+#if CONFIG_CB4X4
+ if (x->skip_chroma_rd && plane) continue;
+#endif // CONFIG_CB4X4
+
+ // TODO(geza): Write direct sse functions that do not compute
+ // variance as well.
+ cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ &sse);
+
+ if (plane == 0) x->pred_sse[ref] = sse;
+
+ total_sse += sse;
+
+ model_rd_from_sse(cpi, xd, bs, plane, sse, &rate, &dist);
+
+ rate_sum += rate;
+ dist_sum += dist;
+ }
+
+ *skip_txfm_sb = total_sse == 0;
+ *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += coeff[i] * coeff[i];
+ }
+
+ *ssz = sqcoeff;
+ return error;
+}
+
+int64_t av1_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+ int block_size) {
+ int i;
+ int64_t error = 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ }
+
+ return error;
+}
+
+#if CONFIG_HIGHBITDEPTH
+int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, intptr_t block_size,
+ int64_t *ssz, int bd) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+ int shift = 2 * (bd - 8);
+ int rounding = shift > 0 ? 1 << (shift - 1) : 0;
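+ // The per-coefficient errors accumulate in bd-bit units; the rounded shift
+ // by 2 * (bd - 8) below returns them in 8-bit-equivalent units.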
+
+ for (i = 0; i < block_size; i++) {
+ const int64_t diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_PVQ
+// Without PVQ, av1_block_error_c() returns two kinds of errors:
+// 1) the reconstruction (i.e. decoded) error and
+// 2) the squared sum of the transformed residue (i.e. 'coeff').
+// However, if PVQ is enabled, coeff does not hold the transformed residue
+// but instead the transformed original.
+// Hence, a new parameter, the ref vector (i.e. the transformed predicted
+// signal), is required to derive the residue signal,
+// i.e. coeff - ref = residue (all transformed).
+
+#if CONFIG_HIGHBITDEPTH
+static int64_t av1_highbd_block_error2_c(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ const tran_low_t *ref,
+ intptr_t block_size, int64_t *ssz,
+ int bd) {
+ int64_t error;
+ int64_t sqcoeff;
+ int shift = 2 * (bd - 8);
+ int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+ // Use the existing sse codes for calculating distortion of decoded signal:
+ // i.e. (orig - decoded)^2
+ // For high bit depth, throw away ssz until a 32-bit version of
+ // av1_block_error_fp is written.
+ int64_t ssz_trash;
+ error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash);
+ // prediction residue^2 = (orig - ref)^2
+ sqcoeff = av1_block_error(coeff, ref, block_size, &ssz_trash);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+ *ssz = sqcoeff;
+ return error;
+}
+#else
+// TODO(yushin): Since the 4x4 case does not need ssz, it would be better to
+// refactor this into a separate function that skips the extra computations
+// for ssz.
+static int64_t av1_block_error2_c(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ const tran_low_t *ref, intptr_t block_size,
+ int64_t *ssz) {
+ int64_t error;
+ // Use the existing sse codes for calculating distortion of decoded signal:
+ // i.e. (orig - decoded)^2
+ error = av1_block_error_fp(coeff, dqcoeff, block_size);
+ // prediction residue^2 = (orig - ref)^2
+ *ssz = av1_block_error_fp(coeff, ref, block_size);
+ return error;
+}
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_PVQ
+
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+/* The trailing '0' is a terminator which is used inside av1_cost_coeffs() to
+ * decide whether to include the cost of a trailing EOB node or not (i.e. we
+ * can skip this if the last coefficient in this transform block, e.g. the
+ * 16th coefficient in a 4x4 block or the 64th coefficient in an 8x8 block,
+ * is non-zero). */
+#if !CONFIG_LV_MAP
+static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
+ const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
+ int use_fast_coef_costing) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const struct macroblock_plane *p = &x->plane[plane];
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const PLANE_TYPE type = pd->plane_type;
+ const uint16_t *band_count = &band_count_table[tx_size][1];
+ const int eob = p->eobs[block];
+ const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const int tx_size_ctx = txsize_sqr_map[tx_size];
+ unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ x->token_costs[tx_size_ctx][type][is_inter_block(mbmi)];
+ uint8_t token_cache[MAX_TX_SQUARE];
+ int pt = combine_entropy_contexts(*a, *l);
+ int c, cost;
+ const int16_t *scan = scan_order->scan;
+ const int16_t *nb = scan_order->neighbors;
+#if CONFIG_NEW_TOKENSET
+ const int ref = is_inter_block(mbmi);
+ aom_prob *blockz_probs =
+ cm->fc->blockzero_probs[txsize_sqr_map[tx_size]][type][ref];
+
+#endif // CONFIG_NEW_TOKENSET
+
+#if CONFIG_HIGHBITDEPTH
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
+#else
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if !CONFIG_VAR_TX && !CONFIG_SUPERTX
+ // Check for consistency of tx_size with mode info
+ assert(tx_size == get_tx_size(plane, xd));
+#endif // !CONFIG_VAR_TX && !CONFIG_SUPERTX
+ (void)cm;
+
+ if (eob == 0) {
+#if CONFIG_NEW_TOKENSET
+ // single eob token
+ cost = av1_cost_bit(blockz_probs[pt], 0);
+#else
+ cost = token_costs[0][0][pt][EOB_TOKEN];
+#endif // CONFIG_NEW_TOKENSET
+ } else {
+ if (use_fast_coef_costing) {
+ int band_left = *band_count++;
+
+ // dc token
+ int v = qcoeff[0];
+ int16_t prev_t;
+ cost = av1_get_token_cost(v, &prev_t, cat6_bits);
+#if CONFIG_NEW_TOKENSET
+ cost += (*token_costs)[!prev_t][pt][prev_t];
+#else
+ cost += (*token_costs)[0][pt][prev_t];
+#endif
+
+ token_cache[0] = av1_pt_energy_class[prev_t];
+ ++token_costs;
+
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+ int16_t t;
+
+ v = qcoeff[rc];
+ cost += av1_get_token_cost(v, &t, cat6_bits);
+#if CONFIG_NEW_TOKENSET
+ cost += (*token_costs)[!t][!prev_t][t];
+#else
+ cost += (*token_costs)[!prev_t][!prev_t][t];
+#endif
+ prev_t = t;
+ if (!--band_left) {
+ band_left = *band_count++;
+ ++token_costs;
+ }
+ }
+
+ // eob token
+ if (band_left || CONFIG_NEW_TOKENSET)
+ cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
+
+ } else { // !use_fast_coef_costing
+ int band_left = *band_count++;
+
+ // dc token
+ int v = qcoeff[0];
+ int16_t tok;
+#if !CONFIG_NEW_TOKENSET
+ unsigned int(*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
+#endif
+ cost = av1_get_token_cost(v, &tok, cat6_bits);
+#if CONFIG_NEW_TOKENSET
+ cost += (*token_costs)[!tok][pt][tok];
+#else
+ cost += (*token_costs)[0][pt][tok];
+#endif
+
+ token_cache[0] = av1_pt_energy_class[tok];
+ ++token_costs;
+
+#if !CONFIG_NEW_TOKENSET
+ tok_cost_ptr = &((*token_costs)[!tok]);
+#endif
+
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+
+ v = qcoeff[rc];
+ cost += av1_get_token_cost(v, &tok, cat6_bits);
+ pt = get_coef_context(nb, token_cache, c);
+#if CONFIG_NEW_TOKENSET
+ cost += (*token_costs)[!tok][pt][tok];
+#else
+ cost += (*tok_cost_ptr)[pt][tok];
+#endif
+ token_cache[rc] = av1_pt_energy_class[tok];
+ if (!--band_left) {
+ band_left = *band_count++;
+ ++token_costs;
+ }
+#if !CONFIG_NEW_TOKENSET
+ tok_cost_ptr = &((*token_costs)[!tok]);
+#endif
+ }
+
+ // eob token
+ if (band_left || CONFIG_NEW_TOKENSET) {
+ pt = get_coef_context(nb, token_cache, c);
+ cost += (*token_costs)[0][pt][EOB_TOKEN];
+ }
+ }
+ }
+
+ return cost;
+}
+#endif // !CONFIG_LV_MAP
+
+int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
+ const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
+ int use_fast_coef_costing) {
+#if !CONFIG_LV_MAP
+ const AV1_COMMON *const cm = &cpi->common;
+ return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l,
+ use_fast_coef_costing);
+#else // !CONFIG_LV_MAP
+ (void)scan_order;
+ (void)use_fast_coef_costing;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_CB4X4
+#if CONFIG_CHROMA_2X2
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#else
+ const BLOCK_SIZE plane_bsize =
+ AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#endif // CONFIG_CHROMA_2X2
+#else // CONFIG_CB4X4
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
+#endif // CONFIG_CB4X4
+
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ return av1_cost_coeffs_txb(cpi, x, plane, block, &txb_ctx);
+#endif // !CONFIG_LV_MAP
+}
+#endif // !CONFIG_PVQ || CONFIG_VAR_TX
+
+// Get transform block visible dimensions cropped to the MI units.
+static void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
+ BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
+ BLOCK_SIZE tx_bsize, int *width, int *height,
+ int *visible_width, int *visible_height) {
+ assert(tx_bsize <= plane_bsize);
+ int txb_height = block_size_high[tx_bsize];
+ int txb_width = block_size_wide[tx_bsize];
+ const int block_height = block_size_high[plane_bsize];
+ const int block_width = block_size_wide[plane_bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ // TODO(aconverse@google.com): Investigate using crop_width/height here rather
+ // than the MI size
+ const int block_rows =
+ (xd->mb_to_bottom_edge >= 0)
+ ? block_height
+ : (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height;
+ const int block_cols =
+ (xd->mb_to_right_edge >= 0)
+ ? block_width
+ : (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width;
+ const int tx_unit_size = tx_size_wide_log2[0];
+ if (width) *width = txb_width;
+ if (height) *height = txb_height;
+ *visible_width = clamp(block_cols - (blk_col << tx_unit_size), 0, txb_width);
+ *visible_height =
+ clamp(block_rows - (blk_row << tx_unit_size), 0, txb_height);
+}
+
+// Compute the pixel-domain sum of squared error over all visible 4x4s in the
+// transform block.
+static unsigned pixel_sse(const AV1_COMP *const cpi, const MACROBLOCKD *xd,
+ int plane, const uint8_t *src, const int src_stride,
+ const uint8_t *dst, const int dst_stride, int blk_row,
+ int blk_col, const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize) {
+ int txb_rows, txb_cols, visible_rows, visible_cols;
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
+ &txb_cols, &txb_rows, &visible_cols, &visible_rows);
+ assert(visible_rows > 0);
+ assert(visible_cols > 0);
+ if (txb_rows == visible_rows && txb_cols == visible_cols) {
+ unsigned sse;
+ cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+ return sse;
+ }
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint64_t sse = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
+ visible_cols, visible_rows);
+ return (unsigned int)ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ unsigned sse = aom_sse_odd_size(src, src_stride, dst, dst_stride,
+ visible_cols, visible_rows);
+ return sse;
+}
+
+// Compute the sum of squares over all visible 4x4s in the transform block.
+static int64_t sum_squares_visible(const MACROBLOCKD *xd, int plane,
+ const int16_t *diff, const int diff_stride,
+ int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize) {
+ int visible_rows, visible_cols;
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+ NULL, &visible_cols, &visible_rows);
+ return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
+}
+
+void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
+ TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,
+ OUTPUT_STATUS output_status) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+#if CONFIG_DAALA_DIST
+ int qm = OD_HVS_QM;
+ int use_activity_masking = 0;
+#if CONFIG_PVQ
+ use_activity_masking = x->daala_enc.use_activity_masking;
+#endif // CONFIG_PVQ
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+#else // CONFIG_DAALA_DIST
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+#endif // CONFIG_DAALA_DIST
+
+ if (cpi->sf.use_transform_domain_distortion && !CONFIG_DAALA_DIST) {
+ // Transform domain distortion computation is more efficient as it does
+ // not involve an inverse transform, but it is less accurate.
+ const int buffer_length = tx_size_2d[tx_size];
+ int64_t this_sse;
+ int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_PVQ
+ tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
+
+#if CONFIG_HIGHBITDEPTH
+ const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+ *out_dist = av1_highbd_block_error2_c(coeff, dqcoeff, ref_coeff,
+ buffer_length, &this_sse, bd) >>
+ shift;
+#else
+ *out_dist = av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length,
+ &this_sse) >>
+ shift;
+#endif // CONFIG_HIGHBITDEPTH
+#elif CONFIG_HIGHBITDEPTH
+ const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+ *out_dist =
+ av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, bd) >>
+ shift;
+#else
+ *out_dist =
+ av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift;
+#endif // CONFIG_PVQ
+ *out_sse = this_sse >> shift;
+ } else {
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+#if !CONFIG_PVQ || CONFIG_DAALA_DIST
+ const int bsw = block_size_wide[tx_bsize];
+ const int bsh = block_size_high[tx_bsize];
+#endif
+ const int src_stride = x->plane[plane].src.stride;
+ const int dst_stride = xd->plane[plane].dst.stride;
+    // Scale the transform block index to pixel units.
+ const int src_idx = (blk_row * src_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const int dst_idx = (blk_row * dst_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+ const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
+ const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const uint16_t eob = p->eobs[block];
+
+ assert(cpi != NULL);
+ assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
+
+#if CONFIG_DAALA_DIST
+ if (plane == 0 && bsw >= 8 && bsh >= 8) {
+ if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
+ const int pred_stride = block_size_wide[plane_bsize];
+ const int pred_idx = (blk_row * pred_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const int16_t *pred = &pd->pred[pred_idx];
+ int i, j;
+ DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]);
+
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ pred8[j * bsw + i] = pred[j * pred_stride + i];
+ *out_sse = av1_daala_dist(src, src_stride, pred8, bsw, bsw, bsh, qm,
+ use_activity_masking, x->qindex);
+ } else {
+ *out_sse = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh,
+ qm, use_activity_masking, x->qindex);
+ }
+ } else
+#endif // CONFIG_DAALA_DIST
+ {
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int diff_idx = (blk_row * diff_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const int16_t *diff = &p->src_diff[diff_idx];
+ *out_sse = sum_squares_visible(xd, plane, diff, diff_stride, blk_row,
+ blk_col, plane_bsize, tx_bsize);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ *out_sse = ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2);
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ *out_sse *= 16;
+
+ if (eob) {
+ if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
+#if CONFIG_DAALA_DIST
+ if (plane == 0 && bsw >= 8 && bsh >= 8)
+ *out_dist = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh,
+ qm, use_activity_masking, x->qindex);
+ else
+#endif // CONFIG_DAALA_DIST
+ *out_dist =
+ pixel_sse(cpi, xd, plane, src, src_stride, dst, dst_stride,
+ blk_row, blk_col, plane_bsize, tx_bsize);
+ } else {
+#if CONFIG_HIGHBITDEPTH
+ uint8_t *recon;
+ DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ recon = CONVERT_TO_BYTEPTR(recon16);
+ else
+ recon = (uint8_t *)recon16;
+#else
+ DECLARE_ALIGNED(16, uint8_t, recon[MAX_TX_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if !CONFIG_PVQ
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0,
+ NULL, 0, bsw, bsh, xd->bd);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, NULL,
+ 0, bsw, bsh);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#else
+ (void)dst;
+#endif // !CONFIG_PVQ
+
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+
+ av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, recon,
+ MAX_TX_SIZE, eob);
+
+#if CONFIG_DAALA_DIST
+ if (plane == 0 && bsw >= 8 && bsh >= 8) {
+ *out_dist = av1_daala_dist(src, src_stride, recon, MAX_TX_SIZE, bsw,
+ bsh, qm, use_activity_masking, x->qindex);
+ } else {
+ if (plane == 0) {
+          // Save the decoded pixels of an inter block in pd->pred so that
+          // block_8x8_rd_txfm_daala_dist() does not need to reproduce them
+          // by calling av1_inverse_transform_block() again.
+ const int pred_stride = block_size_wide[plane_bsize];
+ const int pred_idx = (blk_row * pred_stride + blk_col)
+ << tx_size_wide_log2[0];
+ int16_t *pred = &pd->pred[pred_idx];
+ int i, j;
+
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
+ }
+#endif // CONFIG_DAALA_DIST
+ *out_dist =
+ pixel_sse(cpi, xd, plane, src, src_stride, recon, MAX_TX_SIZE,
+ blk_row, blk_col, plane_bsize, tx_bsize);
+#if CONFIG_DAALA_DIST
+ }
+#endif // CONFIG_DAALA_DIST
+ }
+ *out_dist *= 16;
+ } else {
+ *out_dist = *out_sse;
+ }
+ }
+}
+
+static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct rdcost_block_args *args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const AV1_COMP *cpi = args->cpi;
+ ENTROPY_CONTEXT *a = args->t_above + blk_col;
+ ENTROPY_CONTEXT *l = args->t_left + blk_row;
+#if !CONFIG_TXK_SEL
+ const AV1_COMMON *cm = &cpi->common;
+#endif
+ int64_t rd1, rd2, rd;
+ RD_STATS this_rd_stats;
+
+ assert(tx_size == get_tx_size(plane, xd));
+
+ av1_init_rd_stats(&this_rd_stats);
+
+ if (args->exit_early) return;
+
+ if (!is_inter_block(mbmi)) {
+ av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size);
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+ }
+
+#if !CONFIG_TXK_SEL
+ // full forward transform and quantization
+ const int coeff_ctx = combine_entropy_contexts(*a, *l);
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ coeff_ctx, AV1_XFORM_QUANT_FP);
+ if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id])
+ av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
+
+ if (!is_inter_block(mbmi)) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
+ p->eobs[block]);
+ av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
+ tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
+ OUTPUT_HAS_DECODED_PIXELS);
+ } else {
+ av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
+ tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
+ OUTPUT_HAS_PREDICTED_PIXELS);
+ }
+#if CONFIG_CFL
+ if (plane == AOM_PLANE_Y && x->cfl_store_y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size);
+ }
+#endif
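+  // RDCOST forms the Lagrangian cost, roughly lambda * rate + distortion
+  // with lambda derived from x->rdmult and x->rddiv; with a rate of 0 this
+  // is a distortion-only lower bound used for early termination.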
+ rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist);
+ if (args->this_rd + rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
+ }
+#if !CONFIG_PVQ
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ this_rd_stats.rate =
+ av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l,
+ args->use_fast_coef_costing);
+#else // !CONFIG_PVQ
+ this_rd_stats.rate = x->rate;
+#endif // !CONFIG_PVQ
+#else // !CONFIG_TXK_SEL
+ av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, a, l, args->use_fast_coef_costing,
+ &this_rd_stats);
+#endif // !CONFIG_TXK_SEL
+
+#if !CONFIG_PVQ
+#if CONFIG_RD_DEBUG
+ av1_update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col,
+ this_rd_stats.rate);
+#endif // CONFIG_RD_DEBUG
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+#endif // !CONFIG_PVQ
+
+ rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse);
+
+ // TODO(jingning): temporarily enabled only for luma component
+ rd = AOMMIN(rd1, rd2);
+
+#if CONFIG_DAALA_DIST
+ if (plane == 0 &&
+ (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) {
+ this_rd_stats.dist = 0;
+ this_rd_stats.sse = 0;
+ rd = 0;
+ x->rate_4x4[block] = this_rd_stats.rate;
+ }
+#endif // CONFIG_DAALA_DIST
+
+#if !CONFIG_PVQ
+ this_rd_stats.skip &= !x->plane[plane].eobs[block];
+#else
+ this_rd_stats.skip &= x->pvq_skip[plane];
+#endif // !CONFIG_PVQ
+ av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
+
+ args->this_rd += rd;
+
+ if (args->this_rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
+ }
+}
+
+#if CONFIG_DAALA_DIST
+static void block_8x8_rd_txfm_daala_dist(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct rdcost_block_args *args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int64_t rd, rd1, rd2;
+ RD_STATS this_rd_stats;
+ int qm = OD_HVS_QM;
+ int use_activity_masking = 0;
+
+ (void)tx_size;
+#if CONFIG_PVQ
+ use_activity_masking = x->daala_enc.use_activity_masking;
+#endif // CONFIG_PVQ
+ av1_init_rd_stats(&this_rd_stats);
+
+ if (args->exit_early) return;
+
+ {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const int diff_stride = block_size_wide[plane_bsize];
+
+ const uint8_t *src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+ const uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+
+ unsigned int tmp1, tmp2;
+ int qindex = x->qindex;
+ const int pred_stride = block_size_wide[plane_bsize];
+ const int pred_idx = (blk_row * pred_stride + blk_col)
+ << tx_size_wide_log2[0];
+ int16_t *pred = &pd->pred[pred_idx];
+ int i, j;
+ const int tx_blk_size = 8;
+
+ DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]);
+
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ pred8[j * tx_blk_size + i] = pred[j * diff_stride + i];
+
+ tmp1 = av1_daala_dist(src, src_stride, pred8, tx_blk_size, 8, 8, qm,
+ use_activity_masking, qindex);
+ tmp2 = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8, qm,
+ use_activity_masking, qindex);
+
+ if (!is_inter_block(mbmi)) {
+ this_rd_stats.sse = (int64_t)tmp1 * 16;
+ this_rd_stats.dist = (int64_t)tmp2 * 16;
+ } else {
+ // For inter mode, the decoded pixels are provided in pd->pred,
+ // while the predicted pixels are in dst.
+ this_rd_stats.sse = (int64_t)tmp2 * 16;
+ this_rd_stats.dist = (int64_t)tmp1 * 16;
+ }
+ }
+
+ rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist);
+ if (args->this_rd + rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
+ }
+
+ {
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ // The rate of the current 8x8 block is the sum of four 4x4 blocks in it.
+ this_rd_stats.rate = x->rate_4x4[block - max_blocks_wide - 1] +
+ x->rate_4x4[block - max_blocks_wide] +
+ x->rate_4x4[block - 1] + x->rate_4x4[block];
+ }
+ rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse);
+ rd = AOMMIN(rd1, rd2);
+
+ args->rd_stats.dist += this_rd_stats.dist;
+ args->rd_stats.sse += this_rd_stats.sse;
+
+ args->this_rd += rd;
+
+ if (args->this_rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
+ }
+}
+#endif // CONFIG_DAALA_DIST
+
+static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+ RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int use_fast_coef_casting) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct rdcost_block_args args;
+ av1_zero(args);
+ args.x = x;
+ args.cpi = cpi;
+ args.best_rd = ref_best_rd;
+ args.use_fast_coef_costing = use_fast_coef_casting;
+ av1_init_rd_stats(&args.rd_stats);
+
+ if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
+
+ av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
+
+#if CONFIG_DAALA_DIST
+ if (plane == 0 &&
+ (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
+ av1_foreach_8x8_transformed_block_in_plane(
+ xd, bsize, plane, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args);
+ else
+#endif // CONFIG_DAALA_DIST
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
+ &args);
+
+ if (args.exit_early) {
+ av1_invalid_rd_stats(rd_stats);
+ } else {
+ *rd_stats = args.rd_stats;
+ }
+}
+
+#if CONFIG_SUPERTX
+void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
+ int64_t *distortion, int *skippable,
+ int64_t *sse, int64_t ref_best_rd, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int use_fast_coef_casting) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct rdcost_block_args args;
+ av1_zero(args);
+ args.cpi = cpi;
+ args.x = x;
+ args.best_rd = ref_best_rd;
+ args.use_fast_coef_costing = use_fast_coef_casting;
+
+#if CONFIG_EXT_TX
+ assert(tx_size < TX_SIZES);
+#endif // CONFIG_EXT_TX
+
+ if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
+
+ av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
+
+ block_rd_txfm(plane, 0, 0, 0, get_plane_block_size(bsize, pd), tx_size,
+ &args);
+
+ if (args.exit_early) {
+ *rate = INT_MAX;
+ *distortion = INT64_MAX;
+ *sse = INT64_MAX;
+ *skippable = 0;
+ } else {
+ *distortion = args.rd_stats.dist;
+ *rate = args.rd_stats.rate;
+ *sse = args.rd_stats.sse;
+ *skippable = !x->plane[plane].eobs[0];
+ }
+}
+#endif // CONFIG_SUPERTX
+
+static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+ const int tx_select =
+ cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8;
+
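+  // The tx_size is signalled in the bitstream only when TX_MODE_SELECT is
+  // active and the block is at least 8x8; otherwise it is implied by the
+  // tx_mode and costs no bits.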
+ if (tx_select) {
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+ : intra_tx_size_cat_lookup[bsize];
+ const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
+ return r_tx_size;
+ } else {
+ return 0;
+ }
+}
+
+// TODO(angiebird): use this function wherever possible.
+int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int plane, TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ if (plane > 0) return 0;
+
+ const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int is_inter = is_inter_block(mbmi);
+#if CONFIG_EXT_TX
+ const AV1_COMMON *cm = &cpi->common;
+ if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
+ !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ const int ext_tx_set =
+ get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
+ if (is_inter) {
+ if (ext_tx_set > 0)
+ return cpi
+ ->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]][tx_type];
+ } else {
+ if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+ return cpi->intra_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]]
+ [mbmi->mode][tx_type];
+ }
+ }
+#else
+ (void)bsize;
+ if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+ !FIXED_TX_TYPE) {
+ if (is_inter) {
+ return cpi->inter_tx_type_costs[tx_size][tx_type];
+ } else {
+ return cpi->intra_tx_type_costs[tx_size]
+ [intra_mode_to_tx_type_context[mbmi->mode]]
+ [tx_type];
+ }
+ }
+#endif // CONFIG_EXT_TX
+ return 0;
+}
+
+static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
+ TX_TYPE tx_type, int tx_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int64_t rd = INT64_MAX;
+ aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+ int s0, s1;
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_select =
+ cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8;
+
+ const int r_tx_size = tx_size_cost(cpi, x, bs, tx_size);
+
+ assert(skip_prob > 0);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ s0 = av1_cost_bit(skip_prob, 0);
+ s1 = av1_cost_bit(skip_prob, 1);
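+  // s0/s1 are the bit costs of coding the skip flag as 0/1. A skipped block
+  // pays only s1 and its distortion equals the SSE, since no coefficients
+  // are transmitted.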
+
+ mbmi->tx_type = tx_type;
+ mbmi->tx_size = tx_size;
+ txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, tx_size,
+ cpi->sf.use_fast_coef_costing);
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+#if !CONFIG_TXK_SEL
+ int plane = 0;
+ rd_stats->rate += av1_tx_type_cost(cpi, xd, bs, plane, tx_size, tx_type);
+#endif
+
+ if (rd_stats->skip) {
+ if (is_inter) {
+ rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse);
+ } else {
+ rd = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select,
+ rd_stats->sse);
+ }
+ } else {
+ rd = RDCOST(x->rdmult, x->rddiv,
+ rd_stats->rate + s0 + r_tx_size * tx_select, rd_stats->dist);
+ }
+
+ if (tx_select) rd_stats->rate += r_tx_size;
+
+ if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+ !(rd_stats->skip))
+ rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse));
+
+ return rd;
+}
+
+static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+ TX_TYPE tx_type, TX_SIZE tx_size) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+ const int is_inter = is_inter_block(mbmi);
+ int prune = 0;
+ if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+    // Passing -1 for tx_type indicates that all 1D transforms should be
+    // considered for pruning.
+ prune = prune_tx_types(cpi, bs, x, xd, -1);
+
+#if CONFIG_REF_MV
+ if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1;
+#endif // CONFIG_REF_MV
+ if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size))
+ return 1;
+ if (!is_inter && x->use_default_intra_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, tx_size))
+ return 1;
+ if (is_inter && x->use_default_inter_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, tx_size))
+ return 1;
+ if (max_tx_size >= TX_32X32 && tx_size == TX_4X4) return 1;
+#if CONFIG_EXT_TX
+ const AV1_COMMON *const cm = &cpi->common;
+ int ext_tx_set =
+ get_ext_tx_set(tx_size, bs, is_inter, cm->reduced_tx_set_used);
+ if (is_inter) {
+ if (!ext_tx_used_inter[ext_tx_set][tx_type]) return 1;
+ if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+ if (!do_tx_type_search(tx_type, prune)) return 1;
+ }
+ } else {
+ if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+ if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) return 1;
+ }
+ if (!ext_tx_used_intra[ext_tx_set][tx_type]) return 1;
+ }
+#else // CONFIG_EXT_TX
+ if (tx_size >= TX_32X32 && tx_type != DCT_DCT) return 1;
+ if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+ !do_tx_type_search(tx_type, prune))
+ return 1;
+#endif // CONFIG_EXT_TX
+ return 0;
+}
+
+#if CONFIG_EXT_INTER
+static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
+ MACROBLOCK *x, int *r, int64_t *d, int *s,
+ int64_t *sse, int64_t ref_best_rd) {
+ RD_STATS rd_stats;
+ int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs, DCT_DCT,
+ max_txsize_lookup[bs]);
+ *r = rd_stats.rate;
+ *d = rd_stats.dist;
+ *s = rd_stats.skip;
+ *sse = rd_stats.sse;
+ return rd;
+}
+#endif // CONFIG_EXT_INTER
+
+static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ TX_TYPE tx_type, best_tx_type = DCT_DCT;
+ int64_t this_rd, best_rd = INT64_MAX;
+ aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+ int s0 = av1_cost_bit(skip_prob, 0);
+ int s1 = av1_cost_bit(skip_prob, 1);
+ const int is_inter = is_inter_block(mbmi);
+ int prune = 0;
+ const int plane = 0;
+#if CONFIG_EXT_TX
+ int ext_tx_set;
+#endif // CONFIG_EXT_TX
+ av1_invalid_rd_stats(rd_stats);
+
+ mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif // CONFIG_VAR_TX
+#if CONFIG_EXT_TX
+ ext_tx_set =
+ get_ext_tx_set(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used);
+#endif // CONFIG_EXT_TX
+
+ if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+#if CONFIG_EXT_TX
+ prune = prune_tx_types(cpi, bs, x, xd, ext_tx_set);
+#else
+ prune = prune_tx_types(cpi, bs, x, xd, 0);
+#endif // CONFIG_EXT_TX
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used) >
+ 1 &&
+ !xd->lossless[mbmi->segment_id]) {
+#if CONFIG_PVQ
+ od_rollback_buffer pre_buf, post_buf;
+
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+
+ for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+ RD_STATS this_rd_stats;
+ if (is_inter) {
+ if (x->use_default_inter_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+ continue;
+ if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
+ if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+ if (!do_tx_type_search(tx_type, prune)) continue;
+ }
+ } else {
+ if (x->use_default_intra_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+ continue;
+ if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+ if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
+ }
+ if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
+ }
+
+ mbmi->tx_type = tx_type;
+
+ txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
+ mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+ if (this_rd_stats.rate == INT_MAX) continue;
+ av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type);
+
+ if (this_rd_stats.skip)
+ this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse);
+ else
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0,
+ this_rd_stats.dist);
+ if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] &&
+ !this_rd_stats.skip)
+ this_rd =
+ AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse));
+
+ if (this_rd < best_rd) {
+ best_rd = this_rd;
+ best_tx_type = mbmi->tx_type;
+ *rd_stats = this_rd_stats;
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+ }
+ }
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+ } else {
+ mbmi->tx_type = DCT_DCT;
+ txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
+ cpi->sf.use_fast_coef_costing);
+ }
+#else // CONFIG_EXT_TX
+ if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id]) {
+ for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ RD_STATS this_rd_stats;
+ if (!is_inter && x->use_default_intra_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+ continue;
+ if (is_inter && x->use_default_inter_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+ continue;
+ mbmi->tx_type = tx_type;
+ txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
+ mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+ if (this_rd_stats.rate == INT_MAX) continue;
+
+ av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type);
+ if (is_inter) {
+ if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+ !do_tx_type_search(tx_type, prune))
+ continue;
+ }
+ if (this_rd_stats.skip)
+ this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse);
+ else
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0,
+ this_rd_stats.dist);
+ if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip)
+ this_rd =
+ AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse));
+
+ if (this_rd < best_rd) {
+ best_rd = this_rd;
+ best_tx_type = mbmi->tx_type;
+ *rd_stats = this_rd_stats;
+ }
+ }
+ } else {
+ mbmi->tx_type = DCT_DCT;
+ txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
+ cpi->sf.use_fast_coef_costing);
+ }
+#endif // CONFIG_EXT_TX
+ mbmi->tx_type = best_tx_type;
+}
+
+static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+ mbmi->tx_size = TX_4X4;
+ mbmi->tx_type = DCT_DCT;
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(TX_4X4);
+#endif // CONFIG_VAR_TX
+
+ txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
+ cpi->sf.use_fast_coef_costing);
+}
+
+#if CONFIG_TXK_SEL || CONFIG_VAR_TX
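+// Number of 4x4 units covered by 'bsize', e.g. BLOCK_16X16 has
+// num_pels_log2_lookup = 8 and a 4x4 unit is 1 << 2 pixels wide, so
+// 1 << (8 - 4) = 16 blocks.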
+static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
+ int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * tx_size_wide_log2[0]);
+ return num_blk;
+}
+#endif // CONFIG_TXK_SEL || CONFIG_VAR_TX
+
+static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
+ MACROBLOCK *x, RD_STATS *rd_stats,
+ int64_t ref_best_rd, BLOCK_SIZE bs) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int64_t rd = INT64_MAX;
+ int n;
+ int start_tx, end_tx;
+ int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+ TX_SIZE best_tx_size = max_tx_size;
+ TX_TYPE best_tx_type = DCT_DCT;
+#if CONFIG_TXK_SEL
+ TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+ const int num_blk = bsize_to_num_blk(bs);
+#endif // CONFIG_TXK_SEL
+ const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+ const int is_inter = is_inter_block(mbmi);
+#if CONFIG_PVQ
+ od_rollback_buffer buf;
+ od_encode_checkpoint(&x->daala_enc, &buf);
+#endif // CONFIG_PVQ
+
+ av1_invalid_rd_stats(rd_stats);
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ int evaluate_rect_tx = 0;
+ if (tx_select) {
+ evaluate_rect_tx = is_rect_tx_allowed(xd, mbmi);
+ } else {
+ const TX_SIZE chosen_tx_size =
+ tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
+ evaluate_rect_tx = is_rect_tx(chosen_tx_size);
+ assert(IMPLIES(evaluate_rect_tx, is_rect_tx_allowed(xd, mbmi)));
+ }
+ if (evaluate_rect_tx) {
+ TX_TYPE tx_start = DCT_DCT;
+ TX_TYPE tx_end = TX_TYPES;
+#if CONFIG_TXK_SEL
+    // The tx_type is a dummy value when lv_map is on; the actual tx_type
+    // search is performed in av1_search_txk_type().
+ tx_end = DCT_DCT + 1;
+#endif
+ TX_TYPE tx_type;
+ for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
+#if CONFIG_REF_MV
+ if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
+#endif // CONFIG_REF_MV
+ const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs];
+ RD_STATS this_rd_stats;
+ int ext_tx_set =
+ get_ext_tx_set(rect_tx_size, bs, is_inter, cm->reduced_tx_set_used);
+ if ((is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) ||
+ (!is_inter && ext_tx_used_intra[ext_tx_set][tx_type])) {
+ rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type,
+ rect_tx_size);
+ if (rd < best_rd) {
+#if CONFIG_TXK_SEL
+ memcpy(best_txk_type, mbmi->txk_type,
+ sizeof(best_txk_type[0]) * num_blk);
+#endif
+ best_tx_type = tx_type;
+ best_tx_size = rect_tx_size;
+ best_rd = rd;
+ *rd_stats = this_rd_stats;
+ }
+ }
+#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
+ const int is_inter = is_inter_block(mbmi);
+ if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
+#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
+ }
+ }
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ if (tx_select) {
+ start_tx = max_tx_size;
+ end_tx = (max_tx_size >= TX_32X32) ? TX_8X8 : TX_4X4;
+ } else {
+ const TX_SIZE chosen_tx_size =
+ tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
+ start_tx = chosen_tx_size;
+ end_tx = chosen_tx_size;
+ }
+
+ last_rd = INT64_MAX;
+ for (n = start_tx; n >= end_tx; --n) {
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx(n)) break;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ TX_TYPE tx_start = DCT_DCT;
+ TX_TYPE tx_end = TX_TYPES;
+#if CONFIG_TXK_SEL
+    // The tx_type is a dummy value when lv_map is on; the actual tx_type
+    // search is performed in av1_search_txk_type().
+ tx_end = DCT_DCT + 1;
+#endif
+ TX_TYPE tx_type;
+ for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
+ RD_STATS this_rd_stats;
+ if (skip_txfm_search(cpi, x, bs, tx_type, n)) continue;
+ rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, n);
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &buf);
+#endif // CONFIG_PVQ
+      // Early termination in the transform size search: stop if the
+      // candidate is infeasible, a sub-initial size already codes the block
+      // as all-skip with a non-DCT type, or the RD cost is worse than at the
+      // previous (larger) size.
+ if (cpi->sf.tx_size_search_breakout &&
+ (rd == INT64_MAX ||
+ (this_rd_stats.skip == 1 && tx_type != DCT_DCT && n < start_tx) ||
+ (n < (int)max_tx_size && rd > last_rd)))
+ break;
+
+ last_rd = rd;
+ if (rd < best_rd) {
+#if CONFIG_TXK_SEL
+ memcpy(best_txk_type, mbmi->txk_type,
+ sizeof(best_txk_type[0]) * num_blk);
+#endif
+ best_tx_type = tx_type;
+ best_tx_size = n;
+ best_rd = rd;
+ *rd_stats = this_rd_stats;
+ }
+#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
+ const int is_inter = is_inter_block(mbmi);
+ if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
+#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
+ }
+ }
+ mbmi->tx_size = best_tx_size;
+ mbmi->tx_type = best_tx_type;
+#if CONFIG_TXK_SEL
+ memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * num_blk);
+#endif
+
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif // CONFIG_VAR_TX
+
+#if !CONFIG_EXT_TX
+ if (mbmi->tx_size >= TX_32X32) assert(mbmi->tx_type == DCT_DCT);
+#endif // !CONFIG_EXT_TX
+#if CONFIG_PVQ
+ if (best_rd != INT64_MAX) {
+ txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, best_tx_type, best_tx_size);
+ }
+#endif // CONFIG_PVQ
+}
+
+static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bs,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ av1_init_rd_stats(rd_stats);
+
+ assert(bs == xd->mi[0]->mbmi.sb_type);
+
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+ } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
+ choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+ } else {
+ choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
+ }
+}
+
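+// Heuristic used when FLAG_SKIP_INTRA_DIRMISMATCH is set: an oblique
+// directional mode is evaluated only if the best mode so far is one of its
+// angular neighbors, e.g. D117 is skipped unless V or D135 is currently
+// best, on the assumption that prediction quality varies smoothly with
+// angle.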
+static int conditional_skipintra(PREDICTION_MODE mode,
+ PREDICTION_MODE best_intra_mode) {
+ if (mode == D117_PRED && best_intra_mode != V_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ if (mode == D63_PRED && best_intra_mode != V_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D207_PRED && best_intra_mode != H_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D153_PRED && best_intra_mode != H_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ return 0;
+}
+
+// Model-based RD estimation for luma intra blocks: run the intra prediction
+// for every transform block, then estimate rate and distortion from the
+// residual with model_rd_for_sb() instead of a full transform/quantization
+// pass.
+static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mode_cost) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ RD_STATS this_rd_stats;
+ int row, col;
+ int64_t temp_sse, this_rd;
+ const TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cpi->common.tx_mode, 0);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ mbmi->tx_size = tx_size;
+ // Prediction.
+ const int step = stepr * stepc;
+ int block = 0;
+ for (row = 0; row < max_blocks_high; row += stepr) {
+ for (col = 0; col < max_blocks_wide; col += stepc) {
+ av1_predict_intra_block_facade(xd, 0, block, col, row, tx_size);
+ block += step;
+ }
+ }
+ // RD estimation.
+ model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate,
+ &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse);
+#if CONFIG_EXT_INTRA
+ if (av1_is_directional_mode(mbmi->mode, bsize)) {
+ mode_cost += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
+ }
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (mbmi->mode == DC_PRED) {
+ const aom_prob prob = cpi->common.fc->filter_intra_probs[0];
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
+ const int mode = mbmi->filter_intra_mode_info.filter_intra_mode[0];
+ mode_cost += (av1_cost_bit(prob, 1) +
+ write_uniform_cost(FILTER_INTRA_MODES, mode));
+ } else {
+ mode_cost += av1_cost_bit(prob, 0);
+ }
+ }
+#endif // CONFIG_FILTER_INTRA
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + mode_cost,
+ this_rd_stats.dist);
+ return this_rd;
+}
+
+#if CONFIG_PALETTE
+// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
+// new_height'. Extra rows and columns are filled in by copying the last
+// valid row/column.
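+// For example, extending a 3x2 map to 4x4 copies each row's last entry into
+// the new fourth column, then copies the extended last row into the two new
+// rows.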
+static void extend_palette_color_map(uint8_t *const color_map, int orig_width,
+ int orig_height, int new_width,
+ int new_height) {
+ int j;
+ assert(new_width >= orig_width);
+ assert(new_height >= orig_height);
+ if (new_width == orig_width && new_height == orig_height) return;
+
+ for (j = orig_height - 1; j >= 0; --j) {
+ memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
+ // Copy last column to extra columns.
+ memset(color_map + j * new_width + orig_width,
+ color_map[j * new_width + orig_width - 1], new_width - orig_width);
+ }
+ // Copy last row to extra rows.
+ for (j = orig_height; j < new_height; ++j) {
+ memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
+ new_width);
+ }
+}
+
+static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int palette_ctx,
+ int dc_mode_cost, MB_MODE_INFO *best_mbmi,
+ uint8_t *best_palette_color_map,
+ int64_t *best_rd, int64_t *best_model_rd,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable) {
+ int rate_overhead = 0;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mic = xd->mi[0];
+ MB_MODE_INFO *const mbmi = &mic->mbmi;
+ int this_rate, colors, n;
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *const src = x->plane[0].src.buf;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+
+ assert(cpi->common.allow_screen_content_tools);
+
+#if CONFIG_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth)
+ colors = av1_count_colors_highbd(src, src_stride, rows, cols,
+ cpi->common.bit_depth);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ colors = av1_count_colors(src, src_stride, rows, cols);
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif // CONFIG_FILTER_INTRA
+
+ if (colors > 1 && colors <= 64) {
+ int r, c, i, j, k, palette_mode_cost;
+ const int max_itr = 50;
+ uint8_t color_order[PALETTE_MAX_SIZE];
+ float *const data = x->palette_buffer->kmeans_data_buf;
+ float centroids[PALETTE_MAX_SIZE];
+ float lb, ub, val;
+ RD_STATS tokenonly_rd_stats;
+ int64_t this_rd, this_model_rd;
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+#if CONFIG_HIGHBITDEPTH
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ if (cpi->common.use_highbitdepth)
+ lb = ub = src16[0];
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ lb = ub = src[0];
+
+#if CONFIG_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth) {
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ val = src16[r * src_stride + c];
+ data[r * cols + c] = val;
+ if (val < lb)
+ lb = val;
+ else if (val > ub)
+ ub = val;
+ }
+ }
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ val = src[r * src_stride + c];
+ data[r * cols + c] = val;
+ if (val < lb)
+ lb = val;
+ else if (val > ub)
+ ub = val;
+ }
+ }
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ mbmi->mode = DC_PRED;
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif // CONFIG_FILTER_INTRA
+
+ if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0;
+
+ for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
+ --n) {
+ for (i = 0; i < n; ++i)
+ centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
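+      // The initial centroids are the midpoints of n equal sub-ranges of
+      // [lb, ub]; e.g. for n == 2 they start at lb + (ub - lb) / 4 and
+      // lb + 3 * (ub - lb) / 4 before k-means refinement.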
+ av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr);
+ k = av1_remove_duplicates(centroids, n);
+
+#if CONFIG_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth)
+ for (i = 0; i < k; ++i)
+ pmi->palette_colors[i] =
+ clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ for (i = 0; i < k; ++i)
+ pmi->palette_colors[i] = clip_pixel((int)centroids[i]);
+ pmi->palette_size[0] = k;
+
+ av1_calc_indices(data, centroids, color_map, rows * cols, k, 1);
+ extend_palette_color_map(color_map, cols, rows, block_width,
+ block_height);
+ palette_mode_cost =
+ dc_mode_cost +
+ cpi->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] +
+ write_uniform_cost(k, color_map[0]) +
+ av1_cost_bit(
+ av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx],
+ 1);
+ palette_mode_cost += av1_palette_color_cost_y(pmi, cpi->common.bit_depth);
+ for (i = 0; i < rows; ++i) {
+ for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+ int color_idx;
+ const int color_ctx = av1_get_palette_color_index_context(
+ color_map, block_width, i, j, k, color_order, &color_idx);
+ assert(color_idx >= 0 && color_idx < k);
+ palette_mode_cost += cpi->palette_y_color_cost[k - PALETTE_MIN_SIZE]
+ [color_ctx][color_idx];
+ }
+ }
+ this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost);
+ if (*best_model_rd != INT64_MAX &&
+ this_model_rd > *best_model_rd + (*best_model_rd >> 1))
+ continue;
+ if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+ super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
+ if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) {
+ tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
+ }
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ memcpy(best_palette_color_map, color_map,
+ block_width * block_height * sizeof(color_map[0]));
+ *best_mbmi = *mbmi;
+ rate_overhead = this_rate - tokenonly_rd_stats.rate;
+ if (rate) *rate = this_rate;
+ if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+ if (distortion) *distortion = tokenonly_rd_stats.dist;
+ if (skippable) *skippable = tokenonly_rd_stats.skip;
+ }
+ }
+ }
+
+ if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
+ memcpy(color_map, best_palette_color_map,
+ rows * cols * sizeof(best_palette_color_map[0]));
+ }
+ *mbmi = *best_mbmi;
+ return rate_overhead;
+}
+#endif // CONFIG_PALETTE
+
+static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *x, int row, int col,
+ PREDICTION_MODE *best_mode, const int *bmode_costs, ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion,
+ BLOCK_SIZE bsize, TX_SIZE tx_size, int *y_skip, int64_t rd_thresh) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PREDICTION_MODE mode;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int64_t best_rd = rd_thresh;
+ struct macroblock_plane *p = &x->plane[0];
+ struct macroblockd_plane *pd = &xd->plane[0];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4];
+ uint8_t *dst_init = &pd->dst.buf[row * 4 * dst_stride + col * 4];
+#if CONFIG_CB4X4
+  // TODO(jingning): This is a temporary change. The whole function should be
+  // removed when cb4x4 is enabled.
+ ENTROPY_CONTEXT ta[4], tempa[4];
+ ENTROPY_CONTEXT tl[4], templ[4];
+#else
+ ENTROPY_CONTEXT ta[2], tempa[2];
+ ENTROPY_CONTEXT tl[2], templ[2];
+#endif // CONFIG_CB4X4
+
+ const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize];
+ const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize];
+ const int tx_width_unit = tx_size_wide_unit[tx_size];
+ const int tx_height_unit = tx_size_high_unit[tx_size];
+ const int pred_block_width = block_size_wide[bsize];
+ const int pred_block_height = block_size_high[bsize];
+ const int tx_width = tx_size_wide[tx_size];
+ const int tx_height = tx_size_high[tx_size];
+ const int pred_width_in_transform_blocks = pred_block_width / tx_width;
+ const int pred_height_in_transform_blocks = pred_block_height / tx_height;
+ int idx, idy;
+ int best_can_skip = 0;
+ uint8_t best_dst[8 * 8];
+#if CONFIG_HIGHBITDEPTH
+ uint16_t best_dst16[8 * 8];
+#endif // CONFIG_HIGHBITDEPTH
+ const int is_lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ const int sub_bsize = bsize;
+#else
+ const int sub_bsize = BLOCK_4X4;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+#if CONFIG_PVQ
+ od_rollback_buffer pre_buf, post_buf;
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+
+ assert(bsize < BLOCK_8X8);
+ assert(tx_width < 8 || tx_height < 8);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_lossless)
+ assert(tx_width == 4 && tx_height == 4);
+ else
+ assert(tx_width == pred_block_width && tx_height == pred_block_height);
+#else
+ assert(tx_width == 4 && tx_height == 4);
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ memcpy(ta, a, pred_width_in_transform_blocks * sizeof(a[0]));
+ memcpy(tl, l, pred_height_in_transform_blocks * sizeof(l[0]));
+
+ xd->mi[0]->mbmi.tx_size = tx_size;
+
+#if CONFIG_PALETTE
+ xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0;
+#endif // CONFIG_PALETTE
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+#endif
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+ int64_t this_rd;
+ int ratey = 0;
+ int64_t distortion = 0;
+ int rate = bmode_costs[mode];
+ int can_skip = 1;
+
+ if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] &
+ (1 << mode)))
+ continue;
+
+ // Only do the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(mode, *best_mode)) continue;
+ }
+
+ memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0]));
+ memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0]));
+
+ for (idy = 0; idy < pred_height_in_transform_blocks; ++idy) {
+ for (idx = 0; idx < pred_width_in_transform_blocks; ++idx) {
+ const int block_raster_idx = (row + idy) * 2 + (col + idx);
+ const int block =
+ av1_raster_order_to_block_index(tx_size, block_raster_idx);
+ const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+ uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+#if !CONFIG_PVQ
+ int16_t *const src_diff = av1_raster_block_offset_int16(
+ BLOCK_8X8, block_raster_idx, p->src_diff);
+#endif
+ int skip;
+ assert(block < 4);
+ assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
+ idx == 0 && idy == 0));
+ assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
+ block == 0 || block == 2));
+ xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
+ av1_predict_intra_block(
+ xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode, dst,
+ dst_stride, dst, dst_stride, col + idx, row + idy, 0);
+#if !CONFIG_PVQ
+ aom_highbd_subtract_block(tx_height, tx_width, src_diff, 8, src,
+ src_stride, dst, dst_stride, xd->bd);
+#endif
+ if (is_lossless) {
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
+ const int coeff_ctx =
+ combine_entropy_contexts(tempa[idx], templ[idy]);
+#if !CONFIG_PVQ
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+ ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order,
+ tempa + idx, templ + idy,
+ cpi->sf.use_fast_coef_costing);
+ skip = (p->eobs[block] == 0);
+ can_skip &= skip;
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+#if CONFIG_EXT_TX
+ if (tx_size == TX_8X4) {
+ tempa[idx + 1] = tempa[idx];
+ } else if (tx_size == TX_4X8) {
+ templ[idy + 1] = templ[idy];
+ }
+#endif // CONFIG_EXT_TX
+#else
+ (void)scan_order;
+
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ tx_size, coeff_ctx, AV1_XFORM_QUANT_B);
+
+ ratey += x->rate;
+ skip = x->pvq_skip[0];
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+ can_skip &= skip;
+#endif
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next_highbd;
+#if CONFIG_PVQ
+ if (!skip)
+#endif
+ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
+ DCT_DCT, tx_size, dst, dst_stride,
+ p->eobs[block]);
+ } else {
+ int64_t dist;
+ unsigned int tmp;
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
+ const int coeff_ctx =
+ combine_entropy_contexts(tempa[idx], templ[idy]);
+#if !CONFIG_PVQ
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+ av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
+ ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order,
+ tempa + idx, templ + idy,
+ cpi->sf.use_fast_coef_costing);
+ skip = (p->eobs[block] == 0);
+ can_skip &= skip;
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+#if CONFIG_EXT_TX
+ if (tx_size == TX_8X4) {
+ tempa[idx + 1] = tempa[idx];
+ } else if (tx_size == TX_4X8) {
+ templ[idy + 1] = templ[idy];
+ }
+#endif // CONFIG_EXT_TX
+#else
+ (void)scan_order;
+
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+ ratey += x->rate;
+ skip = x->pvq_skip[0];
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+ can_skip &= skip;
+#endif
+#if CONFIG_PVQ
+ if (!skip)
+#endif
+ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
+ tx_type, tx_size, dst, dst_stride,
+ p->eobs[block]);
+ cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
+ dist = (int64_t)tmp << 4;
+ distortion += dist;
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next_highbd;
+ }
+ }
+ }
+
+ rate += ratey;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd) {
+ *bestrate = rate;
+ *bestratey = ratey;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ best_can_skip = can_skip;
+ *best_mode = mode;
+ memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0]));
+ memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0]));
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif
+ for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) {
+ memcpy(best_dst16 + idy * 8,
+ CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ pred_width_in_transform_blocks * 4 * sizeof(uint16_t));
+ }
+ }
+ next_highbd : {}
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif
+ }
+
+ if (best_rd >= rd_thresh) return best_rd;
+
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &post_buf);
+#endif
+
+ if (y_skip) *y_skip &= best_can_skip;
+
+ for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) {
+ memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ best_dst16 + idy * 8,
+ pred_width_in_transform_blocks * 4 * sizeof(uint16_t));
+ }
+
+ return best_rd;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+ int64_t this_rd;
+ int ratey = 0;
+ int64_t distortion = 0;
+ int rate = bmode_costs[mode];
+ int can_skip = 1;
+
+ if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] &
+ (1 << mode))) {
+ continue;
+ }
+
+ // Only do the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(mode, *best_mode)) continue;
+ }
+
+ memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0]));
+ memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0]));
+
+ for (idy = 0; idy < pred_height_in_4x4_blocks; idy += tx_height_unit) {
+ for (idx = 0; idx < pred_width_in_4x4_blocks; idx += tx_width_unit) {
+ const int block_raster_idx = (row + idy) * 2 + (col + idx);
+ int block = av1_raster_order_to_block_index(tx_size, block_raster_idx);
+ const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+ uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+#if !CONFIG_PVQ
+ int16_t *const src_diff = av1_raster_block_offset_int16(
+ BLOCK_8X8, block_raster_idx, p->src_diff);
+#endif // !CONFIG_PVQ
+ int skip;
+ assert(block < 4);
+ assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
+ idx == 0 && idy == 0));
+ assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
+ block == 0 || block == 2));
+ xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
+ av1_predict_intra_block(xd, pd->width, pd->height,
+ txsize_to_bsize[tx_size], mode, dst, dst_stride,
+ dst, dst_stride,
+#if CONFIG_CB4X4
+ 2 * (col + idx), 2 * (row + idy),
+#else
+ col + idx, row + idy,
+#endif // CONFIG_CB4X4
+ 0);
+#if !CONFIG_PVQ
+ aom_subtract_block(tx_height, tx_width, src_diff, 8, src, src_stride,
+ dst, dst_stride);
+#endif // !CONFIG_PVQ
+
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
+ const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]);
+#if CONFIG_CB4X4
+ block = 4 * block;
+#endif // CONFIG_CB4X4
+#if !CONFIG_PVQ
+ const AV1_XFORM_QUANT xform_quant =
+ is_lossless ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+ av1_xform_quant(cm, x, 0, block,
+#if CONFIG_CB4X4
+ 2 * (row + idy), 2 * (col + idx),
+#else
+ row + idy, col + idx,
+#endif // CONFIG_CB4X4
+ BLOCK_8X8, tx_size, coeff_ctx, xform_quant);
+
+ if (!is_lossless) {
+ av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
+ }
+
+ ratey +=
+ av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, tempa + idx,
+ templ + idy, cpi->sf.use_fast_coef_costing);
+ skip = (p->eobs[block] == 0);
+ can_skip &= skip;
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+#if CONFIG_EXT_TX
+ if (tx_size == TX_8X4) {
+ tempa[idx + 1] = tempa[idx];
+ } else if (tx_size == TX_4X8) {
+ templ[idy + 1] = templ[idy];
+ }
+#endif // CONFIG_EXT_TX
+#else
+ (void)scan_order;
+
+ av1_xform_quant(cm, x, 0, block,
+#if CONFIG_CB4X4
+ 2 * (row + idy), 2 * (col + idx),
+#else
+ row + idy, col + idx,
+#endif // CONFIG_CB4X4
+ BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+
+ ratey += x->rate;
+ skip = x->pvq_skip[0];
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+ can_skip &= skip;
+#endif // !CONFIG_PVQ
+
+        if (!is_lossless) {  // To use the pixel-domain distortion, the
+                             // inverse txfm must be computed *before* the RD
+                             // cost. Compared to computing the distortion in
+                             // the frequency domain, the extra encoding
+                             // effort is small.
+#if CONFIG_PVQ
+ if (!skip)
+#endif // CONFIG_PVQ
+ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
+ tx_type, tx_size, dst, dst_stride,
+ p->eobs[block]);
+ unsigned int tmp;
+ cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
+ const int64_t dist = (int64_t)tmp << 4;
+ distortion += dist;
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next;
+
+ if (is_lossless) { // Calculate inverse txfm *after* RD cost.
+#if CONFIG_PVQ
+ if (!skip)
+#endif // CONFIG_PVQ
+ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
+ DCT_DCT, tx_size, dst, dst_stride,
+ p->eobs[block]);
+ }
+ }
+ }
+
+ rate += ratey;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd) {
+ *bestrate = rate;
+ *bestratey = ratey;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ best_can_skip = can_skip;
+ *best_mode = mode;
+ memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0]));
+ memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0]));
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+ for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy)
+ memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
+ pred_width_in_transform_blocks * 4);
+ }
+ next : {}
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+ } // mode decision loop
+
+ if (best_rd >= rd_thresh) return best_rd;
+
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+
+ if (y_skip) *y_skip &= best_can_skip;
+
+ for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy)
+ memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
+ pred_width_in_transform_blocks * 4);
+
+ return best_rd;
+}
+
+static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
+ MACROBLOCK *mb, int *rate,
+ int *rate_y, int64_t *distortion,
+ int *y_skip, int64_t best_rd) {
+ const MACROBLOCKD *const xd = &mb->e_mbd;
+ MODE_INFO *const mic = xd->mi[0];
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
+ MB_MODE_INFO *const mbmi = &mic->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize];
+ const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ int cost = 0;
+ int64_t total_distortion = 0;
+ int tot_rate_y = 0;
+ int64_t total_rd = 0;
+ const int *bmode_costs = cpi->mbmode_cost[0];
+ const int is_lossless = xd->lossless[mbmi->segment_id];
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ const TX_SIZE tx_size = is_lossless ? TX_4X4 : max_txsize_rect_lookup[bsize];
+#else
+ const TX_SIZE tx_size = TX_4X4;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+ mbmi->intra_filter = INTRA_FILTER_LINEAR;
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif // CONFIG_FILTER_INTRA
+
+  // TODO(any): Add a tx_type search to improve RD performance at the expense
+  // of speed.
+ mbmi->tx_type = DCT_DCT;
+ mbmi->tx_size = tx_size;
+
+ if (y_skip) *y_skip = 1;
+
+ // Pick modes for each prediction sub-block (of size 4x4, 4x8, or 8x4) in this
+ // 8x8 coding block.
+ for (idy = 0; idy < 2; idy += pred_height_in_4x4_blocks) {
+ for (idx = 0; idx < 2; idx += pred_width_in_4x4_blocks) {
+ PREDICTION_MODE best_mode = DC_PRED;
+ int r = INT_MAX, ry = INT_MAX;
+ int64_t d = INT64_MAX, this_rd = INT64_MAX;
+ int j;
+ const int pred_block_idx = idy * 2 + idx;
+ if (cpi->common.frame_type == KEY_FRAME) {
+ const PREDICTION_MODE A =
+ av1_above_block_mode(mic, above_mi, pred_block_idx);
+ const PREDICTION_MODE L =
+ av1_left_block_mode(mic, left_mi, pred_block_idx);
+
+ bmode_costs = cpi->y_mode_costs[A][L];
+ }
+ this_rd = rd_pick_intra_sub_8x8_y_subblock_mode(
+ cpi, mb, idy, idx, &best_mode, bmode_costs,
+ xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r,
+ &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd);
+#if !CONFIG_DAALA_DIST
+ if (this_rd >= best_rd - total_rd) return INT64_MAX;
+#endif // !CONFIG_DAALA_DIST
+ total_rd += this_rd;
+ cost += r;
+ total_distortion += d;
+ tot_rate_y += ry;
+
+ mic->bmi[pred_block_idx].as_mode = best_mode;
+ for (j = 1; j < pred_height_in_4x4_blocks; ++j)
+ mic->bmi[pred_block_idx + j * 2].as_mode = best_mode;
+ for (j = 1; j < pred_width_in_4x4_blocks; ++j)
+ mic->bmi[pred_block_idx + j].as_mode = best_mode;
+
+ if (total_rd >= best_rd) return INT64_MAX;
+ }
+ }
+ mbmi->mode = mic->bmi[3].as_mode;
+
+#if CONFIG_DAALA_DIST
+ {
+ const struct macroblock_plane *p = &mb->plane[0];
+ const struct macroblockd_plane *pd = &xd->plane[0];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ uint8_t *src = p->src.buf;
+ uint8_t *dst = pd->dst.buf;
+ int use_activity_masking = 0;
+ int qm = OD_HVS_QM;
+
+#if CONFIG_PVQ
+ use_activity_masking = mb->daala_enc.use_activity_masking;
+#endif // CONFIG_PVQ
+ // Daala-defined distortion computed for the block of 8x8 pixels
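+    // The << 4 (x16) matches the scaling applied to SSE-based distortions in
+    // av1_dist_block(), keeping Daala and SSE costs comparable.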
+ total_distortion = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8,
+ qm, use_activity_masking, mb->qindex)
+ << 4;
+ }
+#endif // CONFIG_DAALA_DIST
+ // Add in the cost of the transform type
+ if (!is_lossless) {
+ int rate_tx_type = 0;
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(tx_size, bsize, 0, cpi->common.reduced_tx_set_used) >
+ 1) {
+ const int eset =
+ get_ext_tx_set(tx_size, bsize, 0, cpi->common.reduced_tx_set_used);
+ rate_tx_type = cpi->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]]
+ [mbmi->mode][mbmi->tx_type];
+ }
+#else
+ rate_tx_type =
+ cpi->intra_tx_type_costs[txsize_sqr_map[tx_size]]
+ [intra_mode_to_tx_type_context[mbmi->mode]]
+ [mbmi->tx_type];
+#endif // CONFIG_EXT_TX
+ assert(mbmi->tx_size == tx_size);
+ cost += rate_tx_type;
+ tot_rate_y += rate_tx_type;
+ }
+
+ *rate = cost;
+ *rate_y = tot_rate_y;
+ *distortion = total_distortion;
+
+ return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
+}
+
+#if CONFIG_FILTER_INTRA
+// Return 1 if a filter intra mode is selected; return 0 otherwise.
+static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable,
+ BLOCK_SIZE bsize, int mode_cost,
+ int64_t *best_rd, int64_t *best_model_rd,
+ uint16_t skip_mask) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mic = xd->mi[0];
+ MB_MODE_INFO *mbmi = &mic->mbmi;
+ int filter_intra_selected_flag = 0;
+ FILTER_INTRA_MODE mode;
+ TX_SIZE best_tx_size = TX_4X4;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ TX_TYPE best_tx_type;
+
+ av1_zero(filter_intra_mode_info);
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 1;
+ mbmi->mode = DC_PRED;
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[0] = 0;
+#endif // CONFIG_PALETTE
+
+ for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+ int this_rate;
+ int64_t this_rd, this_model_rd;
+ RD_STATS tokenonly_rd_stats;
+ if (skip_mask & (1 << mode)) continue;
+ mbmi->filter_intra_mode_info.filter_intra_mode[0] = mode;
+ this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
+ if (*best_model_rd != INT64_MAX &&
+ this_model_rd > *best_model_rd + (*best_model_rd >> 1))
+ continue;
+ if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+ super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ this_rate = tokenonly_rd_stats.rate +
+ av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 1) +
+ write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost;
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
+
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ best_tx_size = mic->mbmi.tx_size;
+ filter_intra_mode_info = mbmi->filter_intra_mode_info;
+ best_tx_type = mic->mbmi.tx_type;
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip;
+ filter_intra_selected_flag = 1;
+ }
+ }
+
+ if (filter_intra_selected_flag) {
+ mbmi->mode = DC_PRED;
+ mbmi->tx_size = best_tx_size;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] =
+ filter_intra_mode_info.use_filter_intra_mode[0];
+ mbmi->filter_intra_mode_info.filter_intra_mode[0] =
+ filter_intra_mode_info.filter_intra_mode[0];
+ mbmi->tx_type = best_tx_type;
+ return 1;
+ } else {
+ return 0;
+ }
+}
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+// Run the RD calculation with the given luma intra prediction angle and
+// return the RD cost. Update the best mode info if the RD cost is the best
+// so far.
+static int64_t calc_rd_given_intra_angle(
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
+ int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
+ RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
+ TX_TYPE *best_tx_type,
+#if CONFIG_INTRA_INTERP
+ INTRA_FILTER *best_filter,
+#endif // CONFIG_INTRA_INTERP
+ int64_t *best_rd, int64_t *best_model_rd) {
+ int this_rate;
+ RD_STATS tokenonly_rd_stats;
+ int64_t this_rd, this_model_rd;
+ MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
+
+ mbmi->angle_delta[0] = angle_delta;
+ this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
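+  // Prune using the model-based RD estimate: skip the full RD search when
+  // the estimate is more than 1.5x the best estimate seen so far.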
+ if (*best_model_rd != INT64_MAX &&
+ this_model_rd > *best_model_rd + (*best_model_rd >> 1))
+ return INT64_MAX;
+ if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+ super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in);
+ if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
+
+ this_rate = tokenonly_rd_stats.rate + mode_cost +
+ write_uniform_cost(2 * max_angle_delta + 1,
+ mbmi->angle_delta[0] + max_angle_delta);
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
+
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *best_angle_delta = mbmi->angle_delta[0];
+ *best_tx_size = mbmi->tx_size;
+#if CONFIG_INTRA_INTERP
+ *best_filter = mbmi->intra_filter;
+#endif // CONFIG_INTRA_INTERP
+ *best_tx_type = mbmi->tx_type;
+ *rate = this_rate;
+ rd_stats->rate = tokenonly_rd_stats.rate;
+ rd_stats->dist = tokenonly_rd_stats.dist;
+ rd_stats->skip = tokenonly_rd_stats.skip;
+ }
+ return this_rd;
+}
+
+// With the given luma directional intra prediction mode, pick the best angle
+// delta. Return the RD cost corresponding to the best angle delta.
+static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, RD_STATS *rd_stats,
+ BLOCK_SIZE bsize, int mode_cost,
+ int64_t best_rd,
+ int64_t *best_model_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mic = xd->mi[0];
+ MB_MODE_INFO *mbmi = &mic->mbmi;
+ int i, angle_delta, best_angle_delta = 0;
+ int first_try = 1;
+#if CONFIG_INTRA_INTERP
+ int p_angle;
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ INTRA_FILTER filter, best_filter = INTRA_FILTER_LINEAR;
+#endif // CONFIG_INTRA_INTERP
+ int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+ TX_SIZE best_tx_size = mic->mbmi.tx_size;
+ TX_TYPE best_tx_type = mbmi->tx_type;
+
+ for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
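+  // Stage 1: evaluate even angle deltas in both directions. Stage 2 below
+  // then tries each odd delta only when one of its even neighbors produced a
+  // competitive RD cost.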
+ for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+#if CONFIG_INTRA_INTERP
+ for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
+ if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue;
+ mic->mbmi.intra_filter = filter;
+#endif // CONFIG_INTRA_INTERP
+ for (i = 0; i < 2; ++i) {
+ best_rd_in = (best_rd == INT64_MAX)
+ ? INT64_MAX
+ : (best_rd + (best_rd >> (first_try ? 3 : 5)));
+ this_rd = calc_rd_given_intra_angle(
+ cpi, x, bsize,
+#if CONFIG_INTRA_INTERP
+ mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
+#else
+ mode_cost,
+#endif // CONFIG_INTRA_INTERP
+ best_rd_in, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
+ rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
+#if CONFIG_INTRA_INTERP
+ &best_filter,
+#endif // CONFIG_INTRA_INTERP
+ &best_rd, best_model_rd);
+ rd_cost[2 * angle_delta + i] = this_rd;
+ if (first_try && this_rd == INT64_MAX) return best_rd;
+ first_try = 0;
+ if (angle_delta == 0) {
+ rd_cost[1] = this_rd;
+ break;
+ }
+ }
+#if CONFIG_INTRA_INTERP
+ }
+#endif // CONFIG_INTRA_INTERP
+ }
+
+ assert(best_rd != INT64_MAX);
+ for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ int64_t rd_thresh;
+#if CONFIG_INTRA_INTERP
+ for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
+ if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue;
+ mic->mbmi.intra_filter = filter;
+#endif // CONFIG_INTRA_INTERP
+ for (i = 0; i < 2; ++i) {
+ int skip_search = 0;
+ rd_thresh = best_rd + (best_rd >> 5);
+ if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+ rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+ skip_search = 1;
+ if (!skip_search) {
+ calc_rd_given_intra_angle(
+ cpi, x, bsize,
+#if CONFIG_INTRA_INTERP
+ mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
+#else
+ mode_cost,
+#endif // CONFIG_INTRA_INTERP
+ best_rd, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
+ rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
+#if CONFIG_INTRA_INTERP
+ &best_filter,
+#endif // CONFIG_INTRA_INTERP
+ &best_rd, best_model_rd);
+ }
+ }
+#if CONFIG_INTRA_INTERP
+ }
+#endif // CONFIG_INTRA_INTERP
+ }
+
+#if CONFIG_INTRA_INTERP
+ if (FILTER_FAST_SEARCH && rd_stats->rate < INT_MAX) {
+ p_angle = mode_to_angle_map[mbmi->mode] + best_angle_delta * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle)) {
+ for (filter = INTRA_FILTER_LINEAR + 1; filter < INTRA_FILTERS; ++filter) {
+ mic->mbmi.intra_filter = filter;
+ this_rd = calc_rd_given_intra_angle(
+ cpi, x, bsize,
+ mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
+ best_rd, best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
+ &best_angle_delta, &best_tx_size, &best_tx_type, &best_filter,
+ &best_rd, best_model_rd);
+ }
+ }
+ }
+#endif // CONFIG_INTRA_INTERP
+
+ mbmi->tx_size = best_tx_size;
+ mbmi->angle_delta[0] = best_angle_delta;
+#if CONFIG_INTRA_INTERP
+ mic->mbmi.intra_filter = best_filter;
+#endif // CONFIG_INTRA_INTERP
+ mbmi->tx_type = best_tx_type;
+ return best_rd;
+}
+
+// Indexed by the sign of the gradient, and the integer and fractional (1/16
+// units) parts of its |dx|/|dy| ratio.
+static const uint8_t gradient_to_angle_bin[2][7][16] = {
+ {
+ { 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ },
+ {
+ { 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 },
+ { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ },
+};
+
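+// Dominant angle bin for each intra prediction mode; the entries for the
+// non-directional DC_PRED and TM_PRED modes are unused.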
+static const uint8_t mode_to_angle_bin[INTRA_MODES] = {
+ 0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
+};
+
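+// Build a histogram of edge directions over the source block: for each
+// interior pixel, compute the gradient (dx, dy), quantize its direction into
+// one of DIRECTIONAL_MODES angle bins via gradient_to_angle_bin, and
+// accumulate the squared gradient magnitude as the bin weight. Directional
+// modes whose neighborhood-weighted score is small relative to the total
+// energy are flagged in directional_mode_skip_mask and skipped later.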
+static void angle_estimation(const uint8_t *src, int src_stride, int rows,
+ int cols, uint8_t *directional_mode_skip_mask) {
+ int i, r, c, index, dx, dy, temp, sn, remd, quot;
+ uint64_t hist[DIRECTIONAL_MODES];
+ uint64_t hist_sum = 0;
+
+ memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
+ src += src_stride;
+ for (r = 1; r < rows; ++r) {
+ for (c = 1; c < cols; ++c) {
+ dx = src[c] - src[c - 1];
+ dy = src[c] - src[c - src_stride];
+ temp = dx * dx + dy * dy;
+ if (dy == 0) {
+ index = 2;
+ } else {
+ sn = (dx > 0) ^ (dy > 0);
+ dx = abs(dx);
+ dy = abs(dy);
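+        // Split |dx|/|dy| into integer (quot) and 1/16-unit fractional
+        // (remd) parts to index the angle-bin lookup table.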
+ remd = dx % dy;
+ quot = dx / dy;
+ remd = remd * 16 / dy;
+ index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
+ }
+ hist[index] += temp;
+ }
+ src += src_stride;
+ }
+
+ for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
+ for (i = 0; i < INTRA_MODES; ++i) {
+ if (i != DC_PRED && i != TM_PRED) {
+ const uint8_t angle_bin = mode_to_angle_bin[i];
+ uint64_t score = 2 * hist[angle_bin];
+ int weight = 2;
+ if (angle_bin > 0) {
+ score += hist[angle_bin - 1];
+ ++weight;
+ }
+ if (angle_bin < DIRECTIONAL_MODES - 1) {
+ score += hist[angle_bin + 1];
+ ++weight;
+ }
+ if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
+ directional_mode_skip_mask[i] = 1;
+ }
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
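+// High bit-depth counterpart of angle_estimation(); identical except that it
+// reads 16-bit source pixels.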
+static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
+ int rows, int cols,
+ uint8_t *directional_mode_skip_mask) {
+ int i, r, c, index, dx, dy, temp, sn, remd, quot;
+ uint64_t hist[DIRECTIONAL_MODES];
+ uint64_t hist_sum = 0;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+
+ memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
+ src += src_stride;
+ for (r = 1; r < rows; ++r) {
+ for (c = 1; c < cols; ++c) {
+ dx = src[c] - src[c - 1];
+ dy = src[c] - src[c - src_stride];
+ temp = dx * dx + dy * dy;
+ if (dy == 0) {
+ index = 2;
+ } else {
+ sn = (dx > 0) ^ (dy > 0);
+ dx = abs(dx);
+ dy = abs(dy);
+ remd = dx % dy;
+ quot = dx / dy;
+ remd = remd * 16 / dy;
+ index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
+ }
+ hist[index] += temp;
+ }
+ src += src_stride;
+ }
+
+ for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
+ for (i = 0; i < INTRA_MODES; ++i) {
+ if (i != DC_PRED && i != TM_PRED) {
+ const uint8_t angle_bin = mode_to_angle_bin[i];
+ uint64_t score = 2 * hist[angle_bin];
+ int weight = 2;
+ if (angle_bin > 0) {
+ score += hist[angle_bin - 1];
+ ++weight;
+ }
+ if (angle_bin < DIRECTIONAL_MODES - 1) {
+ score += hist[angle_bin + 1];
+ ++weight;
+ }
+ if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
+ directional_mode_skip_mask[i] = 1;
+ }
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_EXT_INTRA
+
+// This function is used only for intra_only frames
+static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable,
+ BLOCK_SIZE bsize, int64_t best_rd) {
+ uint8_t mode_idx;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mic = xd->mi[0];
+ MB_MODE_INFO *const mbmi = &mic->mbmi;
+ MB_MODE_INFO best_mbmi = *mbmi;
+ int64_t best_model_rd = INT64_MAX;
+#if CONFIG_EXT_INTRA
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+#if CONFIG_INTRA_INTERP
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+#endif // CONFIG_INTRA_INTERP
+ int is_directional_mode;
+ uint8_t directional_mode_skip_mask[INTRA_MODES];
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *src = x->plane[0].src.buf;
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ int beat_best_rd = 0;
+ uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1;
+#endif // CONFIG_FILTER_INTRA
+ const int *bmode_costs;
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ uint8_t *best_palette_color_map =
+ cpi->common.allow_screen_content_tools
+ ? x->palette_buffer->best_palette_color_map
+ : NULL;
+ int palette_y_mode_ctx = 0;
+ const int try_palette =
+ cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8;
+#endif // CONFIG_PALETTE
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
+ const PREDICTION_MODE A = av1_above_block_mode(mic, above_mi, 0);
+ const PREDICTION_MODE L = av1_left_block_mode(mic, left_mi, 0);
+ const PREDICTION_MODE FINAL_MODE_SEARCH = TM_PRED + 1;
+#if CONFIG_PVQ
+ od_rollback_buffer pre_buf, post_buf;
+
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+ bmode_costs = cpi->y_mode_costs[A][L];
+
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[0] = 0;
+ memset(directional_mode_skip_mask, 0,
+ sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ highbd_angle_estimation(src, src_stride, rows, cols,
+ directional_mode_skip_mask);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ angle_estimation(src, src_stride, rows, cols, directional_mode_skip_mask);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_PALETTE
+ pmi->palette_size[0] = 0;
+ if (above_mi)
+ palette_y_mode_ctx +=
+ (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+ if (left_mi)
+ palette_y_mode_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+#endif // CONFIG_PALETTE
+
+ if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+ x->use_default_intra_tx_type = 1;
+ else
+ x->use_default_intra_tx_type = 0;
+
+  /* Search for the best luma (Y) intra prediction mode */
+ for (mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) {
+ RD_STATS this_rd_stats;
+ int this_rate, this_rate_tokenonly, s;
+ int64_t this_distortion, this_rd, this_model_rd;
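+    // The extra FINAL_MODE_SEARCH pass re-evaluates the best mode with the
+    // full transform-type search; it only runs when the fast default-tx-type
+    // shortcut was in use.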
+ if (mode_idx == FINAL_MODE_SEARCH) {
+ if (x->use_default_intra_tx_type == 0) break;
+ mbmi->mode = best_mbmi.mode;
+ x->use_default_intra_tx_type = 0;
+ } else {
+ mbmi->mode = mode_idx;
+ }
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[0] = 0;
+#endif // CONFIG_EXT_INTRA
+ this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]);
+ if (best_model_rd != INT64_MAX &&
+ this_model_rd > best_model_rd + (best_model_rd >> 1))
+ continue;
+ if (this_model_rd < best_model_rd) best_model_rd = this_model_rd;
+#if CONFIG_EXT_INTRA
+ is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize);
+ if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
+ if (is_directional_mode) {
+ this_rd_stats.rate = INT_MAX;
+ rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
+ bmode_costs[mbmi->mode], best_rd, &best_model_rd);
+ } else {
+ super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
+ }
+#else
+ super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
+#endif // CONFIG_EXT_INTRA
+ this_rate_tokenonly = this_rd_stats.rate;
+ this_distortion = this_rd_stats.dist;
+ s = this_rd_stats.skip;
+
+ if (this_rate_tokenonly == INT_MAX) continue;
+
+ this_rate = this_rate_tokenonly + bmode_costs[mbmi->mode];
+
+ if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ this_rate_tokenonly -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
+ }
+#if CONFIG_PALETTE
+ if (try_palette && mbmi->mode == DC_PRED) {
+ this_rate +=
+ av1_cost_bit(av1_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+ [palette_y_mode_ctx],
+ 0);
+ }
+#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ if (mbmi->mode == DC_PRED)
+ this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 0);
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
+ if (is_directional_mode) {
+#if CONFIG_INTRA_INTERP
+ const int p_angle =
+ mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle))
+ this_rate +=
+ cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
+#endif // CONFIG_INTRA_INTERP
+ this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
+ }
+#endif // CONFIG_EXT_INTRA
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
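+    // Filter-intra modes are only searched for intra modes whose RD cost
+    // came within 1/16 of the best so far; toggle the corresponding bit off
+    // in the skip mask used by rd_pick_filter_intra_sby().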
+#if CONFIG_FILTER_INTRA
+ if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) {
+ filter_intra_mode_skip_mask ^= (1 << mbmi->mode);
+ }
+#endif // CONFIG_FILTER_INTRA
+
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;
+ best_rd = this_rd;
+#if CONFIG_FILTER_INTRA
+ beat_best_rd = 1;
+#endif // CONFIG_FILTER_INTRA
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+ }
+ }
+
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+
+#if CONFIG_CFL
+  // Perform one extra txfm_rd_in_plane() call, this time with the best value,
+  // so that the reconstructed luma values can be stored for CFL prediction.
+ RD_STATS this_rd_stats;
+ x->cfl_store_y = 1;
+ txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, 0, bsize,
+ mic->mbmi.tx_size, cpi->sf.use_fast_coef_costing);
+ x->cfl_store_y = 0;
+#endif
+
+#if CONFIG_PALETTE
+ if (try_palette) {
+ rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx,
+ bmode_costs[DC_PRED], &best_mbmi,
+ best_palette_color_map, &best_rd, &best_model_rd,
+ rate, rate_tokenonly, distortion, skippable);
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ if (beat_best_rd) {
+ if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+ skippable, bsize, bmode_costs[DC_PRED],
+ &best_rd, &best_model_rd,
+ filter_intra_mode_skip_mask)) {
+ best_mbmi = *mbmi;
+ }
+ }
+#endif // CONFIG_FILTER_INTRA
+
+ *mbmi = best_mbmi;
+ return best_rd;
+}
+
+// Return value 0: early termination triggered, no valid rd cost available;
+// 1: rd cost values are valid.
+static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
+ int plane;
+ int is_cost_valid = 1;
+ av1_init_rd_stats(rd_stats);
+
+ if (ref_best_rd < 0) is_cost_valid = 0;
+
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+ if (x->skip_chroma_rd) return is_cost_valid;
+
+ bsize = scale_chroma_bsize(bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y);
+#endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+
+#if !CONFIG_PVQ
+ if (is_inter_block(mbmi) && is_cost_valid) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+ av1_subtract_plane(x, bsize, plane);
+ }
+#endif // !CONFIG_PVQ
+
+ if (is_cost_valid) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ RD_STATS pn_rd_stats;
+ txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, plane, bsize,
+ uv_tx_size, cpi->sf.use_fast_coef_costing);
+ if (pn_rd_stats.rate == INT_MAX) {
+ is_cost_valid = 0;
+ break;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
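+      // Terminate when the accumulated RD cost exceeds ref_best_rd and even
+      // the zero-rate (skip) bound RDCOST(0, sse) cannot beat it.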
+ if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) >
+ ref_best_rd &&
+ RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse) > ref_best_rd) {
+ is_cost_valid = 0;
+ break;
+ }
+ }
+ }
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+
+ return is_cost_valid;
+}
+
+#if CONFIG_VAR_TX
+// FIXME crop these calls
+static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride,
+ TX_SIZE tx_size) {
+ return aom_sum_squares_2d_i16(diff, diff_stride, tx_size_wide[tx_size],
+ tx_size_high[tx_size]);
+}
+
+void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+ int blk_row, int blk_col, int plane, int block,
+ int plane_bsize, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l, RD_STATS *rd_stats) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ int64_t tmp;
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+ BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
+ int bh = block_size_high[txm_bsize];
+ int bw = block_size_wide[txm_bsize];
+ int txb_h = tx_size_high_unit[tx_size];
+ int txb_w = tx_size_wide_unit[tx_size];
+
+ int src_stride = p->src.stride;
+ uint8_t *src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+  uint8_t *dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col)
+                              << tx_size_wide_log2[0]];
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, rec_buffer16[MAX_TX_SQUARE]);
+ uint8_t *rec_buffer;
+#else
+ DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
+ int max_blocks_high = block_size_high[plane_bsize];
+ int max_blocks_wide = block_size_wide[plane_bsize];
+ const int diff_stride = max_blocks_wide;
+ const int16_t *diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+ int txb_coeff_cost;
+
+ assert(tx_size < TX_SIZES_ALL);
+
+ if (xd->mb_to_bottom_edge < 0)
+ max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+ if (xd->mb_to_right_edge < 0)
+ max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+ max_blocks_high >>= tx_size_wide_log2[0];
+ max_blocks_wide >>= tx_size_wide_log2[0];
+
+ int coeff_ctx = get_entropy_context(tx_size, a, l);
+
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ coeff_ctx, AV1_XFORM_QUANT_FP);
+
+ av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
+
+// TODO(any): Use av1_dist_block to compute distortion
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16);
+ aom_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL,
+ 0, NULL, 0, bw, bh, xd->bd);
+ } else {
+ rec_buffer = (uint8_t *)rec_buffer16;
+ aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0,
+ NULL, 0, bw, bh);
+ }
+#else
+ aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL,
+ 0, bw, bh);
+#endif // CONFIG_HIGHBITDEPTH
+
+ if (blk_row + txb_h > max_blocks_high || blk_col + txb_w > max_blocks_wide) {
+ int idx, idy;
+ int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
+ int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
+ tmp = 0;
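+    // The transform block extends past the visible edge of the frame: sum
+    // the squared differences over smallest-size sub-blocks so that only the
+    // visible region is counted.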
+ for (idy = 0; idy < blocks_height; ++idy) {
+ for (idx = 0; idx < blocks_width; ++idx) {
+ const int16_t *d =
+ diff + ((idy * diff_stride + idx) << tx_size_wide_log2[0]);
+ tmp += sum_squares_2d(d, diff_stride, 0);
+ }
+ }
+ } else {
+ tmp = sum_squares_2d(diff, diff_stride, tx_size);
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
+#endif // CONFIG_HIGHBITDEPTH
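+  // Distortion and SSE are accumulated at 16x scale, matching the scale used
+  // by the transform-domain distortion elsewhere in this file.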
+ rd_stats->sse += tmp * 16;
+ const int eob = p->eobs[block];
+
+ av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, rec_buffer,
+ MAX_TX_SIZE, eob);
+ if (eob > 0) {
+ if (txb_w + blk_col > max_blocks_wide ||
+ txb_h + blk_row > max_blocks_high) {
+ int idx, idy;
+ unsigned int this_dist;
+ int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
+ int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
+ tmp = 0;
+ for (idy = 0; idy < blocks_height; ++idy) {
+ for (idx = 0; idx < blocks_width; ++idx) {
+ uint8_t *const s =
+ src + ((idy * src_stride + idx) << tx_size_wide_log2[0]);
+ uint8_t *const r =
+ rec_buffer + ((idy * MAX_TX_SIZE + idx) << tx_size_wide_log2[0]);
+ cpi->fn_ptr[0].vf(s, src_stride, r, MAX_TX_SIZE, &this_dist);
+ tmp += this_dist;
+ }
+ }
+ } else {
+ uint32_t this_dist;
+ cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, MAX_TX_SIZE,
+ &this_dist);
+ tmp = this_dist;
+ }
+ }
+ rd_stats->dist += tmp * 16;
+ txb_coeff_cost =
+ av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l, 0);
+ rd_stats->rate += txb_coeff_cost;
+ rd_stats->skip &= (eob == 0);
+
+#if CONFIG_RD_DEBUG
+ av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
+ txb_coeff_cost);
+#endif // CONFIG_RD_DEBUG
+}
+
+static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int plane, int block, int block32,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
+ ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
+ TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ int *is_cost_valid, RD_STATS *rd_stats_stack) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  TX_SIZE(*const inter_tx_size)[MAX_MIB_SIZE] =
+      (TX_SIZE(*)[MAX_MIB_SIZE])&mbmi->inter_tx_size[tx_row][tx_col];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ int64_t this_rd = INT64_MAX;
+ ENTROPY_CONTEXT *pta = ta + blk_col;
+ ENTROPY_CONTEXT *ptl = tl + blk_row;
+ int coeff_ctx, i;
+ int ctx =
+ txfm_partition_context(tx_above + (blk_col >> 1),
+ tx_left + (blk_row >> 1), mbmi->sb_type, tx_size);
+ int64_t sum_rd = INT64_MAX;
+ int tmp_eob = 0;
+ int zero_blk_rate;
+ RD_STATS sum_rd_stats;
+ const int tx_size_ctx = txsize_sqr_map[tx_size];
+
+ av1_init_rd_stats(&sum_rd_stats);
+
+ assert(tx_size < TX_SIZES_ALL);
+
+ if (ref_best_rd < 0) {
+ *is_cost_valid = 0;
+ return;
+ }
+
+ coeff_ctx = get_entropy_context(tx_size, pta, ptl);
+
+ av1_init_rd_stats(rd_stats);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0]
+ [coeff_ctx][EOB_TOKEN];
+
+ if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
+ inter_tx_size[0][0] = tx_size;
+
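+    // Reuse the RD stats cached for this 32x32 block by an earlier tx_type
+    // pass rather than recomputing them; the DCT_DCT pass fills the cache
+    // below.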
+ if (tx_size == TX_32X32 && mbmi->tx_type != DCT_DCT &&
+ rd_stats_stack[block32].rate != INT_MAX) {
+ *rd_stats = rd_stats_stack[block32];
+ p->eobs[block] = !rd_stats->skip;
+ x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
+ } else {
+ av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+ plane_bsize, pta, ptl, rd_stats);
+ if (tx_size == TX_32X32) {
+ rd_stats_stack[block32] = *rd_stats;
+ }
+ }
+
+ if ((RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, x->rddiv, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip == 1) &&
+ !xd->lossless[mbmi->segment_id]) {
+#if CONFIG_RD_DEBUG
+ av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
+ zero_blk_rate - rd_stats->rate);
+#endif // CONFIG_RD_DEBUG
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ x->blk_skip[plane][blk_row * bw + blk_col] = 1;
+ p->eobs[block] = 0;
+ } else {
+ x->blk_skip[plane][blk_row * bw + blk_col] = 0;
+ rd_stats->skip = 0;
+ }
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate +=
+ av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist);
+ tmp_eob = p->eobs[block];
+ }
+
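+  // Try splitting into four sub-blocks and keep whichever of "no split"
+  // (this_rd) and "split" (sum_rd) is cheaper; the winner's contexts and
+  // inter_tx_size entries are committed below.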
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+ RD_STATS this_rd_stats;
+ int this_cost_valid = 1;
+ int64_t tmp_rd = 0;
+
+ sum_rd_stats.rate =
+ av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
+
+ assert(tx_size < TX_SIZES_ALL);
+
+ for (i = 0; i < 4 && this_cost_valid; ++i) {
+ int offsetr = blk_row + (i >> 1) * bsl;
+ int offsetc = blk_col + (i & 0x01) * bsl;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ select_tx_block(cpi, x, offsetr, offsetc, plane, block, block32, sub_txs,
+ depth + 1, plane_bsize, ta, tl, tx_above, tx_left,
+ &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid,
+ rd_stats_stack);
+
+ av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats);
+
+ tmp_rd =
+ RDCOST(x->rdmult, x->rddiv, sum_rd_stats.rate, sum_rd_stats.dist);
+ if (this_rd < tmp_rd) break;
+ block += sub_step;
+ }
+ if (this_cost_valid) sum_rd = tmp_rd;
+ }
+
+ if (this_rd < sum_rd) {
+ int idx, idy;
+ for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) pta[i] = !(tmp_eob == 0);
+ for (i = 0; i < tx_size_high_unit[tx_size]; ++i) ptl[i] = !(tmp_eob == 0);
+ txfm_partition_update(tx_above + (blk_col >> 1), tx_left + (blk_row >> 1),
+ tx_size, tx_size);
+ inter_tx_size[0][0] = tx_size;
+ for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+ for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
+ inter_tx_size[idy][idx] = tx_size;
+ mbmi->tx_size = tx_size;
+ if (this_rd == INT64_MAX) *is_cost_valid = 0;
+ x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
+ } else {
+ *rd_stats = sum_rd_stats;
+ if (sum_rd == INT64_MAX) *is_cost_valid = 0;
+ }
+}
+
+static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd, RD_STATS *rd_stats_stack) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int is_cost_valid = 1;
+ int64_t this_rd = 0;
+
+ if (ref_best_rd < 0) is_cost_valid = 0;
+
+ av1_init_rd_stats(rd_stats);
+
+ if (is_cost_valid) {
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+ int block = 0;
+ int block32 = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+
+ RD_STATS pn_rd_stats;
+ av1_init_rd_stats(&pn_rd_stats);
+
+ av1_get_entropy_contexts(bsize, 0, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context,
+ sizeof(TXFM_CONTEXT) * (mi_width >> 1));
+ memcpy(tx_left, xd->left_txfm_context,
+ sizeof(TXFM_CONTEXT) * (mi_height >> 1));
+
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ select_tx_block(cpi, x, idy, idx, 0, block, block32, max_tx_size,
+ mi_height != mi_width, plane_bsize, ctxa, ctxl,
+ tx_above, tx_left, &pn_rd_stats, ref_best_rd - this_rd,
+ &is_cost_valid, rd_stats_stack);
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd += AOMMIN(
+ RDCOST(x->rdmult, x->rddiv, pn_rd_stats.rate, pn_rd_stats.dist),
+ RDCOST(x->rdmult, x->rddiv, 0, pn_rd_stats.sse));
+ block += step;
+ ++block32;
+ }
+ }
+ }
+
+ this_rd = AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist),
+ RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
+ if (this_rd > ref_best_rd) is_cost_valid = 0;
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+}
+
+static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd, TX_TYPE tx_type,
+ RD_STATS *rd_stats_stack) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int is_inter = is_inter_block(mbmi);
+ aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+ int s0 = av1_cost_bit(skip_prob, 0);
+ int s1 = av1_cost_bit(skip_prob, 1);
+ int64_t rd;
+ int row, col;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+
+ mbmi->tx_type = tx_type;
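+  // Start from the TX_SIZES_ALL sentinel (greater than any valid TX_SIZE);
+  // the true minimum is computed over inter_tx_size[][] below.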
+ mbmi->min_tx_size = TX_SIZES_ALL;
+ inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, rd_stats_stack);
+
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ for (row = 0; row < max_blocks_high / 2; ++row)
+ for (col = 0; col < max_blocks_wide / 2; ++col)
+ mbmi->min_tx_size = AOMMIN(
+ mbmi->min_tx_size, get_min_tx_size(mbmi->inter_tx_size[row][col]));
+
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(mbmi->min_tx_size, bsize, is_inter,
+ cm->reduced_tx_set_used) > 1 &&
+ !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ const int ext_tx_set = get_ext_tx_set(mbmi->min_tx_size, bsize, is_inter,
+ cm->reduced_tx_set_used);
+ if (is_inter) {
+ if (ext_tx_set > 0)
+ rd_stats->rate +=
+ cpi->inter_tx_type_costs[ext_tx_set]
+ [txsize_sqr_map[mbmi->min_tx_size]]
+ [mbmi->tx_type];
+ } else {
+ if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+ rd_stats->rate +=
+ cpi->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode]
+ [mbmi->tx_type];
+ }
+ }
+#else // CONFIG_EXT_TX
+ if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id])
+ rd_stats->rate +=
+ cpi->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type];
+#endif // CONFIG_EXT_TX
+
+ if (rd_stats->skip)
+ rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse);
+ else
+ rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate + s0, rd_stats->dist);
+
+ if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+ !(rd_stats->skip))
+ rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse));
+
+ return rd;
+}
+
+static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int64_t rd = INT64_MAX;
+ int64_t best_rd = INT64_MAX;
+ TX_TYPE tx_type, best_tx_type = DCT_DCT;
+ const int is_inter = is_inter_block(mbmi);
+ TX_SIZE best_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
+ TX_SIZE best_tx = max_txsize_lookup[bsize];
+ TX_SIZE best_min_tx_size = TX_SIZES_ALL;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
+ const int n4 = bsize_to_num_blk(bsize);
+ int idx, idy;
+ int prune = 0;
+ const int count32 =
+ 1 << (2 * (cm->mib_size_log2 - mi_width_log2_lookup[BLOCK_32X32]));
+#if CONFIG_EXT_PARTITION
+ RD_STATS rd_stats_stack[16];
+#else
+ RD_STATS rd_stats_stack[4];
+#endif // CONFIG_EXT_PARTITION
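+  // One cached RD_STATS entry per 32x32 block in the superblock; see
+  // select_tx_block() for how the cache is filled and reused across tx_types.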
+#if CONFIG_EXT_TX
+ const int ext_tx_set =
+ get_ext_tx_set(max_tx_size, bsize, is_inter, cm->reduced_tx_set_used);
+#endif // CONFIG_EXT_TX
+
+ if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+#if CONFIG_EXT_TX
+ prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set);
+#else
+ prune = prune_tx_types(cpi, bsize, x, xd, 0);
+#endif // CONFIG_EXT_TX
+
+ av1_invalid_rd_stats(rd_stats);
+
+ for (idx = 0; idx < count32; ++idx)
+ av1_invalid_rd_stats(&rd_stats_stack[idx]);
+
+ for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+#if CONFIG_EXT_TX
+ if (is_inter) {
+ if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
+ if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+ if (!do_tx_type_search(tx_type, prune)) continue;
+ }
+ } else {
+ if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
+ if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
+ }
+ if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
+ }
+#else // CONFIG_EXT_TX
+ if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+ !do_tx_type_search(tx_type, prune))
+ continue;
+#endif // CONFIG_EXT_TX
+ if (is_inter && x->use_default_inter_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, max_tx_size))
+ continue;
+
+ if (xd->lossless[mbmi->segment_id])
+ if (tx_type != DCT_DCT) continue;
+
+ rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
+ tx_type, rd_stats_stack);
+
+ if (rd < best_rd) {
+ best_rd = rd;
+ *rd_stats = this_rd_stats;
+ best_tx_type = mbmi->tx_type;
+ best_tx = mbmi->tx_size;
+ best_min_tx_size = mbmi->min_tx_size;
+ memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
+ }
+ }
+
+ mbmi->tx_type = best_tx_type;
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx];
+ mbmi->tx_size = best_tx;
+ mbmi->min_tx_size = best_min_tx_size;
+ memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+}
+
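+// Accumulate the RD stats for an already-selected transform partitioning:
+// recurse until the transform size assigned in inter_tx_size[][] is reached,
+// then add that block's rate and distortion via av1_tx_block_rd_b().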
+static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int plane, int block, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
+ ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ TX_SIZE plane_tx_size;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ assert(tx_size < TX_SIZES_ALL);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (tx_size == plane_tx_size) {
+ int i;
+ ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+ ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+ av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+ plane_bsize, ta, tl, rd_stats);
+
+ for (i = 0; i < tx_size_wide_unit[tx_size]; ++i)
+ ta[i] = !(p->eobs[block] == 0);
+ for (i = 0; i < tx_size_high_unit[tx_size]; ++i)
+ tl[i] = !(p->eobs[block] == 0);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+ int i;
+
+ assert(bsl > 0);
+
+ for (i = 0; i < 4; ++i) {
+ int offsetr = blk_row + (i >> 1) * bsl;
+ int offsetc = blk_col + (i & 0x01) * bsl;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ tx_block_rd(cpi, x, offsetr, offsetc, plane, block, sub_txs, plane_bsize,
+ above_ctx, left_ctx, rd_stats);
+ block += step;
+ }
+ }
+}
+
+// Return value 0: early termination triggered, no valid rd cost available;
+// 1: rd cost values are valid.
+static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int plane;
+ int is_cost_valid = 1;
+ int64_t this_rd;
+
+ if (ref_best_rd < 0) is_cost_valid = 0;
+
+ av1_init_rd_stats(rd_stats);
+
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+ if (x->skip_chroma_rd) return is_cost_valid;
+ bsize = AOMMAX(BLOCK_8X8, bsize);
+#endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx(mbmi->tx_size)) {
+ return super_block_uvrd(cpi, x, rd_stats, bsize, ref_best_rd);
+ }
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ if (is_inter_block(mbmi) && is_cost_valid) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+ av1_subtract_plane(x, bsize, plane);
+ }
+
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+ int block = 0;
+ const int step = bh * bw;
+ ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
+ RD_STATS pn_rd_stats;
+ av1_init_rd_stats(&pn_rd_stats);
+
+ av1_get_entropy_contexts(bsize, 0, pd, ta, tl);
+
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ tx_block_rd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize,
+ ta, tl, &pn_rd_stats);
+ block += step;
+ }
+ }
+
+ if (pn_rd_stats.rate == INT_MAX) {
+ is_cost_valid = 0;
+ break;
+ }
+
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+
+ this_rd =
+ AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist),
+ RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
+
+ if (this_rd > ref_best_rd) {
+ is_cost_valid = 0;
+ break;
+ }
+ }
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+
+ return is_cost_valid;
+}
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_PALETTE
+static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int dc_mode_cost,
+ uint8_t *best_palette_color_map,
+ MB_MODE_INFO *const best_mbmi,
+ int64_t *best_rd, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ int this_rate;
+ int64_t this_rd;
+ int colors_u, colors_v, colors;
+ const int src_stride = x->plane[1].src.stride;
+ const uint8_t *const src_u = x->plane[1].src.buf;
+ const uint8_t *const src_v = x->plane[2].src.buf;
+ uint8_t *const color_map = xd->plane[1].color_index_map;
+ RD_STATS tokenonly_rd_stats;
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+ if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return;
+
+ mbmi->uv_mode = DC_PRED;
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth) {
+ colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
+ cpi->common.bit_depth);
+ colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
+ cpi->common.bit_depth);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ colors_u = av1_count_colors(src_u, src_stride, rows, cols);
+ colors_v = av1_count_colors(src_v, src_stride, rows, cols);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ colors = colors_u > colors_v ? colors_u : colors_v;
+ if (colors > 1 && colors <= 64) {
+ int r, c, n, i, j;
+ const int max_itr = 50;
+ uint8_t color_order[PALETTE_MAX_SIZE];
+ float lb_u, ub_u, val_u;
+ float lb_v, ub_v, val_v;
+ float *const data = x->palette_buffer->kmeans_data_buf;
+ float centroids[2 * PALETTE_MAX_SIZE];
+
+#if CONFIG_HIGHBITDEPTH
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
+ uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+ if (cpi->common.use_highbitdepth) {
+ lb_u = src_u16[0];
+ ub_u = src_u16[0];
+ lb_v = src_v16[0];
+ ub_v = src_v16[0];
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ lb_u = src_u[0];
+ ub_u = src_u[0];
+ lb_v = src_v[0];
+ ub_v = src_v[0];
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+#if CONFIG_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth) {
+ val_u = src_u16[r * src_stride + c];
+ val_v = src_v16[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ val_u = src_u[r * src_stride + c];
+ val_v = src_v[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ if (val_u < lb_u)
+ lb_u = val_u;
+ else if (val_u > ub_u)
+ ub_u = val_u;
+ if (val_v < lb_v)
+ lb_v = val_v;
+ else if (val_v > ub_v)
+ ub_v = val_v;
+ }
+ }
+
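+    // Try palettes of decreasing size n: seed the n centroids uniformly
+    // inside the joint (U, V) bounding box, refine them with k-means over
+    // the 2-D chroma samples, then measure the exact RD cost of the result.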
+ for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
+ --n) {
+ for (i = 0; i < n; ++i) {
+ centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+ centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+ }
+ av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
+#if CONFIG_PALETTE_DELTA_ENCODING
+ // Sort the U channel colors in ascending order.
+ for (i = 0; i < 2 * (n - 1); i += 2) {
+ int min_idx = i;
+ float min_val = centroids[i];
+ for (j = i + 2; j < 2 * n; j += 2)
+ if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+ if (min_idx != i) {
+ float temp_u = centroids[i], temp_v = centroids[i + 1];
+ centroids[i] = centroids[min_idx];
+ centroids[i + 1] = centroids[min_idx + 1];
+ centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+ }
+ }
+ av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+ extend_palette_color_map(color_map, cols, rows, plane_block_width,
+ plane_block_height);
+ pmi->palette_size[1] = n;
+ for (i = 1; i < 3; ++i) {
+ for (j = 0; j < n; ++j) {
+#if CONFIG_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth)
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+ (int)centroids[j * 2 + i - 1], cpi->common.bit_depth);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+ clip_pixel((int)centroids[j * 2 + i - 1]);
+ }
+ }
+
+ super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ this_rate =
+ tokenonly_rd_stats.rate + dc_mode_cost +
+ cpi->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] +
+ write_uniform_cost(n, color_map[0]) +
+ av1_cost_bit(
+ av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 1);
+ this_rate += av1_palette_color_cost_uv(pmi, cpi->common.bit_depth);
+ for (i = 0; i < rows; ++i) {
+ for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+ int color_idx;
+ const int color_ctx = av1_get_palette_color_index_context(
+ color_map, plane_block_width, i, j, n, color_order, &color_idx);
+ assert(color_idx >= 0 && color_idx < n);
+ this_rate += cpi->palette_uv_color_cost[n - PALETTE_MIN_SIZE]
+ [color_ctx][color_idx];
+ }
+ }
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *best_mbmi = *mbmi;
+ memcpy(best_palette_color_map, color_map,
+ plane_block_width * plane_block_height *
+ sizeof(best_palette_color_map[0]));
+ *rate = this_rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *skippable = tokenonly_rd_stats.skip;
+ }
+ }
+ }
+ if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
+ memcpy(color_map, best_palette_color_map,
+ rows * cols * sizeof(best_palette_color_map[0]));
+ }
+}
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+// Return 1 if a filter intra mode is selected; return 0 otherwise.
+static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable,
+ BLOCK_SIZE bsize, int64_t *best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ int filter_intra_selected_flag = 0;
+ int this_rate;
+ int64_t this_rd;
+ FILTER_INTRA_MODE mode;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ RD_STATS tokenonly_rd_stats;
+
+ av1_zero(filter_intra_mode_info);
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1;
+ mbmi->uv_mode = DC_PRED;
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+
+ for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] = mode;
+ if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd))
+ continue;
+
+ this_rate = tokenonly_rd_stats.rate +
+ av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) +
+ cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
+ write_uniform_cost(FILTER_INTRA_MODES, mode);
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip;
+ filter_intra_mode_info = mbmi->filter_intra_mode_info;
+ filter_intra_selected_flag = 1;
+ }
+ }
+
+ if (filter_intra_selected_flag) {
+ mbmi->uv_mode = DC_PRED;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info.use_filter_intra_mode[1];
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info.filter_intra_mode[1];
+ return 1;
+ } else {
+ return 0;
+ }
+}
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+// Run the RD calculation with the given chroma intra prediction angle and
+// return the RD cost. Update the best mode info if the RD cost is the best
+// so far.
+static int64_t pick_intra_angle_routine_sbuv(
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
+ int *best_angle_delta, int64_t *best_rd) {
+ MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
+ int this_rate;
+ int64_t this_rd;
+ RD_STATS tokenonly_rd_stats;
+
+ if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
+ return INT64_MAX;
+ this_rate = tokenonly_rd_stats.rate + rate_overhead;
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *best_angle_delta = mbmi->angle_delta[1];
+ *rate = this_rate;
+ rd_stats->rate = tokenonly_rd_stats.rate;
+ rd_stats->dist = tokenonly_rd_stats.dist;
+ rd_stats->skip = tokenonly_rd_stats.skip;
+ }
+ return this_rd;
+}
+
+// With the given chroma directional intra prediction mode, pick the best
+// angle delta. Return true if an RD cost smaller than the input one is found.
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int rate_overhead,
+ int64_t best_rd, int *rate,
+ RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ int i, angle_delta, best_angle_delta = 0;
+ int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+ rd_stats->rate = INT_MAX;
+ rd_stats->skip = 0;
+ rd_stats->dist = INT64_MAX;
+ for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
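+  // Same two-stage search as the luma version: even angle deltas first, then
+  // odd deltas gated on the RD costs of their even neighbors.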
+ for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (i = 0; i < 2; ++i) {
+ best_rd_in = (best_rd == INT64_MAX)
+ ? INT64_MAX
+ : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
+ mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta;
+ this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
+ best_rd_in, rate, rd_stats,
+ &best_angle_delta, &best_rd);
+ rd_cost[2 * angle_delta + i] = this_rd;
+ if (angle_delta == 0) {
+ if (this_rd == INT64_MAX) return 0;
+ rd_cost[1] = this_rd;
+ break;
+ }
+ }
+ }
+
+ assert(best_rd != INT64_MAX);
+ for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ int64_t rd_thresh;
+ for (i = 0; i < 2; ++i) {
+ int skip_search = 0;
+ rd_thresh = best_rd + (best_rd >> 5);
+ if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+ rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+ skip_search = 1;
+ if (!skip_search) {
+ mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta;
+ pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ rate, rd_stats, &best_angle_delta,
+ &best_rd);
+ }
+ }
+ }
+
+ mbmi->angle_delta[1] = best_angle_delta;
+ return rd_stats->rate != INT_MAX;
+}
+#endif // CONFIG_EXT_INTRA
+
+static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ MB_MODE_INFO best_mbmi = *mbmi;
+ PREDICTION_MODE mode;
+ int64_t best_rd = INT64_MAX, this_rd;
+ int this_rate;
+ RD_STATS tokenonly_rd_stats;
+#if CONFIG_PVQ
+ od_rollback_buffer buf;
+ od_encode_checkpoint(&x->daala_enc, &buf);
+#endif // CONFIG_PVQ
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ uint8_t *best_palette_color_map = NULL;
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_PALETTE
+ pmi->palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+#if CONFIG_EXT_INTRA
+ const int is_directional_mode =
+ av1_is_directional_mode(mode, mbmi->sb_type);
+#endif // CONFIG_EXT_INTRA
+ if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
+ (1 << mode)))
+ continue;
+
+ mbmi->uv_mode = mode;
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[1] = 0;
+ if (is_directional_mode) {
+ const int rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] +
+ write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0);
+ if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ &this_rate, &tokenonly_rd_stats))
+ continue;
+ } else {
+#endif // CONFIG_EXT_INTRA
+ if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &buf);
+#endif // CONFIG_PVQ
+ continue;
+ }
+#if CONFIG_EXT_INTRA
+ }
+#endif // CONFIG_EXT_INTRA
+ this_rate =
+ tokenonly_rd_stats.rate + cpi->intra_uv_mode_cost[mbmi->mode][mode];
+
+#if CONFIG_EXT_INTRA
+ if (is_directional_mode) {
+ this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
+ }
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (mbmi->sb_type >= BLOCK_8X8 && mode == DC_PRED)
+ this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 0);
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_PALETTE
+ if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8 &&
+ mode == DC_PRED)
+ this_rate += av1_cost_bit(
+ av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0);
+#endif // CONFIG_PALETTE
+
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &buf);
+#endif // CONFIG_PVQ
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
+
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip;
+ }
+ }
+
+#if CONFIG_PALETTE
+ if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8) {
+ best_palette_color_map = x->palette_buffer->best_palette_color_map;
+ rd_pick_palette_intra_sbuv(cpi, x,
+ cpi->intra_uv_mode_cost[mbmi->mode][DC_PRED],
+ best_palette_color_map, &best_mbmi, &best_rd,
+ rate, rate_tokenonly, distortion, skippable);
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ if (mbmi->sb_type >= BLOCK_8X8) {
+ if (rd_pick_filter_intra_sbuv(cpi, x, rate, rate_tokenonly, distortion,
+ skippable, bsize, &best_rd))
+ best_mbmi = *mbmi;
+ }
+#endif // CONFIG_FILTER_INTRA
+
+ *mbmi = best_mbmi;
+ // Make sure we actually chose a mode
+ assert(best_rd < INT64_MAX);
+ return best_rd;
+}
+
+static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
+ TX_SIZE max_tx_size, int *rate_uv,
+ int *rate_uv_tokenonly, int64_t *dist_uv,
+ int *skip_uv, PREDICTION_MODE *mode_uv) {
+ // Use an estimated rd for uv_intra based on DC_PRED if the
+ // appropriate speed flag is set.
+ (void)ctx;
+#if CONFIG_CB4X4
+#if CONFIG_CHROMA_2X2
+ rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+ bsize, max_tx_size);
+#else
+ max_tx_size = AOMMAX(max_tx_size, TX_4X4);
+ if (x->skip_chroma_rd) {
+ *rate_uv = 0;
+ *rate_uv_tokenonly = 0;
+ *dist_uv = 0;
+ *skip_uv = 1;
+ *mode_uv = DC_PRED;
+ return;
+ }
+ BLOCK_SIZE bs = scale_chroma_bsize(bsize, x->e_mbd.plane[1].subsampling_x,
+ x->e_mbd.plane[1].subsampling_y);
+ rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+ bs, max_tx_size);
+#endif // CONFIG_CHROMA_2X2
+#else
+ rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+ bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
+#endif // CONFIG_CB4X4
+ *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
+}
+
+static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode,
+ int16_t mode_context) {
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(mode)) {
+ return cpi
+ ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
+ }
+#endif
+
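+  // Under CONFIG_REF_MV, mode_context packs several small bitfields: a NEWMV
+  // context in the low bits, ZEROMV and REFMV contexts at their respective
+  // offsets, plus flags (e.g. ALL_ZERO_FLAG_OFFSET) that short-circuit the
+  // costing below. The mode is then priced as a sequence of binary
+  // decisions: NEWMV vs. the rest, then ZEROMV vs. the rest, then
+  // NEARESTMV vs. NEARMV.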
+#if CONFIG_REF_MV
+ int mode_cost = 0;
+ int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+ int16_t is_all_zero_mv = mode_context & (1 << ALL_ZERO_FLAG_OFFSET);
+
+ assert(is_inter_mode(mode));
+
+ if (mode == NEWMV) {
+ mode_cost = cpi->newmv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost = cpi->newmv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+
+ if (is_all_zero_mv) return mode_cost;
+
+ if (mode == ZEROMV) {
+ mode_cost += cpi->zeromv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost += cpi->zeromv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+ if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
+ if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
+ if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
+
+ mode_cost += cpi->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+ return mode_cost;
+ }
+ }
+#else
+ assert(is_inter_mode(mode));
+ return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
+#endif // CONFIG_REF_MV
+}
+
+#if CONFIG_EXT_INTER
+static int get_interinter_compound_type_bits(BLOCK_SIZE bsize,
+ COMPOUND_TYPE comp_type) {
+ (void)bsize;
+ switch (comp_type) {
+ case COMPOUND_AVERAGE: return 0;
+#if CONFIG_WEDGE
+ case COMPOUND_WEDGE: return get_interinter_wedge_bits(bsize);
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ case COMPOUND_SEG: return 1;
+#endif // CONFIG_COMPOUND_SEGMENT
+ default: assert(0); return 0;
+ }
+}
+#endif // CONFIG_EXT_INTER
+
+static int set_and_cost_bmi_mvs(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *xd, int i,
+ PREDICTION_MODE mode, int_mv this_mv[2],
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME],
+ int_mv seg_mvs[TOTAL_REFS_PER_FRAME],
+#if CONFIG_EXT_INTER
+ int_mv compound_seg_newmvs[2],
+#endif // CONFIG_EXT_INTER
+ int_mv *best_ref_mv[2], const int *mvjcost, int *mvcost[2], int mi_row,
+ int mi_col) {
+ MODE_INFO *const mic = xd->mi[0];
+ const MB_MODE_INFO *const mbmi = &mic->mbmi;
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ int thismvcost = 0;
+ int idx, idy;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+ const int is_compound = has_second_ref(mbmi);
+ int mode_ctx;
+ (void)mi_row;
+ (void)mi_col;
+
+ switch (mode) {
+ case NEWMV: this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+#if CONFIG_EXT_INTER
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[0].as_mv, 0);
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_REF_MV
+ for (idx = 0; idx < 1 + is_compound; ++idx) {
+ this_mv[idx] = seg_mvs[mbmi->ref_frame[idx]];
+ av1_set_mvcost(x, mbmi->ref_frame[idx], idx, mbmi->ref_mv_idx);
+ thismvcost +=
+ av1_mv_bit_cost(&this_mv[idx].as_mv, &best_ref_mv[idx]->as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT_SUB);
+ }
+ (void)mvjcost;
+ (void)mvcost;
+#else
+ thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+#if !CONFIG_EXT_INTER
+ if (is_compound) {
+ this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+ thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ }
+#endif // !CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+ break;
+ case NEARMV:
+ case NEARESTMV:
+ this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+ if (is_compound)
+ this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
+ break;
+ case ZEROMV: {
+ int ref;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+#if CONFIG_GLOBAL_MOTION
+ this_mv[ref].as_int =
+ gm_get_motion_vector(
+ &cpi->common.global_motion[mbmi->ref_frame[ref]],
+ cpi->common.allow_high_precision_mv, mbmi->sb_type, mi_col,
+ mi_row, i)
+ .as_int;
+#else
+ this_mv[ref].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ }
+ break;
+ }
+#if CONFIG_EXT_INTER
+ case NEW_NEWMV:
+ if (compound_seg_newmvs[0].as_int == INVALID_MV ||
+ compound_seg_newmvs[1].as_int == INVALID_MV) {
+ this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+ this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+ } else {
+ this_mv[0].as_int = compound_seg_newmvs[0].as_int;
+ this_mv[1].as_int = compound_seg_newmvs[1].as_int;
+ }
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[0].as_mv, 0);
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[1].as_mv, 0);
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
+#endif
+ thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
+#endif
+ thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ break;
+ case NEW_NEARMV:
+ case NEW_NEARESTMV:
+ this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[0].as_mv, 0);
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
+#endif
+ thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
+ break;
+ case NEAR_NEWMV:
+ case NEAREST_NEWMV:
+ this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+ this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[1].as_mv, 0);
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
+#endif
+ thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ break;
+ case NEAREST_NEARMV:
+ case NEAR_NEARESTMV:
+ case NEAREST_NEARESTMV:
+ case NEAR_NEARMV:
+ this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+ this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
+ break;
+ case ZERO_ZEROMV:
+#if CONFIG_GLOBAL_MOTION
+ this_mv[0].as_int =
+ gm_get_motion_vector(&cpi->common.global_motion[mbmi->ref_frame[0]],
+ cpi->common.allow_high_precision_mv,
+ mbmi->sb_type, mi_col, mi_row, i)
+ .as_int;
+ this_mv[1].as_int =
+ gm_get_motion_vector(&cpi->common.global_motion[mbmi->ref_frame[1]],
+ cpi->common.allow_high_precision_mv,
+ mbmi->sb_type, mi_col, mi_row, i)
+ .as_int;
+#else
+ this_mv[0].as_int = 0;
+ this_mv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ break;
+#endif // CONFIG_EXT_INTER
+ default: break;
+ }
+
+ mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
+ if (is_compound) mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
+
+ mic->bmi[i].as_mode = mode;
+
+#if CONFIG_REF_MV
+ if (mode == NEWMV) {
+ mic->bmi[i].pred_mv[0].as_int =
+ mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_int;
+ if (is_compound)
+ mic->bmi[i].pred_mv[1].as_int =
+ mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_int;
+ } else {
+ mic->bmi[i].pred_mv[0].as_int = this_mv[0].as_int;
+ if (is_compound) mic->bmi[i].pred_mv[1].as_int = this_mv[1].as_int;
+ }
+#endif // CONFIG_REF_MV
+
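+  // Propagate this sub-block's mode info to every 4x4 unit it covers so that
+  // subsequent per-4x4 lookups (e.g. for a 4x8 or 8x4 partition) see
+  // consistent data.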
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy)
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
+ memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i]));
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ else
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, mbmi->sb_type, i);
+#else // CONFIG_REF_MV
+ mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+#endif // CONFIG_REF_MV
+ return cost_mv_ref(cpi, mode, mode_ctx) + thismvcost;
+}
+
+static int64_t encode_inter_mb_segment_sub8x8(
+ const AV1_COMP *const cpi, MACROBLOCK *x, int64_t best_yrd, int i,
+ int *labelyrate, int64_t *distortion, int64_t *sse, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, int ir, int ic, int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ struct macroblock_plane *const p = &x->plane[0];
+ MODE_INFO *const mi = xd->mi[0];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
+ const int txb_width = max_block_wide(xd, plane_bsize, 0);
+ const int txb_height = max_block_high(xd, plane_bsize, 0);
+ const int width = block_size_wide[plane_bsize];
+ const int height = block_size_high[plane_bsize];
+ int idx, idy;
+ const uint8_t *const src =
+ &p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ uint8_t *const dst =
+ &pd->dst.buf[av1_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
+ int64_t thisdistortion = 0, thissse = 0;
+ int thisrate = 0;
+ TX_SIZE tx_size = mi->mbmi.tx_size;
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, tx_size);
+ const int num_4x4_w = tx_size_wide_unit[tx_size];
+ const int num_4x4_h = tx_size_high_unit[tx_size];
+#if !CONFIG_PVQ
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 1);
+#else
+ (void)cpi;
+ (void)ta;
+ (void)tl;
+ (void)tx_type;
+#endif // !CONFIG_PVQ
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(xd->lossless[mi->mbmi.segment_id], tx_size == TX_4X4));
+ assert(IMPLIES(!xd->lossless[mi->mbmi.segment_id],
+ tx_size == max_txsize_rect_lookup[mi->mbmi.sb_type]));
+#else
+ assert(tx_size == TX_4X4);
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ assert(tx_type == DCT_DCT);
+
+ av1_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col);
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_subtract_block(
+ height, width, av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
+ } else {
+ aom_subtract_block(height, width,
+ av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
+ }
+#else
+ aom_subtract_block(height, width,
+ av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
+#endif // CONFIG_HIGHBITDEPTH
+
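+  // For each transform block: forward transform and quantize, optionally run
+  // the trellis coefficient optimization, then accumulate distortion and
+  // coefficient rate. The running RD cost is lower-bounded by the cheaper of
+  // the coded path (rd1) and the skip path (rd2); once that bound reaches
+  // best_yrd this segment cannot win, so bail out early with INT64_MAX.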
+ for (idy = 0; idy < txb_height; idy += num_4x4_h) {
+ for (idx = 0; idx < txb_width; idx += num_4x4_w) {
+ int64_t dist, ssz, rd, rd1, rd2;
+ int coeff_ctx;
+ const int k = i + (idy * 2 + idx);
+ const int block = av1_raster_order_to_block_index(tx_size, k);
+ assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
+ idx == 0 && idy == 0));
+ coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)), *(tl + (k >> 1)));
+ av1_xform_quant(cm, x, 0, block, idy + (i >> 1), idx + (i & 0x01),
+ BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)
+ av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
+ av1_dist_block(cpi, x, 0, BLOCK_8X8, block, idy + (i >> 1),
+ idx + (i & 0x1), tx_size, &dist, &ssz,
+ OUTPUT_HAS_PREDICTED_PIXELS);
+ thisdistortion += dist;
+ thissse += ssz;
+#if !CONFIG_PVQ
+ thisrate +=
+ av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, (ta + (k & 1)),
+ (tl + (k >> 1)), cpi->sf.use_fast_coef_costing);
+ *(ta + (k & 1)) = !(p->eobs[block] == 0);
+ *(tl + (k >> 1)) = !(p->eobs[block] == 0);
+#else
+ thisrate += x->rate;
+#endif // !CONFIG_PVQ
+#if CONFIG_EXT_TX
+ if (tx_size == TX_8X4) {
+ *(ta + (k & 1) + 1) = *(ta + (k & 1));
+ }
+ if (tx_size == TX_4X8) {
+ *(tl + (k >> 1) + 1) = *(tl + (k >> 1));
+ }
+#endif // CONFIG_EXT_TX
+ rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse);
+ rd = AOMMIN(rd1, rd2);
+ if (rd >= best_yrd) return INT64_MAX;
+ }
+ }
+
+ *distortion = thisdistortion;
+ *labelyrate = thisrate;
+ *sse = thissse;
+
+ return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
+
+typedef struct {
+ int eobs;
+ int brate;
+ int byrate;
+ int64_t bdist;
+ int64_t bsse;
+ int64_t brdcost;
+ int_mv mvs[2];
+#if CONFIG_REF_MV
+ int_mv pred_mv[2];
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ int_mv ref_mv[2];
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_CB4X4
+ ENTROPY_CONTEXT ta[4];
+ ENTROPY_CONTEXT tl[4];
+#else
+ ENTROPY_CONTEXT ta[2];
+ ENTROPY_CONTEXT tl[2];
+#endif // CONFIG_CB4X4
+} SEG_RDSTAT;
+
+typedef struct {
+ int_mv *ref_mv[2];
+ int_mv mvp;
+
+ int64_t segment_rd;
+ int r;
+ int64_t d;
+ int64_t sse;
+ int segment_yrate;
+ PREDICTION_MODE modes[4];
+#if CONFIG_EXT_INTER
+ SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
+#else
+ SEG_RDSTAT rdstat[4][INTER_MODES];
+#endif // CONFIG_EXT_INTER
+ int mvthresh;
+} BEST_SEG_INFO;
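+
+// BEST_SEG_INFO accumulates, for one interpolation-filter candidate, the
+// best mode/MV decision and RD statistics for each of the four 4x4 labels of
+// an 8x8 block. An array of these (bsi_buf, indexed by filter_idx) lets
+// later filter candidates reuse results from earlier searches.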
+
+static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
+ return (mv->row >> 3) < mv_limits->row_min ||
+ (mv->row >> 3) > mv_limits->row_max ||
+ (mv->col >> 3) < mv_limits->col_min ||
+ (mv->col >> 3) > mv_limits->col_max;
+}
+
+static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
+ MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
+
+ p->src.buf =
+ &p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
+ pd->pre[0].buf =
+ &pd->pre[0].buf[av1_raster_block_offset(BLOCK_8X8, i, pd->pre[0].stride)];
+ if (has_second_ref(mbmi))
+ pd->pre[1].buf =
+ &pd->pre[1]
+ .buf[av1_raster_block_offset(BLOCK_8X8, i, pd->pre[1].stride)];
+}
+
+static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
+ struct buf_2d orig_pre[2]) {
+ MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
+ x->plane[0].src = orig_src;
+ x->e_mbd.plane[0].pre[0] = orig_pre[0];
+ if (has_second_ref(mbmi)) x->e_mbd.plane[0].pre[1] = orig_pre[1];
+}
+
+// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion.
+// TODO(aconverse): Find out if this is still productive, then clean up or
+// remove it.
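+// Returns 0 (reject) when the candidate mode would code the same zero or
+// global motion vector at a higher cost than one of the other
+// NEARESTMV/NEARMV/ZEROMV alternatives, so evaluating it cannot improve the
+// RD decision; returns 1 otherwise.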
+static int check_best_zero_mv(
+ const AV1_COMP *const cpi, const int16_t mode_context[TOTAL_REFS_PER_FRAME],
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME],
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode,
+ const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block,
+ int mi_row, int mi_col) {
+ int_mv zeromv[2];
+ int comp_pred_mode = ref_frames[1] > INTRA_FRAME;
+ int cur_frm;
+ (void)mi_row;
+ (void)mi_col;
+ for (cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) {
+#if CONFIG_GLOBAL_MOTION
+ if (this_mode == ZEROMV
+#if CONFIG_EXT_INTER
+ || this_mode == ZERO_ZEROMV
+#endif // CONFIG_EXT_INTER
+ )
+ zeromv[cur_frm].as_int =
+ gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]],
+ cpi->common.allow_high_precision_mv, bsize,
+ mi_col, mi_row, block)
+ .as_int;
+ else
+#endif // CONFIG_GLOBAL_MOTION
+ zeromv[cur_frm].as_int = 0;
+ }
+#if !CONFIG_EXT_INTER
+  assert(ref_frames[1] != INTRA_FRAME);  // Just a sanity check.
+#endif // !CONFIG_EXT_INTER
+ if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
+ frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
+ (ref_frames[1] <= INTRA_FRAME ||
+ frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int)) {
+#if CONFIG_REF_MV
+ int16_t rfc =
+ av1_mode_context_analyzer(mode_context, ref_frames, bsize, block);
+#else
+ int16_t rfc = mode_context[ref_frames[0]];
+#endif // CONFIG_REF_MV
+ int c1 = cost_mv_ref(cpi, NEARMV, rfc);
+ int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
+ int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
+
+#if !CONFIG_REF_MV
+ (void)bsize;
+ (void)block;
+#endif // !CONFIG_REF_MV
+
+ if (this_mode == NEARMV) {
+ if (c1 > c3) return 0;
+ } else if (this_mode == NEARESTMV) {
+ if (c2 > c3) return 0;
+ } else {
+ assert(this_mode == ZEROMV);
+ if (ref_frames[1] <= INTRA_FRAME) {
+ if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
+ (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
+ return 0;
+ } else {
+ if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEARMV][ref_frames[1]].as_int == 0))
+ return 0;
+ }
+ }
+ }
+#if CONFIG_EXT_INTER
+ else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAREST_NEARMV ||
+ this_mode == NEAR_NEARESTMV || this_mode == NEAR_NEARMV ||
+ this_mode == ZERO_ZEROMV) &&
+ frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
+ frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) {
+#if CONFIG_REF_MV
+ int16_t rfc = compound_mode_context[ref_frames[0]];
+#else
+ int16_t rfc = mode_context[ref_frames[0]];
+#endif // CONFIG_REF_MV
+ int c1 = cost_mv_ref(cpi, NEAREST_NEARMV, rfc);
+ int c2 = cost_mv_ref(cpi, NEAREST_NEARESTMV, rfc);
+ int c3 = cost_mv_ref(cpi, ZERO_ZEROMV, rfc);
+ int c4 = cost_mv_ref(cpi, NEAR_NEARESTMV, rfc);
+ int c5 = cost_mv_ref(cpi, NEAR_NEARMV, rfc);
+
+ if (this_mode == NEAREST_NEARMV) {
+ if (c1 > c3) return 0;
+ } else if (this_mode == NEAREST_NEARESTMV) {
+ if (c2 > c3) return 0;
+ } else if (this_mode == NEAR_NEARESTMV) {
+ if (c4 > c3) return 0;
+ } else if (this_mode == NEAR_NEARMV) {
+ if (c5 > c3) return 0;
+ } else {
+ assert(this_mode == ZERO_ZEROMV);
+ if ((c3 >= c2 && frame_mv[NEAREST_NEARESTMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAREST_NEARESTMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c1 && frame_mv[NEAREST_NEARMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAREST_NEARMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c5 && frame_mv[NEAR_NEARMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c4 && frame_mv[NEAR_NEARESTMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAR_NEARESTMV][ref_frames[1]].as_int == 0))
+ return 0;
+ }
+ }
+#endif // CONFIG_EXT_INTER
+ return 1;
+}
+
+static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row,
+ int mi_col,
+#if CONFIG_EXT_INTER
+ int_mv *ref_mv_sub8x8[2],
+#endif // CONFIG_EXT_INTER
+ int *rate_mv, const int block) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(mbmi));
+ const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
+ int_mv ref_mv[2];
+ int ite, ref;
+#if CONFIG_DUAL_FILTER
+ InterpFilter interp_filter[4] = {
+ mbmi->interp_filter[0], mbmi->interp_filter[1], mbmi->interp_filter[2],
+ mbmi->interp_filter[3],
+ };
+#else
+ const InterpFilter interp_filter = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ struct scale_factors sf;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+  // ic and ir are the 4x4 coordinates of the sub8x8 block at index "block"
+ const int ic = block & 1;
+ const int ir = (block - ic) >> 1;
+ const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
+ const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
+#if CONFIG_GLOBAL_MOTION
+ int is_global[2];
+ for (ref = 0; ref < 2; ++ref) {
+ WarpedMotionParams *const wm =
+ &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype);
+ }
+#endif // CONFIG_GLOBAL_MOTION
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+  // Do joint motion search in compound mode to get more accurate motion
+  // vectors.
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+ int last_besterr[2] = { INT_MAX, INT_MAX };
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+ av1_get_scaled_ref_frame(cpi, refs[0]),
+ av1_get_scaled_ref_frame(cpi, refs[1])
+ };
+
+// Prediction buffer from second frame.
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
+ uint8_t *second_pred;
+#else
+ DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
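+// Compound prediction effectively averages the predictors from the two
+// references, so while one reference is searched, the fixed predictor from
+// the other reference is built into second_pred and folded into the match
+// cost via the averaging ('_av', '8p') search functions below.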
+
+#if CONFIG_EXT_INTER && CONFIG_CB4X4
+ (void)ref_mv_sub8x8;
+#endif // CONFIG_EXT_INTER && CONFIG_CB4X4
+
+ for (ref = 0; ref < 2; ++ref) {
+#if CONFIG_EXT_INTER && !CONFIG_CB4X4
+ if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL)
+ ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int;
+ else
+#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4
+ ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
+
+ if (scaled_ref_frame[ref]) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+ NULL);
+ }
+ }
+
+// Since we have scaled the reference frames to match the size of the current
+// frame we must use a unit scaling factor during mode selection.
+#if CONFIG_HIGHBITDEPTH
+ av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
+ cm->height, cm->use_highbitdepth);
+#else
+ av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
+ cm->height);
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Allow joint search multiple times iteratively for each reference frame
+ // and break out of the search loop if it couldn't find a better mv.
+ for (ite = 0; ite < 4; ite++) {
+ struct buf_2d ref_yv12[2];
+ int bestsme = INT_MAX;
+ int sadpb = x->sadperbit16;
+ MV *const best_mv = &x->best_mv.as_mv;
+ int search_range = 3;
+
+ MvLimits tmp_mv_limits = x->mv_limits;
+ int id = ite % 2; // Even iterations search in the first reference frame,
+ // odd iterations search in the second. The predictor
+ // found for the 'other' reference frame is factored in.
+ const int plane = 0;
+ ConvolveParams conv_params = get_conv_params(0, plane);
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ WarpTypesAllowed warp_types;
+#if CONFIG_GLOBAL_MOTION
+ warp_types.global_warp_allowed = is_global[!id];
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_WARPED_MOTION
+ warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
+#endif // CONFIG_WARPED_MOTION
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+ // Initialized here because of compiler problem in Visual Studio.
+ ref_yv12[0] = xd->plane[plane].pre[0];
+ ref_yv12[1] = xd->plane[plane].pre[1];
+
+#if CONFIG_DUAL_FILTER
+ // reload the filter types
+ interp_filter[0] =
+ (id == 0) ? mbmi->interp_filter[2] : mbmi->interp_filter[0];
+ interp_filter[1] =
+ (id == 0) ? mbmi->interp_filter[3] : mbmi->interp_filter[1];
+#endif // CONFIG_DUAL_FILTER
+
+// Get the prediction block from the 'other' reference frame.
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+ av1_highbd_build_inter_predictor(
+ ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
+ &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, p_col, p_row,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
+ } else {
+ second_pred = (uint8_t *)second_pred_alloc_16;
+#endif // CONFIG_HIGHBITDEPTH
+ av1_build_inter_predictor(
+ ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
+ &frame_mv[refs[!id]].as_mv, &sf, pw, ph, &conv_params, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, p_col, p_row, plane, !id,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Do compound motion search on the current reference frame.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+ av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
+
+ // Use the mv result from the single mode as mv predictor.
+ *best_mv = frame_mv[refs[id]].as_mv;
+
+ best_mv->col >>= 3;
+ best_mv->row >>= 3;
+
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+
+ // Small-range full-pixel motion search.
+ bestsme =
+ av1_refining_search_8p_c(x, sadpb, search_range, &cpi->fn_ptr[bsize],
+ &ref_mv[id].as_mv, second_pred);
+ if (bestsme < INT_MAX)
+ bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
+ second_pred, &cpi->fn_ptr[bsize], 1);
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (bestsme < INT_MAX) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ if (cpi->sf.use_upsampled_references) {
+ // Use up-sampled reference frames.
+ struct buf_2d backup_pred = pd->pre[0];
+ const YV12_BUFFER_CONFIG *upsampled_ref =
+ get_upsampled_ref(cpi, refs[id]);
+
+ // Set pred for Y plane
+ setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer,
+ upsampled_ref->y_crop_width,
+ upsampled_ref->y_crop_height, upsampled_ref->y_stride,
+ (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
+ pd->subsampling_y);
+
+// If bsize < BLOCK_8X8, adjust pred pointer for this block
+#if !CONFIG_CB4X4
+ if (bsize < BLOCK_8X8)
+ pd->pre[0].buf =
+ &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block,
+ pd->pre[0].stride))
+ << 3];
+#endif // !CONFIG_CB4X4
+
+ bestsme = cpi->find_fractional_mv_step(
+ x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], 0,
+ cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
+ &dis, &sse, second_pred, pw, ph, 1);
+
+ // Restore the reference frames.
+ pd->pre[0] = backup_pred;
+ } else {
+ (void)block;
+ bestsme = cpi->find_fractional_mv_step(
+ x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], 0,
+ cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
+ &dis, &sse, second_pred, pw, ph, 0);
+ }
+ }
+
+ // Restore the pointer to the first (possibly scaled) prediction buffer.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+
+ if (bestsme < last_besterr[id]) {
+ frame_mv[refs[id]].as_mv = *best_mv;
+ last_besterr[id] = bestsme;
+ } else {
+ break;
+ }
+ }
+
+ *rate_mv = 0;
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // Restore the prediction frame pointers to their unscaled versions.
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER && !CONFIG_CB4X4
+ if (bsize >= BLOCK_8X8)
+#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4
+ *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+ &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+#if CONFIG_EXT_INTER && !CONFIG_CB4X4
+ else
+ *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+ &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4
+ }
+}
+
+#if CONFIG_REF_MV && !CONFIG_EXT_INTER
+static void update_mv_search_and_seg_mvs(
+ int *const run_mv_search, int_mv *const seg_mvs, int has_second_rf,
+ const MV_REFERENCE_FRAME *const ref_frame,
+ const SEG_RDSTAT *const ref_rdstat, int_mv *const bsi_ref_mv[2]) {
+ if (has_second_rf) {
+ if (seg_mvs[ref_frame[0]].as_int == ref_rdstat->mvs[0].as_int &&
+ ref_rdstat->mvs[0].as_int != INVALID_MV)
+ if (bsi_ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int)
+ --*run_mv_search;
+
+ if (seg_mvs[ref_frame[1]].as_int == ref_rdstat->mvs[1].as_int &&
+ ref_rdstat->mvs[1].as_int != INVALID_MV)
+ if (bsi_ref_mv[1]->as_int == ref_rdstat->pred_mv[1].as_int)
+ --*run_mv_search;
+ } else {
+ if (bsi_ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int &&
+ ref_rdstat->mvs[0].as_int != INVALID_MV) {
+ *run_mv_search = 0;
+ seg_mvs[ref_frame[0]].as_int = ref_rdstat->mvs[0].as_int;
+ }
+ }
+}
+#endif // CONFIG_REF_MV && !CONFIG_EXT_INTER
+
+static int64_t rd_pick_inter_best_sub8x8_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *x, int_mv *best_ref_mv,
+ int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate,
+ int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse,
+ int mvthresh, int_mv seg_mvs[4][TOTAL_REFS_PER_FRAME],
+#if CONFIG_EXT_INTER
+ int_mv compound_seg_newmvs[4][2],
+#endif // CONFIG_EXT_INTER
+ BEST_SEG_INFO *bsi_buf, int filter_idx, int mi_row, int mi_col) {
+ BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
+#if CONFIG_REF_MV
+ int_mv tmp_ref_mv[2];
+#endif // CONFIG_REF_MV
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ int mode_idx;
+ int k, br = 0, idx, idy;
+ int64_t bd = 0, block_sse = 0;
+ PREDICTION_MODE this_mode;
+ const AV1_COMMON *cm = &cpi->common;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int label_count = 4;
+ int64_t this_segment_rd = 0;
+ int label_mv_thresh;
+ int segmentyrate = 0;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+#if CONFIG_CB4X4
+ ENTROPY_CONTEXT t_above[4], t_left[4];
+#else
+ ENTROPY_CONTEXT t_above[2], t_left[2];
+#endif // CONFIG_CB4X4
+ int subpelmv = 1, have_ref = 0;
+ const int has_second_rf = has_second_ref(mbmi);
+ const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+#if CONFIG_PVQ
+ od_rollback_buffer pre_buf;
+
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ mbmi->tx_size =
+ xd->lossless[mbmi->segment_id] ? TX_4X4 : max_txsize_rect_lookup[bsize];
+#else
+ mbmi->tx_size = TX_4X4;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ av1_zero(*bsi);
+
+ bsi->segment_rd = best_rd;
+ bsi->ref_mv[0] = best_ref_mv;
+ bsi->ref_mv[1] = second_best_ref_mv;
+ bsi->mvp.as_int = best_ref_mv->as_int;
+ bsi->mvthresh = mvthresh;
+
+ for (idx = 0; idx < 4; ++idx) bsi->modes[idx] = ZEROMV;
+
+#if CONFIG_REF_MV
+ for (idx = 0; idx < 4; ++idx) {
+ for (k = NEARESTMV; k <= NEWMV; ++k) {
+ bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[0].as_int = INVALID_MV;
+ bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[1].as_int = INVALID_MV;
+
+ bsi->rdstat[idx][INTER_OFFSET(k)].mvs[0].as_int = INVALID_MV;
+ bsi->rdstat[idx][INTER_OFFSET(k)].mvs[1].as_int = INVALID_MV;
+ }
+ }
+#endif // CONFIG_REF_MV
+
+ memcpy(t_above, pd->above_context, sizeof(t_above));
+ memcpy(t_left, pd->left_context, sizeof(t_left));
+
+  // A multiplier of 64 would make this threshold so large that MVs are
+  // almost never checked on segments; the multiplier of 1 used here makes
+  // the MV threshold roughly equal to the macroblock-level threshold.
+  label_mv_thresh = 1 * bsi->mvthresh / label_count;
+
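+  // The 8x8 block is scanned as a 2x2 grid of 4x4 labels; striding idy/idx
+  // by num_4x4_blocks_high/wide skips label positions already covered by the
+  // current partition (e.g. an 8x4 partition spans both columns of a row, so
+  // idx advances by 2).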
+ // Segmentation method overheads
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+      // TODO(jingning,rbultje): rewrite the rate-distortion optimization
+      // loop for 4x4/4x8/8x4 block coding, to be replaced with a new RD loop.
+ int_mv mode_mv[MB_MODE_COUNT][2];
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+ PREDICTION_MODE mode_selected = ZEROMV;
+ int64_t new_best_rd = INT64_MAX;
+ const int index = idy * 2 + idx;
+ int ref;
+#if CONFIG_REF_MV
+ CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE];
+ uint8_t ref_mv_count[2];
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ int_mv ref_mvs_sub8x8[2][2];
+#endif // CONFIG_EXT_INTER
+#if CONFIG_PVQ
+ od_rollback_buffer idx_buf, post_buf;
+ od_encode_checkpoint(&x->daala_enc, &idx_buf);
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+
+ for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+ const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+#if CONFIG_EXT_INTER
+ int_mv mv_ref_list[MAX_MV_REF_CANDIDATES];
+ av1_update_mv_context(cm, xd, mi, frame, mv_ref_list, index, mi_row,
+ mi_col, NULL);
+#endif // CONFIG_EXT_INTER
+#if CONFIG_GLOBAL_MOTION
+ frame_mv[ZEROMV][frame].as_int =
+ gm_get_motion_vector(&cm->global_motion[frame],
+ cm->allow_high_precision_mv, mbmi->sb_type,
+ mi_col, mi_row, index)
+ .as_int;
+#else // CONFIG_GLOBAL_MOTION
+ frame_mv[ZEROMV][frame].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ av1_append_sub8x8_mvs_for_idx(cm, xd, index, ref, mi_row, mi_col,
+#if CONFIG_REF_MV
+ ref_mv_stack[ref], &ref_mv_count[ref],
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ mv_ref_list,
+#endif // CONFIG_EXT_INTER
+ &frame_mv[NEARESTMV][frame],
+ &frame_mv[NEARMV][frame]);
+
+#if CONFIG_REF_MV
+ tmp_ref_mv[ref] = frame_mv[NEARESTMV][mbmi->ref_frame[ref]];
+ lower_mv_precision(&tmp_ref_mv[ref].as_mv, cm->allow_high_precision_mv);
+ bsi->ref_mv[ref] = &tmp_ref_mv[ref];
+ mbmi_ext->ref_mvs[frame][0] = tmp_ref_mv[ref];
+#endif // CONFIG_REF_MV
+
+#if CONFIG_EXT_INTER
+ mv_ref_list[0].as_int = frame_mv[NEARESTMV][frame].as_int;
+ mv_ref_list[1].as_int = frame_mv[NEARMV][frame].as_int;
+ av1_find_best_ref_mvs(cm->allow_high_precision_mv, mv_ref_list,
+ &ref_mvs_sub8x8[0][ref], &ref_mvs_sub8x8[1][ref]);
+
+ if (has_second_rf) {
+#if CONFIG_GLOBAL_MOTION
+ frame_mv[ZERO_ZEROMV][frame].as_int =
+ gm_get_motion_vector(&cm->global_motion[frame],
+ cm->allow_high_precision_mv, mbmi->sb_type,
+ mi_col, mi_row, index)
+ .as_int;
+#else
+ frame_mv[ZERO_ZEROMV][frame].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ frame_mv[NEAREST_NEARESTMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+
+ if (ref == 0) {
+ frame_mv[NEAREST_NEARMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEAR_NEARESTMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAREST_NEWMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEAR_NEWMV][frame].as_int = frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAR_NEARMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ } else if (ref == 1) {
+ frame_mv[NEAREST_NEARMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAR_NEARESTMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEW_NEARESTMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEW_NEARMV][frame].as_int = frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAR_NEARMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ }
+ }
+#endif // CONFIG_EXT_INTER
+ }
+
+// search for the best motion vector on this segment
+#if CONFIG_EXT_INTER
+ for (this_mode = (has_second_rf ? NEAREST_NEARESTMV : NEARESTMV);
+ this_mode <= (has_second_rf ? NEW_NEWMV : NEWMV); ++this_mode)
+#else
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode)
+#endif // CONFIG_EXT_INTER
+ {
+ const struct buf_2d orig_src = x->plane[0].src;
+ struct buf_2d orig_pre[2];
+        // This flag controls whether motion estimation will kick off. When
+        // it is set to a non-zero value, the encoder forces motion
+        // estimation.
+ int run_mv_search = 0;
+
+ mode_idx = INTER_OFFSET(this_mode);
+#if CONFIG_EXT_INTER
+ for (ref = 0; ref < 1 + has_second_rf; ++ref)
+ bsi->ref_mv[ref]->as_int = ref_mvs_sub8x8[0][ref].as_int;
+#endif // CONFIG_EXT_INTER
+ bsi->rdstat[index][mode_idx].brdcost = INT64_MAX;
+ if (!(inter_mode_mask & (1 << this_mode))) continue;
+
+#if CONFIG_REF_MV
+ run_mv_search = 2;
+#if !CONFIG_EXT_INTER
+ if (filter_idx > 0 && this_mode == NEWMV) {
+ const BEST_SEG_INFO *ref_bsi = bsi_buf;
+ const SEG_RDSTAT *ref_rdstat = &ref_bsi->rdstat[index][mode_idx];
+
+ update_mv_search_and_seg_mvs(&run_mv_search, seg_mvs[index],
+ has_second_rf, mbmi->ref_frame,
+ ref_rdstat, bsi->ref_mv);
+
+ if (run_mv_search != 0 && filter_idx > 1) {
+ ref_bsi = bsi_buf + 1;
+ ref_rdstat = &ref_bsi->rdstat[index][mode_idx];
+ run_mv_search = 2;
+ update_mv_search_and_seg_mvs(&run_mv_search, seg_mvs[index],
+ has_second_rf, mbmi->ref_frame,
+ ref_rdstat, bsi->ref_mv);
+ }
+ }
+#endif // !CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+
+#if CONFIG_GLOBAL_MOTION
+ if (cm->global_motion[mbmi->ref_frame[0]].wmtype == IDENTITY &&
+ (!has_second_rf ||
+ cm->global_motion[mbmi->ref_frame[1]].wmtype == IDENTITY))
+#endif // CONFIG_GLOBAL_MOTION
+
+ if (!check_best_zero_mv(cpi, mbmi_ext->mode_context,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ frame_mv, this_mode, mbmi->ref_frame, bsize,
+ index, mi_row, mi_col))
+ continue;
+
+ memcpy(orig_pre, pd->pre, sizeof(orig_pre));
+ memcpy(bsi->rdstat[index][mode_idx].ta, t_above,
+ sizeof(bsi->rdstat[index][mode_idx].ta));
+ memcpy(bsi->rdstat[index][mode_idx].tl, t_left,
+ sizeof(bsi->rdstat[index][mode_idx].tl));
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &idx_buf);
+#endif // CONFIG_PVQ
+
+ // motion search for newmv (single predictor case only)
+ if (!has_second_rf &&
+#if CONFIG_EXT_INTER
+ have_newmv_in_inter_mode(this_mode) &&
+ (seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV)
+#else
+ this_mode == NEWMV &&
+ (seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV ||
+ run_mv_search)
+#endif // CONFIG_EXT_INTER
+ ) {
+ int step_param = 0;
+ int bestsme = INT_MAX;
+ int sadpb = x->sadperbit4;
+ MV mvp_full;
+ int max_mv;
+ int cost_list[5];
+ MvLimits tmp_mv_limits = x->mv_limits;
+
+          /* Is the best so far sufficiently good that we can't justify doing
+           * a new motion search? */
+ if (new_best_rd < label_mv_thresh) break;
+
+#if CONFIG_EXT_INTER
+ bsi->mvp.as_int = bsi->ref_mv[0]->as_int;
+#else
+// use previous block's result as next block's MV predictor.
+#if !CONFIG_REF_MV
+ if (index > 0) {
+ bsi->mvp.as_int = mi->bmi[index - 1].as_mv[0].as_int;
+ if (index == 2)
+ bsi->mvp.as_int = mi->bmi[index - 2].as_mv[0].as_int;
+ }
+#endif // !CONFIG_REF_MV
+#endif // CONFIG_EXT_INTER
+ max_mv = (index == 0) ? (int)x->max_mv_context[mbmi->ref_frame[0]]
+ : AOMMAX(abs(bsi->mvp.as_mv.row),
+ abs(bsi->mvp.as_mv.col)) >>
+ 3;
+
+ if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+            // Take a weighted average of the step_params based on the last
+            // frame's max MV magnitude and the best ref MVs of the current
+            // block for the given reference.
+ step_param =
+ (av1_init_search_range(max_mv) + cpi->mv_step_param) / 2;
+ } else {
+ step_param = cpi->mv_step_param;
+ }
+
+#if CONFIG_REF_MV
+ mvp_full.row = bsi->ref_mv[0]->as_mv.row >> 3;
+ mvp_full.col = bsi->ref_mv[0]->as_mv.col >> 3;
+#else
+ mvp_full.row = bsi->mvp.as_mv.row >> 3;
+ mvp_full.col = bsi->mvp.as_mv.col >> 3;
+#endif // CONFIG_REF_MV
+
+ if (cpi->sf.adaptive_motion_search) {
+ mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
+ mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
+ step_param = AOMMAX(step_param, 8);
+ }
+
+ // adjust src pointer for this block
+ mi_buf_shift(x, index);
+
+ av1_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv);
+
+ x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
+
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ bestsme = av1_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, sadpb,
+ cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
+ &bsi->ref_mv[0]->as_mv, INT_MAX, 1);
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (bestsme < INT_MAX) {
+ int distortion;
+ if (cpi->sf.use_upsampled_references) {
+ int best_mv_var;
+ const int try_second =
+ x->second_best_mv.as_int != INVALID_MV &&
+ x->second_best_mv.as_int != x->best_mv.as_int;
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ // Use up-sampled reference frames.
+ struct buf_2d backup_pred = pd->pre[0];
+ const YV12_BUFFER_CONFIG *upsampled_ref =
+ get_upsampled_ref(cpi, mbmi->ref_frame[0]);
+
+ // Set pred for Y plane
+ setup_pred_plane(
+ &pd->pre[0], bsize, upsampled_ref->y_buffer,
+ upsampled_ref->y_crop_width, upsampled_ref->y_crop_height,
+ upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL,
+ pd->subsampling_x, pd->subsampling_y);
+
+ // adjust pred pointer for this block
+ pd->pre[0].buf =
+ &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, index,
+ pd->pre[0].stride))
+ << 3];
+
+ best_mv_var = cpi->find_fractional_mv_step(
+ x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
+ &distortion, &x->pred_sse[mbmi->ref_frame[0]], NULL, pw, ph,
+ 1);
+
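+                // Retry the sub-pel refinement from the second-best full-pel
+                // MV when it differs from the best and lies within the MV
+                // limits, keeping whichever start point yields the lower
+                // variance; this presumably guards against the full-pel
+                // search having settled in the wrong basin.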
+ if (try_second) {
+ int this_var;
+ MV best_mv = x->best_mv.as_mv;
+ const MV ref_mv = bsi->ref_mv[0]->as_mv;
+ const int minc =
+ AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX);
+ const int maxc =
+ AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX);
+ const int minr =
+ AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX);
+ const int maxr =
+ AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX);
+
+ x->best_mv = x->second_best_mv;
+ if (x->best_mv.as_mv.row * 8 <= maxr &&
+ x->best_mv.as_mv.row * 8 >= minr &&
+ x->best_mv.as_mv.col * 8 <= maxc &&
+ x->best_mv.as_mv.col * 8 >= minc) {
+ this_var = cpi->find_fractional_mv_step(
+ x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), x->nmvjointcost,
+ x->mvcost, &distortion, &x->pred_sse[mbmi->ref_frame[0]],
+ NULL, pw, ph, 1);
+ if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
+ x->best_mv.as_mv = best_mv;
+ }
+ }
+
+ // Restore the reference frames.
+ pd->pre[0] = backup_pred;
+ } else {
+ cpi->find_fractional_mv_step(
+ x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
+ &distortion, &x->pred_sse[mbmi->ref_frame[0]], NULL, 0, 0, 0);
+ }
+
+          // Save the motion search result for use in compound prediction.
+          seg_mvs[index][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv;
+ }
+
+ if (cpi->sf.adaptive_motion_search)
+ x->pred_mv[mbmi->ref_frame[0]] = x->best_mv.as_mv;
+
+#if CONFIG_EXT_INTER
+ mode_mv[this_mode][0] = x->best_mv;
+#else
+ mode_mv[NEWMV][0] = x->best_mv;
+#endif // CONFIG_EXT_INTER
+
+ // restore src pointers
+ mi_buf_restore(x, orig_src, orig_pre);
+ }
+
+ if (has_second_rf) {
+          if (seg_mvs[index][mbmi->ref_frame[1]].as_int == INVALID_MV ||
+              seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV)
+ continue;
+ }
+
+#if CONFIG_DUAL_FILTER
+ (void)run_mv_search;
+#endif // CONFIG_DUAL_FILTER
+
+ if (has_second_rf &&
+#if CONFIG_EXT_INTER
+ this_mode == NEW_NEWMV &&
+#else
+ this_mode == NEWMV &&
+#endif // CONFIG_EXT_INTER
+#if CONFIG_DUAL_FILTER
+ (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search))
+#else
+ (mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search))
+#endif // CONFIG_DUAL_FILTER
+ {
+ // adjust src pointers
+ mi_buf_shift(x, index);
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+ int rate_mv;
+ frame_mv[this_mode][mbmi->ref_frame[0]].as_int =
+ seg_mvs[index][mbmi->ref_frame[0]].as_int;
+ frame_mv[this_mode][mbmi->ref_frame[1]].as_int =
+ seg_mvs[index][mbmi->ref_frame[1]].as_int;
+ joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row,
+ mi_col,
+#if CONFIG_EXT_INTER
+ bsi->ref_mv,
+#endif // CONFIG_EXT_INTER
+ &rate_mv, index);
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs[index][0].as_int =
+ frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
+ compound_seg_newmvs[index][1].as_int =
+ frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+#else
+ seg_mvs[index][mbmi->ref_frame[0]].as_int =
+ frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
+ seg_mvs[index][mbmi->ref_frame[1]].as_int =
+ frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+#endif // CONFIG_EXT_INTER
+ }
+ // restore src pointers
+ mi_buf_restore(x, orig_src, orig_pre);
+ }
+
+ bsi->rdstat[index][mode_idx].brate = set_and_cost_bmi_mvs(
+ cpi, x, xd, index, this_mode, mode_mv[this_mode], frame_mv,
+ seg_mvs[index],
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs[index],
+#endif // CONFIG_EXT_INTER
+ bsi->ref_mv, x->nmvjointcost, x->mvcost, mi_row, mi_col);
+
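+        // Mirror the chosen MVs into every label this (possibly rectangular)
+        // sub-block covers: in the 2x2 label grid, index + 1 is the
+        // horizontal neighbor and index + 2 the vertical one.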
+ for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+ bsi->rdstat[index][mode_idx].mvs[ref].as_int =
+ mode_mv[this_mode][ref].as_int;
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].mvs[ref].as_int =
+ mode_mv[this_mode][ref].as_int;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].mvs[ref].as_int =
+ mode_mv[this_mode][ref].as_int;
+#if CONFIG_REF_MV
+ bsi->rdstat[index][mode_idx].pred_mv[ref].as_int =
+ mi->bmi[index].pred_mv[ref].as_int;
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].pred_mv[ref].as_int =
+ mi->bmi[index].pred_mv[ref].as_int;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].pred_mv[ref].as_int =
+ mi->bmi[index].pred_mv[ref].as_int;
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ bsi->rdstat[index][mode_idx].ref_mv[ref].as_int =
+ bsi->ref_mv[ref]->as_int;
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].ref_mv[ref].as_int =
+ bsi->ref_mv[ref]->as_int;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].ref_mv[ref].as_int =
+ bsi->ref_mv[ref]->as_int;
+#endif // CONFIG_EXT_INTER
+ }
+
+ // Trap vectors that reach beyond the UMV borders
+ if (mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][0].as_mv) ||
+ (has_second_rf &&
+ mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][1].as_mv)))
+ continue;
+
+ if (filter_idx > 0) {
+ BEST_SEG_INFO *ref_bsi = bsi_buf;
+ subpelmv = 0;
+ have_ref = 1;
+
+ for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+ subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(this_mode))
+ have_ref &=
+ ((mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int) &&
+ (bsi->ref_mv[ref]->as_int ==
+ ref_bsi->rdstat[index][mode_idx].ref_mv[ref].as_int));
+ else
+#endif // CONFIG_EXT_INTER
+ have_ref &= mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int;
+ }
+
+ have_ref &= ref_bsi->rdstat[index][mode_idx].brate > 0;
+
+ if (filter_idx > 1 && !subpelmv && !have_ref) {
+ ref_bsi = bsi_buf + 1;
+ have_ref = 1;
+ for (ref = 0; ref < 1 + has_second_rf; ++ref)
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(this_mode))
+ have_ref &=
+ ((mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int) &&
+ (bsi->ref_mv[ref]->as_int ==
+ ref_bsi->rdstat[index][mode_idx].ref_mv[ref].as_int));
+ else
+#endif // CONFIG_EXT_INTER
+ have_ref &= mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int;
+
+ have_ref &= ref_bsi->rdstat[index][mode_idx].brate > 0;
+ }
+
+ if (!subpelmv && have_ref &&
+ ref_bsi->rdstat[index][mode_idx].brdcost < INT64_MAX) {
+#if CONFIG_REF_MV
+ bsi->rdstat[index][mode_idx].byrate =
+ ref_bsi->rdstat[index][mode_idx].byrate;
+ bsi->rdstat[index][mode_idx].bdist =
+ ref_bsi->rdstat[index][mode_idx].bdist;
+ bsi->rdstat[index][mode_idx].bsse =
+ ref_bsi->rdstat[index][mode_idx].bsse;
+ bsi->rdstat[index][mode_idx].brate +=
+ ref_bsi->rdstat[index][mode_idx].byrate;
+ bsi->rdstat[index][mode_idx].eobs =
+ ref_bsi->rdstat[index][mode_idx].eobs;
+
+ bsi->rdstat[index][mode_idx].brdcost =
+ RDCOST(x->rdmult, x->rddiv, bsi->rdstat[index][mode_idx].brate,
+ bsi->rdstat[index][mode_idx].bdist);
+
+ memcpy(bsi->rdstat[index][mode_idx].ta,
+ ref_bsi->rdstat[index][mode_idx].ta,
+ sizeof(bsi->rdstat[index][mode_idx].ta));
+ memcpy(bsi->rdstat[index][mode_idx].tl,
+ ref_bsi->rdstat[index][mode_idx].tl,
+ sizeof(bsi->rdstat[index][mode_idx].tl));
+#else
+ memcpy(&bsi->rdstat[index][mode_idx],
+ &ref_bsi->rdstat[index][mode_idx], sizeof(SEG_RDSTAT));
+#endif // CONFIG_REF_MV
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].eobs =
+ ref_bsi->rdstat[index + 1][mode_idx].eobs;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].eobs =
+ ref_bsi->rdstat[index + 2][mode_idx].eobs;
+
+ if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) {
+#if CONFIG_REF_MV
+                // If the NEWMV mode is using the same motion vector as the
+                // NEARESTMV mode, skip the remaining rate-distortion
+                // calculations and use the inferred motion vector modes.
+ if (this_mode == NEWMV) {
+ if (has_second_rf) {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int &&
+ bsi->rdstat[index][mode_idx].mvs[1].as_int ==
+ bsi->ref_mv[1]->as_int)
+ continue;
+ } else {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int)
+ continue;
+ }
+ }
+#endif // CONFIG_REF_MV
+ mode_selected = this_mode;
+ new_best_rd = bsi->rdstat[index][mode_idx].brdcost;
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+ }
+ continue;
+ }
+ }
+
+ bsi->rdstat[index][mode_idx].brdcost = encode_inter_mb_segment_sub8x8(
+ cpi, x, bsi->segment_rd - this_segment_rd, index,
+ &bsi->rdstat[index][mode_idx].byrate,
+ &bsi->rdstat[index][mode_idx].bdist,
+ &bsi->rdstat[index][mode_idx].bsse, bsi->rdstat[index][mode_idx].ta,
+ bsi->rdstat[index][mode_idx].tl, idy, idx, mi_row, mi_col);
+
+ if (bsi->rdstat[index][mode_idx].brdcost < INT64_MAX) {
+ bsi->rdstat[index][mode_idx].brdcost += RDCOST(
+ x->rdmult, x->rddiv, bsi->rdstat[index][mode_idx].brate, 0);
+ bsi->rdstat[index][mode_idx].brate +=
+ bsi->rdstat[index][mode_idx].byrate;
+ bsi->rdstat[index][mode_idx].eobs = p->eobs[index];
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].eobs = p->eobs[index + 1];
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].eobs = p->eobs[index + 2];
+ }
+
+ if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) {
+#if CONFIG_REF_MV
+        // If the NEWMV mode is using the same motion vector as the
+        // NEARESTMV mode, skip the remaining rate-distortion calculations
+        // and use the inferred motion vector modes.
+ if (this_mode == NEWMV) {
+ if (has_second_rf) {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int &&
+ bsi->rdstat[index][mode_idx].mvs[1].as_int ==
+ bsi->ref_mv[1]->as_int)
+ continue;
+ } else {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int)
+ continue;
+ }
+ }
+#endif // CONFIG_REF_MV
+ mode_selected = this_mode;
+ new_best_rd = bsi->rdstat[index][mode_idx].brdcost;
+
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+ }
+ } /*for each 4x4 mode*/
+
+ if (new_best_rd == INT64_MAX) {
+ int iy, midx;
+ for (iy = index + 1; iy < 4; ++iy)
+#if CONFIG_EXT_INTER
+ for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx)
+#else
+ for (midx = 0; midx < INTER_MODES; ++midx)
+#endif // CONFIG_EXT_INTER
+ bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+ bsi->segment_rd = INT64_MAX;
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+ return INT64_MAX;
+ }
+
+ mode_idx = INTER_OFFSET(mode_selected);
+ memcpy(t_above, bsi->rdstat[index][mode_idx].ta, sizeof(t_above));
+ memcpy(t_left, bsi->rdstat[index][mode_idx].tl, sizeof(t_left));
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+
+#if CONFIG_EXT_INTER
+ bsi->ref_mv[0]->as_int = bsi->rdstat[index][mode_idx].ref_mv[0].as_int;
+ if (has_second_rf)
+ bsi->ref_mv[1]->as_int = bsi->rdstat[index][mode_idx].ref_mv[1].as_int;
+#endif // CONFIG_EXT_INTER
+ set_and_cost_bmi_mvs(cpi, x, xd, index, mode_selected,
+ mode_mv[mode_selected], frame_mv, seg_mvs[index],
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs[index],
+#endif // CONFIG_EXT_INTER
+ bsi->ref_mv, x->nmvjointcost, x->mvcost, mi_row,
+ mi_col);
+
+ br += bsi->rdstat[index][mode_idx].brate;
+ bd += bsi->rdstat[index][mode_idx].bdist;
+ block_sse += bsi->rdstat[index][mode_idx].bsse;
+ segmentyrate += bsi->rdstat[index][mode_idx].byrate;
+ this_segment_rd += bsi->rdstat[index][mode_idx].brdcost;
+
+ if (this_segment_rd > bsi->segment_rd) {
+ int iy, midx;
+ for (iy = index + 1; iy < 4; ++iy)
+#if CONFIG_EXT_INTER
+ for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx)
+#else
+ for (midx = 0; midx < INTER_MODES; ++midx)
+#endif // CONFIG_EXT_INTER
+ bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+ bsi->segment_rd = INT64_MAX;
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+ return INT64_MAX;
+ }
+ }
+ } /* for each label */
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+
+ bsi->r = br;
+ bsi->d = bd;
+ bsi->segment_yrate = segmentyrate;
+ bsi->segment_rd = this_segment_rd;
+ bsi->sse = block_sse;
+
+ // update the coding decisions
+ for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode;
+
+#if CONFIG_DAALA_DIST
+  // Compute the prediction (i.e. skip) and decoded distortion using the
+  // Daala distortion metric.
+ {
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ uint8_t *src = p->src.buf;
+ uint8_t *dst = pd->dst.buf;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
+    int use_activity_masking = 0;  // Overridden below when CONFIG_PVQ is on.
+ const int qm = OD_HVS_QM;
+ const int bsw = block_size_wide[plane_bsize];
+ const int bsh = block_size_high[plane_bsize];
+ int64_t rd1, rd2;
+ int64_t daala_sse, daala_dist;
+ TX_SIZE tx_size = mbmi->tx_size;
+
+#if CONFIG_HIGHBITDEPTH
+ uint8_t *recon_8x8;
+ DECLARE_ALIGNED(16, uint16_t, recon16[8 * 8]);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ recon_8x8 = CONVERT_TO_BYTEPTR(recon16);
+ else
+ recon_8x8 = (uint8_t *)recon16;
+#else
+ DECLARE_ALIGNED(16, uint8_t, recon_8x8[8 * 8]);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_PVQ
+ use_activity_masking = x->daala_enc.use_activity_masking;
+#endif // CONFIG_PVQ
+
+    // For each sub8x8 prediction block within the 8x8 block
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ int i = idy * 2 + idx;
+ const uint8_t *const src_sub8x8 =
+ src + av1_raster_block_offset(BLOCK_8X8, i, p->src.stride);
+ uint8_t *const dst_sub8x8 =
+ dst + av1_raster_block_offset(BLOCK_8X8, i, pd->dst.stride);
+ uint8_t *recon_sub8x8 = recon_8x8 + (idy * 8 + idx) * 4;
+ const int txb_width = max_block_wide(xd, plane_bsize, 0);
+ const int txb_height = max_block_high(xd, plane_bsize, 0);
+ int idx_, idy_;
+
+ av1_build_inter_predictor_sub8x8(xd, 0, i, idy, idx, mi_row, mi_col);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_subtract_block(
+              bsh, bsw,
+ av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+ src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride, xd->bd);
+ } else {
+ aom_subtract_block(
+              bsh, bsw,
+ av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+ src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride);
+ }
+#else
+ aom_subtract_block(
+ bsh, bsw, av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_convolve_copy(dst_sub8x8, dst_stride, recon_sub8x8, 8,
+ NULL, 0, NULL, 0, bsw, bsh, xd->bd);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ aom_convolve_copy(dst_sub8x8, dst_stride, recon_sub8x8, 8, NULL, 0,
+ NULL, 0, bsw, bsh);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ // To get decoded pixels, apply the 4x4 transform and quantization to each
+ // 4x4 block in the sub8x8 prediction block. Because the remaining parts of
+ // the sub8x8 inter-mode RD search assume pd->dst stores predicted pixels,
+ // store the decoded pixels in a local buffer instead.
+ for (idy_ = 0; idy_ < txb_height; idy_++) {
+ for (idx_ = 0; idx_ < txb_width; idx_++) {
+ int coeff_ctx = 0;
+ const tran_low_t *dqcoeff;
+ uint16_t eob;
+ const PLANE_TYPE plane_type = PLANE_TYPE_Y;
+ uint8_t *recon_4x4 = recon_sub8x8 + (idy_ * 8 + idx_) * 4;
+ const int block_raster_idx = (idy + idy_) * 2 + (idx + idx_);
+ const int block =
+ av1_raster_order_to_block_index(tx_size, block_raster_idx);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+
+ dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ av1_xform_quant(cm, x, 0, block, idy + idy_, idx + idx_, BLOCK_8X8,
+ tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)
+ av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
+
+ eob = p->eobs[block];
+ av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size,
+ recon_4x4, 8, eob);
+ }
+ }
+ }
+ }
+ // Compute the daala-distortion for an 8x8 block
+ daala_sse = av1_daala_dist(src, src_stride, pd->dst.buf, dst_stride, 8, 8,
+ qm, use_activity_masking, x->qindex)
+ << 4;
+
+ daala_dist = av1_daala_dist(src, src_stride, recon_8x8, 8, 8, 8, qm,
+ use_activity_masking, x->qindex)
+ << 4;
+
+ bsi->sse = daala_sse;
+ bsi->d = daala_dist;
+
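+ // rd1 prices coding the residual (rate bsi->r, distortion bsi->d); rd2
+ // models the skip path (zero rate, distortion equal to the prediction SSE).
+ // The segment RD is bounded by the cheaper of the two.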
+ rd1 = RDCOST(x->rdmult, x->rddiv, bsi->r, bsi->d);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, bsi->sse);
+ bsi->segment_rd = AOMMIN(rd1, rd2);
+ }
+#endif // CONFIG_DAALA_DIST
+
+ if (bsi->segment_rd > best_rd) return INT64_MAX;
+ /* set it to the best */
+ for (idx = 0; idx < 4; idx++) {
+ mode_idx = INTER_OFFSET(bsi->modes[idx]);
+ mi->bmi[idx].as_mv[0].as_int = bsi->rdstat[idx][mode_idx].mvs[0].as_int;
+ if (has_second_ref(mbmi))
+ mi->bmi[idx].as_mv[1].as_int = bsi->rdstat[idx][mode_idx].mvs[1].as_int;
+#if CONFIG_REF_MV
+ mi->bmi[idx].pred_mv[0] = bsi->rdstat[idx][mode_idx].pred_mv[0];
+ if (has_second_ref(mbmi))
+ mi->bmi[idx].pred_mv[1] = bsi->rdstat[idx][mode_idx].pred_mv[1];
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ mi->bmi[idx].ref_mv[0].as_int = bsi->rdstat[idx][mode_idx].ref_mv[0].as_int;
+ if (has_second_rf)
+ mi->bmi[idx].ref_mv[1].as_int =
+ bsi->rdstat[idx][mode_idx].ref_mv[1].as_int;
+#endif // CONFIG_EXT_INTER
+ x->plane[0].eobs[idx] = bsi->rdstat[idx][mode_idx].eobs;
+ mi->bmi[idx].as_mode = bsi->modes[idx];
+ }
+
+ /* used to set mbmi->mv.as_int */
+ *returntotrate = bsi->r;
+ *returndistortion = bsi->d;
+ *returnyrate = bsi->segment_yrate;
+ *skippable = av1_is_skippable_in_plane(x, BLOCK_8X8, 0);
+ *psse = bsi->sse;
+ mbmi->mode = bsi->modes[3];
+
+ return bsi->segment_rd;
+}
+
+static void estimate_ref_frame_costs(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd, int segment_id,
+ unsigned int *ref_costs_single,
+ unsigned int *ref_costs_comp,
+ aom_prob *comp_mode_p) {
+ int seg_ref_active =
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ if (seg_ref_active) {
+ memset(ref_costs_single, 0,
+ TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_single));
+ memset(ref_costs_comp, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_comp));
+ *comp_mode_p = 128;
+ } else {
+ aom_prob intra_inter_p = av1_get_intra_inter_prob(cm, xd);
+ aom_prob comp_inter_p = 128;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ comp_inter_p = av1_get_reference_mode_prob(cm, xd);
+ *comp_mode_p = comp_inter_p;
+ } else {
+ *comp_mode_p = 128;
+ }
+
+ ref_costs_single[INTRA_FRAME] = av1_cost_bit(intra_inter_p, 0);
+
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ aom_prob ref_single_p1 = av1_get_pred_prob_single_ref_p1(cm, xd);
+ aom_prob ref_single_p2 = av1_get_pred_prob_single_ref_p2(cm, xd);
+#if CONFIG_EXT_REFS
+ aom_prob ref_single_p3 = av1_get_pred_prob_single_ref_p3(cm, xd);
+ aom_prob ref_single_p4 = av1_get_pred_prob_single_ref_p4(cm, xd);
+ aom_prob ref_single_p5 = av1_get_pred_prob_single_ref_p5(cm, xd);
+#endif // CONFIG_EXT_REFS
+
+ unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
+
+ ref_costs_single[LAST_FRAME] =
+#if CONFIG_EXT_REFS
+ ref_costs_single[LAST2_FRAME] = ref_costs_single[LAST3_FRAME] =
+ ref_costs_single[BWDREF_FRAME] =
+#endif // CONFIG_EXT_REFS
+ ref_costs_single[GOLDEN_FRAME] =
+ ref_costs_single[ALTREF_FRAME] = base_cost;
+
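+ // Each reference then accumulates the cost of the bits along its path down
+ // the single-reference prediction tree; the tree shape depends on whether
+ // CONFIG_EXT_REFS is enabled.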
+#if CONFIG_EXT_REFS
+ ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
+ ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p1, 0);
+ ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p1, 0);
+ ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 0);
+ ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
+ ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
+
+ ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p3, 0);
+ ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p3, 0);
+ ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p3, 1);
+ ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p3, 1);
+
+ ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p2, 0);
+ ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
+
+ ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p4, 0);
+ ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p4, 1);
+
+ ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p5, 0);
+ ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p5, 1);
+#else
+ ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
+ ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1);
+ ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
+
+ ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p2, 0);
+ ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
+#endif // CONFIG_EXT_REFS
+ } else {
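+ // Compound-only coding: single-reference costs are charged a nominal flat
+ // value (512 is one bit in the 9-bit fixed-point cost scale, i.e.
+ // av1_cost_bit(128, b), assuming AV1_PROB_COST_SHIFT == 9).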
+ ref_costs_single[LAST_FRAME] = 512;
+#if CONFIG_EXT_REFS
+ ref_costs_single[LAST2_FRAME] = 512;
+ ref_costs_single[LAST3_FRAME] = 512;
+ ref_costs_single[BWDREF_FRAME] = 512;
+#endif // CONFIG_EXT_REFS
+ ref_costs_single[GOLDEN_FRAME] = 512;
+ ref_costs_single[ALTREF_FRAME] = 512;
+ }
+
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ aom_prob ref_comp_p = av1_get_pred_prob_comp_ref_p(cm, xd);
+#if CONFIG_EXT_REFS
+ aom_prob ref_comp_p1 = av1_get_pred_prob_comp_ref_p1(cm, xd);
+ aom_prob ref_comp_p2 = av1_get_pred_prob_comp_ref_p2(cm, xd);
+ aom_prob bwdref_comp_p = av1_get_pred_prob_comp_bwdref_p(cm, xd);
+#endif // CONFIG_EXT_REFS
+
+ unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
+
+ ref_costs_comp[LAST_FRAME] =
+#if CONFIG_EXT_REFS
+ ref_costs_comp[LAST2_FRAME] = ref_costs_comp[LAST3_FRAME] =
+#endif // CONFIG_EXT_REFS
+ ref_costs_comp[GOLDEN_FRAME] = base_cost;
+
+#if CONFIG_EXT_REFS
+ ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF_FRAME] = 0;
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
+ ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0);
+ ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1);
+ ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
+
+ ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1);
+ ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0);
+
+ ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0);
+ ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1);
+
+ // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1
+ // more bit.
+ ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
+ ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1);
+#else
+ ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
+ ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
+#endif // CONFIG_EXT_REFS
+ } else {
+ ref_costs_comp[LAST_FRAME] = 512;
+#if CONFIG_EXT_REFS
+ ref_costs_comp[LAST2_FRAME] = 512;
+ ref_costs_comp[LAST3_FRAME] = 512;
+ ref_costs_comp[BWDREF_FRAME] = 512;
+ ref_costs_comp[ALTREF_FRAME] = 512;
+#endif // CONFIG_EXT_REFS
+ ref_costs_comp[GOLDEN_FRAME] = 512;
+ }
+ }
+}
+
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+ int mode_index,
+ int64_t comp_pred_diff[REFERENCE_MODES],
+ int skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ // Take a snapshot of the coding context so it can be
+ // restored if we decide to encode this way
+ ctx->skip = x->skip;
+ ctx->skippable = skippable;
+ ctx->best_mode_index = mode_index;
+ ctx->mic = *xd->mi[0];
+ ctx->mbmi_ext = *x->mbmi_ext;
+ ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
+ ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
+ ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
+}
+
+static void setup_buffer_inter(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ BLOCK_SIZE block_size, int mi_row, int mi_col,
+ int_mv frame_nearest_mv[TOTAL_REFS_PER_FRAME],
+ int_mv frame_near_mv[TOTAL_REFS_PER_FRAME],
+ struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]) {
+ const AV1_COMMON *cm = &cpi->common;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
+ const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+
+ assert(yv12 != NULL);
+
+ // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
+ // use the UV scaling factors.
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
+
+ // Gets an initial list of candidate vectors from neighbours and orders them
+ av1_find_mv_refs(
+ cm, xd, mi, ref_frame,
+#if CONFIG_REF_MV
+ &mbmi_ext->ref_mv_count[ref_frame], mbmi_ext->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+ candidates, mi_row, mi_col, NULL, NULL, mbmi_ext->mode_context);
+
+ // Candidate refinement carried out at encoder and decoder
+ av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
+ &frame_nearest_mv[ref_frame],
+ &frame_near_mv[ref_frame]);
+
+// Further refinement, carried out on the encoder side only, tests the top few
+// candidates in full and chooses the best as the centre point for subsequent
+// searches. The current implementation doesn't support scaling.
+#if CONFIG_CB4X4
+ av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+ block_size);
+#else
+ if (!av1_is_scaled(sf) && block_size >= BLOCK_8X8)
+ av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+ block_size);
+#endif // CONFIG_CB4X4
+}
+
+static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+#if CONFIG_EXT_INTER
+ int ref_idx,
+#endif // CONFIG_EXT_INTER
+ int *rate_mv) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const AV1_COMMON *cm = &cpi->common;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ int bestsme = INT_MAX;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ MV mvp_full;
+#if CONFIG_EXT_INTER
+ int ref = mbmi->ref_frame[ref_idx];
+#else
+ int ref = mbmi->ref_frame[0];
+ int ref_idx = 0;
+#endif // CONFIG_EXT_INTER
+ MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+
+ MvLimits tmp_mv_limits = x->mv_limits;
+ int cost_list[5];
+
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+
+ MV pred_mv[3];
+ pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+ pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
+ pred_mv[2] = x->pred_mv[ref];
+
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[i] = xd->plane[i].pre[ref_idx];
+
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
+ }
+
+ av1_set_mv_search_range(&x->mv_limits, &ref_mv);
+
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+
+ // Work out the size of the first step in the mv step search.
+ // 0 here is the maximum-length first step; 1 is half the maximum, etc.
+ if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+ // Take a weighted average of the step_param based on the last frame's
+ // max mv magnitude and the one based on the best ref mvs of the current
+ // block for the given reference.
+ step_param =
+ (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) /
+ 2;
+ } else {
+ step_param = cpi->mv_step_param;
+ }
+
+ if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
+ int boffset =
+ 2 * (b_width_log2_lookup[cm->sb_size] -
+ AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+ step_param = AOMMAX(step_param, boffset);
+ }
+
+ if (cpi->sf.adaptive_motion_search) {
+ int bwl = b_width_log2_lookup[bsize];
+ int bhl = b_height_log2_lookup[bsize];
+ int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+ if (tlevel < 5) step_param += 2;
+
+ // prev_mv_sad is not set up for dynamically scaled frames.
+ if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+ int i;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+ if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+ x->pred_mv[ref].row = 0;
+ x->pred_mv[ref].col = 0;
+ x->best_mv.as_int = INVALID_MV;
+
+ if (scaled_ref_frame) {
+ int j;
+ for (j = 0; j < MAX_MB_PLANE; ++j)
+ xd->plane[j].pre[ref_idx] = backup_yv12[j];
+ }
+ return;
+ }
+ }
+ }
+ }
+
+ av1_set_mv_search_range(&x->mv_limits, &ref_mv);
+
+#if CONFIG_MOTION_VAR
+ if (mbmi->motion_mode != SIMPLE_TRANSLATION)
+ mvp_full = mbmi->mv[0].as_mv;
+ else
+#endif // CONFIG_MOTION_VAR
+ mvp_full = pred_mv[x->mv_best_ref_index[ref]];
+
+ mvp_full.col >>= 3;
+ mvp_full.row >>= 3;
+
+ x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
+
+#if CONFIG_MOTION_VAR
+ switch (mbmi->motion_mode) {
+ case SIMPLE_TRANSLATION:
+#endif // CONFIG_MOTION_VAR
+ bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
+ sadpb, cond_cost_list(cpi, cost_list),
+ &ref_mv, INT_MAX, 1);
+#if CONFIG_MOTION_VAR
+ break;
+ case OBMC_CAUSAL:
+ bestsme = av1_obmc_full_pixel_diamond(
+ cpi, x, &mvp_full, step_param, sadpb,
+ MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
+ &(x->best_mv.as_mv), 0);
+ break;
+ default: assert(0 && "Invalid motion mode!");
+ }
+#endif // CONFIG_MOTION_VAR
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (bestsme < INT_MAX) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+#if CONFIG_MOTION_VAR
+ switch (mbmi->motion_mode) {
+ case SIMPLE_TRANSLATION:
+#endif // CONFIG_MOTION_VAR
+ if (cpi->sf.use_upsampled_references) {
+ int best_mv_var;
+ const int try_second = x->second_best_mv.as_int != INVALID_MV &&
+ x->second_best_mv.as_int != x->best_mv.as_int;
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ // Use up-sampled reference frames.
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ struct buf_2d backup_pred = pd->pre[ref_idx];
+ const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+
+ // Set pred for Y plane
+ setup_pred_plane(
+ &pd->pre[ref_idx], bsize, upsampled_ref->y_buffer,
+ upsampled_ref->y_crop_width, upsampled_ref->y_crop_height,
+ upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL,
+ pd->subsampling_x, pd->subsampling_y);
+
+ best_mv_var = cpi->find_fractional_mv_step(
+ x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph,
+ 1);
+
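+ // If a distinct second-best full-pel candidate survives, rerun the subpel
+ // search around it as well and keep whichever result has lower variance.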
+ if (try_second) {
+ const int minc =
+ AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX);
+ const int maxc =
+ AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX);
+ const int minr =
+ AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX);
+ const int maxr =
+ AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX);
+ int this_var;
+ MV best_mv = x->best_mv.as_mv;
+
+ x->best_mv = x->second_best_mv;
+ if (x->best_mv.as_mv.row * 8 <= maxr &&
+ x->best_mv.as_mv.row * 8 >= minr &&
+ x->best_mv.as_mv.col * 8 <= maxc &&
+ x->best_mv.as_mv.col * 8 >= minc) {
+ this_var = cpi->find_fractional_mv_step(
+ x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
+ &dis, &x->pred_sse[ref], NULL, pw, ph, 1);
+ if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
+ x->best_mv.as_mv = best_mv;
+ }
+ }
+
+ // Restore the reference frames.
+ pd->pre[ref_idx] = backup_pred;
+ } else {
+ cpi->find_fractional_mv_step(
+ x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0,
+ 0);
+ }
+#if CONFIG_MOTION_VAR
+ break;
+ case OBMC_CAUSAL:
+ av1_find_best_obmc_sub_pixel_tree_up(
+ cpi, x, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv,
+ cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0,
+ cpi->sf.use_upsampled_references);
+ break;
+ default: assert(0 && "Invalid motion mode!");
+ }
+#endif // CONFIG_MOTION_VAR
+ }
+ *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+
+#if CONFIG_MOTION_VAR
+ if (cpi->sf.adaptive_motion_search && mbmi->motion_mode == SIMPLE_TRANSLATION)
+#else
+ if (cpi->sf.adaptive_motion_search)
+#endif // CONFIG_MOTION_VAR
+ x->pred_mv[ref] = x->best_mv.as_mv;
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[ref_idx] = backup_yv12[i];
+ }
+}
+
+static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst.plane[i];
+ xd->plane[i].dst.stride = dst.stride[i];
+ }
+}
+
+#if CONFIG_EXT_INTER
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+static void do_masked_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const uint8_t *mask, int mask_stride,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int_mv *tmp_mv, int *rate_mv, int ref_idx) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const AV1_COMMON *cm = &cpi->common;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ int bestsme = INT_MAX;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ MV mvp_full;
+ int ref = mbmi->ref_frame[ref_idx];
+ MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+
+ MvLimits tmp_mv_limits = x->mv_limits;
+
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+ int i;
+
+ MV pred_mv[3];
+ pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+ pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
+ pred_mv[2] = x->pred_mv[ref];
+
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+
+ if (scaled_ref_frame) {
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[i] = xd->plane[i].pre[ref_idx];
+
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
+ }
+
+ av1_set_mv_search_range(&x->mv_limits, &ref_mv);
+
+ // Work out the size of the first step in the mv step search.
+ // 0 here is the maximum-length first step; 1 is half the maximum, etc.
+ if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+ // Take a weighted average of the step_param based on the last frame's
+ // max mv magnitude and the one based on the best ref mvs of the current
+ // block for the given reference.
+ step_param =
+ (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) /
+ 2;
+ } else {
+ step_param = cpi->mv_step_param;
+ }
+
+ // TODO(debargha): is show_frame needed here?
+ if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size && cm->show_frame) {
+ int boffset =
+ 2 * (b_width_log2_lookup[cm->sb_size] -
+ AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+ step_param = AOMMAX(step_param, boffset);
+ }
+
+ if (cpi->sf.adaptive_motion_search) {
+ int bwl = b_width_log2_lookup[bsize];
+ int bhl = b_height_log2_lookup[bsize];
+ int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+ if (tlevel < 5) step_param += 2;
+
+ // prev_mv_sad is not set up for dynamically scaled frames.
+ if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+ for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+ if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+ x->pred_mv[ref].row = 0;
+ x->pred_mv[ref].col = 0;
+ tmp_mv->as_int = INVALID_MV;
+
+ if (scaled_ref_frame) {
+ int j;
+ for (j = 0; j < MAX_MB_PLANE; ++j)
+ xd->plane[j].pre[ref_idx] = backup_yv12[j];
+ }
+ return;
+ }
+ }
+ }
+ }
+
+ mvp_full = pred_mv[x->mv_best_ref_index[ref]];
+
+ mvp_full.col >>= 3;
+ mvp_full.row >>= 3;
+
+ bestsme = av1_masked_full_pixel_diamond(
+ cpi, x, mask, mask_stride, &mvp_full, step_param, sadpb,
+ MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
+ &tmp_mv->as_mv, ref_idx);
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (bestsme < INT_MAX) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ av1_find_best_masked_sub_pixel_tree_up(
+ cpi, x, mask, mask_stride, mi_row, mi_col, &tmp_mv->as_mv, &ref_mv,
+ cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], ref_idx,
+ cpi->sf.use_upsampled_references);
+ }
+ *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+
+ if (cpi->sf.adaptive_motion_search && cm->show_frame)
+ x->pred_mv[ref] = tmp_mv->as_mv;
+
+ if (scaled_ref_frame) {
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[ref_idx] = backup_yv12[i];
+ }
+}
+
+static void do_masked_motion_search_indexed(
+ const AV1_COMP *const cpi, MACROBLOCK *x,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) {
+ // NOTE: 'which' selects the reference to search: 0 = ref 0 only,
+ // 1 = ref 1 only, 2 = both.
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ BLOCK_SIZE sb_type = mbmi->sb_type;
+ const uint8_t *mask;
+ const int mask_stride = block_size_wide[bsize];
+
+ mask = av1_get_compound_type_mask(comp_data, sb_type);
+
+ if (which == 0 || which == 2)
+ do_masked_motion_search(cpi, x, mask, mask_stride, bsize, mi_row, mi_col,
+ &tmp_mv[0], &rate_mv[0], 0);
+
+ if (which == 1 || which == 2) {
+// get the inverse mask
+#if CONFIG_COMPOUND_SEGMENT
+ uint8_t inv_mask_buf[2 * MAX_SB_SQUARE];
+ const int h = block_size_high[bsize];
+ mask = av1_get_compound_type_mask_inverse(
+ comp_data, inv_mask_buf, h, mask_stride, mask_stride, sb_type);
+#else
+ mask = av1_get_compound_type_mask_inverse(comp_data, sb_type);
+#endif // CONFIG_COMPOUND_SEGMENT
+ do_masked_motion_search(cpi, x, mask, mask_stride, bsize, mi_row, mi_col,
+ &tmp_mv[1], &rate_mv[1], 1);
+ }
+}
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+
+// In some situations we want to discount the apparent cost of a new motion
+// vector. Where there is a subtle motion field and especially where there is
+// low spatial complexity then it can be hard to cover the cost of a new motion
+// vector in a single block, even if that motion vector reduces distortion.
+// However, once established that vector may be usable through the nearest and
+// near mv modes to reduce distortion in subsequent blocks and also improve
+// visual quality.
+static int discount_newmv_test(const AV1_COMP *const cpi, int this_mode,
+ int_mv this_mv,
+ int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME],
+ int ref_frame) {
+ return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) &&
+ (this_mv.as_int != 0) &&
+ ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
+ ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
+}
+
+#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+
+// TODO(jingning): this mv clamping function should be block size dependent.
+static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+ clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+ xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+}
+
+#if CONFIG_EXT_INTER
+#if CONFIG_WEDGE
+static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+ const BLOCK_SIZE bsize, const uint8_t *pred0,
+ int stride0, const uint8_t *pred1, int stride1) {
+ const struct macroblock_plane *const p = &x->plane[0];
+ const uint8_t *src = p->src.buf;
+ int src_stride = p->src.stride;
+ const int f_index = bsize - BLOCK_8X8;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ uint32_t esq[2][4], var;
+ int64_t tl, br;
+
+#if CONFIG_HIGHBITDEPTH
+ if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ pred0 = CONVERT_TO_BYTEPTR(pred0);
+ pred1 = CONVERT_TO_BYTEPTR(pred1);
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ var = cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+ var = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2,
+ stride0, &esq[0][1]);
+ var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
+ pred0 + bh / 2 * stride0, stride0, &esq[0][2]);
+ var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
+ pred0 + bh / 2 * stride0 + bw / 2, stride0,
+ &esq[0][3]);
+ var = cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+ var = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred1 + bw / 2,
+ stride1, &esq[1][1]);
+ var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
+ pred1 + bh / 2 * stride1, stride1, &esq[1][2]);
+ var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
+ pred1 + bh / 2 * stride1 + bw / 2, stride1,
+ &esq[1][3]);
+ (void)var;
+
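+ // Compare the two predictors' per-quadrant SSEs along the top-left /
+ // bottom-right diagonal; the sign of (tl + br) picks the wedge orientation
+ // whose better-fitting predictor covers each half.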
+ tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) -
+ (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]);
+ br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) -
+ (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]);
+ return (tl + br > 0);
+}
+#endif // CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+
+#if !CONFIG_DUAL_FILTER
+static InterpFilter predict_interp_filter(
+ const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize,
+ const int mi_row, const int mi_col,
+ InterpFilter (*single_filter)[TOTAL_REFS_PER_FRAME]) {
+ InterpFilter best_filter = SWITCHABLE;
+ const AV1_COMMON *cm = &cpi->common;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ int bsl = mi_width_log2_lookup[bsize];
+ int pred_filter_search =
+ cpi->sf.cb_pred_filter_search
+ ? (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1
+ : 0;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const int this_mode = mbmi->mode;
+ int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ if (pred_filter_search) {
+ InterpFilter af = SWITCHABLE, lf = SWITCHABLE;
+ if (xd->up_available) af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+ if (xd->left_available) lf = xd->mi[-1]->mbmi.interp_filter;
+
+#if CONFIG_EXT_INTER
+ if ((this_mode != NEWMV && this_mode != NEW_NEWMV) || (af == lf))
+#else
+ if ((this_mode != NEWMV) || (af == lf))
+#endif // CONFIG_EXT_INTER
+ best_filter = af;
+ }
+ if (is_comp_pred) {
+ if (cpi->sf.adaptive_mode_search) {
+#if CONFIG_EXT_INTER
+ switch (this_mode) {
+ case NEAREST_NEARESTMV:
+ if (single_filter[NEARESTMV][refs[0]] ==
+ single_filter[NEARESTMV][refs[1]])
+ best_filter = single_filter[NEARESTMV][refs[0]];
+ break;
+ case NEAREST_NEARMV:
+ if (single_filter[NEARESTMV][refs[0]] ==
+ single_filter[NEARMV][refs[1]])
+ best_filter = single_filter[NEARESTMV][refs[0]];
+ break;
+ case NEAR_NEARESTMV:
+ if (single_filter[NEARMV][refs[0]] ==
+ single_filter[NEARESTMV][refs[1]])
+ best_filter = single_filter[NEARMV][refs[0]];
+ break;
+ case NEAR_NEARMV:
+ if (single_filter[NEARMV][refs[0]] == single_filter[NEARMV][refs[1]])
+ best_filter = single_filter[NEARMV][refs[0]];
+ break;
+ case ZERO_ZEROMV:
+ if (single_filter[ZEROMV][refs[0]] == single_filter[ZEROMV][refs[1]])
+ best_filter = single_filter[ZEROMV][refs[0]];
+ break;
+ case NEW_NEWMV:
+ if (single_filter[NEWMV][refs[0]] == single_filter[NEWMV][refs[1]])
+ best_filter = single_filter[NEWMV][refs[0]];
+ break;
+ case NEAREST_NEWMV:
+ if (single_filter[NEARESTMV][refs[0]] ==
+ single_filter[NEWMV][refs[1]])
+ best_filter = single_filter[NEARESTMV][refs[0]];
+ break;
+ case NEAR_NEWMV:
+ if (single_filter[NEARMV][refs[0]] == single_filter[NEWMV][refs[1]])
+ best_filter = single_filter[NEARMV][refs[0]];
+ break;
+ case NEW_NEARESTMV:
+ if (single_filter[NEWMV][refs[0]] ==
+ single_filter[NEARESTMV][refs[1]])
+ best_filter = single_filter[NEWMV][refs[0]];
+ break;
+ case NEW_NEARMV:
+ if (single_filter[NEWMV][refs[0]] == single_filter[NEARMV][refs[1]])
+ best_filter = single_filter[NEWMV][refs[0]];
+ break;
+ default:
+ if (single_filter[this_mode][refs[0]] ==
+ single_filter[this_mode][refs[1]])
+ best_filter = single_filter[this_mode][refs[0]];
+ break;
+ }
+#else
+ if (single_filter[this_mode][refs[0]] ==
+ single_filter[this_mode][refs[1]])
+ best_filter = single_filter[this_mode][refs[0]];
+#endif // CONFIG_EXT_INTER
+ }
+ }
+ if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
+ best_filter = EIGHTTAP_REGULAR;
+ }
+ return best_filter;
+}
+#endif // !CONFIG_DUAL_FILTER
+
+#if CONFIG_EXT_INTER
+// Choose the best wedge index and sign
+#if CONFIG_WEDGE
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const uint8_t *const p0,
+ const uint8_t *const p1, int *const best_wedge_sign,
+ int *const best_wedge_index) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int wedge_index;
+ int wedge_sign;
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ const uint8_t *mask;
+ uint64_t sse;
+#if CONFIG_HIGHBITDEPTH
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+#else
+ const int bd_round = 0;
+#endif // CONFIG_HIGHBITDEPTH
+
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+
+ int64_t sign_limit;
+
+#if CONFIG_HIGHBITDEPTH
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else // NOLINT
+#endif // CONFIG_HIGHBITDEPTH
+ {
+ aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
+ aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+ }
+
+ sign_limit = ((int64_t)aom_sum_squares_i16(r0, N) -
+ (int64_t)aom_sum_squares_i16(r1, N)) *
+ (1 << WEDGE_WEIGHT_BITS) / 2;
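+ // sign_limit is the energy difference of the two single-prediction
+ // residuals, pre-scaled to wedge-weight precision; it serves as the
+ // threshold for av1_wedge_sign_from_residuals below.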
+
+ if (N < 64)
+ av1_wedge_compute_delta_squares_c(ds, r0, r1, N);
+ else
+ av1_wedge_compute_delta_squares(ds, r0, r1, N);
+
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
+
+ // TODO(jingning): Make sse2 functions support N = 16 case
+ if (N < 64)
+ wedge_sign = av1_wedge_sign_from_residuals_c(ds, mask, N, sign_limit);
+ else
+ wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ if (N < 64)
+ sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
+ else
+ sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ *best_wedge_sign = wedge_sign;
+ best_rd = rd;
+ }
+ }
+
+ return best_rd;
+}
+
+// Choose the best wedge index for the specified sign
+static int64_t pick_wedge_fixed_sign(
+ const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1,
+ const int wedge_sign, int *const best_wedge_index) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int wedge_index;
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ const uint8_t *mask;
+ uint64_t sse;
+#if CONFIG_HIGHBITDEPTH
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+#else
+ const int bd_round = 0;
+#endif // CONFIG_HIGHBITDEPTH
+
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
+
+#if CONFIG_HIGHBITDEPTH
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else // NOLINT
+#endif // CONFIG_HIGHBITDEPTH
+ {
+ aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+ }
+
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ if (N < 64)
+ sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
+ else
+ sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ best_rd = rd;
+ }
+ }
+
+ return best_rd;
+}
+
+static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int bw = block_size_wide[bsize];
+
+ int64_t rd;
+ int wedge_index = -1;
+ int wedge_sign = 0;
+
+ assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+
+ if (cpi->sf.fast_wedge_sign_estimate) {
+ wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
+ rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, wedge_sign, &wedge_index);
+ } else {
+ rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index);
+ }
+
+ mbmi->wedge_sign = wedge_sign;
+ mbmi->wedge_index = wedge_index;
+ return rd;
+}
+#endif // CONFIG_WEDGE
+
+#if CONFIG_COMPOUND_SEGMENT
+static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ int rate;
+ uint64_t sse;
+ int64_t dist;
+ int64_t rd0;
+ SEG_MASK_TYPE cur_mask_type;
+ int64_t best_rd = INT64_MAX;
+ SEG_MASK_TYPE best_mask_type = 0;
+#if CONFIG_HIGHBITDEPTH
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+#else
+ const int bd_round = 0;
+#endif // CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
+
+#if CONFIG_HIGHBITDEPTH
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else // NOLINT
+#endif // CONFIG_HIGHBITDEPTH
+ {
+ aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
+ aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+ }
+
+ // try each mask type and its inverse
+ for (cur_mask_type = 0; cur_mask_type < SEG_MASK_TYPES; cur_mask_type++) {
+// build mask and inverse
+#if CONFIG_HIGHBITDEPTH
+ if (hbd)
+ build_compound_seg_mask_highbd(
+ xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
+ CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ build_compound_seg_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, bw,
+ bsize, bh, bw);
+
+ // compute rd for mask
+ sse = av1_wedge_sse_from_residuals(r1, d10, xd->seg_mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ rd0 = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+ if (rd0 < best_rd) {
+ best_mask_type = cur_mask_type;
+ best_rd = rd0;
+ }
+ }
+
+ // make final mask
+ mbmi->mask_type = best_mask_type;
+#if CONFIG_HIGHBITDEPTH
+ if (hbd)
+ build_compound_seg_mask_highbd(
+ xd->seg_mask, mbmi->mask_type, CONVERT_TO_BYTEPTR(p0), bw,
+ CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ build_compound_seg_mask(xd->seg_mask, mbmi->mask_type, p0, bw, p1, bw,
+ bsize, bh, bw);
+
+ return best_rd;
+}
+#endif // CONFIG_COMPOUND_SEGMENT
+
+#if CONFIG_WEDGE && CONFIG_INTERINTRA
+static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+ int64_t rd;
+ int wedge_index = -1;
+
+ assert(is_interintra_wedge_used(bsize));
+
+ rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index);
+
+ mbmi->interintra_wedge_sign = 0;
+ mbmi->interintra_wedge_index = wedge_index;
+ return rd;
+}
+#endif // CONFIG_WEDGE && CONFIG_INTERINTRA
+
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1) {
+ const COMPOUND_TYPE compound_type =
+ x->e_mbd.mi[0]->mbmi.interinter_compound_type;
+ switch (compound_type) {
+#if CONFIG_WEDGE
+ case COMPOUND_WEDGE: return pick_interinter_wedge(cpi, x, bsize, p0, p1);
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ case COMPOUND_SEG: return pick_interinter_seg(cpi, x, bsize, p0, p1);
+#endif // CONFIG_COMPOUND_SEGMENT
+ default: assert(0); return 0;
+ }
+}
+
+static int interinter_compound_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ const BLOCK_SIZE bsize,
+ const int this_mode, int mi_row,
+ int mi_col) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int_mv tmp_mv[2];
+ int rate_mvs[2], tmp_rate_mv = 0;
+ const INTERINTER_COMPOUND_DATA compound_data = {
+#if CONFIG_WEDGE
+ mbmi->wedge_index,
+ mbmi->wedge_sign,
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ mbmi->mask_type,
+ xd->seg_mask,
+#endif // CONFIG_COMPOUND_SEGMENT
+ mbmi->interinter_compound_type
+ };
+ if (this_mode == NEW_NEWMV) {
+ do_masked_motion_search_indexed(cpi, x, &compound_data, bsize, mi_row,
+ mi_col, tmp_mv, rate_mvs, 2);
+ tmp_rate_mv = rate_mvs[0] + rate_mvs[1];
+ mbmi->mv[0].as_int = tmp_mv[0].as_int;
+ mbmi->mv[1].as_int = tmp_mv[1].as_int;
+ } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
+ do_masked_motion_search_indexed(cpi, x, &compound_data, bsize, mi_row,
+ mi_col, tmp_mv, rate_mvs, 0);
+ tmp_rate_mv = rate_mvs[0];
+ mbmi->mv[0].as_int = tmp_mv[0].as_int;
+ } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+ do_masked_motion_search_indexed(cpi, x, &compound_data, bsize, mi_row,
+ mi_col, tmp_mv, rate_mvs, 1);
+ tmp_rate_mv = rate_mvs[1];
+ mbmi->mv[1].as_int = tmp_mv[1].as_int;
+ }
+ return tmp_rate_mv;
+}
+
+static int64_t build_and_cost_compound_type(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize, const int this_mode, int rs2, int rate_mv,
+ BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1,
+ int *strides, int mi_row, int mi_col) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int rate_sum;
+ int64_t dist_sum;
+ int64_t best_rd_cur = INT64_MAX;
+ int64_t rd = INT64_MAX;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+ const COMPOUND_TYPE compound_type = mbmi->interinter_compound_type;
+
+ best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1);
+ best_rd_cur += RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv, 0);
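+ // best_rd_cur now holds the model-based cost of the best mask with the
+ // current MVs; for new-MV modes a masked motion search below may refine it.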
+
+ if (have_newmv_in_inter_mode(this_mode) &&
+ use_masked_motion_search(compound_type)) {
+ *out_rate_mv = interinter_compound_motion_search(cpi, x, bsize, this_mode,
+ mi_row, mi_col);
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, ctx, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv, rs2 + *out_rate_mv + rate_sum, dist_sum);
+ if (rd >= best_rd_cur) {
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ *out_rate_mv = rate_mv;
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0,
+#if CONFIG_SUPERTX
+ 0, 0,
+#endif // CONFIG_SUPERTX
+ preds0, strides, preds1,
+ strides);
+ }
+ av1_subtract_plane(x, bsize, 0);
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv, rs2 + *out_rate_mv + rate_sum, dist_sum);
+ best_rd_cur = rd;
+
+ } else {
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0,
+#if CONFIG_SUPERTX
+ 0, 0,
+#endif // CONFIG_SUPERTX
+ preds0, strides, preds1, strides);
+ av1_subtract_plane(x, bsize, 0);
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum);
+ best_rd_cur = rd;
+ }
+ return best_rd_cur;
+}
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+
+typedef struct {
+#if CONFIG_MOTION_VAR
+ // Inter prediction buffers and respective strides
+ uint8_t *above_pred_buf[MAX_MB_PLANE];
+ int above_pred_stride[MAX_MB_PLANE];
+ uint8_t *left_pred_buf[MAX_MB_PLANE];
+ int left_pred_stride[MAX_MB_PLANE];
+#endif // CONFIG_MOTION_VAR
+ int_mv *single_newmv;
+#if CONFIG_EXT_INTER
+ // Pointer to the array of new-MV rates for each reference (the motion
+ // vectors themselves live in single_newmv above); should point to the
+ // first of the 2 arrays in a 2D array.
+ int *single_newmv_rate;
+ // Pointers to the costs of compound inter-intra and inter-inter predictions
+ int *compmode_interintra_cost;
+ int *compmode_interinter_cost;
+ // Pointer to an array of modelled rate-distortion values; should point to
+ // the first of the 2 arrays in a 2D array.
+ int64_t (*modelled_rd)[TOTAL_REFS_PER_FRAME];
+#endif // CONFIG_EXT_INTER
+ InterpFilter single_filter[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+} HandleInterModeArgs;
+
+static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize,
+ int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME],
+ const int mi_row, const int mi_col,
+ int *const rate_mv, int_mv *const single_newmv,
+ HandleInterModeArgs *const args) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+#if CONFIG_EXT_INTER
+ const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
+#endif // CONFIG_EXT_INTER
+ int_mv *const frame_mv = mode_mv[this_mode];
+ const int refs[2] = { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
+ int i;
+
+ (void)args;
+
+ if (is_comp_pred) {
+#if CONFIG_EXT_INTER
+ for (i = 0; i < 2; ++i) {
+ single_newmv[refs[i]].as_int = args->single_newmv[refs[i]].as_int;
+ }
+
+ if (this_mode == NEW_NEWMV) {
+ frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+ frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+ joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL,
+ rate_mv, 0);
+ } else {
+ *rate_mv = 0;
+ for (i = 0; i < 2; ++i) {
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ *rate_mv += av1_mv_bit_cost(
+ &frame_mv[refs[i]].as_mv, &mbmi_ext->ref_mvs[refs[i]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+ }
+ } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+ frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[1], 1, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+ &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ } else {
+ assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
+ frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+ &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+#else
+ // Initialize mv using single prediction mode result.
+ frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+ frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+ joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, rate_mv, 0);
+ } else {
+ *rate_mv = 0;
+ for (i = 0; i < 2; ++i) {
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ *rate_mv += av1_mv_bit_cost(&frame_mv[refs[i]].as_mv,
+ &mbmi_ext->ref_mvs[refs[i]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+ }
+#endif // CONFIG_EXT_INTER
+ } else {
+#if CONFIG_EXT_INTER
+ if (is_comp_interintra_pred) {
+ x->best_mv = args->single_newmv[refs[0]];
+ *rate_mv = args->single_newmv_rate[refs[0]];
+ } else {
+ single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv);
+ args->single_newmv[refs[0]] = x->best_mv;
+ args->single_newmv_rate[refs[0]] = *rate_mv;
+ }
+#else
+ single_motion_search(cpi, x, bsize, mi_row, mi_col, rate_mv);
+ single_newmv[refs[0]] = x->best_mv;
+#endif // CONFIG_EXT_INTER
+
+ if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
+
+ frame_mv[refs[0]] = x->best_mv;
+ xd->mi[0]->bmi[0].as_mv[0] = x->best_mv;
+
+ // Estimate the rate implications of a new mv, but discount it in certain
+ // circumstances to help initiate a weak motion field: the distortion gain
+ // for a single block may not be enough to overcome the cost of a new mv.
+ if (discount_newmv_test(cpi, this_mode, x->best_mv, mode_mv, refs[0])) {
+ *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1);
+ }
+ }
+
+ return 0;
+}
+
+int64_t interpolation_filter_search(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
+ BUFFER_SET *const orig_dst,
+ InterpFilter (*const single_filter)[TOTAL_REFS_PER_FRAME],
+ int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int i;
+ int tmp_rate;
+ int64_t tmp_dist;
+
+ (void)single_filter;
+
+ InterpFilter assign_filter = SWITCHABLE;
+
+ if (cm->interp_filter == SWITCHABLE) {
+#if !CONFIG_DUAL_FILTER
+ assign_filter = av1_is_interp_needed(xd)
+ ? predict_interp_filter(cpi, x, bsize, mi_row, mi_col,
+ single_filter)
+ : cm->interp_filter;
+#endif // !CONFIG_DUAL_FILTER
+ } else {
+ assign_filter = cm->interp_filter;
+ }
+
+ set_default_interp_filters(mbmi, assign_filter);
+
+ *switchable_rate = av1_get_switchable_rate(cpi, xd);
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist,
+ skip_txfm_sb, skip_sse_sb);
+ *rd = RDCOST(x->rdmult, x->rddiv, *switchable_rate + tmp_rate, tmp_dist);
+
+ if (assign_filter == SWITCHABLE) {
+ // do interp_filter search
+ if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd)) {
+#if CONFIG_DUAL_FILTER
+ const int filter_set_size = DUAL_FILTER_SET_SIZE;
+#else
+ const int filter_set_size = SWITCHABLE_FILTERS;
+#endif // CONFIG_DUAL_FILTER
+ int best_in_temp = 0;
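+ // The filter search ping-pongs predictions between orig_dst and tmp_dst;
+ // best_in_temp tracks which buffer currently holds the best prediction so
+ // only losing candidates are overwritten.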
+#if CONFIG_DUAL_FILTER
+ InterpFilter best_filter[4];
+ av1_copy(best_filter, mbmi->interp_filter);
+#else
+ InterpFilter best_filter = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ restore_dst_buf(xd, *tmp_dst);
+ // EIGHTTAP_REGULAR mode is calculated beforehand
+ for (i = 1; i < filter_set_size; ++i) {
+ int tmp_skip_sb = 0;
+ int64_t tmp_skip_sse = INT64_MAX;
+ int tmp_rs;
+ int64_t tmp_rd;
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = filter_sets[i][0];
+ mbmi->interp_filter[1] = filter_sets[i][1];
+ mbmi->interp_filter[2] = filter_sets[i][0];
+ mbmi->interp_filter[3] = filter_sets[i][1];
+#else
+ mbmi->interp_filter = (InterpFilter)i;
+#endif // CONFIG_DUAL_FILTER
+ tmp_rs = av1_get_switchable_rate(cpi, xd);
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
+ tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rs + tmp_rate, tmp_dist);
+
+ if (tmp_rd < *rd) {
+ *rd = tmp_rd;
+ *switchable_rate = av1_get_switchable_rate(cpi, xd);
+#if CONFIG_DUAL_FILTER
+ av1_copy(best_filter, mbmi->interp_filter);
+#else
+ best_filter = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ *skip_txfm_sb = tmp_skip_sb;
+ *skip_sse_sb = tmp_skip_sse;
+ best_in_temp = !best_in_temp;
+ if (best_in_temp) {
+ restore_dst_buf(xd, *orig_dst);
+ } else {
+ restore_dst_buf(xd, *tmp_dst);
+ }
+ }
+ }
+ if (best_in_temp) {
+ restore_dst_buf(xd, *tmp_dst);
+ } else {
+ restore_dst_buf(xd, *orig_dst);
+ }
+#if CONFIG_DUAL_FILTER
+ av1_copy(mbmi->interp_filter, best_filter);
+#else
+ mbmi->interp_filter = best_filter;
+#endif // CONFIG_DUAL_FILTER
+ } else {
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i)
+ assert(mbmi->interp_filter[i] == EIGHTTAP_REGULAR);
+#else
+ assert(mbmi->interp_filter == EIGHTTAP_REGULAR);
+#endif // CONFIG_DUAL_FILTER
+ }
+ }
+
+ return 0;
+}
+
+// TODO(afergs): Refactor the MBMI references in here - there are four
+// TODO(afergs): Refactor optional args - add them to a struct or remove
+static int64_t motion_mode_rd(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+ int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row,
+ int mi_col, HandleInterModeArgs *const args, const int64_t ref_best_rd,
+ const int *refs, int rate_mv,
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_EXT_INTER
+ int rate2_bmc_nocoeff, MB_MODE_INFO *best_bmc_mbmi,
+#if CONFIG_MOTION_VAR
+ int rate_mv_bmc,
+#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+
+ (void)mode_mv;
+ (void)mi_row;
+ (void)mi_col;
+ (void)args;
+ (void)refs;
+ (void)rate_mv;
+ (void)is_comp_pred;
+ (void)this_mode;
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ MOTION_MODE motion_mode, last_motion_mode_allowed;
+ int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0;
+ RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+ MB_MODE_INFO base_mbmi, best_mbmi;
+#if CONFIG_VAR_TX
+ uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+#endif // CONFIG_VAR_TX
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_WARPED_MOTION
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+#endif // CONFIG_WARPED_MOTION
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ av1_invalid_rd_stats(&best_rd_stats);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+ if (cm->interp_filter == SWITCHABLE) rd_stats->rate += rs;
+#if CONFIG_WARPED_MOTION
+ aom_clear_system_state();
+ mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+#if CONFIG_EXT_INTER
+ best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ rate2_nocoeff = rd_stats->rate;
+ last_motion_mode_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ mi);
+ base_mbmi = *mbmi;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ int64_t best_rd = INT64_MAX;
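+ // Evaluate every allowed motion mode (simple translation, then OBMC and/or
+ // warped motion where enabled) and keep the one with the best full RD cost;
+ // the non-translational modes restart from the best translational result.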
+ for (motion_mode = SIMPLE_TRANSLATION;
+ motion_mode <= last_motion_mode_allowed; motion_mode++) {
+ int64_t tmp_rd = INT64_MAX;
+ int tmp_rate;
+ int64_t tmp_dist;
+#if CONFIG_EXT_INTER
+ int tmp_rate2 =
+ motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff : rate2_nocoeff;
+#else
+ int tmp_rate2 = rate2_nocoeff;
+#endif // CONFIG_EXT_INTER
+
+ *mbmi = base_mbmi;
+ mbmi->motion_mode = motion_mode;
+#if CONFIG_MOTION_VAR
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+#if CONFIG_EXT_INTER
+ *mbmi = *best_bmc_mbmi;
+ mbmi->motion_mode = OBMC_CAUSAL;
+#endif // CONFIG_EXT_INTER
+ if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
+ int tmp_rate_mv = 0;
+
+ single_motion_search(cpi, x, bsize, mi_row, mi_col,
+#if CONFIG_EXT_INTER
+ 0,
+#endif // CONFIG_EXT_INTER
+ &tmp_rate_mv);
+ mbmi->mv[0].as_int = x->best_mv.as_int;
+ if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv,
+ refs[0])) {
+ tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+ }
+#if CONFIG_EXT_INTER
+ tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
+#else
+ tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
+#endif // CONFIG_EXT_INTER
+#if CONFIG_DUAL_FILTER
+ if (!has_subpel_mv_component(xd->mi[0], xd, 0))
+ mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
+ if (!has_subpel_mv_component(xd->mi[0], xd, 1))
+ mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+#endif // CONFIG_DUAL_FILTER
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize);
+#if CONFIG_EXT_INTER
+ } else {
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize);
+#endif // CONFIG_EXT_INTER
+ }
+ av1_build_obmc_inter_prediction(
+ cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
+ args->left_pred_buf, args->left_pred_stride);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, skip_txfm_sb, skip_sse_sb);
+ }
+#endif // CONFIG_MOTION_VAR
+
+#if CONFIG_WARPED_MOTION
+ if (mbmi->motion_mode == WARPED_CAUSAL) {
+#if CONFIG_EXT_INTER
+ *mbmi = *best_bmc_mbmi;
+ mbmi->motion_mode = WARPED_CAUSAL;
+#endif // CONFIG_EXT_INTER
+ mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
+#if CONFIG_DUAL_FILTER
+ for (int dir = 0; dir < 4; ++dir)
+ mbmi->interp_filter[dir] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#else
+ mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+
+ if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
+ mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+ &mbmi->wm_params[0], mi_row, mi_col) == 0) {
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, skip_txfm_sb, skip_sse_sb);
+ } else {
+ continue;
+ }
+ }
+#endif // CONFIG_WARPED_MOTION
+ x->skip = 0;
+
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip = 1;
+ rd_stats->rate = tmp_rate2;
+ if (last_motion_mode_allowed > SIMPLE_TRANSLATION) {
+#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
+ if (last_motion_mode_allowed == WARPED_CAUSAL)
+#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
+ rd_stats->rate += cpi->motion_mode_cost[bsize][mbmi->motion_mode];
+#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
+ else
+ rd_stats->rate += cpi->motion_mode_cost1[bsize][mbmi->motion_mode];
+#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
+ }
+#if CONFIG_WARPED_MOTION
+ if (mbmi->motion_mode == WARPED_CAUSAL) {
+ rd_stats->rate -= rs;
+ }
+#endif // CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (!*skip_txfm_sb) {
+ int64_t rdcosty = INT64_MAX;
+ int is_cost_valid_uv = 0;
+
+ // Measure the cost and distortion of coding the residue.
+ av1_subtract_plane(x, bsize, 0);
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ select_tx_type_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
+ } else {
+ int idx, idy;
+ super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
+ memset(x->blk_skip[0], rd_stats_y->skip,
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+ }
+#else
+ /* clang-format off */
+ super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
+/* clang-format on */
+#endif // CONFIG_VAR_TX
+
+ if (rd_stats_y->rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (mbmi->motion_mode != SIMPLE_TRANSLATION) {
+ continue;
+ } else {
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ restore_dst_buf(xd, *orig_dst);
+ return INT64_MAX;
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+
+ av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+ rdcosty = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist);
+ rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
+/* clang-format off */
+#if CONFIG_VAR_TX
+ is_cost_valid_uv =
+ inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty);
+#else
+ is_cost_valid_uv =
+ super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty);
+#endif // CONFIG_VAR_TX
+ if (!is_cost_valid_uv) {
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ continue;
+#else
+ restore_dst_buf(xd, *orig_dst);
+ return INT64_MAX;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+ /* clang-format on */
+ av1_merge_rd_stats(rd_stats, rd_stats_uv);
+#if CONFIG_RD_DEBUG
+ // Record the transform block coefficient cost.
+ // TODO(angiebird): So far the rd_debug tool only detects discrepancies
+ // in coefficient cost. Therefore it is fine to copy rd_stats into mbmi
+ // here, since the coefficient cost has already been collected. Move
+ // this elsewhere once non-coefficient costs need to be compared too.
+ mbmi->rd_stats = *rd_stats;
+#endif // CONFIG_RD_DEBUG
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
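+ // Decide whether to code the residue or to signal skip: coefficients
+ // are dropped whenever, roughly,
+ //   RDCOST(rate_y + rate_uv + cost(skip = 0), dist) >=
+ //   RDCOST(cost(skip = 1), sse),
+ // i.e. when coding the residue costs more bits than it saves in
+ // distortion (lossless segments always keep the residue).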
+ if (rd_stats->skip) {
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ mbmi->skip = 0;
+ // Here mbmi->skip temporarily plays the role that this_skip2 usually does.
+ } else if (!xd->lossless[mbmi->segment_id] &&
+ (RDCOST(x->rdmult, x->rddiv,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
+ rd_stats->dist) >=
+ RDCOST(x->rdmult, x->rddiv,
+ av1_cost_bit(av1_get_skip_prob(cm, xd), 1),
+ rd_stats->sse))) {
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ rd_stats->dist = rd_stats->sse;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ mbmi->skip = 1;
+ } else {
+ rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ mbmi->skip = 0;
+ }
+ *disable_skip = 0;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ } else {
+ x->skip = 1;
+ *disable_skip = 1;
+ mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1);
+
+ // The cost of the skip bit needs to be added.
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ mbmi->skip = 0;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+
+ rd_stats->dist = *skip_sse_sb;
+ rd_stats->sse = *skip_sse_sb;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->skip = 1;
+ }
+
+#if CONFIG_GLOBAL_MOTION
+ if (this_mode == ZEROMV
+#if CONFIG_EXT_INTER
+ || this_mode == ZERO_ZEROMV
+#endif // CONFIG_EXT_INTER
+ ) {
+ if (is_nontrans_global_motion(xd)) {
+ rd_stats->rate -= rs;
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+ mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#else
+ mbmi->interp_filter = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ }
+ }
+#endif // CONFIG_GLOBAL_MOTION
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ tmp_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist);
+ if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) {
+ best_mbmi = *mbmi;
+ best_rd = tmp_rd;
+ best_rd_stats = *rd_stats;
+ best_rd_stats_y = *rd_stats_y;
+ best_rd_stats_uv = *rd_stats_uv;
+#if CONFIG_VAR_TX
+ for (int i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(best_blk_skip[i], x->blk_skip[i],
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+#endif // CONFIG_VAR_TX
+ best_xskip = x->skip;
+ best_disable_skip = *disable_skip;
+ }
+ }
+
+ if (best_rd == INT64_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ restore_dst_buf(xd, *orig_dst);
+ return INT64_MAX;
+ }
+ *mbmi = best_mbmi;
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ *rd_stats_uv = best_rd_stats_uv;
+#if CONFIG_VAR_TX
+ for (int i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip[i], best_blk_skip[i],
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+#endif // CONFIG_VAR_TX
+ x->skip = best_xskip;
+ *disable_skip = best_disable_skip;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+ restore_dst_buf(xd, *orig_dst);
+ return 0;
+}
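+
+// A note on RDCOST (a sketch of its semantics, not the exact macro from
+// av1/encoder/rd.h): every comparison above minimizes a Lagrangian of
+// the form
+//   rd = (dist << rddiv) + ((rate * rdmult) >> prob_cost_shift)
+// so x->rdmult acts as the lambda that trades rate against distortion.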
+
+static int64_t handle_inter_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+ int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row,
+ int mi_col, HandleInterModeArgs *args, const int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ (void)cm;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const int this_mode = mbmi->mode;
+ int_mv *frame_mv = mode_mv[this_mode];
+ int i;
+ int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ int_mv cur_mv[2];
+ int rate_mv = 0;
+#if CONFIG_EXT_INTER
+ int pred_exists = 1;
+ const int bw = block_size_wide[bsize];
+ int_mv single_newmv[TOTAL_REFS_PER_FRAME];
+#if CONFIG_INTERINTRA
+ const unsigned int *const interintra_mode_cost =
+ cpi->interintra_mode_cost[size_group_lookup[bsize]];
+#endif // CONFIG_INTERINTRA
+ const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
+#if CONFIG_REF_MV
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+#endif // CONFIG_REF_MV
+#else
+ int_mv *const single_newmv = args->single_newmv;
+#endif // CONFIG_EXT_INTER
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+#else
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[MAX_MB_PLANE * MAX_SB_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
+ uint8_t *tmp_buf;
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_EXT_INTER
+ int rate2_bmc_nocoeff;
+ MB_MODE_INFO best_bmc_mbmi;
+#if CONFIG_MOTION_VAR
+ int rate_mv_bmc;
+#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ int64_t rd = INT64_MAX;
+ BUFFER_SET orig_dst, tmp_dst;
+ int rs = 0;
+
+ int skip_txfm_sb = 0;
+ int64_t skip_sse_sb = INT64_MAX;
+ int16_t mode_ctx;
+
+#if CONFIG_EXT_INTER
+ *args->compmode_interintra_cost = 0;
+ mbmi->use_wedge_interintra = 0;
+ *args->compmode_interinter_cost = 0;
+ mbmi->interinter_compound_type = COMPOUND_AVERAGE;
+
+ // is_comp_interintra_pred implies !is_comp_pred
+ assert(!is_comp_interintra_pred || (!is_comp_pred));
+ // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type)
+ assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi));
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (is_comp_pred)
+ mode_ctx = mbmi_ext->compound_mode_context[refs[0]];
+ else
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, -1);
+#else // CONFIG_REF_MV
+ mode_ctx = mbmi_ext->mode_context[refs[0]];
+#endif // CONFIG_REF_MV
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ tmp_buf = tmp_buf_;
+ // Make sure that we didn't leave the plane destination buffers set
+ // to tmp_buf at the end of the last iteration
+ assert(xd->plane[0].dst.buf != tmp_buf);
+
+#if CONFIG_WARPED_MOTION
+ mbmi->num_proj_ref[0] = 0;
+ mbmi->num_proj_ref[1] = 0;
+#endif // CONFIG_WARPED_MOTION
+
+ if (is_comp_pred) {
+ if (frame_mv[refs[0]].as_int == INVALID_MV ||
+ frame_mv[refs[1]].as_int == INVALID_MV)
+ return INT64_MAX;
+ }
+
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
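+ // For modes that carry a new MV, run the motion search first and add
+ // its rate; handle_newmv() returns nonzero on failure.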
+ if (have_newmv_in_inter_mode(this_mode)) {
+ const int64_t ret_val = handle_newmv(cpi, x, bsize, mode_mv, mi_row, mi_col,
+ &rate_mv, single_newmv, args);
+ if (ret_val != 0)
+ return ret_val;
+ else
+ rd_stats->rate += rate_mv;
+ }
+ for (i = 0; i < is_comp_pred + 1; ++i) {
+ cur_mv[i] = frame_mv[refs[i]];
+ // Clip "next_nearest" so that it does not extend to far out of image
+ if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (this_mode == NEAREST_NEARESTMV)
+#else
+ if (this_mode == NEARESTMV && is_comp_pred)
+#endif // CONFIG_EXT_INTER
+ {
+#if !CONFIG_EXT_INTER
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+#endif // !CONFIG_EXT_INTER
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
+ cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+ cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+
+ for (i = 0; i < 2; ++i) {
+ clamp_mv2(&cur_mv[i].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+ }
+ }
+
+#if CONFIG_EXT_INTER
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
+ if (this_mode == NEAREST_NEWMV || this_mode == NEAREST_NEARMV) {
+ cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+
+ lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[0].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ }
+
+ if (this_mode == NEW_NEARESTMV || this_mode == NEAR_NEARESTMV) {
+ cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+
+ lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[1].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+ }
+
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+ int ref_mv_idx = mbmi->ref_mv_idx + 1;
+ if (this_mode == NEAR_NEWMV || this_mode == NEAR_NEARESTMV ||
+ this_mode == NEAR_NEARMV) {
+ cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+
+ lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[0].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ }
+
+ if (this_mode == NEW_NEARMV || this_mode == NEAREST_NEARMV ||
+ this_mode == NEAR_NEARMV) {
+ cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+
+ lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[1].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+ }
+#else
+ if (this_mode == NEARMV && is_comp_pred) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+ int ref_mv_idx = mbmi->ref_mv_idx + 1;
+ cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+ cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+
+ for (i = 0; i < 2; ++i) {
+ clamp_mv2(&cur_mv[i].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+ }
+ }
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+
+ // Do the first prediction into the destination buffer, and the next
+ // prediction into a temporary buffer. Then keep track of which one
+ // of these currently holds the best predictor, and use the other
+ // one for future predictions. In the end, copy from tmp_buf to
+ // dst if necessary.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
+ tmp_dst.stride[i] = MAX_SB_SIZE;
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ orig_dst.plane[i] = xd->plane[i].dst.buf;
+ orig_dst.stride[i] = xd->plane[i].dst.stride;
+ }
+
+ // We don't include the cost of the second reference here, because there
+ // are only three options (Last/Golden, ARF/Last or Golden/ARF): presented
+ // in that order, the second reference is always known once the first one
+ // is known.
+ //
+ // Under some circumstances we discount the cost of the NEWMV mode to
+ // encourage the initiation of a motion field.
+ if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv,
+ refs[0])) {
+#if CONFIG_EXT_INTER
+ rd_stats->rate +=
+ AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx),
+ cost_mv_ref(cpi, is_comp_pred ? NEAREST_NEARESTMV : NEARESTMV,
+ mode_ctx));
+#else
+ rd_stats->rate += AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx),
+ cost_mv_ref(cpi, NEARESTMV, mode_ctx));
+#endif // CONFIG_EXT_INTER
+ } else {
+ rd_stats->rate += cost_mv_ref(cpi, this_mode, mode_ctx);
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, 0) > ref_best_rd &&
+#if CONFIG_EXT_INTER
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV
+#else
+ mbmi->mode != NEARESTMV
+#endif // CONFIG_EXT_INTER
+ )
+ return INT64_MAX;
+
+ int64_t ret_val = interpolation_filter_search(
+ x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter,
+ &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
+ if (ret_val != 0) return ret_val;
+
+#if CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ best_bmc_mbmi = *mbmi;
+ rate2_bmc_nocoeff = rd_stats->rate;
+ if (cm->interp_filter == SWITCHABLE) rate2_bmc_nocoeff += rs;
+#if CONFIG_MOTION_VAR
+ rate_mv_bmc = rate_mv;
+#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+ if (is_comp_pred) {
+ int rate_sum, rs2;
+ int64_t dist_sum;
+ int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX;
+ INTERINTER_COMPOUND_DATA best_compound_data;
+ int_mv best_mv[2];
+ int best_tmp_rate_mv = rate_mv;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+ int compound_type_cost[COMPOUND_TYPES];
+ uint8_t pred0[2 * MAX_SB_SQUARE];
+ uint8_t pred1[2 * MAX_SB_SQUARE];
+ uint8_t *preds0[1] = { pred0 };
+ uint8_t *preds1[1] = { pred1 };
+ int strides[1] = { bw };
+ int tmp_rate_mv;
+ int masked_compound_used = is_any_masked_compound_used(bsize);
+ COMPOUND_TYPE cur_type;
+
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ memset(&best_compound_data, 0, sizeof(best_compound_data));
+#if CONFIG_COMPOUND_SEGMENT
+ uint8_t tmp_mask_buf[2 * MAX_SB_SQUARE];
+ best_compound_data.seg_mask = tmp_mask_buf;
+#endif // CONFIG_COMPOUND_SEGMENT
+
+ if (masked_compound_used) {
+ av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize],
+ av1_compound_type_tree);
+ // get inter predictors to use for masked compound modes
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides);
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides);
+ }
+
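+ // Search every available inter-inter compound type (plain average and,
+ // where enabled, wedge and segment masks), estimating each candidate's
+ // rd cost and keeping only the best compound configuration.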
+ for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
+ if (!is_interinter_compound_used(cur_type, bsize)) break;
+ tmp_rate_mv = rate_mv;
+ best_rd_cur = INT64_MAX;
+ mbmi->interinter_compound_type = cur_type;
+ rs2 = av1_cost_literal(get_interinter_compound_type_bits(
+ bsize, mbmi->interinter_compound_type)) +
+ (masked_compound_used
+ ? compound_type_cost[mbmi->interinter_compound_type]
+ : 0);
+
+ switch (cur_type) {
+ case COMPOUND_AVERAGE:
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize);
+ av1_subtract_plane(x, bsize, 0);
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
+ INT64_MAX);
+ if (rd != INT64_MAX)
+ best_rd_cur =
+ RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum);
+ best_rd_compound = best_rd_cur;
+ break;
+#if CONFIG_WEDGE
+ case COMPOUND_WEDGE:
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+ best_rd_compound / 3 < ref_best_rd) {
+ best_rd_cur = build_and_cost_compound_type(
+ cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst,
+ &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
+ }
+ break;
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ case COMPOUND_SEG:
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+ best_rd_compound / 3 < ref_best_rd) {
+ best_rd_cur = build_and_cost_compound_type(
+ cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst,
+ &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
+ }
+ break;
+#endif // CONFIG_COMPOUND_SEGMENT
+ default: assert(0); return 0;
+ }
+
+ if (best_rd_cur < best_rd_compound) {
+ best_rd_compound = best_rd_cur;
+#if CONFIG_WEDGE
+ best_compound_data.wedge_index = mbmi->wedge_index;
+ best_compound_data.wedge_sign = mbmi->wedge_sign;
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ best_compound_data.mask_type = mbmi->mask_type;
+ memcpy(best_compound_data.seg_mask, xd->seg_mask,
+ 2 * MAX_SB_SQUARE * sizeof(uint8_t));
+#endif // CONFIG_COMPOUND_SEGMENT
+ best_compound_data.interinter_compound_type =
+ mbmi->interinter_compound_type;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ if (use_masked_motion_search(cur_type)) {
+ best_tmp_rate_mv = tmp_rate_mv;
+ best_mv[0].as_int = mbmi->mv[0].as_int;
+ best_mv[1].as_int = mbmi->mv[1].as_int;
+ } else {
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ }
+ }
+ }
+ // reset to original mvs for next iteration
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+#if CONFIG_WEDGE
+ mbmi->wedge_index = best_compound_data.wedge_index;
+ mbmi->wedge_sign = best_compound_data.wedge_sign;
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ mbmi->mask_type = best_compound_data.mask_type;
+ memcpy(xd->seg_mask, best_compound_data.seg_mask,
+ 2 * MAX_SB_SQUARE * sizeof(uint8_t));
+#endif // CONFIG_COMPOUND_SEGMENT
+ mbmi->interinter_compound_type =
+ best_compound_data.interinter_compound_type;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ mbmi->mv[0].as_int = best_mv[0].as_int;
+ mbmi->mv[1].as_int = best_mv[1].as_int;
+ xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+ xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
+ if (use_masked_motion_search(mbmi->interinter_compound_type)) {
+ rd_stats->rate += best_tmp_rate_mv - rate_mv;
+ rate_mv = best_tmp_rate_mv;
+ }
+ }
+
+ if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst);
+ return INT64_MAX;
+ }
+
+ pred_exists = 0;
+
+ *args->compmode_interinter_cost =
+ av1_cost_literal(get_interinter_compound_type_bits(
+ bsize, mbmi->interinter_compound_type)) +
+ (masked_compound_used
+ ? compound_type_cost[mbmi->interinter_compound_type]
+ : 0);
+ }
+
+#if CONFIG_INTERINTRA
+ if (is_comp_interintra_pred) {
+ INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
+ int64_t best_interintra_rd = INT64_MAX;
+ int rmode, rate_sum;
+ int64_t dist_sum;
+ int j;
+ int tmp_rate_mv = 0;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+ DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_SB_SQUARE]);
+ uint8_t *intrapred;
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ intrapred = CONVERT_TO_BYTEPTR(intrapred_);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ intrapred = intrapred_;
+
+ mbmi->ref_frame[1] = NONE_FRAME;
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE;
+ xd->plane[j].dst.stride = bw;
+ }
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize);
+ restore_dst_buf(xd, orig_dst);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ mbmi->use_wedge_interintra = 0;
+
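+ // Coarse search over the interintra modes using the model-based rd
+ // estimate; the winner is then re-evaluated below with
+ // estimate_yrd_for_sb() for a cost closer to the true rd.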
+ for (j = 0; j < INTERINTRA_MODES; ++j) {
+ mbmi->interintra_mode = (INTERINTRA_MODE)j;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_mv + rate_sum, dist_sum);
+ if (rd < best_interintra_rd) {
+ best_interintra_rd = rd;
+ best_interintra_mode = mbmi->interintra_mode;
+ }
+ }
+ mbmi->interintra_mode = best_interintra_mode;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ av1_subtract_plane(x, bsize, 0);
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv, rate_mv + rmode + rate_sum, dist_sum);
+ best_interintra_rd = rd;
+
+ if (ref_best_rd < INT64_MAX && best_interintra_rd > 2 * ref_best_rd) {
+ // No need to call restore_dst_buf here: the planes already point at
+ // orig_dst.
+ return INT64_MAX;
+ }
+#if CONFIG_WEDGE
+ if (is_interintra_wedge_used(bsize)) {
+ int64_t best_interintra_rd_nowedge = INT64_MAX;
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ int_mv tmp_mv;
+ int rwedge = av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge + rate_sum,
+ dist_sum);
+ best_interintra_rd_nowedge = rd;
+
+ // Disable wedge search if source variance is small
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
+ mbmi->use_wedge_interintra = 1;
+
+ rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
+ av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 1);
+
+ best_interintra_rd_wedge =
+ pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+ best_interintra_rd_wedge +=
+ RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge, 0);
+ // Refine motion vector.
+ if (have_newmv_in_inter_mode(this_mode)) {
+ // Get the inverse of the wedge mask.
+ const uint8_t *mask = av1_get_contiguous_soft_mask(
+ mbmi->interintra_wedge_index, 1, bsize);
+ do_masked_motion_search(cpi, x, mask, bw, bsize, mi_row, mi_col,
+ &tmp_mv, &tmp_rate_mv, 0);
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv,
+ rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum);
+ if (rd < best_interintra_rd_wedge) {
+ best_interintra_rd_wedge = rd;
+ } else {
+ tmp_mv.as_int = cur_mv[0].as_int;
+ tmp_rate_mv = rate_mv;
+ }
+ } else {
+ tmp_mv.as_int = cur_mv[0].as_int;
+ tmp_rate_mv = rate_mv;
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ // Re-evaluate with a cost estimate closer to the true rd.
+ av1_subtract_plane(x, bsize, 0);
+ rd =
+ estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv,
+ rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum);
+ best_interintra_rd_wedge = rd;
+ if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+ mbmi->use_wedge_interintra = 1;
+ best_interintra_rd = best_interintra_rd_wedge;
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ rd_stats->rate += tmp_rate_mv - rate_mv;
+ rate_mv = tmp_rate_mv;
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ best_interintra_rd = best_interintra_rd_nowedge;
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ }
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ best_interintra_rd = best_interintra_rd_nowedge;
+ }
+ }
+#endif // CONFIG_WEDGE
+
+ pred_exists = 0;
+ *args->compmode_interintra_cost =
+ av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1);
+ *args->compmode_interintra_cost +=
+ interintra_mode_cost[mbmi->interintra_mode];
+ if (is_interintra_wedge_used(bsize)) {
+ *args->compmode_interintra_cost += av1_cost_bit(
+ cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra);
+ if (mbmi->use_wedge_interintra) {
+ *args->compmode_interintra_cost +=
+ av1_cost_literal(get_interintra_wedge_bits(bsize));
+ }
+ }
+ } else if (is_interintra_allowed(mbmi)) {
+ *args->compmode_interintra_cost =
+ av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0);
+ }
+#endif // CONFIG_INTERINTRA
+
+ if (pred_exists == 0) {
+ int tmp_rate;
+ int64_t tmp_dist;
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, &orig_dst, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+ }
+#endif // CONFIG_EXT_INTER
+
+ if (!is_comp_pred)
+#if CONFIG_DUAL_FILTER
+ args->single_filter[this_mode][refs[0]] = mbmi->interp_filter[0];
+#else
+ args->single_filter[this_mode][refs[0]] = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+
+#if CONFIG_EXT_INTER
+ if (args->modelled_rd != NULL) {
+ if (is_comp_pred) {
+ const int mode0 = compound_ref0_mode(this_mode);
+ const int mode1 = compound_ref1_mode(this_mode);
+ const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
+ args->modelled_rd[mode1][refs[1]]);
+ if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+ restore_dst_buf(xd, orig_dst);
+ return INT64_MAX;
+ }
+ } else if (!is_comp_interintra_pred) {
+ args->modelled_rd[this_mode][refs[0]] = rd;
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+ if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+ // If the current modeled rd (pred_error) is substantially higher than
+ // the best so far, do not bother doing the full rd search.
+ if (rd / 2 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst);
+ return INT64_MAX;
+ }
+ }
+
+ ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
+ disable_skip, mode_mv, mi_row, mi_col, args,
+ ref_best_rd, refs, rate_mv,
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_EXT_INTER
+ rate2_bmc_nocoeff, &best_bmc_mbmi,
+#if CONFIG_MOTION_VAR
+ rate_mv_bmc,
+#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst);
+ if (ret_val != 0) return ret_val;
+
+ return 0; // The rate-distortion cost will be re-calculated by caller.
+}
+
+#if CONFIG_INTRABC
+static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (bsize < BLOCK_8X8 || !cm->allow_screen_content_tools) return INT64_MAX;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TileInfo *tile = &xd->tile;
+ MODE_INFO *const mi = xd->mi[0];
+ const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE);
+ const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE);
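+ // mb_to_top_edge/mb_to_left_edge are kept in 1/8-pel units, so dividing
+ // by 8 * MI_SIZE recovers the block position in mi units.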
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int sb_row = mi_row / MAX_MIB_SIZE;
+
+ int_mv dv_ref;
+ av1_find_ref_dv(&dv_ref, mi_row, mi_col);
+
+ const MvLimits tmp_mv_limits = x->mv_limits;
+
+ // TODO(aconverse@google.com): Handle same row DV.
+ x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
+ x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w;
+ x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
+ x->mv_limits.row_max = (sb_row * MAX_MIB_SIZE - mi_row) * MI_SIZE - h;
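+ // These limits confine the DV search to the current tile and to
+ // superblock rows strictly above the current one, i.e. to pixels that
+ // have already been reconstructed.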
+ assert(x->mv_limits.col_min >= tmp_mv_limits.col_min);
+ assert(x->mv_limits.col_max <= tmp_mv_limits.col_max);
+ assert(x->mv_limits.row_min >= tmp_mv_limits.row_min);
+ assert(x->mv_limits.row_max <= tmp_mv_limits.row_max);
+ av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv);
+
+ if (x->mv_limits.col_max < x->mv_limits.col_min ||
+ x->mv_limits.row_max < x->mv_limits.row_min) {
+ x->mv_limits = tmp_mv_limits;
+ return INT64_MAX;
+ }
+
+ struct buf_2d yv12_mb[MAX_MB_PLANE];
+ av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL);
+ for (int i = 0; i < MAX_MB_PLANE; ++i) {
+ xd->plane[i].pre[0] = yv12_mb[i];
+ }
+
+ int step_param = cpi->mv_step_param;
+ MV mvp_full = dv_ref.as_mv;
+ mvp_full.col >>= 3;
+ mvp_full.row >>= 3;
+ int sadpb = x->sadperbit16;
+ int cost_list[5];
+ int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
+ sadpb, cond_cost_list(cpi, cost_list),
+ &dv_ref.as_mv, INT_MAX, 1);
+
+ x->mv_limits = tmp_mv_limits;
+ if (bestsme == INT_MAX) return INT64_MAX;
+ mvp_full = x->best_mv.as_mv;
+ MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 };
+ if (mv_check_bounds(&x->mv_limits, &dv)) return INT64_MAX;
+ if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) return INT64_MAX;
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ MB_MODE_INFO best_mbmi = *mbmi;
+ RD_STATS best_rdcost = *rd_cost;
+ int best_skip = x->skip;
+#if CONFIG_PALETTE
+ memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
+#endif
+ mbmi->use_intrabc = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->mv[0].as_mv = dv;
+#if CONFIG_DUAL_FILTER
+ for (int idx = 0; idx < 4; ++idx) mbmi->interp_filter[idx] = BILINEAR;
+#else
+ mbmi->interp_filter = BILINEAR;
+#endif
+ mbmi->skip = 0;
+ x->skip = 0;
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize);
+
+ int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost, x->mvcost,
+ MV_COST_WEIGHT);
+ const PREDICTION_MODE A = av1_above_block_mode(mi, xd->above_mi, 0);
+ const PREDICTION_MODE L = av1_left_block_mode(mi, xd->left_mi, 0);
+ const int rate_mode =
+ cpi->y_mode_costs[A][L][DC_PRED] + av1_cost_bit(INTRABC_PROB, 1);
+
+ RD_STATS rd_stats, rd_stats_uv;
+ av1_subtract_plane(x, bsize, 0);
+ super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ av1_merge_rd_stats(&rd_stats, &rd_stats_uv);
+#if CONFIG_RD_DEBUG
+ mbmi->rd_stats = rd_stats;
+#endif
+
+ const aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+
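+ // Evaluate the block both with the residue coded (skip = 0) and with
+ // the residue dropped (skip = 1), keeping whichever rd cost is lower.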
+ RD_STATS rdc_noskip;
+ av1_init_rd_stats(&rdc_noskip);
+ rdc_noskip.rate =
+ rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0);
+ rdc_noskip.dist = rd_stats.dist;
+ rdc_noskip.rdcost =
+ RDCOST(x->rdmult, x->rddiv, rdc_noskip.rate, rdc_noskip.dist);
+ if (rdc_noskip.rdcost < best_rd) {
+ best_rd = rdc_noskip.rdcost;
+ best_mbmi = *mbmi;
+ best_skip = x->skip;
+ best_rdcost = rdc_noskip;
+ }
+
+ x->skip = 1;
+ mbmi->skip = 1;
+ RD_STATS rdc_skip;
+ av1_init_rd_stats(&rdc_skip);
+ rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1);
+ rdc_skip.dist = rd_stats.sse;
+ rdc_skip.rdcost = RDCOST(x->rdmult, x->rddiv, rdc_skip.rate, rdc_skip.dist);
+ if (rdc_skip.rdcost < best_rd) {
+ best_rd = rdc_skip.rdcost;
+ best_mbmi = *mbmi;
+ best_skip = x->skip;
+ best_rdcost = rdc_skip;
+ }
+ *mbmi = best_mbmi;
+ *rd_cost = best_rdcost;
+ x->skip = best_skip;
+ return best_rd;
+}
+#endif // CONFIG_INTRABC
+
+void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = xd->plane;
+ int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
+ int y_skip = 0, uv_skip = 0;
+ int64_t dist_y = 0, dist_uv = 0;
+ TX_SIZE max_uv_tx_size;
+ const int unify_bsize = CONFIG_CB4X4;
+
+ ctx->skip = 0;
+ xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME;
+#if CONFIG_INTRABC
+ xd->mi[0]->mbmi.use_intrabc = 0;
+#endif // CONFIG_INTRABC
+
+ const int64_t intra_yrd =
+ (bsize >= BLOCK_8X8 || unify_bsize)
+ ? rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
+ &y_skip, bsize, best_rd)
+ : rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+ &dist_y, &y_skip, best_rd);
+
+ if (intra_yrd < best_rd) {
+ max_uv_tx_size = uv_txsize_lookup[bsize][xd->mi[0]->mbmi.tx_size]
+ [pd[1].subsampling_x][pd[1].subsampling_y];
+
+#if CONFIG_CB4X4
+#if !CONFIG_CHROMA_2X2
+ max_uv_tx_size = AOMMAX(max_uv_tx_size, TX_4X4);
+#endif // !CONFIG_CHROMA_2X2
+ if (!x->skip_chroma_rd)
+ rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
+ &uv_skip, bsize, max_uv_tx_size);
+#else
+ rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
+ &uv_skip, AOMMAX(BLOCK_8X8, bsize), max_uv_tx_size);
+#endif // CONFIG_CB4X4
+
+ if (y_skip && uv_skip) {
+ rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+ av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ rd_cost->dist = dist_y + dist_uv;
+ } else {
+ rd_cost->rate =
+ rate_y + rate_uv + av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ rd_cost->dist = dist_y + dist_uv;
+ }
+ rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+ } else {
+ rd_cost->rate = INT_MAX;
+ }
+
+#if CONFIG_INTRABC
+ if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd)
+ best_rd = rd_cost->rdcost;
+ if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) {
+ ctx->skip = x->skip; // FIXME where is the proper place to set this?!
+ assert(rd_cost->rate != INT_MAX);
+ rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+ }
+#endif
+ if (rd_cost->rate == INT_MAX) return;
+
+ ctx->mic = *xd->mi[0];
+ ctx->mbmi_ext = *x->mbmi_ext;
+}
+
+// Do we have an internal image edge (e.g. formatting bars)?
+int av1_internal_image_edge(const AV1_COMP *cpi) {
+ return (cpi->oxcf.pass == 2) &&
+ ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
+ (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
+}
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+ int top_edge = 0;
+ int bottom_edge = cpi->common.mi_rows;
+ int is_active_h_edge = 0;
+
+ // For two-pass encoding, account for any formatting bars detected.
+ if (cpi->oxcf.pass == 2) {
+ const TWO_PASS *const twopass = &cpi->twopass;
+
+ // The inactive region is specified in MBs, not mi units; one MB row
+ // spans two mi rows here, hence the factor of 2 below.
+ // The image edge is in the following MB row.
+ top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+
+ bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+ bottom_edge = AOMMAX(top_edge, bottom_edge);
+ }
+
+ if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+ ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+ is_active_h_edge = 1;
+ }
+ return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+ int left_edge = 0;
+ int right_edge = cpi->common.mi_cols;
+ int is_active_v_edge = 0;
+
+ // For two-pass encoding, account for any formatting bars detected.
+ if (cpi->oxcf.pass == 2) {
+ const TWO_PASS *const twopass = &cpi->twopass;
+
+ // The inactive region is specified in MBs, not mi units.
+ // The image edge is in the following MB column.
+ left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+
+ right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+ right_edge = AOMMAX(left_edge, right_edge);
+ }
+
+ if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+ ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+ is_active_v_edge = 1;
+ }
+ return is_active_v_edge;
+}
+
+// Checks to see if a super block is at the edge of the active image.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
+ return av1_active_h_edge(cpi, mi_row, cpi->common.mib_size) ||
+ av1_active_v_edge(cpi, mi_col, cpi->common.mib_size);
+}
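+
+// Worked example for the edge checks above (hypothetical numbers): if the
+// first pass detected inactive_zone_rows = 3, then top_edge = 6 mi rows.
+// A superblock at mi_row = 0 with mi_step = 16 satisfies
+//   top_edge >= mi_row && top_edge < mi_row + mi_step   (6 is in [0, 16)),
+// so av1_active_h_edge() reports an active horizontal edge for it.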
+
+#if CONFIG_PALETTE
+static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ int src_stride = x->plane[1].src.stride;
+ const uint8_t *const src_u = x->plane[1].src.buf;
+ const uint8_t *const src_v = x->plane[2].src.buf;
+ float *const data = x->palette_buffer->kmeans_data_buf;
+ float centroids[2 * PALETTE_MAX_SIZE];
+ uint8_t *const color_map = xd->plane[1].color_index_map;
+ int r, c;
+#if CONFIG_HIGHBITDEPTH
+ const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+ const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+#endif // CONFIG_HIGHBITDEPTH
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+ (void)cpi;
+
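+ // Interleave the co-located U and V samples into 2-D points so that
+ // av1_calc_indices() can assign every pixel to its nearest (U, V)
+ // palette centroid.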
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+#if CONFIG_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth) {
+ data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+ data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+ data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ }
+
+ for (r = 1; r < 3; ++r) {
+ for (c = 0; c < pmi->palette_size[1]; ++c) {
+ centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+ }
+ }
+
+ av1_calc_indices(data, centroids, color_map, rows * cols,
+ pmi->palette_size[1], 2);
+ extend_palette_color_map(color_map, cols, rows, plane_block_width,
+ plane_block_height);
+}
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+static void pick_filter_intra_interframe(
+ const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_uv_intra,
+ int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv,
+ PREDICTION_MODE *mode_uv, FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv,
+#if CONFIG_EXT_INTRA
+ int8_t *uv_angle_delta,
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO *pmi_uv, int palette_ctx,
+#endif // CONFIG_PALETTE
+ int skip_mask, unsigned int *ref_costs_single, int64_t *best_rd,
+ int64_t *best_intra_rd, PREDICTION_MODE *best_intra_mode,
+ int *best_mode_index, int *best_skip2, int *best_mode_skippable,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ int64_t *best_pred_rd, MB_MODE_INFO *best_mbmode, RD_STATS *rd_cost) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+#endif // CONFIG_PALETTE
+ int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i;
+ int dc_mode_index;
+ const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]];
+ int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd;
+ int64_t distortion_uv, model_rd = INT64_MAX;
+ TX_SIZE uv_tx;
+
+ for (i = 0; i < MAX_MODES; ++i)
+ if (av1_mode_order[i].mode == DC_PRED &&
+ av1_mode_order[i].ref_frame[0] == INTRA_FRAME)
+ break;
+ dc_mode_index = i;
+ assert(i < MAX_MODES);
+
+ // TODO(huisu): use skip_mask for further speedup.
+ (void)skip_mask;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ if (!rd_pick_filter_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
+ &skippable, bsize, intra_mode_cost[mbmi->mode],
+ &this_rd, &model_rd, 0)) {
+ return;
+ }
+ if (rate_y == INT_MAX) return;
+
+ uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
+ [xd->plane[1].subsampling_y];
+ if (rate_uv_intra[uv_tx] == INT_MAX) {
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+ &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
+ &skip_uv[uv_tx], &mode_uv[uv_tx]);
+#if CONFIG_PALETTE
+ if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi;
+#endif // CONFIG_PALETTE
+ filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
+#if CONFIG_EXT_INTRA
+ uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+#endif // CONFIG_EXT_INTRA
+ }
+
+ rate_uv = rate_uv_tokenonly[uv_tx];
+ distortion_uv = dist_uv[uv_tx];
+ skippable = skippable && skip_uv[uv_tx];
+ mbmi->uv_mode = mode_uv[uv_tx];
+#if CONFIG_PALETTE
+ if (cm->allow_screen_content_tools) {
+ pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+#endif // CONFIG_PALETTE
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+#endif // CONFIG_EXT_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
+ if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
+ }
+
+ rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
+ cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+#if CONFIG_PALETTE
+ if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED &&
+ bsize >= BLOCK_8X8)
+ rate2 += av1_cost_bit(
+ av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
+#endif // CONFIG_PALETTE
+
+ if (!xd->lossless[mbmi->segment_id]) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
+ }
+
+ rate2 += av1_cost_bit(cm->fc->filter_intra_probs[0],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
+ rate2 += write_uniform_cost(
+ FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]);
+#if CONFIG_EXT_INTRA
+ if (av1_is_directional_mode(mbmi->uv_mode, bsize)) {
+ rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
+ }
+#endif // CONFIG_EXT_INTRA
+ if (mbmi->mode == DC_PRED) {
+ rate2 +=
+ av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
+ rate2 +=
+ write_uniform_cost(FILTER_INTRA_MODES,
+ mbmi->filter_intra_mode_info.filter_intra_mode[1]);
+ }
+ distortion2 = distortion_y + distortion_uv;
+ av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, 0, 0, mi_row,
+ mi_col);
+
+ rate2 += ref_costs_single[INTRA_FRAME];
+
+ if (skippable) {
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ } else {
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+ if (this_rd < *best_intra_rd) {
+ *best_intra_rd = this_rd;
+ *best_intra_mode = mbmi->mode;
+ }
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
+
+ if (this_rd < *best_rd) {
+ *best_mode_index = dc_mode_index;
+ mbmi->mv[0].as_int = 0;
+ rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ if (x->skip)
+ *returnrate_nocoef = rate2;
+ else
+ *returnrate_nocoef = rate2 - rate_y - rate_uv;
+ *returnrate_nocoef -= av1_cost_bit(av1_get_skip_prob(cm, xd), skippable);
+ *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
+ mbmi->ref_frame[0] != INTRA_FRAME);
+#endif // CONFIG_SUPERTX
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ *best_rd = this_rd;
+ *best_mbmode = *mbmi;
+ *best_skip2 = 0;
+ *best_mode_skippable = skippable;
+ }
+}
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_MOTION_VAR
+static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
+ const MACROBLOCKD *xd, int mi_row,
+ int mi_col, const uint8_t *above,
+ int above_stride, const uint8_t *left,
+ int left_stride);
+#endif // CONFIG_MOTION_VAR
+
+void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x, int mi_row, int mi_col,
+ RD_STATS *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RD_OPT *const rd_opt = &cpi->rd;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_PALETTE
+ const int try_palette =
+ cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8;
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+#endif // CONFIG_PALETTE
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const struct segmentation *const seg = &cm->seg;
+ PREDICTION_MODE this_mode;
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+ unsigned char segment_id = mbmi->segment_id;
+ int comp_pred, i, k;
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+ struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE];
+ int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
+#if CONFIG_EXT_INTER
+ int single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
+ int64_t modelled_rd[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+#endif // CONFIG_EXT_INTER
+ static const int flag_list[TOTAL_REFS_PER_FRAME] = {
+ 0,
+ AOM_LAST_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_GOLD_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_BWD_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_ALT_FLAG
+ };
+ int64_t best_rd = best_rd_so_far;
+ int best_rate_y = INT_MAX, best_rate_uv = INT_MAX;
+ int64_t best_pred_diff[REFERENCE_MODES];
+ int64_t best_pred_rd[REFERENCE_MODES];
+ MB_MODE_INFO best_mbmode;
+#if CONFIG_REF_MV
+ int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+#endif // CONFIG_REF_MV
+ int best_mode_skippable = 0;
+ int midx, best_mode_index = -1;
+ unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
+ unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
+ aom_prob comp_mode_p;
+ int64_t best_intra_rd = INT64_MAX;
+ unsigned int best_pred_sse = UINT_MAX;
+ PREDICTION_MODE best_intra_mode = DC_PRED;
+ int rate_uv_intra[TX_SIZES_ALL], rate_uv_tokenonly[TX_SIZES_ALL];
+ int64_t dist_uvs[TX_SIZES_ALL];
+ int skip_uvs[TX_SIZES_ALL];
+ PREDICTION_MODE mode_uv[TX_SIZES_ALL];
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL];
+#endif // CONFIG_PALETTE
+#if CONFIG_EXT_INTRA
+ int8_t uv_angle_delta[TX_SIZES_ALL];
+ int is_directional_mode, angle_stats_ready = 0;
+ uint8_t directional_mode_skip_mask[INTRA_MODES];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ int8_t dc_skipped = 1;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info_uv[TX_SIZES_ALL];
+#endif // CONFIG_FILTER_INTRA
+ const int intra_cost_penalty = av1_get_intra_cost_penalty(
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]];
+ int best_skip2 = 0;
+ uint8_t ref_frame_skip_mask[2] = { 0 };
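+ // Index 0 masks candidates by their first reference frame, index 1 by
+ // their second (see SECOND_REF_FRAME_MASK below).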
+#if CONFIG_EXT_INTER
+ uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 };
+ MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME;
+ int64_t best_single_inter_rd = INT64_MAX;
+#else
+ uint16_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 };
+#endif // CONFIG_EXT_INTER
+ int mode_skip_start = sf->mode_skip_start + 1;
+ const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
+ const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+ int64_t mode_threshold[MAX_MODES];
+ int *mode_map = tile_data->mode_map[bsize];
+ const int mode_search_skip_flags = sf->mode_search_skip_flags;
+#if CONFIG_PVQ
+ od_rollback_buffer pre_buf;
+#endif // CONFIG_PVQ
+
+ HandleInterModeArgs args = {
+#if CONFIG_MOTION_VAR
+ { NULL },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+ { NULL },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+#endif // CONFIG_MOTION_VAR
+#if CONFIG_EXT_INTER
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+#else // CONFIG_EXT_INTER
+ NULL,
+#endif // CONFIG_EXT_INTER
+ { { 0 } },
+ };
+
+#if CONFIG_PALETTE || CONFIG_EXT_INTRA
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+#endif // CONFIG_PALETTE || CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ int palette_ctx = 0;
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
+#endif // CONFIG_PALETTE
+#if CONFIG_MOTION_VAR
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+#else
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, int32_t, weighted_src_buf[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, int32_t, mask2d_buf[MAX_SB_SQUARE]);
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+ args.above_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
+ args.above_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_SB_SQUARE * len);
+ args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+ args.left_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
+ args.left_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_SB_SQUARE * len);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ args.above_pred_buf[0] = tmp_buf1;
+ args.above_pred_buf[1] = tmp_buf1 + MAX_SB_SQUARE;
+ args.above_pred_buf[2] = tmp_buf1 + 2 * MAX_SB_SQUARE;
+ args.left_pred_buf[0] = tmp_buf2;
+ args.left_pred_buf[1] = tmp_buf2 + MAX_SB_SQUARE;
+ args.left_pred_buf[2] = tmp_buf2 + 2 * MAX_SB_SQUARE;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_MOTION_VAR
+
+ av1_zero(best_mbmode);
+
+#if CONFIG_PALETTE
+ av1_zero(pmi_uv);
+ if (try_palette) {
+ if (above_mi)
+ palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+ if (left_mi)
+ palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_EXT_INTRA
+ memset(directional_mode_skip_mask, 0,
+ sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
+#endif // CONFIG_EXT_INTRA
+
+ estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+ &comp_mode_p);
+
+ for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX;
+ for (i = 0; i < TX_SIZES_ALL; i++) rate_uv_intra[i] = INT_MAX;
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = 0; i < MB_MODE_COUNT; ++i) {
+ for (k = 0; k < TOTAL_REFS_PER_FRAME; ++k) {
+ args.single_filter[i][k] = SWITCHABLE;
+ }
+ }
+
+ rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ x->mbmi_ext->mode_context[ref_frame] = 0;
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+ assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+ setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+ }
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+#if CONFIG_GLOBAL_MOTION
+ frame_mv[ZEROMV][ref_frame].as_int =
+ gm_get_motion_vector(&cm->global_motion[ref_frame],
+ cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+ 0)
+ .as_int;
+#else // CONFIG_GLOBAL_MOTION
+ frame_mv[ZEROMV][ref_frame].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_EXT_INTER
+ frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV;
+#if CONFIG_GLOBAL_MOTION
+ frame_mv[ZERO_ZEROMV][ref_frame].as_int =
+ gm_get_motion_vector(&cm->global_motion[ref_frame],
+ cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+ 0)
+ .as_int;
+#else // CONFIG_GLOBAL_MOTION
+ frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+#endif // CONFIG_EXT_INTER
+ }
+
+#if CONFIG_REF_MV
+ for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+ MODE_INFO *const mi = xd->mi[0];
+ int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
+ x->mbmi_ext->mode_context[ref_frame] = 0;
+ av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
+ mbmi_ext->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_EXT_INTER
+ candidates, mi_row, mi_col, NULL, NULL,
+ mbmi_ext->mode_context);
+ if (mbmi_ext->ref_mv_count[ref_frame] < 2) {
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_frame);
+ if (mbmi_ext->ref_mvs[rf[0]][0].as_int !=
+ frame_mv[ZEROMV][rf[0]].as_int ||
+ mbmi_ext->ref_mvs[rf[0]][1].as_int !=
+ frame_mv[ZEROMV][rf[0]].as_int ||
+ mbmi_ext->ref_mvs[rf[1]][0].as_int !=
+ frame_mv[ZEROMV][rf[1]].as_int ||
+ mbmi_ext->ref_mvs[rf[1]][1].as_int != frame_mv[ZEROMV][rf[1]].as_int)
+ mbmi_ext->mode_context[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET);
+ }
+ }
+#endif // CONFIG_REF_MV
+
+#if CONFIG_MOTION_VAR
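+ // Build the above/left neighbor predictions once, up front; the OBMC
+ // rd evaluation inside the mode loop below reuses these buffers along
+ // with the precomputed weighted source and 2-D mask.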
+ av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+ if (check_num_overlappable_neighbors(mbmi) &&
+ is_motion_variation_allowed_bsize(bsize)) {
+ av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
+ args.above_pred_buf, dst_width1,
+ dst_height1, args.above_pred_stride);
+ av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
+ args.left_pred_buf, dst_width2,
+ dst_height2, args.left_pred_stride);
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+ x->mask_buf = mask2d_buf;
+ x->wsrc_buf = weighted_src_buf;
+ calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0],
+ args.above_pred_stride[0], args.left_pred_buf[0],
+ args.left_pred_stride[0]);
+ }
+#endif // CONFIG_MOTION_VAR
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
+ // Skip checking missing references in both single and compound reference
+ // modes. Note that a mode will be skipped iff both reference frames
+ // are masked out.
+#if CONFIG_EXT_REFS
+ if (ref_frame == BWDREF_FRAME || ref_frame == ALTREF_FRAME) {
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= ((1 << ref_frame) | 0x01);
+ } else {
+#endif // CONFIG_EXT_REFS
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+#if CONFIG_EXT_REFS
+ }
+#endif // CONFIG_EXT_REFS
+ } else {
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ // Skip fixed mv modes for poor references
+ if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
+ mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ break;
+ }
+ }
+ }
+    // If the segment-level reference frame feature is enabled, mask off
+    // every ref frame other than the one the segment allows.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ }
+ }
+
+ // Disable this drop out case if the ref frame
+ // segment level feature is enabled for this segment. This is to
+ // prevent the possibility that we end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    // Only consider ZEROMV/ALTREF_FRAME for the alt-ref frame, unless ARNR
+    // filtering is enabled, in which case we want an unfiltered alternative.
+    // NEARMV/NEARESTMV are allowed as well because they may resolve to the
+    // same zero MVs while being cheaper to signal.
+ if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+ int_mv zeromv;
+ ref_frame_skip_mask[0] = (1 << LAST_FRAME) |
+#if CONFIG_EXT_REFS
+ (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
+ (1 << BWDREF_FRAME) |
+#endif // CONFIG_EXT_REFS
+ (1 << GOLDEN_FRAME);
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+    // TODO(zoeliu): Explore whether the following needs to be done for
+    //               BWDREF_FRAME as well.
+ mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+#if CONFIG_GLOBAL_MOTION
+ zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ALTREF_FRAME],
+ cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, 0)
+ .as_int;
+#else
+ zeromv.as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ if (frame_mv[NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
+ if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
+#if CONFIG_EXT_INTER
+ if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV);
+ if (frame_mv[NEAREST_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARMV);
+ if (frame_mv[NEAR_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARESTMV);
+ if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV);
+#endif // CONFIG_EXT_INTER
+ }
+ }
+
+ if (cpi->rc.is_src_frame_alt_ref) {
+ if (sf->alt_ref_search_fp) {
+ assert(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]);
+ mode_skip_mask[ALTREF_FRAME] = 0;
+ ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+ }
+ }
+
+ if (sf->alt_ref_search_fp)
+ if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
+ if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+ mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+
+ if (sf->adaptive_mode_search) {
+ if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
+ cpi->rc.frames_since_golden >= 3)
+ if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1))
+ mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
+ }
+
+ if (bsize > sf->max_intra_bsize) {
+ ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
+ ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
+ }
+
+ mode_skip_mask[INTRA_FRAME] |=
+ ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+
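+  // Derive the per-mode RD pruning thresholds by scaling the baseline
+  // thresholds with the adaptive frequency factors and renormalizing (>> 5).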
+ for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
+ for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
+ mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
+
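+  // When mode scheduling is enabled, bubble-sort the search order of entries
+  // 5..mode_skip_start-1 by ascending threshold so cheaper modes are tried
+  // first; each pass shrinks the range to the position of the last swap.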
+ midx = sf->schedule_mode_search ? mode_skip_start : 0;
+ while (midx > 4) {
+ uint8_t end_pos = 0;
+ for (i = 5; i < midx; ++i) {
+ if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) {
+ uint8_t tmp = mode_map[i];
+ mode_map[i] = mode_map[i - 1];
+ mode_map[i - 1] = tmp;
+ end_pos = i;
+ }
+ }
+ midx = end_pos;
+ }
+
+ if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+ x->use_default_intra_tx_type = 1;
+ else
+ x->use_default_intra_tx_type = 0;
+
+ if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
+ x->use_default_inter_tx_type = 1;
+ else
+ x->use_default_inter_tx_type = 0;
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+#if CONFIG_EXT_INTER
+ for (i = 0; i < MB_MODE_COUNT; ++i)
+ for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame)
+ modelled_rd[i][ref_frame] = INT64_MAX;
+#endif // CONFIG_EXT_INTER
+
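+  // Main rate-distortion mode search loop: evaluate each candidate
+  // (mode, reference pair) in mode_map order and keep the lowest RD cost.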
+ for (midx = 0; midx < MAX_MODES; ++midx) {
+ int mode_index;
+ int mode_excluded = 0;
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int compmode_cost = 0;
+#if CONFIG_EXT_INTER
+ int compmode_interintra_cost = 0;
+ int compmode_interinter_cost = 0;
+#endif // CONFIG_EXT_INTER
+ int rate2 = 0, rate_y = 0, rate_uv = 0;
+ int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+ int skippable = 0;
+ int this_skip2 = 0;
+ int64_t total_sse = INT64_MAX;
+#if CONFIG_REF_MV
+ uint8_t ref_frame_type;
+#endif // CONFIG_REF_MV
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+ mode_index = mode_map[midx];
+ this_mode = av1_mode_order[mode_index].mode;
+ ref_frame = av1_mode_order[mode_index].ref_frame[0];
+ second_ref_frame = av1_mode_order[mode_index].ref_frame[1];
+#if CONFIG_REF_MV
+ mbmi->ref_mv_idx = 0;
+#endif // CONFIG_REF_MV
+
+#if CONFIG_EXT_INTER
+ if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) {
+      // The mode and block size must be compatible with interintra prediction
+ if (!is_interintra_allowed_mode(this_mode)) continue;
+ if (!is_interintra_allowed_bsize(bsize)) continue;
+ }
+
+ if (is_inter_compound_mode(this_mode)) {
+ frame_mv[this_mode][ref_frame].as_int =
+ frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int;
+ frame_mv[this_mode][second_ref_frame].as_int =
+ frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int;
+ }
+#endif // CONFIG_EXT_INTER
+
+ // Look at the reference frame of the best mode so far and set the
+ // skip mask to look at a subset of the remaining modes.
+ if (midx == mode_skip_start && best_mode_index >= 0) {
+ switch (best_mbmode.ref_frame[0]) {
+ case INTRA_FRAME: break;
+ case LAST_FRAME:
+ ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#if CONFIG_EXT_REFS
+ case LAST2_FRAME:
+ ref_frame_skip_mask[0] |= LAST2_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+ case LAST3_FRAME:
+ ref_frame_skip_mask[0] |= LAST3_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#endif // CONFIG_EXT_REFS
+ case GOLDEN_FRAME:
+ ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#if CONFIG_EXT_REFS
+ case BWDREF_FRAME:
+ ref_frame_skip_mask[0] |= BWDREF_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#endif // CONFIG_EXT_REFS
+ case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALTREF_FRAME_MODE_MASK;
+#if CONFIG_EXT_REFS
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+#endif // CONFIG_EXT_REFS
+ break;
+ case NONE_FRAME:
+ case TOTAL_REFS_PER_FRAME:
+ assert(0 && "Invalid Reference frame");
+ break;
+ }
+ }
+
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+ (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame))))
+ continue;
+
+ if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue;
+
+ // Test best rd so far against threshold for trying this mode.
+ if (best_mode_skippable && sf->schedule_mode_search)
+ mode_threshold[mode_index] <<= 1;
+
+ if (best_rd < mode_threshold[mode_index]) continue;
+
+    // This is only used by the motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
+
+#if CONFIG_LOWDELAY_COMPOUND // Changes LL bitstream
+#if CONFIG_EXT_REFS
+ if (cpi->oxcf.pass == 0) {
+ // Complexity-compression trade-offs
+ // if (ref_frame == ALTREF_FRAME) continue;
+ // if (ref_frame == BWDREF_FRAME) continue;
+ if (second_ref_frame == ALTREF_FRAME) continue;
+ // if (second_ref_frame == BWDREF_FRAME) continue;
+ }
+#endif
+#endif
+ comp_pred = second_ref_frame > INTRA_FRAME;
+ if (comp_pred) {
+ if (!cpi->allow_comp_inter_inter) continue;
+
+ // Skip compound inter modes if ARF is not available.
+ if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+
+ // Do not allow compound prediction if the segment level reference frame
+ // feature is in use as in this case there can only be one reference.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
+
+ if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+ best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME)
+ continue;
+
+ mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+ } else {
+ if (ref_frame != INTRA_FRAME)
+ mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ if (sf->adaptive_mode_search)
+ if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
+ continue;
+
+ if (this_mode != DC_PRED) {
+ // Disable intra modes other than DC_PRED for blocks with low variance
+ // Threshold for intra skipping based on source variance
+ // TODO(debargha): Specialize the threshold for super block sizes
+ const unsigned int skip_intra_var_thresh = 64;
+ if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+ x->source_variance < skip_intra_var_thresh)
+ continue;
+ // Only search the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+ if (best_mode_index >= 0 && best_mbmode.ref_frame[0] > INTRA_FRAME)
+ continue;
+ }
+ if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(this_mode, best_intra_mode)) continue;
+ }
+ }
+#if CONFIG_GLOBAL_MOTION
+ } else if (cm->global_motion[ref_frame].wmtype == IDENTITY &&
+ (!comp_pred ||
+ cm->global_motion[second_ref_frame].wmtype == IDENTITY)) {
+#else // CONFIG_GLOBAL_MOTION
+ } else {
+#endif // CONFIG_GLOBAL_MOTION
+ const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame };
+ if (!check_best_zero_mv(cpi, mbmi_ext->mode_context,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ frame_mv, this_mode, ref_frames, bsize, -1,
+ mi_row, mi_col))
+ continue;
+ }
+
+ mbmi->mode = this_mode;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = ref_frame;
+ mbmi->ref_frame[1] = second_ref_frame;
+#if CONFIG_PALETTE
+ pmi->palette_size[0] = 0;
+ pmi->palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ // Evaluate all sub-pel filters irrespective of whether we can use
+ // them for this frame.
+
+ set_default_interp_filters(mbmi, cm->interp_filter);
+
+ mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+#if CONFIG_EXT_INTER
+ mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+#endif // CONFIG_EXT_INTER
+
+ if (ref_frame == INTRA_FRAME) {
+ RD_STATS rd_stats_y;
+ TX_SIZE uv_tx;
+ struct macroblockd_plane *const pd = &xd->plane[1];
+#if CONFIG_EXT_INTRA
+ is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize);
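+      // For directional modes, estimate the dominant gradient angles once
+      // per block and skip directions the estimate rules out; then search
+      // the angle delta for the surviving modes.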
+ if (is_directional_mode) {
+ int rate_dummy;
+ int64_t model_rd = INT64_MAX;
+ if (!angle_stats_ready) {
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *src = x->plane[0].src.buf;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ highbd_angle_estimation(src, src_stride, rows, cols,
+ directional_mode_skip_mask);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ angle_estimation(src, src_stride, rows, cols,
+ directional_mode_skip_mask);
+ angle_stats_ready = 1;
+ }
+ if (directional_mode_skip_mask[mbmi->mode]) continue;
+ rd_stats_y.rate = INT_MAX;
+ rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rd_stats_y, bsize,
+ intra_mode_cost[mbmi->mode], best_rd,
+ &model_rd);
+ } else {
+ mbmi->angle_delta[0] = 0;
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
+ }
+#else
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
+#endif // CONFIG_EXT_INTRA
+ rate_y = rd_stats_y.rate;
+ distortion_y = rd_stats_y.dist;
+ skippable = rd_stats_y.skip;
+
+ if (rate_y == INT_MAX) continue;
+
+#if CONFIG_FILTER_INTRA
+ if (mbmi->mode == DC_PRED) dc_skipped = 0;
+#endif // CONFIG_FILTER_INTRA
+
+ uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][pd->subsampling_x]
+ [pd->subsampling_y];
+ if (rate_uv_intra[uv_tx] == INT_MAX) {
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+ &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
+ &skip_uvs[uv_tx], &mode_uv[uv_tx]);
+#if CONFIG_PALETTE
+ if (try_palette) pmi_uv[uv_tx] = *pmi;
+#endif // CONFIG_PALETTE
+
+#if CONFIG_EXT_INTRA
+ uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
+#endif // CONFIG_FILTER_INTRA
+ }
+
+ rate_uv = rate_uv_tokenonly[uv_tx];
+ distortion_uv = dist_uvs[uv_tx];
+ skippable = skippable && skip_uvs[uv_tx];
+ mbmi->uv_mode = mode_uv[uv_tx];
+#if CONFIG_PALETTE
+ if (try_palette) {
+ pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
+ if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
+ }
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_CB4X4
+ rate2 = rate_y + intra_mode_cost[mbmi->mode];
+ if (!x->skip_chroma_rd)
+ rate2 += rate_uv + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+#else
+ rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
+ cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+#endif // CONFIG_CB4X4
+
+#if CONFIG_PALETTE
+ if (try_palette && mbmi->mode == DC_PRED) {
+ rate2 += av1_cost_bit(
+ av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
+ }
+#endif // CONFIG_PALETTE
+
+ if (!xd->lossless[mbmi->segment_id] && bsize >= BLOCK_8X8) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
+ }
+#if CONFIG_EXT_INTRA
+ if (is_directional_mode) {
+#if CONFIG_INTRA_INTERP
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ const int p_angle =
+ mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle))
+ rate2 += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
+#endif // CONFIG_INTRA_INTERP
+ rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
+ }
+ if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+ rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
+ }
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (mbmi->mode == DC_PRED) {
+ rate2 +=
+ av1_cost_bit(cm->fc->filter_intra_probs[0],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
+ rate2 += write_uniform_cost(
+ FILTER_INTRA_MODES,
+ mbmi->filter_intra_mode_info.filter_intra_mode[0]);
+ }
+ }
+ if (mbmi->uv_mode == DC_PRED) {
+ rate2 +=
+ av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
+ rate2 += write_uniform_cost(
+ FILTER_INTRA_MODES,
+ mbmi->filter_intra_mode_info.filter_intra_mode[1]);
+ }
+#endif // CONFIG_FILTER_INTRA
+ if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+ rate2 += intra_cost_penalty;
+ distortion2 = distortion_y + distortion_uv;
+ } else {
+#if CONFIG_REF_MV
+ int_mv backup_ref_mv[2];
+
+#if !SUB8X8_COMP_REF
+ if (bsize < BLOCK_8X8 && mbmi->ref_frame[1] > INTRA_FRAME) continue;
+#endif // !SUB8X8_COMP_REF
+
+ backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0];
+ if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0];
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (second_ref_frame == INTRA_FRAME) {
+ if (best_single_inter_ref != ref_frame) continue;
+ mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode];
+// TODO(debargha|geza.lore):
+// Should we use ext_intra modes for interintra?
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[0] = 0;
+ mbmi->angle_delta[1] = 0;
+#if CONFIG_INTRA_INTERP
+ mbmi->intra_filter = INTRA_FILTER_LINEAR;
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ }
+#endif // CONFIG_EXT_INTER
+#if CONFIG_REF_MV
+ mbmi->ref_mv_idx = 0;
+ ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+
+#if CONFIG_EXT_INTER
+ if (comp_pred) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+ int ref_mv_idx = 0;
+ // Special case: NEAR_NEWMV and NEW_NEARMV modes use
+ // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
+ // mbmi->ref_mv_idx (like NEWMV)
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
+ ref_mv_idx = 1;
+
+ if (compound_ref0_mode(mbmi->mode) == NEWMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
+ }
+ if (compound_ref1_mode(mbmi->mode) == NEWMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
+ }
+ }
+ } else {
+#endif // CONFIG_EXT_INTER
+ if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+ int ref;
+ for (ref = 0; ref < 1 + comp_pred; ++ref) {
+ int_mv this_mv =
+ (ref == 0) ? mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv
+ : mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
+ }
+ }
+#if CONFIG_EXT_INTER
+ }
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+ {
+ RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
+ av1_init_rd_stats(&rd_stats);
+ rd_stats.rate = rate2;
+
+ // Point to variables that are maintained between loop iterations
+ args.single_newmv = single_newmv;
+#if CONFIG_EXT_INTER
+ args.single_newmv_rate = single_newmv_rate;
+ args.compmode_interintra_cost = &compmode_interintra_cost;
+ args.compmode_interinter_cost = &compmode_interinter_cost;
+ args.modelled_rd = modelled_rd;
+#endif // CONFIG_EXT_INTER
+ this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, &disable_skip, frame_mv,
+ mi_row, mi_col, &args, best_rd);
+// Prevent pointers from escaping local scope
+#if CONFIG_EXT_INTER
+ args.compmode_interintra_cost = NULL;
+ args.compmode_interinter_cost = NULL;
+#endif // CONFIG_EXT_INTER
+
+ rate2 = rd_stats.rate;
+ skippable = rd_stats.skip;
+ distortion2 = rd_stats.dist;
+ total_sse = rd_stats.sse;
+ rate_y = rd_stats_y.rate;
+ rate_uv = rd_stats_uv.rate;
+ }
+
+#if CONFIG_REF_MV
+// TODO(jingning): This needs some refactoring to improve code quality
+// and reduce redundant steps.
+#if CONFIG_EXT_INTER
+ if ((have_nearmv_in_inter_mode(mbmi->mode) &&
+ mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
+ ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
+ mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
+#else
+ if ((mbmi->mode == NEARMV &&
+ mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
+ (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
+#endif
+ int_mv backup_mv = frame_mv[NEARMV][ref_frame];
+ MB_MODE_INFO backup_mbmi = *mbmi;
+ int backup_skip = x->skip;
+ int64_t tmp_ref_rd = this_rd;
+ int ref_idx;
+
+// TODO(jingning): This should be deprecated shortly.
+#if CONFIG_EXT_INTER
+ int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+#else
+ int idx_offset = (mbmi->mode == NEARMV) ? 1 : 0;
+#endif // CONFIG_EXT_INTER
+ int ref_set =
+ AOMMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
+
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx_offset);
+      // Back up the NEWMV frame MVs; they are restored after the DRL loop.
+ int_mv backup_fmv[2];
+ backup_fmv[0] = frame_mv[NEWMV][ref_frame];
+ if (comp_pred) backup_fmv[1] = frame_mv[NEWMV][second_ref_frame];
+
+ rate2 += (rate2 < INT_MAX ? cpi->drl_mode_cost0[drl_ctx][0] : 0);
+
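+      // Fold the skip-flag signaling into the reference RD value, i.e. take
+      // the cheaper of RDCOST(rate + no-skip bit, distortion) and
+      // RDCOST(rate - coefficient rate + skip bit, total SSE).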
+ if (this_rd < INT64_MAX) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse))
+ tmp_ref_rd =
+ RDCOST(x->rdmult, x->rddiv,
+ rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
+ distortion2);
+ else
+ tmp_ref_rd =
+ RDCOST(x->rdmult, x->rddiv,
+ rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
+ rate_y - rate_uv,
+ total_sse);
+ }
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip_drl[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif // CONFIG_VAR_TX
+
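+      // Re-evaluate the inter mode for each additional reference MV in the
+      // dynamic reference list (DRL), keeping whichever ref_mv_idx gives the
+      // lower RD cost.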
+ for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) {
+ int64_t tmp_alt_rd = INT64_MAX;
+ int dummy_disable_skip = 0;
+ int ref;
+ int_mv cur_mv;
+ RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv;
+#if CONFIG_EXT_INTER
+ int tmp_compmode_interintra_cost = 0;
+ int tmp_compmode_interinter_cost = 0;
+#endif // CONFIG_EXT_INTER
+
+ av1_invalid_rd_stats(&tmp_rd_stats);
+ x->skip = 0;
+
+ mbmi->ref_mv_idx = 1 + ref_idx;
+
+#if CONFIG_EXT_INTER
+ if (comp_pred) {
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ // Special case: NEAR_NEWMV and NEW_NEARMV modes use
+ // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
+ // mbmi->ref_mv_idx (like NEWMV)
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
+ ref_mv_idx = 1 + mbmi->ref_mv_idx;
+
+ if (compound_ref0_mode(mbmi->mode) == NEWMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
+ } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
+ }
+
+ if (compound_ref1_mode(mbmi->mode) == NEWMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
+ } else if (compound_ref1_mode(mbmi->mode) == NEARESTMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
+ }
+ } else {
+#endif // CONFIG_EXT_INTER
+ for (ref = 0; ref < 1 + comp_pred; ++ref) {
+ int_mv this_mv =
+ (ref == 0)
+ ? mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+ .this_mv
+ : mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+ .comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
+ }
+#if CONFIG_EXT_INTER
+ }
+#endif
+
+ cur_mv =
+ mbmi_ext->ref_mv_stack[ref_frame][mbmi->ref_mv_idx + idx_offset]
+ .this_mv;
+ clamp_mv2(&cur_mv.as_mv, xd);
+
+ if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) {
+ int_mv dummy_single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
+#if CONFIG_EXT_INTER
+ int dummy_single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
+#endif // CONFIG_EXT_INTER
+
+ frame_mv[NEARMV][ref_frame] = cur_mv;
+ av1_init_rd_stats(&tmp_rd_stats);
+
+ // Point to variables that are not maintained between iterations
+ args.single_newmv = dummy_single_newmv;
+#if CONFIG_EXT_INTER
+ args.single_newmv_rate = dummy_single_newmv_rate;
+ args.compmode_interintra_cost = &tmp_compmode_interintra_cost;
+ args.compmode_interinter_cost = &tmp_compmode_interinter_cost;
+ args.modelled_rd = NULL;
+#endif // CONFIG_EXT_INTER
+ tmp_alt_rd = handle_inter_mode(
+ cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv,
+ &dummy_disable_skip, frame_mv, mi_row, mi_col, &args, best_rd);
+ // Prevent pointers from escaping local scope
+ args.single_newmv = NULL;
+#if CONFIG_EXT_INTER
+ args.single_newmv_rate = NULL;
+ args.compmode_interintra_cost = NULL;
+ args.compmode_interinter_cost = NULL;
+#endif // CONFIG_EXT_INTER
+ }
+
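+        // Add the DRL index signaling cost: one "continue" bit per candidate
+        // skipped, plus a "stop" bit when more candidates remain.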
+ for (i = 0; i < mbmi->ref_mv_idx; ++i) {
+ uint8_t drl1_ctx = 0;
+ drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
+ i + idx_offset);
+ tmp_rd_stats.rate +=
+ (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][1]
+ : 0);
+ }
+
+ if (mbmi_ext->ref_mv_count[ref_frame_type] >
+ mbmi->ref_mv_idx + idx_offset + 1 &&
+ ref_idx < ref_set - 1) {
+ uint8_t drl1_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
+ mbmi->ref_mv_idx + idx_offset);
+ tmp_rd_stats.rate +=
+ (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][0]
+ : 0);
+ }
+
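+        // Convert the candidate's rate/distortion into an RD value; when the
+        // motion-variation experiments are disabled, the skip-flag trade-off
+        // is folded in here as well.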
+ if (tmp_alt_rd < INT64_MAX) {
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ tmp_alt_rd = RDCOST(x->rdmult, x->rddiv, tmp_rd_stats.rate,
+ tmp_rd_stats.dist);
+#else
+ if (RDCOST(x->rdmult, x->rddiv,
+ tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate,
+ tmp_rd_stats.dist) <
+ RDCOST(x->rdmult, x->rddiv, 0, tmp_rd_stats.sse))
+ tmp_alt_rd =
+ RDCOST(x->rdmult, x->rddiv,
+ tmp_rd_stats.rate +
+ av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
+ tmp_rd_stats.dist);
+ else
+ tmp_alt_rd =
+ RDCOST(x->rdmult, x->rddiv,
+ tmp_rd_stats.rate +
+ av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
+ tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate,
+ tmp_rd_stats.sse);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+
+ if (tmp_ref_rd > tmp_alt_rd) {
+ rate2 = tmp_rd_stats.rate;
+ disable_skip = dummy_disable_skip;
+ distortion2 = tmp_rd_stats.dist;
+ skippable = tmp_rd_stats.skip;
+ rate_y = tmp_rd_stats_y.rate;
+ rate_uv = tmp_rd_stats_uv.rate;
+ total_sse = tmp_rd_stats.sse;
+ this_rd = tmp_alt_rd;
+ tmp_ref_rd = tmp_alt_rd;
+ backup_mbmi = *mbmi;
+ backup_skip = x->skip;
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip_drl[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif // CONFIG_VAR_TX
+#if CONFIG_EXT_INTER
+ compmode_interintra_cost = tmp_compmode_interintra_cost;
+ compmode_interinter_cost = tmp_compmode_interinter_cost;
+#endif // CONFIG_EXT_INTER
+ } else {
+ *mbmi = backup_mbmi;
+ x->skip = backup_skip;
+ }
+ }
+
+ frame_mv[NEARMV][ref_frame] = backup_mv;
+ frame_mv[NEWMV][ref_frame] = backup_fmv[0];
+ if (comp_pred) frame_mv[NEWMV][second_ref_frame] = backup_fmv[1];
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip[i], x->blk_skip_drl[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif // CONFIG_VAR_TX
+ }
+ mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0];
+ if (comp_pred) mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1];
+#endif // CONFIG_REF_MV
+
+ if (this_rd == INT64_MAX) continue;
+
+#if SUB8X8_COMP_REF
+ compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
+#else
+ if (mbmi->sb_type >= BLOCK_8X8)
+ compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
+#endif // SUB8X8_COMP_REF
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost;
+ }
+
+#if CONFIG_EXT_INTER
+ rate2 += compmode_interintra_cost;
+ if (cm->reference_mode != SINGLE_REFERENCE && comp_pred)
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (mbmi->motion_mode == SIMPLE_TRANSLATION)
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ rate2 += compmode_interinter_cost;
+#endif // CONFIG_EXT_INTER
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ if (comp_pred) {
+ rate2 += ref_costs_comp[ref_frame];
+#if CONFIG_EXT_REFS
+ rate2 += ref_costs_comp[second_ref_frame];
+#endif // CONFIG_EXT_REFS
+ } else {
+ rate2 += ref_costs_single[ref_frame];
+ }
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (ref_frame == INTRA_FRAME) {
+#else
+ if (!disable_skip) {
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (skippable) {
+ // Back out the coefficient coding costs
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ // Cost the skip mb case
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
+#if CONFIG_REF_MV
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + rate_skip0,
+ distortion2) <
+ RDCOST(x->rdmult, x->rddiv, rate_skip1, total_sse)) {
+#else
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+#endif // CONFIG_REF_MV
+ // Add in the cost of the no skip flag.
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ } else {
+ // FIXME(rbultje) make this work for splitmv also
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ distortion2 = total_sse;
+ assert(total_sse >= 0);
+ rate2 -= (rate_y + rate_uv);
+ this_skip2 = 1;
+ rate_y = 0;
+ rate_uv = 0;
+ }
+ } else {
+ // Add in the cost of the no skip flag.
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+
+ // Calculate the final RD estimate for this mode.
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ } else {
+ this_skip2 = mbmi->skip;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ if (this_skip2) {
+ rate_y = 0;
+ rate_uv = 0;
+ }
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ // Keep record of best intra rd
+ if (this_rd < best_intra_rd) {
+ best_intra_rd = this_rd;
+ best_intra_mode = mbmi->mode;
+ }
+#if CONFIG_EXT_INTER
+ } else if (second_ref_frame == NONE_FRAME) {
+ if (this_rd < best_single_inter_rd) {
+ best_single_inter_rd = this_rd;
+ best_single_inter_ref = mbmi->ref_frame[0];
+ }
+#endif // CONFIG_EXT_INTER
+ }
+
+ if (!disable_skip && ref_frame == INTRA_FRAME) {
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
+ }
+
+    // Did this mode help, i.e. is it the new best mode so far?
+ if (this_rd < best_rd || x->skip) {
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ best_mode_index = mode_index;
+
+ if (ref_frame == INTRA_FRAME) {
+ /* required for left and above block mv */
+ mbmi->mv[0].as_int = 0;
+ } else {
+ best_pred_sse = x->pred_sse[ref_frame];
+ }
+
+ rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ if (x->skip)
+ *returnrate_nocoef = rate2;
+ else
+ *returnrate_nocoef = rate2 - rate_y - rate_uv;
+ *returnrate_nocoef -= av1_cost_bit(
+ av1_get_skip_prob(cm, xd), disable_skip || skippable || this_skip2);
+ *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
+ mbmi->ref_frame[0] != INTRA_FRAME);
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ MODE_INFO *const mi = xd->mi[0];
+ const MOTION_MODE motion_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ mi);
+ if (motion_allowed == WARPED_CAUSAL)
+ *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode];
+ else if (motion_allowed == OBMC_CAUSAL)
+ *returnrate_nocoef -=
+ cpi->motion_mode_cost1[bsize][mbmi->motion_mode];
+#else
+ *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode];
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#endif // CONFIG_SUPERTX
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ best_rd = this_rd;
+ best_mbmode = *mbmi;
+ best_skip2 = this_skip2;
+ best_mode_skippable = skippable;
+ best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd),
+ this_skip2 || skippable);
+ best_rate_uv = rate_uv;
+
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(ctx->blk_skip[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif // CONFIG_VAR_TX
+ }
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && ref_frame != INTRA_FRAME) {
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+ if (!comp_pred) {
+ if (single_rd < best_pred_rd[SINGLE_REFERENCE])
+ best_pred_rd[SINGLE_REFERENCE] = single_rd;
+ } else {
+ if (single_rd < best_pred_rd[COMPOUND_REFERENCE])
+ best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+ }
+ if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+ best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+ }
+
+ if (x->skip && !comp_pred) break;
+ }
+
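+  // If the winning mode was found with the fast (default) transform-type
+  // search, redo the transform search for it over the full transform set and
+  // keep the result if it lowers the RD cost.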
+ if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
+ ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
+ is_inter_mode(best_mbmode.mode)) ||
+ (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
+ !is_inter_mode(best_mbmode.mode)))) {
+ int skip_blk = 0;
+ RD_STATS rd_stats_y, rd_stats_uv;
+
+ x->use_default_inter_tx_type = 0;
+ x->use_default_intra_tx_type = 0;
+
+ *mbmi = best_mbmode;
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (has_second_ref(mbmi))
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ if (is_inter_mode(mbmi->mode)) {
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize);
+#if CONFIG_MOTION_VAR
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+ av1_build_obmc_inter_prediction(
+ cm, xd, mi_row, mi_col, args.above_pred_buf, args.above_pred_stride,
+ args.left_pred_buf, args.left_pred_stride);
+ }
+#endif // CONFIG_MOTION_VAR
+ av1_subtract_plane(x, bsize, 0);
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ } else {
+ int idx, idy;
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
+ memset(x->blk_skip[0], rd_stats_y.skip,
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+ }
+
+ inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+#else
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+#endif // CONFIG_VAR_TX
+ } else {
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate,
+ (rd_stats_y.dist + rd_stats_uv.dist)) >
+ RDCOST(x->rdmult, x->rddiv, 0, (rd_stats_y.sse + rd_stats_uv.sse))) {
+ skip_blk = 1;
+ rd_stats_y.rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ } else {
+ skip_blk = 0;
+ rd_stats_y.rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, best_rate_y + best_rate_uv, rd_cost->dist) >
+ RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate,
+ (rd_stats_y.dist + rd_stats_uv.dist))) {
+#if CONFIG_VAR_TX
+ int idx, idy;
+#endif // CONFIG_VAR_TX
+ best_mbmode.tx_type = mbmi->tx_type;
+ best_mbmode.tx_size = mbmi->tx_size;
+#if CONFIG_VAR_TX
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ best_mbmode.inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
+
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(ctx->blk_skip[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+
+ best_mbmode.min_tx_size = mbmi->min_tx_size;
+#endif // CONFIG_VAR_TX
+ rd_cost->rate +=
+ (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv);
+ rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
+ rd_cost->rdcost =
+ RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+ best_skip2 = skip_blk;
+ }
+ }
+
+#if CONFIG_PALETTE
+ // Only try palette mode when the best mode so far is an intra mode.
+ if (try_palette && !is_inter_mode(best_mbmode.mode)) {
+ int rate2 = 0;
+#if CONFIG_SUPERTX
+ int best_rate_nocoef;
+#endif // CONFIG_SUPERTX
+ int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd,
+ best_model_rd_palette = INT64_MAX;
+ int skippable = 0, rate_overhead_palette = 0;
+ RD_STATS rd_stats_y;
+ TX_SIZE uv_tx;
+ uint8_t *const best_palette_color_map =
+ x->palette_buffer->best_palette_color_map;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ MB_MODE_INFO best_mbmi_palette = best_mbmode;
+
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ rate_overhead_palette = rd_pick_palette_intra_sby(
+ cpi, x, bsize, palette_ctx, intra_mode_cost[DC_PRED],
+ &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
+ &best_model_rd_palette, NULL, NULL, NULL, NULL);
+ if (pmi->palette_size[0] == 0) goto PALETTE_EXIT;
+ memcpy(color_map, best_palette_color_map,
+ rows * cols * sizeof(best_palette_color_map[0]));
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
+ if (rd_stats_y.rate == INT_MAX) goto PALETTE_EXIT;
+ uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
+ [xd->plane[1].subsampling_y];
+ if (rate_uv_intra[uv_tx] == INT_MAX) {
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+ &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
+ &skip_uvs[uv_tx], &mode_uv[uv_tx]);
+ pmi_uv[uv_tx] = *pmi;
+#if CONFIG_EXT_INTRA
+ uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
+#endif // CONFIG_FILTER_INTRA
+ }
+ mbmi->uv_mode = mode_uv[uv_tx];
+ pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+ if (pmi->palette_size[1] > 0) {
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
+ if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
+ }
+#endif // CONFIG_FILTER_INTRA
+ skippable = rd_stats_y.skip && skip_uvs[uv_tx];
+ distortion2 = rd_stats_y.dist + dist_uvs[uv_tx];
+ rate2 = rd_stats_y.rate + rate_overhead_palette + rate_uv_intra[uv_tx];
+ rate2 += ref_costs_single[INTRA_FRAME];
+
+ if (skippable) {
+ rate2 -= (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]);
+#if CONFIG_SUPERTX
+ best_rate_nocoef = rate2;
+#endif // CONFIG_SUPERTX
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ } else {
+#if CONFIG_SUPERTX
+ best_rate_nocoef = rate2 - (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]);
+#endif // CONFIG_SUPERTX
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ if (this_rd < best_rd) {
+ best_mode_index = 3;
+ mbmi->mv[0].as_int = 0;
+ rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = best_rate_nocoef;
+#endif // CONFIG_SUPERTX
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ best_rd = this_rd;
+ best_mbmode = *mbmi;
+ best_skip2 = 0;
+ best_mode_skippable = skippable;
+ }
+ }
+PALETTE_EXIT:
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ // TODO(huisu): filter-intra is turned off in lossless mode for now to
+ // avoid a unit test failure
+ if (!xd->lossless[mbmi->segment_id] &&
+#if CONFIG_PALETTE
+ pmi->palette_size[0] == 0 &&
+#endif // CONFIG_PALETTE
+ !dc_skipped && best_mode_index >= 0 &&
+ best_intra_rd < (best_rd + (best_rd >> 3))) {
+ pick_filter_intra_interframe(
+ cpi, x, ctx, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly,
+ dist_uvs, skip_uvs, mode_uv, filter_intra_mode_info_uv,
+#if CONFIG_EXT_INTRA
+ uv_angle_delta,
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ pmi_uv, palette_ctx,
+#endif // CONFIG_PALETTE
+ 0, ref_costs_single, &best_rd, &best_intra_rd, &best_intra_mode,
+ &best_mode_index, &best_skip2, &best_mode_skippable,
+#if CONFIG_SUPERTX
+ returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ best_pred_rd, &best_mbmode, rd_cost);
+ }
+#endif // CONFIG_FILTER_INTRA
+
+  // The inter modes' rate costs are not calculated precisely in some cases,
+  // so NEWMV may be chosen when NEARESTMV, NEARMV, or ZEROMV would yield the
+  // same motion vectors. Check for those cases here and correct the mode
+  // decision.
+ if (best_mbmode.mode == NEWMV
+#if CONFIG_EXT_INTER
+ || best_mbmode.mode == NEW_NEWMV
+#endif // CONFIG_EXT_INTER
+ ) {
+ const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
+ best_mbmode.ref_frame[1] };
+ int comp_pred_mode = refs[1] > INTRA_FRAME;
+ int_mv zeromv[2];
+#if CONFIG_REF_MV
+ const uint8_t rf_type = av1_ref_frame_type(best_mbmode.ref_frame);
+#endif // CONFIG_REF_MV
+#if CONFIG_GLOBAL_MOTION
+ zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
+ cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, 0)
+ .as_int;
+ zeromv[1].as_int = comp_pred_mode
+ ? gm_get_motion_vector(&cm->global_motion[refs[1]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int
+ : 0;
+#else
+ zeromv[0].as_int = 0;
+ zeromv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_REF_MV
+ if (!comp_pred_mode) {
+ int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
+ ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
+ : INT_MAX;
+
+ for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
+ int_mv cur_mv = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
+ if (cur_mv.as_int == best_mbmode.mv[0].as_int) {
+ best_mbmode.mode = NEARMV;
+ best_mbmode.ref_mv_idx = i;
+ }
+ }
+
+ if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int)
+ best_mbmode.mode = NEARESTMV;
+ else if (best_mbmode.mv[0].as_int == zeromv[0].as_int)
+ best_mbmode.mode = ZEROMV;
+ } else {
+ int_mv nearestmv[2];
+ int_mv nearmv[2];
+
+#if CONFIG_EXT_INTER
+ if (mbmi_ext->ref_mv_count[rf_type] > 1) {
+ nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv;
+ nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv;
+ } else {
+ nearmv[0] = frame_mv[NEARMV][refs[0]];
+ nearmv[1] = frame_mv[NEARMV][refs[1]];
+ }
+#else
+ int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
+ ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
+ : INT_MAX;
+
+ for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
+ nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
+ nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
+
+ if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearmv[1].as_int == best_mbmode.mv[1].as_int) {
+ best_mbmode.mode = NEARMV;
+ best_mbmode.ref_mv_idx = i;
+ }
+ }
+#endif // CONFIG_EXT_INTER
+ if (mbmi_ext->ref_mv_count[rf_type] >= 1) {
+ nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv;
+ nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv;
+ } else {
+ nearestmv[0] = frame_mv[NEARESTMV][refs[0]];
+ nearestmv[1] = frame_mv[NEARESTMV][refs[1]];
+ }
+
+ if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearestmv[1].as_int == best_mbmode.mv[1].as_int) {
+#if CONFIG_EXT_INTER
+ best_mbmode.mode = NEAREST_NEARESTMV;
+ } else {
+ int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
+ ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
+ : INT_MAX;
+
+ for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
+ nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
+ nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
+
+          // Try switching to one of the NEAREST/NEAR compound modes first
+ if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearmv[1].as_int == best_mbmode.mv[1].as_int) {
+ best_mbmode.mode = NEAREST_NEARMV;
+ best_mbmode.ref_mv_idx = i;
+ } else if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearestmv[1].as_int == best_mbmode.mv[1].as_int) {
+ best_mbmode.mode = NEAR_NEARESTMV;
+ best_mbmode.ref_mv_idx = i;
+ } else if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearmv[1].as_int == best_mbmode.mv[1].as_int) {
+ best_mbmode.mode = NEAR_NEARMV;
+ best_mbmode.ref_mv_idx = i;
+ }
+ }
+
+ if (best_mbmode.mode == NEW_NEWMV &&
+ best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+ best_mbmode.mv[1].as_int == zeromv[1].as_int)
+ best_mbmode.mode = ZERO_ZEROMV;
+ }
+#else
+ best_mbmode.mode = NEARESTMV;
+ } else if (best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+ best_mbmode.mv[1].as_int == zeromv[1].as_int) {
+ best_mbmode.mode = ZEROMV;
+ }
+#endif // CONFIG_EXT_INTER
+ }
+#else
+#if CONFIG_EXT_INTER
+ if (!comp_pred_mode) {
+#endif // CONFIG_EXT_INTER
+ if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode &&
+ frame_mv[NEARESTMV][refs[1]].as_int == best_mbmode.mv[1].as_int) ||
+ !comp_pred_mode))
+ best_mbmode.mode = NEARESTMV;
+ else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode &&
+ frame_mv[NEARMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int) ||
+ !comp_pred_mode))
+ best_mbmode.mode = NEARMV;
+ else if (best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+ ((comp_pred_mode &&
+ best_mbmode.mv[1].as_int == zeromv[1].as_int) ||
+ !comp_pred_mode))
+ best_mbmode.mode = ZEROMV;
+#if CONFIG_EXT_INTER
+ } else {
+#if CONFIG_GLOBAL_MOTION
+ zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int;
+ zeromv[1].as_int = comp_pred_mode
+ ? gm_get_motion_vector(&cm->global_motion[refs[1]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int
+ : 0;
+#else
+ zeromv[0].as_int = 0;
+ zeromv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ if (frame_mv[NEAREST_NEARESTMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAREST_NEARESTMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAREST_NEARESTMV;
+ else if (frame_mv[NEAREST_NEARMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAREST_NEARMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAREST_NEARMV;
+ else if (frame_mv[NEAR_NEARESTMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAR_NEARESTMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAR_NEARESTMV;
+ else if (frame_mv[NEAR_NEARMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAR_NEARMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAR_NEARMV;
+ else if (best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+ best_mbmode.mv[1].as_int == zeromv[1].as_int)
+ best_mbmode.mode = ZERO_ZEROMV;
+ }
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+ }
+
+#if CONFIG_REF_MV
+  // Make sure ref_mv_idx is only nonzero when the selected mode actually
+  // supports a reference MV index.
+ if (best_mbmode.ref_mv_idx != 0 &&
+#if CONFIG_EXT_INTER
+ !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(best_mbmode.mode))) {
+#else
+ !(best_mbmode.mode == NEARMV || best_mbmode.mode == NEWMV)) {
+#endif
+ best_mbmode.ref_mv_idx = 0;
+ }
+
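+  // When the reference MV context flags the all-zero case, a best MV that
+  // matches the (global-motion) zero MV is relabeled as ZEROMV.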
+ {
+ int8_t ref_frame_type = av1_ref_frame_type(best_mbmode.ref_frame);
+ int16_t mode_ctx = mbmi_ext->mode_context[ref_frame_type];
+ if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
+ int_mv zeromv[2];
+#if CONFIG_GLOBAL_MOTION
+ const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
+ best_mbmode.ref_frame[1] };
+ zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int;
+ zeromv[1].as_int = (refs[1] != NONE_FRAME)
+ ? gm_get_motion_vector(&cm->global_motion[refs[1]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int
+ : 0;
+ lower_mv_precision(&zeromv[0].as_mv, cm->allow_high_precision_mv);
+ lower_mv_precision(&zeromv[1].as_mv, cm->allow_high_precision_mv);
+#else
+ zeromv[0].as_int = zeromv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ if (best_mbmode.ref_frame[0] > INTRA_FRAME &&
+ best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+#if CONFIG_EXT_INTER
+ (best_mbmode.ref_frame[1] <= INTRA_FRAME)
+#else
+ (best_mbmode.ref_frame[1] == NONE_FRAME ||
+ best_mbmode.mv[1].as_int == zeromv[1].as_int)
+#endif // CONFIG_EXT_INTER
+ ) {
+ best_mbmode.mode = ZEROMV;
+ }
+ }
+ }
+#endif // CONFIG_REF_MV
+
+ if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+#if CONFIG_DUAL_FILTER
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[0]) ||
+ !is_inter_block(&best_mbmode));
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[1]) ||
+ !is_inter_block(&best_mbmode));
+ if (best_mbmode.ref_frame[1] > INTRA_FRAME) {
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[2]) ||
+ !is_inter_block(&best_mbmode));
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[3]) ||
+ !is_inter_block(&best_mbmode));
+ }
+#else
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter) ||
+ !is_inter_block(&best_mbmode));
+#endif // CONFIG_DUAL_FILTER
+
+ if (!cpi->rc.is_src_frame_alt_ref)
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize, best_mode_index);
+
+ // macroblock modes
+ *mbmi = best_mbmode;
+ x->skip |= best_skip2;
+
+// Note: this section is needed since the mode may have been forced to
+// ZEROMV by the all-zero mode handling of ref-mv.
+#if CONFIG_GLOBAL_MOTION
+ if (mbmi->mode == ZEROMV
+#if CONFIG_EXT_INTER
+ || mbmi->mode == ZERO_ZEROMV
+#endif // CONFIG_EXT_INTER
+ ) {
+#if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR
+ // Correct the motion mode for ZEROMV
+ const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
+#if SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // SEPARATE_GLOBAL_MOTION
+ xd->mi[0]);
+ if (mbmi->motion_mode > last_motion_mode_allowed)
+ mbmi->motion_mode = last_motion_mode_allowed;
+#endif // CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR
+
+ // Correct the interpolation filter for ZEROMV
+ if (is_nontrans_global_motion(xd)) {
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+ mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#else
+ mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ }
+ }
+#endif // CONFIG_GLOBAL_MOTION
+
+#if CONFIG_REF_MV
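+  // Record the predicted MV for each reference: the selected MV itself for
+  // non-NEWMV modes, otherwise the reference MV the NEWMV search started
+  // from.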
+ for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+ if (mbmi->mode != NEWMV)
+ mbmi->pred_mv[i].as_int = mbmi->mv[i].as_int;
+ else
+ mbmi->pred_mv[i].as_int = mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_int;
+ }
+#endif // CONFIG_REF_MV
+
+ for (i = 0; i < REFERENCE_MODES; ++i) {
+ if (best_pred_rd[i] == INT64_MAX)
+ best_pred_diff[i] = INT_MIN;
+ else
+ best_pred_diff[i] = best_rd - best_pred_rd[i];
+ }
+
+ x->skip |= best_mode_skippable;
+
+ assert(best_mode_index >= 0);
+
+ store_coding_context(x, ctx, best_mode_index, best_pred_diff,
+ best_mode_skippable);
+
+#if CONFIG_PALETTE
+ if (cm->allow_screen_content_tools && pmi->palette_size[1] > 0) {
+ restore_uv_color_map(cpi, x);
+ }
+#endif // CONFIG_PALETTE
+}
+
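+// Mode decision for blocks whose segment has the SEG_LVL_SKIP feature
+// enabled: the block is forced to ZEROMV on LAST_FRAME with no residual, so
+// only the interpolation filter and the signaling cost remain to be chosen.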
+void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
+ TileDataEnc *tile_data, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ unsigned char segment_id = mbmi->segment_id;
+ const int comp_pred = 0;
+ int i;
+ int64_t best_pred_diff[REFERENCE_MODES];
+ unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
+ unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
+ aom_prob comp_mode_p;
+ InterpFilter best_filter = SWITCHABLE;
+ int64_t this_rd = INT64_MAX;
+ int rate2 = 0;
+ const int64_t distortion2 = 0;
+ (void)mi_row;
+ (void)mi_col;
+
+ estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+ &comp_mode_p);
+
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = LAST_FRAME; i < TOTAL_REFS_PER_FRAME; ++i)
+ x->pred_mv_sad[i] = INT_MAX;
+
+ rd_cost->rate = INT_MAX;
+
+ assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ mbmi->mode = ZEROMV;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+#if CONFIG_GLOBAL_MOTION
+ mbmi->mv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
+ cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+ 0)
+ .as_int;
+#else // CONFIG_GLOBAL_MOTION
+ mbmi->mv[0].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ mbmi->tx_size = max_txsize_lookup[bsize];
+ x->skip = 1;
+
+#if CONFIG_REF_MV
+ mbmi->ref_mv_idx = 0;
+ mbmi->pred_mv[0].as_int = 0;
+#endif // CONFIG_REF_MV
+
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+#if CONFIG_MOTION_VAR
+ av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+#endif
+#if CONFIG_WARPED_MOTION
+ if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+ }
+#endif
+
+ set_default_interp_filters(mbmi, cm->interp_filter);
+
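+  // There is no residual to trade off against, so with switchable filters
+  // simply pick the one that is cheapest to signal in this context.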
+ if (cm->interp_filter != SWITCHABLE) {
+ best_filter = cm->interp_filter;
+ } else {
+ best_filter = EIGHTTAP_REGULAR;
+ if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd) &&
+ x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
+ int rs;
+ int best_rs = INT_MAX;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+#if CONFIG_DUAL_FILTER
+ int k;
+ for (k = 0; k < 4; ++k) mbmi->interp_filter[k] = i;
+#else
+ mbmi->interp_filter = i;
+#endif // CONFIG_DUAL_FILTER
+ rs = av1_get_switchable_rate(cpi, xd);
+ if (rs < best_rs) {
+ best_rs = rs;
+#if CONFIG_DUAL_FILTER
+ best_filter = mbmi->interp_filter[0];
+#else
+ best_filter = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ }
+ }
+ }
+ }
+// Set the appropriate filter
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = best_filter;
+#else
+ mbmi->interp_filter = best_filter;
+#endif // CONFIG_DUAL_FILTER
+ rate2 += av1_get_switchable_rate(cpi, xd);
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
+ rate2 += av1_cost_bit(comp_mode_p, comp_pred);
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ rate2 += ref_costs_single[LAST_FRAME];
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+
+ if (this_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+#if CONFIG_DUAL_FILTER
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == mbmi->interp_filter[0]));
+#else
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == mbmi->interp_filter));
+#endif // CONFIG_DUAL_FILTER
+
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+ cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
+
+ av1_zero(best_pred_diff);
+
+ store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, 0);
+}
+
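+// Mode search for sub-8x8 blocks. Each 4x4 unit carries its own prediction
+// (b_mode_info), so the loop below iterates over reference-frame
+// combinations (av1_ref_order) and picks the per-unit modes inside
+// rd_pick_inter_best_sub8x8_mode, rather than iterating over whole-block
+// modes.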
+void av1_rd_pick_inter_mode_sub8x8(const struct AV1_COMP *cpi,
+ TileDataEnc *tile_data, struct macroblock *x,
+ int mi_row, int mi_col,
+ struct RD_STATS *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RD_OPT *const rd_opt = &cpi->rd;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const struct segmentation *const seg = &cm->seg;
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+ unsigned char segment_id = mbmi->segment_id;
+ int comp_pred, i;
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+ struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE];
+ static const int flag_list[TOTAL_REFS_PER_FRAME] = {
+ 0,
+ AOM_LAST_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_GOLD_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_BWD_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_ALT_FLAG
+ };
+ int64_t best_rd = best_rd_so_far;
+ int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
+ int64_t best_pred_diff[REFERENCE_MODES];
+ int64_t best_pred_rd[REFERENCE_MODES];
+ MB_MODE_INFO best_mbmode;
+ int ref_index, best_ref_index = 0;
+ unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
+ unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
+ aom_prob comp_mode_p;
+#if CONFIG_DUAL_FILTER
+ InterpFilter tmp_best_filter[4] = { 0 };
+#else
+ InterpFilter tmp_best_filter = SWITCHABLE;
+#endif // CONFIG_DUAL_FILTER
+ int rate_uv_intra, rate_uv_tokenonly = INT_MAX;
+ int64_t dist_uv = INT64_MAX;
+ int skip_uv;
+ PREDICTION_MODE mode_uv = DC_PRED;
+ const int intra_cost_penalty = av1_get_intra_cost_penalty(
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ int_mv seg_mvs[4][TOTAL_REFS_PER_FRAME];
+ b_mode_info best_bmodes[4];
+ int best_skip2 = 0;
+ int ref_frame_skip_mask[2] = { 0 };
+ int internal_active_edge =
+ av1_active_edge_sb(cpi, mi_row, mi_col) && av1_internal_image_edge(cpi);
+#if CONFIG_PVQ
+ od_rollback_buffer pre_buf;
+
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+
+#if CONFIG_SUPERTX
+ best_rd_so_far = INT64_MAX;
+ best_rd = best_rd_so_far;
+ best_yrd = best_rd_so_far;
+#endif // CONFIG_SUPERTX
+ av1_zero(best_mbmode);
+
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+#if CONFIG_EXT_INTER
+ mbmi->interinter_compound_type = COMPOUND_AVERAGE;
+ mbmi->use_wedge_interintra = 0;
+#endif // CONFIG_EXT_INTER
+#if CONFIG_WARPED_MOTION
+ mbmi->num_proj_ref[0] = 0;
+ mbmi->num_proj_ref[1] = 0;
+#endif // CONFIG_WARPED_MOTION
+
+ for (i = 0; i < 4; i++) {
+ int j;
+ for (j = 0; j < TOTAL_REFS_PER_FRAME; j++)
+ seg_mvs[i][j].as_int = INVALID_MV;
+ }
+
+ estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+ &comp_mode_p);
+
+ for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX;
+ rate_uv_intra = INT_MAX;
+
+ rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ x->mbmi_ext->mode_context[ref_frame] = 0;
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+ setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+ } else {
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ }
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+ frame_mv[ZEROMV][ref_frame].as_int = 0;
+ }
+
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+
+ for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
+ int mode_excluded = 0;
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int compmode_cost = 0;
+ int rate2 = 0, rate_y = 0, rate_uv = 0;
+ int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+ int skippable = 0;
+ int this_skip2 = 0;
+    int64_t total_sse = INT64_MAX;
+
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+
+ ref_frame = av1_ref_order[ref_index].ref_frame[0];
+ second_ref_frame = av1_ref_order[ref_index].ref_frame[1];
+
+#if CONFIG_REF_MV
+ mbmi->ref_mv_idx = 0;
+#endif // CONFIG_REF_MV
+
+ // Look at the reference frame of the best mode so far and set the
+ // skip mask to look at a subset of the remaining modes.
+ if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) {
+ if (ref_index == 3) {
+ switch (best_mbmode.ref_frame[0]) {
+ case INTRA_FRAME: break;
+ case LAST_FRAME:
+ ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) |
+#if CONFIG_EXT_REFS
+ (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
+ (1 << BWDREF_FRAME) |
+#endif // CONFIG_EXT_REFS
+ (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#if CONFIG_EXT_REFS
+ case LAST2_FRAME:
+ ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST3_FRAME) |
+ (1 << GOLDEN_FRAME) |
+ (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+ case LAST3_FRAME:
+ ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST2_FRAME) |
+ (1 << GOLDEN_FRAME) |
+ (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#endif // CONFIG_EXT_REFS
+ case GOLDEN_FRAME:
+ ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+#if CONFIG_EXT_REFS
+ (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
+ (1 << BWDREF_FRAME) |
+#endif // CONFIG_EXT_REFS
+ (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#if CONFIG_EXT_REFS
+ case BWDREF_FRAME:
+ ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST2_FRAME) |
+ (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) |
+ (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= (1 << ALTREF_FRAME) | 0x01;
+ break;
+#endif // CONFIG_EXT_REFS
+ case ALTREF_FRAME:
+ ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+#if CONFIG_EXT_REFS
+ (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
+ (1 << BWDREF_FRAME) |
+#endif // CONFIG_EXT_REFS
+ (1 << GOLDEN_FRAME);
+#if CONFIG_EXT_REFS
+ ref_frame_skip_mask[1] |= (1 << BWDREF_FRAME) | 0x01;
+#endif // CONFIG_EXT_REFS
+ break;
+ case NONE_FRAME:
+ case TOTAL_REFS_PER_FRAME:
+ assert(0 && "Invalid Reference frame");
+ break;
+ }
+ }
+ }
+
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+ (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame))))
+ continue;
+
+ // Test best rd so far against threshold for trying this mode.
+ if (!internal_active_edge &&
+ rd_less_than_thresh(best_rd,
+ rd_opt->threshes[segment_id][bsize][ref_index],
+ tile_data->thresh_freq_fact[bsize][ref_index]))
+ continue;
+
+    // This is only used in the motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
+
+#if CONFIG_LOWDELAY_COMPOUND // Changes LL bitstream
+#if CONFIG_EXT_REFS
+ if (cpi->oxcf.pass == 0) {
+ // Complexity-compression trade-offs
+ // if (ref_frame == ALTREF_FRAME) continue;
+ // if (ref_frame == BWDREF_FRAME) continue;
+ if (second_ref_frame == ALTREF_FRAME) continue;
+ // if (second_ref_frame == BWDREF_FRAME) continue;
+ }
+#endif
+#endif
+ comp_pred = second_ref_frame > INTRA_FRAME;
+ if (comp_pred) {
+ if (!cpi->allow_comp_inter_inter) continue;
+ if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+      // Do not allow compound prediction if the segment level reference
+      // frame feature is in use, since in that case there can only be one
+      // reference.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
+
+ if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+ best_mbmode.ref_frame[0] == INTRA_FRAME)
+ continue;
+ }
+
+ // TODO(jingning, jkoleszar): scaling reference frame not supported for
+ // sub8x8 blocks.
+ if (ref_frame > INTRA_FRAME &&
+ av1_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+ continue;
+
+ if (second_ref_frame > INTRA_FRAME &&
+ av1_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
+ continue;
+
+ if (comp_pred)
+ mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+ else if (ref_frame != INTRA_FRAME)
+ mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
+
+    // If the segment level reference frame feature is enabled, skip any mode
+    // whose reference frame does not match the one allowed for this segment.
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+      continue;
+    } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+      // This drop out is disabled when the ref frame segment level feature
+      // is enabled for this segment, to prevent the possibility that we end
+      // up unable to pick any mode.
+      //
+      // Only consider ZEROMV/ALTREF_FRAME for alt ref frames, unless ARNR
+      // filtering is enabled, in which case we want an unfiltered
+      // alternative. We allow near/nearest as well because they may result
+      // in zero-zero MVs but be cheaper.
+ if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+ continue;
+ }
+
+ mbmi->tx_size = TX_4X4;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = ref_frame;
+ mbmi->ref_frame[1] = second_ref_frame;
+// Evaluate all sub-pel filters irrespective of whether we can use
+// them for this frame.
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i)
+ mbmi->interp_filter[i] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#else
+ mbmi->interp_filter =
+ cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR : cm->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+#if CONFIG_VAR_TX
+ mbmi->inter_tx_size[0][0] = mbmi->tx_size;
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif // CONFIG_VAR_TX
+
+ if (ref_frame == INTRA_FRAME) {
+ int rate;
+ if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, &distortion_y,
+ NULL, best_rd) >= best_rd)
+ continue;
+ rate2 += rate;
+ rate2 += intra_cost_penalty;
+ distortion2 += distortion_y;
+
+ if (rate_uv_intra == INT_MAX) {
+ choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4, &rate_uv_intra,
+ &rate_uv_tokenonly, &dist_uv, &skip_uv, &mode_uv);
+ }
+ rate2 += rate_uv_intra;
+ rate_uv = rate_uv_tokenonly;
+ distortion2 += dist_uv;
+ distortion_uv = dist_uv;
+ mbmi->uv_mode = mode_uv;
+ } else {
+ int rate;
+ int64_t distortion;
+ int64_t this_rd_thresh;
+ int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
+ int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
+      int64_t tmp_best_distortion = INT64_MAX, tmp_best_sse, uv_sse;
+ int tmp_best_skippable = 0;
+ int switchable_filter_index;
+ int_mv *second_ref =
+ comp_pred ? &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL;
+ b_mode_info tmp_best_bmodes[16]; // Should this be 4 ?
+ MB_MODE_INFO tmp_best_mbmode;
+#if CONFIG_DUAL_FILTER
+ BEST_SEG_INFO bsi[DUAL_FILTER_SET_SIZE];
+#else
+ BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
+#endif // CONFIG_DUAL_FILTER
+ int pred_exists = 0;
+ int uv_skippable;
+#if CONFIG_EXT_INTER
+ int_mv compound_seg_newmvs[4][2];
+ for (i = 0; i < 4; i++) {
+ compound_seg_newmvs[i][0].as_int = INVALID_MV;
+ compound_seg_newmvs[i][1].as_int = INVALID_MV;
+ }
+#endif // CONFIG_EXT_INTER
+
+ this_rd_thresh = (ref_frame == LAST_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_LAST]
+ : rd_opt->threshes[segment_id][bsize][THR_ALTR];
+#if CONFIG_EXT_REFS
+ this_rd_thresh = (ref_frame == LAST2_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_LAST2]
+ : this_rd_thresh;
+ this_rd_thresh = (ref_frame == LAST3_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_LAST3]
+ : this_rd_thresh;
+ this_rd_thresh = (ref_frame == BWDREF_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_BWDR]
+ : this_rd_thresh;
+#endif // CONFIG_EXT_REFS
+ this_rd_thresh = (ref_frame == GOLDEN_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_GOLD]
+ : this_rd_thresh;
+
+ // TODO(any): Add search of the tx_type to improve rd performance at the
+ // expense of speed.
+ mbmi->tx_type = DCT_DCT;
+
+ if (cm->interp_filter != BILINEAR) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = EIGHTTAP_REGULAR;
+ tmp_best_filter[1] = EIGHTTAP_REGULAR;
+ tmp_best_filter[2] = EIGHTTAP_REGULAR;
+ tmp_best_filter[3] = EIGHTTAP_REGULAR;
+#else
+ tmp_best_filter = EIGHTTAP_REGULAR;
+#endif // CONFIG_DUAL_FILTER
+ if (x->source_variance < sf->disable_filter_search_var_thresh) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = EIGHTTAP_REGULAR;
+#else
+ tmp_best_filter = EIGHTTAP_REGULAR;
+#endif // CONFIG_DUAL_FILTER
+ } else if (sf->adaptive_pred_interp_filter == 1 &&
+ ctx->pred_interp_filter < SWITCHABLE) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = ctx->pred_interp_filter;
+#else
+ tmp_best_filter = ctx->pred_interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ } else if (sf->adaptive_pred_interp_filter == 2) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = ctx->pred_interp_filter < SWITCHABLE
+ ? ctx->pred_interp_filter
+ : 0;
+#else
+ tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE
+ ? ctx->pred_interp_filter
+ : 0;
+#endif // CONFIG_DUAL_FILTER
+ } else {
+#if CONFIG_DUAL_FILTER
+ const int filter_set_size = DUAL_FILTER_SET_SIZE;
+#else
+ const int filter_set_size = SWITCHABLE_FILTERS;
+#endif // CONFIG_DUAL_FILTER
+ for (switchable_filter_index = 0;
+ switchable_filter_index < filter_set_size;
+ ++switchable_filter_index) {
+ int newbest, rs;
+ int64_t rs_rd;
+ MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = filter_sets[switchable_filter_index][0];
+ mbmi->interp_filter[1] = filter_sets[switchable_filter_index][1];
+ mbmi->interp_filter[2] = filter_sets[switchable_filter_index][0];
+ mbmi->interp_filter[3] = filter_sets[switchable_filter_index][1];
+#else
+ mbmi->interp_filter = switchable_filter_index;
+#endif // CONFIG_DUAL_FILTER
+ tmp_rd = rd_pick_inter_best_sub8x8_mode(
+ cpi, x, &mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd,
+ &rate, &rate_y, &distortion, &skippable, &total_sse,
+ (int)this_rd_thresh, seg_mvs,
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs,
+#endif // CONFIG_EXT_INTER
+ bsi, switchable_filter_index, mi_row, mi_col);
+ if (tmp_rd == INT64_MAX) continue;
+ rs = av1_get_switchable_rate(cpi, xd);
+ rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+ if (cm->interp_filter == SWITCHABLE) tmp_rd += rs_rd;
+
+ newbest = (tmp_rd < tmp_best_rd);
+ if (newbest) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = mbmi->interp_filter[0];
+ tmp_best_filter[1] = mbmi->interp_filter[1];
+ tmp_best_filter[2] = mbmi->interp_filter[2];
+ tmp_best_filter[3] = mbmi->interp_filter[3];
+#else
+ tmp_best_filter = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ tmp_best_rd = tmp_rd;
+ }
+ if ((newbest && cm->interp_filter == SWITCHABLE) ||
+ (
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] == cm->interp_filter
+#else
+ mbmi->interp_filter == cm->interp_filter
+#endif // CONFIG_DUAL_FILTER
+ && cm->interp_filter != SWITCHABLE)) {
+ tmp_best_rdu = tmp_rd;
+ tmp_best_rate = rate;
+ tmp_best_ratey = rate_y;
+ tmp_best_distortion = distortion;
+ tmp_best_sse = total_sse;
+ tmp_best_skippable = skippable;
+ tmp_best_mbmode = *mbmi;
+ for (i = 0; i < 4; i++) {
+ tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
+ }
+ pred_exists = 1;
+ }
+ } // switchable_filter_index loop
+ }
+ }
+
+ if (tmp_best_rdu == INT64_MAX && pred_exists) continue;
+
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[0]
+ : cm->interp_filter);
+ mbmi->interp_filter[1] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[1]
+ : cm->interp_filter);
+ mbmi->interp_filter[2] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[2]
+ : cm->interp_filter);
+ mbmi->interp_filter[3] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[3]
+ : cm->interp_filter);
+#else
+ mbmi->interp_filter =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter
+ : cm->interp_filter);
+#endif // CONFIG_DUAL_FILTER
+
+ if (!pred_exists) {
+ // Handles the special case when a filter that is not in the
+ // switchable list (bilinear) is indicated at the frame level
+ tmp_rd = rd_pick_inter_best_sub8x8_mode(
+ cpi, x, &x->mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd,
+ &rate, &rate_y, &distortion, &skippable, &total_sse,
+ (int)this_rd_thresh, seg_mvs,
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs,
+#endif // CONFIG_EXT_INTER
+ bsi, 0, mi_row, mi_col);
+ if (tmp_rd == INT64_MAX) continue;
+ } else {
+ total_sse = tmp_best_sse;
+ rate = tmp_best_rate;
+ rate_y = tmp_best_ratey;
+ distortion = tmp_best_distortion;
+ skippable = tmp_best_skippable;
+ *mbmi = tmp_best_mbmode;
+ for (i = 0; i < 4; i++) xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
+ }
+ // Add in the cost of the transform type
+ if (!xd->lossless[mbmi->segment_id]) {
+ int rate_tx_type = 0;
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(mbmi->tx_size, bsize, 1, cm->reduced_tx_set_used) >
+ 1) {
+ const int eset =
+ get_ext_tx_set(mbmi->tx_size, bsize, 1, cm->reduced_tx_set_used);
+ rate_tx_type =
+ cpi->inter_tx_type_costs[eset][mbmi->tx_size][mbmi->tx_type];
+ }
+#else
+ if (mbmi->tx_size < TX_32X32) {
+ rate_tx_type = cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+ }
+#endif // CONFIG_EXT_TX
+ rate += rate_tx_type;
+ rate_y += rate_tx_type;
+ }
+
+ rate2 += rate;
+ distortion2 += distortion;
+
+ if (cm->interp_filter == SWITCHABLE)
+ rate2 += av1_get_switchable_rate(cpi, xd);
+
+ if (!mode_excluded)
+ mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
+ : cm->reference_mode == COMPOUND_REFERENCE;
+
+ compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
+
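+    // Remaining rd budget for the UV planes: whatever is left of best_rd
+    // after the cheaper of coding this mode's Y residual or skipping it.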
+ tmp_best_rdu =
+ best_rd - AOMMIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse));
+
+ if (tmp_best_rdu > 0) {
+        // If even the 'Y' rd value of split is higher than the best so far,
+        // don't bother looking at UV.
+ int is_cost_valid_uv;
+ RD_STATS rd_stats_uv;
+ av1_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, NULL,
+ BLOCK_8X8);
+#if CONFIG_VAR_TX
+ is_cost_valid_uv =
+ inter_block_uvrd(cpi, x, &rd_stats_uv, BLOCK_8X8, tmp_best_rdu);
+#else
+ is_cost_valid_uv =
+ super_block_uvrd(cpi, x, &rd_stats_uv, BLOCK_8X8, tmp_best_rdu);
+#endif // CONFIG_VAR_TX
+ rate_uv = rd_stats_uv.rate;
+ distortion_uv = rd_stats_uv.dist;
+ uv_skippable = rd_stats_uv.skip;
+ uv_sse = rd_stats_uv.sse;
+
+ if (!is_cost_valid_uv) continue;
+ rate2 += rate_uv;
+ distortion2 += distortion_uv;
+ skippable = skippable && uv_skippable;
+ total_sse += uv_sse;
+ } else {
+ continue;
+ }
+ }
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost;
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ if (second_ref_frame > INTRA_FRAME) {
+ rate2 += ref_costs_comp[ref_frame];
+#if CONFIG_EXT_REFS
+ rate2 += ref_costs_comp[second_ref_frame];
+#endif // CONFIG_EXT_REFS
+ } else {
+ rate2 += ref_costs_single[ref_frame];
+ }
+
+ if (!disable_skip) {
+      // Skip is never coded at the segment level for sub8x8 blocks; instead
+      // it is always coded in the bitstream at the mode info level.
+
+ if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+ // Add in the cost of the no skip flag.
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ } else {
+ // FIXME(rbultje) make this work for splitmv also
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ distortion2 = total_sse;
+ assert(total_sse >= 0);
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ this_skip2 = 1;
+ }
+ } else {
+ // Add in the cost of the no skip flag.
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+
+ // Calculate the final RD estimate for this mode.
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ }
+
+ if (!disable_skip && ref_frame == INTRA_FRAME) {
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
+ }
+
+    // Did this mode help, i.e. is it the new best mode so far?
+ if (this_rd < best_rd || x->skip) {
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ best_ref_index = ref_index;
+
+ if (ref_frame == INTRA_FRAME) {
+ /* required for left and above block mv */
+ mbmi->mv[0].as_int = 0;
+ }
+
+ rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = rate2 - rate_y - rate_uv;
+ if (!disable_skip)
+ *returnrate_nocoef -=
+ av1_cost_bit(av1_get_skip_prob(cm, xd), this_skip2);
+ *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
+ mbmi->ref_frame[0] != INTRA_FRAME);
+ assert(*returnrate_nocoef > 0);
+#endif // CONFIG_SUPERTX
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ best_rd = this_rd;
+ best_yrd =
+ best_rd - RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
+ best_mbmode = *mbmi;
+ best_skip2 = this_skip2;
+
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memset(ctx->blk_skip[i], 0, sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif // CONFIG_VAR_TX
+
+ for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i];
+ }
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && ref_frame != INTRA_FRAME) {
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+ if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE])
+ best_pred_rd[SINGLE_REFERENCE] = single_rd;
+ else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE])
+ best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+
+ if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+ best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+ }
+
+ if (x->skip && !comp_pred) break;
+ }
+
+ if (best_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+ return;
+ }
+
+ if (best_rd == INT64_MAX) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->dist = INT64_MAX;
+ rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+ return;
+ }
+
+#if CONFIG_DUAL_FILTER
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[0]) ||
+ !is_inter_block(&best_mbmode));
+#else
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter) ||
+ !is_inter_block(&best_mbmode));
+#endif // CONFIG_DUAL_FILTER
+
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize, best_ref_index);
+
+ // macroblock modes
+ *mbmi = best_mbmode;
+#if CONFIG_VAR_TX
+ mbmi->inter_tx_size[0][0] = mbmi->tx_size;
+#endif // CONFIG_VAR_TX
+
+ x->skip |= best_skip2;
+ if (!is_inter_block(&best_mbmode)) {
+ for (i = 0; i < 4; i++) xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
+ } else {
+ for (i = 0; i < 4; ++i)
+ memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
+
+#if CONFIG_REF_MV
+ mbmi->pred_mv[0].as_int = xd->mi[0]->bmi[3].pred_mv[0].as_int;
+ mbmi->pred_mv[1].as_int = xd->mi[0]->bmi[3].pred_mv[1].as_int;
+#endif // CONFIG_REF_MV
+ mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
+ }
+
+// Note: this section is needed since the mode may have been forced to ZEROMV
+#if CONFIG_GLOBAL_MOTION
+ if (mbmi->mode == ZEROMV
+#if CONFIG_EXT_INTER
+ || mbmi->mode == ZERO_ZEROMV
+#endif // CONFIG_EXT_INTER
+ ) {
+ if (is_nontrans_global_motion(xd)) {
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+ mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#else
+ mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ }
+ }
+#endif // CONFIG_GLOBAL_MOTION
+
+ for (i = 0; i < REFERENCE_MODES; ++i) {
+ if (best_pred_rd[i] == INT64_MAX)
+ best_pred_diff[i] = INT_MIN;
+ else
+ best_pred_diff[i] = best_rd - best_pred_rd[i];
+ }
+
+ store_coding_context(x, ctx, best_ref_index, best_pred_diff, 0);
+}
+
+#if CONFIG_MOTION_VAR
+// This function has a structure similar to av1_build_obmc_inter_prediction
+//
+// The OBMC predictor is computed as:
+//
+// PObmc(x,y) =
+// AOM_BLEND_A64(Mh(x),
+// AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+// PLeft(x, y))
+//
+// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+//   AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * PObmc(x,y) =
+// Mh(x) * Mv(y) * P(x,y) +
+// Mh(x) * Cv(y) * Pabove(x,y) +
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where :
+//
+// Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
+//  Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+// wsrc(x, y) =
+// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
+//    Mh(x) * Cv(y) * Pabove(x,y) -
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+// error(x, y) =
+//    (wsrc(x, y) - mask(x, y) * P(x, y)) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
+//
+static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
+ const MACROBLOCKD *xd, int mi_row,
+ int mi_col, const uint8_t *above,
+ int above_stride, const uint8_t *left,
+ int left_stride) {
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ int row, col, i;
+ const int bw = xd->n8_w << MI_SIZE_LOG2;
+ const int bh = xd->n8_h << MI_SIZE_LOG2;
+ int32_t *mask_buf = x->mask_buf;
+ int32_t *wsrc_buf = x->wsrc_buf;
+ const int wsrc_stride = bw;
+ const int mask_stride = bw;
+ const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
+#if CONFIG_HIGHBITDEPTH
+ const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#else
+ const int is_hbd = 0;
+#endif // CONFIG_HIGHBITDEPTH
+
+ // plane 0 should not be subsampled
+ assert(xd->plane[0].subsampling_x == 0);
+ assert(xd->plane[0].subsampling_y == 0);
+
+ av1_zero_array(wsrc_buf, bw * bh);
+ for (i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+ // handle above row
+ if (xd->up_available) {
+ const int overlap = num_4x4_blocks_high_lookup[bsize] * 2;
+ const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+ const int mi_row_offset = -1;
+ const uint8_t *const mask1d = av1_get_obmc_mask(overlap);
+ const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
+ int neighbor_count = 0;
+
+ assert(miw > 0);
+
+ i = 0;
+ do { // for each mi in the above row
+ const int mi_col_offset = i;
+ const MB_MODE_INFO *const above_mbmi =
+ &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+ const BLOCK_SIZE a_bsize = above_mbmi->sb_type;
+ const int mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]);
+ const int neighbor_bw = mi_step * MI_SIZE;
+
+ if (is_neighbor_overlappable(above_mbmi)) {
+ if (!CONFIG_CB4X4 && (a_bsize == BLOCK_4X4 || a_bsize == BLOCK_4X8))
+ neighbor_count += 2;
+ else
+ neighbor_count++;
+ if (neighbor_count > neighbor_limit) break;
+
+ const int tmp_stride = above_stride;
+ int32_t *wsrc = wsrc_buf + (i * MI_SIZE);
+ int32_t *mask = mask_buf + (i * MI_SIZE);
+
+ if (!is_hbd) {
+ const uint8_t *tmp = above;
+
+ for (row = 0; row < overlap; ++row) {
+ const uint8_t m0 = mask1d[row];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ for (col = 0; col < neighbor_bw; ++col) {
+ wsrc[col] = m1 * tmp[col];
+ mask[col] = m0;
+ }
+ wsrc += wsrc_stride;
+ mask += mask_stride;
+ tmp += tmp_stride;
+ }
+#if CONFIG_HIGHBITDEPTH
+ } else {
+ const uint16_t *tmp = CONVERT_TO_SHORTPTR(above);
+
+ for (row = 0; row < overlap; ++row) {
+ const uint8_t m0 = mask1d[row];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ for (col = 0; col < neighbor_bw; ++col) {
+ wsrc[col] = m1 * tmp[col];
+ mask[col] = m0;
+ }
+ wsrc += wsrc_stride;
+ mask += mask_stride;
+ tmp += tmp_stride;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ }
+
+ above += neighbor_bw;
+ i += mi_step;
+ } while (i < miw);
+ }
+
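+  // The above-row pass left wsrc/mask in units of MAX_ALPHA; bring both up
+  // to the MAX_ALPHA^2 scale the final formulas expect before folding in
+  // the left column.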
+ for (i = 0; i < bw * bh; ++i) {
+ wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+ mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+ }
+
+ // handle left column
+ if (xd->left_available) {
+ const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2;
+ const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+ const int mi_col_offset = -1;
+ const uint8_t *const mask1d = av1_get_obmc_mask(overlap);
+ const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
+ int neighbor_count = 0;
+
+ assert(mih > 0);
+
+ i = 0;
+ do { // for each mi in the left column
+ const int mi_row_offset = i;
+ const MB_MODE_INFO *const left_mbmi =
+ &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+ const BLOCK_SIZE l_bsize = left_mbmi->sb_type;
+ const int mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]);
+ const int neighbor_bh = mi_step * MI_SIZE;
+
+ if (is_neighbor_overlappable(left_mbmi)) {
+ if (!CONFIG_CB4X4 && (l_bsize == BLOCK_4X4 || l_bsize == BLOCK_8X4))
+ neighbor_count += 2;
+ else
+ neighbor_count++;
+ if (neighbor_count > neighbor_limit) break;
+
+ const int tmp_stride = left_stride;
+ int32_t *wsrc = wsrc_buf + (i * MI_SIZE * wsrc_stride);
+ int32_t *mask = mask_buf + (i * MI_SIZE * mask_stride);
+
+ if (!is_hbd) {
+ const uint8_t *tmp = left;
+
+ for (row = 0; row < neighbor_bh; ++row) {
+ for (col = 0; col < overlap; ++col) {
+ const uint8_t m0 = mask1d[col];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+ (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+ mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+ }
+ wsrc += wsrc_stride;
+ mask += mask_stride;
+ tmp += tmp_stride;
+ }
+#if CONFIG_HIGHBITDEPTH
+ } else {
+ const uint16_t *tmp = CONVERT_TO_SHORTPTR(left);
+
+ for (row = 0; row < neighbor_bh; ++row) {
+ for (col = 0; col < overlap; ++col) {
+ const uint8_t m0 = mask1d[col];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+ (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+ mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+ }
+ wsrc += wsrc_stride;
+ mask += mask_stride;
+ tmp += tmp_stride;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ }
+
+ left += neighbor_bh * left_stride;
+ i += mi_step;
+ } while (i < mih);
+ }
+
+ if (!is_hbd) {
+ const uint8_t *src = x->plane[0].src.buf;
+
+ for (row = 0; row < bh; ++row) {
+ for (col = 0; col < bw; ++col) {
+ wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+ }
+ wsrc_buf += wsrc_stride;
+ src += x->plane[0].src.stride;
+ }
+#if CONFIG_HIGHBITDEPTH
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+ for (row = 0; row < bh; ++row) {
+ for (col = 0; col < bw; ++col) {
+ wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+ }
+ wsrc_buf += wsrc_stride;
+ src += x->plane[0].src.stride;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+}
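+
+// A minimal single-pixel sketch of the identity described above
+// calc_target_weighted_pred (illustrative only, not part of the library;
+// the helper name is hypothetical). Ignoring intermediate rounding, it
+// returns AOM_BLEND_A64_MAX_ALPHA^2 * (src - PObmc(x,y)), i.e. the scaled
+// error, computed via the wsrc/mask form.
+static int32_t obmc_error_one_px_sketch(int32_t src, int32_t p,
+                                        int32_t p_above, int32_t p_left,
+                                        int32_t mh, int32_t mv) {
+  const int32_t max_a = AOM_BLEND_A64_MAX_ALPHA;
+  const int32_t cv = max_a - mv;  // Cv(y)
+  const int32_t ch = max_a - mh;  // Ch(x)
+  // Fold the neighbouring predictors into the source term ...
+  const int32_t wsrc =
+      max_a * max_a * src - mh * cv * p_above - max_a * ch * p_left;
+  // ... so the error depends only on the candidate predictor P.
+  const int32_t mask = mh * mv;
+  return wsrc - mask * p;
+}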
+
+#if CONFIG_NCOBMC
+void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ MB_MODE_INFO backup_mbmi;
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ int ref, skip_blk, backup_skip = x->skip;
+ int64_t rd_causal;
+ RD_STATS rd_stats_y, rd_stats_uv;
+ int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+
+ // Recompute the best causal predictor and rd
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+ assert(cfg != NULL);
+ av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+ &xd->block_refs[ref]->sf);
+ }
+ av1_setup_dst_planes(x->e_mbd.plane, bsize,
+ get_frame_new_buffer(&cpi->common), mi_row, mi_col);
+
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize);
+
+ av1_subtract_plane(x, bsize, 0);
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
+ if (rd_stats_y.skip && rd_stats_uv.skip) {
+ rd_stats_y.rate = rate_skip1;
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ skip_blk = 0;
+ } else if (RDCOST(x->rdmult, x->rddiv,
+ (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
+ (rd_stats_y.dist + rd_stats_uv.dist)) >
+ RDCOST(x->rdmult, x->rddiv, rate_skip1,
+ (rd_stats_y.sse + rd_stats_uv.sse))) {
+ rd_stats_y.rate = rate_skip1;
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ skip_blk = 1;
+ } else {
+ rd_stats_y.rate += rate_skip0;
+ skip_blk = 0;
+ }
+ backup_skip = skip_blk;
+ backup_mbmi = *mbmi;
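+  // rd of the best causal (simple-translation) predictor, including the
+  // cost of signalling that OBMC is not used.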
+ rd_causal = RDCOST(x->rdmult, x->rddiv, (rd_stats_y.rate + rd_stats_uv.rate),
+ (rd_stats_y.dist + rd_stats_uv.dist));
+ rd_causal += RDCOST(x->rdmult, x->rddiv,
+ av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0);
+
+ // Check non-causal mode
+ mbmi->motion_mode = OBMC_CAUSAL;
+ av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+ av1_subtract_plane(x, bsize, 0);
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
+ if (rd_stats_y.skip && rd_stats_uv.skip) {
+ rd_stats_y.rate = rate_skip1;
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ skip_blk = 0;
+ } else if (RDCOST(x->rdmult, x->rddiv,
+ (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
+ (rd_stats_y.dist + rd_stats_uv.dist)) >
+ RDCOST(x->rdmult, x->rddiv, rate_skip1,
+ (rd_stats_y.sse + rd_stats_uv.sse))) {
+ rd_stats_y.rate = rate_skip1;
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ skip_blk = 1;
+ } else {
+ rd_stats_y.rate += rate_skip0;
+ skip_blk = 0;
+ }
+
+ if (rd_causal >
+ RDCOST(x->rdmult, x->rddiv,
+ rd_stats_y.rate + rd_stats_uv.rate +
+ av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1),
+ (rd_stats_y.dist + rd_stats_uv.dist))) {
+ x->skip = skip_blk;
+ } else {
+ *mbmi = backup_mbmi;
+ x->skip = backup_skip;
+ }
+}
+#endif // CONFIG_NCOBMC
+#endif // CONFIG_MOTION_VAR
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
new file mode 100644
index 000000000..a7053b289
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RDOPT_H_
+#define AV1_ENCODER_RDOPT_H_
+
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct TileInfo;
+struct AV1_COMP;
+struct macroblock;
+struct RD_STATS;
+
+#if CONFIG_RD_DEBUG
+static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
+ TX_SIZE tx_size, int blk_row,
+ int blk_col, int txb_coeff_cost) {
+ (void)blk_row;
+ (void)blk_col;
+ (void)tx_size;
+ rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
+
+#if CONFIG_VAR_TX
+  {
+    const int txb_h = tx_size_high_unit[tx_size];
+    const int txb_w = tx_size_wide_unit[tx_size];
+    int idx, idy;
+    // Check the indices before they are used to address the cost map.
+    assert(blk_row < TXB_COEFF_COST_MAP_SIZE);
+    assert(blk_col < TXB_COEFF_COST_MAP_SIZE);
+    for (idy = 0; idy < txb_h; ++idy)
+      for (idx = 0; idx < txb_w; ++idx)
+        rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0;
+
+    rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost;
+  }
+#endif
+}
+#endif
+
+typedef enum OUTPUT_STATUS {
+ OUTPUT_HAS_PREDICTED_PIXELS,
+ OUTPUT_HAS_DECODED_PIXELS
+} OUTPUT_STATUS;
+
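+// Indicates to av1_dist_block whether the destination buffer holds only
+// predicted pixels or fully reconstructed (decoded) pixels when the
+// distortion is measured.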
+void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
+ TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,
+ OUTPUT_STATUS output_status);
+
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
+ const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
+ int use_fast_coef_costing);
+#endif
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
+unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs);
+#if CONFIG_HIGHBITDEPTH
+unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd);
+#endif
+
+void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col,
+ struct RD_STATS *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+
+void av1_rd_pick_inter_mode_sb_seg_skip(
+ const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+int av1_internal_image_edge(const struct AV1_COMP *cpi);
+int av1_active_h_edge(const struct AV1_COMP *cpi, int mi_row, int mi_step);
+int av1_active_v_edge(const struct AV1_COMP *cpi, int mi_col, int mi_step);
+int av1_active_edge_sb(const struct AV1_COMP *cpi, int mi_row, int mi_col);
+
+void av1_rd_pick_inter_mode_sub8x8(const struct AV1_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col,
+ struct RD_STATS *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x,
+ int mi_row, int mi_col);
+#endif // CONFIG_MOTION_VAR && CONFIG_NCOBMC
+
+#if CONFIG_SUPERTX
+#if CONFIG_VAR_TX
+void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+ int blk_row, int blk_col, int plane, int block,
+ int plane_bsize, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l, RD_STATS *rd_stats);
+#endif
+
+void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
+ int64_t *distortion, int *skippable,
+ int64_t *sse, int64_t ref_best_rd, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int use_fast_coef_casting);
+#endif // CONFIG_SUPERTX
+
+int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+                     BLOCK_SIZE bsize, int plane, TX_SIZE tx_size,
+                     TX_TYPE tx_type);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif // AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c
new file mode 100644
index 000000000..b581a61d0
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/pred_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/subexp.h"
+
+void av1_enable_segmentation(struct segmentation *seg) {
+ seg->enabled = 1;
+ seg->update_map = 1;
+ seg->update_data = 1;
+}
+
+void av1_disable_segmentation(struct segmentation *seg) {
+ seg->enabled = 0;
+ seg->update_map = 0;
+ seg->update_data = 0;
+}
+
+void av1_set_segment_data(struct segmentation *seg, signed char *feature_data,
+ unsigned char abs_delta) {
+ seg->abs_delta = abs_delta;
+
+ memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
+}
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_mask[segment_id] &= ~(1 << feature_id);
+}
+
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_data[segment_id][feature_id] = 0;
+}
+
+// Based on a set of segment counts, calculate a probability tree.
+static void calc_segtree_probs(unsigned *segcounts,
+ aom_prob *segment_tree_probs,
+ const aom_prob *cur_tree_probs,
+ const int probwt) {
+ // Work out probabilities of each segment
+ const unsigned cc[4] = { segcounts[0] + segcounts[1],
+ segcounts[2] + segcounts[3],
+ segcounts[4] + segcounts[5],
+ segcounts[6] + segcounts[7] };
+ const unsigned ccc[2] = { cc[0] + cc[1], cc[2] + cc[3] };
+ int i;
+
+ segment_tree_probs[0] = get_binary_prob(ccc[0], ccc[1]);
+ segment_tree_probs[1] = get_binary_prob(cc[0], cc[1]);
+ segment_tree_probs[2] = get_binary_prob(cc[2], cc[3]);
+ segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]);
+ segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]);
+ segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
+ segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
+
+ for (i = 0; i < 7; i++) {
+ const unsigned *ct =
+ i == 0 ? ccc : i < 3 ? cc + (i & 2) : segcounts + (i - 3) * 2;
+ av1_prob_diff_update_savings_search(ct, cur_tree_probs[i],
+ &segment_tree_probs[i],
+ DIFF_UPDATE_PROB, probwt);
+ }
+}
+
+// Based on a set of segment counts and probabilities, calculate a cost
+// estimate.
+static int cost_segmap(unsigned *segcounts, aom_prob *probs) {
+ const int c01 = segcounts[0] + segcounts[1];
+ const int c23 = segcounts[2] + segcounts[3];
+ const int c45 = segcounts[4] + segcounts[5];
+ const int c67 = segcounts[6] + segcounts[7];
+ const int c0123 = c01 + c23;
+ const int c4567 = c45 + c67;
+
+ // Cost the top node of the tree
+ int cost = c0123 * av1_cost_zero(probs[0]) + c4567 * av1_cost_one(probs[0]);
+
+ // Cost subsequent levels
+ if (c0123 > 0) {
+ cost += c01 * av1_cost_zero(probs[1]) + c23 * av1_cost_one(probs[1]);
+
+ if (c01 > 0)
+ cost += segcounts[0] * av1_cost_zero(probs[3]) +
+ segcounts[1] * av1_cost_one(probs[3]);
+ if (c23 > 0)
+ cost += segcounts[2] * av1_cost_zero(probs[4]) +
+ segcounts[3] * av1_cost_one(probs[4]);
+ }
+
+ if (c4567 > 0) {
+ cost += c45 * av1_cost_zero(probs[2]) + c67 * av1_cost_one(probs[2]);
+
+ if (c45 > 0)
+ cost += segcounts[4] * av1_cost_zero(probs[5]) +
+ segcounts[5] * av1_cost_one(probs[5]);
+ if (c67 > 0)
+ cost += segcounts[6] * av1_cost_zero(probs[6]) +
+ segcounts[7] * av1_cost_one(probs[6]);
+ }
+
+ return cost;
+}
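+
+// Hedged sketch of the tree layout shared by calc_segtree_probs and
+// cost_segmap (illustrative only; the helper is hypothetical). Segment ids
+// 0..7 live in a fixed 3-level binary tree: probs[0] splits {0..3}|{4..7},
+// probs[1]/probs[2] split the quads, and probs[3..6] split the pairs. The
+// bit coded at tree level 'level' is therefore just a binary digit of the
+// id, most significant first.
+static int seg_tree_bit_sketch(int segment_id, int level /* 0, 1 or 2 */) {
+  return (segment_id >> (2 - level)) & 1;
+}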
+
+static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *tile, MODE_INFO **mi,
+ unsigned *no_pred_segcounts,
+ unsigned (*temporal_predictor_count)[2],
+ unsigned *t_unpred_seg_counts, int bw, int bh,
+ int mi_row, int mi_col) {
+ int segment_id;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ xd->mi = mi;
+ segment_id = xd->mi[0]->mbmi.segment_id;
+
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+ // Count the number of hits on each segment with no prediction
+ no_pred_segcounts[segment_id]++;
+
+ // Temporal prediction not allowed on key frames
+ if (cm->frame_type != KEY_FRAME) {
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ // Test to see if the segment id matches the predicted value.
+ const int pred_segment_id =
+ get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col);
+ const int pred_flag = pred_segment_id == segment_id;
+ const int pred_context = av1_get_pred_context_seg_id(xd);
+
+ // Store the prediction status for this mb and update counts
+ // as appropriate
+ xd->mi[0]->mbmi.seg_id_predicted = pred_flag;
+ temporal_predictor_count[pred_context][pred_flag]++;
+
+ // Update the "unpredicted" segment count
+ if (!pred_flag) t_unpred_seg_counts[segment_id]++;
+ }
+}
+
+static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *tile, MODE_INFO **mi,
+ unsigned *no_pred_segcounts,
+ unsigned (*temporal_predictor_count)[2],
+ unsigned *t_unpred_seg_counts, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const int mis = cm->mi_stride;
+ const int bs = mi_size_wide[bsize], hbs = bs / 2;
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition;
+#else
+ int bw, bh;
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+#if CONFIG_EXT_PARTITION_TYPES
+ if (bsize == BLOCK_8X8)
+ partition = PARTITION_NONE;
+ else
+ partition = get_partition(cm, mi_row, mi_col, bsize);
+ switch (partition) {
+ case PARTITION_NONE:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+ mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
+ mi_col + hbs);
+ break;
+ case PARTITION_HORZ_A:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row, mi_col + hbs);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+ mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col);
+ count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
+ mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row, mi_col + hbs);
+ count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT: {
+ const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+ int n;
+
+ assert(num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type] < bs &&
+ num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type] < bs);
+
+ for (n = 0; n < 4; n++) {
+ const int mi_dc = hbs * (n & 1);
+ const int mi_dr = hbs * (n >> 1);
+
+ count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts,
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
+ }
+ } break;
+ default: assert(0);
+ }
+#else
+ bw = mi_size_wide[mi[0]->mbmi.sb_type];
+ bh = mi_size_high[mi[0]->mbmi.sb_type];
+
+ if (bw == bs && bh == bs) {
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+ } else if (bw == bs && bh < bs) {
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+ mi_row + hbs, mi_col);
+ } else if (bw < bs && bh == bs) {
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
+ mi_col + hbs);
+ } else {
+ const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+ int n;
+
+ assert(bw < bs && bh < bs);
+
+ for (n = 0; n < 4; n++) {
+ const int mi_dc = hbs * (n & 1);
+ const int mi_dr = hbs * (n >> 1);
+
+ count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts,
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
+ }
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
+}
+
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) {
+ struct segmentation *seg = &cm->seg;
+ struct segmentation_probs *segp = &cm->fc->seg;
+
+ int no_pred_cost;
+ int t_pred_cost = INT_MAX;
+
+ int i, tile_col, tile_row, mi_row, mi_col;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+
+ unsigned(*temporal_predictor_count)[2] = cm->counts.seg.pred;
+ unsigned *no_pred_segcounts = cm->counts.seg.tree_total;
+ unsigned *t_unpred_seg_counts = cm->counts.seg.tree_mispred;
+
+ aom_prob no_pred_tree[SEG_TREE_PROBS];
+ aom_prob t_pred_tree[SEG_TREE_PROBS];
+ aom_prob t_nopred_prob[PREDICTION_PROBS];
+
+ (void)xd;
+
+ // We are about to recompute all the segment counts, so zero the accumulators.
+ av1_zero(cm->counts.seg);
+
+ // First of all generate stats regarding how well the last segment map
+ // predicts this one
+ for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+ TileInfo tile_info;
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+ MODE_INFO **mi_ptr;
+ av1_tile_set_col(&tile_info, cm, tile_col);
+#if CONFIG_TILE_GROUPS && CONFIG_DEPENDENT_HORZTILES
+ av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col);
+#endif
+ mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride +
+ tile_info.mi_col_start;
+ for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+ mi_row += cm->mib_size, mi_ptr += cm->mib_size * cm->mi_stride) {
+ MODE_INFO **mi = mi_ptr;
+ for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->mib_size, mi += cm->mib_size) {
+ count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, mi_row,
+ mi_col, cm->sb_size);
+ }
+ }
+ }
+ }
+
+ // Work out probability tree for coding segments without prediction
+ // and the cost.
+ calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs, probwt);
+ no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree);
+
+  // Key frames and error-resilient frames cannot use temporal prediction.
+ if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
+ // Work out probability tree for coding those segments not
+ // predicted using the temporal method and the cost.
+ calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs,
+ probwt);
+ t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree);
+
+ // Add in the cost of the signaling for each prediction context.
+ for (i = 0; i < PREDICTION_PROBS; i++) {
+ const int count0 = temporal_predictor_count[i][0];
+ const int count1 = temporal_predictor_count[i][1];
+
+ t_nopred_prob[i] = get_binary_prob(count0, count1);
+ av1_prob_diff_update_savings_search(
+ temporal_predictor_count[i], segp->pred_probs[i], &t_nopred_prob[i],
+ DIFF_UPDATE_PROB, probwt);
+
+ // Add in the predictor signaling cost
+ t_pred_cost += count0 * av1_cost_zero(t_nopred_prob[i]) +
+ count1 * av1_cost_one(t_nopred_prob[i]);
+ }
+ }
+
+ // Now choose which coding method to use.
+ if (t_pred_cost < no_pred_cost) {
+ assert(!cm->error_resilient_mode);
+ seg->temporal_update = 1;
+ } else {
+ seg->temporal_update = 0;
+ }
+}
+
+void av1_reset_segment_features(AV1_COMMON *cm) {
+ struct segmentation *seg = &cm->seg;
+
+ // Set up default state for MB feature flags
+ seg->enabled = 0;
+ seg->update_map = 0;
+ seg->update_data = 0;
+ av1_clearall_segfeatures(seg);
+}
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
new file mode 100644
index 000000000..c1491ca2a
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_SEGMENTATION_H_
+#define AV1_ENCODER_SEGMENTATION_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_enable_segmentation(struct segmentation *seg);
+void av1_disable_segmentation(struct segmentation *seg);
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+// The values given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for absolute values: 0-127 for MB_LVL_ALT_Q, 0-63 for
+// SEGMENT_ALT_LF. Valid range for delta values: +/-127 for MB_LVL_ALT_Q,
+// +/-63 for SEGMENT_ALT_LF.
+//
+// abs_delta = SEGMENT_DELTADATA means the values are deltas;
+// abs_delta = SEGMENT_ABSDATA means the absolute values given are used.
+void av1_set_segment_data(struct segmentation *seg, signed char *feature_data,
+ unsigned char abs_delta);
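+
+// Illustrative sketch only (not part of this API): assuming feature_data is
+// laid out as [MAX_SEGMENTS][SEG_LVL_MAX], a caller using delta coding might
+// do:
+//
+//   signed char data[MAX_SEGMENTS][SEG_LVL_MAX] = { { 0 } };
+//   data[1][SEG_LVL_ALT_Q] = -16;  // segment 1: quantizer delta of -16
+//   av1_set_segment_data(seg, &data[0][0], SEGMENT_DELTADATA);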
+
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd);
+
+void av1_reset_segment_features(AV1_COMMON *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_SEGMENTATION_H_
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
new file mode 100644
index 000000000..20c96761b
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -0,0 +1,506 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/rdopt.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method
+static MESH_PATTERN
+ good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ };
+static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = {
+ 50, 25, 15, 5, 1, 1
+};
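+// Descriptive note (a sketch of the semantics, not from the upstream
+// comments): each MESH_PATTERN pair is { range, interval }. A mesh stage
+// scans the points within +/- range of the current best MV, sampled every
+// `interval` pels, before the next, finer stage runs. At speed 0 the table
+// above therefore scans +/-64 at step 8, +/-28 at step 4, then +/-15 and
+// +/-7 exhaustively (step 1).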
+
+#if CONFIG_INTRABC
+// TODO(aconverse@google.com): These settings are pretty relaxed, tune them for
+// each speed setting
+static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+};
+static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100,
+ 25, 25, 10 };
+#endif
+
+// Intra-only frames, golden frames (except alt ref overlays) and
+// alt ref frames tend to be coded at a higher-than-ambient quality.
+static int frame_is_boosted(const AV1_COMP *cpi) {
+ return frame_is_kf_gf_arf(cpi);
+}
+
+// Sets a partition size down to which the auto partition code will always
+// search (can go lower), based on the image dimensions. The logic here
+// is that the extent to which ringing artefacts are offensive depends
+// partly on the screen area over which they propagate. Propagation is
+// limited by transform block size, but the screen area taken up by a given
+// block size will be larger for a small image format stretched to full screen.
+static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) {
+ unsigned int screen_area = (cm->width * cm->height);
+
+ // Select block size based on image format size.
+ if (screen_area < 1280 * 720) {
+ // Formats smaller in area than 720P
+ return BLOCK_4X4;
+ } else if (screen_area < 1920 * 1080) {
+ // Format >= 720P and < 1080P
+ return BLOCK_8X8;
+ } else {
+ // Formats 1080P and up
+ return BLOCK_16X16;
+ }
+}
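+// Illustrative examples of the thresholds above: 640x480 (307200 px) is
+// below the 720P area and yields BLOCK_4X4; 1280x720 yields BLOCK_8X8;
+// 1920x1080 and larger yield BLOCK_16X16.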
+
+static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ if (speed >= 1) {
+ if (AOMMIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask =
+ cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ } else {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 21);
+ }
+ }
+
+ if (speed >= 2) {
+ if (AOMMIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask =
+ cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ sf->partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+ sf->partition_search_breakout_dist_thr = (1 << 22);
+ sf->partition_search_breakout_rate_thr = 100;
+ }
+ sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
+ }
+
+ if (speed >= 3) {
+ if (AOMMIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
+ sf->partition_search_breakout_dist_thr = (1 << 25);
+ sf->partition_search_breakout_rate_thr = 200;
+ } else {
+ sf->max_intra_bsize = BLOCK_32X32;
+ sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+ sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ sf->partition_search_breakout_rate_thr = 120;
+ }
+ }
+
+ // If this is a two pass clip that fits the criteria for animated or
+ // graphics content, then reset disable_split_mask for speeds 1-4.
+ // Do the same if the image edge is internal to the coded area.
+ if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
+ ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+ (av1_internal_image_edge(cpi)))) {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ }
+
+ if (speed >= 4) {
+ if (AOMMIN(cm->width, cm->height) >= 720) {
+ sf->partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ }
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ }
+}
+
+static void set_good_speed_feature(AV1_COMP *cpi, AV1_COMMON *cm,
+ SPEED_FEATURES *sf, int speed) {
+ const int boosted = frame_is_boosted(cpi);
+
+ if (speed >= 1) {
+ sf->tx_type_search.fast_intra_tx_type_search = 1;
+ sf->tx_type_search.fast_inter_tx_type_search = 1;
+ }
+
+ if (speed >= 2) {
+ if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+ av1_internal_image_edge(cpi)) {
+ sf->use_square_partition_only = !frame_is_boosted(cpi);
+ } else {
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
+ }
+
+ sf->less_rectangular_check = 1;
+
+ sf->use_rd_breakout = 1;
+ sf->adaptive_motion_search = 1;
+ sf->mv.auto_mv_step_size = 1;
+ sf->adaptive_rd_thresh = 1;
+ sf->mv.subpel_iters_per_step = 1;
+ sf->mode_skip_start = 10;
+ sf->adaptive_pred_interp_filter = 1;
+
+ sf->recode_loop = ALLOW_RECODE_KFARFGF;
+#if CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V;
+#endif // CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+
+ sf->tx_size_search_breakout = 1;
+ sf->partition_search_breakout_rate_thr = 80;
+ sf->tx_type_search.prune_mode = PRUNE_ONE;
+ // Use transform domain distortion.
+ // Note var-tx expt always uses pixel domain distortion.
+ sf->use_transform_domain_distortion = 1;
+#if CONFIG_EXT_INTER
+ sf->disable_wedge_search_var_thresh = 100;
+ sf->fast_wedge_sign_estimate = 1;
+#endif // CONFIG_EXT_INTER
+ }
+
+ if (speed >= 3) {
+ sf->tx_size_search_method =
+ frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL;
+ sf->mode_search_skip_flags =
+ (cm->frame_type == KEY_FRAME)
+ ? 0
+ : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR;
+ sf->disable_filter_search_var_thresh = 100;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+ sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+ sf->allow_partition_search_skip = 1;
+ sf->use_upsampled_references = 0;
+ sf->adaptive_rd_thresh = 2;
+#if CONFIG_EXT_TX
+ sf->tx_type_search.prune_mode = PRUNE_TWO;
+#endif
+ }
+
+ if (speed >= 4) {
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
+ sf->tx_size_search_method =
+ frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 1;
+ sf->cb_partition_search = !boosted;
+ sf->cb_pred_filter_search = 1;
+ sf->alt_ref_search_fp = 1;
+ sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+ sf->adaptive_rd_thresh = 3;
+ sf->mode_skip_start = 6;
+#if CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
+ sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC;
+#endif // CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
+ sf->adaptive_interp_filter_search = 1;
+ }
+
+ if (speed >= 5) {
+ sf->use_square_partition_only = 1;
+ sf->tx_size_search_method = USE_LARGESTALL;
+ sf->mv.search_method = BIGDIA;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ sf->adaptive_rd_thresh = 4;
+ if (cm->frame_type != KEY_FRAME)
+ sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;
+ sf->disable_filter_search_var_thresh = 200;
+ sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
+ sf->use_fast_coef_costing = 1;
+ sf->partition_search_breakout_rate_thr = 300;
+ }
+
+ if (speed >= 6) {
+ int i;
+ sf->optimize_coefficients = 0;
+ sf->mv.search_method = HEX;
+ sf->disable_filter_search_var_thresh = 500;
+ for (i = 0; i < TX_SIZES; ++i) {
+ sf->intra_y_mode_mask[i] = INTRA_DC;
+ sf->intra_uv_mode_mask[i] = INTRA_DC;
+ }
+ sf->partition_search_breakout_rate_thr = 500;
+ sf->mv.reduce_first_step_size = 1;
+ sf->simple_model_rd_from_var = 1;
+ }
+ if (speed >= 7) {
+ const int is_keyframe = cm->frame_type == KEY_FRAME;
+ const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
+ sf->default_max_partition_size = BLOCK_32X32;
+ sf->default_min_partition_size = BLOCK_8X8;
+#if CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
+#endif // CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->frame_parameter_update = 0;
+ sf->mv.search_method = FAST_HEX;
+ sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
+ sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
+#if CONFIG_EXT_PARTITION
+ sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST;
+#endif // CONFIG_EXT_PARTITION
+ sf->partition_search_type = REFERENCE_PARTITION;
+ sf->default_min_partition_size = BLOCK_8X8;
+ sf->reuse_inter_pred_sby = 1;
+ sf->force_frame_boost =
+ is_keyframe ||
+ (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
+ sf->max_delta_qindex = is_keyframe ? 20 : 15;
+ sf->coeff_prob_appx_step = 4;
+ sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ }
+}
+
+void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+ RD_OPT *const rd = &cpi->rd;
+ int i;
+
+// Limit memory usage for high resolutions
+#if CONFIG_EXT_REFS
+ // TODO(zoeliu): Temporary solution to resolve the insufficient RAM issue for
+ // ext-refs. Need to work with @yunqingwang to have a more
+ // effective solution.
+ if (AOMMIN(cm->width, cm->height) > 720) {
+ // Turn off the use of upsampled references for HD resolution
+ sf->use_upsampled_references = 0;
+ } else if ((AOMMIN(cm->width, cm->height) > 540) &&
+ (oxcf->profile != PROFILE_0)) {
+ sf->use_upsampled_references = 0;
+ }
+#else
+ if (AOMMIN(cm->width, cm->height) > 1080) {
+ sf->use_upsampled_references = 0;
+ } else if ((AOMMIN(cm->width, cm->height) > 720) &&
+ (oxcf->profile != PROFILE_0)) {
+ sf->use_upsampled_references = 0;
+ }
+#endif // CONFIG_EXT_REFS
+
+ if (oxcf->mode == GOOD) {
+ set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+ }
+
+ if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
+ sf->adaptive_pred_interp_filter = 0;
+ }
+
+ // Check for masked out split cases.
+ for (i = 0; i < MAX_REFS; ++i) {
+ if (sf->disable_split_mask & (1 << i)) {
+ rd->thresh_mult_sub8x8[i] = INT_MAX;
+ }
+ }
+
+ // This is used only in the motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test == 1)
+ cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+ else if (cpi->oxcf.motion_vector_unit_test == 2)
+ cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+}
+
+void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int i;
+
+ // best quality defaults
+ sf->frame_parameter_update = 1;
+ sf->mv.search_method = NSTEP;
+ sf->recode_loop = ALLOW_RECODE;
+ sf->mv.subpel_search_method = SUBPEL_TREE;
+ sf->mv.subpel_iters_per_step = 2;
+ sf->mv.subpel_force_stop = 0;
+ sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
+ sf->mv.reduce_first_step_size = 0;
+ sf->coeff_prob_appx_step = 1;
+ sf->mv.auto_mv_step_size = 0;
+ sf->mv.fullpel_search_step_param = 6;
+ sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+ sf->adaptive_rd_thresh = 0;
+ sf->tx_size_search_method = USE_FULL_RD;
+ sf->adaptive_motion_search = 0;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 0;
+ sf->cb_pred_filter_search = 0;
+ sf->cb_partition_search = 0;
+ sf->alt_ref_search_fp = 0;
+ sf->partition_search_type = SEARCH_PARTITION;
+ sf->tx_type_search.prune_mode = NO_PRUNE;
+ sf->tx_type_search.fast_intra_tx_type_search = 0;
+ sf->tx_type_search.fast_inter_tx_type_search = 0;
+ sf->less_rectangular_check = 0;
+ sf->use_square_partition_only = 0;
+ sf->auto_min_max_partition_size = NOT_IN_USE;
+ sf->rd_auto_partition_min_limit = BLOCK_4X4;
+ sf->default_max_partition_size = BLOCK_LARGEST;
+ sf->default_min_partition_size = BLOCK_4X4;
+ sf->adjust_partitioning_from_last_frame = 0;
+ sf->last_partitioning_redo_frequency = 4;
+ sf->disable_split_mask = 0;
+ sf->mode_search_skip_flags = 0;
+ sf->force_frame_boost = 0;
+ sf->max_delta_qindex = 0;
+ sf->disable_filter_search_var_thresh = 0;
+ sf->adaptive_interp_filter_search = 0;
+ sf->allow_partition_search_skip = 0;
+ sf->use_upsampled_references = 1;
+#if CONFIG_EXT_INTER
+ sf->disable_wedge_search_var_thresh = 0;
+ sf->fast_wedge_sign_estimate = 0;
+#endif // CONFIG_EXT_INTER
+
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = INTRA_ALL;
+ sf->intra_uv_mode_mask[i] = INTRA_ALL;
+ }
+ sf->use_rd_breakout = 0;
+ sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+ sf->use_fast_coef_updates = TWO_LOOP;
+ sf->use_fast_coef_costing = 0;
+ sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set
+ sf->schedule_mode_search = 0;
+ for (i = 0; i < BLOCK_SIZES; ++i) sf->inter_mode_mask[i] = INTER_ALL;
+ sf->max_intra_bsize = BLOCK_LARGEST;
+ sf->reuse_inter_pred_sby = 0;
+ // This setting only takes effect when partition_search_type is set
+ // to FIXED_PARTITION.
+ sf->always_this_block_size = BLOCK_16X16;
+ sf->search_type_check_frequency = 50;
+ // Recode loop tolerance %.
+ sf->recode_tolerance = 25;
+ sf->default_interp_filter = SWITCHABLE;
+ sf->tx_size_search_breakout = 0;
+ sf->partition_search_breakout_dist_thr = 0;
+ sf->partition_search_breakout_rate_thr = 0;
+ sf->simple_model_rd_from_var = 0;
+
+ // Set this at the appropriate speed levels
+ sf->use_transform_domain_distortion = 0;
+
+ if (oxcf->mode == GOOD
+#if CONFIG_XIPHRC
+ || oxcf->pass == 1
+#endif
+ )
+ set_good_speed_feature(cpi, cm, sf, oxcf->speed);
+
+ // sf->partition_search_breakout_dist_thr is set assuming max 64x64
+ // blocks. Normalise this if the blocks are bigger.
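+ // (Illustrative: with 128x128 superblocks, MAX_SB_SIZE_LOG2 is 7, so the
+ // threshold scales by 1 << 2 = 4, matching the 4x block area.)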
+ if (MAX_SB_SIZE_LOG2 > 6) {
+ sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6);
+ }
+
+ cpi->full_search_sad = av1_full_search_sad;
+ cpi->diamond_search_sad = av1_diamond_search_sad;
+
+ sf->allow_exhaustive_searches = 1;
+ int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+ sf->exhaustive_searches_thresh = (1 << 24);
+ else
+ sf->exhaustive_searches_thresh = (1 << 25);
+ sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
+ if (speed > 0)
+ sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
+
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range;
+ sf->mesh_patterns[i].interval =
+ good_quality_mesh_patterns[speed][i].interval;
+ }
+#if CONFIG_INTRABC
+ if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) &&
+ (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
+ cpi->oxcf.content == AOM_CONTENT_SCREEN)) {
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range = intrabc_mesh_patterns[speed][i].range;
+ sf->mesh_patterns[i].interval = intrabc_mesh_patterns[speed][i].interval;
+ }
+ sf->max_exaustive_pct = intrabc_max_mesh_pct[speed];
+ }
+#endif // CONFIG_INTRABC
+
+#if !CONFIG_XIPHRC
+ // Slow quant, dct and trellis not worthwhile for first pass
+ // so make sure they are always turned off.
+ if (oxcf->pass == 1) sf->optimize_coefficients = 0;
+#endif
+
+ // No recode for 1 pass.
+ if (oxcf->pass == 0) {
+ sf->recode_loop = DISALLOW_RECODE;
+ sf->optimize_coefficients = 0;
+ }
+
+ if (sf->mv.subpel_search_method == SUBPEL_TREE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_more;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore;
+ }
+
+#if !CONFIG_AOM_QM
+ x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
+#else
+ // FIXME: trellis not very efficient for quantisation matrices
+ x->optimize = 0;
+#endif
+
+ x->min_partition_size = sf->default_min_partition_size;
+ x->max_partition_size = sf->default_max_partition_size;
+
+ if (!cpi->oxcf.frame_periodic_boost) {
+ sf->max_delta_qindex = 0;
+ }
+
+ // This is used only in the motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test == 1)
+ cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+ else if (cpi->oxcf.motion_vector_unit_test == 2)
+ cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+}
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
new file mode 100644
index 000000000..af54a1a9a
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_SPEED_FEATURES_H_
+#define AV1_ENCODER_SPEED_FEATURES_H_
+
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+ INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) |
+ (1 << D135_PRED) | (1 << D117_PRED) | (1 << D153_PRED) |
+ (1 << D207_PRED) | (1 << D63_PRED) |
+#if CONFIG_ALT_INTRA
+ (1 << SMOOTH_PRED) |
+#endif // CONFIG_ALT_INTRA
+ (1 << TM_PRED),
+ INTRA_DC = (1 << DC_PRED),
+ INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED),
+ INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+ INTRA_DC_TM_H_V =
+ (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | (1 << H_PRED)
+};
+
+#if CONFIG_EXT_INTER
+enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) |
+ (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | (1 << NEW_NEWMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) |
+ (1 << NEW_NEARESTMV) | (1 << ZERO_ZEROMV),
+ INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) |
+ (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) |
+ (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV),
+ INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
+ (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEARMV) |
+ (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
+ INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) |
+ (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV),
+ INTER_NEAREST_NEW_ZERO =
+ (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | (1 << NEW_NEWMV) |
+ (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | (1 << NEW_NEARESTMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
+ INTER_NEAREST_NEAR_NEW =
+ (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEARMV) |
+ (1 << NEAR_NEARESTMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV),
+ INTER_NEAREST_NEAR_ZERO =
+ (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | (1 << NEAREST_NEARMV) |
+ (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV),
+};
+#else
+enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+ INTER_NEAREST = (1 << NEARESTMV),
+ INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+ INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV),
+ INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV),
+ INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+ INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
+};
+#endif // CONFIG_EXT_INTER
+
+enum {
+ DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST),
+
+ DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+ DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+ LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD)
+};
+
+typedef enum {
+ DIAMOND = 0,
+ NSTEP = 1,
+ HEX = 2,
+ BIGDIA = 3,
+ SQUARE = 4,
+ FAST_HEX = 5,
+ FAST_DIAMOND = 6
+} SEARCH_METHODS;
+
+typedef enum {
+ // No recode.
+ DISALLOW_RECODE = 0,
+ // Allow recode for KF and exceeding maximum frame bandwidth.
+ ALLOW_RECODE_KFMAXBW = 1,
+ // Allow recode only for KF/ARF/GF frames.
+ ALLOW_RECODE_KFARFGF = 2,
+ // Allow recode for all frames based on bitrate constraints.
+ ALLOW_RECODE = 3,
+} RECODE_LOOP_TYPE;
+
+typedef enum {
+ SUBPEL_TREE = 0,
+ SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches
+ SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively
+ SUBPEL_TREE_PRUNED_EVENMORE = 3, // Prunes 1/2- and 1/4-pel searches
+ // Other methods to come
+} SUBPEL_SEARCH_METHODS;
+
+typedef enum {
+ NO_MOTION_THRESHOLD = 0,
+ LOW_MOTION_THRESHOLD = 7
+} MOTION_THRESHOLD;
+
+typedef enum {
+ USE_FULL_RD = 0,
+ USE_LARGESTALL,
+ USE_TX_8X8
+} TX_SIZE_SEARCH_METHOD;
+
+typedef enum {
+ NOT_IN_USE = 0,
+ RELAXED_NEIGHBORING_MIN_MAX = 1
+} AUTO_MIN_MAX_MODE;
+
+typedef enum {
+ // Try the full image with different values.
+ LPF_PICK_FROM_FULL_IMAGE,
+ // Try a small portion of the image with different values.
+ LPF_PICK_FROM_SUBIMAGE,
+ // Estimate the level based on quantizer and frame type
+ LPF_PICK_FROM_Q,
+ // Pick 0 to disable LPF if LPF was enabled last frame
+ LPF_PICK_MINIMAL_LPF
+} LPF_PICK_METHOD;
+
+typedef enum {
+ // Terminate search early based on distortion so far compared to
+ // qp step, distortion in the neighborhood of the frame, etc.
+ FLAG_EARLY_TERMINATE = 1 << 0,
+
+ // Skips comp inter modes if the best so far is an intra mode.
+ FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+ // Skips oblique intra modes if the best so far is an inter mode.
+ FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+ // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+ // intra so far is not one of the neighboring directions.
+ FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+ // Skips intra modes other than DC_PRED if the source variance is small
+ FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} MODE_SEARCH_SKIP_LOGIC;
+
+typedef enum {
+ FLAG_SKIP_EIGHTTAP_REGULAR = 1 << EIGHTTAP_REGULAR,
+ FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
+ FLAG_SKIP_MULTITAP_SHARP = 1 << MULTITAP_SHARP,
+} INTERP_FILTER_MASK;
+
+typedef enum {
+ NO_PRUNE = 0,
+ // eliminates one tx type in vertical and horizontal direction
+ PRUNE_ONE = 1,
+#if CONFIG_EXT_TX
+ // eliminates two tx types in each direction
+ PRUNE_TWO = 2,
+#endif
+} TX_TYPE_PRUNE_MODE;
+
+typedef struct {
+ TX_TYPE_PRUNE_MODE prune_mode;
+ int fast_intra_tx_type_search;
+ int fast_inter_tx_type_search;
+} TX_TYPE_SEARCH;
+
+typedef enum {
+ // Search partitions using RD criterion
+ SEARCH_PARTITION,
+
+ // Always use a fixed size partition
+ FIXED_PARTITION,
+
+ REFERENCE_PARTITION,
+
+ // Use an arbitrary partitioning scheme based on source variance within
+ // a 64X64 SB
+ VAR_BASED_PARTITION,
+
+ // Use non-fixed partitions based on source variance
+ SOURCE_VAR_BASED_PARTITION
+} PARTITION_SEARCH_TYPE;
+
+typedef enum {
+ // Does a dry run to see if any of the contexts need to be updated or not,
+ // before the final run.
+ TWO_LOOP = 0,
+
+ // No dry run; also, only half the coef contexts and bands are updated.
+ // The rest are not updated at all.
+ ONE_LOOP_REDUCED = 1
+} FAST_COEFF_UPDATE;
+
+typedef struct MV_SPEED_FEATURES {
+ // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+ SEARCH_METHODS search_method;
+
+ // This parameter controls which step in the n-step process we start at.
+ // It's changed adaptively based on circumstances.
+ int reduce_first_step_size;
+
+ // If this is set to 1, we limit the motion search range to 2 times the
+ // largest motion vector found in the last frame.
+ int auto_mv_step_size;
+
+ // Subpel search method: SUBPEL_TREE does a subpixel logarithmic search
+ // that keeps stepping at 1/2-pixel units until it stops getting a gain,
+ // then moves on to 1/4 pel and repeats the same process, skipping many
+ // diagonals along the way. The PRUNED variants skip further candidates.
+ SUBPEL_SEARCH_METHODS subpel_search_method;
+
+ // Maximum number of steps in logarithmic subpel search before giving up.
+ int subpel_iters_per_step;
+
+ // Control when to stop subpel search
+ int subpel_force_stop;
+
+ // This variable sets the step_param used in full pel motion search.
+ int fullpel_search_step_param;
+} MV_SPEED_FEATURES;
+
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+ int range;
+ int interval;
+} MESH_PATTERN;
+
+typedef struct SPEED_FEATURES {
+ MV_SPEED_FEATURES mv;
+
+ // Frame level coding parameter update
+ int frame_parameter_update;
+
+ RECODE_LOOP_TYPE recode_loop;
+
+ // Trellis (dynamic programming) optimization of quantized values (+1, 0).
+ int optimize_coefficients;
+
+ // Always set to 0. If on, it enables 0 cost background transmission
+ // (except for the initial transmission of the segmentation). The feature is
+ // disabled because the addition of very large block sizes makes the
+ // backgrounds very cheap to encode, and the segmentation we have
+ // adds overhead.
+ int static_segmentation;
+
+ // If 1, we iterate finding a best reference for 2 ref frames together via
+ // a log search that iterates 4 times (check around the mv for last for the
+ // best error of the combined predictor, then check around the mv for alt).
+ // If 0, we just use the best motion vector found for each frame by itself.
+ BLOCK_SIZE comp_inter_joint_search_thresh;
+
+ // This variable is used to cap the maximum number of times we skip testing a
+ // mode to be evaluated. A high value means we will be faster.
+ int adaptive_rd_thresh;
+
+ // Coefficient probability model approximation step size
+ int coeff_prob_appx_step;
+
+ // This threshold determines how slow the motion is; it is used when
+ // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION.
+ MOTION_THRESHOLD lf_motion_threshold;
+
+ // Determine which method we use to determine transform size. We can choose
+ // between options like full rd, largest for prediction size, largest
+ // for intra and model coefs for the rest.
+ TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+ // After looking at the first set of modes (set by index here), skip
+ // checking modes for reference frames that don't match the reference frame
+ // of the best so far.
+ int mode_skip_start;
+
+ PARTITION_SEARCH_TYPE partition_search_type;
+
+ TX_TYPE_SEARCH tx_type_search;
+
+ // Used if partition_search_type = FIXED_PARTITION
+ BLOCK_SIZE always_this_block_size;
+
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split.
+ int less_rectangular_check;
+
+ // Disable testing non-square partitions (e.g. 16x32).
+ int use_square_partition_only;
+
+ // Sets min and max partition sizes for this superblock based on the
+ // same superblock in last encoded frame, and the left and above neighbor.
+ AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+ // Ensures the rd based auto partition search will always
+ // go down at least to the specified level.
+ BLOCK_SIZE rd_auto_partition_min_limit;
+
+ // Min and max partition size we enable (block_size) as per auto
+ // min max, but also used by adjust partitioning, and pick_partitioning.
+ BLOCK_SIZE default_min_partition_size;
+ BLOCK_SIZE default_max_partition_size;
+
+ // Whether or not we allow partitions one smaller or one greater than the last
+ // frame's partitioning. Only used if use_lastframe_partitioning is set.
+ int adjust_partitioning_from_last_frame;
+
+ // How frequently we re do the partitioning from scratch. Only used if
+ // use_lastframe_partitioning is set.
+ int last_partitioning_redo_frequency;
+
+ // Disables sub-8x8 block sizes in different scenarios: the choices are to
+ // disable it always, to allow it only for the Last frame and Intra, to
+ // disable it for all inter modes, or to enable it always.
+ int disable_split_mask;
+
+ // TODO(jingning): combine the related motion search speed features
+ // This allows us to use motion search at other sizes as a starting
+ // point for this motion search and limits the search range around it.
+ int adaptive_motion_search;
+
+ // Flag for allowing some use of exhaustive searches.
+ int allow_exhaustive_searches;
+
+ // Threshold for allowing exhaustive motion search.
+ int exhaustive_searches_thresh;
+
+ // Maximum usage of exhaustive motion search for a frame, as a percentage.
+ int max_exaustive_pct;
+
+ // Pattern to be used for any exhaustive mesh searches.
+ MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
+ int schedule_mode_search;
+
+ // Allows sub-8x8 modes to use the prediction filter that was determined
+ // best for 8x8 mode. If set to 0 we always re-check all the filters for
+ // sizes less than 8x8; 1 means we check all filter modes if no 8x8 filter
+ // was selected, and 2 means we use 8-tap if no 8x8 filter mode was selected.
+ int adaptive_pred_interp_filter;
+
+ // Adaptive prediction mode search
+ int adaptive_mode_search;
+
+ // Chessboard pattern prediction filter type search
+ int cb_pred_filter_search;
+
+ int cb_partition_search;
+
+ int alt_ref_search_fp;
+
+ // Use finer quantizer in every other few frames that run variable block
+ // partition type search.
+ int force_frame_boost;
+
+ // Maximally allowed base quantization index fluctuation.
+ int max_delta_qindex;
+
+ // Implements various heuristics to skip searching modes
+ // The heuristics selected are based on flags
+ // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
+ unsigned int mode_search_skip_flags;
+
+ // A source variance threshold below which filter search is disabled.
+ // Choose a very large value (UINT_MAX) to always use the 8-tap filter.
+ unsigned int disable_filter_search_var_thresh;
+
+#if CONFIG_EXT_INTER
+ // A source variance threshold below which wedge search is disabled
+ unsigned int disable_wedge_search_var_thresh;
+
+ // Whether fast wedge sign estimate is used
+ int fast_wedge_sign_estimate;
+#endif // CONFIG_EXT_INTER
+
+ // These bit masks allow you to enable or disable intra modes for each
+ // transform size separately.
+ int intra_y_mode_mask[TX_SIZES];
+ int intra_uv_mode_mask[TX_SIZES];
+
+ // These bit masks allow you to enable or disable intra modes for each
+ // prediction block size separately.
+ int intra_y_mode_bsize_mask[BLOCK_SIZES];
+
+ // This variable enables an early break out of mode testing if the model for
+ // rd built from the prediction signal indicates a value that's much
+ // higher than the best rd we've seen so far.
+ int use_rd_breakout;
+
+ // This feature controls how the loop filter level is determined.
+ LPF_PICK_METHOD lpf_pick;
+
+ // This feature limits the number of coefficients updates we actually do
+ // by only looking at counts from 1/2 the bands.
+ FAST_COEFF_UPDATE use_fast_coef_updates;
+
+ // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
+ // modes are used in order from LSB to MSB for each BLOCK_SIZE.
+ int inter_mode_mask[BLOCK_SIZES];
+
+ // This feature controls whether we do the expensive context update and
+ // calculation in the rd coefficient costing loop.
+ int use_fast_coef_costing;
+
+ // This feature controls the tolerance vs. target used in deciding whether
+ // to recode a frame. It has no meaning if recode is disabled.
+ int recode_tolerance;
+
+ // This variable controls the maximum block size where intra blocks can be
+ // used in inter frames.
+ // TODO(aconverse): Fold this into one of the other many mode skips
+ BLOCK_SIZE max_intra_bsize;
+
+ // The frequency that we check if SOURCE_VAR_BASED_PARTITION or
+ // FIXED_PARTITION search type should be used.
+ int search_type_check_frequency;
+
+ // When partition is pre-set, the inter prediction result from pick_inter_mode
+ // can be reused in final block encoding process. It is enabled only for real-
+ // time mode speed 6.
+ int reuse_inter_pred_sby;
+
+ // default interp filter choice
+ InterpFilter default_interp_filter;
+
+ // Early termination in transform size search, which only applies while
+ // tx_size_search_method is USE_FULL_RD.
+ int tx_size_search_breakout;
+
+ // adaptive interp_filter search to allow skip of certain filter types.
+ int adaptive_interp_filter_search;
+
+ // mask for skip evaluation of certain interp_filter type.
+ INTERP_FILTER_MASK interp_filter_search_mask;
+
+ // Partition search early breakout thresholds.
+ int64_t partition_search_breakout_dist_thr;
+ int partition_search_breakout_rate_thr;
+
+ // Allow skipping partition search for still image frames
+ int allow_partition_search_skip;
+
+ // Fast approximation of av1_model_rd_from_var_lapndz
+ int simple_model_rd_from_var;
+
+ // Do sub-pixel search in up-sampled reference frames
+ int use_upsampled_references;
+
+ // Whether to compute distortion in the image domain (slower but
+ // more accurate) or in the transform domain (faster but less accurate).
+ int use_transform_domain_distortion;
+} SPEED_FEATURES;
+
+struct AV1_COMP;
+
+void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi);
+void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_SPEED_FEATURES_H_
diff --git a/third_party/aom/av1/encoder/subexp.c b/third_party/aom/av1/encoder/subexp.c
new file mode 100644
index 000000000..8960d3341
--- /dev/null
+++ b/third_party/aom/av1/encoder/subexp.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom_dsp/bitwriter.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/subexp.h"
+
+static const uint8_t update_bits[255] = {
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 0,
+};
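+// update_bits[d] caches the length, in bits, of the terminated subexponential
+// code produced by encode_term_subexp() below for delta index d: 5 bits for
+// d < 16, 6 bits for d < 32, 8 bits for d < 64, and 10 or 11 bits in the
+// uniform-coded tail. MIN_DELP_BITS is the smallest of these lengths.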
+#define MIN_DELP_BITS 5
+
+static int recenter_nonneg(int v, int m) {
+ if (v > (m << 1))
+ return v;
+ else if (v >= m)
+ return ((v - m) << 1);
+ else
+ return ((m - v) << 1) - 1;
+}
+
+static int remap_prob(int v, int m) {
+ int i;
+ static const uint8_t map_table[MAX_PROB - 1] = {
+ // generated by:
+ // map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM);
+ 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33,
+ 34, 35, 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 2, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+ 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 4, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88,
+ 89, 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102,
+ 103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+ 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
+ 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11,
+ 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171,
+ 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13, 182, 183, 184, 185,
+ 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, 195, 196, 197, 198, 199,
+ 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, 213,
+ 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
+ 228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+ 18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19,
+ };
+ v--;
+ m--;
+ if ((m << 1) <= MAX_PROB)
+ i = recenter_nonneg(v, m) - 1;
+ else
+ i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1;
+
+ i = map_table[i];
+ return i;
+}
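+
+// Worked example (illustrative): recenter_nonneg() interleaves values around
+// m so that small |v - m| maps to small codes. With m = 5: v = 5 -> 0,
+// v = 4 -> 1, v = 6 -> 2, v = 7 -> 4, and any v > 10 maps to itself.
+// remap_prob() then applies map_table[] so that twenty evenly spaced anchor
+// indices receive the cheapest codes 0-19 (see the generator comment above).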
+
+static int prob_diff_update_cost(aom_prob newp, aom_prob oldp) {
+ int delp = remap_prob(newp, oldp);
+ return update_bits[delp] << AV1_PROB_COST_SHIFT;
+}
+
+static void encode_uniform(aom_writer *w, int v) {
+ const int l = 8;
+ const int m = (1 << l) - 190;
+ if (v < m) {
+ aom_write_literal(w, v, l - 1);
+ } else {
+ aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+ aom_write_literal(w, (v - m) & 1, 1);
+ }
+}
+
+static INLINE int write_bit_gte(aom_writer *w, int word, int test) {
+ aom_write_literal(w, word >= test, 1);
+ return word >= test;
+}
+
+static void encode_term_subexp(aom_writer *w, int word) {
+ if (!write_bit_gte(w, word, 16)) {
+ aom_write_literal(w, word, 4);
+ } else if (!write_bit_gte(w, word, 32)) {
+ aom_write_literal(w, word - 16, 4);
+ } else if (!write_bit_gte(w, word, 64)) {
+ aom_write_literal(w, word - 32, 5);
+ } else {
+ encode_uniform(w, word - 64);
+ }
+}
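+
+// Worked example (illustrative): word = 40 emits "1" (>= 16), "1" (>= 32),
+// "0" (< 64) and then the 5-bit literal 8 (40 - 32), 8 bits in total,
+// matching update_bits[40]. Words below 16 cost "0" plus 4 literal bits,
+// which is the MIN_DELP_BITS = 5 floor.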
+
+void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldp) {
+ const int delp = remap_prob(newp, oldp);
+ encode_term_subexp(w, delp);
+}
+
+int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp,
+ aom_prob *bestp, aom_prob upd,
+ int probwt) {
+ const uint32_t old_b = cost_branch256(ct, oldp);
+ int bestsavings = 0;
+ aom_prob newp, bestnewp = oldp;
+ const int step = *bestp > oldp ? -1 : 1;
+ const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd);
+
+ if (old_b > (uint32_t)upd_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) {
+ for (newp = *bestp; newp != oldp; newp += step) {
+ const int new_b = cost_branch256(ct, newp);
+ const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+ const int savings = (int)((int64_t)old_b - new_b - update_b * probwt);
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ }
+ *bestp = bestnewp;
+ return bestsavings;
+}
+
+int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
+ const aom_prob oldp,
+ aom_prob *bestp, aom_prob upd,
+ int stepsize, int probwt) {
+ int i, old_b, new_b, update_b, savings, bestsavings;
+ int newp;
+ const int step_sign = *bestp > oldp ? -1 : 1;
+ const int step = stepsize * step_sign;
+ const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd);
+ const aom_prob *newplist, *oldplist;
+ aom_prob bestnewp;
+ oldplist = av1_pareto8_full[oldp - 1];
+ old_b = cost_branch256(ct + 2 * PIVOT_NODE, oldp);
+ for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+ old_b += cost_branch256(ct + 2 * i, oldplist[i - UNCONSTRAINED_NODES]);
+
+ bestsavings = 0;
+ bestnewp = oldp;
+
+ assert(stepsize > 0);
+
+ if (old_b > upd_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) {
+ for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) {
+ if (newp < 1 || newp > 255) continue;
+ newplist = av1_pareto8_full[newp - 1];
+ new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp);
+ for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+ new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]);
+ update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+ savings = old_b - new_b - update_b * probwt;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ }
+
+ *bestp = bestnewp;
+ return bestsavings;
+}
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+static int get_cost(unsigned int ct[][2], aom_prob p, int n) {
+ int i, p0 = p;
+ unsigned int total_ct[2] = { 0, 0 };
+ int cost = 0;
+
+ for (i = 0; i <= n; ++i) {
+ cost += cost_branch256(ct[i], p);
+ total_ct[0] += ct[i][0];
+ total_ct[1] += ct[i][1];
+ if (i < n)
+ p = av1_merge_probs(p0, total_ct, COEF_COUNT_SAT, COEF_MAX_UPDATE_FACTOR);
+ }
+ return cost;
+}
+
+int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp,
+ aom_prob *bestp, aom_prob upd, int n) {
+ const int old_b = get_cost(ct, oldp, n);
+ int bestsavings = 0;
+ const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd);
+ aom_prob newp, bestnewp = oldp;
+ const int step = *bestp > oldp ? -1 : 1;
+
+ for (newp = *bestp; newp != oldp; newp += step) {
+ const int new_b = get_cost(ct, newp, n);
+ const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+ const int savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ *bestp = bestnewp;
+ return bestsavings;
+}
+
+int av1_prob_update_search_model_subframe(
+ unsigned int ct[ENTROPY_NODES][COEF_PROBS_BUFS][2], const aom_prob *oldp,
+ aom_prob *bestp, aom_prob upd, int stepsize, int n) {
+ int i, old_b, new_b, update_b, savings, bestsavings;
+ int newp;
+ const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
+ const int step = stepsize * step_sign;
+ const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd);
+ aom_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+ av1_model_to_full_probs(oldp, oldplist);
+ memcpy(newplist, oldp, sizeof(aom_prob) * UNCONSTRAINED_NODES);
+ for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
+ old_b += get_cost(ct[i], oldplist[i], n);
+ old_b += get_cost(ct[PIVOT_NODE], oldplist[PIVOT_NODE], n);
+
+ bestsavings = 0;
+ bestnewp = oldp[PIVOT_NODE];
+
+ assert(stepsize > 0);
+
+ for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0; newp += step) {
+ if (newp < 1 || newp > 255) continue;
+ newplist[PIVOT_NODE] = newp;
+ av1_model_to_full_probs(newplist, newplist);
+ for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+ new_b += get_cost(ct[i], newplist[i], n);
+ new_b += get_cost(ct[PIVOT_NODE], newplist[PIVOT_NODE], n);
+ update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + upd_cost;
+ savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+
+ *bestp = bestnewp;
+ return bestsavings;
+}
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
+ const unsigned int ct[2], int probwt) {
+ const aom_prob upd = DIFF_UPDATE_PROB;
+ aom_prob newp = get_binary_prob(ct[0], ct[1]);
+ const int savings =
+ av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt);
+ assert(newp >= 1);
+ if (savings > 0) {
+ aom_write(w, 1, upd);
+ av1_write_prob_diff_update(w, newp, *oldp);
+ *oldp = newp;
+ } else {
+ aom_write(w, 0, upd);
+ }
+}
+
+int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2],
+ int probwt) {
+ const aom_prob upd = DIFF_UPDATE_PROB;
+ aom_prob newp = get_binary_prob(ct[0], ct[1]);
+ const int savings =
+ av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt);
+ return savings;
+}
diff --git a/third_party/aom/av1/encoder/subexp.h b/third_party/aom/av1/encoder/subexp.h
new file mode 100644
index 000000000..049265cb8
--- /dev/null
+++ b/third_party/aom/av1/encoder/subexp.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_SUBEXP_H_
+#define AV1_ENCODER_SUBEXP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/prob.h"
+
+void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldpm);
+
+void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
+ const unsigned int ct[2], int probwt);
+
+int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp,
+ aom_prob *bestp, aom_prob upd,
+ int probwt);
+
+int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
+ const aom_prob oldp,
+ aom_prob *bestp, aom_prob upd,
+ int stepsize, int probwt);
+
+int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2],
+ int probwt);
+#if CONFIG_SUBFRAME_PROB_UPDATE
+int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp,
+ aom_prob *bestp, aom_prob upd, int n);
+int av1_prob_update_search_model_subframe(
+ unsigned int ct[ENTROPY_NODES][COEF_PROBS_BUFS][2], const aom_prob *oldp,
+ aom_prob *bestp, aom_prob upd, int stepsize, int n);
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_SUBEXP_H_
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
new file mode 100644
index 000000000..de962fe84
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -0,0 +1,719 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <limits.h>
+
+#include "./aom_config.h"
+#include "av1/common/alloccommon.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/odintrin.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_scale/aom_scale.h"
+
+static void temporal_filter_predictors_mb_c(
+ MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
+ int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
+ uint8_t *pred, struct scale_factors *scale, int x, int y) {
+ const int which_mv = 0;
+ const MV mv = { mv_row, mv_col };
+ enum mv_precision mv_precision_uv;
+ int uv_stride;
+ // TODO(angiebird): change plane setting accordingly
+ ConvolveParams conv_params = get_conv_params(which_mv, 0);
+
+#if USE_TEMPORALFILTER_12TAP
+#if CONFIG_DUAL_FILTER
+ const InterpFilter interp_filter[4] = { TEMPORALFILTER_12TAP,
+ TEMPORALFILTER_12TAP,
+ TEMPORALFILTER_12TAP,
+ TEMPORALFILTER_12TAP };
+#else
+ const InterpFilter interp_filter = TEMPORALFILTER_12TAP;
+#endif
+ (void)xd;
+#else
+ const InterpFilter interp_filter = xd->mi[0]->mbmi.interp_filter;
+#endif // USE_TEMPORALFILTER_12TAP
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ WarpTypesAllowed warp_types;
+ memset(&warp_types, 0, sizeof(WarpTypesAllowed));
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+ if (uv_block_width == 8) {
+ uv_stride = (stride + 1) >> 1;
+ mv_precision_uv = MV_PRECISION_Q4;
+ } else {
+ uv_stride = stride;
+ mv_precision_uv = MV_PRECISION_Q3;
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale,
+ 16, 16, which_mv, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ 0, MV_PRECISION_Q3, x, y, xd);
+
+ av1_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256],
+ uv_block_width, &mv, scale, uv_block_width,
+ uv_block_height, which_mv, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ 1, mv_precision_uv, x, y, xd);
+
+ av1_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512],
+ uv_block_width, &mv, scale, uv_block_width,
+ uv_block_height, which_mv, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ 2, mv_precision_uv, x, y, xd);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
+ &conv_params, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y, 0, 0,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ MV_PRECISION_Q3, x, y, xd);
+
+ av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width,
+ &mv, scale, uv_block_width, uv_block_height,
+ &conv_params, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y, 1, 0,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ mv_precision_uv, x, y, xd);
+
+ av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width,
+ &mv, scale, uv_block_width, uv_block_height,
+ &conv_params, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y, 2, 0,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ mv_precision_uv, x, y, xd);
+}
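+
+// Note on the predictor layout above: pred[0..255] holds the 16x16 Y block,
+// pred[256..511] the U block and pred[512..] the V block, the chroma blocks
+// each being uv_block_width x uv_block_height samples.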
+
+void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
+ uint8_t *frame2, unsigned int block_width,
+ unsigned int block_height, int strength,
+ int filter_weight, unsigned int *accumulator,
+ uint16_t *count) {
+ unsigned int i, j, k;
+ int modifier;
+ int byte = 0;
+ const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+ for (i = 0, k = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++, k++) {
+ int pixel_value = *frame2;
+
+ // non-local mean approach
+ int diff_sse[9] = { 0 };
+ int idx, idy, index = 0;
+
+ for (idy = -1; idy <= 1; ++idy) {
+ for (idx = -1; idx <= 1; ++idx) {
+ int row = (int)i + idy;
+ int col = (int)j + idx;
+
+ if (row >= 0 && row < (int)block_height && col >= 0 &&
+ col < (int)block_width) {
+ int diff = frame1[byte + idy * (int)stride + idx] -
+ frame2[idy * (int)block_width + idx];
+ diff_sse[index] = diff * diff;
+ ++index;
+ }
+ }
+ }
+
+ assert(index > 0);
+
+ modifier = 0;
+ for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
+
+ modifier *= 3;
+ modifier /= index;
+
+ ++frame2;
+
+ modifier += rounding;
+ modifier >>= strength;
+
+ if (modifier > 16) modifier = 16;
+
+ modifier = 16 - modifier;
+ modifier *= filter_weight;
+
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+
+ byte++;
+ }
+
+ byte += stride - block_width;
+ }
+}
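+
+// Worked numeric example (illustrative): with strength = 6, filter_weight = 2
+// and all nine neighbors differing by 4 (diff_sse sums to 144), the modifier
+// is 144 * 3 / 9 = 48; adding the rounding term 32 and shifting right by 6
+// gives 1, so (16 - 1) * 2 = 30 is added to count[k] and 30 * pixel_value to
+// accumulator[k].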
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_temporal_filter_apply_c(
+ uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8,
+ unsigned int block_width, unsigned int block_height, int strength,
+ int filter_weight, unsigned int *accumulator, uint16_t *count) {
+ uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
+ uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
+ unsigned int i, j, k;
+ int modifier;
+ int byte = 0;
+ const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+ for (i = 0, k = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++, k++) {
+ int pixel_value = *frame2;
+
+ // non-local mean approach
+ int diff_sse[9] = { 0 };
+ int idx, idy, index = 0;
+
+ for (idy = -1; idy <= 1; ++idy) {
+ for (idx = -1; idx <= 1; ++idx) {
+ int row = (int)i + idy;
+ int col = (int)j + idx;
+
+ if (row >= 0 && row < (int)block_height && col >= 0 &&
+ col < (int)block_width) {
+ int diff = frame1[byte + idy * (int)stride + idx] -
+ frame2[idy * (int)block_width + idx];
+ diff_sse[index] = diff * diff;
+ ++index;
+ }
+ }
+ }
+
+ assert(index > 0);
+
+ modifier = 0;
+ for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
+
+ modifier *= 3;
+ modifier /= index;
+
+ ++frame2;
+
+ modifier += rounding;
+ modifier >>= strength;
+
+ if (modifier > 16) modifier = 16;
+
+ modifier = 16 - modifier;
+ modifier *= filter_weight;
+
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+
+ byte++;
+ }
+
+ byte += stride - block_width;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
+ uint8_t *arf_frame_buf,
+ uint8_t *frame_ptr_buf,
+ int stride) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ int bestsme = INT_MAX;
+ int distortion;
+ unsigned int sse;
+ int cost_list[5];
+ MvLimits tmp_mv_limits = x->mv_limits;
+
+ MV best_ref_mv1 = { 0, 0 };
+ MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ // Save input state
+ struct buf_2d src = x->plane[0].src;
+ struct buf_2d pre = xd->plane[0].pre[0];
+
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ // Setup frame pointers
+ x->plane[0].src.buf = arf_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = frame_ptr_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ step_param = mv_sf->reduce_first_step_size;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+#if CONFIG_REF_MV
+ x->mvcost = x->mv_cost_stack[0];
+ x->nmvjointcost = x->nmv_vec_cost[0];
+ x->mvsadcost = x->mvcost;
+ x->nmvjointsadcost = x->nmvjointcost;
+#endif
+
+ // Ignore mv costing by sending NULL pointer instead of cost arrays
+ av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
+ cond_cost_list(cpi, cost_list), &cpi->fn_ptr[BLOCK_16X16], 0,
+ &best_ref_mv1);
+
+ x->mv_limits = tmp_mv_limits;
+
+ // Ignore mv costing by sending NULL pointer instead of cost array
+ bestsme = cpi->find_fractional_mv_step(
+ x, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+ 0);
+
+ x->e_mbd.mi[0]->bmi[0].as_mv[0] = x->best_mv;
+
+ // Restore input state
+ x->plane[0].src = src;
+ xd->plane[0].pre[0] = pre;
+
+ return bestsme;
+}
+
+static void temporal_filter_iterate_c(AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG **frames,
+ int frame_count, int alt_ref_index,
+ int strength,
+ struct scale_factors *scale) {
+ int byte;
+ int frame;
+ int mb_col, mb_row;
+ unsigned int filter_weight;
+ int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;
+ int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
+ int mb_y_offset = 0;
+ int mb_uv_offset = 0;
+ DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
+ MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+ YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
+ uint8_t *dst1, *dst2;
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]);
+ DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]);
+ uint8_t *predictor;
+#else
+ DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]);
+#endif
+ const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
+ const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
+
+ // Save input state
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ int i;
+#if CONFIG_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ predictor = CONVERT_TO_BYTEPTR(predictor16);
+ } else {
+ predictor = predictor8;
+ }
+#endif
+
+ for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf;
+
+ for (mb_row = 0; mb_row < mb_rows; mb_row++) {
+    // Source frames are extended to 16 pixels. This is different from
+    // L/A/G reference frames, which have a border of 32
+    // (AV1ENCBORDERINPIXELS). A 6/8-tap filter is used for motion search,
+    // which requires 2 pixels before and 3 pixels after, so the largest Y
+    // mv on a border would then be 16 - AOM_INTERP_EXTEND. The UV blocks
+    // are half the size of the Y blocks and are therefore only extended by
+    // 8, so the largest mv a UV block can support is 8 - AOM_INTERP_EXTEND.
+    // A UV mv is half of a Y mv, and the halved Y limit,
+    // (16 - AOM_INTERP_EXTEND) >> 1, is greater than 8 - AOM_INTERP_EXTEND,
+    // so UV is the binding constraint. To keep the mv usable for both the
+    // Y and UV planes, the maximum on a border is therefore
+    // 16 - (2 * AOM_INTERP_EXTEND + 1).
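+    // For example, assuming AOM_INTERP_EXTEND == 4 (its value at the time
+    // of this import), the limits below evaluate to -(mb_row * 16 + 9) and
+    // (mb_rows - 1 - mb_row) * 16 + 9.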
+ cpi->td.mb.mv_limits.row_min =
+ -((mb_row * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+ cpi->td.mb.mv_limits.row_max =
+ ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+
+ for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+ int j, k;
+ int stride;
+
+ memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
+ memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+
+ cpi->td.mb.mv_limits.col_min =
+ -((mb_col * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+ cpi->td.mb.mv_limits.col_max =
+ ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+
+ for (frame = 0; frame < frame_count; frame++) {
+ const int thresh_low = 10000;
+ const int thresh_high = 20000;
+
+ if (frames[frame] == NULL) continue;
+
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0;
+
+ if (frame == alt_ref_index) {
+ filter_weight = 2;
+ } else {
+ // Find best match in this frame by MC
+ int err = temporal_filter_find_matching_mb_c(
+ cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
+ frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride);
+
+          // Assign a higher weight to the matching MB if its error
+          // score is lower. If MC is not applied, the default behavior
+          // is to weight all MBs equally.
+ filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0;
+ }
+
+ if (filter_weight != 0) {
+ // Construct the predictors
+ temporal_filter_predictors_mb_c(
+ mbd, frames[frame]->y_buffer + mb_y_offset,
+ frames[frame]->u_buffer + mb_uv_offset,
+ frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
+ mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale,
+ mb_col * 16, mb_row * 16);
+
+#if CONFIG_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int adj_strength = strength + 2 * (mbd->bd - 8);
+ // Apply the filter (YUV)
+ av1_highbd_temporal_filter_apply(
+ f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
+ adj_strength, filter_weight, accumulator, count);
+ av1_highbd_temporal_filter_apply(
+ f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
+ mb_uv_width, mb_uv_height, adj_strength, filter_weight,
+ accumulator + 256, count + 256);
+ av1_highbd_temporal_filter_apply(
+ f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
+ mb_uv_width, mb_uv_height, adj_strength, filter_weight,
+ accumulator + 512, count + 512);
+ } else {
+ // Apply the filter (YUV)
+ av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+ predictor, 16, 16, strength,
+ filter_weight, accumulator, count);
+ av1_temporal_filter_apply_c(
+ f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
+ mb_uv_width, mb_uv_height, strength, filter_weight,
+ accumulator + 256, count + 256);
+ av1_temporal_filter_apply_c(
+ f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
+ mb_uv_width, mb_uv_height, strength, filter_weight,
+ accumulator + 512, count + 512);
+ }
+#else
+ // Apply the filter (YUV)
+ av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+ predictor, 16, 16, strength,
+ filter_weight, accumulator, count);
+ av1_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
+ predictor + 256, mb_uv_width,
+ mb_uv_height, strength, filter_weight,
+ accumulator + 256, count + 256);
+ av1_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
+ predictor + 512, mb_uv_width,
+ mb_uv_height, strength, filter_weight,
+ accumulator + 512, count + 512);
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *dst1_16;
+ uint16_t *dst2_16;
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
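+        // OD_DIVU(a + (b >> 1), b) below is integer division of a by b with
+        // rounding to nearest: the accumulated weighted sum divided by the
+        // accumulated weight.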
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ dst1_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+
+ // U
+ dst1_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // V
+ dst2_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - mb_uv_width;
+ }
+ } else {
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+
+ // U
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // V
+ dst2[byte] =
+ (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - mb_uv_width;
+ }
+ }
+#else
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+
+ // U
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // V
+ dst2[byte] =
+ (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - mb_uv_width;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ mb_y_offset += 16;
+ mb_uv_offset += mb_uv_width;
+ }
+ mb_y_offset += 16 * (f->y_stride - mb_cols);
+ mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
+ }
+
+ // Restore input state
+ for (i = 0; i < MAX_MB_PLANE; i++) mbd->plane[i].pre[0].buf = input_buffer[i];
+}
+
+// Apply buffer limits and context-specific adjustments to the arnr filter.
+static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
+ int *arnr_frames, int *arnr_strength) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int frames_after_arf =
+ av1_lookahead_depth(cpi->lookahead) - distance - 1;
+ int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
+ int frames_bwd;
+ int q, frames, strength;
+
+ // Define the forward and backwards filter limits for this arnr group.
+ if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf;
+ if (frames_fwd > distance) frames_fwd = distance;
+
+ frames_bwd = frames_fwd;
+
+  // For an even-length filter there is one more frame backward
+  // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+ if (frames_bwd < distance) frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;
+
+ // Set the baseline active filter size.
+ frames = frames_bwd + 1 + frames_fwd;
+
+ // Adjust the strength based on active max q.
+ if (cpi->common.current_video_frame > 1)
+ q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
+ cpi->common.bit_depth));
+ else
+ q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[KEY_FRAME],
+ cpi->common.bit_depth));
+ if (q > 16) {
+ strength = oxcf->arnr_strength;
+ } else {
+ strength = oxcf->arnr_strength - ((16 - q) / 2);
+ if (strength < 0) strength = 0;
+ }
+
+ // Adjust number of frames in filter and strength based on gf boost level.
+ if (frames > group_boost / 150) {
+ frames = group_boost / 150;
+ frames += !(frames & 1);
+ }
+
+ if (strength > group_boost / 300) {
+ strength = group_boost / 300;
+ }
+
+ // Adjustments for second level arf in multi arf case.
+ if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) {
+ strength >>= 1;
+ }
+ }
+
+ *arnr_frames = frames;
+ *arnr_strength = strength;
+}
+
+void av1_temporal_filter(AV1_COMP *cpi, int distance) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int frame;
+ int frames_to_blur;
+ int start_frame;
+ int strength;
+ int frames_to_blur_backward;
+ int frames_to_blur_forward;
+ struct scale_factors sf;
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
+#if CONFIG_EXT_REFS
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+#endif
+
+ // Apply context specific adjustments to the arnr filter parameters.
+ adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
+// TODO(weitinglin): Currently, we force the filtering strength on
+// extra ARFs to be zero. We should investigate in which cases it is
+// more beneficial to use non-zero strength filtering.
+#if CONFIG_EXT_REFS
+ if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) {
+ strength = 0;
+ frames_to_blur = 1;
+ }
+#endif
+
+#if CONFIG_EXT_REFS
+ if (strength == 0 && frames_to_blur == 1) {
+ cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 1;
+ } else {
+ cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 0;
+ }
+#endif
+
+ frames_to_blur_backward = (frames_to_blur / 2);
+ frames_to_blur_forward = ((frames_to_blur - 1) / 2);
+ start_frame = distance + frames_to_blur_forward;
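+  // e.g. frames_to_blur == 7 gives 3 backward + the ARF + 3 forward
+  // (bbbAfff); frames_to_blur == 6 gives 3 backward + 2 forward (bbbAff).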
+
+  // Setup frame pointers; NULL indicates a frame not included in the filter.
+ for (frame = 0; frame < frames_to_blur; ++frame) {
+ const int which_buffer = start_frame - frame;
+ struct lookahead_entry *buf =
+ av1_lookahead_peek(cpi->lookahead, which_buffer);
+ frames[frames_to_blur - 1 - frame] = &buf->img;
+ }
+
+ if (frames_to_blur > 0) {
+// Setup scaling factors. Scaling on each of the arnr frames is not
+// supported.
+// ARF is produced at the native frame size and resized when coded.
+#if CONFIG_HIGHBITDEPTH
+ av1_setup_scale_factors_for_frame(
+ &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+ frames[0]->y_crop_width, frames[0]->y_crop_height,
+ cpi->common.use_highbitdepth);
+#else
+ av1_setup_scale_factors_for_frame(
+ &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+ frames[0]->y_crop_width, frames[0]->y_crop_height);
+#endif // CONFIG_HIGHBITDEPTH
+ }
+
+ temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+ frames_to_blur_backward, strength, &sf);
+}
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
new file mode 100644
index 000000000..bc0863a63
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AV1_ENCODER_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_temporal_filter(AV1_COMP *cpi, int distance);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c
new file mode 100644
index 000000000..f48493bf8
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.c
@@ -0,0 +1,887 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/entropy.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
+ { 9, 63 }, { 9, 61 }, { 9, 59 }, { 9, 57 }, { 9, 55 }, { 9, 53 }, { 9, 51 },
+ { 9, 49 }, { 9, 47 }, { 9, 45 }, { 9, 43 }, { 9, 41 }, { 9, 39 }, { 9, 37 },
+ { 9, 35 }, { 9, 33 }, { 9, 31 }, { 9, 29 }, { 9, 27 }, { 9, 25 }, { 9, 23 },
+ { 9, 21 }, { 9, 19 }, { 9, 17 }, { 9, 15 }, { 9, 13 }, { 9, 11 }, { 9, 9 },
+ { 9, 7 }, { 9, 5 }, { 9, 3 }, { 9, 1 }, { 8, 31 }, { 8, 29 }, { 8, 27 },
+ { 8, 25 }, { 8, 23 }, { 8, 21 }, { 8, 19 }, { 8, 17 }, { 8, 15 }, { 8, 13 },
+ { 8, 11 }, { 8, 9 }, { 8, 7 }, { 8, 5 }, { 8, 3 }, { 8, 1 }, { 7, 15 },
+ { 7, 13 }, { 7, 11 }, { 7, 9 }, { 7, 7 }, { 7, 5 }, { 7, 3 }, { 7, 1 },
+ { 6, 7 }, { 6, 5 }, { 6, 3 }, { 6, 1 }, { 5, 3 }, { 5, 1 }, { 4, 1 },
+ { 3, 1 }, { 2, 1 }, { 1, 1 }, { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 },
+ { 4, 0 }, { 5, 0 }, { 5, 2 }, { 6, 0 }, { 6, 2 }, { 6, 4 }, { 6, 6 },
+ { 7, 0 }, { 7, 2 }, { 7, 4 }, { 7, 6 }, { 7, 8 }, { 7, 10 }, { 7, 12 },
+ { 7, 14 }, { 8, 0 }, { 8, 2 }, { 8, 4 }, { 8, 6 }, { 8, 8 }, { 8, 10 },
+ { 8, 12 }, { 8, 14 }, { 8, 16 }, { 8, 18 }, { 8, 20 }, { 8, 22 }, { 8, 24 },
+ { 8, 26 }, { 8, 28 }, { 8, 30 }, { 9, 0 }, { 9, 2 }, { 9, 4 }, { 9, 6 },
+ { 9, 8 }, { 9, 10 }, { 9, 12 }, { 9, 14 }, { 9, 16 }, { 9, 18 }, { 9, 20 },
+ { 9, 22 }, { 9, 24 }, { 9, 26 }, { 9, 28 }, { 9, 30 }, { 9, 32 }, { 9, 34 },
+ { 9, 36 }, { 9, 38 }, { 9, 40 }, { 9, 42 }, { 9, 44 }, { 9, 46 }, { 9, 48 },
+ { 9, 50 }, { 9, 52 }, { 9, 54 }, { 9, 56 }, { 9, 58 }, { 9, 60 }, { 9, 62 }
+};
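+// The exported pointer below is offset to the middle entry ({ 0, 0 }) of the
+// table so it can be indexed directly with signed coefficient values, as in
+// av1_get_token() and av1_get_token_extra(). The same trick is used for
+// av1_dct_cat_lt_10_value_cost further below.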
+const TOKENVALUE *av1_dct_cat_lt_10_value_tokens =
+ dct_cat_lt_10_value_tokens +
+ (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens)) /
+ 2;
+// The corresponding costs of the extra bits for the tokens in the above
+// table are stored in the table below. The values are obtained by looking
+// up the entry for the given extra bits in the cost table corresponding to
+// the token (as defined in the cost element of av1_extra_bits),
+// e.g. {9, 63} maps to cat5_cost[63 >> 1] and {1, 1} maps to sign_cost[1 >> 1].
+static const int dct_cat_lt_10_value_cost[] = {
+ 3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531, 3432, 3409, 3363, 3340, 3282,
+ 3259, 3213, 3190, 3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894, 2795, 2772,
+ 2726, 2703, 2645, 2622, 2576, 2553, 3197, 3116, 3058, 2977, 2881, 2800, 2742,
+ 2661, 2615, 2534, 2476, 2395, 2299, 2218, 2160, 2079, 2566, 2427, 2334, 2195,
+ 2023, 1884, 1791, 1652, 1893, 1696, 1453, 1256, 1229, 864, 512, 512, 512,
+ 512, 0, 512, 512, 512, 512, 864, 1229, 1256, 1453, 1696, 1893, 1652,
+ 1791, 1884, 2023, 2195, 2334, 2427, 2566, 2079, 2160, 2218, 2299, 2395, 2476,
+ 2534, 2615, 2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197, 2553, 2576, 2622,
+ 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
+ 3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432, 3531, 3554, 3600, 3623, 3681,
+ 3704, 3750, 3773,
+};
+const int *av1_dct_cat_lt_10_value_cost =
+ dct_cat_lt_10_value_cost +
+ (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost)) / 2;
+
+// Array indices are identical to previously-existing CONTEXT_NODE indices
+/* clang-format off */
+const aom_tree_index av1_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+ -EOB_TOKEN, 2, // 0 = EOB
+ -ZERO_TOKEN, 4, // 1 = ZERO
+ -ONE_TOKEN, 6, // 2 = ONE
+ 8, 12, // 3 = LOW_VAL
+ -TWO_TOKEN, 10, // 4 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 5 = THREE
+ 14, 16, // 6 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 7 = CAT_ONE
+ 18, 20, // 8 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 9 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE
+};
+/* clang-format on */
+
+static const int16_t zero_cost[] = { 0 };
+static const int16_t sign_cost[1] = { 512 };
+static const int16_t cat1_cost[1 << 1] = { 864, 1229 };
+static const int16_t cat2_cost[1 << 2] = { 1256, 1453, 1696, 1893 };
+static const int16_t cat3_cost[1 << 3] = { 1652, 1791, 1884, 2023,
+ 2195, 2334, 2427, 2566 };
+static const int16_t cat4_cost[1 << 4] = { 2079, 2160, 2218, 2299, 2395, 2476,
+ 2534, 2615, 2661, 2742, 2800, 2881,
+ 2977, 3058, 3116, 3197 };
+static const int16_t cat5_cost[1 << 5] = {
+ 2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963,
+ 2986, 3044, 3067, 3113, 3136, 3190, 3213, 3259, 3282, 3340, 3363,
+ 3409, 3432, 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773
+};
+const int16_t av1_cat6_low_cost[256] = {
+ 3378, 3390, 3401, 3413, 3435, 3447, 3458, 3470, 3517, 3529, 3540, 3552, 3574,
+ 3586, 3597, 3609, 3671, 3683, 3694, 3706, 3728, 3740, 3751, 3763, 3810, 3822,
+ 3833, 3845, 3867, 3879, 3890, 3902, 3973, 3985, 3996, 4008, 4030, 4042, 4053,
+ 4065, 4112, 4124, 4135, 4147, 4169, 4181, 4192, 4204, 4266, 4278, 4289, 4301,
+ 4323, 4335, 4346, 4358, 4405, 4417, 4428, 4440, 4462, 4474, 4485, 4497, 4253,
+ 4265, 4276, 4288, 4310, 4322, 4333, 4345, 4392, 4404, 4415, 4427, 4449, 4461,
+ 4472, 4484, 4546, 4558, 4569, 4581, 4603, 4615, 4626, 4638, 4685, 4697, 4708,
+ 4720, 4742, 4754, 4765, 4777, 4848, 4860, 4871, 4883, 4905, 4917, 4928, 4940,
+ 4987, 4999, 5010, 5022, 5044, 5056, 5067, 5079, 5141, 5153, 5164, 5176, 5198,
+ 5210, 5221, 5233, 5280, 5292, 5303, 5315, 5337, 5349, 5360, 5372, 4988, 5000,
+ 5011, 5023, 5045, 5057, 5068, 5080, 5127, 5139, 5150, 5162, 5184, 5196, 5207,
+ 5219, 5281, 5293, 5304, 5316, 5338, 5350, 5361, 5373, 5420, 5432, 5443, 5455,
+ 5477, 5489, 5500, 5512, 5583, 5595, 5606, 5618, 5640, 5652, 5663, 5675, 5722,
+ 5734, 5745, 5757, 5779, 5791, 5802, 5814, 5876, 5888, 5899, 5911, 5933, 5945,
+ 5956, 5968, 6015, 6027, 6038, 6050, 6072, 6084, 6095, 6107, 5863, 5875, 5886,
+ 5898, 5920, 5932, 5943, 5955, 6002, 6014, 6025, 6037, 6059, 6071, 6082, 6094,
+ 6156, 6168, 6179, 6191, 6213, 6225, 6236, 6248, 6295, 6307, 6318, 6330, 6352,
+ 6364, 6375, 6387, 6458, 6470, 6481, 6493, 6515, 6527, 6538, 6550, 6597, 6609,
+ 6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831,
+ 6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982
+};
+const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES] = {
+ 100, 2263, 2739, 4902, 3160, 5323, 5799, 7962, 3678, 5841, 6317,
+ 8480, 6738, 8901, 9377, 11540, 3678, 5841, 6317, 8480, 6738, 8901,
+ 9377, 11540, 7256, 9419, 9895, 12058, 10316, 12479, 12955, 15118, 3678,
+ 5841, 6317, 8480, 6738, 8901, 9377, 11540, 7256, 9419, 9895, 12058,
+ 10316, 12479, 12955, 15118, 7256, 9419, 9895, 12058, 10316, 12479, 12955,
+ 15118, 10834, 12997, 13473, 15636, 13894, 16057, 16533, 18696,
+#if CONFIG_HIGHBITDEPTH
+ 4193, 6356, 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410,
+ 12573, 10831, 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994,
+ 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771,
+ 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151,
+ 14409, 16572, 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048,
+ 19211, 14927, 17090, 17566, 19729, 17987, 20150, 20626, 22789, 4193, 6356,
+ 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831,
+ 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633,
+ 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410,
+ 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572,
+ 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927,
+ 17090, 17566, 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088,
+ 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563,
+ 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605,
+ 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924,
+ 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+ 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659,
+ 23822, 22080, 24243, 24719, 26882, 4193, 6356, 6832, 8995, 7253, 9416,
+ 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 7771,
+ 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151,
+ 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, 10831, 12994, 13470,
+ 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, 13512,
+ 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, 17987,
+ 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148,
+ 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503,
+ 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665,
+ 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442,
+ 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244,
+ 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719,
+ 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027,
+ 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924,
+ 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+ 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081,
+ 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665,
+ 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379,
+ 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759,
+ 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+ 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120,
+ 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595,
+ 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397,
+ 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, 4193, 6356, 6832,
+ 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994,
+ 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349,
+ 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573,
+ 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048,
+ 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090,
+ 17566, 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346,
+ 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+ 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081,
+ 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087,
+ 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442,
+ 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822,
+ 22080, 24243, 24719, 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985,
+ 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027,
+ 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502,
+ 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+ 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081,
+ 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243,
+ 24719, 26882, 12379, 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957,
+ 18120, 18596, 20759, 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759,
+ 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234,
+ 27397, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698,
+ 22174, 24337, 22595, 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595,
+ 24758, 25234, 27397, 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975,
+ 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503,
+ 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087,
+ 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864,
+ 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244,
+ 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141,
+ 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542,
+ 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017,
+ 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819,
+ 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596,
+ 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758,
+ 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113,
+ 25276, 25752, 27915, 26173, 28336, 28812, 30975, 12379, 14542, 15018, 17181,
+ 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+ 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698,
+ 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017,
+ 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397,
+ 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752,
+ 27915, 26173, 28336, 28812, 30975, 16472, 18635, 19111, 21274, 19532, 21695,
+ 22171, 24334, 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 20050,
+ 22213, 22689, 24852, 23110, 25273, 25749, 27912, 23628, 25791, 26267, 28430,
+ 26688, 28851, 29327, 31490, 20050, 22213, 22689, 24852, 23110, 25273, 25749,
+ 27912, 23628, 25791, 26267, 28430, 26688, 28851, 29327, 31490, 23628, 25791,
+ 26267, 28430, 26688, 28851, 29327, 31490, 27206, 29369, 29845, 32008, 30266,
+ 32429, 32905, 35068
+#endif
+};
+
+const uint8_t av1_cat6_skipped_bits_discount[8] = {
+ 0, 3, 6, 9, 12, 18, 24, 30
+};
+
+#if CONFIG_NEW_MULTISYMBOL
+const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = {
+ { 0, 0, 0, zero_cost }, // ZERO_TOKEN
+ { 0, 0, 1, sign_cost }, // ONE_TOKEN
+ { 0, 0, 2, sign_cost }, // TWO_TOKEN
+ { 0, 0, 3, sign_cost }, // THREE_TOKEN
+ { 0, 0, 4, sign_cost }, // FOUR_TOKEN
+ { av1_cat1_cdf, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN
+ { av1_cat2_cdf, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN
+ { av1_cat3_cdf, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN
+ { av1_cat4_cdf, 4, CAT4_MIN_VAL, cat4_cost }, // CATEGORY4_TOKEN
+ { av1_cat5_cdf, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN
+ { av1_cat6_cdf, 18, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN
+ { 0, 0, 0, zero_cost } // EOB_TOKEN
+};
+#else
+const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = {
+ { 0, 0, 0, zero_cost }, // ZERO_TOKEN
+ { 0, 0, 1, sign_cost }, // ONE_TOKEN
+ { 0, 0, 2, sign_cost }, // TWO_TOKEN
+ { 0, 0, 3, sign_cost }, // THREE_TOKEN
+ { 0, 0, 4, sign_cost }, // FOUR_TOKEN
+ { av1_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN
+ { av1_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN
+ { av1_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN
+ { av1_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost }, // CATEGORY4_TOKEN
+ { av1_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN
+ { av1_cat6_prob, 18, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN
+ { 0, 0, 0, zero_cost } // EOB_TOKEN
+};
+#endif
+
+#if !CONFIG_EC_MULTISYMBOL
+const struct av1_token av1_coef_encodings[ENTROPY_TOKENS] = {
+ { 2, 2 }, { 6, 3 }, { 28, 5 }, { 58, 6 }, { 59, 6 }, { 60, 6 },
+ { 61, 6 }, { 124, 7 }, { 125, 7 }, { 126, 7 }, { 127, 7 }, { 0, 1 }
+};
+#endif // !CONFIG_EC_MULTISYMBOL
+
+struct tokenize_b_args {
+ const AV1_COMP *cpi;
+ ThreadData *td;
+ TOKENEXTRA **tp;
+ int this_rate;
+};
+
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *const cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const PLANE_TYPE type = pd->plane_type;
+ const int ref = is_inter_block(mbmi);
+ const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, ref);
+ const int rate = av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order,
+ pd->above_context + blk_col,
+ pd->left_context + blk_row, 0);
+ args->this_rate += rate;
+ (void)plane_bsize;
+ av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col,
+ blk_row);
+}
+
+static void set_entropy_context_b(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ (void)plane_bsize;
+ av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col,
+ blk_row);
+}
+
+#if CONFIG_NEW_TOKENSET
+static INLINE void add_token(TOKENEXTRA **t,
+ aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)],
+ aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)],
+ int eob_val, int first_val, int32_t extra,
+ uint8_t token) {
+ (*t)->token = token;
+ (*t)->extra = extra;
+ (*t)->tail_cdf = tail_cdf;
+ (*t)->head_cdf = head_cdf;
+ (*t)->eob_val = eob_val;
+ (*t)->first_val = first_val;
+ (*t)++;
+}
+
+#else // CONFIG_NEW_TOKENSET
+static INLINE void add_token(
+ TOKENEXTRA **t, const aom_prob *context_tree,
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob (*token_cdf)[CDF_SIZE(ENTROPY_TOKENS)],
+#endif // CONFIG_EC_MULTISYMBOL
+ int32_t extra, uint8_t token, uint8_t skip_eob_node, unsigned int *counts) {
+ (*t)->token = token;
+ (*t)->extra = extra;
+ (*t)->context_tree = context_tree;
+#if CONFIG_EC_MULTISYMBOL
+ (*t)->token_cdf = token_cdf;
+#endif // CONFIG_EC_MULTISYMBOL
+ (*t)->skip_eob_node = skip_eob_node;
+ (*t)++;
+ ++counts[token];
+}
+#endif // CONFIG_NEW_TOKENSET
+#endif // !CONFIG_PVQ || CONFIG_VAR_TX
+
+#if CONFIG_PALETTE
+void av1_tokenize_palette_sb(const AV1_COMP *cpi,
+ const struct ThreadData *const td, int plane,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate) {
+ const MACROBLOCK *const x = &td->mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const uint8_t *const color_map = xd->plane[plane].color_index_map;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int n = pmi->palette_size[plane];
+ int i, j;
+ int this_rate = 0;
+ uint8_t color_order[PALETTE_MAX_SIZE];
+ const aom_prob(
+ *const probs)[PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS - 1] =
+ plane == 0 ? av1_default_palette_y_color_index_prob
+ : av1_default_palette_uv_color_index_prob;
+ int plane_block_width, rows, cols;
+ av1_get_block_dimensions(bsize, plane, xd, &plane_block_width, NULL, &rows,
+ &cols);
+ assert(plane == 0 || plane == 1);
+
+#if CONFIG_PALETTE_THROUGHPUT
+ int k;
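+  // Traverse the color map in anti-diagonal (wavefront) order: the context
+  // for each index depends only on its top and left neighbours, which lie on
+  // the previous diagonal, so indices on the same diagonal are independent.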
+ for (k = 1; k < rows + cols - 1; ++k) {
+ for (j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
+ i = k - j;
+#else
+ for (i = 0; i < rows; ++i) {
+ for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+#endif // CONFIG_PALETTE_THROUGHPUT
+ int color_new_idx;
+ const int color_ctx = av1_get_palette_color_index_context(
+ color_map, plane_block_width, i, j, n, color_order, &color_new_idx);
+ assert(color_new_idx >= 0 && color_new_idx < n);
+ if (dry_run == DRY_RUN_COSTCOEFFS)
+ this_rate += cpi->palette_y_color_cost[n - PALETTE_MIN_SIZE][color_ctx]
+ [color_new_idx];
+ (*t)->token = color_new_idx;
+ (*t)->context_tree = probs[n - PALETTE_MIN_SIZE][color_ctx];
+ (*t)->skip_eob_node = 0;
+ ++(*t);
+ }
+ }
+ if (rate) *rate += this_rate;
+}
+#endif // CONFIG_PALETTE
+
+#if CONFIG_PVQ
+static void add_pvq_block(AV1_COMMON *const cm, MACROBLOCK *const x,
+ PVQ_INFO *pvq) {
+ PVQ_QUEUE *q = x->pvq_q;
+ if (q->curr_pos >= q->buf_len) {
+ int new_buf_len = 2 * q->buf_len + 1;
+ PVQ_INFO *new_buf;
+ CHECK_MEM_ERROR(cm, new_buf, aom_malloc(new_buf_len * sizeof(PVQ_INFO)));
+ memcpy(new_buf, q->buf, q->buf_len * sizeof(PVQ_INFO));
+ aom_free(q->buf);
+ q->buf = new_buf;
+ q->buf_len = new_buf_len;
+ }
+ OD_COPY(q->buf + q->curr_pos, pvq, 1);
+ ++q->curr_pos;
+}
+
+// NOTE: This does not actually generate tokens. Instead, we store the
+// encoding decisions made for PVQ in a queue that we read from when
+// actually writing the bitstream in write_modes_b.
+static void tokenize_pvq(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ PVQ_INFO *pvq_info;
+
+ (void)block;
+ (void)blk_row;
+ (void)blk_col;
+ (void)plane_bsize;
+ (void)tx_size;
+
+ assert(block < MAX_PVQ_BLOCKS_IN_SB);
+ pvq_info = &x->pvq[block][plane];
+ add_pvq_block((AV1_COMMON * const)cm, x, pvq_info);
+}
+#endif // CONFIG_PVQ
+
+static void tokenize_b(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+#if !CONFIG_PVQ
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TOKENEXTRA **tp = args->tp;
+ uint8_t token_cache[MAX_TX_SQUARE];
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ int pt; /* near block/prev token context index */
+ int c;
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ const int eob = p->eobs[block];
+ const PLANE_TYPE type = pd->plane_type;
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+#if CONFIG_SUPERTX
+ const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx);
+#else
+ const int segment_id = mbmi->segment_id;
+#endif // CONFIG_SUPERTX
+ const int16_t *scan, *nb;
+ const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ const int ref = is_inter_block(mbmi);
+ unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ td->rd_counts.coef_counts[txsize_sqr_map[tx_size]][type][ref];
+#if !CONFIG_NEW_TOKENSET
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ const aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+ cpi->subframe_stats.coef_probs_buf[cpi->common.coef_probs_update_idx]
+ [txsize_sqr_map[tx_size]][type][ref];
+#else
+ aom_prob(*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+ cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref];
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+#endif // !CONFIG_NEW_TOKENSET
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#elif CONFIG_EC_MULTISYMBOL
+ FRAME_CONTEXT *ec_ctx = cpi->common.fc;
+#endif
+#if CONFIG_NEW_TOKENSET
+ aom_cdf_prob(
+ *const coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
+ ec_ctx->coef_head_cdfs[txsize_sqr_map[tx_size]][type][ref];
+ aom_cdf_prob(
+ *const coef_tail_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
+ ec_ctx->coef_tail_cdfs[txsize_sqr_map[tx_size]][type][ref];
+ unsigned int(*const blockz_count)[2] =
+ td->counts->blockz_count[txsize_sqr_map[tx_size]][type][ref];
+ int eob_val;
+ int first_val = 1;
+#else
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
+ ec_ctx->coef_cdfs[txsize_sqr_map[tx_size]][type][ref];
+#endif
+ int skip_eob = 0;
+#endif
+ const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ unsigned int(*const eob_branch)[COEFF_CONTEXTS] =
+ td->counts->eob_branch[txsize_sqr_map[tx_size]][type][ref];
+ const uint8_t *const band = get_band_translate(tx_size);
+ int16_t token;
+ EXTRABIT extra;
+ (void)plane_bsize;
+ pt = get_entropy_context(tx_size, pd->above_context + blk_col,
+ pd->left_context + blk_row);
+ scan = scan_order->scan;
+ nb = scan_order->neighbors;
+ c = 0;
+
+#if CONFIG_NEW_TOKENSET
+ if (eob == 0)
+ add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], 1,
+ 1, 0, BLOCK_Z_TOKEN);
+
+ ++blockz_count[pt][eob != 0];
+
+ while (c < eob) {
+ int v = qcoeff[scan[c]];
+ first_val = (c == 0);
+
+ if (!v) {
+ add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt],
+ 0, first_val, 0, ZERO_TOKEN);
+ ++counts[band[c]][pt][ZERO_TOKEN];
+ token_cache[scan[c]] = 0;
+ } else {
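+      // LAST_EOB means the block is completely full, so no explicit EOB
+      // symbol is coded for it (its counts are also skipped below).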
+ eob_val =
+ (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
+
+ av1_get_token_extra(v, &token, &extra);
+
+ add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt],
+ eob_val, first_val, extra, (uint8_t)token);
+
+ if (eob_val != LAST_EOB) {
+ ++counts[band[c]][pt][token];
+ ++eob_branch[band[c]][pt];
+ counts[band[c]][pt][EOB_TOKEN] += eob_val != NO_EOB;
+ }
+
+ token_cache[scan[c]] = av1_pt_energy_class[token];
+ }
+ ++c;
+ pt = get_coef_context(nb, token_cache, AOMMIN(c, eob - 1));
+ }
+#else
+ while (c < eob) {
+ const int v = qcoeff[scan[c]];
+ eob_branch[band[c]][pt] += !skip_eob;
+
+ av1_get_token_extra(v, &token, &extra);
+
+ add_token(&t, coef_probs[band[c]][pt],
+#if CONFIG_EC_MULTISYMBOL
+ &coef_cdfs[band[c]][pt],
+#endif
+ extra, (uint8_t)token, (uint8_t)skip_eob, counts[band[c]][pt]);
+
+ token_cache[scan[c]] = av1_pt_energy_class[token];
+ ++c;
+ pt = get_coef_context(nb, token_cache, c);
+ skip_eob = (token == ZERO_TOKEN);
+ }
+ if (c < seg_eob) {
+ add_token(&t, coef_probs[band[c]][pt],
+#if CONFIG_EC_MULTISYMBOL
+ NULL,
+#endif
+ 0, EOB_TOKEN, 0, counts[band[c]][pt]);
+ ++eob_branch[band[c]][pt];
+ }
+#endif // CONFIG_NEW_TOKENSET
+
+#if CONFIG_COEF_INTERLEAVE
+ t->token = EOSB_TOKEN;
+ t++;
+#endif
+
+ *tp = t;
+
+#if CONFIG_ADAPT_SCAN
+  // Since dqcoeff is not available here, we pass qcoeff into
+  // av1_update_scan_count_facade(). The update behavior should be the same,
+  // because av1_update_scan_count_facade() only cares whether the
+  // coefficients are zero or not.
+ av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type,
+ qcoeff, c);
+#endif
+
+ av1_set_contexts(xd, pd, plane, tx_size, c > 0, blk_col, blk_row);
+#else // !CONFIG_PVQ
+ tokenize_pvq(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+#endif // !CONFIG_PVQ
+}
+
+struct is_skippable_args {
+ uint16_t *eobs;
+ int *skippable;
+};
+static void is_skippable(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *argv) {
+ struct is_skippable_args *args = argv;
+ (void)plane;
+ (void)plane_bsize;
+ (void)tx_size;
+ (void)blk_row;
+ (void)blk_col;
+ args->skippable[0] &= (!args->eobs[block]);
+}
+
+// TODO(yaowu): rewrite and optimize this function to remove the usage of
+// av1_foreach_transform_block() and simplify is_skippable().
+int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+ int result = 1;
+ struct is_skippable_args args = { x->plane[plane].eobs, &result };
+ av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable,
+ &args);
+ return result;
+}
+
+#if CONFIG_VAR_TX
+void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
+ TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row,
+ int blk_col, int block, int plane, void *arg) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ TX_SIZE plane_tx_size;
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (tx_size == plane_tx_size) {
+ plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
+ if (!dry_run)
+ tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+ else if (dry_run == DRY_RUN_NORMAL)
+ set_entropy_context_b(plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, arg);
+ else if (dry_run == DRY_RUN_COSTCOEFFS)
+ cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+ } else {
+    // Half the block size, in transform block units.
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ assert(bsl > 0);
+
+ for (i = 0; i < 4; ++i) {
+ const int offsetr = blk_row + ((i >> 1) * bsl);
+ const int offsetc = blk_col + ((i & 0x01) * bsl);
+
+ int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc,
+ block, plane, arg);
+ block += step;
+ }
+ }
+}
+
+void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+ RUN_TYPE dry_run, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ TOKENEXTRA *t_backup = *t;
+ const int ctx = av1_get_skip_context(xd);
+ const int skip_inc =
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ struct tokenize_b_args arg = { cpi, td, t, 0 };
+ int plane;
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ if (mbmi->skip) {
+ if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
+ reset_skip_context(xd, bsize);
+ if (dry_run) *t = t_backup;
+ return;
+ }
+
+ if (!dry_run)
+ td->counts->skip[ctx][0] += skip_inc;
+ else
+ *t = t_backup;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y)) {
+#if !CONFIG_PVQ
+ if (!dry_run) {
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+ }
+#endif
+ continue;
+ }
+#endif
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+ const BLOCK_SIZE plane_bsize =
+ AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#else
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#endif
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+ int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ tokenize_vartx(td, t, dry_run, max_tx_size, plane_bsize, idy, idx,
+ block, plane, &arg);
+ block += step;
+ }
+ }
+
+ if (!dry_run) {
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+ }
+ }
+ if (rate) *rate += arg.this_rate;
+}
+#endif // CONFIG_VAR_TX
+
+void av1_tokenize_sb(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ const int mi_row, const int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int ctx = av1_get_skip_context(xd);
+ const int skip_inc =
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ struct tokenize_b_args arg = { cpi, td, t, 0 };
+ if (mbmi->skip) {
+ if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
+ reset_skip_context(xd, bsize);
+ return;
+ }
+
+ if (!dry_run) {
+#if CONFIG_COEF_INTERLEAVE
+ td->counts->skip[ctx][0] += skip_inc;
+ av1_foreach_transformed_block_interleave(xd, bsize, tokenize_b, &arg);
+#else
+ int plane;
+
+ td->counts->skip[ctx][0] += skip_inc;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y)) {
+#if !CONFIG_PVQ
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+#endif
+ continue;
+ }
+#else
+ (void)mi_row;
+ (void)mi_col;
+#endif
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
+ &arg);
+#if !CONFIG_PVQ
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+#endif // !CONFIG_PVQ
+ }
+#endif
+ }
+#if !CONFIG_PVQ
+ else if (dry_run == DRY_RUN_NORMAL) {
+ int plane;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y))
+ continue;
+#else
+ (void)mi_row;
+ (void)mi_col;
+#endif
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane,
+ set_entropy_context_b, &arg);
+ }
+ } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+ int plane;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y))
+ continue;
+#else
+ (void)mi_row;
+ (void)mi_col;
+#endif
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b,
+ &arg);
+ }
+ }
+#endif // !CONFIG_PVQ
+
+ if (rate) *rate += arg.this_rate;
+}
+
+#if CONFIG_SUPERTX
+void av1_tokenize_sb_supertx(const AV1_COMP *cpi, ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &td->mb.e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ TOKENEXTRA *t_backup = *t;
+ const int ctx = av1_get_skip_context(xd);
+ const int skip_inc =
+ !segfeature_active(&cm->seg, mbmi->segment_id_supertx, SEG_LVL_SKIP);
+ struct tokenize_b_args arg = { cpi, td, t, 0 };
+ if (mbmi->skip) {
+ if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
+ reset_skip_context(xd, bsize);
+ if (dry_run) *t = t_backup;
+ return;
+ }
+
+ if (!dry_run) {
+ int plane;
+ td->counts->skip[ctx][0] += skip_inc;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
+ &arg);
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+ }
+ } else if (dry_run == DRY_RUN_NORMAL) {
+ int plane;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane,
+ set_entropy_context_b, &arg);
+ *t = t_backup;
+ } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+ int plane;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b,
+ &arg);
+ }
+ if (rate) *rate += arg.this_rate;
+}
+#endif // CONFIG_SUPERTX
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
new file mode 100644
index 000000000..3928111d6
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_TOKENIZE_H_
+#define AV1_ENCODER_TOKENIZE_H_
+
+#include "av1/common/entropy.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/treewriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define EOSB_TOKEN 127 // Not signalled, encoder only
+
+#if CONFIG_HIGHBITDEPTH
+typedef int32_t EXTRABIT;
+#else
+typedef int16_t EXTRABIT;
+#endif
+
+typedef struct {
+ int16_t token;
+ EXTRABIT extra;
+} TOKENVALUE;
+
+typedef struct {
+#if CONFIG_NEW_TOKENSET
+ aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)];
+ aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)];
+ int eob_val;
+ int first_val;
+#elif CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob (*token_cdf)[CDF_SIZE(ENTROPY_TOKENS)];
+#endif
+ const aom_prob *context_tree;
+ EXTRABIT extra;
+ uint8_t token;
+ uint8_t skip_eob_node;
+} TOKENEXTRA;
+
+extern const aom_tree_index av1_coef_tree[];
+extern const aom_tree_index av1_coef_con_tree[];
+#if !CONFIG_EC_MULTISYMBOL
+extern const struct av1_token av1_coef_encodings[];
+#endif // !CONFIG_EC_MULTISYMBOL
+
+int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+struct AV1_COMP;
+struct ThreadData;
+
+typedef enum {
+ OUTPUT_ENABLED = 0,
+ DRY_RUN_NORMAL,
+ DRY_RUN_COSTCOEFFS,
+} RUN_TYPE;
+
+// Note: in all the tokenize functions, rate, if non-NULL, is incremented
+// with the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is not incremented.
+#if CONFIG_VAR_TX
+void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate);
+#endif
+#if CONFIG_PALETTE
+void av1_tokenize_palette_sb(const struct AV1_COMP *cpi,
+ const struct ThreadData *const td, int plane,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate);
+#endif // CONFIG_PALETTE
+void av1_tokenize_sb(const struct AV1_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate, const int mi_row, const int mi_col);
+#if CONFIG_SUPERTX
+void av1_tokenize_sb_supertx(const struct AV1_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate);
+#endif
+
+extern const int16_t *av1_dct_value_cost_ptr;
+/* TODO: The Token field should be broken out into a separate char array to
+ * improve cache locality, since it's needed for costing when the rest of the
+ * fields are not.
+ */
+extern const TOKENVALUE *av1_dct_value_tokens_ptr;
+extern const TOKENVALUE *av1_dct_cat_lt_10_value_tokens;
+extern const int *av1_dct_cat_lt_10_value_cost;
+extern const int16_t av1_cat6_low_cost[256];
+#if CONFIG_HIGHBITDEPTH
+#define CAT6_HIGH_COST_ENTRIES 1024
+#else
+#define CAT6_HIGH_COST_ENTRIES 64
+#endif
+extern const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES];
+extern const uint8_t av1_cat6_skipped_bits_discount[8];
+
+static INLINE void av1_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+ *token = CATEGORY6_TOKEN;
+ if (v >= CAT6_MIN_VAL)
+ *extra = 2 * v - 2 * CAT6_MIN_VAL;
+ else
+ *extra = -2 * v - 2 * CAT6_MIN_VAL + 1;
+ return;
+ }
+ *token = av1_dct_cat_lt_10_value_tokens[v].token;
+ *extra = av1_dct_cat_lt_10_value_tokens[v].extra;
+}
+static INLINE int16_t av1_get_token(int v) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) return 10;
+ return av1_dct_cat_lt_10_value_tokens[v].token;
+}
+
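+// The cost of a CATEGORY6 value is assembled from two lookups: the low 8
+// extra bits index av1_cat6_low_cost and the remaining high bits index
+// av1_cat6_high_cost; the discount removes the cost of high-magnitude bits
+// that are not signalled when fewer than 18 cat6 bits are in use.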
+static INLINE int av1_get_token_cost(int v, int16_t *token, int cat6_bits) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+ EXTRABIT extrabits;
+ *token = CATEGORY6_TOKEN;
+ extrabits = abs(v) - CAT6_MIN_VAL;
+ return av1_cat6_low_cost[extrabits & 0xff] +
+ av1_cat6_high_cost[extrabits >> 8] -
+ av1_cat6_skipped_bits_discount[18 - cat6_bits];
+ }
+ *token = av1_dct_cat_lt_10_value_tokens[v].token;
+ return av1_dct_cat_lt_10_value_cost[v];
+}
+
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
+ TX_SIZE tx_size) {
+ const int eob_max = tx_size_2d[tx_size];
+ return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_TOKENIZE_H_
diff --git a/third_party/aom/av1/encoder/treewriter.c b/third_party/aom/av1/encoder/treewriter.c
new file mode 100644
index 000000000..50be72413
--- /dev/null
+++ b/third_party/aom/av1/encoder/treewriter.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/treewriter.h"
+
+static void tree2tok(struct av1_token *tokens, const aom_tree_index *tree,
+ int i, int v, int l) {
+ v += v;
+ ++l;
+
+ do {
+ const aom_tree_index j = tree[i++];
+ if (j <= 0) {
+ tokens[-j].value = v;
+ tokens[-j].len = l;
+ } else {
+ tree2tok(tokens, tree, j, v, l);
+ }
+ } while (++v & 1);
+}
+
+void av1_tokens_from_tree(struct av1_token *tokens,
+ const aom_tree_index *tree) {
+ tree2tok(tokens, tree, 0, 0, 0);
+}
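+
+// Example (illustrative, not part of the library): leaves are stored as
+// negated token indices, so for the 3-symbol tree { 0, 2, -1, -2 } the walk
+// above assigns token 0 the code 0 (length 1), token 1 the code 2
+// (binary 10, length 2), and token 2 the code 3 (binary 11, length 2).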
+
+static unsigned int convert_distribution(unsigned int i, aom_tree tree,
+ unsigned int branch_ct[][2],
+ const unsigned int num_events[]) {
+ unsigned int left, right;
+
+ if (tree[i] <= 0)
+ left = num_events[-tree[i]];
+ else
+ left = convert_distribution(tree[i], tree, branch_ct, num_events);
+
+ if (tree[i + 1] <= 0)
+ right = num_events[-tree[i + 1]];
+ else
+ right = convert_distribution(tree[i + 1], tree, branch_ct, num_events);
+
+ branch_ct[i >> 1][0] = left;
+ branch_ct[i >> 1][1] = right;
+ return left + right;
+}
+
+void av1_tree_probs_from_distribution(aom_tree tree,
+ unsigned int branch_ct[/* n-1 */][2],
+ const unsigned int num_events[/* n */]) {
+ convert_distribution(0, tree, branch_ct, num_events);
+}
diff --git a/third_party/aom/av1/encoder/treewriter.h b/third_party/aom/av1/encoder/treewriter.h
new file mode 100644
index 000000000..9a4cb86cb
--- /dev/null
+++ b/third_party/aom/av1/encoder/treewriter.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_TREEWRITER_H_
+#define AV1_ENCODER_TREEWRITER_H_
+
+#include "aom_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_tree_probs_from_distribution(aom_tree tree,
+ unsigned int branch_ct[/* n - 1 */][2],
+ const unsigned int num_events[/* n */]);
+
+struct av1_token {
+ int value;
+ int len;
+};
+
+void av1_tokens_from_tree(struct av1_token *, const aom_tree_index *);
+
+static INLINE void av1_write_token(aom_writer *w, const aom_tree_index *tree,
+ const aom_prob *probs,
+ const struct av1_token *token) {
+ aom_write_tree(w, tree, probs, token->value, token->len, 0);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_TREEWRITER_H_
diff --git a/third_party/aom/av1/encoder/variance_tree.c b/third_party/aom/av1/encoder/variance_tree.c
new file mode 100644
index 000000000..9384cd78e
--- /dev/null
+++ b/third_party/aom/av1/encoder/variance_tree.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/variance_tree.h"
+#include "av1/encoder/encoder.h"
+
+void av1_setup_var_tree(struct AV1Common *cm, ThreadData *td) {
+ int i, j;
+#if CONFIG_EXT_PARTITION
+ const int leaf_nodes = 1024;
+ const int tree_nodes = 1024 + 256 + 64 + 16 + 4 + 1;
+#else
+ const int leaf_nodes = 256;
+ const int tree_nodes = 256 + 64 + 16 + 4 + 1;
+#endif // CONFIG_EXT_PARTITION
+ int index = 0;
+ VAR_TREE *this_var;
+ int nodes;
+
+ aom_free(td->var_tree);
+ CHECK_MEM_ERROR(cm, td->var_tree,
+ aom_calloc(tree_nodes, sizeof(*td->var_tree)));
+
+ this_var = &td->var_tree[0];
+
+ // Sets up all the leaf nodes in the tree.
+ for (index = 0; index < leaf_nodes; ++index) {
+ VAR_TREE *const leaf = &td->var_tree[index];
+ leaf->split[0] = NULL;
+ }
+
+  // Each node has 4 children; fill in the child pointers,
+  // working up from the leaves to the root.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+ for (i = 0; i < nodes; ++i, ++index) {
+ VAR_TREE *const node = &td->var_tree[index];
+ for (j = 0; j < 4; j++) node->split[j] = this_var++;
+ }
+ }
+
+ // Set up the root node for the largest superblock size
+ i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+ td->var_root[i] = &td->var_tree[tree_nodes - 1];
+ // Set up the root nodes for the rest of the possible superblock sizes
+ while (--i >= 0) {
+ td->var_root[i] = td->var_root[i + 1]->split[0];
+ }
+}
+
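+// A minimal sketch (illustrative, not part of the library): tree_nodes above
+// is the node count of a complete 4-ary tree, i.e. the sum over all levels.
+static int var_tree_node_count(int leaf_nodes) {
+  int total = 0, n;
+  for (n = leaf_nodes; n >= 1; n >>= 2) total += n;  // 256+64+16+4+1 == 341
+  return total;
+}
+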
+void av1_free_var_tree(ThreadData *td) {
+ aom_free(td->var_tree);
+ td->var_tree = NULL;
+}
diff --git a/third_party/aom/av1/encoder/variance_tree.h b/third_party/aom/av1/encoder/variance_tree.h
new file mode 100644
index 000000000..a9f27302e
--- /dev/null
+++ b/third_party/aom/av1/encoder/variance_tree.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_VARIANCE_TREE_H_
+#define AV1_ENCODER_VARIANCE_TREE_H_
+
+#include <assert.h>
+
+#include "./aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct ThreadData;
+
+typedef struct {
+ int64_t sum_square_error;
+ int64_t sum_error;
+ int log2_count;
+ int variance;
+} VAR;
+
+typedef struct {
+ VAR none;
+ VAR horz[2];
+ VAR vert[2];
+} partition_variance;
+
+typedef struct VAR_TREE {
+ int force_split;
+ partition_variance variances;
+ struct VAR_TREE *split[4];
+ BLOCK_SIZE bsize;
+ const uint8_t *src;
+ const uint8_t *ref;
+ int src_stride;
+ int ref_stride;
+ int width;
+ int height;
+#if CONFIG_HIGHBITDEPTH
+ int highbd;
+#endif // CONFIG_HIGHBITDEPTH
+} VAR_TREE;
+
+void av1_setup_var_tree(struct AV1Common *cm, struct ThreadData *td);
+void av1_free_var_tree(struct ThreadData *td);
+
+// Set variance values given the sum of squared errors, the sum of errors,
+// and log2 of the sample count.
+static INLINE void fill_variance(int64_t s2, int64_t s, int c, VAR *v) {
+ v->sum_square_error = s2;
+ v->sum_error = s;
+ v->log2_count = c;
+ v->variance =
+ (int)(256 * (v->sum_square_error -
+ ((v->sum_error * v->sum_error) >> v->log2_count)) >>
+ v->log2_count);
+}
+
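+// Worked example (illustrative): for 4 samples (log2_count == 2) with errors
+// { 1, 1, 3, 3 }, s == 8 and s2 == 20, so the value computed above is
+// 256 * (20 - (8 * 8 >> 2)) >> 2 == 256 * (20 - 16) >> 2 == 256, i.e. 256
+// times the population variance E[e^2] - E[e]^2 == 5 - 4 == 1.
+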
+static INLINE void sum_2_variances(const VAR *a, const VAR *b, VAR *r) {
+ assert(a->log2_count == b->log2_count);
+ fill_variance(a->sum_square_error + b->sum_square_error,
+ a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
+static INLINE void fill_variance_node(VAR_TREE *vt) {
+ sum_2_variances(&vt->split[0]->variances.none, &vt->split[1]->variances.none,
+ &vt->variances.horz[0]);
+ sum_2_variances(&vt->split[2]->variances.none, &vt->split[3]->variances.none,
+ &vt->variances.horz[1]);
+ sum_2_variances(&vt->split[0]->variances.none, &vt->split[2]->variances.none,
+ &vt->variances.vert[0]);
+ sum_2_variances(&vt->split[1]->variances.none, &vt->split[3]->variances.none,
+ &vt->variances.vert[1]);
+ sum_2_variances(&vt->variances.vert[0], &vt->variances.vert[1],
+ &vt->variances.none);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* AV1_ENCODER_VARIANCE_TREE_H_ */
diff --git a/third_party/aom/av1/encoder/wedge_utils.c b/third_party/aom/av1/encoder/wedge_utils.c
new file mode 100644
index 000000000..e6edbb6af
--- /dev/null
+++ b/third_party/aom/av1/encoder/wedge_utils.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes SSE of a compound predictor constructed from 2 fundamental
+ * predictors p0 and p1 using blending with mask.
+ *
+ * r1: Residuals of p1.
+ * (source - p1)
+ * d: Difference of p1 and p0.
+ * (p1 - p0)
+ * m: The blending mask
+ * N: Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ * where r0 is (source - p0) and r1 is (source - p1), which in turn is
+ * equivalent to:
+ * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ * which is the SSE of the residuals of the compound predictor scaled up by
+ * MAX_MASK_VALUE**2.
+ *
+ * Note that we clamp the partial term in the loop to 16 bits signed. This is
+ * to facilitate an equivalent SIMD implementation. It has no effect as long
+ * as residuals fit in 16 - WEDGE_WEIGHT_BITS (= 10) signed bits, which always
+ * holds for 8-bit input; on real input it should hold practically always, as
+ * residuals are expected to be small.
+ */
+uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ uint64_t csse = 0;
+ int i;
+
+ for (i = 0; i < N; i++) {
+ int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+ t = clamp(t, INT16_MIN, INT16_MAX);
+ csse += t * t;
+ }
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
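+
+// A minimal reference sketch (illustrative, not part of the library) of the
+// identity documented above: with r0 = r1 + d (since d = p1 - p0), the value
+// computed here matches av1_wedge_sse_from_residuals_c up to the 16-bit
+// clamp noted in the comment.
+static uint64_t wedge_sse_reference(const int16_t *r1, const int16_t *d,
+                                    const uint8_t *m, int N) {
+  uint64_t sse = 0;
+  int i;
+  for (i = 0; i < N; i++) {
+    const int64_t r0 = r1[i] + d[i];  // r0 = source - p0
+    const int64_t t = m[i] * r0 + (int64_t)(MAX_MASK_VALUE - m[i]) * r1[i];
+    sse += (uint64_t)(t * t);
+  }
+  return ROUND_POWER_OF_TWO(sse, 2 * WEDGE_WEIGHT_BITS);
+}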
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds: Difference of the squares of the residuals.
+ * r0**2 - r1**2
+ * m: The blending mask
+ * N: Number of pixels
+ * limit: Pre-computed threshold value.
+ * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * 'ds' and 'm' are contiguous.
+ *
+ * Returns true if the negated mask has lower SSE compared to the positive
+ * mask. Computation is based on:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
+ * >
+ * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
+ *
+ * which can be simplified to:
+ *
+ * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * The right hand side does not depend on the mask, and needs to be passed as
+ * the 'limit' parameter.
+ *
+ * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left
+ * hand side is simply a scalar product between an int16_t and uint8_t vector.
+ *
+ * Note that for efficiency, ds is stored in 16 bits. Since real input
+ * residuals are small, this should not cause a noticeable issue.
+ */
+int av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N,
+ int64_t limit) {
+ int64_t acc = 0;
+
+ do {
+ acc += *ds++ * *m++;
+ } while (--N);
+
+ return acc > limit;
+}
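+
+// Illustrative sketch of how a caller might form 'limit' (hypothetical
+// helper; the library computes this elsewhere): per the derivation above,
+// limit = MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)).
+static int64_t wedge_sign_limit_sketch(const int16_t *r0, const int16_t *r1,
+                                       int N) {
+  int64_t sum0 = 0, sum1 = 0;
+  int i;
+  for (i = 0; i < N; i++) {
+    sum0 += (int64_t)r0[i] * r0[i];
+    sum1 += (int64_t)r1[i] * r1[i];
+  }
+  return (MAX_MASK_VALUE / 2) * (sum0 - sum1);
+}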
+
+/**
+ * Compute the element-wise difference of the squares of 2 arrays.
+ *
+ * d: Difference of the squares of the inputs: a**2 - b**2
+ * a: First input array
+ * b: Second input array
+ * N: Number of elements
+ *
+ * 'd', 'a', and 'b' are contiguous.
+ *
+ * The result is saturated to signed 16 bits.
+ */
+void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ int i;
+
+ for (i = 0; i < N; i++)
+ d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
new file mode 100644
index 000000000..fa5626002
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "./av1_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Coefficient quantization phase 1.
+// param[0-2]: rounding / quant / dequant constants
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+ const int shift, const int scale,
+ __m128i *qcoeff, __m128i *dquan,
+ __m128i *sign) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+
+ *sign = _mm_cmplt_epi32(*coeff, zero);
+ *sign = _mm_or_si128(*sign, one);
+ *coeff = _mm_abs_epi32(*coeff);
+
+ qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+ qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+ qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+
+ qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+ qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+ dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+ dquan[0] = _mm_srli_epi64(dquan[0], scale);
+}
+
+// Coefficient quantization phase 2
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+ const __m128i *sign,
+ const __m128i *param, const int shift,
+ const int scale, tran_low_t *qAddr,
+ tran_low_t *dqAddr) {
+ __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
+ __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
+
+ qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+ qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+ dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+ dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+ // combine L&H
+ qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
+ qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
+
+ qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+ qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+ dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+ dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+ dquan[0] = _mm_and_si128(dquan[0], mask0H);
+ dquan[1] = _mm_and_si128(dquan[1], mask0L);
+
+ qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+ dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+
+ qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
+ dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+
+ _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
+ _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
+}
+
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+ __m128i *eob) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, iscanIdx;
+ const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+ const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+ __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
+ __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
+
+ nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
+ nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
+
+ mask = _mm_packs_epi32(nz_flag0, nz_flag1);
+ iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
+ iscanIdx = _mm_sub_epi16(iscanIdx, mask);
+ iscanIdx = _mm_and_si128(iscanIdx, mask);
+ *eob = _mm_max_epi16(*eob, iscanIdx);
+}
+
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
+ __m128i eob_shuffled;
+ uint16_t eobValue;
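+  // Fold the vector in half three times (64-, 32-, then 16-bit granularity)
+  // so lane 0 ends up holding the maximum of all eight 16-bit lanes.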
+ eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eobValue = _mm_extract_epi16(*eob, 0);
+ return eobValue;
+}
+
+void av1_highbd_quantize_fp_sse4_1(
+ const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
+ __m128i eob = _mm_setzero_si128();
+ const tran_low_t *src = coeff_ptr;
+ tran_low_t *quanAddr = qcoeff_ptr;
+ tran_low_t *dquanAddr = dqcoeff_ptr;
+ const int shift = 16 - log_scale;
+ const int coeff_stride = 4;
+ const int quan_stride = coeff_stride;
+ (void)skip_block;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ memset(quanAddr, 0, count * sizeof(quanAddr[0]));
+ memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
+
+ if (!skip_block) {
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+
+ qparam[0] =
+ _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1], round_ptr[0]);
+ qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]);
+ qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]);
+
+ // DC and first 3 AC
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+
+ // update round/quan/dquan for AC
+ qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+ qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]);
+ qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]);
+
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+
+ // next 4 AC
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+
+ // loop for the rest of AC
+ while (count > 0) {
+ src += coeff_stride << 1;
+ quanAddr += quan_stride << 1;
+ dquanAddr += quan_stride << 1;
+ iscan += quan_stride << 1;
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff,
+ dequant, &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff,
+ dequant, &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+ }
+ *eob_ptr = get_accumulated_eob(&eob);
+ } else {
+ *eob_ptr = 0;
+ }
+}
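+
+// Scalar sketch (illustrative, not the library API) of the per-coefficient
+// math the two phases above vectorize: quantize with fp rounding, then
+// dequantize, with log_scale splitting the 16-bit shift.
+static INLINE void highbd_quantize_fp_scalar_sketch(
+    tran_low_t coeff, int16_t round, int16_t quant, int16_t dequant,
+    int log_scale, tran_low_t *qc, tran_low_t *dqc) {
+  const int64_t abs_coeff = coeff < 0 ? -(int64_t)coeff : coeff;
+  const int64_t abs_q = (abs_coeff + round) * quant >> (16 - log_scale);
+  const int64_t abs_dq = abs_q * dequant >> log_scale;
+  *qc = (tran_low_t)(coeff < 0 ? -abs_q : abs_q);
+  *dqc = (tran_low_t)(coeff < 0 ? -abs_dq : abs_dq);
+}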
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
new file mode 100644
index 000000000..f9c95b6cb
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
+
+void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ __m128i zero;
+ __m128i thr;
+ int16_t nzflag;
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ coeff_ptr += n_coeffs;
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;
+ __m128i round, quant, dequant;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
+ {
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
+ coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ thr = _mm_srai_epi16(dequant, 1);
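+    // Under fp quantization (round ~= dequant/2, quant ~= 2^16/dequant), a
+    // coefficient with |coeff| <= dequant/2 maps to zero, so thr lets whole
+    // groups of coefficients be zeroed below without the multiplies.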
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
+ coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
+ }
+
+ if (nzflag) {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+ } else {
+ do {
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
new file mode 100644
index 000000000..ad4ae274e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
@@ -0,0 +1,204 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FP 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+ cmp dword skipm, 0
+ jne .blank
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, dequantmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+%ifidn %1, fp_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m1, m5
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ mova m3, [r2q] ; m3 = dequant
+ mov r3, qcoeffmp
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+%ifidn %1, fp_32x32
+ psllw m2, 1
+%endif
+ pxor m5, m5 ; m5 = dedicated zero
+
+ lea coeffq, [ coeffq+ncoeffq*2]
+ lea r5q, [ r5q+ncoeffq*2]
+ lea r3q, [ r3q+ncoeffq*2]
+ lea r4q, [r4q+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpeqw m7, m7
+
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m8
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; r4[i] = r3[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+ psrlw m0, m3, 2
+%else
+ psrlw m0, m3, 1
+%endif
+ mova [r4q+ncoeffq*2+ 0], m8
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m7 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+
+ pcmpgtw m7, m6, m0
+ pcmpgtw m12, m11, m0
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
+
+ or r6, r2
+ jz .skip_iter
+
+ pcmpeqw m7, m7
+
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m14
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; r4[i] = r3[i] * q
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+ mova [r4q+ncoeffq*2+ 0], m14
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m7 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+ jmp .accumulate_eob
+.skip_iter:
+ mova [r3q+ncoeffq*2+ 0], m5
+ mova [r3q+ncoeffq*2+16], m5
+ mova [r4q+ncoeffq*2+ 0], m5
+ mova [r4q+ncoeffq*2+16], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw r6, m8, 0
+ mov [r2], r6
+ RET
+
+ ; skip-block, i.e. just write all zeroes
+.blank:
+ mov r0, dqcoeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, qcoeffmp
+ mov r3, eobmp
+
+ lea r0q, [r0q+ncoeffq*2]
+ lea r2q, [r2q+ncoeffq*2]
+ neg ncoeffq
+ pxor m7, m7
+.blank_loop:
+ mova [r0q+ncoeffq*2+ 0], m7
+ mova [r0q+ncoeffq*2+16], m7
+ mova [r2q+ncoeffq*2+ 0], m7
+ mova [r2q+ncoeffq*2+16], m7
+ add ncoeffq, mmsize
+ jl .blank_loop
+ mov word [r3q], 0
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7
+QUANTIZE_FP fp_32x32, 7
diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
new file mode 100644
index 000000000..dcc697ba3
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
@@ -0,0 +1,219 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+;void av1_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure; probably don't need the pxors
+; (calling app will initialize to 0). Could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadw
+; or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 16x16 ssim so we can play with dssim
+; as distortion in mode selection code.
+global sym(av1_ssim_parms_16x16_sse2) PRIVATE
+sym(av1_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void av1_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure; probably don't need the pxors
+; (calling app will initialize to 0). Could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadw
+; or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 16x16 ssim so we can play with dssim
+; as distortion in mode selection code.
+global sym(av1_ssim_parms_8x8_sse2) PRIVATE
+sym(av1_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
new file mode 100644
index 000000000..37c4b0d88
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
@@ -0,0 +1,3884 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i mask;
+
+ if (!flipud) {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ } else {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ }
+
+ in[0] = _mm_slli_epi16(in[0], 4);
+ in[1] = _mm_slli_epi16(in[1], 4);
+ in[2] = _mm_slli_epi16(in[2], 4);
+ in[3] = _mm_slli_epi16(in[3], 4);
+
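+  // The bias below leaves a zero first sample unchanged (the -1 from the
+  // compare mask and the +1 from k__nonzero_bias_b cancel) and adds 1 to a
+  // nonzero one, presumably mirroring the rounding tweak the C forward 4x4
+  // transform applies to its first input.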
+ mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
+ in[0] = _mm_add_epi16(in[0], mask);
+ in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
+}
+
+static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
+ const __m128i kOne = _mm_set1_epi16(1);
+ __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
+ __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
+ __m128i out01 = _mm_add_epi16(in01, kOne);
+ __m128i out23 = _mm_add_epi16(in23, kOne);
+ out01 = _mm_srai_epi16(out01, 2);
+ out23 = _mm_srai_epi16(out23, 2);
+ store_output(&out01, (output + 0 * 8));
+ store_output(&out23, (output + 1 * 8));
+}
+
+static INLINE void transpose_4x4(__m128i *res) {
+ // Combine and transpose
+ // 00 01 02 03 20 21 22 23
+ // 10 11 12 13 30 31 32 33
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+ // 00 10 20 30 01 11 21 31
+ // 02 12 22 32 03 13 23 33
+ // only use the first 4 16-bit integers
+ res[1] = _mm_unpackhi_epi64(res[0], res[0]);
+ res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+}
+
+static void fdct4_sse2(__m128i *in) {
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u[4], v[4];
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[3], in[2]);
+
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
+ u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
+ u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+ transpose_4x4(in);
+}
+
+static void fadst4_sse2(__m128i *in) {
+ const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+ const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+ const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+ const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8];
+ __m128i in7 = _mm_add_epi16(in[0], in[1]);
+
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+ u[2] = _mm_unpacklo_epi16(in7, kZero);
+ u[3] = _mm_unpacklo_epi16(in[2], kZero);
+ u[4] = _mm_unpacklo_epi16(in[3], kZero);
+
+ v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
+ v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
+ v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
+ v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
+ v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
+ v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_sub_epi32(v[2], v[6]);
+ u[2] = _mm_add_epi32(v[3], v[4]);
+ u[3] = _mm_sub_epi32(u[2], u[0]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_sub_epi32(u[4], v[5]);
+ u[6] = _mm_add_epi32(u[3], u[5]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[2]);
+ in[1] = _mm_packs_epi32(u[1], u[3]);
+ transpose_4x4(in);
+}
+
+#if CONFIG_EXT_TX
+static void fidtx4_sse2(__m128i *in) {
+ const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
+ const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i v0, v1, v2, v3;
+ __m128i u0, u1, u2, u3;
+
+ v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
+ v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
+ v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
+ v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
+
+ u0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
+ u1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
+ u2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
+ u3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
+
+ v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u0, u2);
+ in[1] = _mm_packs_epi32(u1, u3);
+ transpose_4x4(in);
+}
+#endif // CONFIG_EXT_TX
+
+void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[4];
+
+ switch (tx_type) {
+ case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break;
+ case ADST_DCT:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fdct4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case DCT_ADST:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case ADST_ADST:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_4x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fdct4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x4(input, in, stride, 0, 1);
+ fdct4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x4(input, in, stride, 1, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x4(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case IDTX:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case V_DCT:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fidtx4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case H_DCT:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fdct4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case V_ADST:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fidtx4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case H_ADST:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fidtx4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x4(input, in, stride, 0, 1);
+ fidtx4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+void av1_fdct8x8_quant_sse2(const int16_t *input, int stride,
+ int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ __m128i zero;
+ int pass;
+ // Constants
+  // When we use them, in one case they are all the same. In all others,
+  // it is a pair of values that we need to repeat four times. This is done
+  // by constructing the 32-bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ __m128i *in[8];
+ int index = 0;
+
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)coeff_ptr;
+
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ in[0] = &in0;
+ in[1] = &in1;
+ in[2] = &in2;
+ in[3] = &in3;
+ in[4] = &in4;
+ in[5] = &in5;
+ in[6] = &in6;
+ in[7] = &in7;
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = _mm_add_epi16(in0, in7);
+ const __m128i q1 = _mm_add_epi16(in1, in6);
+ const __m128i q2 = _mm_add_epi16(in2, in5);
+ const __m128i q3 = _mm_add_epi16(in3, in4);
+ const __m128i q4 = _mm_sub_epi16(in3, in4);
+ const __m128i q5 = _mm_sub_epi16(in2, in5);
+ const __m128i q6 = _mm_sub_epi16(in1, in6);
+ const __m128i q7 = _mm_sub_epi16(in0, in7);
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+ // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+    // 04 14 24 34 05 15 25 35
+    // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+  // Post-condition the output (the quantization pass below consumes it)
+  {
+    // Post-condition (division by two)
+    // division of two 16-bit signed numbers using shifts
+    // n / 2 = (n - (n >> 15)) >> 1
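+    // e.g. n = -3: the sign mask is -1, so (-3 - (-1)) >> 1 = -1, rounding
+    // toward zero, whereas a plain arithmetic shift would give -3 >> 1 = -2.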
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ }
+
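+  // Advance the pointers to the end of the block and negate n_coeffs, so a
+  // single negative offset both indexes the stores and serves as the loop
+  // counter (the loops below run while n_coeffs < 0).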
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;
+ __m128i round, quant, dequant;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
+ {
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
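+        // In scalar terms, each lane computes
+        //   qcoeff  = sign(x) * (((|x| + round) * quant) >> 16)
+        //   dqcoeff = qcoeff * dequant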
+ // Do DC and first 15 AC
+ coeff0 = *in[0];
+ coeff1 = *in[1];
+
+ // Poor man's sign extract
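+        // srai by 15 yields 0 for non-negative lanes and -1 for negative
+        // ones; (x ^ sign) - sign then computes |x| without a branch.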
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
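+        // round/quant/dequant are laid out {DC, AC, AC, ...}: lane 0 (the
+        // DC constant) applies to the first coefficient only, so once the
+        // first eight lanes are done the all-AC upper half is broadcast
+        // into both 64-bit halves for the rest of the block.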
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
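+        // nzero_coeff* is all ones where the dequantized value is nonzero;
+        // masking the incremented scan indices with it and taking a lanewise
+        // max tracks 1 + the position of the last nonzero coefficient.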
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // AC only loop
+ index = 2;
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+ coeff0 = *in[index];
+ coeff1 = *in[index + 1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ index += 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
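+      // Horizontal max across the eight 16-bit lanes: fold the upper 64
+      // bits onto the lower, then successive word pairs; the overall max
+      // ends up in lanes 0 and 1 and is read out of lane 1.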
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+ } else {
+ do {
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
+ }
+}
+
+// load 8x8 array
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ if (!flipud) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ } else {
+ in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+ }
+
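+  // Pre-scale the input up by two bits to keep extra precision through the
+  // 16-bit transform stages; the right_shift_8x8()/right_shift_16x16()
+  // passes round part of this scale back off after the transforms.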
+ in[0] = _mm_slli_epi16(in[0], 2);
+ in[1] = _mm_slli_epi16(in[1], 2);
+ in[2] = _mm_slli_epi16(in[2], 2);
+ in[3] = _mm_slli_epi16(in[3], 2);
+ in[4] = _mm_slli_epi16(in[4], 2);
+ in[5] = _mm_slli_epi16(in[5], 2);
+ in[6] = _mm_slli_epi16(in[6], 2);
+ in[7] = _mm_slli_epi16(in[7], 2);
+}
+
+// right shift and rounding
+static INLINE void right_shift_8x8(__m128i *res, const int bit) {
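+  // Rounded right shift with round-toward-zero semantics: subtracting the
+  // sign mask replays the (n - (n >> 15)) >> 1 trick used above; for
+  // bit == 2 an extra +1 rounding bias is added first.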
+ __m128i sign0 = _mm_srai_epi16(res[0], 15);
+ __m128i sign1 = _mm_srai_epi16(res[1], 15);
+ __m128i sign2 = _mm_srai_epi16(res[2], 15);
+ __m128i sign3 = _mm_srai_epi16(res[3], 15);
+ __m128i sign4 = _mm_srai_epi16(res[4], 15);
+ __m128i sign5 = _mm_srai_epi16(res[5], 15);
+ __m128i sign6 = _mm_srai_epi16(res[6], 15);
+ __m128i sign7 = _mm_srai_epi16(res[7], 15);
+
+ if (bit == 2) {
+ const __m128i const_rounding = _mm_set1_epi16(1);
+ res[0] = _mm_adds_epi16(res[0], const_rounding);
+ res[1] = _mm_adds_epi16(res[1], const_rounding);
+ res[2] = _mm_adds_epi16(res[2], const_rounding);
+ res[3] = _mm_adds_epi16(res[3], const_rounding);
+ res[4] = _mm_adds_epi16(res[4], const_rounding);
+ res[5] = _mm_adds_epi16(res[5], const_rounding);
+ res[6] = _mm_adds_epi16(res[6], const_rounding);
+ res[7] = _mm_adds_epi16(res[7], const_rounding);
+ }
+
+ res[0] = _mm_sub_epi16(res[0], sign0);
+ res[1] = _mm_sub_epi16(res[1], sign1);
+ res[2] = _mm_sub_epi16(res[2], sign2);
+ res[3] = _mm_sub_epi16(res[3], sign3);
+ res[4] = _mm_sub_epi16(res[4], sign4);
+ res[5] = _mm_sub_epi16(res[5], sign5);
+ res[6] = _mm_sub_epi16(res[6], sign6);
+ res[7] = _mm_sub_epi16(res[7], sign7);
+
+ if (bit == 1) {
+ res[0] = _mm_srai_epi16(res[0], 1);
+ res[1] = _mm_srai_epi16(res[1], 1);
+ res[2] = _mm_srai_epi16(res[2], 1);
+ res[3] = _mm_srai_epi16(res[3], 1);
+ res[4] = _mm_srai_epi16(res[4], 1);
+ res[5] = _mm_srai_epi16(res[5], 1);
+ res[6] = _mm_srai_epi16(res[6], 1);
+ res[7] = _mm_srai_epi16(res[7], 1);
+ } else {
+ res[0] = _mm_srai_epi16(res[0], 2);
+ res[1] = _mm_srai_epi16(res[1], 2);
+ res[2] = _mm_srai_epi16(res[2], 2);
+ res[3] = _mm_srai_epi16(res[3], 2);
+ res[4] = _mm_srai_epi16(res[4], 2);
+ res[5] = _mm_srai_epi16(res[5], 2);
+ res[6] = _mm_srai_epi16(res[6], 2);
+ res[7] = _mm_srai_epi16(res[7], 2);
+ }
+}
+
+// write 8x8 array
+static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
+ int stride) {
+ store_output(&res[0], (output + 0 * stride));
+ store_output(&res[1], (output + 1 * stride));
+ store_output(&res[2], (output + 2 * stride));
+ store_output(&res[3], (output + 3 * stride));
+ store_output(&res[4], (output + 4 * stride));
+ store_output(&res[5], (output + 5 * stride));
+ store_output(&res[6], (output + 6 * stride));
+ store_output(&res[7], (output + 7 * stride));
+}
+
+// transpose an 8x8 block (in and res may alias, giving an in-place transpose)
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+}
+
+static void fdct8_sse2(__m128i *in) {
+ // constants
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
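+  // 8-point 1-D DCT: stage 1 forms the in[i] +/- in[7 - i] butterflies; the
+  // even-index outputs come from the first multiply/round block, the
+  // odd-index outputs from stages 2-4, and a final transpose readies the
+  // results for the row pass.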
+ // stage 1
+ s0 = _mm_add_epi16(in[0], in[7]);
+ s1 = _mm_add_epi16(in[1], in[6]);
+ s2 = _mm_add_epi16(in[2], in[5]);
+ s3 = _mm_add_epi16(in[3], in[4]);
+ s4 = _mm_sub_epi16(in[3], in[4]);
+ s5 = _mm_sub_epi16(in[2], in[5]);
+ s6 = _mm_sub_epi16(in[1], in[6]);
+ s7 = _mm_sub_epi16(in[0], in[7]);
+
+ u0 = _mm_add_epi16(s0, s3);
+ u1 = _mm_add_epi16(s1, s2);
+ u2 = _mm_sub_epi16(s1, s2);
+ u3 = _mm_sub_epi16(s0, s3);
+ // interleave and perform butterfly multiplication/addition
+ v0 = _mm_unpacklo_epi16(u0, u1);
+ v1 = _mm_unpackhi_epi16(u0, u1);
+ v2 = _mm_unpacklo_epi16(u2, u3);
+ v3 = _mm_unpackhi_epi16(u2, u3);
+
+ u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+ u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+ u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+ u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+ u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+ u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+ u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+ u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[4] = _mm_packs_epi32(u2, u3);
+ in[6] = _mm_packs_epi32(u6, u7);
+
+ // stage 2
+ // interleave and perform butterfly multiplication/addition
+ u0 = _mm_unpacklo_epi16(s6, s5);
+ u1 = _mm_unpackhi_epi16(s6, s5);
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+ u0 = _mm_packs_epi32(v0, v1);
+ u1 = _mm_packs_epi32(v2, v3);
+
+ // stage 3
+ s0 = _mm_add_epi16(s4, u0);
+ s1 = _mm_sub_epi16(s4, u0);
+ s2 = _mm_sub_epi16(s7, u1);
+ s3 = _mm_add_epi16(s7, u1);
+
+ // stage 4
+ u0 = _mm_unpacklo_epi16(s0, s3);
+ u1 = _mm_unpackhi_epi16(s0, s3);
+ u2 = _mm_unpacklo_epi16(s1, s2);
+ u3 = _mm_unpackhi_epi16(s1, s2);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+ v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+ v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+ v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+ v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+ v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+ v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+ v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v0, v1);
+ in[3] = _mm_packs_epi32(v4, v5);
+ in[5] = _mm_packs_epi32(v2, v3);
+ in[7] = _mm_packs_epi32(v6, v7);
+
+ // transpose
+ array_transpose_8x8(in, in);
+}
+
+static void fadst8_sse2(__m128i *in) {
+ // Constants
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // reorder the inputs into the order the butterfly stages expect
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
+
+ // column transformation
+ // stage 1
+  // interleave and multiply/add into 32-bit integers
+ s0 = _mm_unpacklo_epi16(in0, in1);
+ s1 = _mm_unpackhi_epi16(in0, in1);
+ s2 = _mm_unpacklo_epi16(in2, in3);
+ s3 = _mm_unpackhi_epi16(in2, in3);
+ s4 = _mm_unpacklo_epi16(in4, in5);
+ s5 = _mm_unpackhi_epi16(in4, in5);
+ s6 = _mm_unpacklo_epi16(in6, in7);
+ s7 = _mm_unpackhi_epi16(in6, in7);
+
+ u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+ u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+ u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+ u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+ u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+ u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+ u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+ u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+ u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+ u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+ u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+ u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+ u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+ u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+ u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+ u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+ // addition
+ w0 = _mm_add_epi32(u0, u8);
+ w1 = _mm_add_epi32(u1, u9);
+ w2 = _mm_add_epi32(u2, u10);
+ w3 = _mm_add_epi32(u3, u11);
+ w4 = _mm_add_epi32(u4, u12);
+ w5 = _mm_add_epi32(u5, u13);
+ w6 = _mm_add_epi32(u6, u14);
+ w7 = _mm_add_epi32(u7, u15);
+ w8 = _mm_sub_epi32(u0, u8);
+ w9 = _mm_sub_epi32(u1, u9);
+ w10 = _mm_sub_epi32(u2, u10);
+ w11 = _mm_sub_epi32(u3, u11);
+ w12 = _mm_sub_epi32(u4, u12);
+ w13 = _mm_sub_epi32(u5, u13);
+ w14 = _mm_sub_epi32(u6, u14);
+ w15 = _mm_sub_epi32(u7, u15);
+
+ // shift and rounding
+ v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+ v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+ v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+ v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+ v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+ v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+ v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+ v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+ u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+ u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+ u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+ u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+ u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+ u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+ u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+ u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ v0 = _mm_add_epi32(w0, w4);
+ v1 = _mm_add_epi32(w1, w5);
+ v2 = _mm_add_epi32(w2, w6);
+ v3 = _mm_add_epi32(w3, w7);
+ v4 = _mm_sub_epi32(w0, w4);
+ v5 = _mm_sub_epi32(w1, w5);
+ v6 = _mm_sub_epi32(w2, w6);
+ v7 = _mm_sub_epi32(w3, w7);
+
+ w0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ w1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ w2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ w3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ w4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ w5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ w6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ w7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(w0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(w1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(w2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(w3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(w4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(w5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(w6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(w7, DCT_CONST_BITS);
+
+ in[4] = _mm_packs_epi32(u8, u9);
+ in[5] = _mm_packs_epi32(u10, u11);
+ in[6] = _mm_packs_epi32(u12, u13);
+ in[7] = _mm_packs_epi32(u14, u15);
+
+ // stage 2
+ s0 = _mm_packs_epi32(v0, v1);
+ s1 = _mm_packs_epi32(v2, v3);
+ s2 = _mm_packs_epi32(v4, v5);
+ s3 = _mm_packs_epi32(v6, v7);
+
+ u0 = _mm_unpacklo_epi16(in[4], in[5]);
+ u1 = _mm_unpackhi_epi16(in[4], in[5]);
+ u2 = _mm_unpacklo_epi16(in[6], in[7]);
+ u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+ v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+ v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+ v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+ v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+ v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+ v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+ v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+ w0 = _mm_add_epi32(v0, v4);
+ w1 = _mm_add_epi32(v1, v5);
+ w2 = _mm_add_epi32(v2, v6);
+ w3 = _mm_add_epi32(v3, v7);
+ w4 = _mm_sub_epi32(v0, v4);
+ w5 = _mm_sub_epi32(v1, v5);
+ w6 = _mm_sub_epi32(v2, v6);
+ w7 = _mm_sub_epi32(v3, v7);
+
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+ s4 = _mm_packs_epi32(u0, u1);
+ s5 = _mm_packs_epi32(u2, u3);
+ s6 = _mm_packs_epi32(u4, u5);
+ s7 = _mm_packs_epi32(u6, u7);
+
+ // stage 3
+ u0 = _mm_unpacklo_epi16(s2, s3);
+ u1 = _mm_unpackhi_epi16(s2, s3);
+ u2 = _mm_unpacklo_epi16(s6, s7);
+ u3 = _mm_unpackhi_epi16(s6, s7);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+ v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+ v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+ v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ s2 = _mm_packs_epi32(v0, v1);
+ s3 = _mm_packs_epi32(v2, v3);
+ s6 = _mm_packs_epi32(v4, v5);
+ s7 = _mm_packs_epi32(v6, v7);
+
+ // FIXME(jingning): do subtract using bit inversion?
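+  // The odd-index outputs of the 8-point ADST carry a sign flip, hence the
+  // subtractions from zero below.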
+ in[0] = s0;
+ in[1] = _mm_sub_epi16(k__const_0, s4);
+ in[2] = s6;
+ in[3] = _mm_sub_epi16(k__const_0, s2);
+ in[4] = s3;
+ in[5] = _mm_sub_epi16(k__const_0, s7);
+ in[6] = s5;
+ in[7] = _mm_sub_epi16(k__const_0, s1);
+
+ // transpose
+ array_transpose_8x8(in, in);
+}
+
+#if CONFIG_EXT_TX
+static void fidtx8_sse2(__m128i *in) {
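+  // Identity "transform": scale each coefficient by 2 (one left shift), then
+  // transpose so the second pass operates on columns.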
+ in[0] = _mm_slli_epi16(in[0], 1);
+ in[1] = _mm_slli_epi16(in[1], 1);
+ in[2] = _mm_slli_epi16(in[2], 1);
+ in[3] = _mm_slli_epi16(in[3], 1);
+ in[4] = _mm_slli_epi16(in[4], 1);
+ in[5] = _mm_slli_epi16(in[5], 1);
+ in[6] = _mm_slli_epi16(in[6], 1);
+ in[7] = _mm_slli_epi16(in[7], 1);
+
+ array_transpose_8x8(in, in);
+}
+#endif // CONFIG_EXT_TX
+
+void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[8];
+
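+  // Except for DCT_DCT, which has a dedicated aom_fdct8x8_sse2() path, each
+  // case loads the block (flipped vertically and/or horizontally for the
+  // FLIPADST variants), applies the column and then the row transform (each
+  // ends in a transpose), rounds down by one bit and stores the result.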
+ switch (tx_type) {
+ case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break;
+ case ADST_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fadst8_sse2(in);
+ fdct8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fdct8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case ADST_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_8x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ fdct8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1);
+ fdct8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 1, 1);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case IDTX:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fidtx8_sse2(in);
+ fidtx8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case V_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fdct8_sse2(in);
+ fidtx8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case H_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fidtx8_sse2(in);
+ fdct8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case V_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fadst8_sse2(in);
+ fidtx8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case H_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fidtx8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ fidtx8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1);
+ fidtx8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
+ __m128i *in1, int stride, int flipud,
+ int fliplr) {
+ // Load 4 8x8 blocks
+ const int16_t *topL = input;
+ const int16_t *topR = input + 8;
+ const int16_t *botL = input + 8 * stride;
+ const int16_t *botR = input + 8 * stride + 8;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ // Swap left columns
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ // Swap right columns
+ tmp = topR;
+ topR = botR;
+ botR = tmp;
+ }
+
+ if (fliplr) {
+ // Swap top rows
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ // Swap bottom rows
+ tmp = botL;
+ botL = botR;
+ botR = tmp;
+ }
+
+ // load first 8 columns
+ load_buffer_8x8(topL, in0, stride, flipud, fliplr);
+ load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr);
+
+ // load second 8 columns
+ load_buffer_8x8(topR, in1, stride, flipud, fliplr);
+ load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr);
+}
+
+static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
+ __m128i *in1, int stride) {
+ // write first 8 columns
+ write_buffer_8x8(output, in0, stride);
+ write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
+ // write second 8 columns
+ output += 8;
+ write_buffer_8x8(output, in1, stride);
+ write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+ __m128i tbuf[8];
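+  // A 16x16 transpose built from four 8x8 transposes: the two diagonal
+  // blocks transpose in place while the two off-diagonal blocks swap,
+  // staged through tbuf.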
+ array_transpose_8x8(res0, res0);
+ array_transpose_8x8(res1, tbuf);
+ array_transpose_8x8(res0 + 8, res1);
+ array_transpose_8x8(res1 + 8, res1 + 8);
+
+ res0[8] = tbuf[0];
+ res0[9] = tbuf[1];
+ res0[10] = tbuf[2];
+ res0[11] = tbuf[3];
+ res0[12] = tbuf[4];
+ res0[13] = tbuf[5];
+ res0[14] = tbuf[6];
+ res0[15] = tbuf[7];
+}
+
+static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
+ // perform rounding operations
+ right_shift_8x8(res0, 2);
+ right_shift_8x8(res0 + 8, 2);
+ right_shift_8x8(res1, 2);
+ right_shift_8x8(res1 + 8, 2);
+}
+
+static void fdct16_8col(__m128i *in) {
+  // perform a 16-point 1-D DCT on 8 columns
+ __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
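+  // Stage 1 splits the input into an even half (i[], sums of mirrored
+  // pairs) that follows the 8-point DCT flow, and an odd half (s[],
+  // differences) resolved over stages 2-6 into the odd-index outputs.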
+ // stage 1
+ i[0] = _mm_add_epi16(in[0], in[15]);
+ i[1] = _mm_add_epi16(in[1], in[14]);
+ i[2] = _mm_add_epi16(in[2], in[13]);
+ i[3] = _mm_add_epi16(in[3], in[12]);
+ i[4] = _mm_add_epi16(in[4], in[11]);
+ i[5] = _mm_add_epi16(in[5], in[10]);
+ i[6] = _mm_add_epi16(in[6], in[9]);
+ i[7] = _mm_add_epi16(in[7], in[8]);
+
+ s[0] = _mm_sub_epi16(in[7], in[8]);
+ s[1] = _mm_sub_epi16(in[6], in[9]);
+ s[2] = _mm_sub_epi16(in[5], in[10]);
+ s[3] = _mm_sub_epi16(in[4], in[11]);
+ s[4] = _mm_sub_epi16(in[3], in[12]);
+ s[5] = _mm_sub_epi16(in[2], in[13]);
+ s[6] = _mm_sub_epi16(in[1], in[14]);
+ s[7] = _mm_sub_epi16(in[0], in[15]);
+
+ p[0] = _mm_add_epi16(i[0], i[7]);
+ p[1] = _mm_add_epi16(i[1], i[6]);
+ p[2] = _mm_add_epi16(i[2], i[5]);
+ p[3] = _mm_add_epi16(i[3], i[4]);
+ p[4] = _mm_sub_epi16(i[3], i[4]);
+ p[5] = _mm_sub_epi16(i[2], i[5]);
+ p[6] = _mm_sub_epi16(i[1], i[6]);
+ p[7] = _mm_sub_epi16(i[0], i[7]);
+
+ u[0] = _mm_add_epi16(p[0], p[3]);
+ u[1] = _mm_add_epi16(p[1], p[2]);
+ u[2] = _mm_sub_epi16(p[1], p[2]);
+ u[3] = _mm_sub_epi16(p[0], p[3]);
+
+ v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+ v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+ v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+ v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+ u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+ u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+ u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+ u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+ u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+ u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[4] = _mm_packs_epi32(u[4], u[5]);
+ in[8] = _mm_packs_epi32(u[2], u[3]);
+ in[12] = _mm_packs_epi32(u[6], u[7]);
+
+ u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+ u[0] = _mm_packs_epi32(v[0], v[1]);
+ u[1] = _mm_packs_epi32(v[2], v[3]);
+
+ t[0] = _mm_add_epi16(p[4], u[0]);
+ t[1] = _mm_sub_epi16(p[4], u[0]);
+ t[2] = _mm_sub_epi16(p[7], u[1]);
+ t[3] = _mm_add_epi16(p[7], u[1]);
+
+ u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+ u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+ u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+ u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ in[2] = _mm_packs_epi32(v[0], v[1]);
+ in[6] = _mm_packs_epi32(v[4], v[5]);
+ in[10] = _mm_packs_epi32(v[2], v[3]);
+ in[14] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[2] = _mm_packs_epi32(v[0], v[1]);
+ t[3] = _mm_packs_epi32(v[2], v[3]);
+ t[4] = _mm_packs_epi32(v[4], v[5]);
+ t[5] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 3
+ p[0] = _mm_add_epi16(s[0], t[3]);
+ p[1] = _mm_add_epi16(s[1], t[2]);
+ p[2] = _mm_sub_epi16(s[1], t[2]);
+ p[3] = _mm_sub_epi16(s[0], t[3]);
+ p[4] = _mm_sub_epi16(s[7], t[4]);
+ p[5] = _mm_sub_epi16(s[6], t[5]);
+ p[6] = _mm_add_epi16(s[6], t[5]);
+ p[7] = _mm_add_epi16(s[7], t[4]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+ u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+ u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[1] = _mm_packs_epi32(v[0], v[1]);
+ t[2] = _mm_packs_epi32(v[2], v[3]);
+ t[5] = _mm_packs_epi32(v[4], v[5]);
+ t[6] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 5
+ s[0] = _mm_add_epi16(p[0], t[1]);
+ s[1] = _mm_sub_epi16(p[0], t[1]);
+ s[2] = _mm_sub_epi16(p[3], t[2]);
+ s[3] = _mm_add_epi16(p[3], t[2]);
+ s[4] = _mm_add_epi16(p[4], t[5]);
+ s[5] = _mm_sub_epi16(p[4], t[5]);
+ s[6] = _mm_sub_epi16(p[7], t[6]);
+ s[7] = _mm_add_epi16(p[7], t[6]);
+
+ // stage 6
+ u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+ u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+ u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+ u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+ u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+ v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+ v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+ v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+ v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+ v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+ v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+ v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+ v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+ v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+ v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v[0], v[1]);
+ in[9] = _mm_packs_epi32(v[2], v[3]);
+ in[5] = _mm_packs_epi32(v[4], v[5]);
+ in[13] = _mm_packs_epi32(v[6], v[7]);
+ in[3] = _mm_packs_epi32(v[8], v[9]);
+ in[11] = _mm_packs_epi32(v[10], v[11]);
+ in[7] = _mm_packs_epi32(v[12], v[13]);
+ in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+static void fadst16_8col(__m128i *in) {
+  // perform a 16-point 1-D ADST on 8 columns
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+
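+  // Inputs are paired in reverse order (in[15] with in[0], in[13] with
+  // in[2], ...) and run through successive cospi butterfly stages, with
+  // dct_const_round_shift applied before each pack back to 16 bits.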
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+ v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+ v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+ v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+ v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+ v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+ v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+ v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+ v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+ u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+ u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+ u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+ u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+ u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+ u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+ u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+ u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+ v[0] = _mm_add_epi32(u[0], u[8]);
+ v[1] = _mm_add_epi32(u[1], u[9]);
+ v[2] = _mm_add_epi32(u[2], u[10]);
+ v[3] = _mm_add_epi32(u[3], u[11]);
+ v[4] = _mm_add_epi32(u[4], u[12]);
+ v[5] = _mm_add_epi32(u[5], u[13]);
+ v[6] = _mm_add_epi32(u[6], u[14]);
+ v[7] = _mm_add_epi32(u[7], u[15]);
+
+ v[16] = _mm_add_epi32(v[0], v[4]);
+ v[17] = _mm_add_epi32(v[1], v[5]);
+ v[18] = _mm_add_epi32(v[2], v[6]);
+ v[19] = _mm_add_epi32(v[3], v[7]);
+ v[20] = _mm_sub_epi32(v[0], v[4]);
+ v[21] = _mm_sub_epi32(v[1], v[5]);
+ v[22] = _mm_sub_epi32(v[2], v[6]);
+ v[23] = _mm_sub_epi32(v[3], v[7]);
+ v[16] = _mm_add_epi32(v[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(v[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(v[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(v[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(v[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(v[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(v[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(v[23], k__DCT_CONST_ROUNDING);
+ v[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ v[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ v[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ v[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ v[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ v[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ v[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ v[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ s[0] = _mm_packs_epi32(v[16], v[17]);
+ s[1] = _mm_packs_epi32(v[18], v[19]);
+ s[2] = _mm_packs_epi32(v[20], v[21]);
+ s[3] = _mm_packs_epi32(v[22], v[23]);
+
+ v[8] = _mm_sub_epi32(u[0], u[8]);
+ v[9] = _mm_sub_epi32(u[1], u[9]);
+ v[10] = _mm_sub_epi32(u[2], u[10]);
+ v[11] = _mm_sub_epi32(u[3], u[11]);
+ v[12] = _mm_sub_epi32(u[4], u[12]);
+ v[13] = _mm_sub_epi32(u[5], u[13]);
+ v[14] = _mm_sub_epi32(u[6], u[14]);
+ v[15] = _mm_sub_epi32(u[7], u[15]);
+
+ v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ s[4] = _mm_packs_epi32(v[8], v[9]);
+ s[5] = _mm_packs_epi32(v[10], v[11]);
+ s[6] = _mm_packs_epi32(v[12], v[13]);
+ s[7] = _mm_packs_epi32(v[14], v[15]);
+
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[8] = _mm_add_epi32(u[0], u[4]);
+ v[9] = _mm_add_epi32(u[1], u[5]);
+ v[10] = _mm_add_epi32(u[2], u[6]);
+ v[11] = _mm_add_epi32(u[3], u[7]);
+ v[12] = _mm_sub_epi32(u[0], u[4]);
+ v[13] = _mm_sub_epi32(u[1], u[5]);
+ v[14] = _mm_sub_epi32(u[2], u[6]);
+ v[15] = _mm_sub_epi32(u[3], u[7]);
+
+ v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ s[8] = _mm_packs_epi32(v[8], v[9]);
+ s[9] = _mm_packs_epi32(v[10], v[11]);
+ s[10] = _mm_packs_epi32(v[12], v[13]);
+ s[11] = _mm_packs_epi32(v[14], v[15]);
+
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(s[4], s[5]);
+ u[1] = _mm_unpackhi_epi16(s[4], s[5]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[4] = _mm_packs_epi32(v[4], v[5]);
+ in[5] = _mm_packs_epi32(v[12], v[13]);
+ in[6] = _mm_packs_epi32(v[8], v[9]);
+ in[7] = _mm_packs_epi32(v[0], v[1]);
+ in[8] = _mm_packs_epi32(v[2], v[3]);
+ in[9] = _mm_packs_epi32(v[10], v[11]);
+ in[10] = _mm_packs_epi32(v[14], v[15]);
+ in[11] = _mm_packs_epi32(v[6], v[7]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static void fdct16_sse2(__m128i *in0, __m128i *in1) {
+ fdct16_8col(in0);
+ fdct16_8col(in1);
+ array_transpose_16x16(in0, in1);
+}
+
+static void fadst16_sse2(__m128i *in0, __m128i *in1) {
+ fadst16_8col(in0);
+ fadst16_8col(in1);
+ array_transpose_16x16(in0, in1);
+}
+
+#if CONFIG_EXT_TX
+static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
+ idtx16_8col(in0);
+ idtx16_8col(in1);
+ array_transpose_16x16(in0, in1);
+}
+#endif // CONFIG_EXT_TX
+
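+// Each 16x16 hybrid transform below runs as two 1-D passes with an
+// intermediate rounding shift: the first *_sse2 call transforms the
+// columns (and transposes the result), right_shift_16x16 rescales the
+// intermediate values, and the second call then transforms what were
+// originally the rows.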
+void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in0[16], in1[16];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case ADST_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case ADST_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 1);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case IDTX:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case V_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case V_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
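+
+// Illustrative usage sketch (not part of the library; assumes the aom
+// DECLARE_ALIGNED macro and a 16-byte-aligned source block):
+//   DECLARE_ALIGNED(16, int16_t, src[16 * 16]);      // input residuals
+//   DECLARE_ALIGNED(16, tran_low_t, coeff[16 * 16]); // output coefficients
+//   av1_fht16x16_sse2(src, coeff, 16 /* stride */, DCT_DCT);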
+
+static INLINE void prepare_4x8_row_first(__m128i *in) {
+ in[0] = _mm_unpacklo_epi64(in[0], in[2]);
+ in[1] = _mm_unpacklo_epi64(in[1], in[3]);
+ transpose_4x4(in);
+ in[4] = _mm_unpacklo_epi64(in[4], in[6]);
+ in[5] = _mm_unpacklo_epi64(in[5], in[7]);
+ transpose_4x4(in + 4);
+}
+
+// Load the input into the left-hand half of in (i.e., into lanes 0..3 of
+// each element of in). The right-hand half (lanes 4..7) should be
+// treated as holding "don't care" values.
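+// The rows are up-shifted by 2 and scaled by sqrt(2) (scale_sqrt2_8x4) so
+// that this rectangular transform follows the same fixed-point scaling
+// convention as the square transforms.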
+static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ const int shift = 2;
+ if (!flipud) {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
+ } else {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
+ in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
+ in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
+ in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
+ }
+
+ in[0] = _mm_slli_epi16(in[0], shift);
+ in[1] = _mm_slli_epi16(in[1], shift);
+ in[2] = _mm_slli_epi16(in[2], shift);
+ in[3] = _mm_slli_epi16(in[3], shift);
+ in[4] = _mm_slli_epi16(in[4], shift);
+ in[5] = _mm_slli_epi16(in[5], shift);
+ in[6] = _mm_slli_epi16(in[6], shift);
+ in[7] = _mm_slli_epi16(in[7], shift);
+
+ scale_sqrt2_8x4(in);
+ scale_sqrt2_8x4(in + 4);
+ prepare_4x8_row_first(in);
+}
+
+static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
+ __m128i in01, in23, in45, in67, sign01, sign23, sign45, sign67;
+ const int shift = 1;
+
+ // Revert the transpose applied by the 8x8 transform.
+ array_transpose_8x8(res, res);
+
+ in01 = _mm_unpacklo_epi64(res[0], res[1]);
+ in23 = _mm_unpacklo_epi64(res[2], res[3]);
+ in45 = _mm_unpacklo_epi64(res[4], res[5]);
+ in67 = _mm_unpacklo_epi64(res[6], res[7]);
+
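+ // Subtracting the sign mask before the arithmetic shift makes the final
+ // shift round toward zero rather than toward minus infinity.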
+ sign01 = _mm_srai_epi16(in01, 15);
+ sign23 = _mm_srai_epi16(in23, 15);
+ sign45 = _mm_srai_epi16(in45, 15);
+ sign67 = _mm_srai_epi16(in67, 15);
+
+ in01 = _mm_sub_epi16(in01, sign01);
+ in23 = _mm_sub_epi16(in23, sign23);
+ in45 = _mm_sub_epi16(in45, sign45);
+ in67 = _mm_sub_epi16(in67, sign67);
+
+ in01 = _mm_srai_epi16(in01, shift);
+ in23 = _mm_srai_epi16(in23, shift);
+ in45 = _mm_srai_epi16(in45, shift);
+ in67 = _mm_srai_epi16(in67, shift);
+
+ store_output(&in01, (output + 0 * 8));
+ store_output(&in23, (output + 1 * 8));
+ store_output(&in45, (output + 2 * 8));
+ store_output(&in67, (output + 3 * 8));
+}
+
+void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[8];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case ADST_DCT:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case DCT_ADST:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case ADST_ADST:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_4x8(input, in, stride, 1, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x8(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x8(input, in, stride, 1, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x8(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x8(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case IDTX:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case V_DCT:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case H_DCT:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case V_ADST:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case H_ADST:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x8(input, in, stride, 1, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x8(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_4x8(output, in);
+}
+
+// Load the input into the left-hand half of in (i.e., into lanes 0..3 of
+// each element of in); the right-hand half (lanes 4..7) holds "don't
+// care" values. The input is split horizontally into two 4x4 chunks 'l'
+// and 'r': 'l' is stored in the top-left 4x4 block of 'in' and 'r' in
+// the bottom-left block. This lets us reuse the 4x4 transforms.
+static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ const int shift = 2;
+ if (!flipud) {
+ in[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
+ } else {
+ in[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
+ in[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
+ in[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
+ in[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ }
+
+ in[0] = _mm_slli_epi16(in[0], shift);
+ in[1] = _mm_slli_epi16(in[1], shift);
+ in[2] = _mm_slli_epi16(in[2], shift);
+ in[3] = _mm_slli_epi16(in[3], shift);
+
+ scale_sqrt2_8x4(in);
+
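+ // Move the upper halves (the right-hand 4x4 chunk) into the low lanes
+ // of in[4..7].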
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+}
+
+static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
+ __m128i out0, out1, out2, out3, sign0, sign1, sign2, sign3;
+ const int shift = 1;
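+ // As in write_buffer_4x8, subtract the sign mask so that the shift
+ // rounds toward zero.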
+ sign0 = _mm_srai_epi16(res[0], 15);
+ sign1 = _mm_srai_epi16(res[1], 15);
+ sign2 = _mm_srai_epi16(res[2], 15);
+ sign3 = _mm_srai_epi16(res[3], 15);
+
+ out0 = _mm_sub_epi16(res[0], sign0);
+ out1 = _mm_sub_epi16(res[1], sign1);
+ out2 = _mm_sub_epi16(res[2], sign2);
+ out3 = _mm_sub_epi16(res[3], sign3);
+
+ out0 = _mm_srai_epi16(out0, shift);
+ out1 = _mm_srai_epi16(out1, shift);
+ out2 = _mm_srai_epi16(out2, shift);
+ out3 = _mm_srai_epi16(out3, shift);
+
+ store_output(&out0, (output + 0 * 8));
+ store_output(&out1, (output + 1 * 8));
+ store_output(&out2, (output + 2 * 8));
+ store_output(&out3, (output + 3 * 8));
+}
+
+void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[8];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case ADST_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case DCT_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case ADST_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_8x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x4(input, in, stride, 0, 1);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x4(input, in, stride, 1, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x4(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case IDTX:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case V_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case H_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case V_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case H_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x4(input, in, stride, 0, 1);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_8x4(output, in);
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ // Load 2 8x8 blocks
+ const int16_t *t = input;
+ const int16_t *b = input + 8 * stride;
+
+ if (flipud) {
+ const int16_t *const tmp = t;
+ t = b;
+ b = tmp;
+ }
+
+ load_buffer_8x8(t, in, stride, flipud, fliplr);
+ scale_sqrt2_8x8(in);
+ load_buffer_8x8(b, in + 8, stride, flipud, fliplr);
+ scale_sqrt2_8x8(in + 8);
+}
+
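+// Rounds x >> n to nearest with ties away from zero (symmetric for
+// negative inputs); e.g. with n = 2: 6 -> 2, 5 -> 1, -5 -> -1, -6 -> -2.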
+static INLINE void round_power_of_two_signed(__m128i *x, int n) {
+ const __m128i rounding = _mm_set1_epi16((1 << n) >> 1);
+ const __m128i sign = _mm_srai_epi16(*x, 15);
+ const __m128i res = _mm_add_epi16(_mm_add_epi16(*x, rounding), sign);
+ *x = _mm_srai_epi16(res, n);
+}
+
+static void row_8x16_rounding(__m128i *in, int bits) {
+ int i;
+ for (i = 0; i < 16; i++) {
+ round_power_of_two_signed(&in[i], bits);
+ }
+}
+
+void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[16];
+
+ __m128i *const t = in; // Alias to top 8x8 sub block
+ __m128i *const b = in + 8; // Alias to bottom 8x8 sub block
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case ADST_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case DCT_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case ADST_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_8x16(input, in, stride, 1, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x16(input, in, stride, 0, 1);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x16(input, in, stride, 1, 1);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x16(input, in, stride, 0, 1);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x16(input, in, stride, 1, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case IDTX:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case V_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case H_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case V_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case H_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x16(input, in, stride, 1, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x16(input, in, stride, 0, 1);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_8x8(output, t, 8);
+ write_buffer_8x8(output + 64, b, 8);
+}
+
+static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ // Load 2 8x8 blocks
+ const int16_t *l = input;
+ const int16_t *r = input + 8;
+
+ if (fliplr) {
+ const int16_t *const tmp = l;
+ l = r;
+ r = tmp;
+ }
+
+ // Load the two 8-column halves.
+ load_buffer_8x8(l, in, stride, flipud, fliplr);
+ scale_sqrt2_8x8(in);
+ load_buffer_8x8(r, in + 8, stride, flipud, fliplr);
+ scale_sqrt2_8x8(in + 8);
+}
+
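+// The per-column rounding for 16x8 is numerically identical to the
+// per-row rounding for 8x16, so reuse it under a name matching this
+// layout.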
+#define col_16x8_rounding row_8x16_rounding
+
+void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[16];
+
+ __m128i *const l = in; // Alias to left 8x8 sub block
+ __m128i *const r = in + 8; // Alias to right 8x8 sub block, which we store
+ // in the second half of the array
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case ADST_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case DCT_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case ADST_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_16x8(input, in, stride, 1, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x8(input, in, stride, 0, 1);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x8(input, in, stride, 1, 1);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x8(input, in, stride, 0, 1);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x8(input, in, stride, 1, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case IDTX:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case V_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case H_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case V_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case H_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x8(input, in, stride, 1, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x8(input, in, stride, 0, 1);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ array_transpose_8x8(l, l);
+ array_transpose_8x8(r, r);
+ write_buffer_8x8(output, l, 16);
+ write_buffer_8x8(output + 8, r, 16);
+}
+
+// Note: the 16-column, 32-element transforms expect their input split
+// into a 2x2 grid of 8x16 blocks (tl and tr above bl and br).
+static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ fdct32_8col(tl, bl);
+ fdct32_8col(tr, br);
+ array_transpose_16x16(tl, tr);
+ array_transpose_16x16(bl, br);
+}
+
+#if CONFIG_EXT_TX
+static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ int i;
+ for (i = 0; i < 16; ++i) {
+ tl[i] = _mm_slli_epi16(tl[i], 2);
+ tr[i] = _mm_slli_epi16(tr[i], 2);
+ bl[i] = _mm_slli_epi16(bl[i], 2);
+ br[i] = _mm_slli_epi16(br[i], 2);
+ }
+ array_transpose_16x16(tl, tr);
+ array_transpose_16x16(bl, br);
+}
+#endif  // CONFIG_EXT_TX
+
+static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl,
+ __m128i *intr, __m128i *inbl,
+ __m128i *inbr, int stride, int flipud,
+ int fliplr) {
+ int i;
+ if (flipud) {
+ input = input + 31 * stride;
+ stride = -stride;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ intl[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
+ intr[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
+ inbl[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0)), 2);
+ inbr[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8)), 2);
+ }
+
+ if (fliplr) {
+ __m128i tmp;
+ for (i = 0; i < 16; ++i) {
+ tmp = intl[i];
+ intl[i] = mm_reverse_epi16(intr[i]);
+ intr[i] = mm_reverse_epi16(tmp);
+ tmp = inbl[i];
+ inbl[i] = mm_reverse_epi16(inbr[i]);
+ inbr[i] = mm_reverse_epi16(tmp);
+ }
+ }
+
+ scale_sqrt2_8x16(intl);
+ scale_sqrt2_8x16(intr);
+ scale_sqrt2_8x16(inbl);
+ scale_sqrt2_8x16(inbr);
+}
+
+static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
+ __m128i *restr, __m128i *resbl,
+ __m128i *resbr) {
+ int i;
+ for (i = 0; i < 16; ++i) {
+ store_output(&restl[i], output + i * 16 + 0);
+ store_output(&restr[i], output + i * 16 + 8);
+ store_output(&resbl[i], output + (i + 16) * 16 + 0);
+ store_output(&resbr[i], output + (i + 16) * 16 + 8);
+ }
+}
+
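+// Same symmetric round-to-nearest as round_power_of_two_signed, unrolled
+// across eight registers.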
+static INLINE void round_signed_8x8(__m128i *in, const int bit) {
+ const __m128i rounding = _mm_set1_epi16((1 << bit) >> 1);
+ __m128i sign0 = _mm_srai_epi16(in[0], 15);
+ __m128i sign1 = _mm_srai_epi16(in[1], 15);
+ __m128i sign2 = _mm_srai_epi16(in[2], 15);
+ __m128i sign3 = _mm_srai_epi16(in[3], 15);
+ __m128i sign4 = _mm_srai_epi16(in[4], 15);
+ __m128i sign5 = _mm_srai_epi16(in[5], 15);
+ __m128i sign6 = _mm_srai_epi16(in[6], 15);
+ __m128i sign7 = _mm_srai_epi16(in[7], 15);
+
+ in[0] = _mm_add_epi16(_mm_add_epi16(in[0], rounding), sign0);
+ in[1] = _mm_add_epi16(_mm_add_epi16(in[1], rounding), sign1);
+ in[2] = _mm_add_epi16(_mm_add_epi16(in[2], rounding), sign2);
+ in[3] = _mm_add_epi16(_mm_add_epi16(in[3], rounding), sign3);
+ in[4] = _mm_add_epi16(_mm_add_epi16(in[4], rounding), sign4);
+ in[5] = _mm_add_epi16(_mm_add_epi16(in[5], rounding), sign5);
+ in[6] = _mm_add_epi16(_mm_add_epi16(in[6], rounding), sign6);
+ in[7] = _mm_add_epi16(_mm_add_epi16(in[7], rounding), sign7);
+
+ in[0] = _mm_srai_epi16(in[0], bit);
+ in[1] = _mm_srai_epi16(in[1], bit);
+ in[2] = _mm_srai_epi16(in[2], bit);
+ in[3] = _mm_srai_epi16(in[3], bit);
+ in[4] = _mm_srai_epi16(in[4], bit);
+ in[5] = _mm_srai_epi16(in[5], bit);
+ in[6] = _mm_srai_epi16(in[6], bit);
+ in[7] = _mm_srai_epi16(in[7], bit);
+}
+
+static INLINE void round_signed_16x16(__m128i *in0, __m128i *in1) {
+ const int bit = 4;
+ round_signed_8x8(in0, bit);
+ round_signed_8x8(in0 + 8, bit);
+ round_signed_8x8(in1, bit);
+ round_signed_8x8(in1 + 8, bit);
+}
+
+// Note:
+// the suffix "t" indicates that the transpose is applied before the
+// transform.
+static void fdct16t_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ fdct16_8col(in0);
+ fdct16_8col(in1);
+}
+
+static void fadst16t_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ fadst16_8col(in0);
+ fadst16_8col(in1);
+}
+
+static INLINE void fdct32t_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ array_transpose_16x16(tl, tr);
+ array_transpose_16x16(bl, br);
+ fdct32_8col(tl, bl);
+ fdct32_8col(tr, br);
+}
+
+typedef enum transpose_indicator_ {
+ transpose,
+ no_transpose,
+} transpose_indicator;
+
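+// "Half right" 32-point transform, used below where a full 32-point ADST
+// would otherwise appear: the top half of the input passes through
+// (scaled by 4) to become the bottom half of the output, while the bottom
+// half is scaled by sqrt(2) and run through a 16-point DCT to form the
+// top half.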
+static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br, transpose_indicator t) {
+ __m128i tmpl[16], tmpr[16];
+ int i;
+
+ // Copy the bottom half of the input to temporary storage
+ for (i = 0; i < 16; ++i) {
+ tmpl[i] = bl[i];
+ tmpr[i] = br[i];
+ }
+
+ // Generate the bottom half of the output
+ for (i = 0; i < 16; ++i) {
+ bl[i] = _mm_slli_epi16(tl[i], 2);
+ br[i] = _mm_slli_epi16(tr[i], 2);
+ }
+ array_transpose_16x16(bl, br);
+
+ // Copy the temporary storage back to the top half of the input
+ for (i = 0; i < 16; ++i) {
+ tl[i] = tmpl[i];
+ tr[i] = tmpr[i];
+ }
+
+ // Generate the top half of the output
+ scale_sqrt2_8x16(tl);
+ scale_sqrt2_8x16(tr);
+ if (t == transpose)
+ fdct16t_sse2(tl, tr);
+ else
+ fdct16_sse2(tl, tr);
+}
+
+// Note on data layout, for both this and the 32x16 transform: to reuse
+// the 16-element transforms easily, the input is split into 8x16 blocks.
+// For 16x32 this gives a 2x2 grid of such blocks; for 32x16, a 4x1 grid.
+void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i intl[16], intr[16], inbl[16], inbr[16];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fdct16t_sse2(intl, intr);
+ fdct16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fdct32t_16col(intl, intr, inbl, inbr);
+ break;
+ case ADST_DCT:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fdct16t_sse2(intl, intr);
+ fdct16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fhalfright32_16col(intl, intr, inbl, inbr, transpose);
+ break;
+ case DCT_ADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fadst16t_sse2(intl, intr);
+ fadst16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fdct32t_16col(intl, intr, inbl, inbr);
+ break;
+ case ADST_ADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fadst16t_sse2(intl, intr);
+ fadst16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fhalfright32_16col(intl, intr, inbl, inbr, transpose);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
+ fdct16t_sse2(intl, intr);
+ fdct16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fhalfright32_16col(intl, intr, inbl, inbr, transpose);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
+ fadst16t_sse2(intl, intr);
+ fadst16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fdct32t_16col(intl, intr, inbl, inbr);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1);
+ fadst16t_sse2(intl, intr);
+ fadst16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fhalfright32_16col(intl, intr, inbl, inbr, transpose);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
+ fadst16t_sse2(intl, intr);
+ fadst16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fhalfright32_16col(intl, intr, inbl, inbr, transpose);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
+ fadst16t_sse2(intl, intr);
+ fadst16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fhalfright32_16col(intl, intr, inbl, inbr, transpose);
+ break;
+ case IDTX:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fidtx16_sse2(intl, intr);
+ fidtx16_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fidtx32_16col(intl, intr, inbl, inbr);
+ break;
+ case V_DCT:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fidtx16_sse2(intl, intr);
+ fidtx16_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fdct32t_16col(intl, intr, inbl, inbr);
+ break;
+ case H_DCT:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fdct16t_sse2(intl, intr);
+ fdct16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fidtx32_16col(intl, intr, inbl, inbr);
+ break;
+ case V_ADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fidtx16_sse2(intl, intr);
+ fidtx16_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fhalfright32_16col(intl, intr, inbl, inbr, transpose);
+ break;
+ case H_ADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fadst16t_sse2(intl, intr);
+ fadst16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fidtx32_16col(intl, intr, inbl, inbr);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
+ fidtx16_sse2(intl, intr);
+ fidtx16_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fhalfright32_16col(intl, intr, inbl, inbr, transpose);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
+ fadst16t_sse2(intl, intr);
+ fadst16t_sse2(inbl, inbr);
+ round_signed_16x16(intl, intr);
+ round_signed_16x16(inbl, inbr);
+ fidtx32_16col(intl, intr, inbl, inbr);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_16x32(output, intl, intr, inbl, inbr);
+}
+
+static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0,
+ __m128i *in1, __m128i *in2, __m128i *in3,
+ int stride, int flipud, int fliplr) {
+ int i;
+ if (flipud) {
+ input += 15 * stride;
+ stride = -stride;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ in0[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
+ in1[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
+ in2[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
+ in3[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
+ }
+
+ if (fliplr) {
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp1 = in0[i];
+ __m128i tmp2 = in1[i];
+ in0[i] = mm_reverse_epi16(in3[i]);
+ in1[i] = mm_reverse_epi16(in2[i]);
+ in2[i] = mm_reverse_epi16(tmp2);
+ in3[i] = mm_reverse_epi16(tmp1);
+ }
+ }
+
+ scale_sqrt2_8x16(in0);
+ scale_sqrt2_8x16(in1);
+ scale_sqrt2_8x16(in2);
+ scale_sqrt2_8x16(in3);
+}
+
+static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
+ __m128i *res1, __m128i *res2,
+ __m128i *res3) {
+ int i;
+ for (i = 0; i < 16; ++i) {
+ store_output(&res0[i], output + i * 32 + 0);
+ store_output(&res1[i], output + i * 32 + 8);
+ store_output(&res2[i], output + i * 32 + 16);
+ store_output(&res3[i], output + i * 32 + 24);
+ }
+}
+
+void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in0[16], in1[16], in2[16], in3[16];
+
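+ // Load without flips by default; the FLIPADST variants below reload the
+ // buffer with the requested flips.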
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ switch (tx_type) {
+ case DCT_DCT:
+ fdct16_sse2(in0, in1);
+ fdct16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fdct32_16col(in0, in1, in2, in3);
+ break;
+ case ADST_DCT:
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fdct32_16col(in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ fdct16_sse2(in0, in1);
+ fdct16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3, no_transpose);
+ break;
+ case ADST_ADST:
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3, no_transpose);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fdct32_16col(in0, in1, in2, in3);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
+ fdct16_sse2(in0, in1);
+ fdct16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3, no_transpose);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3, no_transpose);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3, no_transpose);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3, no_transpose);
+ break;
+ case IDTX:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ fidtx16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fidtx32_16col(in0, in1, in2, in3);
+ break;
+ case V_DCT:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ fdct16_sse2(in0, in1);
+ fdct16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fidtx32_16col(in0, in1, in2, in3);
+ break;
+ case H_DCT:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ fidtx16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fdct32_16col(in0, in1, in2, in3);
+ break;
+ case V_ADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fidtx32_16col(in0, in1, in2, in3);
+ break;
+ case H_ADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ fidtx16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3, no_transpose);
+ break;
+ case V_FLIPADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fidtx32_16col(in0, in1, in2, in3);
+ break;
+ case H_FLIPADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
+ fidtx16_sse2(in0, in1);
+ fidtx16_sse2(in2, in3);
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3, no_transpose);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_32x16(output, in0, in1, in2, in3);
+}
+
+// Note:
+// 32x32 hybrid fwd txfm.
+// The input is a 4x2 grid of 8x16 blocks; each block is represented by
+// 16 __m128i values (__m128i in[16]).
+static INLINE void load_buffer_32x32(const int16_t *input,
+ __m128i *in0 /*in0[32]*/,
+ __m128i *in1 /*in1[32]*/,
+ __m128i *in2 /*in2[32]*/,
+ __m128i *in3 /*in3[32]*/, int stride,
+ int flipud, int fliplr) {
+ int i;
+ if (flipud) {
+ input += 31 * stride;
+ stride = -stride;
+ }
+
+ for (i = 0; i < 32; ++i) {
+ in0[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
+ in1[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
+ in2[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
+ in3[i] = _mm_slli_epi16(
+ _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
+ }
+
+ if (fliplr) {
+ for (i = 0; i < 32; ++i) {
+ __m128i tmp1 = in0[i];
+ __m128i tmp2 = in1[i];
+ in0[i] = mm_reverse_epi16(in3[i]);
+ in1[i] = mm_reverse_epi16(in2[i]);
+ in2[i] = mm_reverse_epi16(tmp2);
+ in3[i] = mm_reverse_epi16(tmp1);
+ }
+ }
+}
+
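+// After the 16x16 quadrants of a 32x32 block have been transposed in
+// place, the two off-diagonal quadrants still have to trade places to
+// complete the full 32x32 transpose.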
+static INLINE void swap_16x16(__m128i *b0l /*b0l[16]*/,
+ __m128i *b0r /*b0r[16]*/,
+ __m128i *b1l /*b1l[16]*/,
+ __m128i *b1r /*b1r[16]*/) {
+ int i;
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp0 = b1l[i];
+ __m128i tmp1 = b1r[i];
+ b1l[i] = b0l[i];
+ b1r[i] = b0r[i];
+ b0l[i] = tmp0;
+ b0r[i] = tmp1;
+ }
+}
+
+static INLINE void fdct32(__m128i *in0, __m128i *in1, __m128i *in2,
+ __m128i *in3) {
+ fdct32_8col(in0, &in0[16]);
+ fdct32_8col(in1, &in1[16]);
+ fdct32_8col(in2, &in2[16]);
+ fdct32_8col(in3, &in3[16]);
+
+ array_transpose_16x16(in0, in1);
+ array_transpose_16x16(&in0[16], &in1[16]);
+ array_transpose_16x16(in2, in3);
+ array_transpose_16x16(&in2[16], &in3[16]);
+
+ swap_16x16(&in0[16], &in1[16], in2, in3);
+}
+
+static INLINE void fhalfright32(__m128i *in0, __m128i *in1, __m128i *in2,
+ __m128i *in3) {
+ fhalfright32_16col(in0, in1, &in0[16], &in1[16], no_transpose);
+ fhalfright32_16col(in2, in3, &in2[16], &in3[16], no_transpose);
+ swap_16x16(&in0[16], &in1[16], in2, in3);
+}
+
+#if CONFIG_EXT_TX
+static INLINE void fidtx32(__m128i *in0, __m128i *in1, __m128i *in2,
+ __m128i *in3) {
+ fidtx32_16col(in0, in1, &in0[16], &in1[16]);
+ fidtx32_16col(in2, in3, &in2[16], &in3[16]);
+ swap_16x16(&in0[16], &in1[16], in2, in3);
+}
+#endif  // CONFIG_EXT_TX
+
+static INLINE void round_signed_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
+ __m128i *in3) {
+ round_signed_16x16(in0, in1);
+ round_signed_16x16(&in0[16], &in1[16]);
+ round_signed_16x16(in2, in3);
+ round_signed_16x16(&in2[16], &in3[16]);
+}
+
+static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
+ __m128i *in3, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 32; ++i) {
+ store_output(&in0[i], output + i * 32 + 0);
+ store_output(&in1[i], output + i * 32 + 8);
+ store_output(&in2[i], output + i * 32 + 16);
+ store_output(&in3[i], output + i * 32 + 24);
+ }
+}
+
+void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in0[32], in1[32], in2[32], in3[32];
+
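+ // Load without flips by default; the FLIPADST variants below reload the
+ // buffer with the requested flips.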
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0);
+ switch (tx_type) {
+ case DCT_DCT:
+ fdct32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fdct32(in0, in1, in2, in3);
+ break;
+ case ADST_DCT:
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fdct32(in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ fdct32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case ADST_ADST:
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fdct32(in0, in1, in2, in3);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
+ fdct32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 1);
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case IDTX:
+ fidtx32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fidtx32(in0, in1, in2, in3);
+ break;
+ case V_DCT:
+ fdct32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fidtx32(in0, in1, in2, in3);
+ break;
+ case H_DCT:
+ fidtx32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fdct32(in0, in1, in2, in3);
+ break;
+ case V_ADST:
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fidtx32(in0, in1, in2, in3);
+ break;
+ case H_ADST:
+ fidtx32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+ case V_FLIPADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
+ fhalfright32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fidtx32(in0, in1, in2, in3);
+ break;
+ case H_FLIPADST:
+ load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
+ fidtx32(in0, in1, in2, in3);
+ round_signed_32x32(in0, in1, in2, in3);
+ fhalfright32(in0, in1, in2, in3);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_32x32(in0, in1, in2, in3, output);
+}
diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm
new file mode 100644
index 000000000..a99db3d6e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm
@@ -0,0 +1,87 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+ paddw m0, m1
+ movq m4, m0
+ psubw m3, m2
+ psubw m4, m3
+ psraw m4, 1
+ movq m5, m4
+ psubw m5, m1 ;b1
+ psubw m4, m2 ;c1
+ psubw m0, m4
+ paddw m3, m5
+ ; m0 a0
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ ; 00 01 02 03
+ ; 10 11 12 13
+ ; 20 21 22 23
+ ; 30 31 32 33
+ punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
+ punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
+ mova m1, m0
+ punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
+ punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+ lea r3q, [inputq + strideq*4]
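+  ; stride is in int16_t units, so strideq*2 is one row in bytes and r3q
+  ; points at row 2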
+ movq m0, [inputq] ;a1
+ movq m1, [inputq + strideq*2] ;b1
+ movq m2, [r3q] ;c1
+ movq m3, [r3q + strideq*2] ;d1
+
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ SWAP 1, 2
+ psrldq m1, m0, 8
+ psrldq m3, m2, 8
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+
+ psllw m0, 2
+ psllw m1, 2
+
+%if CONFIG_HIGHBITDEPTH
+ ; sign extension
+ mova m2, m0
+ mova m3, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+ punpckhwd m2, m2
+ punpckhwd m3, m3
+ psrad m0, 16
+ psrad m1, 16
+ psrad m2, 16
+ psrad m3, 16
+ mova [outputq], m0
+ mova [outputq + 16], m2
+ mova [outputq + 32], m1
+ mova [outputq + 48], m3
+%else
+ mova [outputq], m0
+ mova [outputq + 16], m1
+%endif
+
+ RET
diff --git a/third_party/aom/av1/encoder/x86/dct_ssse3.c b/third_party/aom/av1/encoder/x86/dct_ssse3.c
new file mode 100644
index 000000000..717a99af8
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/dct_ssse3.c
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#if defined(_MSC_VER) && _MSC_VER <= 1500
+// Need to include math.h before including tmmintrin.h/intrin.h
+// in certain versions of MSVS.
+#include <math.h>
+#endif
+#include <tmmintrin.h> // SSSE3
+
+#include "./av1_rtcd.h"
+#include "aom_dsp/x86/inv_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+void av1_fdct8x8_quant_ssse3(
+ const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ __m128i zero;
+ int pass;
+ // Constants
+  // In one case the constant is the same in every 16-bit lane. In all
+  // others a pair of constants is repeated four times; this is done by
+  // building the 32 bit constant corresponding to that pair.
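+  // For example, pair_set_epi16(a, b) yields { a, b, a, b, a, b, a, b }, so
+  // _mm_madd_epi16 against interleaved data computes a*x + b*y in each
+  // 32-bit lane.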
+ const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ __m128i *in[8];
+ int index = 0;
+
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)coeff_ptr;
+
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ in[0] = &in0;
+ in[1] = &in1;
+ in[2] = &in2;
+ in[3] = &in3;
+ in[4] = &in4;
+ in[5] = &in5;
+ in[6] = &in6;
+ in[7] = &in7;
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = _mm_add_epi16(in0, in7);
+ const __m128i q1 = _mm_add_epi16(in1, in6);
+ const __m128i q2 = _mm_add_epi16(in2, in5);
+ const __m128i q3 = _mm_add_epi16(in3, in4);
+ const __m128i q4 = _mm_sub_epi16(in3, in4);
+ const __m128i q5 = _mm_sub_epi16(in2, in5);
+ const __m128i q6 = _mm_sub_epi16(in1, in6);
+ const __m128i q7 = _mm_sub_epi16(in0, in7);
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
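+      // i.e. (x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS per lane.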
+
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+      // Multiply by sqrt(1/2) == cospi_16_64 using pmulhrsw, which keeps
+      // the values in 16 bits.
+ const __m128i d0 = _mm_sub_epi16(q6, q5);
+ const __m128i d1 = _mm_add_epi16(q6, q5);
+ const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
+ const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
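+      // pmulhrsw computes (x * 23170 + (1 << 14)) >> 15; since 23170 is
+      // 2 * cospi_16_64, this equals dct_const_round_shift(x * cospi_16_64).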
+
+ // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+    // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+    // 04 14 24 34 05 15 25 35
+    // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
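+    // e.g. n = -5: (-5 - (-1)) >> 1 = -2, matching truncating division,
+    // while n = 5: (5 - 0) >> 1 = 2.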
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ }
+
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
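+  // n_coeffs now counts up from -(number of coeffs) to 0, doubling as loop
+  // counter and (negative) index into the quant/dequant output buffers.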
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;
+ __m128i round, quant, dequant, thr;
+ int16_t nzflag;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
+ {
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ coeff0 = *in[0];
+ coeff1 = *in[1];
+
+ // Poor man's sign extract
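+        // abs(x) = (x ^ s) - s, with s = x >> 15 (all ones iff x < 0)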
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // AC only loop
+ index = 2;
+ thr = _mm_srai_epi16(dequant, 1);
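+    // thr = dequant / 2: if no |coeff| in a group exceeds it, the group is
+    // treated as quantizing to zero and its multiplies/stores are skipped.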
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+ coeff0 = *in[index];
+ coeff1 = *in[index + 1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
+ }
+
+ if (nzflag) {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ index += 2;
+ }
+
+    // Accumulate EOB: horizontal max over the eight 16-bit lanes of eob
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+ } else {
+ do {
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
new file mode 100644
index 000000000..ae733a1ce
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
+
+int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
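+  // Returns the sum of squared differences between coeff and dqcoeff over
+  // block_size elements; *ssz receives the sum of squared coeff values.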
+ __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
+ __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+ __m256i sse_reg_64hi, ssz_reg_64hi;
+ __m128i sse_reg128, ssz_reg128;
+ int64_t sse;
+ int i;
+ const __m256i zero_reg = _mm256_set1_epi16(0);
+
+  // init sse and ssz registers to zero
+ sse_reg = _mm256_set1_epi16(0);
+ ssz_reg = _mm256_set1_epi16(0);
+
+ for (i = 0; i < block_size; i += 16) {
+ // load 32 bytes from coeff and dqcoeff
+ coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
+ dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
+ // dqcoeff - coeff
+ dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
+ // madd (dqcoeff - coeff)
+ dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
+ // madd coeff
+ coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
+ // expand each double word of madd (dqcoeff - coeff) to quad word
+ exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
+ exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
+ // expand each double word of madd (coeff) to quad word
+ exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
+ exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
+ // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
+ }
+ // save the higher 64 bit of each 128 bit lane
+ sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
+ ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
+ // add the higher 64 bit to the low 64 bit
+ sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
+
+ // add each 64 bit from each of the 128 bit lane of the 256 bit
+ sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
+ _mm256_extractf128_si256(sse_reg, 1));
+
+ ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
+ _mm256_extractf128_si256(ssz_reg, 1));
+
+ // store the results
+ _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
+
+ _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
+ _mm256_zeroupper();
+ return sse;
+}
diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm
new file mode 100644
index 000000000..4680f1fab
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_sse2.asm
@@ -0,0 +1,125 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+; int64_t *ssz)
+
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2]
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq
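+  ; walk sizeq from -size up to 0 so it doubles as loop counter and
+  ; (negative) address offset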
+.loop:
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ ; accumulate in 64bit
+ punpckldq m7, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m7
+ punpckldq m7, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m7
+ punpckldq m7, m2, m5
+ paddq m4, m1
+ punpckhdq m2, m5
+ paddq m6, m7
+ punpckldq m7, m3, m5
+ paddq m6, m2
+ punpckhdq m3, m5
+ paddq m6, m7
+ paddq m6, m3
+ add sizeq, mmsize
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ movhlps m7, m6
+ paddq m4, m5
+ paddq m6, m7
+%if ARCH_X86_64
+ movq rax, m4
+ movq [sszq], m6
+%else
+ mov eax, sszm
+ pshufd m5, m4, 0x1
+ movq [eax], m6
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
+
+; Compute the sum of squared difference between two int16_t vectors.
+; int64_t av1_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+; intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
+ pxor m4, m4 ; sse accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2]
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq
+.loop:
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ ; accumulate in 64bit
+ punpckldq m3, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m3
+ punpckldq m3, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m3
+ paddq m4, m1
+ add sizeq, mmsize
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ paddq m4, m5
+%if ARCH_X86_64
+ movq rax, m4
+%else
+ pshufd m5, m4, 0x1
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
new file mode 100644
index 000000000..777304ace
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "av1/common/common.h"
+
+int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps) {
+ int i, j, test;
+ uint32_t temp[4];
+ __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bps - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ for (i = 0; i < block_size; i += 8) {
+ // Load the data into xmm registers
+ __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
+ __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
+ __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
+ __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
+ // Check if any values require more than 15 bit
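+    // i.e. values outside [-0x4000, 0x3fff]; in-range values can be packed
+    // to 16 bits by _mm_packs_epi32 without saturating.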
+ max = _mm_set1_epi32(0x3fff);
+ min = _mm_set1_epi32(0xffffc000);
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+ _mm_cmplt_epi32(mm_coeff, min));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+ _mm_cmplt_epi32(mm_coeff2, min));
+ cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+ _mm_cmplt_epi32(mm_dqcoeff, min));
+ cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+ _mm_cmplt_epi32(mm_dqcoeff2, min));
+ test = _mm_movemask_epi8(
+ _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));
+
+ if (!test) {
+ __m128i mm_diff, error_sse2, sqcoeff_sse2;
+ mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+ mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+ mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+ error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+ sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+ _mm_storeu_si128((__m128i *)temp, error_sse2);
+ error = error + temp[0] + temp[1] + temp[2] + temp[3];
+ _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
+ sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+ } else {
+ for (j = 0; j < 8; j++) {
+ const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+ }
+ }
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
new file mode 100644
index 000000000..f201a29aa
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -0,0 +1,1895 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "av1/common/av1_fwd_txfm2d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ if (!flipud) {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ } else {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
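+    // 0x1b maps lanes (0,1,2,3) -> (3,2,1,0), reversing the four 16-bit
+    // words of each row.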
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ }
+
+ in[0] = _mm_cvtepi16_epi32(in[0]);
+ in[1] = _mm_cvtepi16_epi32(in[1]);
+ in[2] = _mm_cvtepi16_epi32(in[2]);
+ in[3] = _mm_cvtepi16_epi32(in[3]);
+
+ in[0] = _mm_slli_epi32(in[0], shift);
+ in[1] = _mm_slli_epi32(in[1], shift);
+ in[2] = _mm_slli_epi32(in[2], shift);
+ in[3] = _mm_slli_epi32(in[3], shift);
+}
+
+// Only the stage-2 cosine bits are used;
+// shift[0] is applied in load_buffer_4x4(),
+// shift[1] and shift[2] belong to the column and row transforms
+// (both are zero for the 4x4 configurations used below).
+static void fdct4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
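+  // rounding offset, 1 << (bit - 1), added before each arithmetic shift by
+  // 'bit'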
+ __m128i s0, s1, s2, s3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ s0 = _mm_add_epi32(in[0], in[3]);
+ s1 = _mm_add_epi32(in[1], in[2]);
+ s2 = _mm_sub_epi32(in[1], in[2]);
+ s3 = _mm_sub_epi32(in[0], in[3]);
+
+ // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
+ u0 = _mm_mullo_epi32(s0, cospi32);
+ u1 = _mm_mullo_epi32(s1, cospi32);
+ u2 = _mm_add_epi32(u0, u1);
+ v0 = _mm_sub_epi32(u0, u1);
+
+ u3 = _mm_add_epi32(u2, rnding);
+ v1 = _mm_add_epi32(v0, rnding);
+
+ u0 = _mm_srai_epi32(u3, bit);
+ u2 = _mm_srai_epi32(v1, bit);
+
+ // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
+ v0 = _mm_mullo_epi32(s2, cospi48);
+ v1 = _mm_mullo_epi32(s3, cospi16);
+ v2 = _mm_add_epi32(v0, v1);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u1 = _mm_srai_epi32(v3, bit);
+
+ v0 = _mm_mullo_epi32(s2, cospi16);
+ v1 = _mm_mullo_epi32(s3, cospi48);
+ v2 = _mm_sub_epi32(v1, v0);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u3 = _mm_srai_epi32(v3, bit);
+
+ // Note: shift[1] and shift[2] are zeros
+
+ // Transpose 4x4 32-bit
+ v0 = _mm_unpacklo_epi32(u0, u1);
+ v1 = _mm_unpackhi_epi32(u0, u1);
+ v2 = _mm_unpacklo_epi32(u2, u3);
+ v3 = _mm_unpackhi_epi32(u2, u3);
+
+ in[0] = _mm_unpacklo_epi64(v0, v2);
+ in[1] = _mm_unpackhi_epi64(v0, v2);
+ in[2] = _mm_unpacklo_epi64(v1, v3);
+ in[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
+static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) {
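+  // Note: _mm_store_si128 assumes 'output' is 16-byte aligned.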
+ _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+}
+
+// Note:
+// av1_fwd_txfm2d_4x4() is the implemented path; this stub is kept only
+// because av1_highbd_fht4x4_c() has not been removed yet.
+void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ (void)input;
+ (void)output;
+ (void)stride;
+ (void)tx_type;
+ assert(0);
+}
+
+static void fadst4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i s0, s1, s2, s3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = _mm_mullo_epi32(in[3], cospi8);
+ u1 = _mm_mullo_epi32(in[0], cospi56);
+ u2 = _mm_add_epi32(u0, u1);
+ s0 = _mm_add_epi32(u2, rnding);
+ s0 = _mm_srai_epi32(s0, bit);
+
+ v0 = _mm_mullo_epi32(in[3], cospi56);
+ v1 = _mm_mullo_epi32(in[0], cospi8);
+ v2 = _mm_sub_epi32(v0, v1);
+ s1 = _mm_add_epi32(v2, rnding);
+ s1 = _mm_srai_epi32(s1, bit);
+
+ u0 = _mm_mullo_epi32(in[1], cospi40);
+ u1 = _mm_mullo_epi32(in[2], cospi24);
+ u2 = _mm_add_epi32(u0, u1);
+ s2 = _mm_add_epi32(u2, rnding);
+ s2 = _mm_srai_epi32(s2, bit);
+
+ v0 = _mm_mullo_epi32(in[1], cospi24);
+ v1 = _mm_mullo_epi32(in[2], cospi40);
+ v2 = _mm_sub_epi32(v0, v1);
+ s3 = _mm_add_epi32(v2, rnding);
+ s3 = _mm_srai_epi32(s3, bit);
+
+ // stage 3
+ u0 = _mm_add_epi32(s0, s2);
+ u2 = _mm_sub_epi32(s0, s2);
+ u1 = _mm_add_epi32(s1, s3);
+ u3 = _mm_sub_epi32(s1, s3);
+
+ // stage 4
+ v0 = _mm_mullo_epi32(u2, cospi32);
+ v1 = _mm_mullo_epi32(u3, cospi32);
+ v2 = _mm_add_epi32(v0, v1);
+ s2 = _mm_add_epi32(v2, rnding);
+ u2 = _mm_srai_epi32(s2, bit);
+
+ v2 = _mm_sub_epi32(v0, v1);
+ s3 = _mm_add_epi32(v2, rnding);
+ u3 = _mm_srai_epi32(s3, bit);
+
+ // u0, u1, u2, u3
+ u2 = _mm_sub_epi32(kZero, u2);
+ u1 = _mm_sub_epi32(kZero, u1);
+
+ // u0, u2, u3, u1
+ // Transpose 4x4 32-bit
+ v0 = _mm_unpacklo_epi32(u0, u2);
+ v1 = _mm_unpackhi_epi32(u0, u2);
+ v2 = _mm_unpacklo_epi32(u3, u1);
+ v3 = _mm_unpackhi_epi32(u3, u1);
+
+ in[0] = _mm_unpacklo_epi64(v0, v2);
+ in[1] = _mm_unpackhi_epi64(v0, v2);
+ in[2] = _mm_unpacklo_epi64(v1, v3);
+ in[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
+ int input_stride, int tx_type, int bd) {
+ __m128i in[4];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &fwd_txfm_2d_cfg_dct_dct_4;
+ load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_4;
+ load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case DCT_ADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_4;
+ load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_4;
+ load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case DCT_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_4;
+ load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(input, in, input_stride, 1, 1, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+#endif
+ default: assert(0);
+ }
+ (void)bd;
+}
+
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i u;
+ if (!flipud) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ } else {
+ in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+ }
+
+ u = _mm_unpackhi_epi64(in[4], in[4]);
+ in[8] = _mm_cvtepi16_epi32(in[4]);
+ in[9] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[5], in[5]);
+ in[10] = _mm_cvtepi16_epi32(in[5]);
+ in[11] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[6], in[6]);
+ in[12] = _mm_cvtepi16_epi32(in[6]);
+ in[13] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[7], in[7]);
+ in[14] = _mm_cvtepi16_epi32(in[7]);
+ in[15] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[3], in[3]);
+ in[6] = _mm_cvtepi16_epi32(in[3]);
+ in[7] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[2], in[2]);
+ in[4] = _mm_cvtepi16_epi32(in[2]);
+ in[5] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[1], in[1]);
+ in[2] = _mm_cvtepi16_epi32(in[1]);
+ in[3] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[0], in[0]);
+ in[0] = _mm_cvtepi16_epi32(in[0]);
+ in[1] = _mm_cvtepi16_epi32(u);
+
+ in[0] = _mm_slli_epi32(in[0], shift);
+ in[1] = _mm_slli_epi32(in[1], shift);
+ in[2] = _mm_slli_epi32(in[2], shift);
+ in[3] = _mm_slli_epi32(in[3], shift);
+ in[4] = _mm_slli_epi32(in[4], shift);
+ in[5] = _mm_slli_epi32(in[5], shift);
+ in[6] = _mm_slli_epi32(in[6], shift);
+ in[7] = _mm_slli_epi32(in[7], shift);
+
+ in[8] = _mm_slli_epi32(in[8], shift);
+ in[9] = _mm_slli_epi32(in[9], shift);
+ in[10] = _mm_slli_epi32(in[10], shift);
+ in[11] = _mm_slli_epi32(in[11], shift);
+ in[12] = _mm_slli_epi32(in[12], shift);
+ in[13] = _mm_slli_epi32(in[13], shift);
+ in[14] = _mm_slli_epi32(in[14], shift);
+ in[15] = _mm_slli_epi32(in[15], shift);
+}
+
+static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm_add_epi32(in[0], rounding);
+ in[1] = _mm_add_epi32(in[1], rounding);
+ in[2] = _mm_add_epi32(in[2], rounding);
+ in[3] = _mm_add_epi32(in[3], rounding);
+ in[4] = _mm_add_epi32(in[4], rounding);
+ in[5] = _mm_add_epi32(in[5], rounding);
+ in[6] = _mm_add_epi32(in[6], rounding);
+ in[7] = _mm_add_epi32(in[7], rounding);
+ in[8] = _mm_add_epi32(in[8], rounding);
+ in[9] = _mm_add_epi32(in[9], rounding);
+ in[10] = _mm_add_epi32(in[10], rounding);
+ in[11] = _mm_add_epi32(in[11], rounding);
+ in[12] = _mm_add_epi32(in[12], rounding);
+ in[13] = _mm_add_epi32(in[13], rounding);
+ in[14] = _mm_add_epi32(in[14], rounding);
+ in[15] = _mm_add_epi32(in[15], rounding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+ in[4] = _mm_srai_epi32(in[4], shift);
+ in[5] = _mm_srai_epi32(in[5], shift);
+ in[6] = _mm_srai_epi32(in[6], shift);
+ in[7] = _mm_srai_epi32(in[7], shift);
+ in[8] = _mm_srai_epi32(in[8], shift);
+ in[9] = _mm_srai_epi32(in[9], shift);
+ in[10] = _mm_srai_epi32(in[10], shift);
+ in[11] = _mm_srai_epi32(in[11], shift);
+ in[12] = _mm_srai_epi32(in[12], shift);
+ in[13] = _mm_srai_epi32(in[13], shift);
+ in[14] = _mm_srai_epi32(in[14], shift);
+ in[15] = _mm_srai_epi32(in[15], shift);
+}
+
+static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
+ _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+
+ _mm_store_si128((__m128i *)(output + 4 * 4), res[4]);
+ _mm_store_si128((__m128i *)(output + 5 * 4), res[5]);
+ _mm_store_si128((__m128i *)(output + 6 * 4), res[6]);
+ _mm_store_si128((__m128i *)(output + 7 * 4), res[7]);
+
+ _mm_store_si128((__m128i *)(output + 8 * 4), res[8]);
+ _mm_store_si128((__m128i *)(output + 9 * 4), res[9]);
+ _mm_store_si128((__m128i *)(output + 10 * 4), res[10]);
+ _mm_store_si128((__m128i *)(output + 11 * 4), res[11]);
+
+ _mm_store_si128((__m128i *)(output + 12 * 4), res[12]);
+ _mm_store_si128((__m128i *)(output + 13 * 4), res[13]);
+ _mm_store_si128((__m128i *)(output + 14 * 4), res[14]);
+ _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
+}
+
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[8], v[8];
+
+ // Even 8 points 0, 2, ..., 14
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[0], in[14]);
+ v[7] = _mm_sub_epi32(in[0], in[14]); // v[7]
+ u[1] = _mm_add_epi32(in[2], in[12]);
+ u[6] = _mm_sub_epi32(in[2], in[12]);
+ u[2] = _mm_add_epi32(in[4], in[10]);
+ u[5] = _mm_sub_epi32(in[4], in[10]);
+ u[3] = _mm_add_epi32(in[6], in[8]);
+ v[4] = _mm_sub_epi32(in[6], in[8]); // v[4]
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32);
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[2] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[14] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[10] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[6] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[0] = u[0]; // buf0[0]
+ out[8] = u[1]; // buf0[1]
+ out[4] = u[2]; // buf0[2]
+ out[12] = u[3]; // buf0[3]
+
+ // Odd 8 points: 1, 3, ..., 15
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[1], in[15]);
+ v[7] = _mm_sub_epi32(in[1], in[15]); // v[7]
+ u[1] = _mm_add_epi32(in[3], in[13]);
+ u[6] = _mm_sub_epi32(in[3], in[13]);
+ u[2] = _mm_add_epi32(in[5], in[11]);
+ u[5] = _mm_sub_epi32(in[5], in[11]);
+ u[3] = _mm_add_epi32(in[7], in[9]);
+ v[4] = _mm_sub_epi32(in[7], in[9]); // v[4]
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32);
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[3] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[15] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[11] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[7] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[1] = u[0]; // buf0[0]
+ out[9] = u[1]; // buf0[1]
+ out[5] = u[2]; // buf0[2]
+ out[13] = u[3]; // buf0[3]
+}
+
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i u[8], v[8], x;
+
+ // Even 8 points: 0, 2, ..., 14
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[14], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[14], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[10], cospi20);
+ x = _mm_mullo_epi32(in[4], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[10], cospi44);
+ x = _mm_mullo_epi32(in[4], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[6], cospi36);
+ x = _mm_mullo_epi32(in[8], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[6], cospi28);
+ x = _mm_mullo_epi32(in[8], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[2], cospi52);
+ x = _mm_mullo_epi32(in[12], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[2], cospi12);
+ x = _mm_mullo_epi32(in[12], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ v[0] = _mm_add_epi32(u[0], u[4]);
+ v[4] = _mm_sub_epi32(u[0], u[4]);
+ v[1] = _mm_add_epi32(u[1], u[5]);
+ v[5] = _mm_sub_epi32(u[1], u[5]);
+ v[2] = _mm_add_epi32(u[2], u[6]);
+ v[6] = _mm_sub_epi32(u[2], u[6]);
+ v[3] = _mm_add_epi32(u[3], u[7]);
+ v[7] = _mm_sub_epi32(u[3], u[7]);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ v[0] = _mm_add_epi32(u[0], u[2]);
+ v[2] = _mm_sub_epi32(u[0], u[2]);
+ v[1] = _mm_add_epi32(u[1], u[3]);
+ v[3] = _mm_sub_epi32(u[1], u[3]);
+ v[4] = _mm_add_epi32(u[4], u[6]);
+ v[6] = _mm_sub_epi32(u[4], u[6]);
+ v[5] = _mm_add_epi32(u[5], u[7]);
+ v[7] = _mm_sub_epi32(u[5], u[7]);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ out[0] = u[0];
+ out[2] = _mm_sub_epi32(kZero, u[4]);
+ out[4] = u[6];
+ out[6] = _mm_sub_epi32(kZero, u[2]);
+ out[8] = u[3];
+ out[10] = _mm_sub_epi32(kZero, u[7]);
+ out[12] = u[5];
+ out[14] = _mm_sub_epi32(kZero, u[1]);
+
+ // Odd 8 points: 1, 3, ..., 15
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[15], cospi4);
+ x = _mm_mullo_epi32(in[1], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[15], cospi60);
+ x = _mm_mullo_epi32(in[1], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[11], cospi20);
+ x = _mm_mullo_epi32(in[5], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[11], cospi44);
+ x = _mm_mullo_epi32(in[5], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[7], cospi36);
+ x = _mm_mullo_epi32(in[9], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[7], cospi28);
+ x = _mm_mullo_epi32(in[9], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[3], cospi52);
+ x = _mm_mullo_epi32(in[13], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[3], cospi12);
+ x = _mm_mullo_epi32(in[13], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ v[0] = _mm_add_epi32(u[0], u[4]);
+ v[4] = _mm_sub_epi32(u[0], u[4]);
+ v[1] = _mm_add_epi32(u[1], u[5]);
+ v[5] = _mm_sub_epi32(u[1], u[5]);
+ v[2] = _mm_add_epi32(u[2], u[6]);
+ v[6] = _mm_sub_epi32(u[2], u[6]);
+ v[3] = _mm_add_epi32(u[3], u[7]);
+ v[7] = _mm_sub_epi32(u[3], u[7]);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ v[0] = _mm_add_epi32(u[0], u[2]);
+ v[2] = _mm_sub_epi32(u[0], u[2]);
+ v[1] = _mm_add_epi32(u[1], u[3]);
+ v[3] = _mm_sub_epi32(u[1], u[3]);
+ v[4] = _mm_add_epi32(u[4], u[6]);
+ v[6] = _mm_sub_epi32(u[4], u[6]);
+ v[5] = _mm_add_epi32(u[5], u[7]);
+ v[7] = _mm_sub_epi32(u[5], u[7]);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ out[1] = u[0];
+ out[3] = _mm_sub_epi32(kZero, u[4]);
+ out[5] = u[6];
+ out[7] = _mm_sub_epi32(kZero, u[2]);
+ out[9] = u[3];
+ out[11] = _mm_sub_epi32(kZero, u[7]);
+ out[13] = u[5];
+ out[15] = _mm_sub_epi32(kZero, u[1]);
+}
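+
+// Every rotation in the transforms above follows the same fixed-point idiom,
+// sketched here as scalar C (bit is the stage's cos_bit):
+//   y = (x0 * cospi[a] + x1 * cospi[b] + (1 << (bit - 1))) >> bit;
+// The SSE4.1 code performs this four lanes at a time with _mm_mullo_epi32,
+// _mm_add_epi32 and _mm_srai_epi32.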
+
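+// 2-D 8x8 pipeline: load and pre-shift the residuals, run the column
+// transform, apply the intermediate rounding, transpose, run the row
+// transform, transpose back, then store the 64 coefficients.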
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+ int tx_type, int bd) {
+ __m128i in[16], out[16];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &fwd_txfm_2d_cfg_dct_dct_8;
+ load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case ADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_8;
+ load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case DCT_ADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_8;
+ load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case ADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_8;
+ load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case DCT_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_8;
+ load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(input, in, stride, 1, 1, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case ADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case FLIPADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+ (void)bd;
+}
+
+// Hybrid Transform 16x16
+
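+// The 16x16 block is handled as four 8x8 quadrants: load_buffer_16x16 places
+// topL in in[0..15], topR in in[16..31], botL in in[32..47] and botR in
+// in[48..63]; convert_8x8_to_16x16 then interleaves them into row-major
+// 16x16 order, where each 16-lane row spans four __m128i registers.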
+static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
+ int row_index = 0;
+ int dst_index = 0;
+ int src_index = 0;
+
+  // rows 0, 1, ..., 7
+ do {
+ out[dst_index] = in[src_index];
+ out[dst_index + 1] = in[src_index + 1];
+ out[dst_index + 2] = in[src_index + 16];
+ out[dst_index + 3] = in[src_index + 17];
+ dst_index += 4;
+ src_index += 2;
+ row_index += 1;
+ } while (row_index < 8);
+
+  // rows 8, 9, ..., 15
+ src_index += 16;
+ do {
+ out[dst_index] = in[src_index];
+ out[dst_index + 1] = in[src_index + 1];
+ out[dst_index + 2] = in[src_index + 16];
+ out[dst_index + 3] = in[src_index + 17];
+ dst_index += 4;
+ src_index += 2;
+ row_index += 1;
+ } while (row_index < 16);
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i in[64];
+ // Load 4 8x8 blocks
+ const int16_t *topL = input;
+ const int16_t *topR = input + 8;
+ const int16_t *botL = input + 8 * stride;
+ const int16_t *botR = input + 8 * stride + 8;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+    // Swap the top and bottom blocks of the left column
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+    // Swap the top and bottom blocks of the right column
+ tmp = topR;
+ topR = botR;
+ botR = tmp;
+ }
+
+ if (fliplr) {
+    // Swap the left and right blocks of the top row
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+    // Swap the left and right blocks of the bottom row
+ tmp = botL;
+ botL = botR;
+ botR = tmp;
+ }
+
+ // load first 8 columns
+ load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
+ load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
+
+ // load second 8 columns
+ load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
+ load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
+
+ convert_8x8_to_16x16(in, out);
+}
+
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[16], v[16], x;
+  const int col_num = 4;  // each 16-lane row of 32-bit data spans four __m128i
+ int col;
+
+  // Process columns 0, 1, 2 and 3
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[7]);
+ v[7] = _mm_sub_epi32(u[0], u[7]);
+ v[1] = _mm_add_epi32(u[1], u[6]);
+ v[6] = _mm_sub_epi32(u[1], u[6]);
+ v[2] = _mm_add_epi32(u[2], u[5]);
+ v[5] = _mm_sub_epi32(u[2], u[5]);
+ v[3] = _mm_add_epi32(u[3], u[4]);
+ v[4] = _mm_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm_mullo_epi32(u[10], cospim32);
+ x = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[13], cospim32);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospim32);
+ x = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi32);
+ x = _mm_mullo_epi32(u[12], cospim32);
+ v[12] = _mm_sub_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[3]);
+ u[3] = _mm_sub_epi32(v[0], v[3]);
+ u[1] = _mm_add_epi32(v[1], v[2]);
+ u[2] = _mm_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm_mullo_epi32(v[5], cospim32);
+ x = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi32);
+ x = _mm_mullo_epi32(v[6], cospim32);
+ u[6] = _mm_sub_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm_add_epi32(v[8], v[11]);
+ u[11] = _mm_sub_epi32(v[8], v[11]);
+ u[9] = _mm_add_epi32(v[9], v[10]);
+ u[10] = _mm_sub_epi32(v[9], v[10]);
+ u[12] = _mm_sub_epi32(v[15], v[12]);
+ u[15] = _mm_add_epi32(v[15], v[12]);
+ u[13] = _mm_sub_epi32(v[14], v[13]);
+ u[14] = _mm_add_epi32(v[14], v[13]);
+
+ // stage 4
+ u[0] = _mm_mullo_epi32(u[0], cospi32);
+ u[1] = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(u[0], u[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(u[0], u[1]);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(u[2], cospi48);
+ x = _mm_mullo_epi32(u[3], cospi16);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(u[2], cospi16);
+ x = _mm_mullo_epi32(u[3], cospi48);
+ v[3] = _mm_sub_epi32(x, v[3]);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_add_epi32(u[4], u[5]);
+ v[5] = _mm_sub_epi32(u[4], u[5]);
+ v[6] = _mm_sub_epi32(u[7], u[6]);
+ v[7] = _mm_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm_mullo_epi32(u[9], cospim16);
+ x = _mm_mullo_epi32(u[14], cospi48);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi48);
+ x = _mm_mullo_epi32(u[14], cospim16);
+ v[14] = _mm_sub_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospim48);
+ x = _mm_mullo_epi32(u[13], cospim16);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospim16);
+ x = _mm_mullo_epi32(u[13], cospim48);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi56);
+ x = _mm_mullo_epi32(v[7], cospi8);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[7] = _mm_mullo_epi32(v[4], cospi8);
+ x = _mm_mullo_epi32(v[7], cospi56);
+ u[7] = _mm_sub_epi32(x, u[7]);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[5] = _mm_mullo_epi32(v[5], cospi24);
+ x = _mm_mullo_epi32(v[6], cospi40);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi40);
+ x = _mm_mullo_epi32(v[6], cospi24);
+ u[6] = _mm_sub_epi32(x, u[6]);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[8] = _mm_add_epi32(v[8], v[9]);
+ u[9] = _mm_sub_epi32(v[8], v[9]);
+ u[10] = _mm_sub_epi32(v[11], v[10]);
+ u[11] = _mm_add_epi32(v[11], v[10]);
+ u[12] = _mm_add_epi32(v[12], v[13]);
+ u[13] = _mm_sub_epi32(v[12], v[13]);
+ u[14] = _mm_sub_epi32(v[15], v[14]);
+ u[15] = _mm_add_epi32(v[15], v[14]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi60);
+ x = _mm_mullo_epi32(u[15], cospi4);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[15] = _mm_mullo_epi32(u[8], cospi4);
+ x = _mm_mullo_epi32(u[15], cospi60);
+ v[15] = _mm_sub_epi32(x, v[15]);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ v[9] = _mm_mullo_epi32(u[9], cospi28);
+ x = _mm_mullo_epi32(u[14], cospi36);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi36);
+ x = _mm_mullo_epi32(u[14], cospi28);
+ v[14] = _mm_sub_epi32(x, v[14]);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi44);
+ x = _mm_mullo_epi32(u[13], cospi20);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi20);
+ x = _mm_mullo_epi32(u[13], cospi44);
+ v[13] = _mm_sub_epi32(x, v[13]);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospi12);
+ x = _mm_mullo_epi32(u[12], cospi52);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi52);
+ x = _mm_mullo_epi32(u[12], cospi12);
+ v[12] = _mm_sub_epi32(x, v[12]);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
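+    // Store the results: output row k receives v[bitrev(k)], the 4-bit
+    // bit-reversal permutation that yields natural frequency order.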
+ out[0 * col_num + col] = v[0];
+ out[1 * col_num + col] = v[8];
+ out[2 * col_num + col] = v[4];
+ out[3 * col_num + col] = v[12];
+ out[4 * col_num + col] = v[2];
+ out[5 * col_num + col] = v[10];
+ out[6 * col_num + col] = v[6];
+ out[7 * col_num + col] = v[14];
+ out[8 * col_num + col] = v[1];
+ out[9 * col_num + col] = v[9];
+ out[10 * col_num + col] = v[5];
+ out[11 * col_num + col] = v[13];
+ out[12 * col_num + col] = v[3];
+ out[13 * col_num + col] = v[11];
+ out[14 * col_num + col] = v[7];
+ out[15 * col_num + col] = v[15];
+ }
+}
+
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[16], v[16], x, y;
+  const int col_num = 4;  // each 16-lane row of 32-bit data spans four __m128i
+ int col;
+
+  // Process columns 0, 1, 2 and 3
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ // stage 2
+ v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
+ x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
+ v[0] = _mm_add_epi32(v[0], x);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
+ x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
+ v[1] = _mm_sub_epi32(v[1], x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
+ x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
+ x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
+ v[3] = _mm_sub_epi32(v[3], x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
+ x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
+ x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
+ x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
+ x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
+ x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
+ x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
+ x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
+ x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
+ x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
+ x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
+ x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
+ x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi8);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi8);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi40);
+ x = _mm_mullo_epi32(u[11], cospi24);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(u[10], cospi24);
+ x = _mm_mullo_epi32(u[11], cospi40);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[12], cospim56);
+ x = _mm_mullo_epi32(u[13], cospi8);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi8);
+ x = _mm_mullo_epi32(u[13], cospim56);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim24);
+ x = _mm_mullo_epi32(u[15], cospi40);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi40);
+ x = _mm_mullo_epi32(u[15], cospim24);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 5
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+
+ v[4] = _mm_mullo_epi32(u[4], cospi16);
+ x = _mm_mullo_epi32(u[5], cospi48);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(u[4], cospi48);
+ x = _mm_mullo_epi32(u[5], cospi16);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(u[6], cospim48);
+ x = _mm_mullo_epi32(u[7], cospi16);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(u[6], cospi16);
+ x = _mm_mullo_epi32(u[7], cospim48);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ v[12] = _mm_mullo_epi32(u[12], cospi16);
+ x = _mm_mullo_epi32(u[13], cospi48);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi48);
+ x = _mm_mullo_epi32(u[13], cospi16);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim48);
+ x = _mm_mullo_epi32(u[15], cospi16);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi16);
+ x = _mm_mullo_epi32(u[15], cospim48);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 7
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
+
+ // stage 8
+ v[0] = u[0];
+ v[1] = u[1];
+
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9
+ out[0 * col_num + col] = v[0];
+ out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]);
+ out[2 * col_num + col] = v[12];
+ out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]);
+ out[4 * col_num + col] = v[6];
+ out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]);
+ out[6 * col_num + col] = v[10];
+ out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]);
+ out[8 * col_num + col] = v[3];
+ out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]);
+ out[10 * col_num + col] = v[15];
+ out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]);
+ out[12 * col_num + col] = v[5];
+ out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]);
+ out[14 * col_num + col] = v[9];
+ out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]);
+ }
+}
+
+static void col_txfm_16x16_rounding(__m128i *in, int shift) {
+  // Note: the 16x16 rounding is split into four 8x8 sections rather than
+  // four column strips.
+ col_txfm_8x8_rounding(&in[0], shift);
+ col_txfm_8x8_rounding(&in[16], shift);
+ col_txfm_8x8_rounding(&in[32], shift);
+ col_txfm_8x8_rounding(&in[48], shift);
+}
+
+static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
+  const int size_8x8 = 16 * 4;  // 64 coefficients per 8x8 sub-block
+ write_buffer_8x8(&in[0], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[16], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[32], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[48], output);
+}
+
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, int tx_type, int bd) {
+ __m128i in[64], out[64];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &fwd_txfm_2d_cfg_dct_dct_16;
+ load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case ADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_16;
+ load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case DCT_ADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_16;
+ load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case ADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_16;
+ load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case DCT_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_16;
+ load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(input, in, stride, 1, 1, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case ADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case FLIPADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+ (void)bd;
+}
diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
new file mode 100644
index 000000000..198e4e4c4
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -0,0 +1,1678 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // avx2
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/fwd_txfm_avx2.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static int32_t get_16x16_sum(const int16_t *input, int stride) {
+ __m256i r0, r1, r2, r3, u0, u1;
+ __m256i zero = _mm256_setzero_si256();
+ __m256i sum = _mm256_setzero_si256();
+ const int16_t *blockBound = input + (stride << 4);
+ __m128i v0, v1;
+
+ while (input < blockBound) {
+ r0 = _mm256_loadu_si256((__m256i const *)input);
+ r1 = _mm256_loadu_si256((__m256i const *)(input + stride));
+ r2 = _mm256_loadu_si256((__m256i const *)(input + 2 * stride));
+ r3 = _mm256_loadu_si256((__m256i const *)(input + 3 * stride));
+
+ u0 = _mm256_add_epi16(r0, r1);
+ u1 = _mm256_add_epi16(r2, r3);
+ sum = _mm256_add_epi16(sum, u0);
+ sum = _mm256_add_epi16(sum, u1);
+
+ input += stride << 2;
+ }
+
+  // Sign-extend the 16 int16_t lanes to 2x8 int32_t: unpacking against zero
+  // places each value in the high 16 bits, and the arithmetic shift right by
+  // 16 restores the sign.
+ u0 = _mm256_unpacklo_epi16(zero, sum);
+ u1 = _mm256_unpackhi_epi16(zero, sum);
+ u0 = _mm256_srai_epi32(u0, 16);
+ u1 = _mm256_srai_epi32(u1, 16);
+ sum = _mm256_add_epi32(u0, u1);
+
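+  // Horizontal reduction: fold the eight 32-bit lanes down to a single sum.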
+ u0 = _mm256_srli_si256(sum, 8);
+ u1 = _mm256_add_epi32(sum, u0);
+
+ v0 = _mm_add_epi32(_mm256_extracti128_si256(u1, 1),
+ _mm256_castsi256_si128(u1));
+ v1 = _mm_srli_si128(v0, 4);
+ v0 = _mm_add_epi32(v0, v1);
+ return (int32_t)_mm_extract_epi32(v0, 0);
+}
+
+void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int32_t dc = get_16x16_sum(input, stride);
+ output[0] = (tran_low_t)(dc >> 1);
+  _mm256_zeroupper();  // clear upper YMM state to avoid AVX/SSE transition stalls
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, int stride,
+ int flipud, int fliplr, __m256i *in) {
+ if (!flipud) {
+ in[0] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride));
+ in[1] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride));
+ in[2] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride));
+ in[3] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride));
+ in[4] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride));
+ in[5] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride));
+ in[6] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride));
+ in[7] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride));
+ in[8] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride));
+ in[9] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride));
+ in[10] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride));
+ in[11] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride));
+ in[12] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride));
+ in[13] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride));
+ in[14] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride));
+ in[15] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride));
+ } else {
+ in[0] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride));
+ in[1] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride));
+ in[2] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride));
+ in[3] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride));
+ in[4] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride));
+ in[5] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride));
+ in[6] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride));
+ in[7] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride));
+ in[8] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride));
+ in[9] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride));
+ in[10] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride));
+ in[11] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride));
+ in[12] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride));
+ in[13] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride));
+ in[14] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride));
+ in[15] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ mm256_reverse_epi16(&in[0]);
+ mm256_reverse_epi16(&in[1]);
+ mm256_reverse_epi16(&in[2]);
+ mm256_reverse_epi16(&in[3]);
+ mm256_reverse_epi16(&in[4]);
+ mm256_reverse_epi16(&in[5]);
+ mm256_reverse_epi16(&in[6]);
+ mm256_reverse_epi16(&in[7]);
+ mm256_reverse_epi16(&in[8]);
+ mm256_reverse_epi16(&in[9]);
+ mm256_reverse_epi16(&in[10]);
+ mm256_reverse_epi16(&in[11]);
+ mm256_reverse_epi16(&in[12]);
+ mm256_reverse_epi16(&in[13]);
+ mm256_reverse_epi16(&in[14]);
+ mm256_reverse_epi16(&in[15]);
+ }
+
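+  // Pre-scale the residuals by 4 (<< 2); right_shift_16x16 below provides
+  // the matching rounded >> 2.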
+ in[0] = _mm256_slli_epi16(in[0], 2);
+ in[1] = _mm256_slli_epi16(in[1], 2);
+ in[2] = _mm256_slli_epi16(in[2], 2);
+ in[3] = _mm256_slli_epi16(in[3], 2);
+ in[4] = _mm256_slli_epi16(in[4], 2);
+ in[5] = _mm256_slli_epi16(in[5], 2);
+ in[6] = _mm256_slli_epi16(in[6], 2);
+ in[7] = _mm256_slli_epi16(in[7], 2);
+ in[8] = _mm256_slli_epi16(in[8], 2);
+ in[9] = _mm256_slli_epi16(in[9], 2);
+ in[10] = _mm256_slli_epi16(in[10], 2);
+ in[11] = _mm256_slli_epi16(in[11], 2);
+ in[12] = _mm256_slli_epi16(in[12], 2);
+ in[13] = _mm256_slli_epi16(in[13], 2);
+ in[14] = _mm256_slli_epi16(in[14], 2);
+ in[15] = _mm256_slli_epi16(in[15], 2);
+}
+
+static INLINE void write_buffer_16x16(const __m256i *in, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 16; ++i) {
+ storeu_output_avx2(&in[i], output + (i << 4));
+ }
+}
+
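+// Rounded >> 2 on 16-bit lanes: with s = x >> 15 (arithmetic), the sequence
+// below computes (x + 1 + (x < 0)) >> 2.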
+static void right_shift_16x16(__m256i *in) {
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i s0 = _mm256_srai_epi16(in[0], 15);
+ __m256i s1 = _mm256_srai_epi16(in[1], 15);
+ __m256i s2 = _mm256_srai_epi16(in[2], 15);
+ __m256i s3 = _mm256_srai_epi16(in[3], 15);
+ __m256i s4 = _mm256_srai_epi16(in[4], 15);
+ __m256i s5 = _mm256_srai_epi16(in[5], 15);
+ __m256i s6 = _mm256_srai_epi16(in[6], 15);
+ __m256i s7 = _mm256_srai_epi16(in[7], 15);
+ __m256i s8 = _mm256_srai_epi16(in[8], 15);
+ __m256i s9 = _mm256_srai_epi16(in[9], 15);
+ __m256i s10 = _mm256_srai_epi16(in[10], 15);
+ __m256i s11 = _mm256_srai_epi16(in[11], 15);
+ __m256i s12 = _mm256_srai_epi16(in[12], 15);
+ __m256i s13 = _mm256_srai_epi16(in[13], 15);
+ __m256i s14 = _mm256_srai_epi16(in[14], 15);
+ __m256i s15 = _mm256_srai_epi16(in[15], 15);
+
+ in[0] = _mm256_add_epi16(in[0], one);
+ in[1] = _mm256_add_epi16(in[1], one);
+ in[2] = _mm256_add_epi16(in[2], one);
+ in[3] = _mm256_add_epi16(in[3], one);
+ in[4] = _mm256_add_epi16(in[4], one);
+ in[5] = _mm256_add_epi16(in[5], one);
+ in[6] = _mm256_add_epi16(in[6], one);
+ in[7] = _mm256_add_epi16(in[7], one);
+ in[8] = _mm256_add_epi16(in[8], one);
+ in[9] = _mm256_add_epi16(in[9], one);
+ in[10] = _mm256_add_epi16(in[10], one);
+ in[11] = _mm256_add_epi16(in[11], one);
+ in[12] = _mm256_add_epi16(in[12], one);
+ in[13] = _mm256_add_epi16(in[13], one);
+ in[14] = _mm256_add_epi16(in[14], one);
+ in[15] = _mm256_add_epi16(in[15], one);
+
+ in[0] = _mm256_sub_epi16(in[0], s0);
+ in[1] = _mm256_sub_epi16(in[1], s1);
+ in[2] = _mm256_sub_epi16(in[2], s2);
+ in[3] = _mm256_sub_epi16(in[3], s3);
+ in[4] = _mm256_sub_epi16(in[4], s4);
+ in[5] = _mm256_sub_epi16(in[5], s5);
+ in[6] = _mm256_sub_epi16(in[6], s6);
+ in[7] = _mm256_sub_epi16(in[7], s7);
+ in[8] = _mm256_sub_epi16(in[8], s8);
+ in[9] = _mm256_sub_epi16(in[9], s9);
+ in[10] = _mm256_sub_epi16(in[10], s10);
+ in[11] = _mm256_sub_epi16(in[11], s11);
+ in[12] = _mm256_sub_epi16(in[12], s12);
+ in[13] = _mm256_sub_epi16(in[13], s13);
+ in[14] = _mm256_sub_epi16(in[14], s14);
+ in[15] = _mm256_sub_epi16(in[15], s15);
+
+ in[0] = _mm256_srai_epi16(in[0], 2);
+ in[1] = _mm256_srai_epi16(in[1], 2);
+ in[2] = _mm256_srai_epi16(in[2], 2);
+ in[3] = _mm256_srai_epi16(in[3], 2);
+ in[4] = _mm256_srai_epi16(in[4], 2);
+ in[5] = _mm256_srai_epi16(in[5], 2);
+ in[6] = _mm256_srai_epi16(in[6], 2);
+ in[7] = _mm256_srai_epi16(in[7], 2);
+ in[8] = _mm256_srai_epi16(in[8], 2);
+ in[9] = _mm256_srai_epi16(in[9], 2);
+ in[10] = _mm256_srai_epi16(in[10], 2);
+ in[11] = _mm256_srai_epi16(in[11], 2);
+ in[12] = _mm256_srai_epi16(in[12], 2);
+ in[13] = _mm256_srai_epi16(in[13], 2);
+ in[14] = _mm256_srai_epi16(in[14], 2);
+ in[15] = _mm256_srai_epi16(in[15], 2);
+}
+
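+// butter_fly(lo, hi, c) (from txfm_common_avx2.h) presumably multiply-adds
+// the interleaved 16-bit pairs against the packed constant pair c, rounds and
+// shifts by DCT_CONST_BITS, then re-packs to 16 bits, so each call yields one
+// rotated output vector.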
+static void fdct16_avx2(__m256i *in) {
+ // sequence: cospi_L_H = pairs(L, H) and L first
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
+ const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
+ const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+
+ const __m256i cospi_p30_p02 = pair256_set_epi16(cospi_30_64, cospi_2_64);
+ const __m256i cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
+
+ const __m256i cospi_p14_p18 = pair256_set_epi16(cospi_14_64, cospi_18_64);
+ const __m256i cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
+
+ const __m256i cospi_p22_p10 = pair256_set_epi16(cospi_22_64, cospi_10_64);
+ const __m256i cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
+
+ const __m256i cospi_p06_p26 = pair256_set_epi16(cospi_6_64, cospi_26_64);
+ const __m256i cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
+
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m256i t0, t1, t2, t3, t4, t5, t6, t7;
+ __m256i v0, v1, v2, v3;
+ __m256i x0, x1;
+
+ // 0, 4, 8, 12
+ u0 = _mm256_add_epi16(in[0], in[15]);
+ u1 = _mm256_add_epi16(in[1], in[14]);
+ u2 = _mm256_add_epi16(in[2], in[13]);
+ u3 = _mm256_add_epi16(in[3], in[12]);
+ u4 = _mm256_add_epi16(in[4], in[11]);
+ u5 = _mm256_add_epi16(in[5], in[10]);
+ u6 = _mm256_add_epi16(in[6], in[9]);
+ u7 = _mm256_add_epi16(in[7], in[8]);
+
+ s0 = _mm256_add_epi16(u0, u7);
+ s1 = _mm256_add_epi16(u1, u6);
+ s2 = _mm256_add_epi16(u2, u5);
+ s3 = _mm256_add_epi16(u3, u4);
+
+ // 0, 8
+ v0 = _mm256_add_epi16(s0, s3);
+ v1 = _mm256_add_epi16(s1, s2);
+
+ x0 = _mm256_unpacklo_epi16(v0, v1);
+ x1 = _mm256_unpackhi_epi16(v0, v1);
+
+ t0 = butter_fly(x0, x1, cospi_p16_p16);
+ t1 = butter_fly(x0, x1, cospi_p16_m16);
+
+ // 4, 12
+ v0 = _mm256_sub_epi16(s1, s2);
+ v1 = _mm256_sub_epi16(s0, s3);
+
+ x0 = _mm256_unpacklo_epi16(v0, v1);
+ x1 = _mm256_unpackhi_epi16(v0, v1);
+
+ t2 = butter_fly(x0, x1, cospi_p24_p08);
+ t3 = butter_fly(x0, x1, cospi_m08_p24);
+
+ // 2, 6, 10, 14
+ s0 = _mm256_sub_epi16(u3, u4);
+ s1 = _mm256_sub_epi16(u2, u5);
+ s2 = _mm256_sub_epi16(u1, u6);
+ s3 = _mm256_sub_epi16(u0, u7);
+
+ v0 = s0; // output[4]
+ v3 = s3; // output[7]
+
+ x0 = _mm256_unpacklo_epi16(s2, s1);
+ x1 = _mm256_unpackhi_epi16(s2, s1);
+
+ v2 = butter_fly(x0, x1, cospi_p16_p16); // output[5]
+ v1 = butter_fly(x0, x1, cospi_p16_m16); // output[6]
+
+ s0 = _mm256_add_epi16(v0, v1); // step[4]
+ s1 = _mm256_sub_epi16(v0, v1); // step[5]
+ s2 = _mm256_sub_epi16(v3, v2); // step[6]
+ s3 = _mm256_add_epi16(v3, v2); // step[7]
+
+ // 2, 14
+ x0 = _mm256_unpacklo_epi16(s0, s3);
+ x1 = _mm256_unpackhi_epi16(s0, s3);
+
+ t4 = butter_fly(x0, x1, cospi_p28_p04);
+ t5 = butter_fly(x0, x1, cospi_m04_p28);
+
+ // 10, 6
+ x0 = _mm256_unpacklo_epi16(s1, s2);
+ x1 = _mm256_unpackhi_epi16(s1, s2);
+ t6 = butter_fly(x0, x1, cospi_p12_p20);
+ t7 = butter_fly(x0, x1, cospi_m20_p12);
+
+ // 1, 3, 5, 7, 9, 11, 13, 15
+ s0 = _mm256_sub_epi16(in[7], in[8]); // step[8]
+ s1 = _mm256_sub_epi16(in[6], in[9]); // step[9]
+ u2 = _mm256_sub_epi16(in[5], in[10]);
+ u3 = _mm256_sub_epi16(in[4], in[11]);
+ u4 = _mm256_sub_epi16(in[3], in[12]);
+ u5 = _mm256_sub_epi16(in[2], in[13]);
+ s6 = _mm256_sub_epi16(in[1], in[14]); // step[14]
+ s7 = _mm256_sub_epi16(in[0], in[15]); // step[15]
+
+ in[0] = t0;
+ in[8] = t1;
+ in[4] = t2;
+ in[12] = t3;
+ in[2] = t4;
+ in[14] = t5;
+ in[10] = t6;
+ in[6] = t7;
+
+ x0 = _mm256_unpacklo_epi16(u5, u2);
+ x1 = _mm256_unpackhi_epi16(u5, u2);
+
+ s2 = butter_fly(x0, x1, cospi_p16_p16); // step[13]
+ s5 = butter_fly(x0, x1, cospi_p16_m16); // step[10]
+
+ x0 = _mm256_unpacklo_epi16(u4, u3);
+ x1 = _mm256_unpackhi_epi16(u4, u3);
+
+ s3 = butter_fly(x0, x1, cospi_p16_p16); // step[12]
+ s4 = butter_fly(x0, x1, cospi_p16_m16); // step[11]
+
+ u0 = _mm256_add_epi16(s0, s4); // output[8]
+ u1 = _mm256_add_epi16(s1, s5);
+ u2 = _mm256_sub_epi16(s1, s5);
+ u3 = _mm256_sub_epi16(s0, s4);
+ u4 = _mm256_sub_epi16(s7, s3);
+ u5 = _mm256_sub_epi16(s6, s2);
+ u6 = _mm256_add_epi16(s6, s2);
+ u7 = _mm256_add_epi16(s7, s3);
+
+ // stage 4
+ s0 = u0;
+ s3 = u3;
+ s4 = u4;
+ s7 = u7;
+
+ x0 = _mm256_unpacklo_epi16(u1, u6);
+ x1 = _mm256_unpackhi_epi16(u1, u6);
+
+ s1 = butter_fly(x0, x1, cospi_m08_p24);
+ s6 = butter_fly(x0, x1, cospi_p24_p08);
+
+ x0 = _mm256_unpacklo_epi16(u2, u5);
+ x1 = _mm256_unpackhi_epi16(u2, u5);
+
+ s2 = butter_fly(x0, x1, cospi_m24_m08);
+ s5 = butter_fly(x0, x1, cospi_m08_p24);
+
+ // stage 5
+ u0 = _mm256_add_epi16(s0, s1);
+ u1 = _mm256_sub_epi16(s0, s1);
+ u2 = _mm256_sub_epi16(s3, s2);
+ u3 = _mm256_add_epi16(s3, s2);
+ u4 = _mm256_add_epi16(s4, s5);
+ u5 = _mm256_sub_epi16(s4, s5);
+ u6 = _mm256_sub_epi16(s7, s6);
+ u7 = _mm256_add_epi16(s7, s6);
+
+ // stage 6
+ x0 = _mm256_unpacklo_epi16(u0, u7);
+ x1 = _mm256_unpackhi_epi16(u0, u7);
+ in[1] = butter_fly(x0, x1, cospi_p30_p02);
+ in[15] = butter_fly(x0, x1, cospi_m02_p30);
+
+ x0 = _mm256_unpacklo_epi16(u1, u6);
+ x1 = _mm256_unpackhi_epi16(u1, u6);
+ in[9] = butter_fly(x0, x1, cospi_p14_p18);
+ in[7] = butter_fly(x0, x1, cospi_m18_p14);
+
+ x0 = _mm256_unpacklo_epi16(u2, u5);
+ x1 = _mm256_unpackhi_epi16(u2, u5);
+ in[5] = butter_fly(x0, x1, cospi_p22_p10);
+ in[11] = butter_fly(x0, x1, cospi_m10_p22);
+
+ x0 = _mm256_unpacklo_epi16(u3, u4);
+ x1 = _mm256_unpackhi_epi16(u3, u4);
+ in[13] = butter_fly(x0, x1, cospi_p06_p26);
+ in[3] = butter_fly(x0, x1, cospi_m26_p06);
+}
+
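+// Unlike fdct16_avx2, the ADST keeps its per-stage products in 32 bits
+// (_mm256_madd_epi16) and only re-packs to 16 bits with saturation where a
+// later stage needs 16-bit inputs again.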
+void fadst16_avx2(__m256i *in) {
+ const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
+ const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
+ const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
+ const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
+ const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
+ const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
+ const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
+ const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
+ const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+ __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m256i y0, y1;
+
+  // stage 1: s* holds the unpacklo (low-half) products, x* the unpackhi (high-half) products
+ y0 = _mm256_unpacklo_epi16(in[15], in[0]);
+ y1 = _mm256_unpackhi_epi16(in[15], in[0]);
+ s0 = _mm256_madd_epi16(y0, cospi_p01_p31);
+ x0 = _mm256_madd_epi16(y1, cospi_p01_p31);
+ s1 = _mm256_madd_epi16(y0, cospi_p31_m01);
+ x1 = _mm256_madd_epi16(y1, cospi_p31_m01);
+
+ y0 = _mm256_unpacklo_epi16(in[13], in[2]);
+ y1 = _mm256_unpackhi_epi16(in[13], in[2]);
+ s2 = _mm256_madd_epi16(y0, cospi_p05_p27);
+ x2 = _mm256_madd_epi16(y1, cospi_p05_p27);
+ s3 = _mm256_madd_epi16(y0, cospi_p27_m05);
+ x3 = _mm256_madd_epi16(y1, cospi_p27_m05);
+
+ y0 = _mm256_unpacklo_epi16(in[11], in[4]);
+ y1 = _mm256_unpackhi_epi16(in[11], in[4]);
+ s4 = _mm256_madd_epi16(y0, cospi_p09_p23);
+ x4 = _mm256_madd_epi16(y1, cospi_p09_p23);
+ s5 = _mm256_madd_epi16(y0, cospi_p23_m09);
+ x5 = _mm256_madd_epi16(y1, cospi_p23_m09);
+
+ y0 = _mm256_unpacklo_epi16(in[9], in[6]);
+ y1 = _mm256_unpackhi_epi16(in[9], in[6]);
+ s6 = _mm256_madd_epi16(y0, cospi_p13_p19);
+ x6 = _mm256_madd_epi16(y1, cospi_p13_p19);
+ s7 = _mm256_madd_epi16(y0, cospi_p19_m13);
+ x7 = _mm256_madd_epi16(y1, cospi_p19_m13);
+
+ y0 = _mm256_unpacklo_epi16(in[7], in[8]);
+ y1 = _mm256_unpackhi_epi16(in[7], in[8]);
+ s8 = _mm256_madd_epi16(y0, cospi_p17_p15);
+ x8 = _mm256_madd_epi16(y1, cospi_p17_p15);
+ s9 = _mm256_madd_epi16(y0, cospi_p15_m17);
+ x9 = _mm256_madd_epi16(y1, cospi_p15_m17);
+
+ y0 = _mm256_unpacklo_epi16(in[5], in[10]);
+ y1 = _mm256_unpackhi_epi16(in[5], in[10]);
+ s10 = _mm256_madd_epi16(y0, cospi_p21_p11);
+ x10 = _mm256_madd_epi16(y1, cospi_p21_p11);
+ s11 = _mm256_madd_epi16(y0, cospi_p11_m21);
+ x11 = _mm256_madd_epi16(y1, cospi_p11_m21);
+
+ y0 = _mm256_unpacklo_epi16(in[3], in[12]);
+ y1 = _mm256_unpackhi_epi16(in[3], in[12]);
+ s12 = _mm256_madd_epi16(y0, cospi_p25_p07);
+ x12 = _mm256_madd_epi16(y1, cospi_p25_p07);
+ s13 = _mm256_madd_epi16(y0, cospi_p07_m25);
+ x13 = _mm256_madd_epi16(y1, cospi_p07_m25);
+
+ y0 = _mm256_unpacklo_epi16(in[1], in[14]);
+ y1 = _mm256_unpackhi_epi16(in[1], in[14]);
+ s14 = _mm256_madd_epi16(y0, cospi_p29_p03);
+ x14 = _mm256_madd_epi16(y1, cospi_p29_p03);
+ s15 = _mm256_madd_epi16(y0, cospi_p03_m29);
+ x15 = _mm256_madd_epi16(y1, cospi_p03_m29);
+
+  // u* combines the low-half (s) terms, v* the high-half (x) terms
+ u0 = _mm256_add_epi32(s0, s8);
+ u1 = _mm256_add_epi32(s1, s9);
+ u2 = _mm256_add_epi32(s2, s10);
+ u3 = _mm256_add_epi32(s3, s11);
+ u4 = _mm256_add_epi32(s4, s12);
+ u5 = _mm256_add_epi32(s5, s13);
+ u6 = _mm256_add_epi32(s6, s14);
+ u7 = _mm256_add_epi32(s7, s15);
+
+ u8 = _mm256_sub_epi32(s0, s8);
+ u9 = _mm256_sub_epi32(s1, s9);
+ u10 = _mm256_sub_epi32(s2, s10);
+ u11 = _mm256_sub_epi32(s3, s11);
+ u12 = _mm256_sub_epi32(s4, s12);
+ u13 = _mm256_sub_epi32(s5, s13);
+ u14 = _mm256_sub_epi32(s6, s14);
+ u15 = _mm256_sub_epi32(s7, s15);
+
+ v0 = _mm256_add_epi32(x0, x8);
+ v1 = _mm256_add_epi32(x1, x9);
+ v2 = _mm256_add_epi32(x2, x10);
+ v3 = _mm256_add_epi32(x3, x11);
+ v4 = _mm256_add_epi32(x4, x12);
+ v5 = _mm256_add_epi32(x5, x13);
+ v6 = _mm256_add_epi32(x6, x14);
+ v7 = _mm256_add_epi32(x7, x15);
+
+ v8 = _mm256_sub_epi32(x0, x8);
+ v9 = _mm256_sub_epi32(x1, x9);
+ v10 = _mm256_sub_epi32(x2, x10);
+ v11 = _mm256_sub_epi32(x3, x11);
+ v12 = _mm256_sub_epi32(x4, x12);
+ v13 = _mm256_sub_epi32(x5, x13);
+ v14 = _mm256_sub_epi32(x6, x14);
+ v15 = _mm256_sub_epi32(x7, x15);
+
+  // round the low-half (u) terms
+ u8 = _mm256_add_epi32(u8, dct_rounding);
+ u9 = _mm256_add_epi32(u9, dct_rounding);
+ u10 = _mm256_add_epi32(u10, dct_rounding);
+ u11 = _mm256_add_epi32(u11, dct_rounding);
+ u12 = _mm256_add_epi32(u12, dct_rounding);
+ u13 = _mm256_add_epi32(u13, dct_rounding);
+ u14 = _mm256_add_epi32(u14, dct_rounding);
+ u15 = _mm256_add_epi32(u15, dct_rounding);
+
+ u8 = _mm256_srai_epi32(u8, DCT_CONST_BITS);
+ u9 = _mm256_srai_epi32(u9, DCT_CONST_BITS);
+ u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
+ u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
+ u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
+ u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+  // round the high-half (v) terms
+ v8 = _mm256_add_epi32(v8, dct_rounding);
+ v9 = _mm256_add_epi32(v9, dct_rounding);
+ v10 = _mm256_add_epi32(v10, dct_rounding);
+ v11 = _mm256_add_epi32(v11, dct_rounding);
+ v12 = _mm256_add_epi32(v12, dct_rounding);
+ v13 = _mm256_add_epi32(v13, dct_rounding);
+ v14 = _mm256_add_epi32(v14, dct_rounding);
+ v15 = _mm256_add_epi32(v15, dct_rounding);
+
+ v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
+ v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
+ v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
+ v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
+ v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
+ v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+  // Saturating pack from 32-bit back to 16-bit
+ x8 = _mm256_packs_epi32(u8, v8);
+ x9 = _mm256_packs_epi32(u9, v9);
+ x10 = _mm256_packs_epi32(u10, v10);
+ x11 = _mm256_packs_epi32(u11, v11);
+ x12 = _mm256_packs_epi32(u12, v12);
+ x13 = _mm256_packs_epi32(u13, v13);
+ x14 = _mm256_packs_epi32(u14, v14);
+ x15 = _mm256_packs_epi32(u15, v15);
+
+ // stage 2
+ y0 = _mm256_unpacklo_epi16(x8, x9);
+ y1 = _mm256_unpackhi_epi16(x8, x9);
+ s8 = _mm256_madd_epi16(y0, cospi_p04_p28);
+ x8 = _mm256_madd_epi16(y1, cospi_p04_p28);
+ s9 = _mm256_madd_epi16(y0, cospi_p28_m04);
+ x9 = _mm256_madd_epi16(y1, cospi_p28_m04);
+
+ y0 = _mm256_unpacklo_epi16(x10, x11);
+ y1 = _mm256_unpackhi_epi16(x10, x11);
+ s10 = _mm256_madd_epi16(y0, cospi_p20_p12);
+ x10 = _mm256_madd_epi16(y1, cospi_p20_p12);
+ s11 = _mm256_madd_epi16(y0, cospi_p12_m20);
+ x11 = _mm256_madd_epi16(y1, cospi_p12_m20);
+
+ y0 = _mm256_unpacklo_epi16(x12, x13);
+ y1 = _mm256_unpackhi_epi16(x12, x13);
+ s12 = _mm256_madd_epi16(y0, cospi_m28_p04);
+ x12 = _mm256_madd_epi16(y1, cospi_m28_p04);
+ s13 = _mm256_madd_epi16(y0, cospi_p04_p28);
+ x13 = _mm256_madd_epi16(y1, cospi_p04_p28);
+
+ y0 = _mm256_unpacklo_epi16(x14, x15);
+ y1 = _mm256_unpackhi_epi16(x14, x15);
+ s14 = _mm256_madd_epi16(y0, cospi_m12_p20);
+ x14 = _mm256_madd_epi16(y1, cospi_m12_p20);
+ s15 = _mm256_madd_epi16(y0, cospi_p20_p12);
+ x15 = _mm256_madd_epi16(y1, cospi_p20_p12);
+
+ x0 = _mm256_add_epi32(u0, u4);
+ s0 = _mm256_add_epi32(v0, v4);
+ x1 = _mm256_add_epi32(u1, u5);
+ s1 = _mm256_add_epi32(v1, v5);
+ x2 = _mm256_add_epi32(u2, u6);
+ s2 = _mm256_add_epi32(v2, v6);
+ x3 = _mm256_add_epi32(u3, u7);
+ s3 = _mm256_add_epi32(v3, v7);
+
+ v8 = _mm256_sub_epi32(u0, u4);
+ v9 = _mm256_sub_epi32(v0, v4);
+ v10 = _mm256_sub_epi32(u1, u5);
+ v11 = _mm256_sub_epi32(v1, v5);
+ v12 = _mm256_sub_epi32(u2, u6);
+ v13 = _mm256_sub_epi32(v2, v6);
+ v14 = _mm256_sub_epi32(u3, u7);
+ v15 = _mm256_sub_epi32(v3, v7);
+
+ v8 = _mm256_add_epi32(v8, dct_rounding);
+ v9 = _mm256_add_epi32(v9, dct_rounding);
+ v10 = _mm256_add_epi32(v10, dct_rounding);
+ v11 = _mm256_add_epi32(v11, dct_rounding);
+ v12 = _mm256_add_epi32(v12, dct_rounding);
+ v13 = _mm256_add_epi32(v13, dct_rounding);
+ v14 = _mm256_add_epi32(v14, dct_rounding);
+ v15 = _mm256_add_epi32(v15, dct_rounding);
+
+ v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
+ v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
+ v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
+ v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
+ v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
+ v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ x4 = _mm256_packs_epi32(v8, v9);
+ x5 = _mm256_packs_epi32(v10, v11);
+ x6 = _mm256_packs_epi32(v12, v13);
+ x7 = _mm256_packs_epi32(v14, v15);
+
+ u8 = _mm256_add_epi32(s8, s12);
+ u9 = _mm256_add_epi32(s9, s13);
+ u10 = _mm256_add_epi32(s10, s14);
+ u11 = _mm256_add_epi32(s11, s15);
+ u12 = _mm256_sub_epi32(s8, s12);
+ u13 = _mm256_sub_epi32(s9, s13);
+ u14 = _mm256_sub_epi32(s10, s14);
+ u15 = _mm256_sub_epi32(s11, s15);
+
+ v8 = _mm256_add_epi32(x8, x12);
+ v9 = _mm256_add_epi32(x9, x13);
+ v10 = _mm256_add_epi32(x10, x14);
+ v11 = _mm256_add_epi32(x11, x15);
+ v12 = _mm256_sub_epi32(x8, x12);
+ v13 = _mm256_sub_epi32(x9, x13);
+ v14 = _mm256_sub_epi32(x10, x14);
+ v15 = _mm256_sub_epi32(x11, x15);
+
+ u12 = _mm256_add_epi32(u12, dct_rounding);
+ u13 = _mm256_add_epi32(u13, dct_rounding);
+ u14 = _mm256_add_epi32(u14, dct_rounding);
+ u15 = _mm256_add_epi32(u15, dct_rounding);
+
+ u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
+ u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+ v12 = _mm256_add_epi32(v12, dct_rounding);
+ v13 = _mm256_add_epi32(v13, dct_rounding);
+ v14 = _mm256_add_epi32(v14, dct_rounding);
+ v15 = _mm256_add_epi32(v15, dct_rounding);
+
+ v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
+ v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ x12 = _mm256_packs_epi32(u12, v12);
+ x13 = _mm256_packs_epi32(u13, v13);
+ x14 = _mm256_packs_epi32(u14, v14);
+ x15 = _mm256_packs_epi32(u15, v15);
+
+ // stage 3
+ y0 = _mm256_unpacklo_epi16(x4, x5);
+ y1 = _mm256_unpackhi_epi16(x4, x5);
+ s4 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x4 = _mm256_madd_epi16(y1, cospi_p08_p24);
+ s5 = _mm256_madd_epi16(y0, cospi_p24_m08);
+ x5 = _mm256_madd_epi16(y1, cospi_p24_m08);
+
+ y0 = _mm256_unpacklo_epi16(x6, x7);
+ y1 = _mm256_unpackhi_epi16(x6, x7);
+ s6 = _mm256_madd_epi16(y0, cospi_m24_p08);
+ x6 = _mm256_madd_epi16(y1, cospi_m24_p08);
+ s7 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x7 = _mm256_madd_epi16(y1, cospi_p08_p24);
+
+ y0 = _mm256_unpacklo_epi16(x12, x13);
+ y1 = _mm256_unpackhi_epi16(x12, x13);
+ s12 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x12 = _mm256_madd_epi16(y1, cospi_p08_p24);
+ s13 = _mm256_madd_epi16(y0, cospi_p24_m08);
+ x13 = _mm256_madd_epi16(y1, cospi_p24_m08);
+
+ y0 = _mm256_unpacklo_epi16(x14, x15);
+ y1 = _mm256_unpackhi_epi16(x14, x15);
+ s14 = _mm256_madd_epi16(y0, cospi_m24_p08);
+ x14 = _mm256_madd_epi16(y1, cospi_m24_p08);
+ s15 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x15 = _mm256_madd_epi16(y1, cospi_p08_p24);
+
+ u0 = _mm256_add_epi32(x0, x2);
+ v0 = _mm256_add_epi32(s0, s2);
+ u1 = _mm256_add_epi32(x1, x3);
+ v1 = _mm256_add_epi32(s1, s3);
+ u2 = _mm256_sub_epi32(x0, x2);
+ v2 = _mm256_sub_epi32(s0, s2);
+ u3 = _mm256_sub_epi32(x1, x3);
+ v3 = _mm256_sub_epi32(s1, s3);
+
+ u0 = _mm256_add_epi32(u0, dct_rounding);
+ v0 = _mm256_add_epi32(v0, dct_rounding);
+ u1 = _mm256_add_epi32(u1, dct_rounding);
+ v1 = _mm256_add_epi32(v1, dct_rounding);
+ u2 = _mm256_add_epi32(u2, dct_rounding);
+ v2 = _mm256_add_epi32(v2, dct_rounding);
+ u3 = _mm256_add_epi32(u3, dct_rounding);
+ v3 = _mm256_add_epi32(v3, dct_rounding);
+
+ u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+ v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+ v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
+ v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
+ v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
+
+ in[0] = _mm256_packs_epi32(u0, v0);
+ x1 = _mm256_packs_epi32(u1, v1);
+ x2 = _mm256_packs_epi32(u2, v2);
+ x3 = _mm256_packs_epi32(u3, v3);
+
+ // Rounding on s4 + s6, s5 + s7, s4 - s6, s5 - s7
+ u4 = _mm256_add_epi32(s4, s6);
+ u5 = _mm256_add_epi32(s5, s7);
+ u6 = _mm256_sub_epi32(s4, s6);
+ u7 = _mm256_sub_epi32(s5, s7);
+
+ v4 = _mm256_add_epi32(x4, x6);
+ v5 = _mm256_add_epi32(x5, x7);
+ v6 = _mm256_sub_epi32(x4, x6);
+ v7 = _mm256_sub_epi32(x5, x7);
+
+ u4 = _mm256_add_epi32(u4, dct_rounding);
+ u5 = _mm256_add_epi32(u5, dct_rounding);
+ u6 = _mm256_add_epi32(u6, dct_rounding);
+ u7 = _mm256_add_epi32(u7, dct_rounding);
+
+ u4 = _mm256_srai_epi32(u4, DCT_CONST_BITS);
+ u5 = _mm256_srai_epi32(u5, DCT_CONST_BITS);
+ u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
+ u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
+
+ v4 = _mm256_add_epi32(v4, dct_rounding);
+ v5 = _mm256_add_epi32(v5, dct_rounding);
+ v6 = _mm256_add_epi32(v6, dct_rounding);
+ v7 = _mm256_add_epi32(v7, dct_rounding);
+
+ v4 = _mm256_srai_epi32(v4, DCT_CONST_BITS);
+ v5 = _mm256_srai_epi32(v5, DCT_CONST_BITS);
+ v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
+ v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
+
+ x4 = _mm256_packs_epi32(u4, v4);
+ in[12] = _mm256_packs_epi32(u5, v5);
+ x6 = _mm256_packs_epi32(u6, v6);
+ x7 = _mm256_packs_epi32(u7, v7);
+
+ u0 = _mm256_add_epi32(u8, u10);
+ v0 = _mm256_add_epi32(v8, v10);
+ u1 = _mm256_add_epi32(u9, u11);
+ v1 = _mm256_add_epi32(v9, v11);
+ u2 = _mm256_sub_epi32(u8, u10);
+ v2 = _mm256_sub_epi32(v8, v10);
+ u3 = _mm256_sub_epi32(u9, u11);
+ v3 = _mm256_sub_epi32(v9, v11);
+
+ u0 = _mm256_add_epi32(u0, dct_rounding);
+ v0 = _mm256_add_epi32(v0, dct_rounding);
+ u1 = _mm256_add_epi32(u1, dct_rounding);
+ v1 = _mm256_add_epi32(v1, dct_rounding);
+ u2 = _mm256_add_epi32(u2, dct_rounding);
+ v2 = _mm256_add_epi32(v2, dct_rounding);
+ u3 = _mm256_add_epi32(u3, dct_rounding);
+ v3 = _mm256_add_epi32(v3, dct_rounding);
+
+ u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+ v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+ v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
+ v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
+ v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
+
+ x8 = _mm256_packs_epi32(u0, v0);
+ in[14] = _mm256_packs_epi32(u1, v1);
+ x10 = _mm256_packs_epi32(u2, v2);
+ x11 = _mm256_packs_epi32(u3, v3);
+
+ // Rounding on s12 + s14, s13 + s15, s12 - s14, s13 - s15
+ u12 = _mm256_add_epi32(s12, s14);
+ u13 = _mm256_add_epi32(s13, s15);
+ u14 = _mm256_sub_epi32(s12, s14);
+ u15 = _mm256_sub_epi32(s13, s15);
+
+ v12 = _mm256_add_epi32(x12, x14);
+ v13 = _mm256_add_epi32(x13, x15);
+ v14 = _mm256_sub_epi32(x12, x14);
+ v15 = _mm256_sub_epi32(x13, x15);
+
+ u12 = _mm256_add_epi32(u12, dct_rounding);
+ u13 = _mm256_add_epi32(u13, dct_rounding);
+ u14 = _mm256_add_epi32(u14, dct_rounding);
+ u15 = _mm256_add_epi32(u15, dct_rounding);
+
+ u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
+ u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+ v12 = _mm256_add_epi32(v12, dct_rounding);
+ v13 = _mm256_add_epi32(v13, dct_rounding);
+ v14 = _mm256_add_epi32(v14, dct_rounding);
+ v15 = _mm256_add_epi32(v15, dct_rounding);
+
+ v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
+ v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ x12 = _mm256_packs_epi32(u12, v12);
+ x13 = _mm256_packs_epi32(u13, v13);
+ x14 = _mm256_packs_epi32(u14, v14);
+ x15 = _mm256_packs_epi32(u15, v15);
+ in[2] = x12;
+
+ // stage 4
+ y0 = _mm256_unpacklo_epi16(x2, x3);
+ y1 = _mm256_unpackhi_epi16(x2, x3);
+ s2 = _mm256_madd_epi16(y0, cospi_m16_m16);
+ x2 = _mm256_madd_epi16(y1, cospi_m16_m16);
+ s3 = _mm256_madd_epi16(y0, cospi_p16_m16);
+ x3 = _mm256_madd_epi16(y1, cospi_p16_m16);
+
+ y0 = _mm256_unpacklo_epi16(x6, x7);
+ y1 = _mm256_unpackhi_epi16(x6, x7);
+ s6 = _mm256_madd_epi16(y0, cospi_p16_p16);
+ x6 = _mm256_madd_epi16(y1, cospi_p16_p16);
+ s7 = _mm256_madd_epi16(y0, cospi_m16_p16);
+ x7 = _mm256_madd_epi16(y1, cospi_m16_p16);
+
+ y0 = _mm256_unpacklo_epi16(x10, x11);
+ y1 = _mm256_unpackhi_epi16(x10, x11);
+ s10 = _mm256_madd_epi16(y0, cospi_p16_p16);
+ x10 = _mm256_madd_epi16(y1, cospi_p16_p16);
+ s11 = _mm256_madd_epi16(y0, cospi_m16_p16);
+ x11 = _mm256_madd_epi16(y1, cospi_m16_p16);
+
+ y0 = _mm256_unpacklo_epi16(x14, x15);
+ y1 = _mm256_unpackhi_epi16(x14, x15);
+ s14 = _mm256_madd_epi16(y0, cospi_m16_m16);
+ x14 = _mm256_madd_epi16(y1, cospi_m16_m16);
+ s15 = _mm256_madd_epi16(y0, cospi_p16_m16);
+ x15 = _mm256_madd_epi16(y1, cospi_p16_m16);
+
+ // Rounding
+ u2 = _mm256_add_epi32(s2, dct_rounding);
+ u3 = _mm256_add_epi32(s3, dct_rounding);
+ u6 = _mm256_add_epi32(s6, dct_rounding);
+ u7 = _mm256_add_epi32(s7, dct_rounding);
+
+ u10 = _mm256_add_epi32(s10, dct_rounding);
+ u11 = _mm256_add_epi32(s11, dct_rounding);
+ u14 = _mm256_add_epi32(s14, dct_rounding);
+ u15 = _mm256_add_epi32(s15, dct_rounding);
+
+ u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
+ u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
+ u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
+ u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
+
+ u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
+ u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+ v2 = _mm256_add_epi32(x2, dct_rounding);
+ v3 = _mm256_add_epi32(x3, dct_rounding);
+ v6 = _mm256_add_epi32(x6, dct_rounding);
+ v7 = _mm256_add_epi32(x7, dct_rounding);
+
+ v10 = _mm256_add_epi32(x10, dct_rounding);
+ v11 = _mm256_add_epi32(x11, dct_rounding);
+ v14 = _mm256_add_epi32(x14, dct_rounding);
+ v15 = _mm256_add_epi32(x15, dct_rounding);
+
+ v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
+ v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
+ v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
+ v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
+
+ v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
+ v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ in[7] = _mm256_packs_epi32(u2, v2);
+ in[8] = _mm256_packs_epi32(u3, v3);
+
+ in[4] = _mm256_packs_epi32(u6, v6);
+ in[11] = _mm256_packs_epi32(u7, v7);
+
+ in[6] = _mm256_packs_epi32(u10, v10);
+ in[9] = _mm256_packs_epi32(u11, v11);
+
+ in[5] = _mm256_packs_epi32(u14, v14);
+ in[10] = _mm256_packs_epi32(u15, v15);
+
+ in[1] = _mm256_sub_epi16(zero, x8);
+ in[3] = _mm256_sub_epi16(zero, x4);
+ in[13] = _mm256_sub_epi16(zero, x13);
+ in[15] = _mm256_sub_epi16(zero, x1);
+}
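+
+/* The add-then-shift pairs used throughout the transforms above implement
+ * fixed-point rounding; a scalar sketch of the same step:
+ *   static INLINE int32_t round_shift(int32_t x) {
+ *     return (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+ *   }
+ * where DCT_CONST_ROUNDING == 1 << (DCT_CONST_BITS - 1).
+ */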
+
+#if CONFIG_EXT_TX
+static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); }
+#endif
+
+void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m256i in[16];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case ADST_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case ADST_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, stride, 1, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, stride, 0, 1, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, stride, 1, 1, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, stride, 0, 1, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, stride, 1, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case IDTX:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
+ case V_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
+ case H_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case V_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
+ case H_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x16(input, stride, 1, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x16(input, stride, 0, 1, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ mm256_transpose_16x16(in);
+ write_buffer_16x16(in, output);
+ _mm256_zeroupper();
+}
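+
+/* Every tx_type above follows the same two-pass shape: load the block (with
+ * optional vertical/horizontal flips), transform the columns, transpose,
+ * rescale, transform the rows, then transpose once more before the store:
+ *   load -> col txfm -> transpose -> right_shift -> row txfm
+ *        -> transpose -> write
+ */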
+
+void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ // top-left 16x16 block
+ int32_t sum = get_16x16_sum(input, stride);
+ // top-right 16x16 block
+ sum += get_16x16_sum(input + 16, stride);
+ // bottom-left 16x16 block
+ sum += get_16x16_sum(input + (stride << 4), stride);
+ // bottom-right 16x16 block
+ sum += get_16x16_sum(input + (stride << 4) + 16, stride);
+
+ sum >>= 3;
+ output[0] = (tran_low_t)sum;
+ _mm256_zeroupper();
+}
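+
+/* Scalar equivalent of the DC-only 32x32 transform above (a sketch): sum
+ * all 32*32 samples (done above via four 16x16 quadrant sums), then scale:
+ *   int32_t sum = 0;
+ *   for (int r = 0; r < 32; ++r)
+ *     for (int c = 0; c < 32; ++c) sum += input[r * stride + c];
+ *   output[0] = (tran_low_t)(sum >> 3);
+ */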
+
+static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
+ int i = 0;
+ __m256i temp;
+ while (i < size) {
+ temp = a0[i];
+ a0[i] = a1[i];
+ a1[i] = temp;
+ i++;
+ }
+}
+
+static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) {
+ mm256_transpose_16x16(in0);
+ mm256_transpose_16x16(&in0[16]);
+ mm256_transpose_16x16(in1);
+ mm256_transpose_16x16(&in1[16]);
+ mm256_vectors_swap(&in0[16], in1, 16);
+}
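+
+/* The 32x32 transpose works on 16x16 quadrants: transpose each quadrant in
+ * place, then swap the two off-diagonal quadrants, per the block identity
+ *   [A B]^T = [A^T C^T]
+ *   [C D]     [B^T D^T]
+ */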
+
+static void prepare_16x16_even(const __m256i *in, __m256i *even) {
+ even[0] = _mm256_add_epi16(in[0], in[31]);
+ even[1] = _mm256_add_epi16(in[1], in[30]);
+ even[2] = _mm256_add_epi16(in[2], in[29]);
+ even[3] = _mm256_add_epi16(in[3], in[28]);
+ even[4] = _mm256_add_epi16(in[4], in[27]);
+ even[5] = _mm256_add_epi16(in[5], in[26]);
+ even[6] = _mm256_add_epi16(in[6], in[25]);
+ even[7] = _mm256_add_epi16(in[7], in[24]);
+ even[8] = _mm256_add_epi16(in[8], in[23]);
+ even[9] = _mm256_add_epi16(in[9], in[22]);
+ even[10] = _mm256_add_epi16(in[10], in[21]);
+ even[11] = _mm256_add_epi16(in[11], in[20]);
+ even[12] = _mm256_add_epi16(in[12], in[19]);
+ even[13] = _mm256_add_epi16(in[13], in[18]);
+ even[14] = _mm256_add_epi16(in[14], in[17]);
+ even[15] = _mm256_add_epi16(in[15], in[16]);
+}
+
+static void prepare_16x16_odd(const __m256i *in, __m256i *odd) {
+ odd[0] = _mm256_sub_epi16(in[15], in[16]);
+ odd[1] = _mm256_sub_epi16(in[14], in[17]);
+ odd[2] = _mm256_sub_epi16(in[13], in[18]);
+ odd[3] = _mm256_sub_epi16(in[12], in[19]);
+ odd[4] = _mm256_sub_epi16(in[11], in[20]);
+ odd[5] = _mm256_sub_epi16(in[10], in[21]);
+ odd[6] = _mm256_sub_epi16(in[9], in[22]);
+ odd[7] = _mm256_sub_epi16(in[8], in[23]);
+ odd[8] = _mm256_sub_epi16(in[7], in[24]);
+ odd[9] = _mm256_sub_epi16(in[6], in[25]);
+ odd[10] = _mm256_sub_epi16(in[5], in[26]);
+ odd[11] = _mm256_sub_epi16(in[4], in[27]);
+ odd[12] = _mm256_sub_epi16(in[3], in[28]);
+ odd[13] = _mm256_sub_epi16(in[2], in[29]);
+ odd[14] = _mm256_sub_epi16(in[1], in[30]);
+ odd[15] = _mm256_sub_epi16(in[0], in[31]);
+}
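+
+/* Standard even/odd decomposition of the 32-point DCT: even-indexed outputs
+ * are a 16-point DCT of the sums in[i] + in[31-i]; odd-indexed outputs come
+ * from a dedicated 16-point transform of the differences in[15-i] - in[16+i]
+ * (fdct16_odd_avx2() below). Schematically, with X the 32-point output:
+ *   X[2k]   = DCT16(even)[k]
+ *   X[2k+1] = ODD16(odd)[k]
+ */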
+
+static void collect_16col(const __m256i *even, const __m256i *odd,
+ __m256i *out) {
+ // fdct16_avx2() already leaves its outputs in final coefficient order,
+ // so the even half maps straight onto the even rows below
+ out[0] = even[0];
+ out[2] = even[1];
+ out[4] = even[2];
+ out[6] = even[3];
+ out[8] = even[4];
+ out[10] = even[5];
+ out[12] = even[6];
+ out[14] = even[7];
+ out[16] = even[8];
+ out[18] = even[9];
+ out[20] = even[10];
+ out[22] = even[11];
+ out[24] = even[12];
+ out[26] = even[13];
+ out[28] = even[14];
+ out[30] = even[15];
+
+ out[1] = odd[0];
+ out[17] = odd[1];
+ out[9] = odd[2];
+ out[25] = odd[3];
+ out[5] = odd[4];
+ out[21] = odd[5];
+ out[13] = odd[6];
+ out[29] = odd[7];
+ out[3] = odd[8];
+ out[19] = odd[9];
+ out[11] = odd[10];
+ out[27] = odd[11];
+ out[7] = odd[12];
+ out[23] = odd[13];
+ out[15] = odd[14];
+ out[31] = odd[15];
+}
+
+static void collect_coeffs(const __m256i *first_16col_even,
+ const __m256i *first_16col_odd,
+ const __m256i *second_16col_even,
+ const __m256i *second_16col_odd, __m256i *in0,
+ __m256i *in1) {
+ collect_16col(first_16col_even, first_16col_odd, in0);
+ collect_16col(second_16col_even, second_16col_odd, in1);
+}
+
+static void fdct16_odd_avx2(__m256i *in) {
+ // naming: cospi_L_H holds the 16-bit pair (L, H), with L first
+ const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
+ const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
+ const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64);
+ const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64);
+ const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64);
+ const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64);
+ const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64);
+ const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64);
+ const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64);
+ const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64);
+ const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+
+ __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
+ __m256i u0, u1;
+
+ // stage 1 is in prepare_16x16_odd()
+
+ // stage 2
+ y0 = in[0];
+ y1 = in[1];
+ y2 = in[2];
+ y3 = in[3];
+
+ u0 = _mm256_unpacklo_epi16(in[4], in[11]);
+ u1 = _mm256_unpackhi_epi16(in[4], in[11]);
+ y4 = butter_fly(u0, u1, cospi_m16_p16);
+ y11 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[5], in[10]);
+ u1 = _mm256_unpackhi_epi16(in[5], in[10]);
+ y5 = butter_fly(u0, u1, cospi_m16_p16);
+ y10 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[6], in[9]);
+ u1 = _mm256_unpackhi_epi16(in[6], in[9]);
+ y6 = butter_fly(u0, u1, cospi_m16_p16);
+ y9 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[7], in[8]);
+ u1 = _mm256_unpackhi_epi16(in[7], in[8]);
+ y7 = butter_fly(u0, u1, cospi_m16_p16);
+ y8 = butter_fly(u0, u1, cospi_p16_p16);
+
+ y12 = in[12];
+ y13 = in[13];
+ y14 = in[14];
+ y15 = in[15];
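+
+ // butter_fly(lo, hi, cospi) -- presumably from txfm_common_avx2.h -- is
+ // assumed to madd against the cospi pair, add DCT_CONST_ROUNDING, shift
+ // right by DCT_CONST_BITS and pack back to 16 bits: one rotated output
+ // per call.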
+
+ // stage 3
+ x0 = _mm256_add_epi16(y0, y7);
+ x1 = _mm256_add_epi16(y1, y6);
+ x2 = _mm256_add_epi16(y2, y5);
+ x3 = _mm256_add_epi16(y3, y4);
+ x4 = _mm256_sub_epi16(y3, y4);
+ x5 = _mm256_sub_epi16(y2, y5);
+ x6 = _mm256_sub_epi16(y1, y6);
+ x7 = _mm256_sub_epi16(y0, y7);
+ x8 = _mm256_sub_epi16(y15, y8);
+ x9 = _mm256_sub_epi16(y14, y9);
+ x10 = _mm256_sub_epi16(y13, y10);
+ x11 = _mm256_sub_epi16(y12, y11);
+ x12 = _mm256_add_epi16(y12, y11);
+ x13 = _mm256_add_epi16(y13, y10);
+ x14 = _mm256_add_epi16(y14, y9);
+ x15 = _mm256_add_epi16(y15, y8);
+
+ // stage 4
+ y0 = x0;
+ y1 = x1;
+ y6 = x6;
+ y7 = x7;
+ y8 = x8;
+ y9 = x9;
+ y14 = x14;
+ y15 = x15;
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ y2 = butter_fly(u0, u1, cospi_m08_p24);
+ y13 = butter_fly(u0, u1, cospi_p24_p08);
+
+ u0 = _mm256_unpacklo_epi16(x3, x12);
+ u1 = _mm256_unpackhi_epi16(x3, x12);
+ y3 = butter_fly(u0, u1, cospi_m08_p24);
+ y12 = butter_fly(u0, u1, cospi_p24_p08);
+
+ u0 = _mm256_unpacklo_epi16(x4, x11);
+ u1 = _mm256_unpackhi_epi16(x4, x11);
+ y4 = butter_fly(u0, u1, cospi_m24_m08);
+ y11 = butter_fly(u0, u1, cospi_m08_p24);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ y5 = butter_fly(u0, u1, cospi_m24_m08);
+ y10 = butter_fly(u0, u1, cospi_m08_p24);
+
+ // stage 5
+ x0 = _mm256_add_epi16(y0, y3);
+ x1 = _mm256_add_epi16(y1, y2);
+ x2 = _mm256_sub_epi16(y1, y2);
+ x3 = _mm256_sub_epi16(y0, y3);
+ x4 = _mm256_sub_epi16(y7, y4);
+ x5 = _mm256_sub_epi16(y6, y5);
+ x6 = _mm256_add_epi16(y6, y5);
+ x7 = _mm256_add_epi16(y7, y4);
+
+ x8 = _mm256_add_epi16(y8, y11);
+ x9 = _mm256_add_epi16(y9, y10);
+ x10 = _mm256_sub_epi16(y9, y10);
+ x11 = _mm256_sub_epi16(y8, y11);
+ x12 = _mm256_sub_epi16(y15, y12);
+ x13 = _mm256_sub_epi16(y14, y13);
+ x14 = _mm256_add_epi16(y14, y13);
+ x15 = _mm256_add_epi16(y15, y12);
+
+ // stage 6
+ y0 = x0;
+ y3 = x3;
+ y4 = x4;
+ y7 = x7;
+ y8 = x8;
+ y11 = x11;
+ y12 = x12;
+ y15 = x15;
+
+ u0 = _mm256_unpacklo_epi16(x1, x14);
+ u1 = _mm256_unpackhi_epi16(x1, x14);
+ y1 = butter_fly(u0, u1, cospi_m04_p28);
+ y14 = butter_fly(u0, u1, cospi_p28_p04);
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ y2 = butter_fly(u0, u1, cospi_m28_m04);
+ y13 = butter_fly(u0, u1, cospi_m04_p28);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ y5 = butter_fly(u0, u1, cospi_m20_p12);
+ y10 = butter_fly(u0, u1, cospi_p12_p20);
+
+ u0 = _mm256_unpacklo_epi16(x6, x9);
+ u1 = _mm256_unpackhi_epi16(x6, x9);
+ y6 = butter_fly(u0, u1, cospi_m12_m20);
+ y9 = butter_fly(u0, u1, cospi_m20_p12);
+
+ // stage 7
+ x0 = _mm256_add_epi16(y0, y1);
+ x1 = _mm256_sub_epi16(y0, y1);
+ x2 = _mm256_sub_epi16(y3, y2);
+ x3 = _mm256_add_epi16(y3, y2);
+ x4 = _mm256_add_epi16(y4, y5);
+ x5 = _mm256_sub_epi16(y4, y5);
+ x6 = _mm256_sub_epi16(y7, y6);
+ x7 = _mm256_add_epi16(y7, y6);
+
+ x8 = _mm256_add_epi16(y8, y9);
+ x9 = _mm256_sub_epi16(y8, y9);
+ x10 = _mm256_sub_epi16(y11, y10);
+ x11 = _mm256_add_epi16(y11, y10);
+ x12 = _mm256_add_epi16(y12, y13);
+ x13 = _mm256_sub_epi16(y12, y13);
+ x14 = _mm256_sub_epi16(y15, y14);
+ x15 = _mm256_add_epi16(y15, y14);
+
+ // stage 8
+ u0 = _mm256_unpacklo_epi16(x0, x15);
+ u1 = _mm256_unpackhi_epi16(x0, x15);
+ in[0] = butter_fly(u0, u1, cospi_p31_p01);
+ in[15] = butter_fly(u0, u1, cospi_m01_p31);
+
+ u0 = _mm256_unpacklo_epi16(x1, x14);
+ u1 = _mm256_unpackhi_epi16(x1, x14);
+ in[1] = butter_fly(u0, u1, cospi_p15_p17);
+ in[14] = butter_fly(u0, u1, cospi_m17_p15);
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ in[2] = butter_fly(u0, u1, cospi_p23_p09);
+ in[13] = butter_fly(u0, u1, cospi_m09_p23);
+
+ u0 = _mm256_unpacklo_epi16(x3, x12);
+ u1 = _mm256_unpackhi_epi16(x3, x12);
+ in[3] = butter_fly(u0, u1, cospi_p07_p25);
+ in[12] = butter_fly(u0, u1, cospi_m25_p07);
+
+ u0 = _mm256_unpacklo_epi16(x4, x11);
+ u1 = _mm256_unpackhi_epi16(x4, x11);
+ in[4] = butter_fly(u0, u1, cospi_p27_p05);
+ in[11] = butter_fly(u0, u1, cospi_m05_p27);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ in[5] = butter_fly(u0, u1, cospi_p11_p21);
+ in[10] = butter_fly(u0, u1, cospi_m21_p11);
+
+ u0 = _mm256_unpacklo_epi16(x6, x9);
+ u1 = _mm256_unpackhi_epi16(x6, x9);
+ in[6] = butter_fly(u0, u1, cospi_p19_p13);
+ in[9] = butter_fly(u0, u1, cospi_m13_p19);
+
+ u0 = _mm256_unpacklo_epi16(x7, x8);
+ u1 = _mm256_unpackhi_epi16(x7, x8);
+ in[7] = butter_fly(u0, u1, cospi_p03_p29);
+ in[8] = butter_fly(u0, u1, cospi_m29_p03);
+}
+
+static void fdct32_avx2(__m256i *in0, __m256i *in1) {
+ __m256i even0[16], even1[16], odd0[16], odd1[16];
+ prepare_16x16_even(in0, even0);
+ fdct16_avx2(even0);
+
+ prepare_16x16_odd(in0, odd0);
+ fdct16_odd_avx2(odd0);
+
+ prepare_16x16_even(in1, even1);
+ fdct16_avx2(even1);
+
+ prepare_16x16_odd(in1, odd1);
+ fdct16_odd_avx2(odd1);
+
+ collect_coeffs(even0, odd0, even1, odd1, in0, in1);
+
+ mm256_transpose_32x32(in0, in1);
+}
+
+static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
+ tran_low_t *output) {
+ int i = 0;
+ const int stride = 32;
+ tran_low_t *coeff = output;
+ while (i < 32) {
+ storeu_output_avx2(&in0[i], coeff);
+ storeu_output_avx2(&in1[i], coeff + 16);
+ coeff += stride;
+ i += 1;
+ }
+}
+
+#if CONFIG_EXT_TX
+static void fhalfright32_16col_avx2(__m256i *in) {
+ int i = 0;
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i sqrt2 = _mm256_set1_epi16(Sqrt2);
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i x0, x1;
+
+ while (i < 16) {
+ in[i] = _mm256_slli_epi16(in[i], 2);
+ x0 = _mm256_unpacklo_epi16(in[i + 16], zero);
+ x1 = _mm256_unpackhi_epi16(in[i + 16], zero);
+ x0 = _mm256_madd_epi16(x0, sqrt2);
+ x1 = _mm256_madd_epi16(x1, sqrt2);
+ x0 = _mm256_add_epi32(x0, dct_rounding);
+ x1 = _mm256_add_epi32(x1, dct_rounding);
+ x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS);
+ x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS);
+ in[i + 16] = _mm256_packs_epi32(x0, x1);
+ i += 1;
+ }
+ fdct16_avx2(&in[16]);
+}
+
+static void fhalfright32_avx2(__m256i *in0, __m256i *in1) {
+ fhalfright32_16col_avx2(in0);
+ fhalfright32_16col_avx2(in1);
+ mm256_vectors_swap(in0, &in0[16], 16);
+ mm256_vectors_swap(in1, &in1[16], 16);
+ mm256_transpose_32x32(in0, in1);
+}
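+
+/* "Half-right" 32-point ADST approximation: the top 16 rows are kept as a
+ * scaled identity (<< 2), the bottom 16 rows are rescaled by Sqrt2 and run
+ * through a 16-point DCT, and the halves are then swapped so the DCT
+ * outputs come first. */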
+#endif // CONFIG_EXT_TX
+
+static INLINE void load_buffer_32x32(const int16_t *input, int stride,
+ int flipud, int fliplr, __m256i *in0,
+ __m256i *in1) {
+ // Load 4 16x16 blocks
+ const int16_t *topL = input;
+ const int16_t *topR = input + 16;
+ const int16_t *botL = input + 16 * stride;
+ const int16_t *botR = input + 16 * stride + 16;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ // Swap the top and bottom blocks in the left half
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ // Swap the top and bottom blocks in the right half
+ tmp = topR;
+ topR = botR;
+ botR = tmp;
+ }
+
+ if (fliplr) {
+ // Swap the left and right blocks in the top half
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ // Swap the left and right blocks in the bottom half
+ tmp = botL;
+ botL = botR;
+ botR = tmp;
+ }
+
+ // load first 16 columns
+ load_buffer_16x16(topL, stride, flipud, fliplr, in0);
+ load_buffer_16x16(botL, stride, flipud, fliplr, in0 + 16);
+
+ // load second 16 columns
+ load_buffer_16x16(topR, stride, flipud, fliplr, in1);
+ load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16);
+}
+
+static INLINE void right_shift_32x32_16col(int bit, __m256i *in) {
+ int i = 0;
+ const __m256i rounding = _mm256_set1_epi16((1 << bit) >> 1);
+ __m256i sign;
+ while (i < 32) {
+ sign = _mm256_srai_epi16(in[i], 15);
+ in[i] = _mm256_add_epi16(in[i], rounding);
+ in[i] = _mm256_add_epi16(in[i], sign);
+ in[i] = _mm256_srai_epi16(in[i], bit);
+ i += 1;
+ }
+}
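+
+/* Scalar equivalent of the shift above (a sketch): negative values are
+ * biased by one less so the arithmetic shift rounds symmetrically:
+ *   v = (v + ((1 << bit) >> 1) + (v < 0 ? -1 : 0)) >> bit;
+ */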
+
+// Rounding right shift by 4 between the column and row passes
+static INLINE void right_shift_32x32(__m256i *in0, __m256i *in1) {
+ const int bit = 4;
+ right_shift_32x32_16col(bit, in0);
+ right_shift_32x32_16col(bit, in1);
+}
+
+#if CONFIG_EXT_TX
+static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
+ int i = 0;
+ while (i < 32) {
+ in0[i] = _mm256_slli_epi16(in0[i], 2);
+ in1[i] = _mm256_slli_epi16(in1[i], 2);
+ i += 1;
+ }
+ mm256_transpose_32x32(in0, in1);
+}
+#endif
+
+void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m256i in0[32]; // left 16 columns (32 rows x 16 int16 each)
+ __m256i in1[32]; // right 16 columns
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fdct32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case DCT_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fdct32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case ADST_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fdct32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_32x32(input, stride, 1, 1, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case IDTX:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fidtx32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case V_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fdct32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fidtx32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case V_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fidtx32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case V_FLIPADST:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fidtx32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_32x32(in0, in1, output);
+ _mm256_zeroupper();
+}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 000000000..7186b6b92
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,215 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; void av1_temporal_filter_apply_sse2 | arg
+; (unsigned char *frame1, | 0
+; unsigned int stride, | 1
+; unsigned char *frame2, | 2
+; unsigned int block_width, | 3
+; unsigned int block_height, | 4
+; int strength, | 5
+; int filter_weight, | 6
+; unsigned int *accumulator, | 7
+; unsigned short *count) | 8
+global sym(av1_temporal_filter_apply_sse2) PRIVATE
+sym(av1_temporal_filter_apply_sse2):
+
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ALIGN_STACK 16, rax
+ %define block_width 0
+ %define block_height 16
+ %define strength 32
+ %define filter_weight 48
+ %define rounding_bit 64
+ %define rbp_backup 80
+ %define stack_size 96
+ sub rsp, stack_size
+ mov [rsp + rbp_backup], rbp
+ ; end prolog
+
+ mov edx, arg(3)
+ mov [rsp + block_width], rdx
+ mov edx, arg(4)
+ mov [rsp + block_height], rdx
+ movd xmm6, arg(5)
+ movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+ ; calculate the rounding bit outside the loop
+ ; 0x8000 >> (16 - strength)
+ mov rdx, 16
+ sub rdx, arg(5) ; 16 - strength
+ movq xmm4, rdx ; can't use rdx w/ shift
+ movdqa xmm5, [GLOBAL(_const_top_bit)]
+ psrlw xmm5, xmm4
+ movdqa [rsp + rounding_bit], xmm5
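+ ; note (0x8000 >> (16 - strength)) == (1 << (strength - 1)), the usual
+ ; +0.5 bias for the ">> strength" performed inside the loop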
+
+ mov rsi, arg(0) ; src/frame1
+ mov rdx, arg(2) ; predictor frame
+ mov rdi, arg(7) ; accumulator
+ mov rax, arg(8) ; count
+
+ ; dup the filter weight and store for later
+ movd xmm0, arg(6) ; filter_weight
+ pshuflw xmm0, xmm0, 0
+ punpcklwd xmm0, xmm0
+ movdqa [rsp + filter_weight], xmm0
+
+ mov rbp, arg(1) ; stride
+ pxor xmm7, xmm7 ; zero for extraction
+
+ mov rcx, [rsp + block_width]
+ imul rcx, [rsp + block_height]
+ add rcx, rdx
+ cmp dword ptr [rsp + block_width], 8
+ jne .temporal_filter_apply_load_16
+
+.temporal_filter_apply_load_8:
+ movq xmm0, [rsi] ; first row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ movq xmm1, [rsi] ; second row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm1, xmm7 ; src[ 8-15]
+ jmp .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+ movdqa xmm0, [rsi] ; src (frame1)
+ lea rsi, [rsi + rbp] ; += stride
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ punpckhbw xmm1, xmm7 ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+ movdqa xmm2, [rdx] ; predictor (frame2)
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm3, xmm7 ; pred[ 8-15]
+
+ ; modifier = src_byte - pixel_value
+ psubw xmm0, xmm2 ; src - pred[ 0- 7]
+ psubw xmm1, xmm3 ; src - pred[ 8-15]
+
+ ; modifier *= modifier
+ pmullw xmm0, xmm0 ; modifier[ 0- 7]^2
+ pmullw xmm1, xmm1 ; modifier[ 8-15]^2
+
+ ; modifier *= 3
+ pmullw xmm0, [GLOBAL(_const_3w)]
+ pmullw xmm1, [GLOBAL(_const_3w)]
+
+ ; modifier += 0x8000 >> (16 - strength)
+ paddw xmm0, [rsp + rounding_bit]
+ paddw xmm1, [rsp + rounding_bit]
+
+ ; modifier >>= strength
+ psrlw xmm0, [rsp + strength]
+ psrlw xmm1, [rsp + strength]
+
+ ; modifier = 16 - modifier
+ ; saturation takes care of modifier > 16
+ movdqa xmm3, [GLOBAL(_const_16w)]
+ movdqa xmm2, [GLOBAL(_const_16w)]
+ psubusw xmm3, xmm1
+ psubusw xmm2, xmm0
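+
+ ; scalar form of the per-pixel weight so far (a sketch):
+ ; diff = src - pred
+ ; mod = 16 - min(16, (3 * diff * diff + (1 << (strength - 1))) >> strength)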
+
+ ; modifier *= filter_weight
+ pmullw xmm2, [rsp + filter_weight]
+ pmullw xmm3, [rsp + filter_weight]
+
+ ; count
+ movdqa xmm4, [rax]
+ movdqa xmm5, [rax+16]
+ ; += modifier
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ ; write back
+ movdqa [rax], xmm4
+ movdqa [rax+16], xmm5
+ lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
+
+ ; load and extract the predictor up to shorts
+ pxor xmm7, xmm7
+ movdqa xmm0, [rdx]
+ lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm1, xmm7 ; pred[ 8-15]
+
+ ; modifier *= pixel_value
+ pmullw xmm0, xmm2
+ pmullw xmm1, xmm3
+
+ ; expand to double words
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm7 ; [ 0- 3]
+ punpckhwd xmm2, xmm7 ; [ 4- 7]
+ movdqa xmm3, xmm1
+ punpcklwd xmm1, xmm7 ; [ 8-11]
+ punpckhwd xmm3, xmm7 ; [12-15]
+
+ ; accumulator
+ movdqa xmm4, [rdi]
+ movdqa xmm5, [rdi+16]
+ movdqa xmm6, [rdi+32]
+ movdqa xmm7, [rdi+48]
+ ; += modifier
+ paddd xmm4, xmm0
+ paddd xmm5, xmm2
+ paddd xmm6, xmm1
+ paddd xmm7, xmm3
+ ; write back
+ movdqa [rdi], xmm4
+ movdqa [rdi+16], xmm5
+ movdqa [rdi+32], xmm6
+ movdqa [rdi+48], xmm7
+ lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+ cmp rdx, rcx
+ je .temporal_filter_apply_epilog
+ pxor xmm7, xmm7 ; zero for extraction
+ cmp dword ptr [rsp + block_width], 16
+ je .temporal_filter_apply_load_16
+ jmp .temporal_filter_apply_load_8
+
+.temporal_filter_apply_epilog:
+ ; begin epilog
+ mov rbp, [rsp + rbp_backup]
+ add rsp, stack_size
+ pop rsp
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+ times 8 dw 3
+align 16
+_const_top_bit:
+ times 8 dw 1<<15
+align 16
+_const_16w:
+ times 8 dw 16
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
new file mode 100644
index 000000000..bf233ca4d
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
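+/* Scalar reference for the loop below (a sketch; clamp() as in
+ * aom_dsp_common.h, MAX_MASK_VALUE == 1 << WEDGE_WEIGHT_BITS == 64):
+ *   uint64_t csse = 0;
+ *   for (int i = 0; i < N; ++i) {
+ *     const int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+ *     const int16_t ts = (int16_t)clamp(t, INT16_MIN, INT16_MAX);
+ *     csse += (uint64_t)(ts * ts);
+ *   }
+ *   return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+ */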
+uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ int n = -N;
+ int n8 = n + 8;
+
+ uint64_t csse;
+
+ const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
+ const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+
+ __m128i v_acc0_q = _mm_setzero_si128();
+
+ assert(N % 64 == 0);
+
+ r1 += N;
+ d += N;
+ m += N;
+
+ do {
+ const __m128i v_r0_w = xx_load_128(r1 + n);
+ const __m128i v_r1_w = xx_load_128(r1 + n8);
+ const __m128i v_d0_w = xx_load_128(d + n);
+ const __m128i v_d1_w = xx_load_128(d + n8);
+ const __m128i v_m01_b = xx_load_128(m + n);
+
+ const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
+ const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+
+ const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
+ const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
+
+ const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
+ const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
+ const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
+ const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
+
+ const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
+ const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
+
+ const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
+ const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
+
+ const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
+ _mm_srli_epi64(v_sq0_d, 32));
+ const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
+ _mm_srli_epi64(v_sq1_d, 32));
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
+
+ n8 += 16;
+ n += 16;
+ } while (n);
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if ARCH_X86_64
+ csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+ xx_storel_64(&csse, v_acc0_q);
+#endif
+
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
+int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc;
+
+ __m128i v_sign_d;
+ __m128i v_acc0_d = _mm_setzero_si128();
+ __m128i v_acc1_d = _mm_setzero_si128();
+ __m128i v_acc_q;
+
+ // Input size is limited to 8192 by the use of 32-bit accumulators and m
+ // taking values in [0, 64]. Overflow could occur for larger inputs,
+ // though it is practically impossible on real video input.
+ assert(N < 8192);
+ assert(N % 64 == 0);
+
+ do {
+ const __m128i v_m01_b = xx_load_128(m);
+ const __m128i v_m23_b = xx_load_128(m + 16);
+ const __m128i v_m45_b = xx_load_128(m + 32);
+ const __m128i v_m67_b = xx_load_128(m + 48);
+
+ const __m128i v_d0_w = xx_load_128(ds);
+ const __m128i v_d1_w = xx_load_128(ds + 8);
+ const __m128i v_d2_w = xx_load_128(ds + 16);
+ const __m128i v_d3_w = xx_load_128(ds + 24);
+ const __m128i v_d4_w = xx_load_128(ds + 32);
+ const __m128i v_d5_w = xx_load_128(ds + 40);
+ const __m128i v_d6_w = xx_load_128(ds + 48);
+ const __m128i v_d7_w = xx_load_128(ds + 56);
+
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
+ const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
+
+ const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+ const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+ const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
+ const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
+ const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
+ const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
+ const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
+ const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
+
+ const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
+ const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
+ const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
+ const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
+
+ const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
+ const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
+
+ v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
+ v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
+
+ ds += 64;
+ m += 64;
+
+ N -= 64;
+ } while (N);
+
+ v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
+ v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+ v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
+ v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
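+
+ // The cmplt/unpack pairs above sign-extend each 32-bit partial sum to 64
+ // bits (value interleaved with its sign mask) ahead of the 64-bit adds.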
+
+ v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if ARCH_X86_64
+ acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+ xx_storel_64(&acc, v_acc_q);
+#endif
+
+ return acc > limit;
+}
+
+// Negate under mask
+static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
+ return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
+}
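+
+/* Two's-complement identity behind negm_epi16(): (v ^ m) - m negates lanes
+ * where m is all ones (-x == ~x + 1) and leaves lanes with m == 0 intact. */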
+
+/**
+ * See av1_wedge_compute_delta_squares_c
+ */
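+/* Scalar form of the loop below (a sketch): each (a, b) pair is madd-ed
+ * against (a, -b), so each 32-bit lane holds a*a - b*b, then the results
+ * are saturating-packed back to int16:
+ *   d[i] = (int16_t)clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX);
+ */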
+void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ const __m128i v_neg_w =
+ _mm_set_epi16(0xffff, 0, 0xffff, 0, 0xffff, 0, 0xffff, 0);
+
+ assert(N % 64 == 0);
+
+ do {
+ const __m128i v_a0_w = xx_load_128(a);
+ const __m128i v_b0_w = xx_load_128(b);
+ const __m128i v_a1_w = xx_load_128(a + 8);
+ const __m128i v_b1_w = xx_load_128(b + 8);
+ const __m128i v_a2_w = xx_load_128(a + 16);
+ const __m128i v_b2_w = xx_load_128(b + 16);
+ const __m128i v_a3_w = xx_load_128(a + 24);
+ const __m128i v_b3_w = xx_load_128(b + 24);
+
+ const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
+ const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
+
+ // Negate top word of pairs
+ const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
+ const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
+ const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
+ const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
+ const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
+ const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
+ const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
+ const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
+
+ const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
+ const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
+ const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
+ const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
+ const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
+ const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
+ const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
+ const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+ const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
+ const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
+ const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
+ const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
+
+ xx_store_128(d, v_r0_w);
+ xx_store_128(d + 8, v_r1_w);
+ xx_store_128(d + 16, v_r2_w);
+ xx_store_128(d + 24, v_r3_w);
+
+ a += 32;
+ b += 32;
+ d += 32;
+ N -= 32;
+ } while (N);
+}
diff --git a/third_party/aom/av1/exports_dec b/third_party/aom/av1/exports_dec
new file mode 100644
index 000000000..05860e8c0
--- /dev/null
+++ b/third_party/aom/av1/exports_dec
@@ -0,0 +1,2 @@
+data aom_codec_av1_dx_algo
+text aom_codec_av1_dx
diff --git a/third_party/aom/av1/exports_enc b/third_party/aom/av1/exports_enc
new file mode 100644
index 000000000..dc4a9eae7
--- /dev/null
+++ b/third_party/aom/av1/exports_enc
@@ -0,0 +1,2 @@
+data aom_codec_av1_cx_algo
+text aom_codec_av1_cx