210 files changed, 16335 insertions, 5545 deletions
diff --git a/third_party/aom/.cmake-format.py b/third_party/aom/.cmake-format.py
index cebad0742..aa7354c2a 100644
--- a/third_party/aom/.cmake-format.py
+++ b/third_party/aom/.cmake-format.py
@@ -1,3 +1,4 @@
+# Generated with cmake-format 0.3.6
 # How wide to allow formatted cmake files
 line_width = 80
 
diff --git a/third_party/aom/CHANGELOG b/third_party/aom/CHANGELOG
index 7510dc660..d84aa0249 100644
--- a/third_party/aom/CHANGELOG
+++ b/third_party/aom/CHANGELOG
@@ -1,631 +1,5 @@
-Next Release
-  - Incompatible changes:
-    The AV1 encoder's default keyframe interval changed to 128 from 9999.
-    Support for armv6 was removed.
+2018-06-28 v1.0.0
+  AOMedia Codec Workgroup Approved version 1.0
 
 2016-04-07 v0.1.0 "AOMedia Codec 1"
   This release is the first Alliance for Open Media codec.
-2015-11-09 v1.5.0 "Javan Whistling Duck"
-  This release improves upon the VP9 encoder and speeds up the encoding and
-  decoding processes.
-
-  - Upgrading:
-    This release is ABI incompatible with 1.4.0. It drops deprecated VP8
-    controls and adds a variety of VP9 controls for testing.
-
-    The vpxenc utility now prefers VP9 by default.
-
-  - Enhancements:
-    Faster VP9 encoding and decoding
-    Smaller library size by combining functions used by VP8 and VP9
-
-  - Bug Fixes:
-    A variety of fuzzing issues
-
-2015-04-03 v1.4.0 "Indian Runner Duck"
-  This release includes significant improvements to the VP9 codec.
-
-  - Upgrading:
-    This release is ABI incompatible with 1.3.0. It drops the compatibility
-    layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
-    controls for VP9.
-
-  - Enhancements:
-    Faster VP9 encoding and decoding
-    Multithreaded VP9 decoding (tile and frame-based)
-    Multithreaded VP9 encoding - on by default
-    YUV 4:2:2 and 4:4:4 support in VP9
-    10 and 12bit support in VP9
-    64bit ARM support by replacing ARM assembly with intrinsics
-
-  - Bug Fixes:
-    Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
-    files.
-
-  - Known Issues:
-    Frame Parallel decoding fails for segmented and non-420 files.
-
-2013-11-15 v1.3.0 "Forest"
-  This release introduces the VP9 codec in a backward-compatible way.
-  All existing users of VP8 can continue to use the library without
-  modification. However, some VP8 options do not map to VP9 in the same manner.
-
-  The VP9 encoder in this release is not feature complete. Users interested in
-  the encoder are advised to use the git master branch and discuss issues on
-  libvpx mailing lists.
-
-  - Upgrading:
-    This release is ABI and API compatible with Duclair (v1.0.0). Users
-    of older releases should refer to the Upgrading notes in this document
-    for that release.
-
-  - Enhancements:
-      Get rid of bashisms in the main build scripts
-      Added usage info on command line options
-      Add lossless compression mode
-      Dll build of libvpx
-      Add additional Mac OS X targets: 10.7, 10.8 and 10.9 (darwin11-13)
-      Add option to disable documentation
-      configure: add --enable-external-build support
-      make: support V=1 as short form of verbose=yes
-      configure: support mingw-w64
-      configure: support hardfloat armv7 CHOSTS
-      configure: add support for android x86
-      Add estimated completion time to vpxenc
-      Don't exit on decode errors in vpxenc
-      vpxenc: support scaling prior to encoding
-      vpxdec: support scaling output
-      vpxenc: improve progress indicators with --skip
-      msvs: Don't link to winmm.lib
-      Add a new script for producing vcxproj files
-      Produce Visual Studio 10 and 11 project files
-      Produce Windows Phone project files
-      msvs-build: use msbuild for vs >= 2005
-      configure: default configure log to config.log
-      Add encoding option --static-thresh
-
-  - Speed:
-      Miscellaneous speed optimizations for VP8 and VP9.
-
-  - Quality:
-      In general, quality is consistent with the Eider release.
-
-  - Bug Fixes:
-      This release represents approximately a year of engineering effort,
-      and contains multiple bug fixes. Please refer to git history for details.
-
-
-2012-12-21 v1.2.0
-  This release acts as a checkpoint for a large amount of internal refactoring
-  and testing. It also contains a number of small bugfixes, so all users are
-  encouraged to upgrade.
-
-  - Upgrading:
-    This release is ABI and API compatible with Duclair (v1.0.0). Users
-    of older releases should refer to the Upgrading notes in this
-    document for that release.
-
-  - Enhancements:
-      VP8 optimizations for MIPS dspr2
-      vpxenc: add -quiet option
-
-  - Speed:
-      Encoder and decoder speed is consistent with the Eider release.
-
-  - Quality:
-      In general, quality is consistent with the Eider release.
-
-      Minor tweaks to ARNR filtering
-      Minor improvements to real time encoding with multiple temporal layers
-
-  - Bug Fixes:
-      Fixes multithreaded encoder race condition in loopfilter
-      Fixes multi-resolution threaded encoding
-      Fix potential encoder dead-lock after picture resize
-
-
-2012-05-09 v1.1.0 "Eider"
-  This introduces a number of enhancements, mostly focused on real-time
-  encoding. In addition, it fixes a decoder bug (first introduced in
-  Duclair) so all users of that release are encouraged to upgrade.
-
-  - Upgrading:
-    This release is ABI and API compatible with Duclair (v1.0.0). Users
-    of older releases should refer to the Upgrading notes in this
-    document for that release.
-
-    This release introduces a new temporal denoiser, controlled by the
-    VP8E_SET_NOISE_SENSITIVITY control. The temporal denoiser does not
-    currently take a strength parameter, so the control is effectively
-    a boolean - zero (off) or non-zero (on). For compatibility with
-    existing applications, the values accepted are the same as those
-    for the spatial denoiser (0-6). The temporal denoiser is enabled
-    by default, and the older spatial denoiser may be restored by
-    configuring with --disable-temporal-denoising. The temporal denoiser
-    is more computationally intensive than the spatial one.
-
-    This release removes support for a legacy, decode only API that was
-    supported, but deprecated, at the initial release of libvpx
-    (v0.9.0). This is not expected to have any impact. If you are
-    impacted, you can apply a reversion to commit 2bf8fb58 locally.
-    Please update to the latest libvpx API if you are affected.
-
-  - Enhancements:
-      Adds a motion compensated temporal denoiser to the encoder, which
-      gives higher quality than the older spatial denoiser. (See above
-      for notes on upgrading).
-
-      In addition, support for new compilers and platforms were added,
-      including:
-        improved support for XCode
-        Android x86 NDK build
-        OS/2 support
-        SunCC support
-
-      Changing resolution with vpx_codec_enc_config_set() is now
-      supported. Previously, reinitializing the codec was required to
-      change the input resolution.
-
-      The vpxenc application has initial support for producing multiple
-      encodes from the same input in one call. Resizing is not yet
-      supported, but varying other codec parameters is. Use -- to
-      delineate output streams. Options persist from one stream to the
-      next.
-
-      Also, the vpxenc application will now use a keyframe interval of
-      5 seconds by default. Use the --kf-max-dist option to override.
-
-  - Speed:
-      Decoder performance improved 2.5% versus Duclair. Encoder speed is
-      consistent with Duclair for most material. Two pass encoding of
-      slideshow-like material will see significant improvements.
-
-      Large realtime encoding speed gains at a small quality expense are
-      possible by configuring the on-the-fly bitpacking experiment with
-      --enable-onthefly-bitpacking. Realtime encoder can be up to 13%
-      faster (ARM) depending on the number of threads and bitrate
-      settings. This technique sees constant gain over the 5-16 speed
-      range. For VC style input the loss seen is up to 0.2dB. See commit
-      52cf4dca for further details.
-
-  - Quality:
-      On the whole, quality is consistent with the Duclair release. Some
-      tweaks:
-
-        Reduced blockiness in easy sections by applying a penalty to
-        intra modes.
-
-        Improved quality of static sections (like slideshows) with
-        two pass encoding.
-
-        Improved keyframe sizing with multiple temporal layers
-
-  - Bug Fixes:
-      Corrected alt-ref contribution to frame rate for visible updates
-      to the alt-ref buffer. This affected applications making manual
-      usage of the frame reference flags, or temporal layers.
-
-      Additional constraints were added to disable multi-frame quality
-      enhancement (MFQE) in sections of the frame where there is motion.
-      (#392)
-
-      Fixed corruption issues when vpx_codec_enc_config_set() was called
-      with spatial resampling enabled.
-
-      Fixed a decoder error introduced in Duclair where the segmentation
-      map was not being reinitialized on keyframes (#378)
-
-
-2012-01-27 v1.0.0 "Duclair"
-  Our fourth named release, focused on performance and features related to
-  real-time encoding. It also fixes a decoder crash bug introduced in
-  v0.9.7, so all users of that release are encouraged to upgrade.
-
-  - Upgrading:
-      This release is ABI incompatible with prior releases of libvpx, so the
-      "major" version number has been bumped to 1. You must recompile your
-      applications against the latest version of the libvpx headers. The
-      API remains compatible, and this should not require code changes in most
-      applications.
-
-  - Enhancements:
-      This release introduces several substantial new features to the encoder,
-      of particular interest to real time streaming applications.
-
-      Temporal scalability allows the encoder to produce a stream that can
-      be decimated to different frame rates, with independent rate targetting
-      for each substream.
-
-      Multiframe quality enhancement postprocessing can make visual quality
-      more consistent in the presence of frames that are substantially
-      different quality than the surrounding frames, as in the temporal
-      scalability case and in some forced keyframe scenarios.
-
-      Multiple-resolution encoding support allows the encoding of the
-      same content at different resolutions faster than encoding them
-      separately.
-
-  - Speed:
-      Optimization targets for this release included the decoder and the real-
-      time modes of the encoder. Decoder speed on x86 has improved 10.5% with
-      this release. Encoder improvements followed a curve where speeds 1-3
-      improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved
-      1.5% to 10.5%, respectively. "Best" mode speed is consistent with the
-      Cayuga release.
-
-  - Quality:
-      Encoder quality in the single stream case is consistent with the Cayuga
-      release.
-
-  - Bug Fixes:
-      This release fixes an OOB read decoder crash bug present in v0.9.7
-      related to the clamping of motion vectors in SPLITMV blocks. This
-      behavior could be triggered by corrupt input or by starting
-      decoding from a P-frame.
-
-
-2011-08-15 v0.9.7-p1 "Cayuga" patch 1
-  This is an incremental bugfix release against Cayuga. All users of that
-  release are strongly encouraged to upgrade.
-
-    - Fix potential OOB reads (cdae03a)
-
-          An unbounded out of bounds read was discovered when the
-          decoder was requested to perform error concealment (new in
-          Cayuga) given a frame with corrupt partition sizes.
-
-          A bounded out of bounds read was discovered affecting all
-          versions of libvpx. Given an multipartition input frame that
-          is truncated between the mode/mv partition and the first
-          residiual paritition (in the block of partition offsets), up
-          to 3 extra bytes could have been read from the source buffer.
-          The code will not take any action regardless of the contents
-          of these undefined bytes, as the truncated buffer is detected
-          immediately following the read based on the calculated
-          starting position of the coefficient partition.
-
-    - Fix potential error concealment crash when the very first frame
-      is missing or corrupt (a609be5)
-
-    - Fix significant artifacts in error concealment (a4c2211, 99d870a)
-
-    - Revert 1-pass CBR rate control changes (e961317)
-      Further testing showed this change produced undesirable visual
-      artifacts, rolling back for now.
-
-
-2011-08-02 v0.9.7 "Cayuga"
-  Our third named release, focused on a faster, higher quality, encoder.
-
-  - Upgrading:
-    This release is backwards compatible with Aylesbury (v0.9.5) and
-    Bali (v0.9.6). Users of older releases should refer to the Upgrading
-    notes in this document for that release.
-
-  - Enhancements:
-          Stereo 3D format support for vpxenc
-          Runtime detection of available processor cores.
-          Allow specifying --end-usage by enum name
-          vpxdec: test for frame corruption
-          vpxenc: add quantizer histogram display
-          vpxenc: add rate histogram display
-          Set VPX_FRAME_IS_DROPPABLE
-          update configure for ios sdk 4.3
-          Avoid text relocations in ARM vp8 decoder
-          Generate a vpx.pc file for pkg-config.
-          New ways of passing encoded data between encoder and decoder.
-
-  - Speed:
-      This release includes across-the-board speed improvements to the
-      encoder. On x86, these measure at approximately 11.5% in Best mode,
-      21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6).
-      On ARM Cortex A9 with Neon extensions, real-time encoding of video
-      telephony content is 35% faster than Bali on single core and 48%
-      faster on multi-core. On the NVidia Tegra2 platform, real time
-      encoding is 40% faster than Bali.
-
-      Decoder speed was not a priority for this release, but improved
-      approximately 8.4% on x86.
-
-          Reduce motion vector search on alt-ref frame.
-          Encoder loopfilter running in its own thread
-          Reworked loopfilter to precalculate more parameters
-          SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}().
-          Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3.
-          Removed redundant checks
-          Reduced structure sizes
-          utilize preload in ARMv6 MC/LPF/Copy routines
-          ARM optimized quantization, dfct, variance, subtract
-          Increase chrow row alignment to 16 bytes.
-          disable trellis optimization for first pass
-          Write SSSE3 sub-pixel filter function
-          Improve SSE2 half-pixel filter funtions
-          Add vp8_sub_pixel_variance16x8_ssse3 function
-          Reduce unnecessary distortion computation
-          Use diamond search to replace full search
-          Preload reference area in sub-pixel motion search (real-time mode)
-
-  - Quality:
-      This release focused primarily on one-pass use cases, including
-      video conferencing. Low latency data rate control was significantly
-      improved, improving streamability over bandwidth constrained links.
-      Added support for error concealment, allowing frames to maintain
-      visual quality in the presence of substantial packet loss.
-
-          Add rc_max_intra_bitrate_pct control
-          Limit size of initial keyframe in one-pass.
-          Improve framerate adaptation
-          Improved 1-pass CBR rate control
-          Improved KF insertion after fades to still.
-          Improved key frame detection.
-          Improved activity masking (lower PSNR impact for same SSIM boost)
-          Improved interaction between GF and ARFs
-          Adding error-concealment to the decoder.
-          Adding support for independent partitions
-          Adjusted rate-distortion constants
-
-
-  - Bug Fixes:
-          Removed firstpass motion map
-          Fix parallel make install
-          Fix multithreaded encoding for 1 MB wide frame
-          Fixed iwalsh_neon build problems with RVDS4.1
-          Fix semaphore emulation, spin-wait intrinsics on Windows
-          Fix build with xcode4 and simplify GLOBAL.
-          Mark ARM asm objects as allowing a non-executable stack.
-          Fix vpxenc encoding incorrect webm file header on big endian
-
-
-2011-03-07 v0.9.6 "Bali"
-  Our second named release, focused on a faster, higher quality, encoder.
-
-  - Upgrading:
-    This release is backwards compatible with Aylesbury (v0.9.5). Users
-    of older releases should refer to the Upgrading notes in this
-    document for that release.
-
-  - Enhancements:
-      vpxenc --psnr shows a summary when encode completes
-      --tune=ssim option to enable activity masking
-      improved postproc visualizations for development
-      updated support for Apple iOS to SDK 4.2
-      query decoder to determine which reference frames were updated
-      implemented error tracking in the decoder
-      fix pipe support on windows
-
-  - Speed:
-      Primary focus was on good quality mode, speed 0. Average improvement
-      on x86 about 40%, up to 100% on user-generated content at that speed.
-      Best quality mode speed improved 35%, and realtime speed 10-20%. This
-      release also saw significant improvement in realtime encoding speed
-      on ARM platforms.
-
-        Improved encoder threading
-        Dont pick encoder filter level when loopfilter is disabled.
-        Avoid double copying of key frames into alt and golden buffer
-        FDCT optimizations.
-        x86 sse2 temporal filter
-        SSSE3 version of fast quantizer
-        vp8_rd_pick_best_mbsegmentation code restructure
-        Adjusted breakout RD for SPLITMV
-        Changed segmentation check order
-        Improved rd_pick_intra4x4block
-        Adds armv6 optimized variance calculation
-        ARMv6 optimized sad16x16
-        ARMv6 optimized half pixel variance calculations
-        Full search SAD function optimization in SSE4.1
-        Improve MV prediction accuracy to achieve performance gain
-        Improve MV prediction in vp8_pick_inter_mode() for speed>3
-
-  - Quality:
-      Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release
-      also includes support for "activity masking," which greatly improves
-      SSIM at the expense of PSNR. For now, this feature is available with
-      the --tune=ssim option. Further experimentation in this area
-      is ongoing. This release also introduces a new rate control mode
-      called "CQ," which changes the allocation of bits within a clip to
-      the sections where they will have the most visual impact.
-
-        Tuning for the more exact quantizer.
-        Relax rate control for last few frames
-        CQ Mode
-        Limit key frame quantizer for forced key frames.
-        KF/GF Pulsing
-        Add simple version of activity masking.
-        make rdmult adaptive for intra in quantizer RDO
-        cap the best quantizer for 2nd order DC
-        change the threshold of DC check for encode breakout
-
-  - Bug Fixes:
-      Fix crash on Sparc Solaris.
-      Fix counter of fixed keyframe distance
-      ARNR filter pointer update bug fix
-      Fixed use of motion percentage in KF/GF group calc
-      Changed condition for using RD in Intra Mode
-      Fix encoder real-time only configuration.
-      Fix ARM encoder crash with multiple token partitions
-      Fixed bug first cluster timecode of webm file is wrong.
-      Fixed various encoder bugs with odd-sized images
-      vp8e_get_preview fixed when spatial resampling enabled
-      quantizer: fix assertion in fast quantizer path
-      Allocate source buffers to be multiples of 16
-      Fix for manual Golden frame frequency
-      Fix drastic undershoot in long form content
-
-
-2010-10-28 v0.9.5 "Aylesbury"
-  Our first named release, focused on a faster decoder, and a better encoder.
-
-  - Upgrading:
-    This release incorporates backwards-incompatible changes to the
-    ivfenc and ivfdec tools. These tools are now called vpxenc and vpxdec.
-
-    vpxdec
-      * the -q (quiet) option has been removed, and replaced with
-        -v (verbose). the output is quiet by default. Use -v to see
-        the version number of the binary.
-
-      * The default behavior is now to write output to a single file
-        instead of individual frames. The -y option has been removed.
-        Y4M output is the default.
-
-      * For raw I420/YV12 output instead of Y4M, the --i420 or --yv12
-        options must be specified.
-
-          $ ivfdec -o OUTPUT INPUT
-          $ vpxdec --i420 -o OUTPUT INPUT
-
-      * If an output file is not specified, the default is to write
-        Y4M to stdout. This makes piping more natural.
-
-          $ ivfdec -y -o - INPUT | ...
-          $ vpxdec INPUT | ...
-
-      * The output file has additional flexibility for formatting the
-        filename. It supports escape characters for constructing a
-        filename from the width, height, and sequence number. This
-        replaces the -p option. To get the equivalent:
-
-          $ ivfdec -p frame INPUT
-          $ vpxdec --i420 -o frame-%wx%h-%4.i420 INPUT
-
-    vpxenc
-      * The output file must be specified with -o, rather than as the
-        last argument.
-
-          $ ivfenc <options> INPUT OUTPUT
-          $ vpxenc <options> -o OUTPUT INPUT
-
-      * The output defaults to webm. To get IVF output, use the --ivf
-        option.
-
-          $ ivfenc <options> INPUT OUTPUT.ivf
-          $ vpxenc <options> -o OUTPUT.ivf --ivf INPUT
-
-
-  - Enhancements:
-      ivfenc and ivfdec have been renamed to vpxenc, vpxdec.
-      vpxdec supports .webm input
-      vpxdec writes .y4m by default
-      vpxenc writes .webm output by default
-      vpxenc --psnr now shows the average/overall PSNR at the end
-      ARM platforms now support runtime cpu detection
-      vpxdec visualizations added for motion vectors, block modes, references
-      vpxdec now silent by default
-      vpxdec --progress shows frame-by-frame timing information
-      vpxenc supports the distinction between --fps and --timebase
-      NASM is now a supported assembler
-      configure: enable PIC for shared libs by default
-      configure: add --enable-small
-      configure: support for ppc32-linux-gcc
-      configure: support for sparc-solaris-gcc
-
-  - Bugs:
-      Improve handling of invalid frames
-      Fix valgrind errors in the NEON loop filters.
-      Fix loopfilter delta zero transitions
-      Fix valgrind errors in vp8_sixtap_predict8x4_armv6().
-      Build fixes for darwin-icc
-
-  - Speed:
-      20-40% (average 28%) improvement in libvpx decoder speed,
-      including:
-        Rewrite vp8_short_walsh4x4_sse2()
-        Optimizations on the loopfilters.
-        Miscellaneous improvements for Atom
-        Add 4-tap version of 2nd-pass ARMv6 MC filter.
-        Improved multithread utilization
-        Better instruction choices on x86
-        reorder data to use wider instructions
-        Update NEON wide idcts
-        Make block access to frame buffer sequential
-        Improved subset block search
-        Bilinear subpixel optimizations for ssse3.
-        Decrease memory footprint
-
-      Encoder speed improvements (percentage gain not measured):
-        Skip unnecessary search of identical frames
-        Add SSE2 subtract functions
-        Improve bounds checking in vp8_diamond_search_sadx4()
-        Added vp8_fast_quantize_b_sse2
-
-  - Quality:
-      Over 7% overall PSNR improvement (6.3% SSIM) in "best" quality
-      encoding mode, and up to 60% improvement on very noisy, still
-      or slow moving source video
-
-        Motion compensated temporal filter for Alt-Ref Noise Reduction
-        Improved use of trellis quantization on 2nd order Y blocks
-        Tune effect of motion on KF/GF boost in two pass
-        Allow coefficient optimization for good quality speed 0.
-        Improved control of active min quantizer for two pass.
-        Enable ARFs for non-lagged compress
-
-2010-09-02 v0.9.2
-  - Enhancements:
-      Disable frame dropping by default
-      Improved multithreaded performance
-      Improved Force Key Frame Behaviour
-      Increased rate control buffer level precision
-      Fix bug in 1st pass motion compensation
-      ivfenc: correct fixed kf interval, --disable-kf
-  - Speed:
-      Changed above and left context data layout
-      Rework idct calling structure.
-      Removed unnecessary MB_MODE_INFO copies
-      x86: SSSE3 sixtap prediction
-      Reworked IDCT to include reconstruction (add) step
-      Swap alt/gold/new/last frame buffer ptrs instead of copying.
-      Improve SSE2 loopfilter functions
-      Change bitreader to use a larger window.
-      Avoid loopfilter reinitialization when possible
-  - Quality:
-      Normalize quantizer's zero bin and rounding factors
-      Add trellis quantization.
-      Make the quantizer exact.
-      Updates to ARNR filtering algorithm
-      Fix breakout thresh computation for golden & AltRef frames
-      Redo the forward 4x4 dct
-      Improve the accuracy of forward walsh-hadamard transform
-      Further adjustment of RD behaviour with Q and Zbin.
-  - Build System:
-      Allow linking of libs built with MinGW to MSVC
-      Fix target auto-detection on mingw32
-      Allow --cpu= to work for x86.
-      configure: pass original arguments through to make dist
-      Fix builds without runtime CPU detection
-      msvs: fix install of codec sources
-      msvs: Change devenv.com command line for better msys support
-      msvs: Add vs9 targets.
-      Add x86_64-linux-icc target
-  - Bugs:
-      Potential crashes on older MinGW builds
-      Fix two-pass framrate for Y4M input.
-      Fixed simple loop filter, other crashes on ARM v6
-      arm: fix missing dependency with --enable-shared
-      configure: support directories containing .o
-      Replace pinsrw (SSE) with MMX instructions
-      apple: include proper mach primatives
-      Fixed rate control bug with long key frame interval.
-      Fix DSO link errors on x86-64 when not using a version script
-      Fixed buffer selection for UV in AltRef filtering
-
-
-2010-06-17 v0.9.1
-  - Enhancements:
-      * ivfenc/ivfdec now support YUV4MPEG2 input and pipe I/O
-      * Speed optimizations
-  - Bugfixes:
-      * Rate control
-      * Prevent out-of-bounds accesses on invalid data
-  - Build system updates:
-      * Detect toolchain to be used automatically for native builds
-      * Support building shared libraries
-      * Better autotools emulation (--prefix, --libdir, DESTDIR)
-  - Updated LICENSE
-      * http://webmproject.blogspot.com/2010/06/changes-to-webm-open-source-license.html
-
-
-2010-05-18 v0.9.0
-  - Initial open source release. Welcome to WebM and VP8!
-
diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt
index 0f6a37ffb..0b445722d 100644
--- a/third_party/aom/CMakeLists.txt
+++ b/third_party/aom/CMakeLists.txt
@@ -186,11 +186,9 @@ list(APPEND AOM_ENCODER_APP_UTIL_SOURCES
             "${AOM_ROOT}/examples/encoder_util.h"
             "${AOM_ROOT}/examples/encoder_util.c")
 
-if (ENABLE_EXAMPLES)
-  list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c"
-              "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c"
-              "${AOM_ROOT}/stats/rate_hist.h")
-endif ()
+list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c"
+            "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c"
+            "${AOM_ROOT}/stats/rate_hist.h")
 
 list(APPEND AOM_PKG_CONFIG_SOURCES "${AOM_CONFIG_DIR}/aom.pc")
 
diff --git a/third_party/aom/aom/aomcx.h b/third_party/aom/aom/aomcx.h
index 4cdb5d332..e77e5f693 100644
--- a/third_party/aom/aom/aomcx.h
+++ b/third_party/aom/aom/aomcx.h
@@ -854,6 +854,12 @@ enum aome_enc_control_id {
   /*!\brief Codec control function to set the path to the film grain parameters
    */
   AV1E_SET_FILM_GRAIN_TABLE,
+
+  /*!\brief Sets the noise level */
+  AV1E_SET_DENOISE_NOISE_LEVEL,
+
+  /*!\brief Sets the denoiser's block size */
+  AV1E_SET_DENOISE_BLOCK_SIZE,
 };
 
 /*!\brief aom 1-D scaling mode
@@ -1165,6 +1171,14 @@ AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *)
 AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, int)
 #define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE
 
+#ifdef CONFIG_DENOISE
+AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_NOISE_LEVEL, int);
+#define AOM_CTRL_AV1E_SET_DENOISE_NOISE_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_BLOCK_SIZE, unsigned int);
+#define AOM_CTRL_AV1E_SET_DENOISE_BLOCK_SIZE
+#endif
+
 /*!\endcond */
 /*! @} - end defgroup aom_encoder */
 #ifdef __cplusplus
diff --git a/third_party/aom/aom/aomdx.h b/third_party/aom/aom/aomdx.h
index 7ff21a59b..50ff22410 100644
--- a/third_party/aom/aom/aomdx.h
+++ b/third_party/aom/aom/aomdx.h
@@ -119,6 +119,12 @@ enum aom_dec_control_id {
   /** control function to get the bit depth of the stream. */
   AV1D_GET_BIT_DEPTH,
 
+  /** control function to get the image format of the stream. */
+  AV1D_GET_IMG_FORMAT,
+
+  /** control function to get the size of the tile. */
+  AV1D_GET_TILE_SIZE,
+
   /** control function to set the byte alignment of the planes in the reference
    * buffers. Valid values are power of 2, from 32 to 1024. A value of 0 sets
    * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly
@@ -187,6 +193,12 @@ enum aom_dec_control_id {
    */
   AV1D_EXT_TILE_DEBUG,
 
+  /** control function to enable the row-based multi-threading of decoding. A
+   * value that is equal to 1 indicates that row-based multi-threading is
+   * enabled.
+   */
+  AV1D_SET_ROW_MT,
+
   /** control function to indicate whether bitstream is in Annex-B format. */
   AV1D_SET_IS_ANNEXB,
 
@@ -238,6 +250,10 @@ AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
 #define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
 AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *)
 #define AOM_CTRL_AV1D_GET_BIT_DEPTH
+AOM_CTRL_USE_TYPE(AV1D_GET_IMG_FORMAT, aom_img_fmt_t *)
+#define AOM_CTRL_AV1D_GET_IMG_FORMAT
+AOM_CTRL_USE_TYPE(AV1D_GET_TILE_SIZE, unsigned int *)
+#define AOM_CTRL_AV1D_GET_TILE_SIZE
 AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
 #define AOM_CTRL_AV1D_GET_FRAME_SIZE
 AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
@@ -258,6 +274,8 @@ AOM_CTRL_USE_TYPE(AV1D_SET_EXT_REF_PTR, av1_ext_ref_frame_t *)
 #define AOM_CTRL_AV1D_SET_EXT_REF_PTR
 AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int)
 #define AOM_CTRL_AV1D_EXT_TILE_DEBUG
+AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int)
+#define AOM_CTRL_AV1D_SET_ROW_MT
 AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int)
 #define AOM_CTRL_AV1D_SET_IS_ANNEXB
 AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int)
diff --git a/third_party/aom/aom/internal/aom_codec_internal.h b/third_party/aom/aom/internal/aom_codec_internal.h
index 84ea4eefa..88bf78ef2 100644
--- a/third_party/aom/aom/internal/aom_codec_internal.h
+++ b/third_party/aom/aom/internal/aom_codec_internal.h
@@ -417,7 +417,7 @@ struct aom_internal_error_info {
   aom_codec_err_t error_code;
   int has_detail;
   char detail[80];
-  int setjmp;
+  int setjmp;  // Boolean: whether 'jmp' is valid.
   jmp_buf jmp;
 };
 
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake
index 768875f7d..7c0111a69 100644
--- a/third_party/aom/aom_dsp/aom_dsp.cmake
+++ b/third_party/aom/aom_dsp/aom_dsp.cmake
@@ -83,6 +83,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
 list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
+            "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h"
             "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
             "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
@@ -190,13 +191,16 @@ if(CONFIG_AV1_ENCODER)
               "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm")
 
   list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+              "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c")
+              "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c")
 
   list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
               "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
@@ -205,9 +209,11 @@ if(CONFIG_AV1_ENCODER)
               "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")
 
   list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
+              "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h"
               "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
               "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
 
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
index 5d7d4515b..1514bd64e 100644
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd.c
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
@@ -15,4 +15,4 @@
 
 #include "aom_ports/aom_once.h"
 
-void aom_dsp_rtcd() { once(setup_rtcd_internal); }
+void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); }
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
index a8ac5eb5c..1a9ac3184 100755
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -377,7 +377,7 @@ add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8
 specialize qw/aom_lpf_vertical_14_dual sse2/;
 
 add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_6 sse2/;
+specialize qw/aom_lpf_vertical_6 sse2 neon/;
 
 add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/aom_lpf_vertical_8 sse2 neon/;
@@ -386,13 +386,13 @@ add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_
 specialize qw/aom_lpf_vertical_8_dual sse2/;
 
 add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_4 sse2/;
+specialize qw/aom_lpf_vertical_4 sse2 neon/;
 
 add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/aom_lpf_vertical_4_dual sse2/;
 
 add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_14 sse2/;
+specialize qw/aom_lpf_horizontal_14 sse2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/aom_lpf_horizontal_14_dual sse2/;
@@ -410,7 +410,7 @@ add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint
 specialize qw/aom_lpf_horizontal_8_dual sse2/;
 
 add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_4 sse2/;
+specialize qw/aom_lpf_horizontal_4 sse2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/aom_lpf_horizontal_4_dual sse2/;
@@ -564,7 +564,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   # Block subtraction
   #
   add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-  specialize qw/aom_subtract_block neon msa sse2/;
+  specialize qw/aom_subtract_block neon msa sse2 avx2/;
 
   add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
   specialize qw/aom_highbd_subtract_block sse2/;
@@ -732,14 +732,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   foreach (@block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
-    specialize "aom_masked_sad${w}x${h}", qw/ssse3/;
+    specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/;
   }
 
 
     foreach (@block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
-      specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/;
+      specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
     }
 
 
@@ -750,7 +750,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
     if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-       specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/;
+       specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
     }
   }
 
@@ -759,7 +759,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
       if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-        specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
+        specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
       }
     }
 
@@ -1102,6 +1102,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
     add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
     specialize "aom_obmc_variance${w}x${h}", q/sse4_1/;
+    specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
   }
 
 
@@ -1539,9 +1540,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_comp_mask_pred ssse3 avx2/;
 
   add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-  add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
-                                                           int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd";
-
+  specialize qw/aom_highbd_comp_mask_pred avx2/;
 
 }  # CONFIG_AV1_ENCODER
 
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c
index 69470eeb0..c85b1e910 100644
--- a/third_party/aom/aom_dsp/arm/intrapred_neon.c
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c
@@ -528,3 +528,63 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
     }
   }
 }
+
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+                                       const uint16_t *above,
+                                       const uint16_t *left) {
+  assert(bw >= 4);
+  assert(IS_POWER_OF_TWO(bw));
+  int expected_dc, sum = 0;
+  const int count = bw * 2;
+  uint32x4_t sum_q = vdupq_n_u32(0);
+  uint32x2_t sum_d;
+  uint16_t *dst_1;
+  if (bw >= 8) {
+    for (int i = 0; i < bw; i += 8) {
+      sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
+      sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
+      above += 8;
+      left += 8;
+    }
+    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
+    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
+    expected_dc = (sum + (count >> 1)) / count;
+    const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
+    for (int r = 0; r < bw; r++) {
+      dst_1 = dst;
+      for (int i = 0; i < bw; i += 8) {
+        vst1q_u16(dst_1, dc);
+        dst_1 += 8;
+      }
+      dst += stride;
+    }
+  } else {  // 4x4
+    sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
+    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
+    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
+    expected_dc = (sum + (count >> 1)) / count;
+    const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
+    for (int r = 0; r < bw; r++) {
+      vst1_u16(dst, dc);
+      dst += stride;
+    }
+  }
+}
+
+#define intra_pred_highbd_sized_neon(type, width)               \
+  void aom_highbd_##type##_predictor_##width##x##width##_neon(  \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
+      const uint16_t *left, int bd) {                           \
+    (void)bd;                                                   \
+    highbd_##type##_predictor(dst, stride, width, above, left); \
+  }
+
+#define intra_pred_square(type)           \
+  intra_pred_highbd_sized_neon(type, 4);  \
+  intra_pred_highbd_sized_neon(type, 8);  \
+  intra_pred_highbd_sized_neon(type, 16); \
+  intra_pred_highbd_sized_neon(type, 32); \
+  intra_pred_highbd_sized_neon(type, 64);
+
+intra_pred_square(dc);
+#undef intra_pred_square
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
index ee1a3c78f..bdc67626d 100644
--- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c
+++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
@@ -52,6 +52,36 @@ static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
   return mask_8x8;
 }
 
+static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
+                                  const uint8_t blimit, const uint8_t limit) {
+  uint32x2x2_t p0q0_p1q1;
+  uint16x8_t temp_16x8;
+  uint16x4_t temp0_16x4, temp1_16x4;
+  const uint16x4_t blimit_16x4 = vdup_n_u16(blimit);
+  const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+  uint8x8_t mask_8x8, temp_8x8;
+
+  mask_8x8 = vabd_u8(p1q1, p0q0);
+  mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+  temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+                     vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+  temp_16x8 = vmovl_u8(temp_8x8);
+  temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+  temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+  temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+  temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+  temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  return mask_8x8;
+}
+
 static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
                                        uint8x8_t p1q1, uint8x8_t p0q0) {
   const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
@@ -523,6 +553,68 @@ static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
   }
 }
 
+static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
+                       const uint8_t limit, const uint8_t thresh) {
+  int32x2x2_t ps0_qs0, ps1_qs1;
+  int16x8_t filter_s16;
+  const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+  uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
+  int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+  int8x8_t op0, oq0, op1, oq1;
+  int8x8_t pq_s0, pq_s1;
+  int8x8_t filter_s8, filter1_s8, filter2_s8;
+  int8x8_t hev_8x8;
+  const int8x8_t sign_mask = vdup_n_s8(0x80);
+  const int8x8_t val_4 = vdup_n_s8(4);
+  const int8x8_t val_3 = vdup_n_s8(3);
+
+  // Calculate filter mask
+  mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
+
+  pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+  pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+  ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+  ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+  ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+  qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+  ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+  qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+  // hev_mask
+  temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+  temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+  hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+  // add outer taps if we have high edge variance
+  filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+  filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+  // inner taps
+  temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+  filter_s16 = vmovl_s8(filter_s8);
+  filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+  filter_s8 = vqmovn_s16(filter_s16);
+  filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+  filter1_s8 = vqadd_s8(filter_s8, val_4);
+  filter2_s8 = vqadd_s8(filter_s8, val_3);
+  filter1_s8 = vshr_n_s8(filter1_s8, 3);
+  filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+  oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+  op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+  filter_s8 = vrshr_n_s8(filter1_s8, 1);
+  filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+  oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+  op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+  *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+  *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+}
+
 void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
   uint8x16_t row0, row1, row2, row3;
@@ -646,6 +738,125 @@ void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
   store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
 }
 
+void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  uint32x2x2_t p2q2_p1q1, pxqy_p0q0;
+  uint32x2_t pq_rev;
+  uint8x8_t pxq0, p2q1, p1q2, p0qy;
+  uint8x8_t p0q0, p1q1, p2q2, pxqy;
+
+  // row0: px p2 p1 p0 | q0 q1 q2 qy
+  // row1: px p2 p1 p0 | q0 q1 q2 qy
+  // row2: px p2 p1 p0 | q0 q1 q2 qy
+  // row3: px p2 p1 p0 | q0 q1 q2 qy
+  load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
+
+  transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
+  pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+
+  p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+  p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+  p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+  pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+
+  lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+  pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+  p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+  p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+  p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+  pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+  transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+
+  store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
+}
+
+void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
+  uint32x2_t pq_rev;
+  uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), q0q1, p0q0, p1q1;
+
+  // row0: p1 p0 | q0 q1
+  // row1: p1 p0 | q0 q1
+  // row2: p1 p0 | q0 q1
+  // row3: p1 p0 | q0 q1
+  load_u8_4x1(src - 2, &p1p0, 0);
+  load_u8_4x1((src - 2) + 1 * stride, &p1p0, 1);
+  load_u8_4x1((src - 2) + 2 * stride, &q0q1, 0);
+  load_u8_4x1((src - 2) + 3 * stride, &q0q1, 1);
+
+  transpose_u8_4x4(&p1p0, &q0q1);
+
+  p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
+
+  pq_rev = vrev64_u32(p1q0_p0q1.val[1]);
+  p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev);
+
+  p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]);
+  p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]);
+
+  lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+
+  p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0));
+
+  p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
+  q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
+
+  transpose_u8_4x4(&p1p0, &q0q1);
+
+  store_u8_4x1(src - 2, p1p0, 0);
+  store_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
+  store_u8_4x1((src - 2) + 2 * stride, p1p0, 1);
+  store_u8_4x1((src - 2) + 3 * stride, q0q1, 1);
+}
+
+void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, UNINITIALIZED_IS_SAFE(p6q6);
+
+  load_u8_4x1(src - 7 * stride, &p6q6, 0);
+  load_u8_4x1(src - 6 * stride, &p5q5, 0);
+  load_u8_4x1(src - 5 * stride, &p4q4, 0);
+  load_u8_4x1(src - 4 * stride, &p3q3, 0);
+  load_u8_4x1(src - 3 * stride, &p2q2, 0);
+  load_u8_4x1(src - 2 * stride, &p1q1, 0);
+  load_u8_4x1(src - 1 * stride, &p0q0, 0);
+  load_u8_4x1(src + 0 * stride, &p0q0, 1);
+  load_u8_4x1(src + 1 * stride, &p1q1, 1);
+  load_u8_4x1(src + 2 * stride, &p2q2, 1);
+  load_u8_4x1(src + 3 * stride, &p3q3, 1);
+  load_u8_4x1(src + 4 * stride, &p4q4, 1);
+  load_u8_4x1(src + 5 * stride, &p5q5, 1);
+  load_u8_4x1(src + 6 * stride, &p6q6, 1);
+
+  lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+              *thresh);
+
+  store_u8_4x1(src - 6 * stride, p5q5, 0);
+  store_u8_4x1(src - 5 * stride, p4q4, 0);
+  store_u8_4x1(src - 4 * stride, p3q3, 0);
+  store_u8_4x1(src - 3 * stride, p2q2, 0);
+  store_u8_4x1(src - 2 * stride, p1q1, 0);
+  store_u8_4x1(src - 1 * stride, p0q0, 0);
+  store_u8_4x1(src + 0 * stride, p0q0, 1);
+  store_u8_4x1(src + 1 * stride, p1q1, 1);
+  store_u8_4x1(src + 2 * stride, p2q2, 1);
+  store_u8_4x1(src + 3 * stride, p3q3, 1);
+  store_u8_4x1(src + 4 * stride, p4q4, 1);
+  store_u8_4x1(src + 5 * stride, p5q5, 1);
+}
+
 void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
   uint8x8_t p0q0, p1q1, p2q2, p3q3;
@@ -698,3 +909,20 @@ void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
   vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
   vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
 }
+
+void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t p0q0, UNINITIALIZED_IS_SAFE(p1q1);
+
+  load_u8_4x1(src - 2 * stride, &p1q1, 0);
+  load_u8_4x1(src - 1 * stride, &p0q0, 0);
+  load_u8_4x1(src + 0 * stride, &p0q0, 1);
+  load_u8_4x1(src + 1 * stride, &p1q1, 1);
+
+  lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+
+  store_u8_4x1(src - 2 * stride, p1q1, 0);
+  store_u8_4x1(src - 1 * stride, p0q0, 0);
+  store_u8_4x1(src + 0 * stride, p0q0, 1);
+  store_u8_4x1(src + 1 * stride, p1q1, 1);
+}
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c
index 68fc381f2..02b5ef924 100644
--- a/third_party/aom/aom_dsp/bitreader_buffer.c
+++ b/third_party/aom/aom_dsp/bitreader_buffer.c
@@ -8,11 +8,14 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+
+#include <assert.h>
+
 #include "config/aom_config.h"
 
 #include "aom_dsp/bitreader_buffer.h"
 
-size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) {
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) {
   return (rb->bit_offset + 7) >> 3;
 }
 
@@ -31,6 +34,7 @@ int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
 }
 
 int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
+  assert(bits <= 31);
   int value = 0, bit;
   for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
   return value;
@@ -38,6 +42,7 @@ int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
 
 uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb,
                                       int bits) {
+  assert(bits <= 32);
   uint32_t value = 0;
   int bit;
   for (bit = bits - 1; bit >= 0; bit--)
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h
index 2dafe11ad..5c94ab883 100644
--- a/third_party/aom/aom_dsp/bitreader_buffer.h
+++ b/third_party/aom/aom_dsp/bitreader_buffer.h
@@ -31,7 +31,7 @@ struct aom_read_bit_buffer {
   aom_rb_error_handler error_handler;
 };
 
-size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb);
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb);
 
 int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
 
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c
index 21314eb2a..a563bf684 100644
--- a/third_party/aom/aom_dsp/bitwriter_buffer.c
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.c
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <assert.h>
 #include <limits.h>
 #include <stdlib.h>
 
@@ -49,12 +50,14 @@ void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) {
 }
 
 void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
+  assert(bits <= 31);
   int bit;
   for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
 }
 
 void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
                                    uint32_t data, int bits) {
+  assert(bits <= 32);
   int bit;
   for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
 }
diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c
index fcb6c290e..ff1ec41a2 100644
--- a/third_party/aom/aom_dsp/grain_synthesis.c
+++ b/third_party/aom/aom_dsp/grain_synthesis.c
@@ -17,6 +17,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <assert.h>
 #include "aom_dsp/grain_synthesis.h"
 #include "aom_mem/aom_mem.h"
 
@@ -237,7 +238,7 @@ static int grain_max;
 
 static uint16_t random_register = 0;  // random number generator register
 
-static void init_arrays(aom_film_grain_t *params, int luma_stride,
+static void init_arrays(const aom_film_grain_t *params, int luma_stride,
                         int chroma_stride, int ***pred_pos_luma_p,
                         int ***pred_pos_chroma_p, int **luma_grain_block,
                         int **cb_grain_block, int **cr_grain_block,
@@ -331,7 +332,7 @@ static void init_arrays(aom_film_grain_t *params, int luma_stride,
       (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples);
 }
 
-static void dealloc_arrays(aom_film_grain_t *params, int ***pred_pos_luma,
+static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma,
                            int ***pred_pos_chroma, int **luma_grain_block,
                            int **cb_grain_block, int **cr_grain_block,
                            int **y_line_buf, int **cb_line_buf,
@@ -396,10 +397,14 @@ static void init_random_generator(int luma_line, uint16_t seed) {
 }
 
 static void generate_luma_grain_block(
-    aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
+    const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
     int luma_block_size_y, int luma_block_size_x, int luma_grain_stride,
     int left_pad, int top_pad, int right_pad, int bottom_pad) {
-  if (params->num_y_points == 0) return;
+  if (params->num_y_points == 0) {
+    memset(luma_grain_block, 0,
+           sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride);
+    return;
+  }
 
   int bit_depth = params->bit_depth;
   int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
@@ -431,7 +436,7 @@ static void generate_luma_grain_block(
 }
 
 static void generate_chroma_grain_blocks(
-    aom_film_grain_t *params,
+    const aom_film_grain_t *params,
     //                                  int** pred_pos_luma,
     int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block,
     int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y,
@@ -443,7 +448,7 @@ static void generate_chroma_grain_blocks(
   int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
   if (params->num_y_points > 0) ++num_pos_chroma;
   int rounding_offset = (1 << (params->ar_coeff_shift - 1));
-  int chroma_grain_samples = chroma_block_size_y * chroma_block_size_x;
+  int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride;
 
   if (params->num_cb_points || params->chroma_scaling_from_luma) {
     init_random_generator(7 << 5, params->random_seed);
@@ -455,7 +460,8 @@ static void generate_chroma_grain_blocks(
              ((1 << gauss_sec_shift) >> 1)) >>
             gauss_sec_shift;
   } else {
-    memset(cr_grain_block, 0, sizeof(*cr_grain_block) * chroma_grain_samples);
+    memset(cb_grain_block, 0,
+           sizeof(*cb_grain_block) * chroma_grain_block_size);
   }
 
   if (params->num_cr_points || params->chroma_scaling_from_luma) {
@@ -468,7 +474,8 @@ static void generate_chroma_grain_blocks(
              ((1 << gauss_sec_shift) >> 1)) >>
             gauss_sec_shift;
   } else {
-    memset(cb_grain_block, 0, sizeof(*cb_grain_block) * chroma_grain_samples);
+    memset(cr_grain_block, 0,
+           sizeof(*cr_grain_block) * chroma_grain_block_size);
   }
 
   for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++)
@@ -522,7 +529,7 @@ static void generate_chroma_grain_blocks(
     }
 }
 
-static void init_scaling_function(int scaling_points[][2], int num_points,
+static void init_scaling_function(const int scaling_points[][2], int num_points,
                                   int scaling_lut[]) {
   if (num_points == 0) return;
 
@@ -559,7 +566,7 @@ static int scale_LUT(int *scaling_lut, int index, int bit_depth) {
                              (bit_depth - 8));
 }
 
-static void add_noise_to_block(aom_film_grain_t *params, uint8_t *luma,
+static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma,
                                uint8_t *cb, uint8_t *cr, int luma_stride,
                                int chroma_stride, int *luma_grain,
                                int *cb_grain, int *cr_grain,
@@ -675,7 +682,7 @@ static void add_noise_to_block(aom_film_grain_t *params, uint8_t *luma,
 }
 
 static void add_noise_to_block_hbd(
-    aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr,
+    const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr,
     int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain,
     int *cr_grain, int luma_grain_stride, int chroma_grain_stride,
     int half_luma_height, int half_luma_width, int bit_depth,
@@ -903,7 +910,7 @@ static void hor_boundary_overlap(int *top_block, int top_stride,
   }
 }
 
-void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src,
+void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
                         aom_image_t *dst) {
   uint8_t *luma, *cb, *cr;
   int height, width, luma_stride, chroma_stride;
@@ -950,6 +957,11 @@ void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src,
       exit(1);
   }
 
+  assert(params->bit_depth == src->bit_depth);
+
+  dst->fmt = src->fmt;
+  dst->bit_depth = src->bit_depth;
+
   dst->r_w = src->r_w;
   dst->r_h = src->r_h;
   dst->d_w = src->d_w;
@@ -999,15 +1011,13 @@ void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src,
   luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth;
   chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth;
 
-  params->bit_depth = dst->bit_depth;
-
   av1_add_film_grain_run(params, luma, cb, cr, height, width, luma_stride,
                          chroma_stride, use_high_bit_depth, chroma_subsamp_y,
                          chroma_subsamp_x, mc_identity);
   return;
 }
 
-void av1_add_film_grain_run(aom_film_grain_t *params, uint8_t *luma,
+void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
                             uint8_t *cb, uint8_t *cr, int height, int width,
                             int luma_stride, int chroma_stride,
                             int use_high_bit_depth, int chroma_subsamp_y,
diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h
index 016cb12d7..65feb6068 100644
--- a/third_party/aom/aom_dsp/grain_synthesis.h
+++ b/third_party/aom/aom_dsp/grain_synthesis.h
@@ -72,7 +72,7 @@ typedef struct {
 
   int clip_to_restricted_range;
 
-  int bit_depth;  // video bit depth
+  unsigned int bit_depth;  // video bit depth
 
   int chroma_scaling_from_luma;
 
@@ -94,7 +94,7 @@ typedef struct {
  * \param[in]    luma_stride      luma plane stride
  * \param[in]    chroma_stride    chroma plane stride
  */
-void av1_add_film_grain_run(aom_film_grain_t *grain_params, uint8_t *luma,
+void av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
                             uint8_t *cb, uint8_t *cr, int height, int width,
                             int luma_stride, int chroma_stride,
                             int use_high_bit_depth, int chroma_subsamp_y,
@@ -106,10 +106,10 @@ void av1_add_film_grain_run(aom_film_grain_t *grain_params, uint8_t *luma,
  *
  * \param[in]    grain_params     Grain parameters
  * \param[in]    src              Source image
- * \param[in]    dst              Resulting image with grain
+ * \param[out]   dst              Resulting image with grain
  */
-void av1_add_film_grain(aom_film_grain_t *grain_params, aom_image_t *src,
-                        aom_image_t *dst);
+void av1_add_film_grain(const aom_film_grain_t *grain_params,
+                        const aom_image_t *src, aom_image_t *dst);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c
index a1287f74f..5975c62e8 100644
--- a/third_party/aom/aom_dsp/noise_model.c
+++ b/third_party/aom/aom_dsp/noise_model.c
@@ -1458,3 +1458,189 @@ int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
   }
   return init_success;
 }
+
+struct aom_denoise_and_model_t {
+  int block_size;
+  int bit_depth;
+  float noise_level;
+
+  // Size of current denoised buffer and flat_block buffer
+  int width;
+  int height;
+  int y_stride;
+  int uv_stride;
+  int num_blocks_w;
+  int num_blocks_h;
+
+  // Buffers for image and noise_psd allocated on the fly
+  float *noise_psd[3];
+  uint8_t *denoised[3];
+  uint8_t *flat_blocks;
+
+  aom_flat_block_finder_t flat_block_finder;
+  aom_noise_model_t noise_model;
+};
+
+struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+                                                            int block_size,
+                                                            float noise_level) {
+  struct aom_denoise_and_model_t *ctx =
+      (struct aom_denoise_and_model_t *)aom_malloc(
+          sizeof(struct aom_denoise_and_model_t));
+  if (!ctx) {
+    fprintf(stderr, "Unable to allocate denoise_and_model struct\n");
+    return NULL;
+  }
+  memset(ctx, 0, sizeof(*ctx));
+
+  ctx->block_size = block_size;
+  ctx->noise_level = noise_level;
+  ctx->bit_depth = bit_depth;
+
+  ctx->noise_psd[0] =
+      aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size);
+  ctx->noise_psd[1] =
+      aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size);
+  ctx->noise_psd[2] =
+      aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size);
+  if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) {
+    fprintf(stderr, "Unable to allocate noise PSD buffers\n");
+    aom_denoise_and_model_free(ctx);
+    return NULL;
+  }
+  return ctx;
+}
+
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) {
+  aom_free(ctx->flat_blocks);
+  for (int i = 0; i < 3; ++i) {
+    aom_free(ctx->denoised[i]);
+    aom_free(ctx->noise_psd[i]);
+  }
+  aom_noise_model_free(&ctx->noise_model);
+  aom_flat_block_finder_free(&ctx->flat_block_finder);
+  aom_free(ctx);
+}
+
+static int denoise_and_model_realloc_if_necessary(
+    struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) {
+  if (ctx->width == sd->y_width && ctx->height == sd->y_height &&
+      ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride)
+    return 1;
+  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const int block_size = ctx->block_size;
+
+  ctx->width = sd->y_width;
+  ctx->height = sd->y_height;
+  ctx->y_stride = sd->y_stride;
+  ctx->uv_stride = sd->uv_stride;
+
+  for (int i = 0; i < 3; ++i) {
+    aom_free(ctx->denoised[i]);
+    ctx->denoised[i] = NULL;
+  }
+  aom_free(ctx->flat_blocks);
+  ctx->flat_blocks = NULL;
+
+  ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << use_highbd);
+  ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+  ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+  if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) {
+    fprintf(stderr, "Unable to allocate denoise buffers\n");
+    return 0;
+  }
+  ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size;
+  ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size;
+  ctx->flat_blocks = aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h);
+
+  aom_flat_block_finder_free(&ctx->flat_block_finder);
+  if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size,
+                                  ctx->bit_depth, use_highbd)) {
+    fprintf(stderr, "Unable to init flat block finder\n");
+    return 0;
+  }
+
+  const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+                                            ctx->bit_depth, use_highbd };
+  aom_noise_model_free(&ctx->noise_model);
+  if (!aom_noise_model_init(&ctx->noise_model, params)) {
+    fprintf(stderr, "Unable to init noise model\n");
+    return 0;
+  }
+
+  // Simply use a flat PSD (although we could use the flat blocks to estimate
+  // an actual noise PSD).
+  const float y_noise_level =
+      aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level);
+  const float uv_noise_level = aom_noise_psd_get_default_value(
+      ctx->block_size >> sd->subsampling_x, ctx->noise_level);
+  for (int i = 0; i < block_size * block_size; ++i) {
+    ctx->noise_psd[0][i] = y_noise_level;
+    ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level;
+  }
+  return 1;
+}
+
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+                              YV12_BUFFER_CONFIG *sd,
+                              aom_film_grain_t *film_grain) {
+  const int block_size = ctx->block_size;
+  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  uint8_t *raw_data[3] = {
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer,
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer,
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer,
+  };
+  const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] };
+  int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride };
+  int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y };
+
+  if (!denoise_and_model_realloc_if_necessary(ctx, sd)) {
+    fprintf(stderr, "Unable to realloc buffers\n");
+    return 0;
+  }
+
+  aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width,
+                            sd->y_height, strides[0], ctx->flat_blocks);
+
+  if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height,
+                             strides, chroma_sub_log2, ctx->noise_psd,
+                             block_size, ctx->bit_depth, use_highbd)) {
+    fprintf(stderr, "Unable to denoise image\n");
+    return 0;
+  }
+
+  const aom_noise_status_t status = aom_noise_model_update(
+      &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised,
+      sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks,
+      block_size);
+  int have_noise_estimate = 0;
+  if (status == AOM_NOISE_STATUS_OK) {
+    have_noise_estimate = 1;
+  } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
+    aom_noise_model_save_latest(&ctx->noise_model);
+    have_noise_estimate = 1;
+  } else {
+    // Unable to update noise model; proceed if we have a previous estimate.
+    have_noise_estimate =
+        (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0);
+  }
+
+  film_grain->apply_grain = 0;
+  if (have_noise_estimate) {
+    if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) {
+      fprintf(stderr, "Unable to get grain parameters.\n");
+      return 0;
+    }
+    if (!film_grain->random_seed) {
+      film_grain->random_seed = 1071;
+    }
+    memcpy(raw_data[0], ctx->denoised[0],
+           (strides[0] * sd->y_height) << use_highbd);
+    memcpy(raw_data[1], ctx->denoised[1],
+           (strides[1] * sd->uv_height) << use_highbd);
+    memcpy(raw_data[2], ctx->denoised[2],
+           (strides[2] * sd->uv_height) << use_highbd);
+  }
+  return 1;
+}
diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h
index dabeacc14..b07bf8617 100644
--- a/third_party/aom/aom_dsp/noise_model.h
+++ b/third_party/aom/aom_dsp/noise_model.h
@@ -18,6 +18,7 @@ extern "C" {
 
 #include <stdint.h>
 #include "aom_dsp/grain_synthesis.h"
+#include "aom_scale/yv12config.h"
 
 /*!\brief Wrapper of data required to represent linear system of eqns and soln.
  */
@@ -280,6 +281,42 @@ int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
                           int w, int h, int stride[3], int chroma_sub_log2[2],
                           float *noise_psd[3], int block_size, int bit_depth,
                           int use_highbd);
+
+struct aom_denoise_and_model_t;
+
+/*!\brief Denoise the buffer and model the residual noise.
+ *
+ * This is meant to be called sequentially on input frames. The input buffer
+ * is denoised and the residual noise is modelled. The current noise estimate
+ * is populated in grain. Returns true on success. The grain->apply_grain
+ * parameter will be true when the input buffer was successfully denoised and
+ * grain was modelled. Returns false on error.
+ *
+ * \param[in]      ctx   Struct allocated with aom_denoise_and_model_alloc
+ *                       that holds some buffers for denoising and the current
+ *                       noise estimate.
+ * \param[in,out]   buf  The raw input buffer to be denoised.
+ * \param[out]    grain  Output film grain parameters
+ */
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+                              YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain);
+
+/*!\brief Allocates a context that can be used for denoising and noise modeling.
+ *
+ * \param[in]  bit_depth   Bit depth of buffers this will be run on.
+ * \param[in]  block_size  Block size for noise modeling and flat block
+ *                         estimation
+ * \param[in]  noise_level The noise_level (2.5 for moderate noise, and 5 for
+ *                         higher levels of noise)
+ */
+struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+                                                            int block_size,
+                                                            float noise_level);
+
+/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc
+ */
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
index 0e5ae5b68..4b70cc57b 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
@@ -289,6 +289,15 @@ SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
 SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
   return c_v256_shr_s32(a, c);
 }
+SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
+  return c_v256_shl_64(a, c);
+}
+SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
+  return c_v256_shr_u64(a, c);
+}
+SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
+  return c_v256_shr_s64(a, c);
+}
 
 SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) {
   return c_v256_shr_n_byte(a, n);
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
index d367905bc..817ebe15d 100644
--- a/third_party/aom/aom_dsp/variance.c
+++ b/third_party/aom/aom_dsp/variance.c
@@ -386,7 +386,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
     }
   }
 
-  const InterpFilterParams filter =
+  const InterpFilterParams *filter =
       av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
 
   if (!subpel_x_q3 && !subpel_y_q3) {
@@ -413,12 +413,12 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
     const int16_t *const kernel_y =
         av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
     const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
     assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
-                        temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                        intermediate_height);
-    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+    aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
+                        ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+                        width, intermediate_height);
+    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
                        MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
                        width, height);
   }
@@ -974,7 +974,7 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
     }
   }
 
-  const InterpFilterParams filter =
+  const InterpFilterParams *filter =
       av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
 
   if (!subpel_x_q3 && !subpel_y_q3) {
@@ -1004,14 +1004,14 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
     const int16_t *const kernel_y =
         av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
     const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
     assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
+    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
                                ref_stride, CONVERT_TO_BYTEPTR(temp),
                                MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
                                intermediate_height, bd);
     aom_highbd_convolve8_vert(
-        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
         MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
         16, width, height, bd);
   }
@@ -1185,29 +1185,18 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
   }
 }
 
-void aom_highbd_comp_mask_upsampled_pred_c(
+void aom_highbd_comp_mask_upsampled_pred(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
     int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
     int bd) {
-  int i, j;
-
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
                             height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                             bd);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      if (!invert_mask)
-        comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]);
-      else
-        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]);
-    }
-    comp_pred += width;
-    pred += width;
-    mask += mask_stride;
-  }
+  aom_highbd_comp_mask_pred(comp_pred, pred8, width, height,
+                            CONVERT_TO_BYTEPTR(comp_pred), width, mask,
+                            mask_stride, invert_mask);
 }
 
 #define HIGHBD_MASK_SUBPIX_VAR(W, H)                                           \
diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h
index 544dda944..b954470de 100644
--- a/third_party/aom/aom_dsp/variance.h
+++ b/third_party/aom/aom_dsp/variance.h
@@ -76,6 +76,13 @@ void aom_comp_mask_upsampled_pred(
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
     int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 
+void aom_highbd_comp_mask_upsampled_pred(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int bd);
+
 typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
                                           const int32_t *wsrc,
                                           const int32_t *msk);
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index af45a03ac..f3fe50372 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -41,20 +41,290 @@
 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
 #endif  // __clang__
 
+static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
+                                    const ptrdiff_t stride, const __m256i *a) {
+  *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
+  *((uint32_t *)(output_ptr + stride)) =
+      _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
+}
+
+static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
+  __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
+  a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
+  return a;
+}
+
+static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
+                                    const ptrdiff_t stride, const __m256i *a) {
+  _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
+  _mm_storel_epi64((__m128i *)(output_ptr + stride),
+                   _mm256_extractf128_si256(*a, 1));
+}
+
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+  __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+  a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+  return a;
+}
+
+static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
+                                   const ptrdiff_t stride, const __m256i *a) {
+  _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
+  _mm_store_si128((__m128i *)(output_ptr + stride),
+                  _mm256_extractf128_si256(*a, 1));
+}
+
+static void aom_filter_block1d4_h8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg32, filt1Reg, filt2Reg;
+  __m256i firstFilters, secondFilters;
+  __m256i srcRegFilt32b1_1, srcRegFilt32b2;
+  __m256i srcReg32b1;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+  src_ptr -= 3;
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 32 bits
+  firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+  // duplicate only the second 32 bits
+  secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+  filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+  filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+  // multiply the source and destination strides by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i -= 2) {
+    // load the 2 strides of source
+    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+    // filter the source buffer
+    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+    // multiply 4 adjacent elements with the filter and add the result
+    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+    // filter the source buffer
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+    // multiply 4 adjacent elements with the filter and add the result
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+    srcRegFilt32b1_1 =
+        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt32b1_1 =
+        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+    output_ptr += dst_stride;
+  }
+
+  // if the number of strides is odd.
+  // process only 4 bytes
+  if (i > 0) {
+    __m128i srcReg1, srcRegFilt1_1;
+    __m128i srcRegFilt2;
+
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+    // filter the source buffer
+    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+    // multiply 4 adjacent elements with the filter and add the result
+    srcRegFilt1_1 =
+        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+    // filter the source buffer
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+    // multiply 4 adjacent elements with the filter and add the result
+    srcRegFilt2 =
+        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+    // shift by 6 bit each 16 bit
+    srcRegFilt1_1 =
+        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+    // save 4 bytes
+    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+  }
+}
+
+static void aom_filter_block1d8_h8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m256i srcReg32b1;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+  src_ptr -= 3;
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 256 bit register
+  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across 256 bit register
+  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+  // multiply the source and destination strides by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i -= 2) {
+    // load the 2 strides of source
+    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+    // filter the source buffer
+    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt32b1_1 =
+        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+    output_ptr += dst_stride;
+  }
+
+  // if the number of strides is odd.
+  // process only 8 bytes
+  if (i > 0) {
+    __m128i srcReg1, srcRegFilt1_1;
+    __m128i srcRegFilt2, srcRegFilt3;
+
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+    // filter the source buffer
+    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1_1 =
+        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+    srcRegFilt2 =
+        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 =
+        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+    srcRegFilt2 =
+        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt1_1 =
+        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt1_1 =
+        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+    // save 8 bytes
+    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+  }
+}
+
 static void aom_filter_block1d16_h8_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
   __m128i filtersReg;
-  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
   __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
   __m256i srcReg32b1, srcReg32b2, filtersReg32;
   unsigned int i;
   ptrdiff_t src_stride, dst_stride;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+  src_ptr -= 3;
+  addFilterReg32 = _mm256_set1_epi16(32);
   filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
   // converting the 16 bit (short) to 8 bit (byte) and have the same data
   // in both lanes of 128 bit register.
   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
@@ -74,22 +344,17 @@ static void aom_filter_block1d16_h8_avx2(
   // across 256 bit register
   forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
 
-  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
-  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
-  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
 
   // multiple the size of the source and destination stride by two
   src_stride = src_pixels_per_line << 1;
   dst_stride = output_pitch << 1;
   for (i = output_height; i > 1; i -= 2) {
     // load the 2 strides of source
-    srcReg32b1 =
-        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
-    srcReg32b1 = _mm256_inserti128_si256(
-        srcReg32b1,
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
-        1);
+    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
 
     // filter the source buffer
     srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
@@ -110,22 +375,13 @@ static void aom_filter_block1d16_h8_avx2(
     srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
 
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(
-        srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
 
     // reading 2 strides of the next 16 bytes
     // (part of it was being read by earlier read)
     srcReg32b2 =
-        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
-    srcReg32b2 = _mm256_inserti128_si256(
-        srcReg32b2,
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
-        1);
-
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(
-        srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
 
     // filter the source buffer
     srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
@@ -148,32 +404,21 @@ static void aom_filter_block1d16_h8_avx2(
 
     // add and saturate the results together
     srcRegFilt32b2_1 = _mm256_adds_epi16(
-        srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
-    srcRegFilt32b2_1 = _mm256_adds_epi16(
-        srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
+        srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
 
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
-    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
 
     // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
+    // convolve result and the second lane contain the second convolve result
     srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
 
     src_ptr += src_stride;
 
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr,
-                    _mm256_castsi256_si128(srcRegFilt32b1_1));
-
-    // save the next 16 bits
-    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
-                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
     output_ptr += dst_stride;
   }
 
@@ -183,7 +428,7 @@ static void aom_filter_block1d16_h8_avx2(
     __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
     __m128i srcRegFilt2, srcRegFilt3;
 
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
 
     // filter the source buffer
     srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
@@ -210,15 +455,11 @@ static void aom_filter_block1d16_h8_avx2(
 
     // add and saturate the results together
     srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
 
     // reading the next 16 bytes
     // (part of it was being read by earlier read)
-    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
-
-    // add and saturate the results together
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
 
     // filter the source buffer
     srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
@@ -245,19 +486,16 @@ static void aom_filter_block1d16_h8_avx2(
 
     // add and saturate the results together
     srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+        _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
 
+    // shift by 6 bit each 16 bit
     srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
+        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
 
     srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
-    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
 
     // shrink to 8 bit each 16 bits, the first lane contain the first
     // convolve result and the second lane contain the second convolve
@@ -269,11 +507,163 @@ static void aom_filter_block1d16_h8_avx2(
   }
 }
 
+static void aom_filter_block1d8_v8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg32;
+  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+  __m256i srcReg32b11, srcReg32b12, filtersReg32;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // halve the eight 16 bit taps, then pack them to signed 8 bit; packing the
+  // register with itself puts the same 8 taps in both 64 bit halves.
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 256 bit register
+  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across 256 bit register
+  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+  // double the source and destination strides: two rows per loop iteration
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  // load 8 bytes (epi64 loads) from each of the first 7 rows of the source
+  srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
+  srcReg32b3 =
+      xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+  srcReg32b5 =
+      xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+  srcReg32b7 = _mm256_castsi128_si256(
+      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+
+  // pair consecutive rows in one register (0x21 selects a.hi, then b.lo)
+  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
+  // interleave the bytes of every two consecutive registers except the last
+  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+  for (i = output_height; i > 1; i -= 2) {
+    // load rows 7 and 8 (8 bytes each) and pair consecutive rows in the
+    // two 128 bit lanes: srcReg32b7 = rows {6,7}, srcReg32b8 = rows {7,8}
+    srcReg32b8 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
+    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+                                         _mm256_castsi256_si128(srcReg32b8), 1);
+    srcReg32b9 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
+    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+                                         _mm256_castsi256_si128(srcReg32b9), 1);
+
+    // interleave the bytes of the two newest row pairs; the result is kept
+    // in srcReg32b4 because it is reused by the next iteration
+    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+    // add and saturate the results together
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+    // add the rounding constant 32, then shift each 16 bit value right by 6
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+
+    // pack each 16 bit value to 8 bit with unsigned saturation; the low lane
+    // holds the first output row and the high lane the second output row,
+    // each in its lower 8 bytes
+    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
+
+    output_ptr += dst_stride;
+
+    // rotate the saved row pairs so they line up for the next two rows
+    srcReg32b10 = srcReg32b11;
+    srcReg32b11 = srcReg32b2;
+    srcReg32b2 = srcReg32b4;
+    srcReg32b7 = srcReg32b9;
+  }
+  if (i > 0) {
+    __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
+    // load the last 8 bytes (the row at src_ptr + src_pitch * 7)
+    srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // interleave the bytes of the last two rows
+    srcRegFilt4 =
+        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+                                    _mm256_castsi256_si128(firstFilters));
+    srcRegFilt4 =
+        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+                                    _mm256_castsi256_si128(secondFilters));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+                                    _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt1 =
+        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
+
+    // add the rounding constant 32, then shift each 16 bit value right by 6
+    srcRegFilt1 =
+        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+
+    // pack each 16 bit value to 8 bit with unsigned saturation; the results
+    // end up in the lower 8 bytes and the upper 8 bytes are zero
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
+
+    // save 8 bytes
+    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
+  }
+}
+
 static void aom_filter_block1d16_v8_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
   __m128i filtersReg;
-  __m256i addFilterReg64;
+  __m256i addFilterReg32;
   __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
   __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
   __m256i srcReg32b11, srcReg32b12, filtersReg32;
@@ -281,11 +671,11 @@ static void aom_filter_block1d16_v8_avx2(
   unsigned int i;
   ptrdiff_t src_stride, dst_stride;
 
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+  addFilterReg32 = _mm256_set1_epi16(32);
   filtersReg = _mm_loadu_si128((const __m128i *)filter);
   // converting the 16 bit (short) to  8 bit (byte) and have the
   // same data in both lanes of 128 bit register.
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
   // have the same data in both lanes of a 256 bit register
   filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
@@ -308,49 +698,26 @@ static void aom_filter_block1d16_v8_avx2(
   dst_stride = out_pitch << 1;
 
   // load 16 bytes 7 times in stride of src_pitch
-  srcReg32b1 =
-      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
-  srcReg32b2 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
-  srcReg32b3 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
-  srcReg32b4 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
-  srcReg32b5 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
-  srcReg32b6 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+  srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
+  srcReg32b3 =
+      xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+  srcReg32b5 =
+      xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
   srcReg32b7 = _mm256_castsi128_si256(
       _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
 
   // have each consecutive loads on the same 256 register
-  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
-                                       _mm256_castsi256_si128(srcReg32b2), 1);
-  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
-                                       _mm256_castsi256_si128(srcReg32b3), 1);
-  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
-                                       _mm256_castsi256_si128(srcReg32b4), 1);
-  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
-                                       _mm256_castsi256_si128(srcReg32b5), 1);
-  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
-                                       _mm256_castsi256_si128(srcReg32b6), 1);
-  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
-                                       _mm256_castsi256_si128(srcReg32b7), 1);
-
+  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
   // merge every two consecutive registers except the last one
   srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
   srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
 
   // save
   srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-
-  // save
   srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
-
-  // save
   srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-
-  // save
   srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
 
   for (i = output_height; i > 1; i -= 2) {
@@ -383,9 +750,7 @@ static void aom_filter_block1d16_v8_avx2(
 
     // add and saturate the results together
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));
+                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));
 
     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
@@ -399,16 +764,13 @@ static void aom_filter_block1d16_v8_avx2(
 
     // add and saturate the results together
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                                   _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
+                                   _mm256_adds_epi16(srcReg32b8, srcReg32b12));
 
-    // shift by 7 bit each 16 bit
-    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
-    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
+    // shift by 6 bit each 16 bit
+    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
+    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
 
     // shrink to 8 bit each 16 bits, the first lane contain the first
     // convolve result and the second lane contain the second convolve
@@ -417,12 +779,7 @@ static void aom_filter_block1d16_v8_avx2(
 
     src_ptr += src_stride;
 
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));
-
-    // save the next 16 bits
-    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
-                    _mm256_extractf128_si256(srcReg32b1, 1));
+    xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
 
     output_ptr += dst_stride;
 
@@ -475,24 +832,17 @@ static void aom_filter_block1d16_v8_avx2(
 
     // add and saturate the results together
     srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
+        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
     srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+        _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
 
-    // add and saturate the results together
+    // shift by 6 bit each 16 bit
     srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
+        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
     srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));
-
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
-    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
+        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
 
     // shrink to 8 bit each 16 bits, the first lane contain the first
     // convolve result and the second lane contain the second convolve
@@ -506,21 +856,6 @@ static void aom_filter_block1d16_v8_avx2(
 
 #if HAVE_AVX2 && HAVE_SSSE3
 filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-#if ARCH_X86_64
-filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
-#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_intrin_ssse3
-#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_intrin_ssse3
-#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_intrin_ssse3
-#else  // ARCH_X86
-filter8_1dfunction aom_filter_block1d8_v8_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_ssse3
-#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_ssse3
-#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_ssse3
-#endif  // ARCH_X86_64
 filter8_1dfunction aom_filter_block1d16_v2_ssse3;
 filter8_1dfunction aom_filter_block1d16_h2_ssse3;
 filter8_1dfunction aom_filter_block1d8_v2_ssse3;
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
index 7790baf2e..72fabd236 100644
--- a/third_party/aom/aom_dsp/x86/convolve_avx2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -13,31 +13,27 @@
 #define AOM_DSP_X86_CONVOLVE_AVX2_H_
 
 // filters for 16
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
+  0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
+  2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
+  5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
+  7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
+  10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
+  12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
+  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
 };
 
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
+  0, 1, 2, 3,  1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3,  1, 2,
+  3, 4, 2, 3,  4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7,  8, 9,
+  7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
 };
 
 static INLINE void prepare_coeffs_lowbd(
     const InterpFilterParams *const filter_params, const int subpel_q4,
     __m256i *const coeffs /* [4] */) {
   const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params, subpel_q4 & SUBPEL_MASK);
+      filter_params, subpel_q4 & SUBPEL_MASK);
   const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
   const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
 
@@ -65,7 +61,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
                                   const int subpel_q4,
                                   __m256i *const coeffs /* [4] */) {
   const int16_t *filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params, subpel_q4 & SUBPEL_MASK);
+      filter_params, subpel_q4 & SUBPEL_MASK);
 
   const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
   const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
index 846fe7bb4..399df5d6d 100644
--- a/third_party/aom/aom_dsp/x86/convolve_sse2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -19,7 +19,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
                                   const int subpel_q4,
                                   __m128i *const coeffs /* [4] */) {
   const int16_t *filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params, subpel_q4 & SUBPEL_MASK);
+      filter_params, subpel_q4 & SUBPEL_MASK);
   const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
 
   // coeffs 0 1 0 1 0 1 0 1
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
index e5e3238d5..099fcf7fc 100644
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
@@ -105,8 +105,8 @@ void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
 
 void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
-                                   InterpFilterParams *filter_params_x,
-                                   InterpFilterParams *filter_params_y,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
                                    const int subpel_x_q4, const int subpel_y_q4,
                                    ConvolveParams *conv_params, int bd) {
   int i, j;
@@ -254,8 +254,8 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
 
 void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
-                                   InterpFilterParams *filter_params_x,
-                                   InterpFilterParams *filter_params_y,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
                                    const int subpel_x_q4, const int subpel_y_q4,
                                    ConvolveParams *conv_params, int bd) {
   int i, j;
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
index f7ac9b496..e7b33d1c4 100644
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -18,8 +18,8 @@
 
 void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
                                     uint16_t *dst, int dst_stride, int w, int h,
-                                    InterpFilterParams *filter_params_x,
-                                    InterpFilterParams *filter_params_y,
+                                    const InterpFilterParams *filter_params_x,
+                                    const InterpFilterParams *filter_params_y,
                                     const int subpel_x_q4,
                                     const int subpel_y_q4,
                                     ConvolveParams *conv_params, int bd) {
@@ -166,8 +166,8 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
 
 void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
                                     uint16_t *dst, int dst_stride, int w, int h,
-                                    InterpFilterParams *filter_params_x,
-                                    InterpFilterParams *filter_params_y,
+                                    const InterpFilterParams *filter_params_x,
+                                    const InterpFilterParams *filter_params_y,
                                     const int subpel_x_q4,
                                     const int subpel_y_q4,
                                     ConvolveParams *conv_params, int bd) {
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
index fdfadc886..131c16aa9 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -676,7 +676,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
     }
   }
 
-  const InterpFilterParams filter =
+  const InterpFilterParams *filter =
       av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
 
   if (!subpel_x_q3 && !subpel_y_q3) {
@@ -726,14 +726,14 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
     const int16_t *const kernel_y =
         av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
     const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
     assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
+    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
                                ref_stride, CONVERT_TO_BYTEPTR(temp),
                                MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
                                intermediate_height, bd);
     aom_highbd_convolve8_vert(
-        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
         MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
         16, width, height, bd);
   }
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
index 9801e285c..eaf1f347b 100644
--- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -22,118 +22,12 @@
 void aom_var_filter_block2d_bil_first_pass_ssse3(
     const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
     unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter) {
-  // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
-  // in computation using _mm_maddubs_epi16.
-  // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
-  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
-  const __m128i r = _mm_set1_epi16(round);
-  const uint8_t f0 = filter[0] >> 1;
-  const uint8_t f1 = filter[1] >> 1;
-  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
-                                        f0, f1, f0, f1, f0, f1);
-  const __m128i shuffle_mask =
-      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
-  unsigned int i, j;
-  (void)pixel_step;
-
-  if (output_width >= 8) {
-    for (i = 0; i < output_height; ++i) {
-      for (j = 0; j < output_width; j += 8) {
-        // load source
-        __m128i source_low = xx_loadl_64(a);
-        __m128i source_hi = _mm_setzero_si128();
-
-        // avoid load undefined memory
-        if (a + 8 != NULL) source_hi = xx_loadl_64(a + 8);
-        __m128i source = _mm_unpacklo_epi64(source_low, source_hi);
-
-        // shuffle to:
-        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
-        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
-        __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
-        // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
-        __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
-
-        // round
-        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
-        xx_storeu_128(b, res);
-
-        a += 8;
-        b += 8;
-      }
-
-      a += src_pixels_per_line - output_width;
-    }
-  } else {
-    for (i = 0; i < output_height; ++i) {
-      // load source, only first 5 values are meaningful:
-      // { a[0], a[1], a[2], a[3], a[4], xxxx }
-      __m128i source = xx_loadl_64(a);
-
-      // shuffle, up to the first 8 are useful
-      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
-      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
-      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
-      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
-      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
-      xx_storel_64(b, res);
-
-      a += src_pixels_per_line;
-      b += output_width;
-    }
-  }
-}
+    unsigned int output_width, const uint8_t *filter);
 
 void aom_var_filter_block2d_bil_second_pass_ssse3(
     const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
     unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter) {
-  const int16_t round = (1 << FILTER_BITS) >> 1;
-  const __m128i r = _mm_set1_epi32(round);
-  const __m128i filters =
-      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
-                     filter[1], filter[0], filter[1]);
-  const __m128i shuffle_mask =
-      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
-  const __m128i mask =
-      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; j += 4) {
-      // load source as:
-      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
-      __m128i source1 = xx_loadl_64(a);
-      __m128i source2 = xx_loadl_64(a + pixel_step);
-      __m128i source = _mm_unpacklo_epi64(source1, source2);
-
-      // shuffle source to:
-      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
-      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
-      // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
-      __m128i res = _mm_madd_epi16(source_shuffle, filters);
-
-      // round
-      res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
-
-      // shuffle to get each lower 8 bit of every 32 bit
-      res = _mm_shuffle_epi8(res, mask);
-
-      xx_storel_32(b, res);
-
-      a += 4;
-      b += 4;
-    }
-
-    a += src_pixels_per_line - output_width;
-  }
-}
+    unsigned int output_width, const uint8_t *filter);
 
 static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
                                         const __m128i *w, const __m128i *r,
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
new file mode 100644
index 000000000..6538e4d5e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
+
+static INLINE unsigned int masked_sad32xh_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int width, int height) {
+  int x, y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_scale =
+      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x += 32) {
+      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+      const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
+      const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+      // Calculate 32 predicted pixels (16 in 'pred_l', 16 in 'pred_r').
+      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+      // is 64 * 255, so we have plenty of space to add rounding constants.
+      const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+      const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+      __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+      pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+      const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+      const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+      __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+      pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+      const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+      res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+    }
+
+    src_ptr += src_stride;
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  // Here 'res' holds four 32-bit partial SADs, in 32-bit lanes 0, 2, 4 and 6.
+  res = _mm256_shuffle_epi32(res, 0xd8);      // [s0 s1 0 0 | s2 s3 0 0]
+  res = _mm256_permute4x64_epi64(res, 0xd8);  // [s0 s1 s2 s3 | 0 0 0 0]
+  res = _mm256_hadd_epi32(res, res);
+  res = _mm256_hadd_epi32(res, res);
+  int32_t sad = _mm256_extract_epi32(res, 0);
+  return (sad + 31) >> 6;
+}
+
+static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
+  __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));  // unaligned 16 bytes
+  __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));  // unaligned 16 bytes
+  __m256i a = _mm256_castsi128_si256(a0);               // lo -> bits 127:0
+  return _mm256_inserti128_si256(a, a1, 1);             // hi -> bits 255:128
+}
+
+static INLINE unsigned int masked_sad16xh_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int height) {
+  int y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_scale =
+      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  for (y = 0; y < height; y += 2) {  // two rows per pass; assumes even height
+    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+    const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
+    const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+    // Calculate 32 predicted pixels: 16 from each of the two rows.
+    // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+    // is 64 * 255, so we have plenty of space to add rounding constants.
+    const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+    const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+    __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+    pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+    const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+    const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+    __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+    pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+    const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+    res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+
+    src_ptr += src_stride << 1;
+    a_ptr += a_stride << 1;
+    b_ptr += b_stride << 1;
+    m_ptr += m_stride << 1;
+  }
+  // Here 'res' holds four 32-bit partial SADs, in 32-bit lanes 0, 2, 4 and 6.
+  res = _mm256_shuffle_epi32(res, 0xd8);      // [s0 s1 0 0 | s2 s3 0 0]
+  res = _mm256_permute4x64_epi64(res, 0xd8);  // [s0 s1 s2 s3 | 0 0 0 0]
+  res = _mm256_hadd_epi32(res, res);
+  res = _mm256_hadd_epi32(res, res);
+  int32_t sad = _mm256_extract_epi32(res, 0);
+  return (sad + 31) >> 6;
+}
+
+static INLINE unsigned int aom_masked_sad_avx2(
+    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+    const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+    int invert_mask, int m, int n) {  // m x n = block width x height
+  unsigned int sad;
+  if (!invert_mask) {  // a = ref, b = second_pred (row stride m)
+    switch (m) {
+      case 4:
+        sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+                                      second_pred, m, msk, msk_stride, n);
+        break;
+      case 8:
+        sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,
+                                      second_pred, m, msk, msk_stride, n);
+        break;
+      case 16:
+        sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+                                  m, msk, msk_stride, n);
+        break;
+      default:  // wider blocks: 32 pixels per step across width m
+        sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+                                  m, msk, msk_stride, m, n);
+        break;
+    }
+  } else {  // swapped: a = second_pred (row stride m), b = ref
+    switch (m) {
+      case 4:
+        sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+                                      ref_stride, msk, msk_stride, n);
+        break;
+      case 8:
+        sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref,
+                                      ref_stride, msk, msk_stride, n);
+        break;
+      case 16:
+        sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+                                  ref_stride, msk, msk_stride, n);
+        break;
+      default:  // wider blocks: 32 pixels per step across width m
+        sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref,
+                                  ref_stride, msk, msk_stride, m, n);
+        break;
+    }
+  }
+  return sad;
+}
+
+#define MASKSADMXN_AVX2(m, n)                                                 \
+  unsigned int aom_masked_sad##m##x##n##_avx2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
+      int invert_mask) {                                                      \
+    return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \
+                               msk, msk_stride, invert_mask, m, n);           \
+  }
+
+MASKSADMXN_AVX2(4, 4)
+MASKSADMXN_AVX2(4, 8)
+MASKSADMXN_AVX2(8, 4)
+MASKSADMXN_AVX2(8, 8)
+MASKSADMXN_AVX2(8, 16)
+MASKSADMXN_AVX2(16, 8)
+MASKSADMXN_AVX2(16, 16)
+MASKSADMXN_AVX2(16, 32)
+MASKSADMXN_AVX2(32, 16)
+MASKSADMXN_AVX2(32, 32)
+MASKSADMXN_AVX2(32, 64)
+MASKSADMXN_AVX2(64, 32)
+MASKSADMXN_AVX2(64, 64)
+MASKSADMXN_AVX2(64, 128)
+MASKSADMXN_AVX2(128, 64)
+MASKSADMXN_AVX2(128, 128)
+MASKSADMXN_AVX2(4, 16)
+MASKSADMXN_AVX2(16, 4)
+MASKSADMXN_AVX2(8, 32)
+MASKSADMXN_AVX2(32, 8)
+MASKSADMXN_AVX2(16, 64)
+MASKSADMXN_AVX2(64, 16)
+
+static INLINE unsigned int highbd_masked_sad8xh_avx2(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int height) {
+  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+  int y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_const =
+      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  for (y = 0; y < height; y += 2) {  // two rows per pass; assumes even height
+    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+    // Zero-extend the two 8-byte mask rows to 16 bits (row 0 low, row 1 high)
+    const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
+        _mm_loadl_epi64((const __m128i *)(m_ptr)),
+        _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride))));
+    const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+
+    const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+    const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+    __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+    pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+                               AOM_BLEND_A64_ROUND_BITS);
+
+    const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+    const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+    __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+    pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+                               AOM_BLEND_A64_ROUND_BITS);
+
+    // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+    // so it is safe to do signed saturation here.
+    const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+    // There is no 16-bit SAD instruction, so we have to synthesize
+    // a 16-element SAD. We do this by storing 8 32-bit partial SADs,
+    // and accumulating them at the end
+    const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+    res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+
+    src_ptr += src_stride << 1;
+    a_ptr += a_stride << 1;
+    b_ptr += b_stride << 1;
+    m_ptr += m_stride << 1;
+  }
+  // At this point, we have eight 32-bit partial SADs stored in 'res'.
+  res = _mm256_hadd_epi32(res, res);
+  res = _mm256_hadd_epi32(res, res);
+  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+  return (sad + 31) >> 6;
+}
+
+static INLINE unsigned int highbd_masked_sad16xh_avx2(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int width, int height) {
+  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+  int x, y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_const =
+      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x += 16) {
+      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+      // Zero-extend mask to 16 bits
+      const __m256i m =
+          _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
+      const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+
+      const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+      const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+      __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+      pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+                                 AOM_BLEND_A64_ROUND_BITS);
+
+      const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+      const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+      __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+      pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+                                 AOM_BLEND_A64_ROUND_BITS);
+
+      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+      // so it is safe to do signed saturation here.
+      const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+      // There is no 16-bit SAD instruction, so we have to synthesize
+      // a 16-element SAD. We do this by storing 8 32-bit partial SADs,
+      // and accumulating them at the end
+      const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+      res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+    }
+
+    src_ptr += src_stride;
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  // At this point, we have eight 32-bit partial SADs stored in 'res'.
+  res = _mm256_hadd_epi32(res, res);
+  res = _mm256_hadd_epi32(res, res);
+  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+  return (sad + 31) >> 6;
+}
+
+static INLINE unsigned int aom_highbd_masked_sad_avx2(
+    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+    const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+    int invert_mask, int m, int n) {  // m x n = block width x height
+  unsigned int sad;
+  if (!invert_mask) {  // a = ref, b = second_pred (row stride m)
+    switch (m) {
+      case 4:
+        sad =
+            aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+                                           second_pred, m, msk, msk_stride, n);
+        break;
+      case 8:
+        sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride,
+                                        second_pred, m, msk, msk_stride, n);
+        break;
+      default:  // wider blocks: 16 pixels per step across width m
+        sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride,
+                                         second_pred, m, msk, msk_stride, m, n);
+        break;
+    }
+  } else {  // swapped: a = second_pred (row stride m), b = ref
+    switch (m) {
+      case 4:
+        sad =
+            aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+                                           ref_stride, msk, msk_stride, n);
+        break;
+      case 8:
+        sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref,
+                                        ref_stride, msk, msk_stride, n);
+        break;
+      default:  // wider blocks: 16 pixels per step across width m
+        sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+                                         ref_stride, msk, msk_stride, m, n);
+        break;
+    }
+  }
+  return sad;
+}
+
+#define HIGHBD_MASKSADMXN_AVX2(m, n)                                      \
+  unsigned int aom_highbd_masked_sad##m##x##n##_avx2(                     \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,           \
+      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,    \
+      int msk_stride, int invert_mask) {                                  \
+    return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \
+                                      second_pred8, msk, msk_stride,      \
+                                      invert_mask, m, n);                 \
+  }
+
+HIGHBD_MASKSADMXN_AVX2(4, 4);
+HIGHBD_MASKSADMXN_AVX2(4, 8);
+HIGHBD_MASKSADMXN_AVX2(8, 4);
+HIGHBD_MASKSADMXN_AVX2(8, 8);
+HIGHBD_MASKSADMXN_AVX2(8, 16);
+HIGHBD_MASKSADMXN_AVX2(16, 8);
+HIGHBD_MASKSADMXN_AVX2(16, 16);
+HIGHBD_MASKSADMXN_AVX2(16, 32);
+HIGHBD_MASKSADMXN_AVX2(32, 16);
+HIGHBD_MASKSADMXN_AVX2(32, 32);
+HIGHBD_MASKSADMXN_AVX2(32, 64);
+HIGHBD_MASKSADMXN_AVX2(64, 32);
+HIGHBD_MASKSADMXN_AVX2(64, 64);
+HIGHBD_MASKSADMXN_AVX2(64, 128);
+HIGHBD_MASKSADMXN_AVX2(128, 64);
+HIGHBD_MASKSADMXN_AVX2(128, 128);
+HIGHBD_MASKSADMXN_AVX2(4, 16);
+HIGHBD_MASKSADMXN_AVX2(16, 4);
+HIGHBD_MASKSADMXN_AVX2(8, 32);
+HIGHBD_MASKSADMXN_AVX2(32, 8);
+HIGHBD_MASKSADMXN_AVX2(16, 64);
+HIGHBD_MASKSADMXN_AVX2(64, 16);
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
index 1f42eec2f..493f9bd8f 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -19,6 +19,8 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/synonyms.h"
 
+#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
+
 // For width a multiple of 16
 static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                             int src_stride,
@@ -27,16 +29,6 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                             const uint8_t *m_ptr, int m_stride,
                                             int width, int height);
 
-static INLINE unsigned int masked_sad8xh_ssse3(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height);
-
-static INLINE unsigned int masked_sad4xh_ssse3(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height);
-
 #define MASKSADMXN_SSSE3(m, n)                                                \
   unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
@@ -56,11 +48,11 @@ static INLINE unsigned int masked_sad4xh_ssse3(
       const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
       int invert_mask) {                                                      \
     if (!invert_mask)                                                         \
-      return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,            \
-                                 second_pred, 8, msk, msk_stride, n);         \
+      return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,        \
+                                     second_pred, 8, msk, msk_stride, n);     \
     else                                                                      \
-      return masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref,        \
-                                 ref_stride, msk, msk_stride, n);             \
+      return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref,    \
+                                     ref_stride, msk, msk_stride, n);         \
   }
 
 #define MASKSAD4XN_SSSE3(n)                                                   \
@@ -69,11 +61,11 @@ static INLINE unsigned int masked_sad4xh_ssse3(
       const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
       int invert_mask) {                                                      \
     if (!invert_mask)                                                         \
-      return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,            \
-                                 second_pred, 4, msk, msk_stride, n);         \
+      return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,        \
+                                     second_pred, 4, msk, msk_stride, n);     \
     else                                                                      \
-      return masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref,        \
-                                 ref_stride, msk, msk_stride, n);             \
+      return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref,    \
+                                     ref_stride, msk, msk_stride, n);         \
   }
 
 MASKSADMXN_SSSE3(128, 128)
@@ -145,10 +137,11 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
   return (sad + 31) >> 6;
 }
 
-static INLINE unsigned int masked_sad8xh_ssse3(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height) {
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height) {
   int y;
   __m128i res = _mm_setzero_si128();
   const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
@@ -189,10 +182,11 @@ static INLINE unsigned int masked_sad8xh_ssse3(
   return (sad + 31) >> 6;
 }
 
-static INLINE unsigned int masked_sad4xh_ssse3(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height) {
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height) {
   int y;
   __m128i res = _mm_setzero_si128();
   const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
@@ -238,11 +232,6 @@ static INLINE unsigned int highbd_masked_sad_ssse3(
     const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
     int width, int height);
 
-static INLINE unsigned int highbd_masked_sad4xh_ssse3(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height);
-
 #define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
   unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
       const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
@@ -262,11 +251,13 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3(
       int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,         \
       int msk_stride, int invert_mask) {                                       \
     if (!invert_mask)                                                          \
-      return highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, ref_stride,    \
-                                        second_pred8, 4, msk, msk_stride, n);  \
+      return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8,            \
+                                            ref_stride, second_pred8, 4, msk,  \
+                                            msk_stride, n);                    \
     else                                                                       \
-      return highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4,     \
-                                        ref8, ref_stride, msk, msk_stride, n); \
+      return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \
+                                            ref8, ref_stride, msk, msk_stride, \
+                                            n);                                \
   }
 
 HIGHBD_MASKSADMXN_SSSE3(128, 128)
@@ -350,10 +341,11 @@ static INLINE unsigned int highbd_masked_sad_ssse3(
   return (sad + 31) >> 6;
 }
 
-static INLINE unsigned int highbd_masked_sad4xh_ssse3(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height) {
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+                                            const uint8_t *a8, int a_stride,
+                                            const uint8_t *b8, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int height) {
   const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
   const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
   const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
new file mode 100644
index 000000000..19b429d91
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H
+#define _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H
+
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height);
+
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height);
+
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+                                            const uint8_t *a8, int a_stride,
+                                            const uint8_t *b8, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int height);
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
new file mode 100644
index 000000000..2aa2a0555
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
+                                            const int pre_stride,
+                                            const int32_t *wsrc,
+                                            const int32_t *mask,
+                                            const int height) {
+  int n = 0;  // element index into wsrc/mask, which are read contiguously
+  __m256i v_sad_d = _mm256_setzero_si256();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+  // Accumulates (|wsrc[i] - pre[i] * mask[i]| + 2048) >> 12, two rows per pass.
+  do {
+    const __m128i v_p_b_0 = xx_loadl_32(pre);
+    const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
+    const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
+    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+    // Zero-extend the eight pre bytes (two 4-pixel rows) to 32-bit lanes.
+    const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+    // Rounded absolute difference
+    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+    // One pass consumes 8 wsrc/mask entries and two rows of pre.
+    n += 8;
+    pre += pre_stride << 1;
+  } while (n < 8 * (height >> 1));  // height / 2 passes; always at least one
+  // Fold the 128-bit halves; xx_hsum_epi32_si32 presumably sums the 4 lanes.
+  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+  return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+static INLINE unsigned int obmc_sad_w8n_avx2(
+    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, const int width, const int height) {
+  const int pre_step = pre_stride - width;  // added to pre at each row end
+  int n = 0;
+  __m256i v_sad_d = _mm256_setzero_si256();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+  assert(width >= 8);
+  assert(IS_POWER_OF_TWO(width));
+  // Accumulates (|wsrc[i] - pre[i] * mask[i]| + 2048) >> 12, 8 pixels a pass.
+  do {
+    const __m128i v_p0_b = xx_loadl_64(pre + n);
+    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+    // Zero-extend eight pre bytes to 32-bit lanes.
+    const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+    // Rounded absolute difference
+    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+    n += 8;
+    // n is a multiple of width exactly when a row has just been completed.
+    if ((n & (width - 1)) == 0) pre += pre_step;
+  } while (n < width * height);
+  // Fold the 128-bit halves; xx_hsum_epi32_si32 presumably sums the 4 lanes.
+  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+  return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define OBMCSADWXH(w, h)                                          \
+  unsigned int aom_obmc_sad##w##x##h##_avx2(                      \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
+      const int32_t *msk) {                                       \
+    if (w == 4) { /* 4-wide blocks: two-rows-per-pass kernel */   \
+      return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h);     \
+    } else { /* every other instantiated width is >= 8 */         \
+      return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
+    }                                                             \
+  }
+
+OBMCSADWXH(128, 128)  // each line defines aom_obmc_sad<w>x<h>_avx2()
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+OBMCSADWXH(4, 16)
+OBMCSADWXH(16, 4)
+OBMCSADWXH(8, 32)
+OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
+                                                const int pre_stride,
+                                                const int32_t *wsrc,
+                                                const int32_t *mask,
+                                                const int height) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);  // 16-bit sample view
+  int n = 0;  // element index into wsrc/mask, which are read contiguously
+  __m256i v_sad_d = _mm256_setzero_si256();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+  do {  // two 4-sample rows per pass
+    const __m128i v_p_w_0 = xx_loadl_64(pre);
+    const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
+    const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
+    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+    // Zero-extend the eight 16-bit samples to 32-bit lanes.
+    const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+    // Rounded absolute difference
+    // i.e. (|wsrc - pre * mask| + 2048) >> 12 in every lane.
+    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+    n += 8;
+    // pre_stride is counted in 16-bit samples here, not bytes.
+    pre += pre_stride << 1;
+  } while (n < 8 * (height >> 1));  // height / 2 passes; always at least one
+  // Fold the 128-bit halves; xx_hsum_epi32_si32 presumably sums the 4 lanes.
+  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+  return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
+    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, const int width, const int height) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);  // 16-bit sample view
+  const int pre_step = pre_stride - width;  // added to pre at each row end
+  int n = 0;
+  __m256i v_sad_d = _mm256_setzero_si256();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+  assert(width >= 8);
+  assert(IS_POWER_OF_TWO(width));
+  // Accumulates (|wsrc[i] - pre[i] * mask[i]| + 2048) >> 12, 8 samples a pass.
+  do {
+    const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
+    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+    // Zero-extend eight 16-bit samples to 32-bit lanes.
+    const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+    // Rounded absolute difference
+    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+    n += 8;
+    // Row just completed? width is a power of two, so mask instead of '%'.
+    if ((n & (width - 1)) == 0) pre += pre_step;
+  } while (n < width * height);
+  // Fold the 128-bit halves; xx_hsum_epi32_si32 presumably sums the 4 lanes.
+  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+  return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define HBD_OBMCSADWXH(w, h)                                           \
+  unsigned int aom_highbd_obmc_sad##w##x##h##_avx2(                    \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
+      const int32_t *mask) {                                           \
+    if (w == 4) { /* 4-wide blocks: two-rows-per-pass kernel */        \
+      return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h);     \
+    } else { /* every other instantiated width is >= 8 */              \
+      return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
+    }                                                                  \
+  }
+
+HBD_OBMCSADWXH(128, 128)  // each line defines aom_highbd_obmc_sad<w>x<h>_avx2()
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
index 571aa770b..2e2f6e09f 100644
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -26,6 +26,16 @@
 // 8 bit
 ////////////////////////////////////////////////////////////////////////////////
 
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter);  // variance_impl_ssse3.c
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter);  // presumably same file
+
 static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
                                     const int32_t *wsrc, const int32_t *mask,
                                     unsigned int *const sse, int *const sum,
@@ -152,6 +162,46 @@ OBMCVARWXH(32, 8)
 OBMCVARWXH(16, 64)
 OBMCVARWXH(64, 16)
 
+#include "config/aom_dsp_rtcd.h"
+
+#define OBMC_SUBPIX_VAR(W, H)                                                \
+  uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1(                    \
+      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
+      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
+    uint16_t fdata3[(H + 1) * W]; /* first-pass output, H + 1 rows */        \
+    uint8_t temp2[H * W];         /* second-pass output, row stride W */     \
+    /* Two bilinear passes (xoffset, then yoffset) build the prediction. */  \
+    aom_var_filter_block2d_bil_first_pass_ssse3(                             \
+        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_ssse3(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
+    /* The filtered W x H block is then scored by the whole-pel kernel. */   \
+    return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse);   \
+  }
+
+OBMC_SUBPIX_VAR(128, 128)  // both scratch arrays live on the stack
+OBMC_SUBPIX_VAR(128, 64)
+OBMC_SUBPIX_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 32)
+OBMC_SUBPIX_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 16)
+OBMC_SUBPIX_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 8)
+OBMC_SUBPIX_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 4)
+OBMC_SUBPIX_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 16)
+OBMC_SUBPIX_VAR(16, 4)
+OBMC_SUBPIX_VAR(8, 32)
+OBMC_SUBPIX_VAR(32, 8)
+OBMC_SUBPIX_VAR(16, 64)
+OBMC_SUBPIX_VAR(64, 16)
+
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c
new file mode 100644
index 000000000..4389d123d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subtract_avx2.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
+                                   const uint8_t *pred_ptr) {
+  __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));   // 32 px, unaligned
+  __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));  // 32 px, unaligned
+  __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));  // px 0..15
+  __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+  __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+  __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+  const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);  // src - pred as int16
+  const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+  _mm256_store_si256((__m256i *)(diff_ptr), d_0);  // aligned stores: diff_ptr
+  _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);  // must be 32B-aligned
+}
+
+static INLINE void aom_subtract_block_16xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {  // one 16-pixel row per iteration
+    __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));   // unaligned load
+    __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));  // unaligned load
+    __m256i s_0 = _mm256_cvtepu8_epi16(s);  // zero-extend bytes to 16 bits
+    __m256i p_0 = _mm256_cvtepu8_epi16(p);
+    const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);  // src - pred
+    _mm256_store_si256((__m256i *)(diff_ptr), d_0);  // needs 32B alignment
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;  // counted in int16 elements
+  }
+}
+
+static INLINE void aom_subtract_block_32xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {  // one 32-pixel row per iteration
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);  // diff = src - pred
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;  // every row start must stay 32-byte aligned
+  }
+}
+
+static INLINE void aom_subtract_block_64xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {  // one 64-pixel row = two 32-px chunks
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;  // every row start must stay 32-byte aligned
+  }
+}
+
+static INLINE void aom_subtract_block_128xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {  // one 128-pixel row = four chunks
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+    subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
+    subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;  // every row start must stay 32-byte aligned
+  }
+}
+
+void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                             ptrdiff_t pred_stride) {
+  switch (cols) {  // diff = src - pred; dispatch on block width only
+    case 16:
+      aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                   src_stride, pred_ptr, pred_stride);
+      break;
+    case 32:
+      aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                   src_stride, pred_ptr, pred_stride);
+      break;
+    case 64:
+      aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                   src_stride, pred_ptr, pred_stride);
+      break;
+    case 128:
+      aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                    src_stride, pred_ptr, pred_stride);
+      break;
+    default:  // any other width is handed to the SSE2 implementation
+      aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+                              src_stride, pred_ptr, pred_stride);
+      break;
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
new file mode 100644
index 000000000..bdff64b8f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#define AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit);  // 1-D kernel pointer type
+
+static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
+  return _mm256_set1_epi32(  // each 32-bit lane: a in bits 0-15, b in 16-31
+      (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
+                                   __m256i *in0, __m256i *in1, const __m256i _r,
+                                   const int32_t cos_bit) {
+  __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);  // (in0, in1) pairs, low
+  __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);  // (in0, in1) pairs, high
+  __m256i u0 = _mm256_madd_epi16(t0, w0);  // in0 * w0.even + in1 * w0.odd
+  __m256i u1 = _mm256_madd_epi16(t1, w0);
+  __m256i v0 = _mm256_madd_epi16(t0, w1);  // same dot product against w1
+  __m256i v1 = _mm256_madd_epi16(t1, w1);
+  // Round: add _r, then arithmetic shift right by cos_bit.
+  __m256i a0 = _mm256_add_epi32(u0, _r);
+  __m256i a1 = _mm256_add_epi32(u1, _r);
+  __m256i b0 = _mm256_add_epi32(v0, _r);
+  __m256i b1 = _mm256_add_epi32(v1, _r);
+
+  __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
+  __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
+  __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
+  __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
+  // Saturating pack to 16 bits, in place: *in0 = w0 result, *in1 = w1 result.
+  *in0 = _mm256_packs_epi32(c0, c1);
+  *in1 = _mm256_packs_epi32(d0, d1);
+}
+
+static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
+  const __m256i _in0 = *in0;  // snapshot: both inputs are overwritten below
+  const __m256i _in1 = *in1;
+  *in0 = _mm256_adds_epi16(_in0, _in1);  // saturating 16-bit sum
+  *in1 = _mm256_subs_epi16(_in0, _in1);  // saturating 16-bit difference
+}
+
+static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
+  const __m256i _in0 = *in0;  // snapshot: both inputs are overwritten below
+  const __m256i _in1 = *in1;
+  *in0 = _mm256_add_epi32(_in0, _in1);  // wrapping 32-bit sum
+  *in1 = _mm256_sub_epi32(_in0, _in1);  // wrapping 32-bit difference
+}
+
+static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
+                                             __m256i in0, __m256i in1) {
+  const __m256i _in0 = in0;  // inputs arrive by value; outputs are separate
+  const __m256i _in1 = in1;
+  *out0 = _mm256_adds_epi16(_in0, _in1);  // saturating 16-bit sum
+  *out1 = _mm256_subs_epi16(_in0, _in1);  // saturating 16-bit difference
+}
+
+static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
+                                           __m256i in0, __m256i in1) {
+  const __m256i _in0 = in0;  // inputs arrive by value; outputs are separate
+  const __m256i _in1 = in1;
+  *out0 = _mm256_add_epi32(_in0, _in1);  // wrapping 32-bit sum
+  *out1 = _mm256_sub_epi32(_in0, _in1);  // wrapping 32-bit difference
+}
+
+static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
+  return _mm256_load_si256((const __m256i *)a);  // a must be 32-byte aligned
+}
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+                                                   int stride, __m256i *out,
+                                                   int out_size) {
+  for (int i = 0; i < out_size; ++i) {  // out[i] = 16 values at row i
+    out[i] = load_16bit_to_16bit_avx2(in + i * stride);  // stride in int16s
+  }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
+                                                        int stride,
+                                                        __m256i *out,
+                                                        int out_size) {
+  for (int i = 0; i < out_size; ++i) {  // row i lands in the mirrored slot
+    out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride);
+  }
+}
+
+static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
+  const __m256i lo = _mm256_lddqu_si256((const __m256i *)a);        // a[0..7]
+  const __m256i hi = _mm256_lddqu_si256((const __m256i *)(a + 8));  // a[8..15]
+  return _mm256_permute4x64_epi64(_mm256_packs_epi32(lo, hi), 0xD8);  // reorder
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
+                                                       int stride, __m256i *out,
+                                                       int out_size) {
+  for (int i = 0; i < out_size; ++i) {  // out[i] = 16 narrowed values of row i
+    out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);  // stride in int32s
+  }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+                                              __m256i *const out) {
+  // Transposes a 16x16 block of 16-bit values: in[r] holds row r and out[c]
+  // receives column c. The unpack intrinsics work per 128-bit lane, so every
+  // stage handles columns 0-7 (low lane) and 8-15 (high lane) in parallel.
+  // Entries below are "row column" in hex; '|' separates the two lanes.
+  // Stage 1, 16-bit interleave of adjacent rows:
+  // a[0]:  00 10 01 11  02 12 03 13 | 08 18 09 19  0a 1a 0b 1b
+  // a[8]:  04 14 05 15  06 16 07 17 | 0c 1c 0d 1d  0e 1e 0f 1f
+  // a[1]:  20 30 21 31  22 32 23 33 | 28 38 29 39  2a 3a 2b 3b
+  // Stage 2, 32-bit interleave, gathers four rows of a column pair:
+  // b[0]:  00 10 20 30  01 11 21 31 | 08 18 28 38  09 19 29 39
+  // Stage 3, 64-bit interleave, gathers eight rows of one column per lane:
+  // c[0]:  00 10 20 30  40 50 60 70 | 08 18 28 38  48 58 68 78
+  // c[1]:  80 90 a0 b0  c0 d0 e0 f0 | 88 98 a8 b8  c8 d8 e8 f8
+  // Last, permute2x128 joins the low lanes (0x20) or the high lanes (0x31)
+  // of a c[] pair, producing columns 0-7 and columns 8-15 respectively.
+  // ...
+  __m256i a[16];
+  for (int i = 0; i < 16; i += 2) {
+    a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
+    a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
+  }
+  __m256i b[16];
+  for (int i = 0; i < 16; i += 2) {
+    b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
+    b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
+  }
+  __m256i c[16];
+  for (int i = 0; i < 16; i += 2) {
+    c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
+    c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
+  }
+  out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
+  out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
+  out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
+  out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
+
+  out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
+  out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
+  out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
+  out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
+
+  out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
+  out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
+  out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
+  out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
+
+  out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
+  out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
+  out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
+  out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
+}
+
+static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
+  for (int i = 0; i < size; ++i) {  // out = in reversed; not safe if in == out
+    out[size - i - 1] = in[i];
+  }
+}
+
+static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
+  if (bit < 0) {  // negative bit: rounding arithmetic right shift by -bit
+    bit = -bit;
+    __m256i round = _mm256_set1_epi16(1 << (bit - 1));  // half of 1 << bit
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm256_adds_epi16(in[i], round);  // saturating add
+      in[i] = _mm256_srai_epi16(in[i], bit);
+    }
+  } else if (bit > 0) {  // positive bit: left shift; bit == 0 changes nothing
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm256_slli_epi16(in[i], bit);
+    }
+  }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_DSP_X86_TXFM_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
index 7d6b7d287..a7ac2c93d 100644
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -324,6 +324,12 @@ static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
   return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
 }
 
+static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
+  const __m256i d =  // low lane: 8 values at p1; high lane (below): 8 at p0
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
+  return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
+}
+
 static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
                                             const __m256i a,
                                             uint8_t *comp_pred) {
@@ -401,3 +407,110 @@ void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
     } while (i < height);
   }
 }
+
+static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
+                                                      const __m256i s1,
+                                                      const __m256i a) {
+  const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_const =
+      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  const __m256i a_inv = _mm256_sub_epi16(alpha_max, a);
+  // Per 16-bit value: (s0 * a + s1 * (alpha_max - a) + round) >> ROUND_BITS.
+  const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1);
+  const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv);
+  const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo);
+  const __m256i pred_l = _mm256_srai_epi32(
+      _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS);
+
+  const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1);
+  const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv);
+  const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi);
+  const __m256i pred_h = _mm256_srai_epi32(
+      _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS);
+  // Signed-saturating pack; lo/hi halves rejoin in source order per lane.
+  const __m256i comp = _mm256_packs_epi32(pred_l, pred_h);
+
+  return comp;
+}
+
+void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8,
+                                    int width, int height, const uint8_t *ref8,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask) {
+  int i = 0;  // current row
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  const uint16_t *src0 = invert_mask ? pred : ref;  // weighted by mask
+  const uint16_t *src1 = invert_mask ? ref : pred;  // weighted by complement
+  const int stride0 = invert_mask ? width : ref_stride;  // pred is packed
+  const int stride1 = invert_mask ? ref_stride : width;
+  const __m256i zero = _mm256_setzero_si256();
+  // Only widths 8, 16 and 32 are handled; any other width writes nothing.
+  if (width == 8) {
+    do {
+      const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
+      const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);
+      // Two rows per pass: the second row's mask is mask_stride bytes on.
+      const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
+      const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + mask_stride));
+
+      __m256i m = _mm256_castsi128_si256(m_l);
+      m = _mm256_insertf128_si256(m, m_h, 1);
+      const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);
+
+      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+      _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
+
+      _mm_storeu_si128((__m128i *)(comp_pred + width),
+                       _mm256_extractf128_si256(comp, 1));
+
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      comp_pred += (width << 1);
+      i += 2;
+    } while (i < height);
+  } else if (width == 16) {
+    do {
+      const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
+      const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
+      const __m256i m_16 =
+          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
+
+      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+      _mm256_storeu_si256((__m256i *)comp_pred, comp);
+
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  } else if (width == 32) {
+    do {
+      const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
+      const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
+      const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
+      const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));
+
+      const __m256i m01_16 =
+          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
+      const __m256i m23_16 =
+          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));
+
+      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
+      const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
+
+      _mm256_storeu_si256((__m256i *)comp_pred, comp);
+      _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
+
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
new file mode 100644
index 000000000..66b0d7d84
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  // Note: filter[0], filter[1] could be {128, 0}. _mm_maddubs_epi16 treats its
+  // second operand as signed 8-bit, so 128 would be read as -128. Halve the
+  // taps ({128, 0} -> {64, 0}) and reduce FILTER_BITS by 1 to compensate.
+  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+  const __m128i r = _mm_set1_epi16(round);
+  const uint8_t f0 = filter[0] >> 1;
+  const uint8_t f1 = filter[1] >> 1;
+  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+                                        f0, f1, f0, f1, f0, f1);
+  unsigned int i, j;
+  (void)pixel_step;
+
+  if (output_width >= 8) {
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; j += 8) {
+        // load source
+        __m128i source_low = xx_loadl_64(a);
+        __m128i source_hi = xx_loadl_64(a + 1);
+
+        // unpack to:
+        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+        __m128i source = _mm_unpacklo_epi8(source_low, source_hi);
+
+        // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
+        __m128i res = _mm_maddubs_epi16(source, filters);
+
+        // round
+        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+        xx_storeu_128(b, res);
+
+        a += 8;
+        b += 8;
+      }
+
+      a += src_pixels_per_line - output_width;
+    }
+  } else {
+    const __m128i shuffle_mask =
+        _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+    for (i = 0; i < output_height; ++i) {
+      // load source, only first 5 values are meaningful:
+      // { a[0], a[1], a[2], a[3], a[4], xxxx }
+      __m128i source = xx_loadl_64(a);
+
+      // shuffle, up to the first 8 are useful
+      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+      xx_storel_64(b, res);
+
+      a += src_pixels_per_line;
+      b += output_width;
+    }
+  }
+}
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  const int16_t round = (1 << FILTER_BITS) >> 1;
+  const __m128i r = _mm_set1_epi32(round);
+  const __m128i filters =
+      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+                     filter[1], filter[0], filter[1]);
+  const __m128i shuffle_mask =
+      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+  const __m128i mask =
+      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; j += 4) {
+      // load source as:
+      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+      __m128i source1 = xx_loadl_64(a);
+      __m128i source2 = xx_loadl_64(a + pixel_step);
+      __m128i source = _mm_unpacklo_epi64(source1, source2);
+
+      // shuffle source to:
+      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
+      __m128i res = _mm_madd_epi16(source_shuffle, filters);
+
+      // round
+      res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
+
+      // shuffle to get each lower 8 bit of every 32 bit
+      res = _mm_shuffle_epi8(res, mask);
+
+      xx_storel_32(b, res);
+
+      a += 4;
+      b += 4;
+    }
+
+    a += src_pixels_per_line - output_width;
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index c8c90a7dc..7e3c5d5db 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -569,7 +569,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
     }
   }
 
-  const InterpFilterParams filter =
+  const InterpFilterParams *filter =
       av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
 
   if (!subpel_x_q3 && !subpel_y_q3) {
@@ -633,12 +633,12 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
     const int16_t *const kernel_y =
         av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
     const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
     assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
-                        temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                        intermediate_height);
-    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+    aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
+                        ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+                        width, intermediate_height);
+    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
                        MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
                        width, height);
   }
diff --git a/third_party/aom/aom_ports/aom_once.h b/third_party/aom/aom_ports/aom_once.h
index bb1e21366..8e04f8583 100644
--- a/third_party/aom/aom_ports/aom_once.h
+++ b/third_party/aom/aom_ports/aom_once.h
@@ -17,7 +17,7 @@
 /* Implement a function wrapper to guarantee initialization
  * thread-safety for library singletons.
  *
- * NOTE: These functions use static locks, and can only be
+ * NOTE: This function uses static locks, and can only be
  * used with one common argument per compilation unit. So
  *
  * file1.c:
@@ -25,8 +25,8 @@
  *   ...
  *   aom_once(foo);
  *
- *   file2.c:
- *     aom_once(bar);
+ * file2.c:
+ *   aom_once(bar);
  *
  * will ensure foo() and bar() are each called only once, but in
  *
@@ -46,19 +46,19 @@
  * local initializers are not thread-safe in MSVC prior to Visual
  * Studio 2015.
  *
- * As a static, once_state will be zero-initialized as program start.
+ * As a static, aom_once_state will be zero-initialized at program start.
  */
-static LONG once_state;
-static void once(void (*func)(void)) {
-  /* Try to advance once_state from its initial value of 0 to 1.
+static LONG aom_once_state;
+static void aom_once(void (*func)(void)) {
+  /* Try to advance aom_once_state from its initial value of 0 to 1.
    * Only one thread can succeed in doing so.
    */
-  if (InterlockedCompareExchange(&once_state, 1, 0) == 0) {
-    /* We're the winning thread, having set once_state to 1.
+  if (InterlockedCompareExchange(&aom_once_state, 1, 0) == 0) {
+    /* We're the winning thread, having set aom_once_state to 1.
      * Call our function. */
     func();
-    /* Now advance once_state to 2, unblocking any other threads. */
-    InterlockedIncrement(&once_state);
+    /* Now advance aom_once_state to 2, unblocking any other threads. */
+    InterlockedIncrement(&aom_once_state);
     return;
   }
 
@@ -66,10 +66,10 @@ static void once(void (*func)(void)) {
    * the state variable so we don't return before func()
    * has finished executing elsewhere.
    *
-   * Try to advance once_state from 2 to 2, which is only possible
+   * Try to advance aom_once_state from 2 to 2, which is only possible
    * after the winning thead advances it from 1 to 2.
    */
-  while (InterlockedCompareExchange(&once_state, 2, 2) != 2) {
+  while (InterlockedCompareExchange(&aom_once_state, 2, 2) != 2) {
     /* State isn't yet 2. Try again.
      *
      * We are used for singleton initialization functions,
@@ -83,8 +83,8 @@ static void once(void (*func)(void)) {
     Sleep(0);
   }
 
-  /* We've seen once_state advance to 2, so we know func()
-   * has been called. And we've left once_state as we found it,
+  /* We've seen aom_once_state advance to 2, so we know func()
+   * has been called. And we've left aom_once_state as we found it,
    * so other threads will have the same experience.
    *
    * It's safe to return now.
@@ -95,7 +95,7 @@ static void once(void (*func)(void)) {
 #elif CONFIG_MULTITHREAD && defined(__OS2__)
 #define INCL_DOS
 #include <os2.h>
-static void once(void (*func)(void)) {
+static void aom_once(void (*func)(void)) {
   static int done;
 
   /* If the initialization is complete, return early. */
@@ -117,18 +117,15 @@ static void once(void (*func)(void)) {
 
 #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
 #include <pthread.h>
-static void once(void (*func)(void)) {
+static void aom_once(void (*func)(void)) {
   static pthread_once_t lock = PTHREAD_ONCE_INIT;
   pthread_once(&lock, func);
 }
 
 #else
-/* No-op version that performs no synchronization. *_rtcd() is idempotent,
- * so as long as your platform provides atomic loads/stores of pointers
- * no synchronization is strictly necessary.
- */
+/* Default version that performs no synchronization. */
 
-static void once(void (*func)(void)) {
+static void aom_once(void (*func)(void)) {
   static int done;
 
   if (!done) {
diff --git a/third_party/aom/aom_scale/aom_scale_rtcd.c b/third_party/aom/aom_scale/aom_scale_rtcd.c
index 08f1a376d..a04e053b0 100644
--- a/third_party/aom/aom_scale/aom_scale_rtcd.c
+++ b/third_party/aom/aom_scale/aom_scale_rtcd.c
@@ -15,4 +15,4 @@
 
 #include "aom_ports/aom_once.h"
 
-void aom_scale_rtcd() { once(setup_rtcd_internal); }
+void aom_scale_rtcd() { aom_once(setup_rtcd_internal); }
diff --git a/third_party/aom/aom_scale/generic/yv12config.c b/third_party/aom/aom_scale/generic/yv12config.c
index cce915165..ca5b69066 100644
--- a/third_party/aom/aom_scale/generic/yv12config.c
+++ b/third_party/aom/aom_scale/generic/yv12config.c
@@ -51,6 +51,10 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                              aom_codec_frame_buffer_t *fb,
                              aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
   if (ybf) {
+#if CONFIG_SIZE_LIMIT
+    if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
+#endif
+
     const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
     const int aligned_width = (width + 7) & ~7;
     const int aligned_height = (height + 7) & ~7;
@@ -154,7 +158,7 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                                        (uv_border_h * uv_stride) + uv_border_w,
                                    aom_byte_align);
 
-    ybf->use_external_refernce_buffers = 0;
+    ybf->use_external_reference_buffers = 0;
 
     if (use_highbitdepth) {
       if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
diff --git a/third_party/aom/aom_scale/yv12config.h b/third_party/aom/aom_scale/yv12config.h
index 8f1c60069..2b4f597b0 100644
--- a/third_party/aom/aom_scale/yv12config.h
+++ b/third_party/aom/aom_scale/yv12config.h
@@ -81,7 +81,7 @@ typedef struct yv12_buffer_config {
 
   // Indicate whether y_buffer, u_buffer, and v_buffer points to the internally
   // allocated memory or external buffers.
-  int use_external_refernce_buffers;
+  int use_external_reference_buffers;
   // This is needed to store y_buffer, u_buffer, and v_buffer when set reference
   // uses an external refernece, and restore those buffer pointers after the
   // external reference frame is no longer used.
diff --git a/third_party/aom/aom_util/aom_thread.h b/third_party/aom/aom_util/aom_thread.h
index 3b22ac70c..fdb724d0c 100644
--- a/third_party/aom/aom_util/aom_thread.h
+++ b/third_party/aom/aom_util/aom_thread.h
@@ -369,7 +369,8 @@ typedef enum {
 } AVxWorkerStatus;
 
 // Function to be called by the worker thread. Takes two opaque pointers as
-// arguments (data1 and data2), and should return false in case of error.
+// arguments (data1 and data2). Should return true on success and return false
+// in case of error.
 typedef int (*AVxWorkerHook)(void *, void *);
 
 // Platform-dependent implementation details for the worker.
@@ -382,7 +383,7 @@ typedef struct {
   AVxWorkerHook hook;  // hook to call
   void *data1;         // first argument passed to 'hook'
   void *data2;         // second argument passed to 'hook'
-  int had_error;       // return value of the last call to 'hook'
+  int had_error;       // true if a call to 'hook' returned false
 } AVxWorker;
 
 // The interface for all thread-worker related functions. All these functions
diff --git a/third_party/aom/apps/aomdec.c b/third_party/aom/apps/aomdec.c
index 6c4d724a4..48952586f 100644
--- a/third_party/aom/apps/aomdec.c
+++ b/third_party/aom/apps/aomdec.c
@@ -83,6 +83,8 @@ static const arg_def_t outputfile =
     ARG_DEF("o", "output", 1, "Output file name pattern (see below)");
 static const arg_def_t threadsarg =
     ARG_DEF("t", "threads", 1, "Max threads to use");
+static const arg_def_t rowmtarg =
+    ARG_DEF(NULL, "row-mt", 1, "Enable row based multi-threading");
 static const arg_def_t verbosearg =
     ARG_DEF("v", "verbose", 0, "Show version string");
 static const arg_def_t scalearg =
@@ -114,12 +116,12 @@ static const arg_def_t outallarg = ARG_DEF(
     NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream");
 
 static const arg_def_t *all_args[] = {
-  &help,           &codecarg,   &use_yv12,    &use_i420,      &flipuvarg,
-  &rawvideo,       &noblitarg,  &progressarg, &limitarg,      &skiparg,
-  &postprocarg,    &summaryarg, &outputfile,  &threadsarg,    &verbosearg,
-  &scalearg,       &fb_arg,     &md5arg,      &framestatsarg, &continuearg,
-  &outbitdeptharg, &tilem,      &tiler,       &tilec,         &isannexb,
-  &oppointarg,     &outallarg,  NULL
+  &help,        &codecarg,       &use_yv12,    &use_i420,   &flipuvarg,
+  &rawvideo,    &noblitarg,      &progressarg, &limitarg,   &skiparg,
+  &postprocarg, &summaryarg,     &outputfile,  &threadsarg, &rowmtarg,
+  &verbosearg,  &scalearg,       &fb_arg,      &md5arg,     &framestatsarg,
+  &continuearg, &outbitdeptharg, &tilem,       &tiler,      &tilec,
+  &isannexb,    &oppointarg,     &outallarg,   NULL
 };
 
 #if CONFIG_LIBYUV
@@ -512,6 +514,7 @@ static int main_loop(int argc, const char **argv_) {
   int do_scale = 0;
   int operating_point = 0;
   int output_all_layers = 0;
+  unsigned int row_mt = 0;
   aom_image_t *scaled_img = NULL;
   aom_image_t *img_shifted = NULL;
   int frame_avail, got_data, flush_decoder = 0;
@@ -601,6 +604,15 @@ static int main_loop(int argc, const char **argv_) {
       summary = 1;
     } else if (arg_match(&arg, &threadsarg, argi)) {
       cfg.threads = arg_parse_uint(&arg);
+#if !CONFIG_MULTITHREAD
+      if (cfg.threads > 1) {
+        die("Error: --threads=%d is not supported when CONFIG_MULTITHREAD = "
+            "0.\n",
+            cfg.threads);
+      }
+#endif
+    } else if (arg_match(&arg, &rowmtarg, argi)) {
+      row_mt = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &verbosearg, argi)) {
       quiet = 0;
     } else if (arg_match(&arg, &scalearg, argi)) {
@@ -763,6 +775,11 @@ static int main_loop(int argc, const char **argv_) {
             aom_codec_error(&decoder));
     goto fail;
   }
+
+  if (aom_codec_control(&decoder, AV1D_SET_ROW_MT, row_mt)) {
+    fprintf(stderr, "Failed to set row_mt: %s\n", aom_codec_error(&decoder));
+    goto fail;
+  }
 #endif
 
   if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
@@ -910,9 +927,8 @@ static int main_loop(int argc, const char **argv_) {
         // Shift up or down if necessary
         if (output_bit_depth != 0) {
           const aom_img_fmt_t shifted_fmt =
-              output_bit_depth == 8
-                  ? img->fmt ^ (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)
-                  : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH;
+              output_bit_depth == 8 ? img->fmt & ~AOM_IMG_FMT_HIGHBITDEPTH
+                                    : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH;
 
           if (shifted_fmt != img->fmt || output_bit_depth != img->bit_depth) {
             if (img_shifted &&
diff --git a/third_party/aom/apps/aomenc.c b/third_party/aom/apps/aomenc.c
index db0910220..31cb662e4 100644
--- a/third_party/aom/apps/aomenc.c
+++ b/third_party/aom/apps/aomenc.c
@@ -475,6 +475,13 @@ static const arg_def_t film_grain_test =
 static const arg_def_t film_grain_table =
     ARG_DEF(NULL, "film-grain-table", 1,
             "Path to file containing film grain parameters");
+#if CONFIG_DENOISE
+static const arg_def_t denoise_noise_level =
+    ARG_DEF(NULL, "denoise-noise-level", 1,
+            "Amount of noise (from 0 = don't denoise, to 50)");
+static const arg_def_t denoise_block_size =
+    ARG_DEF(NULL, "denoise-block-size", 1, "Denoise block size (default = 32)");
+#endif
 static const arg_def_t enable_ref_frame_mvs =
     ARG_DEF(NULL, "enable-ref-frame-mvs", 1,
             "Enable temporal mv prediction (default is 1)");
@@ -656,6 +663,10 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
                                        &timing_info,
                                        &film_grain_test,
                                        &film_grain_table,
+#if CONFIG_DENOISE
+                                       &denoise_noise_level,
+                                       &denoise_block_size,
+#endif
                                        &enable_ref_frame_mvs,
                                        &bitdeptharg,
                                        &inbitdeptharg,
@@ -708,6 +719,10 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                         AV1E_SET_TIMING_INFO_TYPE,
                                         AV1E_SET_FILM_GRAIN_TEST_VECTOR,
                                         AV1E_SET_FILM_GRAIN_TABLE,
+#if CONFIG_DENOISE
+                                        AV1E_SET_DENOISE_NOISE_LEVEL,
+                                        AV1E_SET_DENOISE_BLOCK_SIZE,
+#endif
                                         AV1E_SET_ENABLE_REF_FRAME_MVS,
                                         AV1E_SET_ENABLE_DF,
                                         AV1E_SET_ENABLE_ORDER_HINT,
diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
index 1c7f937e1..4c4f542fe 100644
--- a/third_party/aom/av1/av1.cmake
+++ b/third_party/aom/av1/av1.cmake
@@ -45,7 +45,6 @@ list(APPEND AOM_AV1_COMMON_SOURCES
             "${AOM_ROOT}/av1/common/entropymv.c"
             "${AOM_ROOT}/av1/common/entropymv.h"
             "${AOM_ROOT}/av1/common/enums.h"
-            "${AOM_ROOT}/av1/common/filter.c"
             "${AOM_ROOT}/av1/common/filter.h"
             "${AOM_ROOT}/av1/common/frame_buffers.c"
             "${AOM_ROOT}/av1/common/frame_buffers.h"
@@ -274,7 +273,10 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
 list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
-            "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c")
+            "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
+            "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
             "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
@@ -296,7 +298,9 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON
             "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
             "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
             "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c"
-            "${AOM_ROOT}/av1/common/arm/intrapred_neon.c"
+            "${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
+            "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
+            "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
             "${AOM_ROOT}/av1/common/cdef_block_neon.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c
index 9d5414c1e..3bc4804c9 100644
--- a/third_party/aom/av1/av1_cx_iface.c
+++ b/third_party/aom/av1/av1_cx_iface.c
@@ -94,6 +94,10 @@ struct av1_extracfg {
   int enable_warped_motion;  // sequence level
   int allow_warped_motion;   // frame level
   int enable_superres;
+#if CONFIG_DENOISE
+  float noise_level;
+  int noise_block_size;
+#endif
 };
 
 static struct av1_extracfg default_extra_cfg = {
@@ -160,6 +164,10 @@ static struct av1_extracfg default_extra_cfg = {
   1,                            // enable_warped_motion at sequence level
   1,                            // allow_warped_motion at frame level
   1,                            // superres
+#if CONFIG_DENOISE
+  0,   // noise_level
+  32,  // noise_block_size
+#endif
 };
 
 struct aom_codec_alg_priv {
@@ -464,7 +472,7 @@ static aom_codec_err_t set_encoder_config(
     oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num;
     oxcf->timing_info.equal_picture_interval = 0;
     oxcf->decoder_model_info_present_flag = 1;
-    oxcf->buffer_removal_delay_present = 1;
+    oxcf->buffer_removal_time_present = 1;
     oxcf->display_model_info_present_flag = 1;
   }
   if (oxcf->init_framerate > 180) {
@@ -612,6 +620,10 @@ static aom_codec_err_t set_encoder_config(
     oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector;
     oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename;
   }
+#if CONFIG_DENOISE
+  oxcf->noise_level = extra_cfg->noise_level;
+  oxcf->noise_block_size = extra_cfg->noise_block_size;
+#endif
   oxcf->large_scale_tile = cfg->large_scale_tile;
   oxcf->single_tile_decoding =
       (oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0;
@@ -710,7 +722,7 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
     ctx->cfg = *cfg;
     set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
     // On profile change, request a key frame
-    force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
+    force_key |= ctx->cpi->common.seq_params.profile != ctx->oxcf.profile;
     av1_change_config(ctx->cpi, &ctx->oxcf);
   }
 
@@ -1055,6 +1067,23 @@ static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+#if CONFIG_DENOISE
+static aom_codec_err_t ctrl_set_denoise_noise_level(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.noise_level =
+      ((float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args)) / 10.0f;
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_denoise_block_size(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif
+
 static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx,
                                             va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1119,7 +1148,7 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx,
     }
 
     priv->extra_cfg = default_extra_cfg;
-    once(av1_initialize_enc);
+    aom_once(av1_initialize_enc);
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
 
@@ -1200,6 +1229,9 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
 
   volatile aom_enc_frame_flags_t flags = enc_flags;
 
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
   if (setjmp(cpi->common.error.jmp)) {
     cpi->common.error.setjmp = 0;
     res = update_error_state(ctx, &cpi->common.error);
@@ -1259,7 +1291,6 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
       if (cx_data_sz < ctx->cx_data_sz / 2) {
         aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
                            "Compressed data buffer too small");
-        return AOM_CODEC_ERROR;
       }
     }
 
@@ -1275,8 +1306,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
                                          !img, timebase)) {
       if (cpi->common.seq_params.frame_id_numbers_present_flag) {
         if (cpi->common.invalid_delta_frame_id_minus_1) {
-          ctx->base.err_detail = "Invalid delta_frame_id_minus_1";
-          return AOM_CODEC_ERROR;
+          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+                             "Invalid delta_frame_id_minus_1");
         }
       }
       cpi->seq_params_locked = 1;
@@ -1305,7 +1336,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
           // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
           if (write_uleb_obu_size(obu_header_size, obu_payload_size,
                                   ctx->pending_cx_data) != AOM_CODEC_OK) {
-            return AOM_CODEC_ERROR;
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
           }
 
           frame_size += obu_header_size + obu_payload_size + length_field_size;
@@ -1315,7 +1346,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
           size_t curr_frame_size = frame_size;
           if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) !=
               AOM_CODEC_OK) {
-            return AOM_CODEC_ERROR;
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
           }
           frame_size = curr_frame_size;
 
@@ -1327,7 +1358,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
           }
           if (write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
               AOM_CODEC_OK) {
-            return AOM_CODEC_ERROR;
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
           }
           frame_size += length_field_size;
         }
@@ -1358,7 +1389,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
         }
         if (write_uleb_obu_size(0, (uint32_t)tu_size, ctx->pending_cx_data) !=
             AOM_CODEC_OK) {
-          return AOM_CODEC_ERROR;
+          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
         }
         ctx->pending_cx_data_sz += length_field_size;
       }
@@ -1710,6 +1741,10 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
   { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
   { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
+#if CONFIG_DENOISE
+  { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level },
+  { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
+#endif  // CONFIG_DENOISE
   { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
 
   // Getters
@@ -1728,7 +1763,7 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
     {
         // NOLINT
         0,  // g_usage
-        8,  // g_threads
+        0,  // g_threads
         0,  // g_profile
 
         320,         // g_width
@@ -1810,7 +1845,7 @@ CODEC_INTERFACE(aom_codec_av1_cx) = {
       NULL,  // aom_codec_peek_si_fn_t
       NULL,  // aom_codec_get_si_fn_t
       NULL,  // aom_codec_decode_fn_t
-      NULL,  // aom_codec_frame_get_fn_t
+      NULL,  // aom_codec_get_frame_fn_t
       NULL   // aom_codec_set_fb_fn_t
   },
   {
diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c
index db338f7e3..f42572019 100644
--- a/third_party/aom/av1/av1_dx_iface.c
+++ b/third_party/aom/av1/av1_dx_iface.c
@@ -50,6 +50,7 @@ struct aom_codec_alg_priv {
   int decode_tile_col;
   unsigned int tile_mode;
   unsigned int ext_tile_debug;
+  unsigned int row_mt;
   EXTERNAL_REFERENCES ext_refs;
   unsigned int is_annexb;
   int operating_point;
@@ -61,7 +62,7 @@ struct aom_codec_alg_priv {
   int last_submit_worker_id;
   int next_output_worker_id;
   int available_threads;
-  aom_image_t *image_with_grain;
+  aom_image_t *image_with_grain[MAX_NUM_SPATIAL_LAYERS];
   int need_resync;  // wait for key/intra-only frame
   // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
   BufferPool *buffer_pool;
@@ -101,7 +102,7 @@ static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx,
       // default values
       priv->cfg.cfg.ext_partition = 1;
     }
-    priv->image_with_grain = NULL;
+    av1_zero(priv->image_with_grain);
   }
 
   return AOM_CODEC_OK;
@@ -139,7 +140,9 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
 
   aom_free(ctx->frame_workers);
   aom_free(ctx->buffer_pool);
-  if (ctx->image_with_grain) aom_img_free(ctx->image_with_grain);
+  for (int i = 0; i < MAX_NUM_SPATIAL_LAYERS; i++) {
+    if (ctx->image_with_grain[i]) aom_img_free(ctx->image_with_grain[i]);
+  }
   aom_free(ctx);
   return AOM_CODEC_OK;
 }
@@ -339,16 +342,16 @@ static int frame_worker_hook(void *arg1, void *arg2) {
   const uint8_t *data = frame_worker_data->data;
   (void)arg2;
 
-  frame_worker_data->result = av1_receive_compressed_data(
-      frame_worker_data->pbi, frame_worker_data->data_size, &data);
+  int result = av1_receive_compressed_data(frame_worker_data->pbi,
+                                           frame_worker_data->data_size, &data);
   frame_worker_data->data_end = data;
 
-  if (frame_worker_data->result != 0) {
+  if (result != 0) {
     // Check decode result in serial decode.
     frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
     frame_worker_data->pbi->need_resync = 1;
   }
-  return !frame_worker_data->result;
+  return !result;
 }
 
 static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
@@ -429,6 +432,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
     frame_worker_data->pbi->operating_point = ctx->operating_point;
     frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
     frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+    frame_worker_data->pbi->row_mt = ctx->row_mt;
 
     worker->hook = (AVxWorkerHook)frame_worker_hook;
     if (!winterface->reset(worker)) {
@@ -489,6 +493,7 @@ static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
   frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
   frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
   frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+  frame_worker_data->pbi->row_mt = ctx->row_mt;
   frame_worker_data->pbi->ext_refs = ctx->ext_refs;
 
   frame_worker_data->pbi->common.is_annexb = ctx->is_annexb;
@@ -592,21 +597,31 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
   return res;
 }
 
-aom_image_t *add_grain_if_needed(aom_image_t *img, aom_image_t *grain_img_buf,
-                                 aom_film_grain_t *grain_params) {
+// Returns img if grain_params->apply_grain is false. Otherwise adds film grain
+// to img, storing the result in *grain_img_ptr (allocated or reallocated as
+// needed), and returns *grain_img_ptr, or NULL if that allocation fails.
+static aom_image_t *add_grain_if_needed(aom_image_t *img,
+                                        aom_image_t **grain_img_ptr,
+                                        aom_film_grain_t *grain_params) {
   if (!grain_params->apply_grain) return img;
 
-  if (grain_img_buf &&
-      (img->d_w != grain_img_buf->d_w || img->d_h != grain_img_buf->d_h ||
-       img->fmt != grain_img_buf->fmt || !(img->d_h % 2) || !(img->d_w % 2))) {
-    aom_img_free(grain_img_buf);
-    grain_img_buf = NULL;
+  aom_image_t *grain_img_buf = *grain_img_ptr;
+
+  const int w_even = ALIGN_POWER_OF_TWO(img->d_w, 1);
+  const int h_even = ALIGN_POWER_OF_TWO(img->d_h, 1);
+
+  if (grain_img_buf) {
+    const int alloc_w = ALIGN_POWER_OF_TWO(grain_img_buf->d_w, 1);
+    const int alloc_h = ALIGN_POWER_OF_TWO(grain_img_buf->d_h, 1);
+    if (w_even != alloc_w || h_even != alloc_h ||
+        img->fmt != grain_img_buf->fmt) {
+      aom_img_free(grain_img_buf);
+      grain_img_buf = NULL;
+    }
   }
   if (!grain_img_buf) {
-    int w_even = img->d_w % 2 ? img->d_w + 1 : img->d_w;
-    int h_even = img->d_h % 2 ? img->d_h + 1 : img->d_h;
     grain_img_buf = aom_img_alloc(NULL, img->fmt, w_even, h_even, 16);
-    grain_img_buf->bit_depth = img->bit_depth;
+    if ((*grain_img_ptr = grain_img_buf) == NULL) return NULL;  // alloc failed
   }
 
   av1_add_film_grain(grain_params, img, grain_img_buf);
@@ -649,8 +664,6 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
         aom_film_grain_t *grain_params;
         if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
                               &grain_params) == 0) {
-          *index += 1;  // Advance the iterator to point to the next image
-
           AV1Decoder *const pbi = frame_worker_data->pbi;
           AV1_COMMON *const cm = &pbi->common;
           RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
@@ -659,6 +672,7 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
           yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
 
           if (!pbi->ext_tile_debug && cm->large_scale_tile) {
+            *index += 1;  // Advance the iterator to point to the next image
             img = &ctx->img;
             img->img_data = pbi->tile_list_output;
             img->sz = pbi->tile_list_size;
@@ -688,11 +702,14 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
             const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1);
             const int mi_col = tile_col * cm->tile_width;
             const int ssx = ctx->img.x_chroma_shift;
+            const int is_hbd =
+                (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
             int plane;
-            ctx->img.planes[0] += mi_col * MI_SIZE;
+            ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);
             if (num_planes > 1) {
               for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-                ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx);
+                ctx->img.planes[plane] +=
+                    mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
               }
             }
             ctx->img.d_w =
@@ -703,7 +720,10 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
           img = &ctx->img;
           img->temporal_id = cm->temporal_layer_id;
           img->spatial_id = cm->spatial_layer_id;
-          return add_grain_if_needed(img, ctx->image_with_grain, grain_params);
+          aom_image_t *res = add_grain_if_needed(
+              img, &ctx->image_with_grain[*index], grain_params);
+          *index += 1;  // Advance the iterator to point to the next image
+          return res;
         }
       } else {
         // Decoding failed. Release the worker thread.
@@ -999,7 +1019,7 @@ static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
-      *bit_depth = cm->bit_depth;
+      *bit_depth = cm->seq_params.bit_depth;
       return AOM_CODEC_OK;
     } else {
       return AOM_CODEC_ERROR;
@@ -1009,6 +1029,64 @@ static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
   return AOM_CODEC_INVALID_PARAM;
 }
 
+static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y,
+                                    int use_highbitdepth) {
+  aom_img_fmt_t fmt = 0;
+
+  if (subsampling_x == 0 && subsampling_y == 0)
+    fmt = AOM_IMG_FMT_I444;
+  else if (subsampling_x == 1 && subsampling_y == 0)
+    fmt = AOM_IMG_FMT_I422;
+  else if (subsampling_x == 1 && subsampling_y == 1)
+    fmt = AOM_IMG_FMT_I420;
+
+  if (use_highbitdepth) fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  return fmt;
+}
+
+static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_img_fmt_t *const img_fmt = va_arg(args, aom_img_fmt_t *);
+  AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+
+  if (img_fmt) {
+    if (worker) {
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+
+      *img_fmt = get_img_format(cm->seq_params.subsampling_x,
+                                cm->seq_params.subsampling_y,
+                                cm->seq_params.use_highbitdepth);
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  unsigned int *const tile_size = va_arg(args, unsigned int *);
+  AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+
+  if (tile_size) {
+    if (worker) {
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+      *tile_size =
+          ((cm->tile_width * MI_SIZE) << 16) + cm->tile_height * MI_SIZE;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
 static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx,
                                                   va_list args) {
   ctx->invert_tile_order = va_arg(args, int);
@@ -1124,6 +1202,12 @@ static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx,
   return AOM_CODEC_OK;
 }
 
+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  ctx->row_mt = va_arg(args, unsigned int);
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { AV1_COPY_REFERENCE, ctrl_copy_reference },
 
@@ -1145,6 +1229,7 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers },
   { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback },
   { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug },
+  { AV1D_SET_ROW_MT, ctrl_set_row_mt },
   { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr },
 
   // Getters
@@ -1152,6 +1237,8 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer },
   { AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
   { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth },
+  { AV1D_GET_IMG_FORMAT, ctrl_get_img_format },
+  { AV1D_GET_TILE_SIZE, ctrl_get_tile_size },
   { AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size },
   { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size },
   { AV1_GET_ACCOUNTING, ctrl_get_accounting },
@@ -1180,7 +1267,7 @@ CODEC_INTERFACE(aom_codec_av1_dx) = {
       decoder_peek_si,    // aom_codec_peek_si_fn_t
       decoder_get_si,     // aom_codec_get_si_fn_t
       decoder_decode,     // aom_codec_decode_fn_t
-      decoder_get_frame,  // aom_codec_frame_get_fn_t
+      decoder_get_frame,  // aom_codec_get_frame_fn_t
       decoder_set_fb_fn,  // aom_codec_set_fb_fn_t
   },
   {
diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c
index 49902cc7d..1bf81c91d 100644
--- a/third_party/aom/av1/common/alloccommon.c
+++ b/third_party/aom/av1/common/alloccommon.c
@@ -137,11 +137,11 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
   // Now we need to allocate enough space to store the line buffers for the
   // stripes
   const int frame_w = cm->superres_upscaled_width;
-  const int use_highbd = cm->use_highbitdepth ? 1 : 0;
+  const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
 
   for (int p = 0; p < num_planes; ++p) {
     const int is_uv = p > 0;
-    const int ss_x = is_uv && cm->subsampling_x;
+    const int ss_x = is_uv && cm->seq_params.subsampling_x;
     const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
     const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
     const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
new file mode 100644
index 000000000..51c991498
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@@ -0,0 +1,844 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/arm/av1_inv_txfm_neon.h"
+
+static INLINE TxSetType find_TxSetType(TX_SIZE tx_size) {
+  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
+  TxSetType tx_set_type;
+  if (tx_size_sqr_up > TX_32X32) {
+    tx_set_type = EXT_TX_SET_DCTONLY;
+  } else if (tx_size_sqr_up == TX_32X32) {
+    tx_set_type = EXT_TX_SET_DCT_IDTX;
+  } else {
+    tx_set_type = EXT_TX_SET_ALL16;
+  }
+  return tx_set_type;
+}
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+  IDCT_1D,
+  IADST_1D,
+  IFLIPADST_1D = IADST_1D,
+  IIDENTITY_1D,
+  ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
+  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
+  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
+  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
+  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// 1D functions
+static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
+  { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c },
+  { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c },
+  { av1_idct16_new, av1_iadst16_new, av1_iidentity16_c },
+  { av1_idct32_new, NULL, NULL },
+  { av1_idct64_new, NULL, NULL },
+};
+
+// 1D functions for blocks whose eob is at DC or whose nonzero coefficients
+// lie within the top-left 8x8, 16x16 or 32x32 corner (four entries per type).
+static const transform_1d_neon
+    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { av1_idct4_new, av1_idct4_new, NULL, NULL },
+          { av1_iadst4_new, av1_iadst4_new, NULL, NULL },
+          { av1_iidentity4_c, av1_iidentity4_c, NULL, NULL },
+      },
+      { { av1_idct8_new, av1_idct8_new, NULL, NULL },
+        { av1_iadst8_new, av1_iadst8_new, NULL, NULL },
+        { av1_iidentity8_c, av1_iidentity8_c, NULL, NULL } },
+      {
+          { av1_idct16_new, av1_idct16_new, av1_idct16_new, NULL },
+          { av1_iadst16_new, av1_iadst16_new, av1_iadst16_new, NULL },
+          { av1_iidentity16_c, av1_iidentity16_c, av1_iidentity16_c, NULL },
+      },
+      { { av1_idct32_new, av1_idct32_new, av1_idct32_new, av1_idct32_new },
+        { NULL, NULL, NULL, NULL },
+        { av1_iidentity32_c, av1_iidentity32_c, av1_iidentity32_c,
+          av1_iidentity32_c } },
+      { { av1_idct64_new, av1_idct64_new, av1_idct64_new, av1_idct64_new },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
+                                                  uint8_t *output, int stride,
+                                                  TX_TYPE tx_type,
+                                                  TX_SIZE tx_size, int eob) {
+  DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+  int32_t *temp_in = txfm_buf;
+
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  // row tx
+  int row_start = (buf_size_nonzero_h_div8 * 8);
+  for (int i = 0; i < row_start; i++) {
+    if (abs(rect_type) == 1) {
+      for (int j = 0; j < txfm_size_col; j++)
+        temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+      row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    } else {
+      row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  // Doing memset for the rows which are not processed in row transform.
+  memset(buf_ptr, 0,
+         sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+  // col tx
+  for (int c = 0; c < txfm_size_col; c++) {
+    for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c];
+
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    for (r = 0; r < txfm_size_row; ++r) {
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+    }
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+  int32_t *temp_in = txfm_buf;
+
+  int eobx, eoby;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // row tx
+  int row_start = (buf_size_nonzero_h_div8 * 8);
+  for (int i = 0; i < row_start; i++) {
+    if (abs(rect_type) == 1) {
+      for (int j = 0; j < txfm_size_col; j++)
+        temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+      row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    } else {
+      row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+  // Doing memset for the rows which are not processed in row transform.
+  memset(buf_ptr, 0,
+         sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+  // col tx
+  for (int c = 0; c < txfm_size_col; c++) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+  int32_t *temp_in = txfm_buf;
+
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // row tx
+  int row_start = (buf_size_nonzero_h_div8 * 8);
+  for (int i = 0; i < row_start; i++) {
+    if (abs(rect_type) == 1) {
+      for (int j = 0; j < txfm_size_col; j++)
+        temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+      row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    } else {
+      row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+  // Doing memset for the rows which are not processed in row transform.
+  memset(buf_ptr, 0,
+         sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+  // col tx
+  for (int c = 0; c < txfm_size_col; c++) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
+                                                 uint8_t *output, int stride,
+                                                 TX_TYPE tx_type,
+                                                 TX_SIZE tx_size, int eob) {
+  (void)eob;
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < txfm_size_row; i++) {
+    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  for (int c = 0; c < txfm_size_col; ++c) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  (void)eob;
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < txfm_size_row; i++) {
+    for (int j = 0; j < txfm_size_col; j++)
+      temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+
+    row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  for (int c = 0; c < txfm_size_col; ++c) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  (void)eob;
+  DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < txfm_size_row; i++) {
+    for (int j = 0; j < txfm_size_col; j++)
+      temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+
+    row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  for (int c = 0; c < txfm_size_col; ++c) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+
+void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size, int eob) {
+  (void)eob;
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < txfm_size_row; i++) {
+    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  for (int c = 0; c < txfm_size_col; ++c) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+// Inverse 2D txfm for the 16x4 case: row pass, column pass, add to output.
+void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size, int eob) {
+  (void)eob;  // eob is not used on this path.
+  // Scratch layout: temp_in | temp_out | buf, each buf_offset ints apart.
+  DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
+  int32_t *temp_in = txfm_buf;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  int r, bd = 8;
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // Row pass: transform each input row into buf, then round by -shift[0].
+  for (int i = 0; i < txfm_size_row; i++) {
+    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+  // Column pass: gather one column (mirrored if lr_flip), transform, round.
+  for (int c = 0; c < txfm_size_col; ++c) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+    // Accumulate the column into output via highbd_clip_pixel_add (bd = 8).
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+// Inverse 2D txfm + add (no identity 1D); eob limits the rows transformed.
+static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
+  int32_t *temp_in = txfm_buf;
+  // eobx/eoby (from eob) pick the 1D kernel variants and the row-pass extent.
+  int eobx, eoby, ud_flip, lr_flip, row_start;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // in units of 8 rows
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
+  int32_t *buf_ptr = buf;
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int bd = 8;
+  int r;
+  // 1D kernels are chosen by size, tx_type and the eobx/eoby-derived index.
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  row_start = (buf_size_nonzero_h_div8 << 3);  // rows given to row_txfm
+  // Row pass over the first row_start rows only.
+  for (int i = 0; i < row_start; i++) {
+    if (abs(rect_type) == 1) {  // pre-scale the row by NewInvSqrt2
+      for (int j = 0; j < txfm_size_col; j++)
+        temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+      row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    } else {
+      row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    buf_ptr += txfm_size_col;
+  }
+
+  // Doing memset for the rows which are not processed in row transform.
+  memset(buf_ptr, 0,
+         sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+  // Column pass: gather column (mirrored if lr_flip), transform, round, add.
+  for (int c = 0; c < txfm_size_col; c++) {
+    if (lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size_row; ++r)
+        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+    }
+    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+    if (ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] =
+            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+      }
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size_row; ++r) {
+        output[r * stride + c] = highbd_clip_pixel_add(
+            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+      }
+    }
+  }
+}
+// Dispatch on tx_type to the idtx / v_identity / h_identity / general path.
+static INLINE void lowbd_inv_txfm2d_add_universe_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  switch (tx_type) {
+    case IDTX:
+      lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
+                                           tx_size, eob);
+      break;
+
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
+                                           tx_size, eob);
+      break;
+
+    default:  // every remaining tx_type takes the no-identity path
+      lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
+                                            tx_size, eob);
+      break;
+  }
+}
+void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  int row;  // row index for the 64-wide repacking loops
+  switch (tx_size) {  // the listed small sizes use dedicated kernels
+    case TX_4X4:
+      lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_4X8:
+      lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_8X4:
+      lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_4X16:
+      lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+
+    case TX_16X4:
+      lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+
+    case TX_16X64: {
+      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+    } break;
+    // 64-wide: each input row has 32 coeffs; widen to 64 with a zeroed tail.
+    case TX_64X16: {
+      int32_t mod_input[64 * 16];
+      for (row = 0; row < 16; ++row) {
+        memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+        memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+      }
+      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
+                                         tx_size, eob);
+    } break;
+
+    case TX_32X64: {
+      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+    } break;
+
+    case TX_64X32: {
+      int32_t mod_input[64 * 32];
+      for (row = 0; row < 32; ++row) {
+        memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+        memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+      }
+      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
+                                         tx_size, eob);
+    } break;
+    // NOTE(review): only 32 of the 64 rows are filled; the rest stay unset.
+    case TX_64X64: {
+      int32_t mod_input[64 * 64];
+      for (row = 0; row < 32; ++row) {
+        memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+        memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+      }
+      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
+                                         tx_size, eob);
+    } break;
+
+    default:
+      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+      break;
+  }
+}
+void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                           const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  if (!txfm_param->lossless) {  // not lossless: use the NEON 2D path
+    av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type,
+                                  txfm_param->tx_size, txfm_param->eob);
+  } else {  // lossless: defer to the C implementation
+    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
new file mode 100644
index 000000000..6af2d61e7
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#define AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+// Signature of the 1D transform kernels selected through the lookup tables.
+typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
+                                  const int8_t cos_bit,
+                                  const int8_t *stage_ptr);
+// eob_row -> packed extents: low byte = eobx, high byte = eoby (per size).
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x16_default[16]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x32_default[32]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+  0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x32_default[32]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x16_default[16]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+  0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+  0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+// Table pointer per TX_SIZE; NULL slots have no table and must not be used.
+DECLARE_ALIGNED(16, static const int16_t *,
+                av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+  NULL,
+  av1_eob_to_eobxy_8x8_default,
+  av1_eob_to_eobxy_16x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x16_default,
+  av1_eob_to_eobxy_16x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x32_default,
+  av1_eob_to_eobxy_32x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+};
+// Maps eobx/eoby (0..31) to the 1D kernel-variant index 0..3.
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+  0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// log2 of the block width used for eob math, per TX_SIZE (64 is mapped to 32).
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+  2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+// Read-only map: index -> 0, 7, 15 or 31 (last position of its group).
+static const int eob_fill[32] = {
+  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
+  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+// Derives eobx/eoby from eob through the per-size av1_eob_to_eobxy tables.
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size, int eob) {
+  if (eob == 1) {  // single coefficient: both extents are zero
+    *eobx = 0;
+    *eoby = 0;
+    return;
+  }
+  // The row holding coefficient (eob - 1) selects the packed table entry.
+  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+  const int eob_row = (eob - 1) >> tx_w_log2;
+  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
+  *eobx = eobxy & 0xFF;
+  *eoby = eobxy >> 8;
+}
+// eobx/eoby for the v_identity path; eoby is capped at min(32, height) - 1.
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  eob -= 1;  // NOTE(review): presumably callers pass eob >= 1 - confirm
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
+  *eobx = eob / (eoby_max + 1);
+  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
+}
+// eobx/eoby for the h_identity path; eobx is capped at min(32, width) - 1.
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  eob -= 1;  // NOTE(review): presumably callers pass eob >= 1 - confirm
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
+  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
+  const int temp_eoby = eob / (eobx_max + 1);
+  assert(temp_eoby < 32);  // eob_fill has 32 entries
+  *eoby = eob_fill[temp_eoby];
+}
+
+#endif  // AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
index 86a25e109..f15744c94 100644
--- a/third_party/aom/av1/common/arm/convolve_neon.c
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -164,8 +164,8 @@ static INLINE uint8x8_t convolve8_vert_8x4_s32(
 
 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                            InterpFilterParams *filter_params_x,
-                            InterpFilterParams *filter_params_y,
+                            const InterpFilterParams *filter_params_x,
+                            const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
   const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
@@ -182,7 +182,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
 
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
 
   const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
   const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
@@ -485,8 +485,8 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
 
 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                            InterpFilterParams *filter_params_x,
-                            InterpFilterParams *filter_params_y,
+                            const InterpFilterParams *filter_params_x,
+                            const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
   const int vert_offset = filter_params_y->taps / 2 - 1;
@@ -502,7 +502,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
          ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
 
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
 
   if (w <= 4) {
     uint8x8_t d01, d23;
@@ -680,8 +680,8 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
 
 void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params) {
   int im_dst_stride;
@@ -711,7 +711,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
   const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
 
   int16_t x_filter_tmp[8];
   int16x8_t filter_x_coef = vld1q_s16(x_filter);
@@ -896,7 +896,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
     const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
                               (1 << (offset_bits - conv_params->round_1 - 1));
     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
 
     const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
     const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
@@ -1086,8 +1086,8 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
 }
 void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
-                                  InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
                                   const int subpel_x_q4, const int subpel_y_q4,
                                   ConvolveParams *conv_params) {
   (void)filter_params_x;
diff --git a/third_party/aom/av1/common/arm/intrapred_neon.c b/third_party/aom/av1/common/arm/intrapred_neon.c
deleted file mode 100644
index 799355553..000000000
--- a/third_party/aom/av1/common/arm/intrapred_neon.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- *
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void highbd_dc_predictor_neon(uint16_t *dst, ptrdiff_t stride,
-                                            int bw, const uint16_t *above,
-                                            const uint16_t *left) {
-  assert(bw >= 4);
-  assert(IS_POWER_OF_TWO(bw));
-  int expected_dc, sum = 0;
-  const int count = bw * 2;
-  uint32x4_t sum_q = vdupq_n_u32(0);
-  uint32x2_t sum_d;
-  uint16_t *dst_1;
-  if (bw >= 8) {
-    for (int i = 0; i < bw; i += 8) {
-      sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
-      sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
-      above += 8;
-      left += 8;
-    }
-    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
-    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
-    expected_dc = (sum + (count >> 1)) / count;
-    const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
-    for (int r = 0; r < bw; r++) {
-      dst_1 = dst;
-      for (int i = 0; i < bw; i += 8) {
-        vst1q_u16(dst_1, dc);
-        dst_1 += 8;
-      }
-      dst += stride;
-    }
-  } else {  // 4x4
-    sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
-    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
-    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
-    expected_dc = (sum + (count >> 1)) / count;
-    const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
-    for (int r = 0; r < bw; r++) {
-      vst1_u16(dst, dc);
-      dst += stride;
-    }
-  }
-}
-
-#define intra_pred_highbd_sized(type, width)                         \
-  void aom_highbd_##type##_predictor_##width##x##width##_neon(       \
-      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,        \
-      const uint16_t *left, int bd) {                                \
-    (void)bd;                                                        \
-    highbd_##type##_predictor_neon(dst, stride, width, above, left); \
-  }
-
-#define intra_pred_square(type)      \
-  intra_pred_highbd_sized(type, 4);  \
-  intra_pred_highbd_sized(type, 8);  \
-  intra_pred_highbd_sized(type, 16); \
-  intra_pred_highbd_sized(type, 32); \
-  intra_pred_highbd_sized(type, 64);
-
-intra_pred_square(dc);
-
-#undef intra_pred_square
diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
index 992be4a9e..4015082b4 100644
--- a/third_party/aom/av1/common/arm/jnt_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
@@ -515,8 +515,8 @@ static INLINE void jnt_convolve_2d_vert_neon(
 
 void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
                               int dst8_stride, int w, int h,
-                              InterpFilterParams *filter_params_x,
-                              InterpFilterParams *filter_params_y,
+                              const InterpFilterParams *filter_params_x,
+                              const InterpFilterParams *filter_params_y,
                               const int subpel_x_q4, const int subpel_y_q4,
                               ConvolveParams *conv_params) {
   assert(!(w % 4));
@@ -532,9 +532,9 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
   const int round_0 = conv_params->round_0 - 1;
   const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
 
   int16_t x_filter_tmp[8];
   int16x8_t filter_x_coef = vld1q_s16(x_filter);
@@ -553,8 +553,8 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
 void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
                                    uint8_t *dst8, int dst8_stride, int w, int h,
-                                   InterpFilterParams *filter_params_x,
-                                   InterpFilterParams *filter_params_y,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
                                    const int subpel_x_q4, const int subpel_y_q4,
                                    ConvolveParams *conv_params) {
   uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
@@ -679,8 +679,8 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
 
 void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
                              int dst8_stride, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params) {
   assert(!(w % 4));
@@ -705,7 +705,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
 
   const uint8_t *src_ptr = src - horiz_offset;
 
@@ -1013,8 +1013,8 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
 void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
                              int dst8_stride, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params) {
   assert(!(w % 4));
@@ -1040,7 +1040,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
 
   const uint8_t *src_ptr = src - (vert_offset * src_stride);
 
diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h
index 214b14bf7..4bf45a52c 100644
--- a/third_party/aom/av1/common/arm/mem_neon.h
+++ b/third_party/aom/av1/common/arm/mem_neon.h
@@ -22,6 +22,14 @@ static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
   s += p;
 }
 
+/* Load 4 bytes from s into 32-bit lane `lane` of *s0. Kept as a macro so
+   that `lane` stays a compile-time immediate, as the intrinsic requires. */
+#define load_u8_4x1(s, s0, lane)                                           \
+  do {                                                                     \
+    *(s0) = vreinterpret_u8_u32(                                           \
+        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
+  } while (0)
+
 static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
@@ -128,6 +136,13 @@ static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
   *s3 = vld1_s16(s);
 }
 
+/* Store 32-bit lane `lane` of s0 (4 bytes) to s. Kept as a macro so that
+   `lane` stays a compile-time immediate, as the intrinsic requires. */
+#define store_u8_4x1(s, s0, lane)                                  \
+  do {                                                             \
+    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
+  } while (0)
+
 static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                 const uint8x8_t s1, const uint8x8_t s2,
                                 const uint8x8_t s3, const uint8x8_t s4,
@@ -242,6 +257,30 @@ static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
   vst1q_s16(s, s7);
 }
 
+static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x4_t s0, const int16x4_t s1,
+                                 const int16x4_t s2, const int16x4_t s3) {
+  vst1_s16(s, s0);  // four int16x4 rows, dst_stride elements apart
+  s += dst_stride;
+  vst1_s16(s, s1);
+  s += dst_stride;
+  vst1_s16(s, s2);
+  s += dst_stride;
+  vst1_s16(s, s3);
+}
+// Stores four int16x8 rows (s0..s3) to s, dst_stride elements apart.
+static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x8_t s0, const int16x8_t s1,
+                                 const int16x8_t s2, const int16x8_t s3) {
+  vst1q_s16(s, s0);
+  s += dst_stride;
+  vst1q_s16(s, s1);
+  s += dst_stride;
+  vst1q_s16(s, s2);
+  s += dst_stride;
+  vst1q_s16(s, s3);
+}
+
 static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
@@ -398,4 +437,49 @@ static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
   *tu1 = vsetq_lane_u64(a, *tu1, 1);
 }
 
+static INLINE void load_s32_4x4(const int32_t *s, int32_t p, int32x4_t *s1,
+                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
+  *s1 = vld1q_s32(s);  // four int32x4 rows, p elements apart; s is read-only
+  s += p;
+  *s2 = vld1q_s32(s);
+  s += p;
+  *s3 = vld1q_s32(s);
+  s += p;
+  *s4 = vld1q_s32(s);
+}
+// Stores four int32x4 rows (s1..s4) to s, p elements apart.
+static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
+                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
+  vst1q_s32(s, s1);
+  s += p;
+  vst1q_s32(s, s2);
+  s += p;
+  vst1q_s32(s, s3);
+  s += p;
+  vst1q_s32(s, s4);
+}
+// Loads four uint32x4 rows from s (read-only), p elements apart.
+static INLINE void load_u32_4x4(const uint32_t *s, int32_t p, uint32x4_t *s1,
+                                uint32x4_t *s2, uint32x4_t *s3,
+                                uint32x4_t *s4) {
+  *s1 = vld1q_u32(s);
+  s += p;
+  *s2 = vld1q_u32(s);
+  s += p;
+  *s3 = vld1q_u32(s);
+  s += p;
+  *s4 = vld1q_u32(s);
+}
+// Stores four uint32x4 rows (s1..s4) to s, p elements apart.
+static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
+                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
+  vst1q_u32(s, s1);
+  s += p;
+  vst1q_u32(s, s2);
+  s += p;
+  vst1q_u32(s, s3);
+  s += p;
+  vst1q_u32(s, s4);
+}
+
 #endif  // AV1_COMMON_ARM_MEM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c
new file mode 100644
index 000000000..b4808a972
--- /dev/null
+++ b/third_party/aom/av1/common/arm/selfguided_neon.c
@@ -0,0 +1,1506 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// Constants used for right shift in final_filter calculation.
+#define NB_EVEN 5
+#define NB_ODD 4
+
+static INLINE void calc_ab_fast_internal_common(  // Shared 4x4-tile arithmetic used by calc_ab_fast_internal_lbd/_hbd; results go to src1, dst_A16 and src2.
+    uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+    uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5,
+    int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec,
+    uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec,
+    uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2,
+    const int buf_stride) {
+  uint32x4_t q0, q1, q2, q3;
+  uint32x4_t p0, p1, p2, p3;
+  uint16x4_t d0, d1, d2, d3;
+  // s0..s3 <- s0..s3 * const_n_val.
+  s0 = vmulq_u32(s0, const_n_val);
+  s1 = vmulq_u32(s1, const_n_val);
+  s2 = vmulq_u32(s2, const_n_val);
+  s3 = vmulq_u32(s3, const_n_val);
+  // q0..q3 <- s4..s7 squared.
+  q0 = vmulq_u32(s4, s4);
+  q1 = vmulq_u32(s5, s5);
+  q2 = vmulq_u32(s6, s6);
+  q3 = vmulq_u32(s7, s7);
+  // p <- max(s - q, 0): the (q <= s) mask zeroes lanes where the unsigned subtraction would wrap.
+  p0 = vcleq_u32(q0, s0);
+  p1 = vcleq_u32(q1, s1);
+  p2 = vcleq_u32(q2, s2);
+  p3 = vcleq_u32(q3, s3);
+
+  q0 = vsubq_u32(s0, q0);
+  q1 = vsubq_u32(s1, q1);
+  q2 = vsubq_u32(s2, q2);
+  q3 = vsubq_u32(s3, q3);
+
+  p0 = vandq_u32(p0, q0);
+  p1 = vandq_u32(p1, q1);
+  p2 = vandq_u32(p2, q2);
+  p3 = vandq_u32(p3, q3);
+  // Multiply by s_vec, then rounding shift right by SGRPROJ_MTABLE_BITS.
+  p0 = vmulq_u32(p0, s_vec);
+  p1 = vmulq_u32(p1, s_vec);
+  p2 = vmulq_u32(p2, s_vec);
+  p3 = vmulq_u32(p3, s_vec);
+
+  p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+  p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+  p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+  p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+  // Clamp each lane to const_val so it can be used as a table index below.
+  p0 = vminq_u32(p0, const_val);
+  p1 = vminq_u32(p1, const_val);
+  p2 = vminq_u32(p2, const_val);
+  p3 = vminq_u32(p3, const_val);
+  // The table lookup is done per element in scalar code, so spill the indices to src1 first (this overwrites the src1 tile).
+  {
+    store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+
+    for (int x = 0; x < 4; x++) {  // x indexes rows of the tile, y indexes columns
+      for (int y = 0; y < 4; y++) {
+        dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+      }
+    }
+    load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);  // reload the looked-up values as vectors
+  }
+  p0 = vsubl_u16(sgrproj_sgr, d0);  // p <- sgrproj_sgr - table value, widened to 32 bits
+  p1 = vsubl_u16(sgrproj_sgr, d1);
+  p2 = vsubl_u16(sgrproj_sgr, d2);
+  p3 = vsubl_u16(sgrproj_sgr, d3);
+  // s4..s7 <- sr4..sr7 (bits reinterpreted as unsigned) * one_by_n_minus_1_vec; the incoming s4..s7 are no longer needed.
+  s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec);
+  s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec);
+  s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec);
+  s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec);
+  // Multiply by (sgrproj_sgr - table value).
+  s4 = vmulq_u32(s4, p0);
+  s5 = vmulq_u32(s5, p1);
+  s6 = vmulq_u32(s6, p2);
+  s7 = vmulq_u32(s7, p3);
+  // Rounding shift right by SGRPROJ_RECIP_BITS.
+  p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+  p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+  p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+  p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+  // Write the 4x4 result tile to src2.
+  store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0),
+                vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+                vreinterpretq_s32_u32(p3));
+}
+static INLINE void calc_ab_internal_common(  // Shared 4-row x 8-column tile arithmetic used by calc_ab_internal_lbd/_hbd; results go to src1, dst_A16 and dst2.
+    uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+    uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0,
+    uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4,
+    uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7,
+    uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val,
+    uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1,
+    uint16_t *dst_A16, int32_t *dst2, const int buf_stride) {
+  uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+  uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7;
+  uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7;
+  // s0..s7 <- s0..s7 * const_n_val. The callers load s0..s3 from columns 0-3 and s4..s7 from columns 4-7 of the same four rows.
+  s0 = vmulq_u32(s0, const_n_val);
+  s1 = vmulq_u32(s1, const_n_val);
+  s2 = vmulq_u32(s2, const_n_val);
+  s3 = vmulq_u32(s3, const_n_val);
+  s4 = vmulq_u32(s4, const_n_val);
+  s5 = vmulq_u32(s5, const_n_val);
+  s6 = vmulq_u32(s6, const_n_val);
+  s7 = vmulq_u32(s7, const_n_val);
+  // Split s16_4..s16_7 into low (d0..d3) and high (d4..d7) halves so they pair with s0..s3 and s4..s7.
+  d0 = vget_low_u16(s16_4);
+  d1 = vget_low_u16(s16_5);
+  d2 = vget_low_u16(s16_6);
+  d3 = vget_low_u16(s16_7);
+  d4 = vget_high_u16(s16_4);
+  d5 = vget_high_u16(s16_5);
+  d6 = vget_high_u16(s16_6);
+  d7 = vget_high_u16(s16_7);
+  // q <- d squared, widened to 32 bits.
+  q0 = vmull_u16(d0, d0);
+  q1 = vmull_u16(d1, d1);
+  q2 = vmull_u16(d2, d2);
+  q3 = vmull_u16(d3, d3);
+  q4 = vmull_u16(d4, d4);
+  q5 = vmull_u16(d5, d5);
+  q6 = vmull_u16(d6, d6);
+  q7 = vmull_u16(d7, d7);
+  // p <- max(s - q, 0): the (q <= s) mask zeroes lanes where the unsigned subtraction would wrap.
+  p0 = vcleq_u32(q0, s0);
+  p1 = vcleq_u32(q1, s1);
+  p2 = vcleq_u32(q2, s2);
+  p3 = vcleq_u32(q3, s3);
+  p4 = vcleq_u32(q4, s4);
+  p5 = vcleq_u32(q5, s5);
+  p6 = vcleq_u32(q6, s6);
+  p7 = vcleq_u32(q7, s7);
+
+  q0 = vsubq_u32(s0, q0);
+  q1 = vsubq_u32(s1, q1);
+  q2 = vsubq_u32(s2, q2);
+  q3 = vsubq_u32(s3, q3);
+  q4 = vsubq_u32(s4, q4);
+  q5 = vsubq_u32(s5, q5);
+  q6 = vsubq_u32(s6, q6);
+  q7 = vsubq_u32(s7, q7);
+
+  p0 = vandq_u32(p0, q0);
+  p1 = vandq_u32(p1, q1);
+  p2 = vandq_u32(p2, q2);
+  p3 = vandq_u32(p3, q3);
+  p4 = vandq_u32(p4, q4);
+  p5 = vandq_u32(p5, q5);
+  p6 = vandq_u32(p6, q6);
+  p7 = vandq_u32(p7, q7);
+  // Multiply by s_vec, then rounding shift right by SGRPROJ_MTABLE_BITS.
+  p0 = vmulq_u32(p0, s_vec);
+  p1 = vmulq_u32(p1, s_vec);
+  p2 = vmulq_u32(p2, s_vec);
+  p3 = vmulq_u32(p3, s_vec);
+  p4 = vmulq_u32(p4, s_vec);
+  p5 = vmulq_u32(p5, s_vec);
+  p6 = vmulq_u32(p6, s_vec);
+  p7 = vmulq_u32(p7, s_vec);
+
+  p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+  p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+  p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+  p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+  p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS);
+  p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS);
+  p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS);
+  p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS);
+  // Clamp each lane to const_val so it can be used as a table index below.
+  p0 = vminq_u32(p0, const_val);
+  p1 = vminq_u32(p1, const_val);
+  p2 = vminq_u32(p2, const_val);
+  p3 = vminq_u32(p3, const_val);
+  p4 = vminq_u32(p4, const_val);
+  p5 = vminq_u32(p5, const_val);
+  p6 = vminq_u32(p6, const_val);
+  p7 = vminq_u32(p7, const_val);
+  // The table lookup is done per element in scalar code, so spill the indices to src1 first (this overwrites the src1 tile).
+  {
+    store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+    store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7);
+
+    for (int x = 0; x < 4; x++) {  // x indexes rows of the tile, y indexes columns
+      for (int y = 0; y < 8; y++) {
+        dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+      }
+    }
+    load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7);  // s16_4..7 are reused for the looked-up values
+  }
+  // s16_4..7 <- sgrproj_sgr - table value.
+  s16_4 = vsubq_u16(sgrproj_sgr, s16_4);
+  s16_5 = vsubq_u16(sgrproj_sgr, s16_5);
+  s16_6 = vsubq_u16(sgrproj_sgr, s16_6);
+  s16_7 = vsubq_u16(sgrproj_sgr, s16_7);
+  // s <- s16_0..3 * one_by_n_minus_1_vec, widened to 32 bits (low halves into s0..s3, high halves into s4..s7).
+  s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec);
+  s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec);
+  s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec);
+  s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec);
+  s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec);
+  s5 = vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec);
+  s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec);
+  s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec);
+  // Multiply by (sgrproj_sgr - table value).
+  s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4)));
+  s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5)));
+  s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6)));
+  s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7)));
+  s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4)));
+  s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5)));
+  s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6)));
+  s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7)));
+  // Rounding shift right by SGRPROJ_RECIP_BITS.
+  p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS);
+  p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS);
+  p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS);
+  p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS);
+  p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+  p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+  p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+  p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+  // Write the result tile to dst2: columns 0-3 first, then columns 4-7.
+  store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0),
+                vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+                vreinterpretq_s32_u32(p3));
+  store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4),
+                vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6),
+                vreinterpretq_s32_u32(p7));
+}
+
+static INLINE void boxsum2_square_sum_calc(  // Squares t1..t11 and returns four overlapping 5-term sums of the squares in *r0..*r3.
+    int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5,
+    int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10,
+    int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) {
+  int32x4_t d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
+  int32x4_t r12, r34, r67, r89, r1011;
+  int32x4_t r345, r6789, r789;
+  // dK <- tK * tK, widened to 32 bits.
+  d1 = vmull_s16(t1, t1);
+  d2 = vmull_s16(t2, t2);
+  d3 = vmull_s16(t3, t3);
+  d4 = vmull_s16(t4, t4);
+  d5 = vmull_s16(t5, t5);
+  d6 = vmull_s16(t6, t6);
+  d7 = vmull_s16(t7, t7);
+  d8 = vmull_s16(t8, t8);
+  d9 = vmull_s16(t9, t9);
+  d10 = vmull_s16(t10, t10);
+  d11 = vmull_s16(t11, t11);
+  // Partial sums are shared between the four outputs; the digits in each name list the terms it contains.
+  r12 = vaddq_s32(d1, d2);
+  r34 = vaddq_s32(d3, d4);
+  r67 = vaddq_s32(d6, d7);
+  r89 = vaddq_s32(d8, d9);
+  r1011 = vaddq_s32(d10, d11);
+  r345 = vaddq_s32(r34, d5);
+  r6789 = vaddq_s32(r67, r89);
+  r789 = vsubq_s32(r6789, d6);  // d7 + d8 + d9, obtained by removing d6 from r6789
+  *r0 = vaddq_s32(r12, r345);  // d1 + d2 + d3 + d4 + d5
+  *r1 = vaddq_s32(r67, r345);  // d3 + d4 + d5 + d6 + d7
+  *r2 = vaddq_s32(d5, r6789);  // d5 + d6 + d7 + d8 + d9
+  *r3 = vaddq_s32(r789, r1011);  // d7 + d8 + d9 + d10 + d11
+}
+
+static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,  // 5-tap box sums of src (into dst32, via dst16) and of src squared (into dst2), in a vertical then a horizontal pass.
+                           int32_t *dst32, int32_t *dst2, const int dst_stride,
+                           const int width, const int height) {
+  assert(width > 2 * SGRPROJ_BORDER_HORZ);
+  assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+  int16_t *dst1_16_ptr, *src_ptr;
+  int32_t *dst2_ptr;
+  int h, w, count = 0;
+  const int dst_stride_2 = (dst_stride << 1);  // outputs are written on rows 2 * dst_stride apart
+  const int dst_stride_8 = (dst_stride << 3);
+
+  dst1_16_ptr = dst16;
+  dst2_ptr = dst2;
+  src_ptr = src;
+  w = width;
+  {  // Pass 1: vertical 5-row sums, processed in strips of 8 columns.
+    int16x8_t t1, t2, t3, t4, t5, t6, t7;
+    int16x8_t t8, t9, t10, t11, t12;
+
+    int16x8_t q12345, q56789, q34567, q7891011;
+    int16x8_t q12, q34, q67, q89, q1011;
+    int16x8_t q345, q6789, q789;
+
+    int32x4_t r12345, r56789, r34567, r7891011;
+
+    do {
+      h = height;
+      dst1_16_ptr = dst16 + (count << 3);  // count selects the 8-column strip
+      dst2_ptr = dst2 + (count << 3);
+      src_ptr = src + (count << 3);
+
+      dst1_16_ptr += dst_stride_2;  // the first output row sits two rows below the first input row
+      dst2_ptr += dst_stride_2;
+      do {
+        load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
+        src_ptr += 4 * src_stride;
+        load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8);
+        src_ptr += 4 * src_stride;
+        load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12);  // src_ptr is not advanced here, so these rows are re-read as t1..t4 next iteration; t12 is loaded but unused
+
+        q12 = vaddq_s16(t1, t2);
+        q34 = vaddq_s16(t3, t4);
+        q67 = vaddq_s16(t6, t7);
+        q89 = vaddq_s16(t8, t9);
+        q1011 = vaddq_s16(t10, t11);
+        q345 = vaddq_s16(q34, t5);
+        q6789 = vaddq_s16(q67, q89);
+        q789 = vaddq_s16(q89, t7);
+        q12345 = vaddq_s16(q12, q345);
+        q34567 = vaddq_s16(q67, q345);
+        q56789 = vaddq_s16(t5, q6789);
+        q7891011 = vaddq_s16(q789, q1011);
+
+        store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789,
+                      q7891011);
+        dst1_16_ptr += dst_stride_8;
+
+        boxsum2_square_sum_calc(  // same four 5-row windows, applied to the squares of columns 0-3
+            vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3),
+            vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6),
+            vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9),
+            vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789,
+            &r7891011);
+
+        store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011);
+
+        boxsum2_square_sum_calc(  // and of columns 4-7
+            vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3),
+            vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6),
+            vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9),
+            vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789,
+            &r7891011);
+
+        store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789,
+                      r7891011);
+        dst2_ptr += (dst_stride_8);
+        h -= 8;
+      } while (h > 0);
+      w -= 8;
+      count++;
+    } while (w > 0);
+  }
+
+  {  // Pass 2: horizontal 5-column sums over the pass-1 output, using 4x4 transposes so that columns become vectors.
+    int16x4_t s1, s2, s3, s4, s5, s6, s7, s8;
+    int32x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+    int32x4_t q12345, q34567, q23456, q45678;
+    int32x4_t q23, q45, q67;
+    int32x4_t q2345, q4567;
+
+    int32x4_t r12345, r34567, r23456, r45678;
+    int32x4_t r23, r45, r67;
+    int32x4_t r2345, r4567;
+
+    int32_t *src2_ptr, *dst1_32_ptr;
+    int16_t *src1_ptr;
+    count = 0;
+    h = height;
+    do {
+      dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2);
+      dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
+      src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2);
+      src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);  // dst2 is both read (src2_ptr) and written (dst2_ptr) in this pass
+      w = width;
+
+      dst1_32_ptr += 2;  // outputs start two columns to the right of the first input column
+      dst2_ptr += 2;
+      load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4);
+      transpose_s16_4x4d(&s1, &s2, &s3, &s4);
+      load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4);
+      transpose_s32_4x4(&d1, &d2, &d3, &d4);
+      do {
+        src1_ptr += 4;
+        src2_ptr += 4;
+        load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8);
+        transpose_s16_4x4d(&s5, &s6, &s7, &s8);
+        load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8);
+        transpose_s32_4x4(&d5, &d6, &d7, &d8);
+        q23 = vaddl_s16(s2, s3);  // widen to 32 bits while adding
+        q45 = vaddl_s16(s4, s5);
+        q67 = vaddl_s16(s6, s7);
+        q2345 = vaddq_s32(q23, q45);
+        q4567 = vaddq_s32(q45, q67);
+        q12345 = vaddq_s32(vmovl_s16(s1), q2345);
+        q23456 = vaddq_s32(q2345, vmovl_s16(s6));
+        q34567 = vaddq_s32(q4567, vmovl_s16(s3));
+        q45678 = vaddq_s32(q4567, vmovl_s16(s8));
+
+        transpose_s32_4x4(&q12345, &q23456, &q34567, &q45678);  // back to row order before storing
+        store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567,
+                      q45678);
+        dst1_32_ptr += 4;
+        s1 = s5;  // slide the window: the last four columns become the first four
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+
+        r23 = vaddq_s32(d2, d3);
+        r45 = vaddq_s32(d4, d5);
+        r67 = vaddq_s32(d6, d7);
+        r2345 = vaddq_s32(r23, r45);
+        r4567 = vaddq_s32(r45, r67);
+        r12345 = vaddq_s32(d1, r2345);
+        r23456 = vaddq_s32(r2345, d6);
+        r34567 = vaddq_s32(r4567, d3);
+        r45678 = vaddq_s32(r4567, d8);
+
+        transpose_s32_4x4(&r12345, &r23456, &r34567, &r45678);
+        store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678);
+        dst2_ptr += 4;
+        d1 = d5;
+        d2 = d6;
+        d3 = d7;
+        d4 = d8;
+        w -= 4;
+      } while (w > 0);
+      h -= 8;
+      count++;
+    } while (h > 0);
+  }
+}
+
+static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,  // Runs calc_ab_internal_common over 4-row x 8-column tiles of A / B16, writing A, A16 and B; inputs are used without bit-depth scaling.
+                                        uint16_t *B16, int32_t *B,
+                                        const int buf_stride, const int width,
+                                        const int height, const int r,
+                                        const int s, const int ht_inc) {
+  int32_t *src1, *dst2, count = 0;
+  uint16_t *dst_A16, *src2;
+  const uint32_t n = (2 * r + 1) * (2 * r + 1);  // (2r + 1)^2
+  const uint32x4_t const_n_val = vdupq_n_u32(n);
+  const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
+  const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+  const uint32x4_t const_val = vdupq_n_u32(255);  // upper clamp applied inside calc_ab_internal_common
+
+  uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7;
+
+  uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  const uint32x4_t s_vec = vdupq_n_u32(s);
+  int w, h = height;
+
+  do {
+    dst_A16 = A16 + (count << 2) * buf_stride;  // each outer iteration starts 4 * buf_stride elements further on
+    src1 = A + (count << 2) * buf_stride;
+    src2 = B16 + (count << 2) * buf_stride;
+    dst2 = B + (count << 2) * buf_stride;
+    w = width;
+    do {
+      load_u32_4x4((uint32_t *)src1, buf_stride, &s0, &s1, &s2, &s3);  // columns 0-3 of A
+      load_u32_4x4((uint32_t *)src1 + 4, buf_stride, &s4, &s5, &s6, &s7);  // columns 4-7 of A
+      load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3);
+      // Both B16 arguments of the common routine receive the same values here.
+      s16_4 = s16_0;
+      s16_5 = s16_1;
+      s16_6 = s16_2;
+      s16_7 = s16_3;
+
+      calc_ab_internal_common(
+          s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4,
+          s16_5, s16_6, s16_7, const_n_val, s_vec, const_val,
+          one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride);
+
+      w -= 8;
+      dst2 += 8;
+      src1 += 8;
+      src2 += 8;
+      dst_A16 += 8;
+    } while (w > 0);
+    count++;
+    h -= (ht_inc * 4);  // NOTE(review): pointers advance by 4 * buf_stride while h drops by 4 * ht_inc; presumably callers fold ht_inc into buf_stride - confirm
+  } while (h > 0);
+}
+
+static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,  // Like calc_ab_internal_lbd, but first scales A and B16 down according to bit_depth.
+                                        uint16_t *B16, int32_t *B,
+                                        const int buf_stride, const int width,
+                                        const int height, const int bit_depth,
+                                        const int r, const int s,
+                                        const int ht_inc) {
+  int32_t *src1, *dst2, count = 0;
+  uint16_t *dst_A16, *src2;
+  const uint32_t n = (2 * r + 1) * (2 * r + 1);  // (2r + 1)^2
+  const int16x8_t bd_min_2_vec = vdupq_n_s16(-(bit_depth - 8));  // negative count: vrshl then performs a rounding right shift by (bit_depth - 8)
+  const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));  // rounding right shift by 2 * (bit_depth - 8)
+  const uint32x4_t const_n_val = vdupq_n_u32(n);
+  const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
+  const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+  const uint32x4_t const_val = vdupq_n_u32(255);  // upper clamp applied inside calc_ab_internal_common
+
+  int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+  uint16x8_t s16_0, s16_1, s16_2, s16_3;
+  uint16x8_t s16_4, s16_5, s16_6, s16_7;
+  uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  const uint32x4_t s_vec = vdupq_n_u32(s);
+  int w, h = height;
+
+  do {
+    src1 = A + (count << 2) * buf_stride;  // each outer iteration starts 4 * buf_stride elements further on
+    src2 = B16 + (count << 2) * buf_stride;
+    dst2 = B + (count << 2) * buf_stride;
+    dst_A16 = A16 + (count << 2) * buf_stride;
+    w = width;
+    do {
+      load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);  // columns 0-3 of A
+      load_s32_4x4(src1 + 4, buf_stride, &sr4, &sr5, &sr6, &sr7);  // columns 4-7 of A
+      load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3);
+      // Scale the A values down by 2 * (bit_depth - 8) bits with rounding.
+      s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec);
+      s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec);
+      s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec);
+      s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec);
+      s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_1_vec);
+      s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_1_vec);
+      s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_1_vec);
+      s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_1_vec);
+      // s16_4..7 hold B16 scaled down by (bit_depth - 8) bits; s16_0..3 keep the unscaled values.
+      s16_4 = vrshlq_u16(s16_0, bd_min_2_vec);
+      s16_5 = vrshlq_u16(s16_1, bd_min_2_vec);
+      s16_6 = vrshlq_u16(s16_2, bd_min_2_vec);
+      s16_7 = vrshlq_u16(s16_3, bd_min_2_vec);
+
+      calc_ab_internal_common(
+          s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4,
+          s16_5, s16_6, s16_7, const_n_val, s_vec, const_val,
+          one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride);
+
+      w -= 8;
+      dst2 += 8;
+      src1 += 8;
+      src2 += 8;
+      dst_A16 += 8;
+    } while (w > 0);
+    count++;
+    h -= (ht_inc * 4);  // NOTE(review): pointers advance by 4 * buf_stride while h drops by 4 * ht_inc; presumably callers fold ht_inc into buf_stride - confirm
+  } while (h > 0);
+}
+
+static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,  // Runs calc_ab_fast_internal_common over 4x4 tiles of A and B, updating A, A16 and B; inputs are used without bit-depth scaling.
+                                             int32_t *B, const int buf_stride,
+                                             const int width, const int height,
+                                             const int r, const int s,
+                                             const int ht_inc) {
+  int32_t *src1, *src2, count = 0;
+  uint16_t *dst_A16;
+  const uint32_t n = (2 * r + 1) * (2 * r + 1);  // (2r + 1)^2
+  const uint32x4_t const_n_val = vdupq_n_u32(n);
+  const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
+  const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+  const uint32x4_t const_val = vdupq_n_u32(255);  // upper clamp applied inside calc_ab_fast_internal_common
+
+  int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+  uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  const uint32x4_t s_vec = vdupq_n_u32(s);
+  int w, h = height;
+
+  do {
+    src1 = A + (count << 2) * buf_stride;  // each outer iteration starts 4 * buf_stride elements further on
+    src2 = B + (count << 2) * buf_stride;
+    dst_A16 = A16 + (count << 2) * buf_stride;
+    w = width;
+    do {
+      load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);  // tile of A
+      load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7);  // tile of B
+      // Reinterpret the loaded bits as unsigned; no value conversion takes place.
+      s0 = vreinterpretq_u32_s32(sr0);
+      s1 = vreinterpretq_u32_s32(sr1);
+      s2 = vreinterpretq_u32_s32(sr2);
+      s3 = vreinterpretq_u32_s32(sr3);
+      s4 = vreinterpretq_u32_s32(sr4);
+      s5 = vreinterpretq_u32_s32(sr5);
+      s6 = vreinterpretq_u32_s32(sr6);
+      s7 = vreinterpretq_u32_s32(sr7);
+
+      calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5,
+                                   sr6, sr7, const_n_val, s_vec, const_val,
+                                   one_by_n_minus_1_vec, sgrproj_sgr, src1,
+                                   dst_A16, src2, buf_stride);
+
+      w -= 4;
+      src1 += 4;
+      src2 += 4;
+      dst_A16 += 4;
+    } while (w > 0);
+    count++;
+    h -= (ht_inc * 4);  // NOTE(review): pointers advance by 4 * buf_stride while h drops by 4 * ht_inc; presumably callers fold ht_inc into buf_stride - confirm
+  } while (h > 0);
+}
+
+static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,  // Like calc_ab_fast_internal_lbd, but first scales the A and B tiles down according to bit_depth.
+                                             int32_t *B, const int buf_stride,
+                                             const int width, const int height,
+                                             const int bit_depth, const int r,
+                                             const int s, const int ht_inc) {
+  int32_t *src1, *src2, count = 0;
+  uint16_t *dst_A16;
+  const uint32_t n = (2 * r + 1) * (2 * r + 1);  // (2r + 1)^2
+  const int32x4_t bd_min_2_vec = vdupq_n_s32(-(bit_depth - 8));  // negative count: vrshl then performs a rounding right shift by (bit_depth - 8)
+  const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));  // rounding right shift by 2 * (bit_depth - 8)
+  const uint32x4_t const_n_val = vdupq_n_u32(n);
+  const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
+  const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+  const uint32x4_t const_val = vdupq_n_u32(255);  // upper clamp applied inside calc_ab_fast_internal_common
+
+  int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+  uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  const uint32x4_t s_vec = vdupq_n_u32(s);
+  int w, h = height;
+
+  do {
+    src1 = A + (count << 2) * buf_stride;  // each outer iteration starts 4 * buf_stride elements further on
+    src2 = B + (count << 2) * buf_stride;
+    dst_A16 = A16 + (count << 2) * buf_stride;
+    w = width;
+    do {
+      load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);  // tile of A
+      load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7);  // tile of B
+      // A is scaled by 2 * (bit_depth - 8) bits and B by (bit_depth - 8) bits; sr4..sr7 are passed on unscaled as well.
+      s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec);
+      s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec);
+      s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec);
+      s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec);
+      s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_2_vec);
+      s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_2_vec);
+      s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_2_vec);
+      s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_2_vec);
+
+      calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5,
+                                   sr6, sr7, const_n_val, s_vec, const_val,
+                                   one_by_n_minus_1_vec, sgrproj_sgr, src1,
+                                   dst_A16, src2, buf_stride);
+
+      w -= 4;
+      src1 += 4;
+      src2 += 4;
+      dst_A16 += 4;
+    } while (w > 0);
+    count++;
+    h -= (ht_inc * 4);  // NOTE(review): pointers advance by 4 * buf_stride while h drops by 4 * ht_inc; presumably callers fold ht_inc into buf_stride - confirm
+  } while (h > 0);
+}
+
+static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,  // 3-tap box sums of src (into dst1) and of src squared (into dst2), in a vertical then a horizontal pass.
+                           int32_t *dst2, const int dst_stride, const int width,
+                           const int height) {
+  assert(width > 2 * SGRPROJ_BORDER_HORZ);
+  assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+  int16_t *src_ptr;
+  int32_t *dst2_ptr;
+  uint16_t *dst1_ptr;
+  int h, w, count = 0;
+
+  w = width;
+  {  // Pass 1: vertical 3-row sums, processed in strips of 8 columns.
+    int16x8_t s1, s2, s3, s4, s5, s6, s7, s8;
+    int16x8_t q23, q34, q56, q234, q345, q456, q567;
+    int32x4_t r23, r56, r345, r456, r567, r78, r678;
+    int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high;
+    int32x4_t r2, r3, r5, r6, r7, r8;
+    int16x8_t q678, q78;
+
+    do {
+      dst1_ptr = dst1 + (count << 3);  // count selects the 8-column strip
+      dst2_ptr = dst2 + (count << 3);
+      src_ptr = src + (count << 3);
+      h = height;
+
+      load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);  // prologue: the first four rows seed the running sums (s1 is loaded but not used)
+      src_ptr += 4 * src_stride;
+
+      q23 = vaddq_s16(s2, s3);
+      q234 = vaddq_s16(q23, s4);
+      q34 = vaddq_s16(s3, s4);
+      dst1_ptr += (dst_stride << 1);  // the first output row sits two rows below the first input row
+
+      r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2));
+      r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3));
+      r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4));
+      r23 = vaddq_s32(r2, r3);
+      r234_low = vaddq_s32(r23, r4_low);
+      r34_low = vaddq_s32(r3, r4_low);
+
+      r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2));
+      r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3));
+      r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4));
+      r23 = vaddq_s32(r2, r3);
+      r234_high = vaddq_s32(r23, r4_high);
+      r34_high = vaddq_s32(r3, r4_high);
+
+      dst2_ptr += (dst_stride << 1);
+
+      do {
+        load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+        src_ptr += 4 * src_stride;
+
+        q345 = vaddq_s16(s5, q34);
+        q56 = vaddq_s16(s5, s6);
+        q456 = vaddq_s16(s4, q56);
+        q567 = vaddq_s16(s7, q56);
+        q78 = vaddq_s16(s7, s8);
+        q678 = vaddq_s16(s6, q78);
+
+        store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+        dst1_ptr += (dst_stride << 2);
+
+        s4 = s8;  // carry the trailing row and partial sums into the next iteration
+        q34 = q78;
+        q234 = q678;
+
+        r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5));  // same sums for the squares, columns 0-3
+        r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6));
+        r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7));
+        r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8));
+
+        r345 = vaddq_s32(r5, r34_low);
+        r56 = vaddq_s32(r5, r6);
+        r456 = vaddq_s32(r4_low, r56);
+        r567 = vaddq_s32(r7, r56);
+        r78 = vaddq_s32(r7, r8);
+        r678 = vaddq_s32(r6, r78);
+        store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567);
+
+        r4_low = r8;
+        r34_low = r78;
+        r234_low = r678;
+
+        r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5));  // and columns 4-7
+        r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6));
+        r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7));
+        r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8));
+
+        r345 = vaddq_s32(r5, r34_high);
+        r56 = vaddq_s32(r5, r6);
+        r456 = vaddq_s32(r4_high, r56);
+        r567 = vaddq_s32(r7, r56);
+        r78 = vaddq_s32(r7, r8);
+        r678 = vaddq_s32(r6, r78);
+        store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567);
+        dst2_ptr += (dst_stride << 2);
+
+        r4_high = r8;
+        r34_high = r78;
+        r234_high = r678;
+
+        h -= 4;
+      } while (h > 0);
+      w -= 8;
+      count++;
+    } while (w > 0);
+  }
+
+  {  // Pass 2: horizontal 3-column sums over the pass-1 output, using 4x4 transposes so that columns become vectors.
+    int16x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+    int16x4_t q23, q34, q56, q234, q345, q456, q567;
+    int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678;
+    int32x4_t r1, r2, r3, r4, r5, r6, r7, r8;
+    int16x4_t q678, q78;
+
+    int32_t *src2_ptr;
+    uint16_t *src1_ptr;
+    count = 0;
+    h = height;
+    w = width;
+    do {
+      dst1_ptr = dst1 + (count << 2) * dst_stride;  // count selects the band of four rows
+      dst2_ptr = dst2 + (count << 2) * dst_stride;
+      src1_ptr = dst1 + (count << 2) * dst_stride;  // dst1 and dst2 are both read and written in this pass
+      src2_ptr = dst2 + (count << 2) * dst_stride;
+      w = width;
+
+      load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4);  // prologue: the first four columns seed the running sums
+      transpose_s16_4x4d(&d1, &d2, &d3, &d4);
+      load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4);
+      transpose_s32_4x4(&r1, &r2, &r3, &r4);
+      src1_ptr += 4;
+      src2_ptr += 4;
+
+      q23 = vadd_s16(d2, d3);
+      q234 = vadd_s16(q23, d4);
+      q34 = vadd_s16(d3, d4);
+      dst1_ptr += 2;  // outputs start two columns to the right of the first input column
+      r23 = vaddq_s32(r2, r3);
+      r234 = vaddq_s32(r23, r4);
+      r34 = vaddq_s32(r3, r4);
+      dst2_ptr += 2;
+
+      do {
+        load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8);
+        transpose_s16_4x4d(&d5, &d6, &d7, &d8);
+        load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8);
+        transpose_s32_4x4(&r5, &r6, &r7, &r8);
+        src1_ptr += 4;
+        src2_ptr += 4;
+
+        q345 = vadd_s16(d5, q34);
+        q56 = vadd_s16(d5, d6);
+        q456 = vadd_s16(d4, q56);
+        q567 = vadd_s16(d7, q56);
+        q78 = vadd_s16(d7, d8);
+        q678 = vadd_s16(d6, q78);
+        transpose_s16_4x4d(&q234, &q345, &q456, &q567);  // back to row order before storing
+        store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+        dst1_ptr += 4;
+
+        d4 = d8;  // carry the trailing column and partial sums into the next iteration
+        q34 = q78;
+        q234 = q678;
+
+        r345 = vaddq_s32(r5, r34);
+        r56 = vaddq_s32(r5, r6);
+        r456 = vaddq_s32(r4, r56);
+        r567 = vaddq_s32(r7, r56);
+        r78 = vaddq_s32(r7, r8);
+        r678 = vaddq_s32(r6, r78);
+        transpose_s32_4x4(&r234, &r345, &r456, &r567);
+        store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567);
+        dst2_ptr += 4;
+
+        r4 = r8;
+        r34 = r78;
+        r234 = r678;
+        w -= 4;
+      } while (w > 0);
+      h -= 4;
+      count++;
+    } while (h > 0);
+  }
+}
+
+static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) {  // Weighted 3x3 neighbourhood sum for four adjacent positions starting at buf: weight 4 for centre and edge neighbours, 3 for corners.
+  int32x4_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl;
+  int32x4_t fours, threes, res;
+  // Load the nine neighbours (t = row above, b = row below, l/r = one element left/right).
+  xtl = vld1q_s32(buf - buf_stride - 1);
+  xt = vld1q_s32(buf - buf_stride);
+  xtr = vld1q_s32(buf - buf_stride + 1);
+  xl = vld1q_s32(buf - 1);
+  x = vld1q_s32(buf);
+  xr = vld1q_s32(buf + 1);
+  xbl = vld1q_s32(buf + buf_stride - 1);
+  xb = vld1q_s32(buf + buf_stride);
+  xbr = vld1q_s32(buf + buf_stride + 1);
+
+  fours = vaddq_s32(xl, vaddq_s32(xt, vaddq_s32(xr, vaddq_s32(xb, x))));
+  threes = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl)));
+  res = vsubq_s32(vshlq_n_s32(vaddq_s32(fours, threes), 2), threes);  // 4 * (fours + threes) - threes == 4 * fours + 3 * threes
+  return res;
+}
+
+static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride,  // 16-bit-input counterpart of cross_sum_inp_s32 for eight adjacent positions; *a0 gets positions 0-3, *a1 positions 4-7.
+                                     int32x4_t *a0, int32x4_t *a1) {
+  uint16x8_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl;
+  uint16x8_t r0, r1;
+  // Load the nine neighbours (t = row above, b = row below, l/r = one element left/right).
+  xtl = vld1q_u16(buf - buf_stride - 1);
+  xt = vld1q_u16(buf - buf_stride);
+  xtr = vld1q_u16(buf - buf_stride + 1);
+  xl = vld1q_u16(buf - 1);
+  x = vld1q_u16(buf);
+  xr = vld1q_u16(buf + 1);
+  xbl = vld1q_u16(buf + buf_stride - 1);
+  xb = vld1q_u16(buf + buf_stride);
+  xbr = vld1q_u16(buf + buf_stride + 1);
+  // Accumulate centre + edge neighbours into xl; all arithmetic up to the final widening stays in 16 bits.
+  xb = vaddq_u16(xb, x);
+  xt = vaddq_u16(xt, xr);
+  xl = vaddq_u16(xl, xb);
+  xl = vaddq_u16(xl, xt);
+
+  r0 = vshlq_n_u16(xl, 2);  // 4 * (centre + edges)
+  // Accumulate the four corners into xtl.
+  xbl = vaddq_u16(xbl, xbr);
+  xtl = vaddq_u16(xtl, xtr);
+  xtl = vaddq_u16(xtl, xbl);
+
+  r1 = vshlq_n_u16(xtl, 2);
+  r1 = vsubq_u16(r1, xtl);  // 4 * corners - corners == 3 * corners
+  // Widen both halves to 32 bits and add the two weighted sums.
+  *a0 = vreinterpretq_s32_u32(
+      vaddq_u32(vmovl_u16(vget_low_u16(r0)), vmovl_u16(vget_low_u16(r1))));
+  *a1 = vreinterpretq_s32_u32(
+      vaddq_u32(vmovl_u16(vget_high_u16(r0)), vmovl_u16(vget_high_u16(r1))));
+}
+
+static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) {
+  int32x4_t xtr, xt, xtl, xbr, xb, xbl;
+  int32x4_t fives, sixes, fives_plus_sixes;
+  // Reads only the rows above and below buf: 6 * (xt + xb) + 5 * corners.
+  xtl = vld1q_s32(buf - buf_stride - 1);
+  xt = vld1q_s32(buf - buf_stride);
+  xtr = vld1q_s32(buf - buf_stride + 1);
+  xbl = vld1q_s32(buf + buf_stride - 1);
+  xb = vld1q_s32(buf + buf_stride);
+  xbr = vld1q_s32(buf + buf_stride + 1);
+
+  fives = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl)));
+  sixes = vaddq_s32(xt, xb);
+  fives_plus_sixes = vaddq_s32(fives, sixes);
+  // 4 * (f + s) + (f + s) + s = 5 * fives + 6 * sixes.
+  return vaddq_s32(
+      vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes);
+}
+
+static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride,
+                                                 int32x4_t *a0, int32x4_t *a1) {
+  uint16x8_t xtr, xt, xtl, xbr, xb, xbl, xb0;
+  // 8-lane u16 form of 5 * corners + 6 * (top + bottom); sums stay in u16.
+  xtl = vld1q_u16(buf - buf_stride - 1);
+  xt = vld1q_u16(buf - buf_stride);
+  xtr = vld1q_u16(buf - buf_stride + 1);
+  xbl = vld1q_u16(buf + buf_stride - 1);
+  xb = vld1q_u16(buf + buf_stride);
+  xbr = vld1q_u16(buf + buf_stride + 1);
+  // xbr becomes 5 * (sum of corners); xtl is reused as scratch for 4 * sum.
+  xbr = vaddq_u16(xbr, xbl);
+  xtr = vaddq_u16(xtr, xtl);
+  xbr = vaddq_u16(xbr, xtr);
+  xtl = vshlq_n_u16(xbr, 2);
+  xbr = vaddq_u16(xtl, xbr);
+  // xb becomes 6 * (top + bottom), formed as 4 * sum + 2 * sum.
+  xb = vaddq_u16(xb, xt);
+  xb0 = vshlq_n_u16(xb, 1);
+  xb = vshlq_n_u16(xb, 2);
+  xb = vaddq_u16(xb, xb0);
+  // Widen xbr + xb to 32 bits: low four lanes to *a0, high four to *a1.
+  *a0 = vreinterpretq_s32_u32(
+      vaddq_u32(vmovl_u16(vget_low_u16(xbr)), vmovl_u16(vget_low_u16(xb))));
+  *a1 = vreinterpretq_s32_u32(
+      vaddq_u32(vmovl_u16(vget_high_u16(xbr)), vmovl_u16(vget_high_u16(xb))));
+}
+
+static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) {
+  int32x4_t xl, x, xr;
+  int32x4_t fives, sixes, fives_plus_sixes;
+  // Reads only the row of buf itself: 6 * x + 5 * (xl + xr) for buf[0..3].
+  xl = vld1q_s32(buf - 1);
+  x = vld1q_s32(buf);
+  xr = vld1q_s32(buf + 1);
+  fives = vaddq_s32(xl, xr);
+  sixes = x;
+  fives_plus_sixes = vaddq_s32(fives, sixes);
+  // 4 * (f + s) + (f + s) + s = 5 * fives + 6 * sixes.
+  return vaddq_s32(
+      vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes);
+}
+
+static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
+                                                int32x4_t *a1) {
+  uint16x8_t xl, x, xr;
+  uint16x8_t x0;
+  // 8-lane u16 form of 5 * (left + right) + 6 * centre; sums stay in u16.
+  xl = vld1q_u16(buf - 1);
+  x = vld1q_u16(buf);
+  xr = vld1q_u16(buf + 1);
+  xl = vaddq_u16(xl, xr);
+  x0 = vshlq_n_u16(xl, 2);
+  xl = vaddq_u16(xl, x0);  // sum + 4 * sum = 5 * (left + right)
+  // x becomes 6 * centre, formed as 4 * x + 2 * x.
+  x0 = vshlq_n_u16(x, 1);
+  x = vshlq_n_u16(x, 2);
+  x = vaddq_u16(x, x0);
+  // Widen xl + x to 32 bits: low four lanes to *a0, high four to *a1.
+  *a0 = vreinterpretq_s32_u32(
+      vaddq_u32(vmovl_u16(vget_low_u16(xl)), vmovl_u16(vget_low_u16(x))));
+  *a1 = vreinterpretq_s32_u32(
+      vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x))));
+}
+
+void final_filter_fast_internal(uint16_t *A, int32_t *B, const int buf_stride,
+                                int16_t *src, const int src_stride,
+                                int32_t *dst, const int dst_stride,
+                                const int width, const int height) {
+  int16x8_t s0;
+  int32_t *B_tmp, *dst_ptr;
+  uint16_t *A_tmp;
+  int16_t *src_ptr;
+  int32x4_t a_res0, a_res1, b_res0, b_res1;
+  int w, h, count = 0;
+  assert(SGRPROJ_SGR_BITS == 8);
+  assert(SGRPROJ_RST_BITS == 4);
+  // dst = (src * weighted_sum(A) + weighted_sum(B)), rounding-shifted right.
+  A_tmp = A;  // these four pointers are recomputed at the top of every row
+  B_tmp = B;
+  src_ptr = src;
+  dst_ptr = dst;
+  h = height;
+  do {
+    A_tmp = (A + count * buf_stride);
+    B_tmp = (B + count * buf_stride);
+    src_ptr = (src + count * src_stride);
+    dst_ptr = (dst + count * dst_stride);
+    w = width;
+    if (!(count & 1)) {  // even row index: sums read the rows above and below
+      do {
+        s0 = vld1q_s16(src_ptr);
+        cross_sum_fast_even_row_inp16(A_tmp, buf_stride, &a_res0, &a_res1);
+        a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0);
+        a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1);
+        // Add the weighted sum of B, computed as two groups of four lanes.
+        b_res0 = cross_sum_fast_even_row(B_tmp, buf_stride);
+        b_res1 = cross_sum_fast_even_row(B_tmp + 4, buf_stride);
+        a_res0 = vaddq_s32(a_res0, b_res0);
+        a_res1 = vaddq_s32(a_res1, b_res1);
+        // Rounding right shift (vrshrq) of both halves; even rows use NB_EVEN.
+        a_res0 =
+            vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+        a_res1 =
+            vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+
+        vst1q_s32(dst_ptr, a_res0);
+        vst1q_s32(dst_ptr + 4, a_res1);
+
+        A_tmp += 8;
+        B_tmp += 8;
+        src_ptr += 8;
+        dst_ptr += 8;
+        w -= 8;
+      } while (w > 0);  // 8 pixels per pass, so a row is rounded up to 8
+    } else {  // odd row index: sums read the current row only
+      do {
+        s0 = vld1q_s16(src_ptr);
+        cross_sum_fast_odd_row_inp16(A_tmp, &a_res0, &a_res1);
+        a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0);
+        a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1);
+        // Add the weighted sum of B, computed as two groups of four lanes.
+        b_res0 = cross_sum_fast_odd_row(B_tmp);
+        b_res1 = cross_sum_fast_odd_row(B_tmp + 4);
+        a_res0 = vaddq_s32(a_res0, b_res0);
+        a_res1 = vaddq_s32(a_res1, b_res1);
+        // Rounding right shift (vrshrq) of both halves; odd rows use NB_ODD.
+        a_res0 =
+            vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS);
+        a_res1 =
+            vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS);
+
+        vst1q_s32(dst_ptr, a_res0);
+        vst1q_s32(dst_ptr + 4, a_res1);
+
+        A_tmp += 8;
+        B_tmp += 8;
+        src_ptr += 8;
+        dst_ptr += 8;
+        w -= 8;
+      } while (w > 0);  // 8 pixels per pass, so a row is rounded up to 8
+    }
+    count++;
+    h -= 1;
+  } while (h > 0);
+}
+
+void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
+                           int16_t *src, const int src_stride, int32_t *dst,
+                           const int dst_stride, const int width,
+                           const int height) {
+  int16x8_t s0;
+  int32_t *B_tmp, *dst_ptr;
+  uint16_t *A_tmp;
+  int16_t *src_ptr;
+  int32x4_t a_res0, a_res1, b_res0, b_res1;
+  int w, h, count = 0;
+  // dst = (src * weighted_sum(A) + weighted_sum(B)), rounding-shifted right.
+  assert(SGRPROJ_SGR_BITS == 8);
+  assert(SGRPROJ_RST_BITS == 4);
+  h = height;
+  // Every row uses the same full 3x3 (4/3 weighted) sums.
+  do {
+    A_tmp = (A + count * buf_stride);
+    B_tmp = (B + count * buf_stride);
+    src_ptr = (src + count * src_stride);
+    dst_ptr = (dst + count * dst_stride);
+    w = width;
+    do {
+      s0 = vld1q_s16(src_ptr);
+      cross_sum_inp_u16(A_tmp, buf_stride, &a_res0, &a_res1);
+      a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0);
+      a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1);
+      // Add the weighted sum of B, computed as two groups of four lanes.
+      b_res0 = cross_sum_inp_s32(B_tmp, buf_stride);
+      b_res1 = cross_sum_inp_s32(B_tmp + 4, buf_stride);
+      a_res0 = vaddq_s32(a_res0, b_res0);
+      a_res1 = vaddq_s32(a_res1, b_res1);
+      // Rounding right shift (vrshrq) of both halves by the same amount.
+      a_res0 =
+          vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+      a_res1 =
+          vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+      vst1q_s32(dst_ptr, a_res0);
+      vst1q_s32(dst_ptr + 4, a_res1);
+
+      A_tmp += 8;
+      B_tmp += 8;
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w > 0);  // 8 pixels per pass, so a row is rounded up to 8
+    count++;
+    h -= 1;
+  } while (h > 0);
+}
+
+static INLINE void restoration_fast_internal(uint16_t *dgd16, int width,
+                                             int height, int dgd_stride,
+                                             int32_t *dst, int dst_stride,
+                                             int bit_depth, int sgr_params_idx,
+                                             int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+  // r == 2 path: box sums -> a/b coefficients -> final filter into dst.
+  const int buf_stride = ((width_ext + 3) & ~3) + 16;  // round up to 4, +16
+  int32_t A_[RESTORATION_PROC_UNIT_PELS];
+  uint16_t A16_[RESTORATION_PROC_UNIT_PELS];
+  int32_t B_[RESTORATION_PROC_UNIT_PELS];
+  int32_t *square_sum_buf = A_;
+  int32_t *sum_buf = B_;
+  uint16_t *tmp16_buf = A16_;
+
+  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+         "Need SGRPROJ_BORDER_* >= r+1");
+  // This path is only valid for radius index 0, whose radius must be 2.
+  assert(radius_idx == 0);
+  assert(r == 2);
+
+  // input(dgd16) is 16bit.
+  // sum of pixels 1st stage output will be in 16bit(tmp16_buf). End output is
+  // kept in 32bit [sum_buf]. sum of squares output is kept in 32bit
+  // buffer(square_sum_buf).
+  boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+                      SGRPROJ_BORDER_HORZ),
+          dgd_stride, (int16_t *)tmp16_buf, sum_buf, square_sum_buf, buf_stride,
+          width_ext, height_ext);
+  // Step past the border so each buffer points at the block's first pixel.
+  square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  tmp16_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+  // Calculation of a, b. a output is in 16bit tmp16_buf which is in range of
+  // [1, 256] for all bit depths. b output is kept in 32bit buffer.
+  // NOTE(review): buf_stride * 2 suggests alternate rows only - confirm.
+  if (8 == bit_depth) {
+    calc_ab_fast_internal_lbd(
+        (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
+        (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r,
+        params->s[radius_idx], 2);
+  } else {
+    calc_ab_fast_internal_hbd(
+        (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
+        (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2,
+        bit_depth, r, params->s[radius_idx], 2);
+  }
+  final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16,
+                             dgd_stride, dst, dst_stride, width, height);
+}
+
+static INLINE void restoration_internal(uint16_t *dgd16, int width, int height,
+                                        int dgd_stride, int32_t *dst,
+                                        int dst_stride, int bit_depth,
+                                        int sgr_params_idx, int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+  // r == 1 path: box sums -> a/b coefficients -> final filter into dst.
+  int buf_stride = ((width_ext + 3) & ~3) + 16;  // round up to 4, +16
+  int32_t A_[RESTORATION_PROC_UNIT_PELS];
+  uint16_t A16_[RESTORATION_PROC_UNIT_PELS];
+  uint16_t B16_[RESTORATION_PROC_UNIT_PELS];
+  int32_t B_[RESTORATION_PROC_UNIT_PELS];
+  int32_t *square_sum_buf = A_;
+  uint16_t *sum_buf = B16_;
+  uint16_t *A16 = A16_;
+  int32_t *B = B_;
+
+  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+         "Need SGRPROJ_BORDER_* >= r+1");
+  // This path is only valid for radius index 1, whose radius must be 1.
+  assert(radius_idx == 1);
+  assert(r == 1);
+
+  // input(dgd16) is 16bit.
+  // sum of pixels output will be in 16bit(sum_buf).
+  // sum of squares output is kept in 32bit buffer(square_sum_buf).
+  boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+                      SGRPROJ_BORDER_HORZ),
+          dgd_stride, sum_buf, square_sum_buf, buf_stride, width_ext,
+          height_ext);
+  // Step past the border so each buffer points at the block's first pixel.
+  square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+  // Calculation of a, b. a output is in 16bit A16 which is in range of
+  // [1, 256] for all bit depths. b output is kept in 32bit buffer.
+  if (8 == bit_depth) {
+    calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
+                         (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
+                         (B - buf_stride - 1), buf_stride, width + 2,
+                         height + 2, r, params->s[radius_idx], 1);
+  } else {
+    calc_ab_internal_hbd((square_sum_buf - buf_stride - 1),
+                         (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
+                         (B - buf_stride - 1), buf_stride, width + 2,
+                         height + 2, bit_depth, r, params->s[radius_idx], 1);
+  }
+  final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst,
+                        dst_stride, width, height);
+}
+
+static INLINE void src_convert_u8_to_u16(const uint8_t *src,
+                                         const int src_stride, uint16_t *dst,
+                                         const int dst_stride, const int width,
+                                         const int height) {
+  const uint8_t *src_ptr;
+  uint16_t *dst_ptr;
+  int h, w, count = 0;
+  // Widens a u8 image to u16: groups of four rows, eight columns per pass.
+  uint8x8_t t1, t2, t3, t4;
+  uint16x8_t s1, s2, s3, s4;
+  h = height;
+  do {  // do/while: the first group of four rows is always processed
+    src_ptr = src + (count << 2) * src_stride;
+    dst_ptr = dst + (count << 2) * dst_stride;
+    w = width;
+    if (w >= 7) {  // NOTE(review): width == 7 also takes the 8-wide path
+      do {
+        load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
+        s1 = vmovl_u8(t1);
+        s2 = vmovl_u8(t2);
+        s3 = vmovl_u8(t3);
+        s4 = vmovl_u8(t4);
+        store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
+
+        src_ptr += 8;
+        dst_ptr += 8;
+        w -= 8;
+      } while (w > 7);
+    }
+    // Scalar tail: any remaining columns (w may be <= 0) of these four rows.
+    for (int y = 0; y < w; y++) {
+      dst_ptr[y] = src_ptr[y];
+      dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
+      dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
+      dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
+    }
+    count++;
+    h -= 4;
+  } while (h > 3);
+  // Scalar copy of whatever rows are left over (h <= 3 at this point).
+  src_ptr = src + (count << 2) * src_stride;
+  dst_ptr = dst + (count << 2) * dst_stride;
+  for (int x = 0; x < h; x++) {
+    for (int y = 0; y < width; y++) {
+      dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride];
+    }
+  }
+}
+
+static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
+                                        uint16_t *dst, const int dst_stride,
+                                        int width, int height) {
+  const uint16_t *src_ptr;
+  uint16_t *dst_ptr;
+  int h, w, count = 0;
+  uint16x8_t s1, s2, s3, s4;
+  // Copies a u16 image: groups of four rows, eight columns per NEON pass.
+  h = height;
+  do {  // do/while: the first group of four rows is always processed
+    src_ptr = src + (count << 2) * src_stride;
+    dst_ptr = dst + (count << 2) * dst_stride;
+    w = width;
+    do {  // NOTE(review): no width guard here, the first 8 columns always copy
+      load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
+      store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w > 7);
+    // Scalar tail: any remaining columns (w may be <= 0) of these four rows.
+    for (int y = 0; y < w; y++) {
+      dst_ptr[y] = src_ptr[y];
+      dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
+      dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
+      dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
+    }
+    count++;
+    h -= 4;
+  } while (h > 3);
+  // Copy whatever rows are left over (h <= 3 at this point), one per memcpy.
+  src_ptr = src + (count << 2) * src_stride;
+  dst_ptr = dst + (count << 2) * dst_stride;
+
+  for (int x = 0; x < h; x++) {
+    memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride),
+           sizeof(uint16_t) * width);
+  }
+}
+
+void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
+                                     int stride, int32_t *flt0, int32_t *flt1,
+                                     int flt_stride, int sgr_params_idx,
+                                     int bit_depth, int highbd) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  assert(!(params->r[0] == 0 && params->r[1] == 0));
+  // Copy block + border into a local u16 buffer, then run the enabled filters.
+  uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
+  const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+  uint16_t *dgd16 =
+      dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+  const int dgd_stride = stride;
+  // highbd: dat8 is reinterpreted as uint16_t data via CONVERT_TO_SHORTPTR.
+  if (highbd) {
+    const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
+    src_convert_hbd_copy(
+        dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+        dgd_stride,
+        dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+        dgd16_stride, width_ext, height_ext);
+  } else {
+    src_convert_u8_to_u16(
+        dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+        dgd_stride,
+        dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+        dgd16_stride, width_ext, height_ext);
+  }
+  // flt0 is written only if r[0] > 0, flt1 only if r[1] > 0.
+  if (params->r[0] > 0)
+    restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0,
+                              flt_stride, bit_depth, sgr_params_idx, 0);
+  if (params->r[1] > 0)
+    restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride,
+                         bit_depth, sgr_params_idx, 1);
+}
+
+void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
+                                       int height, int stride, int eps,
+                                       const int *xqd, uint8_t *dst8,
+                                       int dst_stride, int32_t *tmpbuf,
+                                       int bit_depth, int highbd) {
+  int32_t *flt0 = tmpbuf;
+  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+  assert(width * height <= RESTORATION_UNITPELS_MAX);
+  uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
+  const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+  uint16_t *dgd16 =
+      dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+  const int dgd_stride = stride;
+  const sgr_params_type *const params = &sgr_params[eps];
+  int xq[2];
+  // Filter into flt0/flt1 (inside tmpbuf), then blend with the source to dst8.
+  assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+  if (highbd) {
+    const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
+    src_convert_hbd_copy(
+        dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+        dgd_stride,
+        dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+        dgd16_stride, width_ext, height_ext);
+  } else {
+    src_convert_u8_to_u16(
+        dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+        dgd_stride,
+        dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+        dgd16_stride, width_ext, height_ext);
+  }
+  // The filter outputs are stored with a row stride of `width`.
+  if (params->r[0] > 0)
+    restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
+                              bit_depth, eps, 0);
+  if (params->r[1] > 0)
+    restoration_internal(dgd16, width, height, dgd16_stride, flt1, width,
+                         bit_depth, eps, 1);
+
+  decode_xq(xqd, xq, params);
+
+  {
+    int16_t *src_ptr;
+    uint8_t *dst_ptr;
+    uint16_t *dst16_ptr;
+    int16x4_t d0, d4;
+    int16x8_t r0, s0;
+    uint16x8_t r4;
+    int32x4_t u0, u4, v0, v4, f00, f10;
+    uint8x8_t t0;
+    int count = 0, w = width, h = height, rc = 0;
+
+    const int32x4_t xq0_vec = vdupq_n_s32(xq[0]);
+    const int32x4_t xq1_vec = vdupq_n_s32(xq[1]);
+    const int16x8_t zero = vdupq_n_s16(0);
+    const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1);
+    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
+    dst_ptr = dst8;  // recomputed from rc at the top of every row
+    src_ptr = (int16_t *)dgd16;
+    do {
+      w = width;
+      count = 0;
+      dst_ptr = dst8 + rc * dst_stride;
+      dst16_ptr = dst16 + rc * dst_stride;
+      do {
+        s0 = vld1q_s16(src_ptr + count);
+        // u = src << SGRPROJ_RST_BITS; v starts as u << SGRPROJ_PRJ_BITS.
+        u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS);
+        u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS);
+
+        v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS);
+        v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS);
+        // v += xq[0] * (flt0 - u), only when the first filter is enabled.
+        if (params->r[0] > 0) {
+          f00 = vld1q_s32(flt0 + count);
+          f10 = vld1q_s32(flt0 + count + 4);
+
+          f00 = vsubq_s32(f00, u0);
+          f10 = vsubq_s32(f10, u4);
+
+          v0 = vmlaq_s32(v0, xq0_vec, f00);
+          v4 = vmlaq_s32(v4, xq0_vec, f10);
+        }
+        // v += xq[1] * (flt1 - u), only when the second filter is enabled.
+        if (params->r[1] > 0) {
+          f00 = vld1q_s32(flt1 + count);
+          f10 = vld1q_s32(flt1 + count + 4);
+
+          f00 = vsubq_s32(f00, u0);
+          f10 = vsubq_s32(f10, u4);
+
+          v0 = vmlaq_s32(v0, xq1_vec, f00);
+          v4 = vmlaq_s32(v4, xq1_vec, f10);
+        }
+        // Saturating, rounding narrow (vqrshrn) back to 16-bit pixel scale.
+        d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+        d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+        r0 = vcombine_s16(d0, d4);
+        // Clamp negatives to zero so the lanes can be treated as unsigned.
+        r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
+
+        if (highbd) {
+          r4 = vminq_u16(r4, max);  // clamp to (1 << bit_depth) - 1
+          vst1q_u16(dst16_ptr, r4);
+        } else {
+          t0 = vqmovn_u16(r4);  // saturating narrow to u8
+          vst1_u8(dst_ptr, t0);
+        }
+        w -= 8;
+        count += 8;
+        dst_ptr += 8;
+        dst16_ptr += 8;
+      } while (w > 0);  // 8 pixels per pass, so a row is rounded up to 8
+      // Advance one row: source by dgd16_stride, filter outputs by width.
+      src_ptr += dgd16_stride;
+      flt1 += width;
+      flt0 += width;
+      rc++;
+      h--;
+    } while (h > 0);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h
index 53727bb43..fe134087b 100644
--- a/third_party/aom/av1/common/arm/transpose_neon.h
+++ b/third_party/aom/av1/common/arm/transpose_neon.h
@@ -419,4 +419,42 @@ static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
   *a3 = vreinterpret_s16_s32(c1.val[1]);
 }
 
+static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+  int32x4x2_t b0;  // val[0] = low halves of a0|a1, val[1] = high halves
+  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+  return b0;
+}
+
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+                                     int32x4_t *a2, int32x4_t *a3) {
+  // Swap 32 bit elements. Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+
+  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+
+  // Swap 64 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 02 12 22 32
+  // c1.val[0]: 01 11 21 31
+  // c1.val[1]: 03 13 23 33
+
+  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+  // In place: transposed rows 0..3 are c0.val[0], c1.val[0], c0.val[1], c1.val[1].
+  *a0 = c0.val[0];
+  *a1 = c1.val[0];
+  *a2 = c0.val[1];
+  *a3 = c1.val[1];
+}
+
 #endif  // AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
index 738290fad..9d68b8760 100644
--- a/third_party/aom/av1/common/av1_loopfilter.c
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -1308,7 +1308,7 @@ static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf,
   end <<= MI_SIZE_LOG2;
   uint8_t *ref0 = ref_buf;
   uint8_t *dst0 = dst_buf;
-  if (cm->use_highbitdepth) {
+  if (cm->seq_params.use_highbitdepth) {
     const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf);
     const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf);
     for (int j = 0; j < 4; ++j) {
@@ -1404,11 +1404,11 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm,
       uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
       uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
 
-      if (cm->use_highbitdepth)
+      if (cm->seq_params.use_highbitdepth)
         highbd_filter_selectively_vert_row2(
             ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
             mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
-            &cm->lf_info, lfl, lfl2, (int)cm->bit_depth);
+            &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
       else
         filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
                                      mask_16x16_0, mask_8x8_0, mask_4x4_0,
@@ -1474,10 +1474,11 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm,
       mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
       mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
 
-      if (cm->use_highbitdepth)
-        highbd_filter_selectively_horiz(
-            CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
-            mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->bit_depth);
+      if (cm->seq_params.use_highbitdepth)
+        highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                        dst->stride, pl, ssx, mask_16x16,
+                                        mask_8x8, mask_4x4, &cm->lf_info, lfl,
+                                        (int)cm->seq_params.bit_depth);
       else
         filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
                                  mask_8x8, mask_4x4, &cm->lf_info, lfl);
@@ -1652,6 +1653,8 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
   const int dst_stride = plane_ptr->dst.stride;
   const int y_range = (MAX_MIB_SIZE >> scale_vert);
   const int x_range = (MAX_MIB_SIZE >> scale_horz);
+  const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+  const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
   for (int y = 0; y < y_range; y += row_step) {
     uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
     for (int x = 0; x < x_range;) {
@@ -1677,40 +1680,40 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
       switch (params.filter_length) {
         // apply 4-tap filtering
         case 4:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
             aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride,
                                       params.mblim, params.lim, params.hev_thr,
-                                      cm->bit_depth);
+                                      bit_depth);
           else
             aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
                                params.hev_thr);
           break;
         case 6:  // apply 6-tap filter for chroma plane only
           assert(plane != 0);
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
             aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride,
                                       params.mblim, params.lim, params.hev_thr,
-                                      cm->bit_depth);
+                                      bit_depth);
           else
             aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
                                params.hev_thr);
           break;
         // apply 8-tap filtering
         case 8:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
             aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride,
                                       params.mblim, params.lim, params.hev_thr,
-                                      cm->bit_depth);
+                                      bit_depth);
           else
             aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
                                params.hev_thr);
           break;
         // apply 14-tap filtering
         case 14:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
             aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride,
                                        params.mblim, params.lim, params.hev_thr,
-                                       cm->bit_depth);
+                                       bit_depth);
           else
             aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
                                 params.hev_thr);
@@ -1737,6 +1740,8 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
   const int dst_stride = plane_ptr->dst.stride;
   const int y_range = (MAX_MIB_SIZE >> scale_vert);
   const int x_range = (MAX_MIB_SIZE >> scale_horz);
+  const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+  const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
   for (int x = 0; x < x_range; x += col_step) {
     uint8_t *p = dst_ptr + x * MI_SIZE;
     for (int y = 0; y < y_range;) {
@@ -1762,10 +1767,10 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
       switch (params.filter_length) {
         // apply 4-tap filtering
         case 4:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
             aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride,
                                         params.mblim, params.lim,
-                                        params.hev_thr, cm->bit_depth);
+                                        params.hev_thr, bit_depth);
           else
             aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
                                  params.hev_thr);
@@ -1773,30 +1778,30 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
         // apply 6-tap filtering
         case 6:
           assert(plane != 0);
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
             aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride,
                                         params.mblim, params.lim,
-                                        params.hev_thr, cm->bit_depth);
+                                        params.hev_thr, bit_depth);
           else
             aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
                                  params.hev_thr);
           break;
         // apply 8-tap filtering
         case 8:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
             aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride,
                                         params.mblim, params.lim,
-                                        params.hev_thr, cm->bit_depth);
+                                        params.hev_thr, bit_depth);
           else
             aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
                                  params.hev_thr);
           break;
         // apply 14-tap filtering
         case 14:
-          if (cm->use_highbitdepth)
+          if (use_highbitdepth)
             aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride,
                                          params.mblim, params.lim,
-                                         params.hev_thr, cm->bit_depth);
+                                         params.hev_thr, bit_depth);
           else
             aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
                                   params.hev_thr);
diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c
index 38e26bee1..a77a4d254 100644
--- a/third_party/aom/av1/common/av1_rtcd.c
+++ b/third_party/aom/av1/common/av1_rtcd.c
@@ -16,7 +16,7 @@
 #include "aom_ports/aom_once.h"
 
 void av1_rtcd() {
-  // TODO(JBB): Remove this once, by insuring that both the encoder and
-  // decoder setup functions are protected by once();
-  once(setup_rtcd_internal);
+  // TODO(JBB): Remove this aom_once, by ensuring that both the encoder and
+  // decoder setup functions are protected by aom_once();
+  aom_once(setup_rtcd_internal);
 }
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
index 6aa925515..fa8b34981 100755
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -106,7 +106,7 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
 
 #inv txfm
 add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_inv_txfm_add ssse3 avx2/;
+specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
 
 add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
@@ -181,7 +181,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   #fwd txfm
   add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
-  specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1/;
+  specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/;
 
   add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -241,11 +241,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/av1_txb_init_levels sse4_1/;
 
   add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
-  specialize qw/av1_wedge_sse_from_residuals sse2/;
+  specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
   add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
-  specialize qw/av1_wedge_sign_from_residuals sse2/;
+  specialize qw/av1_wedge_sign_from_residuals sse2 avx2/;
   add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
-  specialize qw/av1_wedge_compute_delta_squares sse2/;
+  specialize qw/av1_wedge_compute_delta_squares sse2 avx2/;
 
   # hash
   add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
@@ -288,34 +288,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 # LOOP_RESTORATION functions
 
 add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
-specialize qw/apply_selfguided_restoration sse4_1 avx2/;
+specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/;
 
 add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
                                   int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
                                   int sgr_params_idx, int bit_depth, int highbd";
-specialize qw/av1_selfguided_restoration sse4_1 avx2/;
+specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
 
 # CONVOLVE_ROUND/COMPOUND_ROUND functions
 
-add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-
-  add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
-  add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+
+  add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
+  add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
 
   specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
   specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/;
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
index 5db3233f5..c9cc79852 100644
--- a/third_party/aom/av1/common/av1_txfm.h
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -171,53 +171,6 @@ static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
   get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip);
 }
 
-static INLINE TX_SIZE av1_rotate_tx_size(TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_4X4: return TX_4X4;
-    case TX_8X8: return TX_8X8;
-    case TX_16X16: return TX_16X16;
-    case TX_32X32: return TX_32X32;
-    case TX_64X64: return TX_64X64;
-    case TX_32X64: return TX_64X32;
-    case TX_64X32: return TX_32X64;
-    case TX_4X8: return TX_8X4;
-    case TX_8X4: return TX_4X8;
-    case TX_8X16: return TX_16X8;
-    case TX_16X8: return TX_8X16;
-    case TX_16X32: return TX_32X16;
-    case TX_32X16: return TX_16X32;
-    case TX_4X16: return TX_16X4;
-    case TX_16X4: return TX_4X16;
-    case TX_8X32: return TX_32X8;
-    case TX_32X8: return TX_8X32;
-    case TX_16X64: return TX_64X16;
-    case TX_64X16: return TX_16X64;
-    default: assert(0); return TX_INVALID;
-  }
-}
-
-static INLINE TX_TYPE av1_rotate_tx_type(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT: return DCT_DCT;
-    case ADST_DCT: return DCT_ADST;
-    case DCT_ADST: return ADST_DCT;
-    case ADST_ADST: return ADST_ADST;
-    case FLIPADST_DCT: return DCT_FLIPADST;
-    case DCT_FLIPADST: return FLIPADST_DCT;
-    case FLIPADST_FLIPADST: return FLIPADST_FLIPADST;
-    case ADST_FLIPADST: return FLIPADST_ADST;
-    case FLIPADST_ADST: return ADST_FLIPADST;
-    case IDTX: return IDTX;
-    case V_DCT: return H_DCT;
-    case H_DCT: return V_DCT;
-    case V_ADST: return H_ADST;
-    case H_ADST: return V_ADST;
-    case V_FLIPADST: return H_FLIPADST;
-    case H_FLIPADST: return V_FLIPADST;
-    default: assert(0); return TX_TYPES;
-  }
-}
-
 // Utility function that returns the log of the ratio of the col and row
 // sizes.
 static INLINE int get_rect_tx_log_ratio(int col, int row) {
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
index 3e8d1d6c6..979f13bd9 100644
--- a/third_party/aom/av1/common/blockd.h
+++ b/third_party/aom/av1/common/blockd.h
@@ -605,6 +605,12 @@ static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
   return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
 }
 
+static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+  return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+             ? CONVERT_TO_BYTEPTR(buf16)
+             : buf16;
+}
+
 static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
   switch (bsize) {
     case BLOCK_4X4: return 0;
@@ -674,6 +680,15 @@ static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = {
   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
 };
 
+static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = {
+  0x0001,  // 0000 0000 0000 0001
+  0x0201,  // 0000 0010 0000 0001
+  0x020F,  // 0000 0010 0000 1111
+  0x0E0F,  // 0000 1110 0000 1111
+  0x0FFF,  // 0000 1111 1111 1111
+  0xFFFF,  // 1111 1111 1111 1111
+};
+
 static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
                                                 int use_reduced_set) {
   const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
@@ -1145,38 +1160,6 @@ static INLINE PLANE_TYPE get_plane_type(int plane) {
   return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
 }
 
-static INLINE void transpose_uint8(uint8_t *dst, int dst_stride,
-                                   const uint8_t *src, int src_stride, int w,
-                                   int h) {
-  int r, c;
-  for (r = 0; r < h; ++r)
-    for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_uint16(uint16_t *dst, int dst_stride,
-                                    const uint16_t *src, int src_stride, int w,
-                                    int h) {
-  int r, c;
-  for (r = 0; r < h; ++r)
-    for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_int16(int16_t *dst, int dst_stride,
-                                   const int16_t *src, int src_stride, int w,
-                                   int h) {
-  int r, c;
-  for (r = 0; r < h; ++r)
-    for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_int32(int32_t *dst, int dst_stride,
-                                   const int32_t *src, int src_stride, int w,
-                                   int h) {
-  int r, c;
-  for (r = 0; r < h; ++r)
-    for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
 static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
   if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) {
     return 1024;
diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c
index c9b974900..e9e2b0e42 100644
--- a/third_party/aom/av1/common/cdef.c
+++ b/third_party/aom/av1/common/cdef.c
@@ -110,7 +110,7 @@ void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
 static void copy_sb8_16(AOM_UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
                         const uint8_t *src, int src_voffset, int src_hoffset,
                         int sstride, int vsize, int hsize) {
-  if (cm->use_highbitdepth) {
+  if (cm->seq_params.use_highbitdepth) {
     const uint16_t *base =
         &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
     copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
@@ -153,7 +153,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
   int mi_high_l2[3];
   int xdec[3];
   int ydec[3];
-  int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+  int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
   const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
@@ -363,7 +363,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                     vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
         }
 
-        if (cm->use_highbitdepth) {
+        if (cm->seq_params.use_highbitdepth) {
           cdef_filter_fb(
               NULL,
               &CONVERT_TO_SHORTPTR(
diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c
index ee19f0bcf..ccc59b4eb 100644
--- a/third_party/aom/av1/common/cfl.c
+++ b/third_party/aom/av1/common/cfl.c
@@ -15,21 +15,14 @@
 
 #include "config/av1_rtcd.h"
 
-void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) {
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) {
   assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
   assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
-  if (!(cm->subsampling_x == 0 && cm->subsampling_y == 0) &&
-      !(cm->subsampling_x == 1 && cm->subsampling_y == 1) &&
-      !(cm->subsampling_x == 1 && cm->subsampling_y == 0)) {
-    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                       "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported by "
-                       "CfL, %d %d subsampling is not supported.\n",
-                       cm->subsampling_x, cm->subsampling_y);
-  }
+
   memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3));
   memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3));
-  cfl->subsampling_x = cm->subsampling_x;
-  cfl->subsampling_y = cm->subsampling_y;
+  cfl->subsampling_x = seq_params->subsampling_x;
+  cfl->subsampling_y = seq_params->subsampling_y;
   cfl->are_parameters_computed = 0;
   cfl->store_y = 0;
   // The DC_PRED cache is disabled by default and is only enabled in
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
index d57f44f8b..ed962c722 100644
--- a/third_party/aom/av1/common/convolve.c
+++ b/third_party/aom/av1/common/convolve.c
@@ -75,8 +75,8 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
 
 void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                           int dst_stride, int w, int h,
-                          InterpFilterParams *filter_params_x,
-                          InterpFilterParams *filter_params_y,
+                          const InterpFilterParams *filter_params_x,
+                          const InterpFilterParams *filter_params_y,
                           const int subpel_x_q4, const int subpel_y_q4,
                           ConvolveParams *conv_params) {
   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
@@ -91,7 +91,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
   // horizontal filter
   const uint8_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
   for (int y = 0; y < im_h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -107,7 +107,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
   // vertical filter
   int16_t *src_vert = im_block + fo_vert * im_stride;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -126,8 +126,8 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
 
 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
-                         InterpFilterParams *filter_params_x,
-                         InterpFilterParams *filter_params_y,
+                         const InterpFilterParams *filter_params_x,
+                         const InterpFilterParams *filter_params_y,
                          const int subpel_x_q4, const int subpel_y_q4,
                          ConvolveParams *conv_params) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -141,7 +141,7 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
 
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -156,8 +156,8 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
 
 void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
-                         InterpFilterParams *filter_params_x,
-                         InterpFilterParams *filter_params_y,
+                         const InterpFilterParams *filter_params_x,
+                         const InterpFilterParams *filter_params_y,
                          const int subpel_x_q4, const int subpel_y_q4,
                          ConvolveParams *conv_params) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -172,7 +172,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
 
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -187,8 +187,8 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
 
 void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                                int dst_stride, int w, int h,
-                               InterpFilterParams *filter_params_x,
-                               InterpFilterParams *filter_params_y,
+                               const InterpFilterParams *filter_params_x,
+                               const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params) {
   (void)filter_params_x;
@@ -204,8 +204,8 @@ void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
 
 void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
                            int dst8_stride, int w, int h,
-                           InterpFilterParams *filter_params_x,
-                           InterpFilterParams *filter_params_y,
+                           const InterpFilterParams *filter_params_x,
+                           const InterpFilterParams *filter_params_y,
                            const int subpel_x_q4, const int subpel_y_q4,
                            ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -222,7 +222,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
   // horizontal filter
   const uint8_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
   for (int y = 0; y < im_h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -238,7 +238,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
   // vertical filter
   int16_t *src_vert = im_block + fo_vert * im_stride;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -270,8 +270,8 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
 
 void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
                           int dst8_stride, int w, int h,
-                          InterpFilterParams *filter_params_x,
-                          InterpFilterParams *filter_params_y,
+                          const InterpFilterParams *filter_params_x,
+                          const InterpFilterParams *filter_params_y,
                           const int subpel_x_q4, const int subpel_y_q4,
                           ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -289,7 +289,7 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
 
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -320,8 +320,8 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
 
 void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
                           int dst8_stride, int w, int h,
-                          InterpFilterParams *filter_params_x,
-                          InterpFilterParams *filter_params_y,
+                          const InterpFilterParams *filter_params_x,
+                          const InterpFilterParams *filter_params_y,
                           const int subpel_x_q4, const int subpel_y_q4,
                           ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -339,7 +339,7 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
 
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -370,8 +370,8 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
 
 void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                 uint8_t *dst8, int dst8_stride, int w, int h,
-                                InterpFilterParams *filter_params_x,
-                                InterpFilterParams *filter_params_y,
+                                const InterpFilterParams *filter_params_x,
+                                const InterpFilterParams *filter_params_y,
                                 const int subpel_x_q4, const int subpel_y_q4,
                                 ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -412,8 +412,8 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
 
 void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
                              int dst8_stride, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                              const int subpel_x_qn, const int x_step_qn,
                              const int subpel_y_qn, const int y_step_qn,
                              ConvolveParams *conv_params) {
@@ -439,7 +439,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
       const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
       assert(x_filter_idx < SUBPEL_SHIFTS);
       const int16_t *x_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
+          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
       for (int k = 0; k < filter_params_x->taps; ++k) {
         sum += x_filter[k] * src_x[k - fo_horiz];
@@ -461,7 +461,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
       const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
       assert(y_filter_idx < SUBPEL_SHIFTS);
       const int16_t *y_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
+          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
       int32_t sum = 1 << offset_bits;
       for (int k = 0; k < filter_params_y->taps; ++k) {
         sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
@@ -498,8 +498,8 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
 
 static void convolve_2d_scale_wrapper(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
     const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
     ConvolveParams *conv_params) {
   if (conv_params->is_compound) {
@@ -520,25 +520,27 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
   (void)y_step_q4;
   (void)dst;
   (void)dst_stride;
-
-  InterpFilterParams filter_params_x, filter_params_y;
-  av1_get_convolve_filter_params(interp_filters, &filter_params_x,
-                                 &filter_params_y, w, h);
+  InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
+  InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+  const InterpFilterParams *filter_params_x =
+      av1_get_interp_filter_params_with_block_size(filter_x, w);
+  const InterpFilterParams *filter_params_y =
+      av1_get_interp_filter_params_with_block_size(filter_y, h);
 
   if (scaled)
     convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
-                              &filter_params_x, &filter_params_y, subpel_x_q4,
+                              filter_params_x, filter_params_y, subpel_x_q4,
                               x_step_q4, subpel_y_q4, y_step_q4, conv_params);
   else
     sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
-        src, src_stride, dst, dst_stride, w, h, &filter_params_x,
-        &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+        src, src_stride, dst, dst_stride, w, h, filter_params_x,
+        filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
 }
 
 void av1_highbd_convolve_2d_copy_sr_c(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   (void)filter_params_x;
   (void)filter_params_y;
@@ -554,8 +556,8 @@ void av1_highbd_convolve_2d_copy_sr_c(
 
 void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
-                                InterpFilterParams *filter_params_x,
-                                InterpFilterParams *filter_params_y,
+                                const InterpFilterParams *filter_params_x,
+                                const InterpFilterParams *filter_params_y,
                                 const int subpel_x_q4, const int subpel_y_q4,
                                 ConvolveParams *conv_params, int bd) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -569,7 +571,7 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
 
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -585,8 +587,8 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
 
 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
-                                InterpFilterParams *filter_params_x,
-                                InterpFilterParams *filter_params_y,
+                                const InterpFilterParams *filter_params_x,
+                                const InterpFilterParams *filter_params_y,
                                 const int subpel_x_q4, const int subpel_y_q4,
                                 ConvolveParams *conv_params, int bd) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -599,7 +601,7 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
          ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -614,8 +616,8 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
 
 void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                  uint16_t *dst, int dst_stride, int w, int h,
-                                 InterpFilterParams *filter_params_x,
-                                 InterpFilterParams *filter_params_y,
+                                 const InterpFilterParams *filter_params_x,
+                                 const InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params, int bd) {
   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
@@ -630,7 +632,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
   // horizontal filter
   const uint16_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
   for (int y = 0; y < im_h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -646,7 +648,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
   // vertical filter
   int16_t *src_vert = im_block + fo_vert * im_stride;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -666,8 +668,9 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
 
 void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
                                   uint16_t *dst16, int dst16_stride, int w,
-                                  int h, InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
+                                  int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
                                   const int subpel_x_q4, const int subpel_y_q4,
                                   ConvolveParams *conv_params, int bd) {
   int x, y, k;
@@ -685,7 +688,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
   // horizontal filter
   const uint16_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
   for (y = 0; y < im_h; ++y) {
     for (x = 0; x < w; ++x) {
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -703,7 +706,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
   int16_t *src_vert = im_block + fo_vert * im_stride;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
   for (y = 0; y < h; ++y) {
     for (x = 0; x < w; ++x) {
       int32_t sum = 1 << offset_bits;
@@ -734,8 +737,9 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
 
 void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
                                  uint16_t *dst16, int dst16_stride, int w,
-                                 int h, InterpFilterParams *filter_params_x,
-                                 InterpFilterParams *filter_params_y,
+                                 int h,
+                                 const InterpFilterParams *filter_params_x,
+                                 const InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -753,7 +757,7 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
   assert(bits >= 0);
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -784,8 +788,9 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
 
 void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
                                  uint16_t *dst16, int dst16_stride, int w,
-                                 int h, InterpFilterParams *filter_params_x,
-                                 InterpFilterParams *filter_params_y,
+                                 int h,
+                                 const InterpFilterParams *filter_params_x,
+                                 const InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -803,7 +808,7 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
   assert(bits >= 0);
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -834,8 +839,8 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
 
 void av1_highbd_jnt_convolve_2d_copy_c(
     const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
-    int w, int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int w, int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -875,8 +880,8 @@ void av1_highbd_jnt_convolve_2d_copy_c(
 
 void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                     uint16_t *dst, int dst_stride, int w, int h,
-                                    InterpFilterParams *filter_params_x,
-                                    InterpFilterParams *filter_params_y,
+                                    const InterpFilterParams *filter_params_x,
+                                    const InterpFilterParams *filter_params_y,
                                     const int subpel_x_qn, const int x_step_qn,
                                     const int subpel_y_qn, const int y_step_qn,
                                     ConvolveParams *conv_params, int bd) {
@@ -900,7 +905,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
       const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
       assert(x_filter_idx < SUBPEL_SHIFTS);
       const int16_t *x_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
+          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
       for (int k = 0; k < filter_params_x->taps; ++k) {
         sum += x_filter[k] * src_x[k - fo_horiz];
@@ -922,7 +927,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
       const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
       assert(y_filter_idx < SUBPEL_SHIFTS);
       const int16_t *y_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
+          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
       int32_t sum = 1 << offset_bits;
       for (int k = 0; k < filter_params_y->taps; ++k) {
         sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
@@ -971,9 +976,12 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
   (void)dst_stride;
 
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  InterpFilterParams filter_params_x, filter_params_y;
-  av1_get_convolve_filter_params(interp_filters, &filter_params_x,
-                                 &filter_params_y, w, h);
+  InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
+  InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+  const InterpFilterParams *filter_params_x =
+      av1_get_interp_filter_params_with_block_size(filter_x, w);
+  const InterpFilterParams *filter_params_y =
+      av1_get_interp_filter_params_with_block_size(filter_y, h);
 
   if (scaled) {
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -981,16 +989,16 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
       assert(conv_params->dst != NULL);
     }
     av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
-                                 &filter_params_x, &filter_params_y,
-                                 subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4,
-                                 conv_params, bd);
+                                 filter_params_x, filter_params_y, subpel_x_q4,
+                                 x_step_q4, subpel_y_q4, y_step_q4, conv_params,
+                                 bd);
   } else {
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
 
     sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 !=
                                           0][conv_params->is_compound](
-        src, src_stride, dst, dst_stride, w, h, &filter_params_x,
-        &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
+        src, src_stride, dst, dst_stride, w, h, filter_params_x,
+        filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
   }
 }
 
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
index 1b2c2d0d5..bc2d4bccf 100644
--- a/third_party/aom/av1/common/convolve.h
+++ b/third_party/aom/av1/common/convolve.h
@@ -40,27 +40,17 @@ typedef struct ConvolveParams {
 
 typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
-                                  InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
                                   const int subpel_x_q4, const int subpel_y_q4,
                                   ConvolveParams *conv_params);
 
 typedef void (*aom_highbd_convolve_fn_t)(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 
-static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters,
-                                                  InterpFilterParams *params_x,
-                                                  InterpFilterParams *params_y,
-                                                  int w, int h) {
-  InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
-  InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
-  *params_x = av1_get_interp_filter_params_with_block_size(filter_x, w);
-  *params_y = av1_get_interp_filter_params_with_block_size(filter_y, h);
-}
-
 struct AV1Common;
 struct scale_factors;
 
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
index a37ee9f24..689c25f30 100644
--- a/third_party/aom/av1/common/enums.h
+++ b/third_party/aom/av1/common/enums.h
@@ -557,6 +557,7 @@ typedef uint8_t TXFM_CONTEXT;
 #define BWDREF_FRAME 5
 #define ALTREF2_FRAME 6
 #define ALTREF_FRAME 7
+#define EXTREF_FRAME REF_FRAMES
 #define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
 
 #define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
@@ -607,6 +608,7 @@ typedef enum ATTRIBUTE_PACKED {
 
 // In large_scale_tile coding, external references are used.
 #define MAX_EXTERNAL_REFERENCES 128
+#define MAX_TILES 512
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/common/filter.c b/third_party/aom/av1/common/filter.c
deleted file mode 100644
index a7e67ea4a..000000000
--- a/third_party/aom/av1/common/filter.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/common/filter.h"
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                bilinear_filters[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },  { 0, 0, 0, 120, 8, 0, 0, 0 },
-  { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
-  { 0, 0, 0, 96, 32, 0, 0, 0 },  { 0, 0, 0, 88, 40, 0, 0, 0 },
-  { 0, 0, 0, 80, 48, 0, 0, 0 },  { 0, 0, 0, 72, 56, 0, 0, 0 },
-  { 0, 0, 0, 64, 64, 0, 0, 0 },  { 0, 0, 0, 56, 72, 0, 0, 0 },
-  { 0, 0, 0, 48, 80, 0, 0, 0 },  { 0, 0, 0, 40, 88, 0, 0, 0 },
-  { 0, 0, 0, 32, 96, 0, 0, 0 },  { 0, 0, 0, 24, 104, 0, 0, 0 },
-  { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },      { 0, 2, -6, 126, 8, -2, 0, 0 },
-  { 0, 2, -10, 122, 18, -4, 0, 0 },  { 0, 2, -12, 116, 28, -8, 2, 0 },
-  { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
-  { 0, 2, -16, 94, 58, -12, 2, 0 },  { 0, 2, -14, 84, 66, -12, 2, 0 },
-  { 0, 2, -14, 76, 76, -14, 2, 0 },  { 0, 2, -12, 66, 84, -14, 2, 0 },
-  { 0, 2, -12, 58, 94, -16, 2, 0 },  { 0, 2, -12, 48, 102, -14, 2, 0 },
-  { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
-  { 0, 0, -4, 18, 122, -10, 2, 0 },  { 0, 0, -2, 8, 126, -6, 2, 0 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },         { -2, 2, -6, 126, 8, -2, 2, 0 },
-  { -2, 6, -12, 124, 16, -6, 4, -2 },   { -2, 8, -18, 120, 26, -10, 6, -2 },
-  { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
-  { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
-  { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
-  { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
-  { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
-  { -2, 4, -6, 16, 124, -12, 6, -2 },   { 0, 2, -2, 8, 126, -6, 2, -2 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 2, 28, 62, 34, 2, 0, 0 },
-  { 0, 0, 26, 62, 36, 4, 0, 0 },    { 0, 0, 22, 62, 40, 4, 0, 0 },
-  { 0, 0, 20, 60, 42, 6, 0, 0 },    { 0, 0, 18, 58, 44, 8, 0, 0 },
-  { 0, 0, 16, 56, 46, 10, 0, 0 },   { 0, -2, 16, 54, 48, 12, 0, 0 },
-  { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
-  { 0, 0, 10, 46, 56, 16, 0, 0 },   { 0, 0, 8, 44, 58, 18, 0, 0 },
-  { 0, 0, 6, 42, 60, 20, 0, 0 },    { 0, 0, 4, 40, 62, 22, 0, 0 },
-  { 0, 0, 4, 36, 62, 26, 0, 0 },    { 0, 0, 2, 34, 62, 28, 2, 0 }
-};
-
-static const InterpFilterParams
-    av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
-      { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        EIGHTTAP_REGULAR },
-      { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        EIGHTTAP_SMOOTH },
-      { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        MULTITAP_SHARP },
-      { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        BILINEAR }
-    };
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_4[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 0, -4, 126, 8, -2, 0, 0 },
-  { 0, 0, -8, 122, 18, -4, 0, 0 },  { 0, 0, -10, 116, 28, -6, 0, 0 },
-  { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
-  { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
-  { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
-  { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
-  { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
-  { 0, 0, -4, 18, 122, -8, 0, 0 },  { 0, 0, -2, 8, 126, -4, 0, 0 }
-};
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },   { 0, 0, 30, 62, 34, 2, 0, 0 },
-  { 0, 0, 26, 62, 36, 4, 0, 0 },  { 0, 0, 22, 62, 40, 4, 0, 0 },
-  { 0, 0, 20, 60, 42, 6, 0, 0 },  { 0, 0, 18, 58, 44, 8, 0, 0 },
-  { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
-  { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
-  { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
-  { 0, 0, 6, 42, 60, 20, 0, 0 },  { 0, 0, 4, 40, 62, 22, 0, 0 },
-  { 0, 0, 4, 36, 62, 26, 0, 0 },  { 0, 0, 2, 34, 62, 30, 0, 0 }
-};
-
-static const InterpFilterParams av1_interp_4tap[2] = {
-  { (const int16_t *)sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
-    EIGHTTAP_REGULAR },
-  { (const int16_t *)sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
-    EIGHTTAP_SMOOTH },
-};
-
-InterpFilterParams av1_get_interp_filter_params_with_block_size(
-    const InterpFilter interp_filter, const int w) {
-  if (w <= 4 &&
-      (interp_filter == MULTITAP_SHARP || interp_filter == EIGHTTAP_REGULAR))
-    return av1_interp_4tap[0];
-  else if (w <= 4 && interp_filter == EIGHTTAP_SMOOTH)
-    return av1_interp_4tap[1];
-
-  return av1_interp_filter_params_list[interp_filter];
-}
-
-const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter) {
-  return (const int16_t *)av1_interp_filter_params_list[interp_filter]
-      .filter_ptr;
-}
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
index 0c24ad9d0..7f8ad583a 100644
--- a/third_party/aom/av1/common/filter.h
+++ b/third_party/aom/av1/common/filter.h
@@ -64,8 +64,8 @@ static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
   return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter;
 }
 
-#define LOG_SWITCHABLE_FILTERS \
-  2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#define LOG_SWITCHABLE_FILTERS 2
 
 #define MAX_SUBPEL_TAPS 12
 #define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
@@ -79,14 +79,116 @@ typedef struct InterpFilterParams {
   InterpFilter interp_filter;
 } InterpFilterParams;
 
-const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter);
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_bilinear_filters[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128, 0, 0, 0, 0 },  { 0, 0, 0, 120, 8, 0, 0, 0 },
+  { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
+  { 0, 0, 0, 96, 32, 0, 0, 0 },  { 0, 0, 0, 88, 40, 0, 0, 0 },
+  { 0, 0, 0, 80, 48, 0, 0, 0 },  { 0, 0, 0, 72, 56, 0, 0, 0 },
+  { 0, 0, 0, 64, 64, 0, 0, 0 },  { 0, 0, 0, 56, 72, 0, 0, 0 },
+  { 0, 0, 0, 48, 80, 0, 0, 0 },  { 0, 0, 0, 40, 88, 0, 0, 0 },
+  { 0, 0, 0, 32, 96, 0, 0, 0 },  { 0, 0, 0, 24, 104, 0, 0, 0 },
+  { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128, 0, 0, 0, 0 },      { 0, 2, -6, 126, 8, -2, 0, 0 },
+  { 0, 2, -10, 122, 18, -4, 0, 0 },  { 0, 2, -12, 116, 28, -8, 2, 0 },
+  { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
+  { 0, 2, -16, 94, 58, -12, 2, 0 },  { 0, 2, -14, 84, 66, -12, 2, 0 },
+  { 0, 2, -14, 76, 76, -14, 2, 0 },  { 0, 2, -12, 66, 84, -14, 2, 0 },
+  { 0, 2, -12, 58, 94, -16, 2, 0 },  { 0, 2, -12, 48, 102, -14, 2, 0 },
+  { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
+  { 0, 0, -4, 18, 122, -10, 2, 0 },  { 0, 0, -2, 8, 126, -6, 2, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128, 0, 0, 0, 0 },         { -2, 2, -6, 126, 8, -2, 2, 0 },
+  { -2, 6, -12, 124, 16, -6, 4, -2 },   { -2, 8, -18, 120, 26, -10, 6, -2 },
+  { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
+  { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
+  { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
+  { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
+  { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
+  { -2, 4, -6, 16, 124, -12, 6, -2 },   { 0, 2, -2, 8, 126, -6, 2, -2 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 2, 28, 62, 34, 2, 0, 0 },
+  { 0, 0, 26, 62, 36, 4, 0, 0 },    { 0, 0, 22, 62, 40, 4, 0, 0 },
+  { 0, 0, 20, 60, 42, 6, 0, 0 },    { 0, 0, 18, 58, 44, 8, 0, 0 },
+  { 0, 0, 16, 56, 46, 10, 0, 0 },   { 0, -2, 16, 54, 48, 12, 0, 0 },
+  { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
+  { 0, 0, 10, 46, 56, 16, 0, 0 },   { 0, 0, 8, 44, 58, 18, 0, 0 },
+  { 0, 0, 6, 42, 60, 20, 0, 0 },    { 0, 0, 4, 40, 62, 22, 0, 0 },
+  { 0, 0, 4, 36, 62, 26, 0, 0 },    { 0, 0, 2, 34, 62, 28, 2, 0 }
+};
+
+static const InterpFilterParams
+    av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
+      { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
+        EIGHTTAP_REGULAR },
+      { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS,
+        SUBPEL_SHIFTS, EIGHTTAP_SMOOTH },
+      { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+        MULTITAP_SHARP },
+      { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+        BILINEAR }
+    };
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 0, -4, 126, 8, -2, 0, 0 },
+  { 0, 0, -8, 122, 18, -4, 0, 0 },  { 0, 0, -10, 116, 28, -6, 0, 0 },
+  { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
+  { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
+  { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
+  { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
+  { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
+  { 0, 0, -4, 18, 122, -8, 0, 0 },  { 0, 0, -2, 8, 126, -4, 0, 0 }
+};
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128, 0, 0, 0, 0 },   { 0, 0, 30, 62, 34, 2, 0, 0 },
+  { 0, 0, 26, 62, 36, 4, 0, 0 },  { 0, 0, 22, 62, 40, 4, 0, 0 },
+  { 0, 0, 20, 60, 42, 6, 0, 0 },  { 0, 0, 18, 58, 44, 8, 0, 0 },
+  { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
+  { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
+  { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+  { 0, 0, 6, 42, 60, 20, 0, 0 },  { 0, 0, 4, 40, 62, 22, 0, 0 },
+  { 0, 0, 4, 36, 62, 26, 0, 0 },  { 0, 0, 2, 34, 62, 30, 0, 0 }
+};
+
+// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
+static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
+  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+    EIGHTTAP_REGULAR },
+  { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+    EIGHTTAP_SMOOTH },
+  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+    EIGHTTAP_REGULAR },
+  { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+    BILINEAR },
+};
+
+static INLINE const InterpFilterParams *
+av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
+                                             const int w) {
+  if (w <= 4) return &av1_interp_4tap[interp_filter];
+  return &av1_interp_filter_params_list[interp_filter];
+}
 
-InterpFilterParams av1_get_interp_filter_params_with_block_size(
-    const InterpFilter interp_filter, const int w);
+static INLINE const int16_t *av1_get_interp_filter_kernel(
+    const InterpFilter interp_filter) {
+  return av1_interp_filter_params_list[interp_filter].filter_ptr;
+}
 
 static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
-    const InterpFilterParams filter_params, const int subpel) {
-  return filter_params.filter_ptr + filter_params.taps * subpel;
+    const InterpFilterParams *const filter_params, const int subpel) {
+  return filter_params->filter_ptr + filter_params->taps * subpel;
 }
 
 #ifdef __cplusplus
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
index a6227f18f..c2495640e 100644
--- a/third_party/aom/av1/common/mv.h
+++ b/third_party/aom/av1/common/mv.h
@@ -294,9 +294,6 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
   mv->row = clamp(mv->row, min_row, max_row);
 }
 
-static INLINE int mv_has_subpel(const MV *mv) {
-  return (mv->row & SUBPEL_MASK) || (mv->col & SUBPEL_MASK);
-}
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
index 716b4a247..f68c159e1 100644
--- a/third_party/aom/av1/common/mvref_common.h
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -44,7 +44,7 @@ static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) {
   assert(b >= 0 && b < (1 << bits));
 
   int diff = a - b;
-  int m = 1 << (bits - 1);
+  const int m = 1 << (bits - 1);
   diff = (diff & (m - 1)) - (diff & m);
   return diff;
 }
diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h
index fa5f02e52..6b1bf2d74 100644
--- a/third_party/aom/av1/common/onyxc_int.h
+++ b/third_party/aom/av1/common/onyxc_int.h
@@ -184,7 +184,10 @@ typedef struct BitstreamLevel {
   uint8_t minor;
 } BitstreamLevel;
 
-/* Initial version of sequence header structure */
+// Sequence header structure.
+// Note: All syntax elements of sequence_header_obu that need to be
+// bit-identical across multiple sequence headers must be part of this struct,
+// so that consistency is checked by the are_seq_headers_consistent() function.
 typedef struct SequenceHeader {
   int num_bits_width;
   int num_bits_height;
@@ -205,7 +208,6 @@ typedef struct SequenceHeader {
                                    // 2 - adaptive
   int still_picture;               // Video is a single frame still picture
   int reduced_still_picture_hdr;   // Use reduced header for still picture
-  int monochrome;                  // Monochorme video
   int enable_filter_intra;         // enables/disables filterintra
   int enable_intra_edge_filter;    // enables/disables corner/edge/upsampling
   int enable_interintra_compound;  // enables/disables interintra_compound
@@ -229,6 +231,9 @@ typedef struct SequenceHeader {
                            //     enabled for that frame.
   int enable_cdef;         // To turn on/off CDEF
   int enable_restoration;  // To turn on/off loop restoration
+  BITSTREAM_PROFILE profile;
+
+  // Operating point info.
   int operating_points_cnt_minus_1;
   int operating_point_idc[MAX_NUM_OPERATING_POINTS];
   int display_model_info_present_flag;
@@ -236,15 +241,26 @@ typedef struct SequenceHeader {
   BitstreamLevel level[MAX_NUM_OPERATING_POINTS];
   uint8_t tier[MAX_NUM_OPERATING_POINTS];  // seq_tier in the spec. One bit: 0
                                            // or 1.
-} SequenceHeader;
 
-typedef struct AV1Common {
-  struct aom_internal_error_info error;
+  // Color config.
+  aom_bit_depth_t bit_depth;  // AOM_BITS_8 in profile 0 or 1,
+                              // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
+  int use_highbitdepth;       // If true, we need to use 16bit frame buffers.
+  int monochrome;             // Monochrome video
   aom_color_primaries_t color_primaries;
   aom_transfer_characteristics_t transfer_characteristics;
   aom_matrix_coefficients_t matrix_coefficients;
-  aom_chroma_sample_position_t chroma_sample_position;
   int color_range;
+  int subsampling_x;          // Chroma subsampling for x
+  int subsampling_y;          // Chroma subsampling for y
+  aom_chroma_sample_position_t chroma_sample_position;
+  int separate_uv_delta_q;
+
+  int film_grain_params_present;
+} SequenceHeader;
+
+typedef struct AV1Common {
+  struct aom_internal_error_info error;
   int width;
   int height;
   int render_width;
@@ -253,18 +269,11 @@ typedef struct AV1Common {
   int last_height;
   int timing_info_present;
   aom_timing_info_t timing_info;
-  int buffer_removal_delay_present;
+  int buffer_removal_time_present;
   aom_dec_model_info_t buffer_model;
   aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
   aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
-  int tu_presentation_delay_flag;
-  int64_t tu_presentation_delay;
-
-  // TODO(jkoleszar): this implies chroma ss right now, but could vary per
-  // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to
-  // support additional planes.
-  int subsampling_x;
-  int subsampling_y;
+  uint32_t frame_presentation_time;
 
   int largest_tile_id;
   size_t largest_tile_size;
@@ -273,8 +282,6 @@ typedef struct AV1Common {
   // Scale of the current frame with respect to itself.
   struct scale_factors sf_identity;
 
-  // Marks if we need to use 16bit frame buffers (1: yes, 0: no).
-  int use_highbitdepth;
   YV12_BUFFER_CONFIG *frame_to_show;
   RefCntBuffer *prev_frame;
 
@@ -342,8 +349,6 @@ typedef struct AV1Common {
   int u_ac_delta_q;
   int v_ac_delta_q;
 
-  int separate_uv_delta_q;
-
   // The dequantizers below are true dequntizers used only in the
   // dequantization process.  They have the same coefficient
   // shift/scale as TX.
@@ -447,10 +452,7 @@ typedef struct AV1Common {
   unsigned int frame_offset;
 
   unsigned int current_video_frame;
-  BITSTREAM_PROFILE profile;
 
-  // AOM_BITS_8 in profile 0 or 1, AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
-  aom_bit_depth_t bit_depth;
   aom_bit_depth_t dequant_bit_depth;  // bit_depth of current dequantizer
 
   int error_resilient_mode;
@@ -494,9 +496,8 @@ typedef struct AV1Common {
   ENTROPY_CONTEXT **above_context[MAX_MB_PLANE];
   TXFM_CONTEXT **above_txfm_context;
   WarpedMotionParams global_motion[REF_FRAMES];
-  aom_film_grain_table_t *film_grain_table;
-  int film_grain_params_present;
   aom_film_grain_t film_grain_params;
+
   int cdef_pri_damping;
   int cdef_sec_damping;
   int nb_cdef_strengths;
@@ -590,7 +591,7 @@ static INLINE int get_free_fb(AV1_COMMON *cm) {
     if (frame_bufs[i].ref_count == 0) break;
 
   if (i != FRAME_BUFFERS) {
-    if (frame_bufs[i].buf.use_external_refernce_buffers) {
+    if (frame_bufs[i].buf.use_external_reference_buffers) {
       // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the
       // external reference buffers. Restore the buffer pointers to point to the
       // internally allocated memory.
@@ -598,7 +599,7 @@ static INLINE int get_free_fb(AV1_COMMON *cm) {
       ybf->y_buffer = ybf->store_buf_adr[0];
       ybf->u_buffer = ybf->store_buf_adr[1];
       ybf->v_buffer = ybf->store_buf_adr[2];
-      ybf->use_external_refernce_buffers = 0;
+      ybf->use_external_reference_buffers = 0;
     }
 
     frame_bufs[i].ref_count = 1;
@@ -683,15 +684,7 @@ static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
   }
 }
 
-static INLINE int mi_cols_aligned_to_sb(const AV1_COMMON *cm) {
-  return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
-}
-
-static INLINE int mi_rows_aligned_to_sb(const AV1_COMMON *cm) {
-  return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
-}
-
-void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm);
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params);
 
 static INLINE int av1_num_planes(const AV1_COMMON *cm) {
   return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE;
@@ -734,7 +727,7 @@ static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
   }
   xd->mi_stride = cm->mi_stride;
   xd->error_info = &cm->error;
-  cfl_init(&xd->cfl, cm);
+  cfl_init(&xd->cfl, &cm->seq_params);
 }
 
 static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
@@ -1066,17 +1059,18 @@ static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
   return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
 }
 
-static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd,
   int mi_col_start, int mi_col_end, const int tile_row) {
+  const SequenceHeader *const seq_params = &cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   const int width = mi_col_end - mi_col_start;
   const int aligned_width =
-    ALIGN_POWER_OF_TWO(width, cm->seq_params.mib_size_log2);
+    ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
 
   const int offset_y = mi_col_start;
   const int width_y = aligned_width;
-  const int offset_uv = offset_y >> cm->subsampling_x;
-  const int width_uv = width_y >> cm->subsampling_x;
+  const int offset_uv = offset_y >> seq_params->subsampling_x;
+  const int width_uv = width_y >> seq_params->subsampling_x;
 
   av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y);
   if (num_planes > 1) {
@@ -1084,7 +1078,7 @@ static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
       av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv);
       av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv);
     } else {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
                          "Invalid value of planes");
     }
   }
diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c
index 84575d74b..0e14da7a3 100644
--- a/third_party/aom/av1/common/quant_common.c
+++ b/third_party/aom/av1/common/quant_common.c
@@ -223,29 +223,6 @@ int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
   return av1_ac_quant_Q3(qindex, delta, bit_depth);
 }
 
-int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth) {
-  int i;
-  const int16_t *tab = ac_qlookup_Q3;
-  switch (bit_depth) {
-    case AOM_BITS_10: {
-      tab = ac_qlookup_10_Q3;
-      break;
-    }
-    case AOM_BITS_12: {
-      tab = ac_qlookup_12_Q3;
-      break;
-    }
-    default:
-      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
-      return -1;
-  }
-  (void)bit_depth;
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (ac_Q3 <= tab[i]) return i;
-  }
-  return QINDEX_RANGE - 1;
-}
-
 int av1_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex) {
   if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
index f9681036d..ca199e94c 100644
--- a/third_party/aom/av1/common/quant_common.h
+++ b/third_party/aom/av1/common/quant_common.h
@@ -42,7 +42,6 @@ int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
 int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
 int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
 int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
-int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth);
 
 int av1_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex);
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
index b6ac436fb..b9f0b57f3 100644
--- a/third_party/aom/av1/common/reconinter.c
+++ b/third_party/aom/av1/common/reconinter.c
@@ -627,9 +627,7 @@ void av1_make_masked_inter_predictor(
                   tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
 #undef INTER_PRED_BYTES_PER_PIXEL
 
-  uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-                         ? CONVERT_TO_BYTEPTR(tmp_buf)
-                         : tmp_buf;
+  uint8_t *tmp_dst = get_buf_by_bd(xd, tmp_buf);
 
   const int tmp_buf_stride = MAX_SB_SIZE;
   CONV_BUF_TYPE *org_dst = conv_params->dst;
@@ -1002,8 +1000,8 @@ void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
     BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
                                { xd->plane[0].dst.stride, 0, 0 } };
     if (!ctx) ctx = &default_ctx;
-    av1_build_interintra_predictors_sby(cm, xd, xd->plane[0].dst.buf,
-                                        xd->plane[0].dst.stride, ctx, bsize);
+    av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf,
+                                        xd->plane[0].dst.stride, ctx, 0, bsize);
   }
 }
 
@@ -1609,10 +1607,10 @@ void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
   const int ssy = xd->plane[plane].subsampling_y;
   BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
   PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode];
-  xd->mi[0]->angle_delta[PLANE_TYPE_Y] = 0;
-  xd->mi[0]->angle_delta[PLANE_TYPE_UV] = 0;
-  xd->mi[0]->filter_intra_mode_info.use_filter_intra = 0;
-  xd->mi[0]->use_intrabc = 0;
+  assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0);
+  assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0);
+  assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0);
+  assert(xd->mi[0]->use_intrabc == 0);
 
   av1_predict_intra_block(cm, xd, pd->width, pd->height,
                           max_txsize_rect_lookup[plane_bsize], mode, 0, 0,
@@ -1642,42 +1640,23 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
       inter_pred, inter_stride, intra_pred, intra_stride);
 }
 
-void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         uint8_t *ypred, int ystride,
-                                         BUFFER_SET *ctx, BLOCK_SIZE bsize) {
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
-    av1_build_intra_predictors_for_interintra(
-        cm, xd, bsize, 0, ctx, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
-    av1_combine_interintra(xd, bsize, 0, ypred, ystride,
-                           CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
-    return;
-  }
-  {
-    DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
-    av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, ctx,
-                                              intrapredictor, MAX_SB_SIZE);
-    av1_combine_interintra(xd, bsize, 0, ypred, ystride, intrapredictor,
-                           MAX_SB_SIZE);
-  }
-}
-
-void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         uint8_t *upred, int ustride,
+// build interintra_predictors for one plane
+void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                         uint8_t *pred, int stride,
                                          BUFFER_SET *ctx, int plane,
                                          BLOCK_SIZE bsize) {
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
     av1_build_intra_predictors_for_interintra(
-        cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(uintrapredictor),
+        cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
         MAX_SB_SIZE);
-    av1_combine_interintra(xd, bsize, plane, upred, ustride,
-                           CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
+    av1_combine_interintra(xd, bsize, plane, pred, stride,
+                           CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
   } else {
-    DECLARE_ALIGNED(16, uint8_t, uintrapredictor[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
     av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
-                                              uintrapredictor, MAX_SB_SIZE);
-    av1_combine_interintra(xd, bsize, plane, upred, ustride, uintrapredictor,
+                                              intrapredictor, MAX_SB_SIZE);
+    av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor,
                            MAX_SB_SIZE);
   }
 }
@@ -1686,8 +1665,8 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                           uint8_t *upred, uint8_t *vpred,
                                           int ustride, int vstride,
                                           BUFFER_SET *ctx, BLOCK_SIZE bsize) {
-  av1_build_interintra_predictors_sbc(cm, xd, upred, ustride, ctx, 1, bsize);
-  av1_build_interintra_predictors_sbc(cm, xd, vpred, vstride, ctx, 2, bsize);
+  av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
+  av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
 }
 
 void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -1695,7 +1674,7 @@ void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                      uint8_t *vpred, int ystride, int ustride,
                                      int vstride, BUFFER_SET *ctx,
                                      BLOCK_SIZE bsize) {
-  av1_build_interintra_predictors_sby(cm, xd, ypred, ystride, ctx, bsize);
+  av1_build_interintra_predictors_sbp(cm, xd, ypred, ystride, ctx, 0, bsize);
   av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride,
                                        ctx, bsize);
 }
@@ -1713,9 +1692,7 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
 
   const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
   struct buf_2d *const pre_buf = &pd->pre[ref];
-  const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
-  uint8_t *const dst =
-      (hbd ? CONVERT_TO_BYTEPTR(ext_dst) : ext_dst) + ext_dst_stride * y + x;
+  uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
   const MV mv = mi->mv[ref].as_mv;
 
   ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
index aa3aefc88..6a3def270 100644
--- a/third_party/aom/av1/common/reconinter.h
+++ b/third_party/aom/av1/common/reconinter.h
@@ -412,12 +412,9 @@ void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                      int vstride, BUFFER_SET *ctx,
                                      BLOCK_SIZE bsize);
 
-void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         uint8_t *ypred, int ystride,
-                                         BUFFER_SET *ctx, BLOCK_SIZE bsize);
-
-void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         uint8_t *upred, int ustride,
+// build interintra_predictors for one plane
+void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                         uint8_t *pred, int stride,
                                          BUFFER_SET *ctx, int plane,
                                          BLOCK_SIZE bsize);
 
@@ -429,6 +426,7 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
 void av1_build_intra_predictors_for_interintra(
     const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
     BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
+
 void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                             const uint8_t *inter_pred, int inter_stride,
                             const uint8_t *intra_pred, int intra_stride);
diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c
index 21d1f60b2..71a52e73e 100644
--- a/third_party/aom/av1/common/reconintra.c
+++ b/third_party/aom/av1/common/reconintra.c
@@ -1071,13 +1071,6 @@ static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) {
   p_left[-1] = s;
 }
 
-static int use_intra_edge_upsample(int bs0, int bs1, int delta, int type) {
-  const int d = abs(delta);
-  const int blk_wh = bs0 + bs1;
-  if (d <= 0 || d >= 40) return 0;
-  return type ? (blk_wh <= 8) : (blk_wh <= 16);
-}
-
 void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
   // interpolate half-sample positions
   assert(sz <= MAX_UPSAMPLE_SZ);
@@ -1284,13 +1277,13 @@ static void build_intra_predictors_high(
         }
       }
       upsample_above =
-          use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+          av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
       if (need_above && upsample_above) {
         const int n_px = txwpx + (need_right ? txhpx : 0);
         av1_upsample_intra_edge_high(above_row, n_px, xd->bd);
       }
       upsample_left =
-          use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+          av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
       if (need_left && upsample_left) {
         const int n_px = txhpx + (need_bottom ? txwpx : 0);
         av1_upsample_intra_edge_high(left_col, n_px, xd->bd);
@@ -1467,13 +1460,13 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
         }
       }
       upsample_above =
-          use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+          av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
       if (need_above && upsample_above) {
         const int n_px = txwpx + (need_right ? txhpx : 0);
         av1_upsample_intra_edge(above_row, n_px);
       }
       upsample_left =
-          use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+          av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
       if (need_left && upsample_left) {
         const int n_px = txhpx + (need_bottom ? txwpx : 0);
         av1_upsample_intra_edge(left_col, n_px);
@@ -1642,4 +1635,6 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
                           dst_stride, dst, dst_stride, blk_col, blk_row, plane);
 }
 
-void av1_init_intra_predictors(void) { once(init_intra_predictors_internal); }
+void av1_init_intra_predictors(void) {
+  aom_once(init_intra_predictors_internal);
+}
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
index a7d9e8b79..57638f24e 100644
--- a/third_party/aom/av1/common/reconintra.h
+++ b/third_party/aom/av1/common/reconintra.h
@@ -12,6 +12,8 @@
 #ifndef AV1_COMMON_RECONINTRA_H_
 #define AV1_COMMON_RECONINTRA_H_
 
+#include <stdlib.h>
+
 #include "aom/aom_integer.h"
 #include "av1/common/blockd.h"
 #include "av1/common/onyxc_int.h"
@@ -103,6 +105,14 @@ static INLINE int av1_get_dy(int angle) {
     return 1;
   }
 }
+
+static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
+                                              int type) {
+  const int d = abs(delta);
+  const int blk_wh = bs0 + bs1;
+  if (d <= 0 || d >= 40) return 0;
+  return type ? (blk_wh <= 8) : (blk_wh <= 16);
+}
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
index 17e6823b1..93d62292a 100644
--- a/third_party/aom/av1/common/resize.c
+++ b/third_party/aom/av1/common/resize.c
@@ -1100,7 +1100,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride,
                                 int plane, int rows) {
   const int is_uv = (plane > 0);
-  const int ss_x = is_uv && cm->subsampling_x;
+  const int ss_x = is_uv && cm->seq_params.subsampling_x;
   const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x);
   const int upscaled_plane_width =
       ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
@@ -1141,10 +1141,11 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
     const int pad_left = (j == 0);
     const int pad_right = (j == cm->tile_cols - 1);
 
-    if (cm->use_highbitdepth)
-      highbd_upscale_normative_rect(
-          src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width,
-          dst_stride, x_step_qn, x0_qn, pad_left, pad_right, cm->bit_depth);
+    if (cm->seq_params.use_highbitdepth)
+      highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride,
+                                    dst_ptr, rows, dst_width, dst_stride,
+                                    x_step_qn, x0_qn, pad_left, pad_right,
+                                    cm->seq_params.bit_depth);
     else
       upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
                              rows, dst_width, dst_stride, x_step_qn, x0_qn,
@@ -1175,7 +1176,7 @@ YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
   const int num_planes = av1_num_planes(cm);
   if (cm->width != unscaled->y_crop_width ||
       cm->height != unscaled->y_crop_height) {
-    av1_resize_and_extend_frame(unscaled, scaled, (int)cm->bit_depth,
+    av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth,
                                 num_planes);
     return scaled;
   } else {
@@ -1232,6 +1233,7 @@ static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src,
 void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
   const int num_planes = av1_num_planes(cm);
   if (!av1_superres_scaled(cm)) return;
+  const SequenceHeader *const seq_params = &cm->seq_params;
 
   YV12_BUFFER_CONFIG copy_buffer;
   memset(&copy_buffer, 0, sizeof(copy_buffer));
@@ -1239,10 +1241,10 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
   YV12_BUFFER_CONFIG *const frame_to_show = get_frame_new_buffer(cm);
 
   const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
-  if (aom_alloc_frame_buffer(&copy_buffer, aligned_width, cm->height,
-                             cm->subsampling_x, cm->subsampling_y,
-                             cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                             cm->byte_alignment))
+  if (aom_alloc_frame_buffer(
+          &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          AOM_BORDER_IN_PIXELS, cm->byte_alignment))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate copy buffer for superres upscaling");
 
@@ -1269,11 +1271,11 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
           "Failed to free current frame buffer before superres upscaling");
 
     // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
-    if (aom_realloc_frame_buffer(frame_to_show, cm->superres_upscaled_width,
-                                 cm->superres_upscaled_height,
-                                 cm->subsampling_x, cm->subsampling_y,
-                                 cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                                 cm->byte_alignment, fb, cb, cb_priv))
+    if (aom_realloc_frame_buffer(
+            frame_to_show, cm->superres_upscaled_width,
+            cm->superres_upscaled_height, seq_params->subsampling_x,
+            seq_params->subsampling_y, seq_params->use_highbitdepth,
+            AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv))
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to allocate current frame buffer for superres upscaling");
@@ -1283,10 +1285,11 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
 
     // Don't use callbacks on the encoder.
     // aom_alloc_frame_buffer() clears the config data for frame_to_show
-    if (aom_alloc_frame_buffer(frame_to_show, cm->superres_upscaled_width,
-                               cm->superres_upscaled_height, cm->subsampling_x,
-                               cm->subsampling_y, cm->use_highbitdepth,
-                               AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+    if (aom_alloc_frame_buffer(
+            frame_to_show, cm->superres_upscaled_width,
+            cm->superres_upscaled_height, seq_params->subsampling_x,
+            seq_params->subsampling_y, seq_params->use_highbitdepth,
+            AOM_BORDER_IN_PIXELS, cm->byte_alignment))
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to reallocate current frame buffer for superres upscaling");
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
index 58a5275ca..632967957 100644
--- a/third_party/aom/av1/common/restoration.c
+++ b/third_party/aom/av1/common/restoration.c
@@ -42,8 +42,8 @@ const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
 AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
   AV1PixelRect rect;
 
-  int ss_x = is_uv && cm->subsampling_x;
-  int ss_y = is_uv && cm->subsampling_y;
+  int ss_x = is_uv && cm->seq_params.subsampling_x;
+  int ss_y = is_uv && cm->seq_params.subsampling_y;
 
   rect.top = 0;
   rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
@@ -1146,16 +1146,17 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
                                             YV12_BUFFER_CONFIG *frame,
                                             AV1_COMMON *cm, int optimized_lr,
                                             int num_planes) {
-  const int bit_depth = cm->bit_depth;
-  const int highbd = cm->use_highbitdepth;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const int bit_depth = seq_params->bit_depth;
+  const int highbd = seq_params->use_highbitdepth;
   lr_ctxt->dst = &cm->rst_frame;
 
   const int frame_width = frame->crop_widths[0];
   const int frame_height = frame->crop_heights[0];
-  if (aom_realloc_frame_buffer(lr_ctxt->dst, frame_width, frame_height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                               cm->byte_alignment, NULL, NULL, NULL) < 0)
+  if (aom_realloc_frame_buffer(
+          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
+          seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS,
+          cm->byte_alignment, NULL, NULL, NULL) < 0)
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate restoration dst buffer");
 
@@ -1180,8 +1181,8 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
                  highbd);
 
     lr_plane_ctxt->rsi = rsi;
-    lr_plane_ctxt->ss_x = is_uv && cm->subsampling_x;
-    lr_plane_ctxt->ss_y = is_uv && cm->subsampling_y;
+    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
+    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
     lr_plane_ctxt->highbd = highbd;
     lr_plane_ctxt->bit_depth = bit_depth;
     lr_plane_ctxt->data8 = frame->buffers[plane];
@@ -1337,7 +1338,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
                                     int32_t *tmpbuf,
                                     RestorationLineBuffers *rlbs) {
   const int is_uv = plane > 0;
-  const int ss_y = is_uv && cm->subsampling_y;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
 
   const RestorationInfo *rsi = &cm->rst_info[plane];
 
@@ -1350,7 +1351,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
 int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
                                        int *rcol0, int *rcol1, int *rrow0,
-                                       int *rrow1, int *tile_tl_idx) {
+                                       int *rrow1) {
   assert(rcol0 && rcol1 && rrow0 && rrow1);
 
   if (bsize != cm->seq_params.sb_size) return 0;
@@ -1383,8 +1384,8 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
   const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
 
   // The size of an MI-unit on this plane of the image
-  const int ss_x = is_uv && cm->subsampling_x;
-  const int ss_y = is_uv && cm->subsampling_y;
+  const int ss_x = is_uv && cm->seq_params.subsampling_x;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
   const int mi_size_x = MI_SIZE >> ss_x;
   const int mi_size_y = MI_SIZE >> ss_y;
 
@@ -1419,9 +1420,6 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
   *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
   *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
 
-  const int tile_idx = 0;
-  *tile_tl_idx = tile_idx * rsi->units_per_tile;
-
   return *rcol0 < *rcol1 && *rrow0 < *rrow1;
 }
 
@@ -1468,7 +1466,7 @@ static void save_deblock_boundary_lines(
   int upscaled_width;
   int line_bytes;
   if (av1_superres_scaled(cm)) {
-    const int ss_x = is_uv && cm->subsampling_x;
+    const int ss_x = is_uv && cm->seq_params.subsampling_x;
     upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
     line_bytes = upscaled_width << use_highbd;
     if (use_highbd)
@@ -1515,7 +1513,7 @@ static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
   // At the point where this function is called, we've already applied
   // superres. So we don't need to extend the lines here, we can just
   // pull directly from the topmost row of the upscaled frame.
-  const int ss_x = is_uv && cm->subsampling_x;
+  const int ss_x = is_uv && cm->seq_params.subsampling_x;
   const int upscaled_width = av1_superres_scaled(cm)
                                  ? (cm->superres_upscaled_width + ss_x) >> ss_x
                                  : src_width;
@@ -1535,7 +1533,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                          int use_highbd, int plane,
                                          AV1_COMMON *cm, int after_cdef) {
   const int is_uv = plane > 0;
-  const int ss_y = is_uv && cm->subsampling_y;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
   const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
   const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
 
@@ -1600,7 +1598,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                               AV1_COMMON *cm, int after_cdef) {
   const int num_planes = av1_num_planes(cm);
-  const int use_highbd = cm->use_highbitdepth;
+  const int use_highbd = cm->seq_params.use_highbitdepth;
   for (int p = 0; p < num_planes; ++p) {
     save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
   }
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
index 0c4017534..aec37d834 100644
--- a/third_party/aom/av1/common/restoration.h
+++ b/third_party/aom/av1/common/restoration.h
@@ -346,7 +346,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
 int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
                                        int *rcol0, int *rcol1, int *rrow0,
-                                       int *rrow1, int *tile_tl_idx);
+                                       int *rrow1);
 
 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                               struct AV1Common *cm,
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
index c5cebc135..d206586b5 100644
--- a/third_party/aom/av1/common/scan.h
+++ b/third_party/aom/av1/common/scan.h
@@ -39,13 +39,6 @@ extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
 
 void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
 
-static INLINE int get_coef_context(const int16_t *neighbors,
-                                   const uint8_t *token_cache, int c) {
-  return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
-          token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >>
-         1;
-}
-
 static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size,
                                                  TX_TYPE tx_type) {
   return &av1_scan_orders[tx_size][tx_type];
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
index 3fa998a91..f9b734b8c 100644
--- a/third_party/aom/av1/common/thread_common.c
+++ b/third_party/aom/av1/common/thread_common.c
@@ -572,7 +572,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
   for (int plane = 0; plane < num_planes; plane++) {
     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
     const int is_uv = plane > 0;
-    const int ss_y = is_uv && cm->subsampling_y;
+    const int ss_y = is_uv && cm->seq_params.subsampling_y;
 
     AV1PixelRect tile_rect = ctxt[plane].tile_rect;
     const int unit_size = ctxt[plane].rsi->restoration_unit_size;
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
index 9a43ab29a..026c904b6 100644
--- a/third_party/aom/av1/common/tile_common.c
+++ b/third_party/aom/av1/common/tile_common.c
@@ -179,8 +179,8 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
   r.bottom = AOMMIN(r.bottom, frame_h);
 
   // Convert to coordinates in the appropriate plane
-  const int ss_x = is_uv && cm->subsampling_x;
-  const int ss_y = is_uv && cm->subsampling_y;
+  const int ss_x = is_uv && cm->seq_params.subsampling_x;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
 
   r.left = ROUND_POWER_OF_TWO(r.left, ss_x);
   r.right = ROUND_POWER_OF_TWO(r.right, ss_x);
diff --git a/third_party/aom/av1/common/timing.c b/third_party/aom/av1/common/timing.c
index 5ff538ae1..49dbde78f 100644
--- a/third_party/aom/av1/common/timing.c
+++ b/third_party/aom/av1/common/timing.c
@@ -53,8 +53,8 @@ int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
 
 void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) {
   decoder_model->encoder_decoder_buffer_delay_length = 16;
-  decoder_model->buffer_removal_delay_length = 10;
-  decoder_model->frame_presentation_delay_length = 10;
+  decoder_model->buffer_removal_time_length = 10;
+  decoder_model->frame_presentation_time_length = 10;
 }
 
 void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h
index d31f4b7fc..1749baa57 100644
--- a/third_party/aom/av1/common/timing.h
+++ b/third_party/aom/av1/common/timing.h
@@ -27,23 +27,23 @@ typedef struct aom_timing {
 typedef struct aom_dec_model_info {
   uint32_t num_units_in_decoding_tick;
   int encoder_decoder_buffer_delay_length;
-  int buffer_removal_delay_length;
-  int frame_presentation_delay_length;
+  int buffer_removal_time_length;
+  int frame_presentation_time_length;
 } aom_dec_model_info_t;
 
 typedef struct aom_dec_model_op_parameters {
   int decoder_model_param_present_flag;
   int64_t bitrate;
   int64_t buffer_size;
-  int decoder_buffer_delay;
-  int encoder_buffer_delay;
+  uint32_t decoder_buffer_delay;
+  uint32_t encoder_buffer_delay;
   int low_delay_mode_flag;
   int display_model_param_present_flag;
   int initial_display_delay;
 } aom_dec_model_op_parameters_t;
 
 typedef struct aom_op_timing_info_t {
-  int64_t buffer_removal_delay;
+  uint32_t buffer_removal_time;
 } aom_op_timing_info_t;
 
 void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
index cdac90d9e..f0ab79d0f 100644
--- a/third_party/aom/av1/common/txb_common.h
+++ b/third_party/aom/av1/common/txb_common.h
@@ -466,31 +466,6 @@ static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels,
   return mag;
 }
 
-static INLINE int get_nz_count(const uint8_t *const levels, const int bwl,
-                               const TX_CLASS tx_class) {
-  int count;
-
-  count = (levels[1] != 0);                         // { 0, 1 }
-  count += (levels[(1 << bwl) + TX_PAD_HOR] != 0);  // { 1, 0 }
-
-  for (int idx = 0; idx < SIG_REF_DIFF_OFFSET_NUM; ++idx) {
-    const int row_offset =
-        ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][0]
-                                   : ((tx_class == TX_CLASS_VERT)
-                                          ? sig_ref_diff_offset_vert[idx][0]
-                                          : sig_ref_diff_offset_horiz[idx][0]));
-    const int col_offset =
-        ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][1]
-                                   : ((tx_class == TX_CLASS_VERT)
-                                          ? sig_ref_diff_offset_vert[idx][1]
-                                          : sig_ref_diff_offset_horiz[idx][1]));
-    const int nb_pos =
-        (row_offset << bwl) + (row_offset << TX_PAD_HOR_LOG2) + col_offset;
-    count += (levels[nb_pos] != 0);
-  }
-  return count;
-}
-
 #define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D
 #define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5)
 #define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10)
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
index ae6f07657..412d83ed8 100644
--- a/third_party/aom/av1/common/warped_motion.c
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -92,33 +92,6 @@ static const int error_measure_lut[512] = {
 };
 /* clang-format on */
 
-void project_points_affine(const int32_t *mat, int *points, int *proj,
-                           const int n, const int stride_points,
-                           const int stride_proj, const int subsampling_x,
-                           const int subsampling_y) {
-  for (int i = 0; i < n; ++i) {
-    const int x = *(points++), y = *(points++);
-    if (subsampling_x)
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
-              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-          WARPEDDIFF_PREC_BITS + 1);
-    else
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
-                                            WARPEDDIFF_PREC_BITS);
-    if (subsampling_y)
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          mat[4] * 2 * x + mat[5] * 2 * y + mat[1] +
-              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-          WARPEDDIFF_PREC_BITS + 1);
-    else
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[4] * x + mat[5] * y + mat[1],
-                                            WARPEDDIFF_PREC_BITS);
-    points += stride_points - 2;
-    proj += stride_proj - 2;
-  }
-}
-
 // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
 // at a time. The zoom/rotation/shear in the model are applied to the
 // "fractional" position of each pixel, which therefore varies within
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
index f5da36bbb..ce4032ee5 100644
--- a/third_party/aom/av1/common/warped_motion.h
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -68,11 +68,6 @@ static const uint8_t warp_pad_right[14][16] = {
   { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
 };
 
-void project_points_affine(const int32_t *mat, int *points, int *proj,
-                           const int n, const int stride_points,
-                           const int stride_proj, const int subsampling_x,
-                           const int subsampling_y);
-
 // Returns the error between the result of applying motion 'wm' to the frame
 // described by 'ref' and the frame described by 'dst'.
 int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
index 6747cae01..0c5286f9d 100644
--- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -39,7 +39,7 @@ static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
     const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
     assert(filter_idx < SUBPEL_SHIFTS);
     const int16_t *filter =
-        av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
 
     // Load the filter coefficients
     const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
@@ -140,7 +140,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
     const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
     assert(filter_idx < SUBPEL_SHIFTS);
     const int16_t *filter =
-        av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
 
     const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
     int x;
@@ -232,8 +232,8 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
 }
 void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
                                   uint8_t *dst8, int dst8_stride, int w, int h,
-                                  InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
                                   const int subpel_x_qn, const int x_step_qn,
                                   const int subpel_y_qn, const int y_step_qn,
                                   ConvolveParams *conv_params) {
@@ -278,7 +278,7 @@ static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
     const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
     assert(filter_idx < SUBPEL_SHIFTS);
     const int16_t *filter =
-        av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
 
     // Load the filter coefficients
     const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
@@ -372,7 +372,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
     const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
     assert(filter_idx < SUBPEL_SHIFTS);
     const int16_t *filter =
-        av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
 
     const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
     int x;
@@ -472,8 +472,8 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
 
 void av1_highbd_convolve_2d_scale_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
     const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
     ConvolveParams *conv_params, int bd) {
   // TODO(yaowu): Move this out of stack
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
index 7415c58df..ae331b40d 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -19,49 +19,47 @@
 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
 
 static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
-                                      const __m256i __rounding,
-                                      int8_t cos_bit) {
+                                      const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(x1[0], x1[3]);
-  btf_16_adds_subs_avx2(x1[1], x1[2]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x1[5], x1[6]);
-
-  btf_16_adds_subs_avx2(x1[8], x1[11]);
-  btf_16_adds_subs_avx2(x1[9], x1[10]);
-  btf_16_subs_adds_avx2(x1[15], x1[12]);
-  btf_16_subs_adds_avx2(x1[14], x1[13]);
+  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+
+  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
 }
 
 static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
-                                      const __m256i __rounding,
-                                      int8_t cos_bit) {
+                                      const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(x[0], x[7]);
-  btf_16_adds_subs_avx2(x[1], x[6]);
-  btf_16_adds_subs_avx2(x[2], x[5]);
-  btf_16_adds_subs_avx2(x[3], x[4]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+  btf_16_adds_subs_avx2(&x[0], &x[7]);
+  btf_16_adds_subs_avx2(&x[1], &x[6]);
+  btf_16_adds_subs_avx2(&x[2], &x[5]);
+  btf_16_adds_subs_avx2(&x[3], &x[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
 }
 
 static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
-  btf_16_adds_subs_out_avx2(output[0], output[15], x1[0], x1[15]);
-  btf_16_adds_subs_out_avx2(output[1], output[14], x1[1], x1[14]);
-  btf_16_adds_subs_out_avx2(output[2], output[13], x1[2], x1[13]);
-  btf_16_adds_subs_out_avx2(output[3], output[12], x1[3], x1[12]);
-  btf_16_adds_subs_out_avx2(output[4], output[11], x1[4], x1[11]);
-  btf_16_adds_subs_out_avx2(output[5], output[10], x1[5], x1[10]);
-  btf_16_adds_subs_out_avx2(output[6], output[9], x1[6], x1[9]);
-  btf_16_adds_subs_out_avx2(output[7], output[8], x1[7], x1[8]);
+  btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
+  btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
+  btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
+  btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
+  btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
+  btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
+  btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
+  btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
 }
 
 static void idct16_new_avx2(const __m256i *input, __m256i *output,
                             int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
   __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
   __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
@@ -103,29 +101,29 @@ static void idct16_new_avx2(const __m256i *input, __m256i *output,
   x1[15] = input[15];
 
   // stage 2
-  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]);
-  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]);
-  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]);
-  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]);
+  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
 
   // stage 3
-  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]);
-  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]);
-  btf_16_adds_subs_avx2(x1[8], x1[9]);
-  btf_16_subs_adds_avx2(x1[11], x1[10]);
-  btf_16_adds_subs_avx2(x1[12], x1[13]);
-  btf_16_subs_adds_avx2(x1[15], x1[14]);
+  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
 
   // stage 4
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]);
-  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]);
-  btf_16_adds_subs_avx2(x1[4], x1[5]);
-  btf_16_subs_adds_avx2(x1[7], x1[6]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]);
-
-  idct16_stage5_avx2(x1, cospi, __rounding, cos_bit);
-  idct16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
+  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
   idct16_stage7_avx2(output, x1);
 }
 
@@ -133,7 +131,7 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
   const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
   const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
@@ -159,21 +157,21 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
   // stage 3
   btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
   btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
-  btf_16_adds_subs_avx2(x1[8], x1[9]);
-  btf_16_subs_adds_avx2(x1[11], x1[10]);
-  btf_16_adds_subs_avx2(x1[12], x1[13]);
-  btf_16_subs_adds_avx2(x1[15], x1[14]);
+  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
 
   // stage 4
   btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
   btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
-  btf_16_adds_subs_avx2(x1[4], x1[5]);
-  btf_16_subs_adds_avx2(x1[7], x1[6]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]);
+  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
 
-  idct16_stage5_avx2(x1, cospi, __rounding, cos_bit);
-  idct16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
+  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
   idct16_stage7_avx2(output, x1);
 }
 
@@ -212,74 +210,71 @@ static void idct16_low1_new_avx2(const __m256i *input, __m256i *output,
 }
 
 static INLINE void iadst16_stage3_avx2(__m256i *x) {
-  btf_16_adds_subs_avx2(x[0], x[8]);
-  btf_16_adds_subs_avx2(x[1], x[9]);
-  btf_16_adds_subs_avx2(x[2], x[10]);
-  btf_16_adds_subs_avx2(x[3], x[11]);
-  btf_16_adds_subs_avx2(x[4], x[12]);
-  btf_16_adds_subs_avx2(x[5], x[13]);
-  btf_16_adds_subs_avx2(x[6], x[14]);
-  btf_16_adds_subs_avx2(x[7], x[15]);
+  btf_16_adds_subs_avx2(&x[0], &x[8]);
+  btf_16_adds_subs_avx2(&x[1], &x[9]);
+  btf_16_adds_subs_avx2(&x[2], &x[10]);
+  btf_16_adds_subs_avx2(&x[3], &x[11]);
+  btf_16_adds_subs_avx2(&x[4], &x[12]);
+  btf_16_adds_subs_avx2(&x[5], &x[13]);
+  btf_16_adds_subs_avx2(&x[6], &x[14]);
+  btf_16_adds_subs_avx2(&x[7], &x[15]);
 }
 
 static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
-                                       const __m256i __rounding,
-                                       int8_t cos_bit) {
+                                       const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
   const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
   const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
   const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
   const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
   const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
-  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
-  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
-  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
-  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
 }
 
 static INLINE void iadst16_stage5_avx2(__m256i *x) {
-  btf_16_adds_subs_avx2(x[0], x[4]);
-  btf_16_adds_subs_avx2(x[1], x[5]);
-  btf_16_adds_subs_avx2(x[2], x[6]);
-  btf_16_adds_subs_avx2(x[3], x[7]);
-  btf_16_adds_subs_avx2(x[8], x[12]);
-  btf_16_adds_subs_avx2(x[9], x[13]);
-  btf_16_adds_subs_avx2(x[10], x[14]);
-  btf_16_adds_subs_avx2(x[11], x[15]);
+  btf_16_adds_subs_avx2(&x[0], &x[4]);
+  btf_16_adds_subs_avx2(&x[1], &x[5]);
+  btf_16_adds_subs_avx2(&x[2], &x[6]);
+  btf_16_adds_subs_avx2(&x[3], &x[7]);
+  btf_16_adds_subs_avx2(&x[8], &x[12]);
+  btf_16_adds_subs_avx2(&x[9], &x[13]);
+  btf_16_adds_subs_avx2(&x[10], &x[14]);
+  btf_16_adds_subs_avx2(&x[11], &x[15]);
 }
 
 static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
-                                       const __m256i __rounding,
-                                       int8_t cos_bit) {
+                                       const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
   const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
   const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
-  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
-  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
-  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
-  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
 }
 
 static INLINE void iadst16_stage7_avx2(__m256i *x) {
-  btf_16_adds_subs_avx2(x[0], x[2]);
-  btf_16_adds_subs_avx2(x[1], x[3]);
-  btf_16_adds_subs_avx2(x[4], x[6]);
-  btf_16_adds_subs_avx2(x[5], x[7]);
-  btf_16_adds_subs_avx2(x[8], x[10]);
-  btf_16_adds_subs_avx2(x[9], x[11]);
-  btf_16_adds_subs_avx2(x[12], x[14]);
-  btf_16_adds_subs_avx2(x[13], x[15]);
+  btf_16_adds_subs_avx2(&x[0], &x[2]);
+  btf_16_adds_subs_avx2(&x[1], &x[3]);
+  btf_16_adds_subs_avx2(&x[4], &x[6]);
+  btf_16_adds_subs_avx2(&x[5], &x[7]);
+  btf_16_adds_subs_avx2(&x[8], &x[10]);
+  btf_16_adds_subs_avx2(&x[9], &x[11]);
+  btf_16_adds_subs_avx2(&x[12], &x[14]);
+  btf_16_adds_subs_avx2(&x[13], &x[15]);
 }
 
 static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
-                                       const __m256i __rounding,
-                                       int8_t cos_bit) {
+                                       const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x1[2], x1[3]);
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x1[6], x1[7]);
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x1[10], x1[11]);
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x1[14], x1[15]);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
 }
 
 static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
@@ -307,7 +302,7 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output,
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
-  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
   __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
   __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
@@ -346,21 +341,21 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output,
   x1[15] = input[14];
 
   // stage 2
-  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, x1[0], x1[1], x1[0], x1[1]);
-  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, x1[2], x1[3], x1[2], x1[3]);
-  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, x1[4], x1[5], x1[4], x1[5]);
-  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, x1[6], x1[7], x1[6], x1[7]);
-  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, x1[8], x1[9], x1[8], x1[9]);
-  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, x1[10], x1[11], x1[10], x1[11]);
-  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, x1[12], x1[13], x1[12], x1[13]);
-  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, x1[14], x1[15], x1[14], x1[15]);
+  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
 
   iadst16_stage3_avx2(x1);
-  iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
   iadst16_stage5_avx2(x1);
-  iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
   iadst16_stage7_avx2(x1);
-  iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
   iadst16_stage9_avx2(output, x1);
 }
 
@@ -368,7 +363,7 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
                                   int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
   // stage 1
   __m256i x1[16];
@@ -392,11 +387,11 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
   btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
 
   iadst16_stage3_avx2(x1);
-  iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
   iadst16_stage5_avx2(x1);
-  iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
   iadst16_stage7_avx2(x1);
-  iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
   iadst16_stage9_avx2(output, x1);
 }
 
@@ -404,7 +399,7 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
                                   int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
   const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
   const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
@@ -423,7 +418,7 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
   x1[9] = x1[1];
 
   // stage 4
-  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x1[8], x1[9], x1[8], x1[9]);
+  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
 
   // stage 5
   x1[4] = x1[0];
@@ -433,8 +428,8 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
   x1[13] = x1[9];
 
   // stage 6
-  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[4], x1[5], x1[4], x1[5]);
-  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[12], x1[13], x1[12], x1[13]);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
 
   // stage 7
   x1[2] = x1[0];
@@ -446,130 +441,125 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
   x1[14] = x1[12];
   x1[15] = x1[13];
 
-  iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
   iadst16_stage9_avx2(output, x1);
 }
 
 static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
-  btf_16_adds_subs_avx2(x[16], x[17]);
-  btf_16_subs_adds_avx2(x[19], x[18]);
-  btf_16_adds_subs_avx2(x[20], x[21]);
-  btf_16_subs_adds_avx2(x[23], x[22]);
-  btf_16_adds_subs_avx2(x[24], x[25]);
-  btf_16_subs_adds_avx2(x[27], x[26]);
-  btf_16_adds_subs_avx2(x[28], x[29]);
-  btf_16_subs_adds_avx2(x[31], x[30]);
+  btf_16_adds_subs_avx2(&x[16], &x[17]);
+  btf_16_adds_subs_avx2(&x[19], &x[18]);
+  btf_16_adds_subs_avx2(&x[20], &x[21]);
+  btf_16_adds_subs_avx2(&x[23], &x[22]);
+  btf_16_adds_subs_avx2(&x[24], &x[25]);
+  btf_16_adds_subs_avx2(&x[27], &x[26]);
+  btf_16_adds_subs_avx2(&x[28], &x[29]);
+  btf_16_adds_subs_avx2(&x[31], &x[30]);
 }
 
 static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i __rounding,
-                                             int8_t cos_bit) {
+                                             const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
   const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
   const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
   const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
   const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
   const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
-  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
-  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
-  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
-  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
 }
 
 static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i __rounding,
-                                             int8_t cos_bit) {
+                                             const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
   const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
   const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
-  btf_16_adds_subs_avx2(x[16], x[19]);
-  btf_16_adds_subs_avx2(x[17], x[18]);
-  btf_16_subs_adds_avx2(x[23], x[20]);
-  btf_16_subs_adds_avx2(x[22], x[21]);
-  btf_16_adds_subs_avx2(x[24], x[27]);
-  btf_16_adds_subs_avx2(x[25], x[26]);
-  btf_16_subs_adds_avx2(x[31], x[28]);
-  btf_16_subs_adds_avx2(x[30], x[29]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x[16], &x[19]);
+  btf_16_adds_subs_avx2(&x[17], &x[18]);
+  btf_16_adds_subs_avx2(&x[23], &x[20]);
+  btf_16_adds_subs_avx2(&x[22], &x[21]);
+  btf_16_adds_subs_avx2(&x[24], &x[27]);
+  btf_16_adds_subs_avx2(&x[25], &x[26]);
+  btf_16_adds_subs_avx2(&x[31], &x[28]);
+  btf_16_adds_subs_avx2(&x[30], &x[29]);
 }
 
 static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i __rounding,
-                                             int8_t cos_bit) {
+                                             const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
   const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
   const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
-  btf_16_adds_subs_avx2(x[8], x[11]);
-  btf_16_adds_subs_avx2(x[9], x[10]);
-  btf_16_subs_adds_avx2(x[15], x[12]);
-  btf_16_subs_adds_avx2(x[14], x[13]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x[8], &x[11]);
+  btf_16_adds_subs_avx2(&x[9], &x[10]);
+  btf_16_adds_subs_avx2(&x[15], &x[12]);
+  btf_16_adds_subs_avx2(&x[14], &x[13]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
 }
 
 static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
-                                      const __m256i __rounding,
-                                      int8_t cos_bit) {
+                                      const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(x[0], x[7]);
-  btf_16_adds_subs_avx2(x[1], x[6]);
-  btf_16_adds_subs_avx2(x[2], x[5]);
-  btf_16_adds_subs_avx2(x[3], x[4]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
-  btf_16_adds_subs_avx2(x[16], x[23]);
-  btf_16_adds_subs_avx2(x[17], x[22]);
-  btf_16_adds_subs_avx2(x[18], x[21]);
-  btf_16_adds_subs_avx2(x[19], x[20]);
-  btf_16_subs_adds_avx2(x[31], x[24]);
-  btf_16_subs_adds_avx2(x[30], x[25]);
-  btf_16_subs_adds_avx2(x[29], x[26]);
-  btf_16_subs_adds_avx2(x[28], x[27]);
+  btf_16_adds_subs_avx2(&x[0], &x[7]);
+  btf_16_adds_subs_avx2(&x[1], &x[6]);
+  btf_16_adds_subs_avx2(&x[2], &x[5]);
+  btf_16_adds_subs_avx2(&x[3], &x[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x[16], &x[23]);
+  btf_16_adds_subs_avx2(&x[17], &x[22]);
+  btf_16_adds_subs_avx2(&x[18], &x[21]);
+  btf_16_adds_subs_avx2(&x[19], &x[20]);
+  btf_16_adds_subs_avx2(&x[31], &x[24]);
+  btf_16_adds_subs_avx2(&x[30], &x[25]);
+  btf_16_adds_subs_avx2(&x[29], &x[26]);
+  btf_16_adds_subs_avx2(&x[28], &x[27]);
 }
 
 static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
-                                      const __m256i __rounding,
-                                      int8_t cos_bit) {
+                                      const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(x[0], x[15]);
-  btf_16_adds_subs_avx2(x[1], x[14]);
-  btf_16_adds_subs_avx2(x[2], x[13]);
-  btf_16_adds_subs_avx2(x[3], x[12]);
-  btf_16_adds_subs_avx2(x[4], x[11]);
-  btf_16_adds_subs_avx2(x[5], x[10]);
-  btf_16_adds_subs_avx2(x[6], x[9]);
-  btf_16_adds_subs_avx2(x[7], x[8]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+  btf_16_adds_subs_avx2(&x[0], &x[15]);
+  btf_16_adds_subs_avx2(&x[1], &x[14]);
+  btf_16_adds_subs_avx2(&x[2], &x[13]);
+  btf_16_adds_subs_avx2(&x[3], &x[12]);
+  btf_16_adds_subs_avx2(&x[4], &x[11]);
+  btf_16_adds_subs_avx2(&x[5], &x[10]);
+  btf_16_adds_subs_avx2(&x[6], &x[9]);
+  btf_16_adds_subs_avx2(&x[7], &x[8]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
 }
 
 static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
-  btf_16_adds_subs_out_avx2(output[0], output[31], x[0], x[31]);
-  btf_16_adds_subs_out_avx2(output[1], output[30], x[1], x[30]);
-  btf_16_adds_subs_out_avx2(output[2], output[29], x[2], x[29]);
-  btf_16_adds_subs_out_avx2(output[3], output[28], x[3], x[28]);
-  btf_16_adds_subs_out_avx2(output[4], output[27], x[4], x[27]);
-  btf_16_adds_subs_out_avx2(output[5], output[26], x[5], x[26]);
-  btf_16_adds_subs_out_avx2(output[6], output[25], x[6], x[25]);
-  btf_16_adds_subs_out_avx2(output[7], output[24], x[7], x[24]);
-  btf_16_adds_subs_out_avx2(output[8], output[23], x[8], x[23]);
-  btf_16_adds_subs_out_avx2(output[9], output[22], x[9], x[22]);
-  btf_16_adds_subs_out_avx2(output[10], output[21], x[10], x[21]);
-  btf_16_adds_subs_out_avx2(output[11], output[20], x[11], x[20]);
-  btf_16_adds_subs_out_avx2(output[12], output[19], x[12], x[19]);
-  btf_16_adds_subs_out_avx2(output[13], output[18], x[13], x[18]);
-  btf_16_adds_subs_out_avx2(output[14], output[17], x[14], x[17]);
-  btf_16_adds_subs_out_avx2(output[15], output[16], x[15], x[16]);
+  btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
+  btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
+  btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
+  btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
+  btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
+  btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
+  btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
+  btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
+  btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
+  btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
+  btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
+  btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
+  btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
+  btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
+  btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
+  btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
 }
 
 static void idct32_low1_new_avx2(const __m256i *input, __m256i *output,
@@ -629,7 +619,7 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
   // stage 1
   __m256i x[32];
@@ -666,20 +656,20 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
   x[10] = x[11];
   x[13] = x[12];
   x[14] = x[15];
-  idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);
+  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
 
   // stage 5
   btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
   x[5] = x[4];
   x[6] = x[7];
-  idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);
+  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
   // stage 6
   x[3] = x[0];
   x[2] = x[1];
-  idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);
+  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
 
-  idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
-  idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
+  idct32_stage7_avx2(x, cospi, _r, cos_bit);
+  idct32_stage8_avx2(x, cospi, _r, cos_bit);
   idct32_stage9_avx2(output, x);
 }
 
@@ -687,7 +677,7 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
                                   int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
   // stage 1
   __m256i x[32];
@@ -728,25 +718,25 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
   // stage 4
   btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
   btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
-  btf_16_adds_subs_avx2(x[8], x[9]);
-  btf_16_subs_adds_avx2(x[11], x[10]);
-  btf_16_adds_subs_avx2(x[12], x[13]);
-  btf_16_subs_adds_avx2(x[15], x[14]);
-  idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_adds_subs_avx2(&x[8], &x[9]);
+  btf_16_adds_subs_avx2(&x[11], &x[10]);
+  btf_16_adds_subs_avx2(&x[12], &x[13]);
+  btf_16_adds_subs_avx2(&x[15], &x[14]);
+  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
 
   // stage 5
   btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
   btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
-  btf_16_adds_subs_avx2(x[4], x[5]);
-  btf_16_subs_adds_avx2(x[7], x[6]);
-  idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_adds_subs_avx2(&x[4], &x[5]);
+  btf_16_adds_subs_avx2(&x[7], &x[6]);
+  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
 
-  btf_16_adds_subs_avx2(x[0], x[3]);
-  btf_16_adds_subs_avx2(x[1], x[2]);
-  idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_adds_subs_avx2(&x[0], &x[3]);
+  btf_16_adds_subs_avx2(&x[1], &x[2]);
+  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
 
-  idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
-  idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
+  idct32_stage7_avx2(x, cospi, _r, cos_bit);
+  idct32_stage8_avx2(x, cospi, _r, cos_bit);
   idct32_stage9_avx2(output, x);
 }
 
@@ -754,7 +744,7 @@ static void idct32_new_avx2(const __m256i *input, __m256i *output,
                             int8_t cos_bit) {
   (void)(cos_bit);
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
   __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
   __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
@@ -825,51 +815,50 @@ static void idct32_new_avx2(const __m256i *input, __m256i *output,
   x1[31] = input[31];
 
   // stage 2
-  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, x1[16], x1[31], x1[16], x1[31]);
-  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, x1[17], x1[30], x1[17], x1[30]);
-  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, x1[18], x1[29], x1[18], x1[29]);
-  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, x1[19], x1[28], x1[19], x1[28]);
-  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, x1[20], x1[27], x1[20], x1[27]);
-  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, x1[21], x1[26], x1[21], x1[26]);
-  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, x1[22], x1[25], x1[22], x1[25]);
-  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, x1[23], x1[24], x1[23], x1[24]);
+  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit);
 
   // stage 3
-  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]);
-  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]);
-  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]);
-  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]);
+  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
   idct32_high16_stage3_avx2(x1);
 
   // stage 4
-  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]);
-  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]);
-  btf_16_adds_subs_avx2(x1[8], x1[9]);
-  btf_16_subs_adds_avx2(x1[11], x1[10]);
-  btf_16_adds_subs_avx2(x1[12], x1[13]);
-  btf_16_subs_adds_avx2(x1[15], x1[14]);
-  idct32_high16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+  idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit);
 
   // stage 5
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]);
-  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]);
-  btf_16_adds_subs_avx2(x1[4], x1[5]);
-  btf_16_subs_adds_avx2(x1[7], x1[6]);
-  idct32_high24_stage5_avx2(x1, cospi, __rounding, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+  idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit);
 
   // stage 6
-  btf_16_adds_subs_avx2(x1[0], x1[3]);
-  btf_16_adds_subs_avx2(x1[1], x1[2]);
-  idct32_high28_stage6_avx2(x1, cospi, __rounding, cos_bit);
+  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+  idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit);
 
-  idct32_stage7_avx2(x1, cospi, __rounding, cos_bit);
-  idct32_stage8_avx2(x1, cospi, __rounding, cos_bit);
+  idct32_stage7_avx2(x1, cospi, _r, cos_bit);
+  idct32_stage8_avx2(x1, cospi, _r, cos_bit);
   idct32_stage9_avx2(output, x1);
 }
 
 static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i __rounding,
-                                             int8_t cos_bit) {
+                                             const __m256i _r, int8_t cos_bit) {
   (void)cos_bit;
   const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
   const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
@@ -883,19 +872,18 @@ static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
   const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
   const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
   const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
-  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
-  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
-  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
-  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
-  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
-  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
-  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
-  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
 }
 
 static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i __rounding,
-                                             int8_t cos_bit) {
+                                             const __m256i _r, int8_t cos_bit) {
   (void)cos_bit;
   const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
   const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
@@ -903,31 +891,30 @@ static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
   const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
   const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
   const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
-  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
-  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
-  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
-  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
-  btf_16_adds_subs_avx2(x[32], x[35]);
-  btf_16_adds_subs_avx2(x[33], x[34]);
-  btf_16_subs_adds_avx2(x[39], x[36]);
-  btf_16_subs_adds_avx2(x[38], x[37]);
-  btf_16_adds_subs_avx2(x[40], x[43]);
-  btf_16_adds_subs_avx2(x[41], x[42]);
-  btf_16_subs_adds_avx2(x[47], x[44]);
-  btf_16_subs_adds_avx2(x[46], x[45]);
-  btf_16_adds_subs_avx2(x[48], x[51]);
-  btf_16_adds_subs_avx2(x[49], x[50]);
-  btf_16_subs_adds_avx2(x[55], x[52]);
-  btf_16_subs_adds_avx2(x[54], x[53]);
-  btf_16_adds_subs_avx2(x[56], x[59]);
-  btf_16_adds_subs_avx2(x[57], x[58]);
-  btf_16_subs_adds_avx2(x[63], x[60]);
-  btf_16_subs_adds_avx2(x[62], x[61]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x[32], &x[35]);
+  btf_16_adds_subs_avx2(&x[33], &x[34]);
+  btf_16_adds_subs_avx2(&x[39], &x[36]);
+  btf_16_adds_subs_avx2(&x[38], &x[37]);
+  btf_16_adds_subs_avx2(&x[40], &x[43]);
+  btf_16_adds_subs_avx2(&x[41], &x[42]);
+  btf_16_adds_subs_avx2(&x[47], &x[44]);
+  btf_16_adds_subs_avx2(&x[46], &x[45]);
+  btf_16_adds_subs_avx2(&x[48], &x[51]);
+  btf_16_adds_subs_avx2(&x[49], &x[50]);
+  btf_16_adds_subs_avx2(&x[55], &x[52]);
+  btf_16_adds_subs_avx2(&x[54], &x[53]);
+  btf_16_adds_subs_avx2(&x[56], &x[59]);
+  btf_16_adds_subs_avx2(&x[57], &x[58]);
+  btf_16_adds_subs_avx2(&x[63], &x[60]);
+  btf_16_adds_subs_avx2(&x[62], &x[61]);
 }
 
 static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i __rounding,
-                                             int8_t cos_bit) {
+                                             const __m256i _r, int8_t cos_bit) {
   (void)cos_bit;
   const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
   const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
@@ -935,185 +922,180 @@ static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
   const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
   const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
   const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
-  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
-  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
-  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
-  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
-  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
-  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
-  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
-  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
 }
 
 static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i __rounding,
-                                             int8_t cos_bit) {
-  btf_16_adds_subs_avx2(x[16], x[19]);
-  btf_16_adds_subs_avx2(x[17], x[18]);
-  btf_16_subs_adds_avx2(x[23], x[20]);
-  btf_16_subs_adds_avx2(x[22], x[21]);
-  btf_16_adds_subs_avx2(x[24], x[27]);
-  btf_16_adds_subs_avx2(x[25], x[26]);
-  btf_16_subs_adds_avx2(x[31], x[28]);
-  btf_16_subs_adds_avx2(x[30], x[29]);
-  idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);
+                                             const __m256i _r, int8_t cos_bit) {
+  btf_16_adds_subs_avx2(&x[16], &x[19]);
+  btf_16_adds_subs_avx2(&x[17], &x[18]);
+  btf_16_adds_subs_avx2(&x[23], &x[20]);
+  btf_16_adds_subs_avx2(&x[22], &x[21]);
+  btf_16_adds_subs_avx2(&x[24], &x[27]);
+  btf_16_adds_subs_avx2(&x[25], &x[26]);
+  btf_16_adds_subs_avx2(&x[31], &x[28]);
+  btf_16_adds_subs_avx2(&x[30], &x[29]);
+  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
 }
 
 static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i __rounding,
-                                             int8_t cos_bit) {
+                                             const __m256i _r, int8_t cos_bit) {
   (void)cos_bit;
   const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
   const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
   const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
-  btf_16_adds_subs_avx2(x[32], x[39]);
-  btf_16_adds_subs_avx2(x[33], x[38]);
-  btf_16_adds_subs_avx2(x[34], x[37]);
-  btf_16_adds_subs_avx2(x[35], x[36]);
-  btf_16_subs_adds_avx2(x[47], x[40]);
-  btf_16_subs_adds_avx2(x[46], x[41]);
-  btf_16_subs_adds_avx2(x[45], x[42]);
-  btf_16_subs_adds_avx2(x[44], x[43]);
-  btf_16_adds_subs_avx2(x[48], x[55]);
-  btf_16_adds_subs_avx2(x[49], x[54]);
-  btf_16_adds_subs_avx2(x[50], x[53]);
-  btf_16_adds_subs_avx2(x[51], x[52]);
-  btf_16_subs_adds_avx2(x[63], x[56]);
-  btf_16_subs_adds_avx2(x[62], x[57]);
-  btf_16_subs_adds_avx2(x[61], x[58]);
-  btf_16_subs_adds_avx2(x[60], x[59]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x[32], &x[39]);
+  btf_16_adds_subs_avx2(&x[33], &x[38]);
+  btf_16_adds_subs_avx2(&x[34], &x[37]);
+  btf_16_adds_subs_avx2(&x[35], &x[36]);
+  btf_16_adds_subs_avx2(&x[47], &x[40]);
+  btf_16_adds_subs_avx2(&x[46], &x[41]);
+  btf_16_adds_subs_avx2(&x[45], &x[42]);
+  btf_16_adds_subs_avx2(&x[44], &x[43]);
+  btf_16_adds_subs_avx2(&x[48], &x[55]);
+  btf_16_adds_subs_avx2(&x[49], &x[54]);
+  btf_16_adds_subs_avx2(&x[50], &x[53]);
+  btf_16_adds_subs_avx2(&x[51], &x[52]);
+  btf_16_adds_subs_avx2(&x[63], &x[56]);
+  btf_16_adds_subs_avx2(&x[62], &x[57]);
+  btf_16_adds_subs_avx2(&x[61], &x[58]);
+  btf_16_adds_subs_avx2(&x[60], &x[59]);
 }
 
 static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i __rounding,
-                                             int8_t cos_bit) {
+                                             const __m256i _r, int8_t cos_bit) {
   (void)cos_bit;
   const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
   const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
   const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-  btf_16_adds_subs_avx2(x[16], x[23]);
-  btf_16_adds_subs_avx2(x[17], x[22]);
-  btf_16_adds_subs_avx2(x[18], x[21]);
-  btf_16_adds_subs_avx2(x[19], x[20]);
-  btf_16_subs_adds_avx2(x[31], x[24]);
-  btf_16_subs_adds_avx2(x[30], x[25]);
-  btf_16_subs_adds_avx2(x[29], x[26]);
-  btf_16_subs_adds_avx2(x[28], x[27]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
+  btf_16_adds_subs_avx2(&x[16], &x[23]);
+  btf_16_adds_subs_avx2(&x[17], &x[22]);
+  btf_16_adds_subs_avx2(&x[18], &x[21]);
+  btf_16_adds_subs_avx2(&x[19], &x[20]);
+  btf_16_adds_subs_avx2(&x[31], &x[24]);
+  btf_16_adds_subs_avx2(&x[30], &x[25]);
+  btf_16_adds_subs_avx2(&x[29], &x[26]);
+  btf_16_adds_subs_avx2(&x[28], &x[27]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
 }
 
 static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
-                                      const __m256i __rounding,
-                                      int8_t cos_bit) {
+                                      const __m256i _r, int8_t cos_bit) {
   (void)cos_bit;
   const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(x[0], x[15]);
-  btf_16_adds_subs_avx2(x[1], x[14]);
-  btf_16_adds_subs_avx2(x[2], x[13]);
-  btf_16_adds_subs_avx2(x[3], x[12]);
-  btf_16_adds_subs_avx2(x[4], x[11]);
-  btf_16_adds_subs_avx2(x[5], x[10]);
-  btf_16_adds_subs_avx2(x[6], x[9]);
-  btf_16_adds_subs_avx2(x[7], x[8]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
-  btf_16_adds_subs_avx2(x[32], x[47]);
-  btf_16_adds_subs_avx2(x[33], x[46]);
-  btf_16_adds_subs_avx2(x[34], x[45]);
-  btf_16_adds_subs_avx2(x[35], x[44]);
-  btf_16_adds_subs_avx2(x[36], x[43]);
-  btf_16_adds_subs_avx2(x[37], x[42]);
-  btf_16_adds_subs_avx2(x[38], x[41]);
-  btf_16_adds_subs_avx2(x[39], x[40]);
-  btf_16_subs_adds_avx2(x[63], x[48]);
-  btf_16_subs_adds_avx2(x[62], x[49]);
-  btf_16_subs_adds_avx2(x[61], x[50]);
-  btf_16_subs_adds_avx2(x[60], x[51]);
-  btf_16_subs_adds_avx2(x[59], x[52]);
-  btf_16_subs_adds_avx2(x[58], x[53]);
-  btf_16_subs_adds_avx2(x[57], x[54]);
-  btf_16_subs_adds_avx2(x[56], x[55]);
+  btf_16_adds_subs_avx2(&x[0], &x[15]);
+  btf_16_adds_subs_avx2(&x[1], &x[14]);
+  btf_16_adds_subs_avx2(&x[2], &x[13]);
+  btf_16_adds_subs_avx2(&x[3], &x[12]);
+  btf_16_adds_subs_avx2(&x[4], &x[11]);
+  btf_16_adds_subs_avx2(&x[5], &x[10]);
+  btf_16_adds_subs_avx2(&x[6], &x[9]);
+  btf_16_adds_subs_avx2(&x[7], &x[8]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x[32], &x[47]);
+  btf_16_adds_subs_avx2(&x[33], &x[46]);
+  btf_16_adds_subs_avx2(&x[34], &x[45]);
+  btf_16_adds_subs_avx2(&x[35], &x[44]);
+  btf_16_adds_subs_avx2(&x[36], &x[43]);
+  btf_16_adds_subs_avx2(&x[37], &x[42]);
+  btf_16_adds_subs_avx2(&x[38], &x[41]);
+  btf_16_adds_subs_avx2(&x[39], &x[40]);
+  btf_16_adds_subs_avx2(&x[63], &x[48]);
+  btf_16_adds_subs_avx2(&x[62], &x[49]);
+  btf_16_adds_subs_avx2(&x[61], &x[50]);
+  btf_16_adds_subs_avx2(&x[60], &x[51]);
+  btf_16_adds_subs_avx2(&x[59], &x[52]);
+  btf_16_adds_subs_avx2(&x[58], &x[53]);
+  btf_16_adds_subs_avx2(&x[57], &x[54]);
+  btf_16_adds_subs_avx2(&x[56], &x[55]);
 }
 
 static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
-                                       const __m256i __rounding,
-                                       int8_t cos_bit) {
+                                       const __m256i _r, int8_t cos_bit) {
   (void)cos_bit;
   const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(x[0], x[31]);
-  btf_16_adds_subs_avx2(x[1], x[30]);
-  btf_16_adds_subs_avx2(x[2], x[29]);
-  btf_16_adds_subs_avx2(x[3], x[28]);
-  btf_16_adds_subs_avx2(x[4], x[27]);
-  btf_16_adds_subs_avx2(x[5], x[26]);
-  btf_16_adds_subs_avx2(x[6], x[25]);
-  btf_16_adds_subs_avx2(x[7], x[24]);
-  btf_16_adds_subs_avx2(x[8], x[23]);
-  btf_16_adds_subs_avx2(x[9], x[22]);
-  btf_16_adds_subs_avx2(x[10], x[21]);
-  btf_16_adds_subs_avx2(x[11], x[20]);
-  btf_16_adds_subs_avx2(x[12], x[19]);
-  btf_16_adds_subs_avx2(x[13], x[18]);
-  btf_16_adds_subs_avx2(x[14], x[17]);
-  btf_16_adds_subs_avx2(x[15], x[16]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
+  btf_16_adds_subs_avx2(&x[0], &x[31]);
+  btf_16_adds_subs_avx2(&x[1], &x[30]);
+  btf_16_adds_subs_avx2(&x[2], &x[29]);
+  btf_16_adds_subs_avx2(&x[3], &x[28]);
+  btf_16_adds_subs_avx2(&x[4], &x[27]);
+  btf_16_adds_subs_avx2(&x[5], &x[26]);
+  btf_16_adds_subs_avx2(&x[6], &x[25]);
+  btf_16_adds_subs_avx2(&x[7], &x[24]);
+  btf_16_adds_subs_avx2(&x[8], &x[23]);
+  btf_16_adds_subs_avx2(&x[9], &x[22]);
+  btf_16_adds_subs_avx2(&x[10], &x[21]);
+  btf_16_adds_subs_avx2(&x[11], &x[20]);
+  btf_16_adds_subs_avx2(&x[12], &x[19]);
+  btf_16_adds_subs_avx2(&x[13], &x[18]);
+  btf_16_adds_subs_avx2(&x[14], &x[17]);
+  btf_16_adds_subs_avx2(&x[15], &x[16]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
 }
 
 static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
-  btf_16_adds_subs_out_avx2(output[0], output[63], x[0], x[63]);
-  btf_16_adds_subs_out_avx2(output[1], output[62], x[1], x[62]);
-  btf_16_adds_subs_out_avx2(output[2], output[61], x[2], x[61]);
-  btf_16_adds_subs_out_avx2(output[3], output[60], x[3], x[60]);
-  btf_16_adds_subs_out_avx2(output[4], output[59], x[4], x[59]);
-  btf_16_adds_subs_out_avx2(output[5], output[58], x[5], x[58]);
-  btf_16_adds_subs_out_avx2(output[6], output[57], x[6], x[57]);
-  btf_16_adds_subs_out_avx2(output[7], output[56], x[7], x[56]);
-  btf_16_adds_subs_out_avx2(output[8], output[55], x[8], x[55]);
-  btf_16_adds_subs_out_avx2(output[9], output[54], x[9], x[54]);
-  btf_16_adds_subs_out_avx2(output[10], output[53], x[10], x[53]);
-  btf_16_adds_subs_out_avx2(output[11], output[52], x[11], x[52]);
-  btf_16_adds_subs_out_avx2(output[12], output[51], x[12], x[51]);
-  btf_16_adds_subs_out_avx2(output[13], output[50], x[13], x[50]);
-  btf_16_adds_subs_out_avx2(output[14], output[49], x[14], x[49]);
-  btf_16_adds_subs_out_avx2(output[15], output[48], x[15], x[48]);
-  btf_16_adds_subs_out_avx2(output[16], output[47], x[16], x[47]);
-  btf_16_adds_subs_out_avx2(output[17], output[46], x[17], x[46]);
-  btf_16_adds_subs_out_avx2(output[18], output[45], x[18], x[45]);
-  btf_16_adds_subs_out_avx2(output[19], output[44], x[19], x[44]);
-  btf_16_adds_subs_out_avx2(output[20], output[43], x[20], x[43]);
-  btf_16_adds_subs_out_avx2(output[21], output[42], x[21], x[42]);
-  btf_16_adds_subs_out_avx2(output[22], output[41], x[22], x[41]);
-  btf_16_adds_subs_out_avx2(output[23], output[40], x[23], x[40]);
-  btf_16_adds_subs_out_avx2(output[24], output[39], x[24], x[39]);
-  btf_16_adds_subs_out_avx2(output[25], output[38], x[25], x[38]);
-  btf_16_adds_subs_out_avx2(output[26], output[37], x[26], x[37]);
-  btf_16_adds_subs_out_avx2(output[27], output[36], x[27], x[36]);
-  btf_16_adds_subs_out_avx2(output[28], output[35], x[28], x[35]);
-  btf_16_adds_subs_out_avx2(output[29], output[34], x[29], x[34]);
-  btf_16_adds_subs_out_avx2(output[30], output[33], x[30], x[33]);
-  btf_16_adds_subs_out_avx2(output[31], output[32], x[31], x[32]);
+  btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
+  btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
+  btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
+  btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
+  btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
+  btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
+  btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
+  btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
+  btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
+  btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
+  btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
+  btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
+  btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
+  btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
+  btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
+  btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
+  btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
+  btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
+  btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
+  btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
+  btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
+  btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
+  btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
+  btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
+  btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
+  btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
+  btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
+  btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
+  btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
+  btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
+  btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
+  btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
 }
 
 static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
@@ -1207,7 +1189,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
   const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
   const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
   const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
@@ -1260,16 +1242,16 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
   x[22] = x[23];
   x[25] = x[24];
   x[30] = x[31];
-  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
-  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
-  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
-  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
 
   // stage 5
   x[9] = x[8];
   x[14] = x[15];
-  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
-  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
   x[35] = x[32];
   x[34] = x[33];
   x[36] = x[39];
@@ -1289,7 +1271,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
 
   // stage 6
   btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
   x[19] = x[16];
   x[18] = x[17];
   x[20] = x[23];
@@ -1298,7 +1280,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
   x[26] = x[25];
   x[28] = x[31];
   x[29] = x[30];
-  idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
 
   // stage 7
   x[3] = x[0];
@@ -1307,7 +1289,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
   x[10] = x[9];
   x[12] = x[15];
   x[13] = x[14];
-  idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
 
   // stage 8
   x[7] = x[0];
@@ -1315,12 +1297,12 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
   x[5] = x[2];
   x[4] = x[3];
   x[9] = x[9];
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
-  idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
 
-  idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
-  idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage9_avx2(x, cospi, _r, cos_bit);
+  idct64_stage10_avx2(x, cospi, _r, cos_bit);
   idct64_stage11_avx2(output, x);
 }
 
@@ -1328,7 +1310,7 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
                                   int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
@@ -1398,7 +1380,7 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
   x[26] = x[27];
   x[29] = x[28];
   x[30] = x[31];
-  idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
 
   // stage 5
   btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
@@ -1406,37 +1388,37 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
   x[10] = x[11];
   x[13] = x[12];
   x[14] = x[15];
-  idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
 
   // stage 6
   btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
   x[5] = x[4];
   x[6] = x[7];
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
-  idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
 
   // stage 7
   x[3] = x[0];
   x[2] = x[1];
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
-  btf_16_adds_subs_avx2(x[8], x[11]);
-  btf_16_adds_subs_avx2(x[9], x[10]);
-  btf_16_subs_adds_avx2(x[15], x[12]);
-  btf_16_subs_adds_avx2(x[14], x[13]);
-  idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x[8], &x[11]);
+  btf_16_adds_subs_avx2(&x[9], &x[10]);
+  btf_16_adds_subs_avx2(&x[15], &x[12]);
+  btf_16_adds_subs_avx2(&x[14], &x[13]);
+  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
 
   // stage 8
-  btf_16_adds_subs_avx2(x[0], x[7]);
-  btf_16_adds_subs_avx2(x[1], x[6]);
-  btf_16_adds_subs_avx2(x[2], x[5]);
-  btf_16_adds_subs_avx2(x[3], x[4]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
-  idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
-
-  idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
-  idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_adds_subs_avx2(&x[0], &x[7]);
+  btf_16_adds_subs_avx2(&x[1], &x[6]);
+  btf_16_adds_subs_avx2(&x[2], &x[5]);
+  btf_16_adds_subs_avx2(&x[3], &x[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
+
+  idct64_stage9_avx2(x, cospi, _r, cos_bit);
+  idct64_stage10_avx2(x, cospi, _r, cos_bit);
   idct64_stage11_avx2(output, x);
 }
 
@@ -1444,7 +1426,7 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
                                   int8_t cos_bit) {
   (void)cos_bit;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
@@ -1514,78 +1496,78 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
   btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
   btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
   btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
-  btf_16_adds_subs_avx2(x[32], x[33]);
-  btf_16_subs_adds_avx2(x[35], x[34]);
-  btf_16_adds_subs_avx2(x[36], x[37]);
-  btf_16_subs_adds_avx2(x[39], x[38]);
-  btf_16_adds_subs_avx2(x[40], x[41]);
-  btf_16_subs_adds_avx2(x[43], x[42]);
-  btf_16_adds_subs_avx2(x[44], x[45]);
-  btf_16_subs_adds_avx2(x[47], x[46]);
-  btf_16_adds_subs_avx2(x[48], x[49]);
-  btf_16_subs_adds_avx2(x[51], x[50]);
-  btf_16_adds_subs_avx2(x[52], x[53]);
-  btf_16_subs_adds_avx2(x[55], x[54]);
-  btf_16_adds_subs_avx2(x[56], x[57]);
-  btf_16_subs_adds_avx2(x[59], x[58]);
-  btf_16_adds_subs_avx2(x[60], x[61]);
-  btf_16_subs_adds_avx2(x[63], x[62]);
+  btf_16_adds_subs_avx2(&x[32], &x[33]);
+  btf_16_adds_subs_avx2(&x[35], &x[34]);
+  btf_16_adds_subs_avx2(&x[36], &x[37]);
+  btf_16_adds_subs_avx2(&x[39], &x[38]);
+  btf_16_adds_subs_avx2(&x[40], &x[41]);
+  btf_16_adds_subs_avx2(&x[43], &x[42]);
+  btf_16_adds_subs_avx2(&x[44], &x[45]);
+  btf_16_adds_subs_avx2(&x[47], &x[46]);
+  btf_16_adds_subs_avx2(&x[48], &x[49]);
+  btf_16_adds_subs_avx2(&x[51], &x[50]);
+  btf_16_adds_subs_avx2(&x[52], &x[53]);
+  btf_16_adds_subs_avx2(&x[55], &x[54]);
+  btf_16_adds_subs_avx2(&x[56], &x[57]);
+  btf_16_adds_subs_avx2(&x[59], &x[58]);
+  btf_16_adds_subs_avx2(&x[60], &x[61]);
+  btf_16_adds_subs_avx2(&x[63], &x[62]);
 
   // stage 4
   btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
   btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
   btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
   btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
-  btf_16_adds_subs_avx2(x[16], x[17]);
-  btf_16_subs_adds_avx2(x[19], x[18]);
-  btf_16_adds_subs_avx2(x[20], x[21]);
-  btf_16_subs_adds_avx2(x[23], x[22]);
-  btf_16_adds_subs_avx2(x[24], x[25]);
-  btf_16_subs_adds_avx2(x[27], x[26]);
-  btf_16_adds_subs_avx2(x[28], x[29]);
-  btf_16_subs_adds_avx2(x[31], x[30]);
-  idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_adds_subs_avx2(&x[16], &x[17]);
+  btf_16_adds_subs_avx2(&x[19], &x[18]);
+  btf_16_adds_subs_avx2(&x[20], &x[21]);
+  btf_16_adds_subs_avx2(&x[23], &x[22]);
+  btf_16_adds_subs_avx2(&x[24], &x[25]);
+  btf_16_adds_subs_avx2(&x[27], &x[26]);
+  btf_16_adds_subs_avx2(&x[28], &x[29]);
+  btf_16_adds_subs_avx2(&x[31], &x[30]);
+  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
 
   // stage 5
   btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
   btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
-  btf_16_adds_subs_avx2(x[8], x[9]);
-  btf_16_subs_adds_avx2(x[11], x[10]);
-  btf_16_adds_subs_avx2(x[12], x[13]);
-  btf_16_subs_adds_avx2(x[15], x[14]);
-  idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_adds_subs_avx2(&x[8], &x[9]);
+  btf_16_adds_subs_avx2(&x[11], &x[10]);
+  btf_16_adds_subs_avx2(&x[12], &x[13]);
+  btf_16_adds_subs_avx2(&x[15], &x[14]);
+  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
 
   // stage 6
   btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
   btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
-  btf_16_adds_subs_avx2(x[4], x[5]);
-  btf_16_subs_adds_avx2(x[7], x[6]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
-  idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_adds_subs_avx2(&x[4], &x[5]);
+  btf_16_adds_subs_avx2(&x[7], &x[6]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
 
   // stage 7
-  btf_16_adds_subs_avx2(x[0], x[3]);
-  btf_16_adds_subs_avx2(x[1], x[2]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
-  btf_16_adds_subs_avx2(x[8], x[11]);
-  btf_16_adds_subs_avx2(x[9], x[10]);
-  btf_16_subs_adds_avx2(x[15], x[12]);
-  btf_16_subs_adds_avx2(x[14], x[13]);
-  idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_adds_subs_avx2(&x[0], &x[3]);
+  btf_16_adds_subs_avx2(&x[1], &x[2]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x[8], &x[11]);
+  btf_16_adds_subs_avx2(&x[9], &x[10]);
+  btf_16_adds_subs_avx2(&x[15], &x[12]);
+  btf_16_adds_subs_avx2(&x[14], &x[13]);
+  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
 
   // stage 8
-  btf_16_adds_subs_avx2(x[0], x[7]);
-  btf_16_adds_subs_avx2(x[1], x[6]);
-  btf_16_adds_subs_avx2(x[2], x[5]);
-  btf_16_adds_subs_avx2(x[3], x[4]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
-  idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+  btf_16_adds_subs_avx2(&x[0], &x[7]);
+  btf_16_adds_subs_avx2(&x[1], &x[6]);
+  btf_16_adds_subs_avx2(&x[2], &x[5]);
+  btf_16_adds_subs_avx2(&x[3], &x[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
 
   // stage 9~11
-  idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
-  idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage9_avx2(x, cospi, _r, cos_bit);
+  idct64_stage10_avx2(x, cospi, _r, cos_bit);
   idct64_stage11_avx2(output, x);
 }
 
@@ -1667,7 +1649,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
     if (lr_flip) {
       for (int j = 0; j < buf_size_w_div16; ++j) {
         __m256i temp[16];
-        flip_buf_av2(buf0 + 16 * j, temp, 16);
+        flip_buf_avx2(buf0 + 16 * j, temp, 16);
         int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
         transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
       }
@@ -1693,18 +1675,18 @@ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
                                            int txw_idx, int rect_type) {
   const int32_t *input_row = input;
   const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
-  const __m256i rounding = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
-                                             (1 << (NewSqrt2Bits - shift - 1)));
+  const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+                                       (1 << (NewSqrt2Bits - shift - 1)));
   const __m256i one = _mm256_set1_epi16(1);
-  const __m256i scale_rounding = _mm256_unpacklo_epi16(scale, rounding);
+  const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
   if (rect_type != 1 && rect_type != -1) {
     for (int i = 0; i < height; ++i) {
       const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
       input_row += stride;
       __m256i lo = _mm256_unpacklo_epi16(src, one);
       __m256i hi = _mm256_unpackhi_epi16(src, one);
-      lo = _mm256_madd_epi16(lo, scale_rounding);
-      hi = _mm256_madd_epi16(hi, scale_rounding);
+      lo = _mm256_madd_epi16(lo, scale__r);
+      hi = _mm256_madd_epi16(hi, scale__r);
       lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
       hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
       out[i] = _mm256_packs_epi32(lo, hi);
@@ -1718,8 +1700,8 @@ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
       input_row += stride;
       __m256i lo = _mm256_unpacklo_epi16(src, one);
       __m256i hi = _mm256_unpackhi_epi16(src, one);
-      lo = _mm256_madd_epi16(lo, scale_rounding);
-      hi = _mm256_madd_epi16(hi, scale_rounding);
+      lo = _mm256_madd_epi16(lo, scale__r);
+      hi = _mm256_madd_epi16(hi, scale__r);
       lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
       hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
       out[i] = _mm256_packs_epi32(lo, hi);
@@ -1731,10 +1713,10 @@ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
                                            __m256i *buf, int shift, int height,
                                            int txh_idx) {
   const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
-  const __m256i scale_rounding = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
-  const __m256i shift_rounding = _mm256_set1_epi32(1 << (-shift - 1));
+  const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
+  const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
   const __m256i one = _mm256_set1_epi16(1);
-  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale_rounding);
+  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
   for (int h = 0; h < height; ++h) {
     __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
     __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
@@ -1742,8 +1724,8 @@ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
     hi = _mm256_madd_epi16(hi, scale_coeff);
     lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
     hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
-    lo = _mm256_add_epi32(lo, shift_rounding);
-    hi = _mm256_add_epi32(hi, shift_rounding);
+    lo = _mm256_add_epi32(lo, shift__r);
+    hi = _mm256_add_epi32(hi, shift__r);
     lo = _mm256_srai_epi32(lo, -shift);
     hi = _mm256_srai_epi32(hi, -shift);
     const __m256i x = _mm256_packs_epi32(lo, hi);
@@ -1856,7 +1838,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
     if (lr_flip) {
       for (int j = 0; j < buf_size_w_div16; ++j) {
         __m256i temp[16];
-        flip_buf_av2(buf0 + 16 * j, temp, 16);
+        flip_buf_avx2(buf0 + 16 * j, temp, 16);
         transpose_16bit_16x16_avx2(temp,
                                    _buf1 + 16 * (buf_size_w_div16 - 1 - j));
       }
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
index c17f655c5..7b5b29cf8 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -19,37 +19,12 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/transpose_sse2.h"
 #include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define pair_set_w16_epi16(a, b) \
-  _mm256_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
-
-#define btf_16_w16_avx2(w0, w1, in0, in1, out0, out1) \
-  {                                                   \
-    __m256i t0 = _mm256_unpacklo_epi16(in0, in1);     \
-    __m256i t1 = _mm256_unpackhi_epi16(in0, in1);     \
-    __m256i u0 = _mm256_madd_epi16(t0, w0);           \
-    __m256i u1 = _mm256_madd_epi16(t1, w0);           \
-    __m256i v0 = _mm256_madd_epi16(t0, w1);           \
-    __m256i v1 = _mm256_madd_epi16(t1, w1);           \
-                                                      \
-    __m256i a0 = _mm256_add_epi32(u0, __rounding);    \
-    __m256i a1 = _mm256_add_epi32(u1, __rounding);    \
-    __m256i b0 = _mm256_add_epi32(v0, __rounding);    \
-    __m256i b1 = _mm256_add_epi32(v1, __rounding);    \
-                                                      \
-    __m256i c0 = _mm256_srai_epi32(a0, cos_bit);      \
-    __m256i c1 = _mm256_srai_epi32(a1, cos_bit);      \
-    __m256i d0 = _mm256_srai_epi32(b0, cos_bit);      \
-    __m256i d1 = _mm256_srai_epi32(b1, cos_bit);      \
-                                                      \
-    out0 = _mm256_packs_epi32(c0, c1);                \
-    out1 = _mm256_packs_epi32(d0, d1);                \
-  }
-
 // half input is zero
 #define btf_16_w16_0_avx2(w0, w1, in, out0, out1)  \
   {                                                \
@@ -60,111 +35,6 @@ extern "C" {
     out1 = _mm256_mulhrs_epi16(_in, _w1);          \
   }
 
-#define btf_16_adds_subs_avx2(in0, in1)  \
-  {                                      \
-    const __m256i _in0 = in0;            \
-    const __m256i _in1 = in1;            \
-    in0 = _mm256_adds_epi16(_in0, _in1); \
-    in1 = _mm256_subs_epi16(_in0, _in1); \
-  }
-
-#define btf_16_subs_adds_avx2(in0, in1)  \
-  {                                      \
-    const __m256i _in0 = in0;            \
-    const __m256i _in1 = in1;            \
-    in1 = _mm256_subs_epi16(_in0, _in1); \
-    in0 = _mm256_adds_epi16(_in0, _in1); \
-  }
-
-#define btf_16_adds_subs_out_avx2(out0, out1, in0, in1) \
-  {                                                     \
-    const __m256i _in0 = in0;                           \
-    const __m256i _in1 = in1;                           \
-    out0 = _mm256_adds_epi16(_in0, _in1);               \
-    out1 = _mm256_subs_epi16(_in0, _in1);               \
-  }
-
-static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
-  const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
-  const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
-  return _mm256_permute4x64_epi64(b, 0xD8);
-}
-
-static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
-                                                       int stride, __m256i *out,
-                                                       int out_size) {
-  for (int i = 0; i < out_size; ++i) {
-    out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
-  }
-}
-
-static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
-                                              __m256i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  08 09 0a 0b  04 05 06 07  0c 0d 0e 0f
-  // in[1]: 10 11 12 13  18 19 1a 1b  14 15 16 17  1c 1d 1e 1f
-  // in[2]: 20 21 22 23  28 29 2a 2b  24 25 26 27  2c 2d 2e 2f
-  // in[3]: 30 31 32 33  38 39 3a 3b  34 35 36 37  3c 3d 3e 3f
-  // in[4]: 40 41 42 43  48 49 4a 4b  44 45 46 47  4c 4d 4e 4f
-  // in[5]: 50 51 52 53  58 59 5a 5b  54 55 56 57  5c 5d 5e 5f
-  // in[6]: 60 61 62 63  68 69 6a 6b  64 65 66 67  6c 6d 6e 6f
-  // in[7]: 70 71 72 73  78 79 7a 7b  74 75 76 77  7c 7d 7e 7f
-  // in[8]: 80 81 82 83  88 89 8a 8b  84 85 86 87  8c 8d 8e 8f
-  // to:
-  // a0:    00 10 01 11  02 12 03 13  04 14 05 15  06 16 07 17
-  // a1:    20 30 21 31  22 32 23 33  24 34 25 35  26 36 27 37
-  // a2:    40 50 41 51  42 52 43 53  44 54 45 55  46 56 47 57
-  // a3:    60 70 61 71  62 72 63 73  64 74 65 75  66 76 67 77
-  // ...
-  __m256i a[16];
-  for (int i = 0; i < 16; i += 2) {
-    a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
-    a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
-  }
-  __m256i b[16];
-  for (int i = 0; i < 16; i += 2) {
-    b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
-    b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
-  }
-  __m256i c[16];
-  for (int i = 0; i < 16; i += 2) {
-    c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
-    c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
-  }
-  out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
-  out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
-  out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
-  out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
-
-  out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
-  out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
-  out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
-  out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
-
-  out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
-  out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
-  out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
-  out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
-
-  out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
-  out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
-  out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
-  out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
-}
-
-static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
-  if (bit < 0) {
-    __m256i scale = _mm256_set1_epi16(1 << (bit + 15));
-    for (int i = 0; i < size; ++i) {
-      in[i] = _mm256_mulhrs_epi16(in[i], scale);
-    }
-  } else if (bit > 0) {
-    for (int i = 0; i < size; ++i) {
-      in[i] = _mm256_slli_epi16(in[i], bit);
-    }
-  }
-}
-
 static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
                                     int size) {
   const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
@@ -173,12 +43,6 @@ static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
   }
 }
 
-static INLINE void flip_buf_av2(__m256i *in, __m256i *out, int size) {
-  for (int i = 0; i < size; ++i) {
-    out[size - i - 1] = in[i];
-  }
-}
-
 static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) {
   __m128i pred = _mm_loadu_si128((__m128i const *)(output));
   __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res);
@@ -197,9 +61,6 @@ static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output,
   }
 }
 
-typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
-                                  int8_t cos_bit);
-
 void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type, TX_SIZE tx_size,
                                    int eob);
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
index cccc62f03..90b9879cc 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include "config/aom_dsp_rtcd.h"
 
 #include "av1/common/av1_txfm.h"
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
index faf7251fa..367e02096 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse4.h
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #ifndef AV1_TXFM_SSE4_H_
 #define AV1_TXFM_SSE4_H_
 
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
index fd5e90a2e..1099144fe 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -24,8 +24,8 @@
 
 void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params) {
   const int bd = 8;
@@ -46,10 +46,10 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
 
   assert(conv_params->round_0 > 0);
 
-  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
-  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
-  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
 
   prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
   prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
@@ -180,8 +180,8 @@ static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
 
 void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
-                                  InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
                                   const int subpel_x_q4, const int subpel_y_q4,
                                   ConvolveParams *conv_params) {
   (void)filter_params_x;
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
index fc0e65453..637f83cf7 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -21,8 +21,8 @@
 
 void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params) {
   const int bd = 8;
@@ -46,7 +46,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
   /* Horizontal filter */
   {
     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+        filter_params_x, subpel_x_q4 & SUBPEL_MASK);
     const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
@@ -112,7 +112,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
   /* Vertical filter */
   {
     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
     const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
@@ -239,8 +239,8 @@ static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
 
 void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
-                                  InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
                                   const int subpel_x_q4, const int subpel_y_q4,
                                   ConvolveParams *conv_params) {
   (void)filter_params_x;
@@ -357,8 +357,8 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
 
 void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
                                    uint8_t *dst0, int dst_stride0, int w, int h,
-                                   InterpFilterParams *filter_params_x,
-                                   InterpFilterParams *filter_params_y,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
                                    const int subpel_x_q4, const int subpel_y_q4,
                                    ConvolveParams *conv_params) {
   const int bd = 8;
diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c
index 6fdfb0954..0e91ea947 100644
--- a/third_party/aom/av1/common/x86/convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_avx2.c
@@ -19,8 +19,8 @@
 
 void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                            InterpFilterParams *filter_params_x,
-                            InterpFilterParams *filter_params_y,
+                            const InterpFilterParams *filter_params_x,
+                            const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
   int i, j;
@@ -176,8 +176,8 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
 
 void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                            InterpFilterParams *filter_params_x,
-                            InterpFilterParams *filter_params_y,
+                            const InterpFilterParams *filter_params_x,
+                            const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
   int i, j;
@@ -187,10 +187,10 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
 
   __m256i filt[4], coeffs[4];
 
-  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
-  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
-  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
 
   prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
 
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
index 18fe9ae5a..f66dee37d 100644
--- a/third_party/aom/av1/common/x86/convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -23,7 +23,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
                                   const int subpel_q4,
                                   __m128i *const coeffs /* [4] */) {
   const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params, subpel_q4 & SUBPEL_MASK);
+      filter_params, subpel_q4 & SUBPEL_MASK);
   const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
   // coeffs 0 1 0 1 2 3 2 3
   const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
@@ -78,8 +78,8 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s,
 
 void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *dst, int dst_stride, int w, int h,
-                            InterpFilterParams *filter_params_x,
-                            InterpFilterParams *filter_params_y,
+                            const InterpFilterParams *filter_params_x,
+                            const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -239,8 +239,8 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
 
 void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *dst, int dst_stride, int w, int h,
-                            InterpFilterParams *filter_params_x,
-                            InterpFilterParams *filter_params_y,
+                            const InterpFilterParams *filter_params_x,
+                            const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
index a34c618d0..8444ffa93 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -23,8 +23,8 @@
 
 void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
                                     uint16_t *dst, int dst_stride, int w, int h,
-                                    InterpFilterParams *filter_params_x,
-                                    InterpFilterParams *filter_params_y,
+                                    const InterpFilterParams *filter_params_x,
+                                    const InterpFilterParams *filter_params_y,
                                     const int subpel_x_q4,
                                     const int subpel_y_q4,
                                     ConvolveParams *conv_params, int bd) {
@@ -222,8 +222,8 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
 
 void av1_highbd_convolve_2d_copy_sr_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   (void)filter_params_x;
   (void)filter_params_y;
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
index bdf813fa0..15f8872c1 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
@@ -73,8 +73,8 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
 
 void av1_highbd_convolve_2d_copy_sr_sse2(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   (void)filter_params_x;
   (void)filter_params_y;
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
index 5d2fc465e..eb340523a 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -24,8 +24,8 @@
 
 void av1_highbd_jnt_convolve_2d_copy_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -169,8 +169,8 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
 
 void av1_highbd_jnt_convolve_2d_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   DECLARE_ALIGNED(16, int16_t,
                   im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
@@ -207,7 +207,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
   /* Horizontal filter */
   {
     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+        filter_params_x, subpel_x_q4 & SUBPEL_MASK);
     const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
@@ -274,7 +274,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
   /* Vertical filter */
   {
     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
     const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
index a9cf6a4d6..33183fdee 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -20,13 +20,11 @@
 #include "aom_dsp/x86/convolve_sse2.h"
 #include "av1/common/convolve.h"
 
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride,
-                                     uint16_t *dst, int dst_stride, int w,
-                                     int h, InterpFilterParams *filter_params_x,
-                                     InterpFilterParams *filter_params_y,
-                                     const int subpel_x_q4,
-                                     const int subpel_y_q4,
-                                     ConvolveParams *conv_params, int bd) {
+void av1_highbd_convolve_2d_sr_ssse3(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
   int im_h = h + filter_params_y->taps - 1;
   int im_stride = 8;
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
index 89d0ecb1e..608bd88a4 100644
--- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -25,8 +25,8 @@
 
 void av1_highbd_jnt_convolve_2d_copy_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -224,13 +224,11 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
   }
 }
 
-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride,
-                                     uint16_t *dst0, int dst_stride0, int w,
-                                     int h, InterpFilterParams *filter_params_x,
-                                     InterpFilterParams *filter_params_y,
-                                     const int subpel_x_q4,
-                                     const int subpel_y_q4,
-                                     ConvolveParams *conv_params, int bd) {
+void av1_highbd_jnt_convolve_2d_avx2(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -459,13 +457,11 @@ void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride,
   }
 }
 
-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride,
-                                    uint16_t *dst0, int dst_stride0, int w,
-                                    int h, InterpFilterParams *filter_params_x,
-                                    InterpFilterParams *filter_params_y,
-                                    const int subpel_x_q4,
-                                    const int subpel_y_q4,
-                                    ConvolveParams *conv_params, int bd) {
+void av1_highbd_jnt_convolve_x_avx2(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -628,13 +624,11 @@ void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride,
   }
 }
 
-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride,
-                                    uint16_t *dst0, int dst_stride0, int w,
-                                    int h, InterpFilterParams *filter_params_x,
-                                    InterpFilterParams *filter_params_y,
-                                    const int subpel_x_q4,
-                                    const int subpel_y_q4,
-                                    ConvolveParams *conv_params, int bd) {
+void av1_highbd_jnt_convolve_y_avx2(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
index ccca6b07a..1a29985b5 100644
--- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -19,8 +19,8 @@
 
 void av1_highbd_jnt_convolve_y_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -259,8 +259,8 @@ void av1_highbd_jnt_convolve_y_sse4_1(
 
 void av1_highbd_jnt_convolve_x_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
index ac1d2c9ca..d1ea26290 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -23,8 +23,8 @@
 
 void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
                              int dst_stride0, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -53,10 +53,10 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   assert(bits >= 0);
   assert(conv_params->round_0 > 0);
 
-  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
-  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
-  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
 
   prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
 
@@ -126,8 +126,8 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
 
 void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
                              int dst_stride0, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -389,8 +389,8 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
 
 void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
                               int dst_stride0, int w, int h,
-                              InterpFilterParams *filter_params_x,
-                              InterpFilterParams *filter_params_y,
+                              const InterpFilterParams *filter_params_x,
+                              const InterpFilterParams *filter_params_y,
                               const int subpel_x_q4, const int subpel_y_q4,
                               ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -422,10 +422,10 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
 
   assert(conv_params->round_0 > 0);
 
-  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
-  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
-  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
 
   prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
   prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
@@ -581,8 +581,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
 
 void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
                                    uint8_t *dst0, int dst_stride0, int w, int h,
-                                   InterpFilterParams *filter_params_x,
-                                   InterpFilterParams *filter_params_y,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
                                    const int subpel_x_q4, const int subpel_y_q4,
                                    ConvolveParams *conv_params) {
   const int bd = 8;
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
index 4df7bd42e..87dc3242e 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
@@ -18,8 +18,8 @@
 
 void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
                              int dst_stride0, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params) {
   const int bd = 8;
@@ -152,8 +152,8 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
 
 void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
                              int dst_stride0, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params) {
   const int bd = 8;
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
index e4d51ac8d..822772782 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
@@ -18,8 +18,8 @@
 
 void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
                                uint8_t *dst0, int dst_stride0, int w, int h,
-                               InterpFilterParams *filter_params_x,
-                               InterpFilterParams *filter_params_y,
+                               const InterpFilterParams *filter_params_x,
+                               const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -56,7 +56,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
   /* Horizontal filter */
   {
     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+        filter_params_x, subpel_x_q4 & SUBPEL_MASK);
     const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
@@ -124,7 +124,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
   /* Vertical filter */
   {
     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
     const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
 
     // coeffs 0 1 0 1 2 3 2 3
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
index a42c94028..c64150b9d 100644
--- a/third_party/aom/av1/common/x86/selfguided_sse4.c
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include <smmintrin.h>
 
 #include "config/aom_config.h"
diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c
index e92c6b28c..6dbc4f3eb 100644
--- a/third_party/aom/av1/decoder/decodeframe.c
+++ b/third_party/aom/av1/decoder/decodeframe.c
@@ -84,15 +84,15 @@ int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
 }
 
 // Use only_chroma = 1 to only set the chroma planes
-static void set_planes_to_neutral_grey(AV1_COMMON *const cm,
+static void set_planes_to_neutral_grey(const SequenceHeader *const seq_params,
                                        const YV12_BUFFER_CONFIG *const buf,
                                        int only_chroma) {
-  const int val = 1 << (cm->bit_depth - 1);
+  const int val = 1 << (seq_params->bit_depth - 1);
 
   for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
     const int is_uv = plane > 0;
     for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
-      if (cm->use_highbitdepth) {
+      if (seq_params->use_highbitdepth) {
         // TODO(yaowu): replace this with aom_memset16() for speed
         for (int col_idx = 0; col_idx < buf->crop_widths[is_uv]; col_idx++) {
           uint16_t *base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
@@ -157,16 +157,18 @@ static void inverse_transform_block(MACROBLOCKD *xd, int plane,
   memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
 }
 
-static void read_coeffs_tx_intra_block(AV1_COMMON *cm, MACROBLOCKD *const xd,
-                                       aom_reader *const r, int plane, int row,
-                                       int col, TX_SIZE tx_size) {
+static void read_coeffs_tx_intra_block(const AV1_COMMON *const cm,
+                                       MACROBLOCKD *const xd,
+                                       aom_reader *const r, const int plane,
+                                       const int row, const int col,
+                                       const TX_SIZE tx_size) {
   MB_MODE_INFO *mbmi = xd->mi[0];
   if (!mbmi->skip) {
 #if TXCOEFF_TIMER
     struct aom_usec_timer timer;
     aom_usec_timer_start(&timer);
 #endif
-    av1_read_coeffs_txb_facade(cm, xd, r, row, col, plane, tx_size);
+    av1_read_coeffs_txb_facade(cm, xd, r, plane, row, col, tx_size);
 #if TXCOEFF_TIMER
     aom_usec_timer_mark(&timer);
     const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
@@ -176,11 +178,38 @@ static void read_coeffs_tx_intra_block(AV1_COMMON *cm, MACROBLOCKD *const xd,
   }
 }
 
-static void predict_and_reconstruct_intra_block(AV1_COMMON *cm,
-                                                MACROBLOCKD *const xd,
-                                                aom_reader *const r, int plane,
-                                                int row, int col,
-                                                TX_SIZE tx_size) {
+static void decode_block_void(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                              aom_reader *const r, const int plane,
+                              const int row, const int col,
+                              const TX_SIZE tx_size) {
+  (void)cm;
+  (void)xd;
+  (void)r;
+  (void)plane;
+  (void)row;
+  (void)col;
+  (void)tx_size;
+}
+
+static void predict_inter_block_void(AV1_COMMON *const cm,
+                                     MACROBLOCKD *const xd, int mi_row,
+                                     int mi_col, BLOCK_SIZE bsize) {
+  (void)cm;
+  (void)xd;
+  (void)mi_row;
+  (void)mi_col;
+  (void)bsize;
+}
+
+static void cfl_store_inter_block_void(AV1_COMMON *const cm,
+                                       MACROBLOCKD *const xd) {
+  (void)cm;
+  (void)xd;
+}
+
+static void predict_and_reconstruct_intra_block(
+    const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+    const int plane, const int row, const int col, const TX_SIZE tx_size) {
   (void)r;
   MB_MODE_INFO *mbmi = xd->mi[0];
   PLANE_TYPE plane_type = get_plane_type(plane);
@@ -208,28 +237,33 @@ static void predict_and_reconstruct_intra_block(AV1_COMMON *cm,
 
 static void inverse_transform_inter_block(const AV1_COMMON *const cm,
                                           MACROBLOCKD *const xd,
-                                          aom_reader *const r,
+                                          aom_reader *const r, const int plane,
                                           const int blk_row, const int blk_col,
-                                          const int plane,
                                           const TX_SIZE tx_size) {
   (void)r;
   PLANE_TYPE plane_type = get_plane_type(plane);
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  MB_MODE_INFO *mbmi = xd->mi[0];
 
   // tx_type will be read out in av1_read_coeffs_txb_facade
   const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
                                           tx_size, cm->reduced_tx_set_used);
 
-  if (plane == 0)
-    update_txk_array(mbmi->txk_type, mbmi->sb_type, blk_row, blk_col, tx_size,
-                     tx_type);
-
   uint8_t *dst =
       &pd->dst
            .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
   inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
                           cm->reduced_tx_set_used);
+#if CONFIG_MISMATCH_DEBUG
+  int pixel_c, pixel_r;
+  BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  int blk_w = block_size_wide[bsize];
+  int blk_h = block_size_high[bsize];
+  mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
+                  pd->subsampling_x, pd->subsampling_y);
+  mismatch_check_block_tx(dst, pd->dst.stride, cm->frame_offset, plane, pixel_c,
+                          pixel_r, blk_w, blk_h,
+                          xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+#endif
 }
 
 static void set_cb_buffer_offsets(MACROBLOCKD *const xd, TX_SIZE tx_size,
@@ -239,11 +273,12 @@ static void set_cb_buffer_offsets(MACROBLOCKD *const xd, TX_SIZE tx_size,
       xd->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
 }
 
-static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd,
+static void decode_reconstruct_tx(AV1_COMMON *cm, ThreadData *const td,
                                   aom_reader *r, MB_MODE_INFO *const mbmi,
                                   int plane, BLOCK_SIZE plane_bsize,
                                   int blk_row, int blk_col, int block,
                                   TX_SIZE tx_size, int *eob_total) {
+  MACROBLOCKD *const xd = &td->xd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const TX_SIZE plane_tx_size =
       plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
@@ -257,30 +292,11 @@ static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd,
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   if (tx_size == plane_tx_size || plane) {
-#if TXCOEFF_TIMER
-    struct aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-#endif
-    av1_read_coeffs_txb_facade(cm, xd, r, blk_row, blk_col, plane, tx_size);
-#if TXCOEFF_TIMER
-    aom_usec_timer_mark(&timer);
-    const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
-    cm->txcoeff_timer += elapsed_time;
-    ++cm->txb_count;
-#endif
-    inverse_transform_inter_block(cm, xd, r, blk_row, blk_col, plane, tx_size);
+    td->read_coeffs_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+                                         tx_size);
 
-#if CONFIG_MISMATCH_DEBUG
-    int pixel_c, pixel_r;
-    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-    int blk_w = block_size_wide[bsize];
-    int blk_h = block_size_high[bsize];
-    mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
-                    pd->subsampling_x, pd->subsampling_y);
-    mismatch_check_block_tx(dst, pd->dst.stride, cm->frame_offset, plane,
-                            pixel_c, pixel_r, blk_w, blk_h,
-                            xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
-#endif
+    td->inverse_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+                                     tx_size);
     eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
     *eob_total += eob_data->eob;
     set_cb_buffer_offsets(xd, tx_size, plane);
@@ -301,7 +317,7 @@ static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd,
 
         if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
-        decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, offsetr,
+        decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr,
                               offsetc, block, sub_txs, eob_total);
         block += sub_step;
       }
@@ -352,6 +368,7 @@ static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
                               int mi_row, int mi_col, aom_reader *r,
                               PARTITION_TYPE partition, BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
   const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
@@ -363,9 +380,11 @@ static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
   set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
   xd->mi[0]->partition = partition;
   av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
-  if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
+  if (bsize >= BLOCK_8X8 &&
+      (seq_params->subsampling_x || seq_params->subsampling_y)) {
     const BLOCK_SIZE uv_subsize =
-        ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y];
+        ss_size_lookup[bsize][seq_params->subsampling_x]
+                      [seq_params->subsampling_y];
     if (uv_subsize == BLOCK_INVALID)
       aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
                          "Invalid block size.");
@@ -843,8 +862,8 @@ static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,
     BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
                                { xd->plane[0].dst.stride, 0, 0 } };
     if (!ctx) ctx = &default_ctx;
-    av1_build_interintra_predictors_sby(cm, xd, xd->plane[0].dst.buf,
-                                        xd->plane[0].dst.stride, ctx, bsize);
+    av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf,
+                                        xd->plane[0].dst.stride, ctx, 0, bsize);
   }
 }
 
@@ -1052,6 +1071,20 @@ static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
   dec_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
   if (mbmi->motion_mode == OBMC_CAUSAL)
     dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+#if CONFIG_MISMATCH_DEBUG
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblockd_plane *pd = &xd->plane[plane];
+    int pixel_c, pixel_r;
+    mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x,
+                    pd->subsampling_y);
+    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                             pd->subsampling_y))
+      continue;
+    mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset,
+                             plane, pixel_c, pixel_r, pd->width, pd->height,
+                             xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+  }
+#endif
 }
 
 static void set_color_index_map_offset(MACROBLOCKD *const xd, int plane,
@@ -1064,42 +1097,19 @@ static void set_color_index_map_offset(MACROBLOCKD *const xd, int plane,
   xd->color_index_map_offset[plane] += params.plane_width * params.plane_height;
 }
 
-static void decode_token_and_recon_block(AV1Decoder *const pbi,
-                                         MACROBLOCKD *const xd, int mi_row,
-                                         int mi_col, aom_reader *r,
-                                         BLOCK_SIZE bsize) {
+static void decode_token_recon_block(AV1Decoder *const pbi,
+                                     ThreadData *const td, int mi_row,
+                                     int mi_col, aom_reader *r,
+                                     BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &td->xd;
   const int num_planes = av1_num_planes(cm);
-  const int bw = mi_size_wide[bsize];
-  const int bh = mi_size_high[bsize];
-  const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
 
-  set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
   MB_MODE_INFO *mbmi = xd->mi[0];
   CFL_CTX *const cfl = &xd->cfl;
   cfl->is_chroma_reference = is_chroma_reference(
       mi_row, mi_col, bsize, cfl->subsampling_x, cfl->subsampling_y);
 
-  if (cm->delta_q_present_flag) {
-    for (int i = 0; i < MAX_SEGMENTS; i++) {
-      const int current_qindex =
-          av1_get_qindex(&cm->seg, i, xd->current_qindex);
-      for (int j = 0; j < num_planes; ++j) {
-        const int dc_delta_q =
-            j == 0 ? cm->y_dc_delta_q
-                   : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q);
-        const int ac_delta_q =
-            j == 0 ? 0 : (j == 1 ? cm->u_ac_delta_q : cm->v_ac_delta_q);
-        xd->plane[j].seg_dequant_QTX[i][0] =
-            av1_dc_quant_QTX(current_qindex, dc_delta_q, cm->bit_depth);
-        xd->plane[j].seg_dequant_QTX[i][1] =
-            av1_ac_quant_QTX(current_qindex, ac_delta_q, cm->bit_depth);
-      }
-    }
-  }
-  if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
-
   if (!is_inter_block(mbmi)) {
     int row, col;
     assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
@@ -1135,10 +1145,10 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi,
                blk_row += stepr) {
             for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
                  blk_col += stepc) {
-              read_coeffs_tx_intra_block(cm, xd, r, plane, blk_row, blk_col,
-                                         tx_size);
-              predict_and_reconstruct_intra_block(cm, xd, r, plane, blk_row,
-                                                  blk_col, tx_size);
+              td->read_coeffs_tx_intra_block_visit(cm, xd, r, plane, blk_row,
+                                                   blk_col, tx_size);
+              td->predict_and_recon_intra_block_visit(cm, xd, r, plane, blk_row,
+                                                      blk_col, tx_size);
               set_cb_buffer_offsets(xd, tx_size, plane);
             }
           }
@@ -1146,22 +1156,7 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi,
       }
     }
   } else {
-    predict_inter_block(cm, xd, mi_row, mi_col, bsize);
-#if CONFIG_MISMATCH_DEBUG
-    for (int plane = 0; plane < num_planes; ++plane) {
-      const struct macroblockd_plane *pd = &xd->plane[plane];
-      int pixel_c, pixel_r;
-      mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
-                      pd->subsampling_x, pd->subsampling_y);
-      if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                               pd->subsampling_y))
-        continue;
-      mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset,
-                               plane, pixel_c, pixel_r, pd->width, pd->height,
-                               xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
-    }
-#endif
-
+    td->predict_inter_block_visit(cm, xd, mi_row, mi_col, bsize);
     // Reconstruction
     if (!mbmi->skip) {
       int eobtotal = 0;
@@ -1213,7 +1208,7 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi,
                  blk_row += bh_var_tx) {
               for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
                    blk_col += bw_var_tx) {
-                decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize,
+                decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize,
                                       blk_row, blk_col, block, max_tx_size,
                                       &eobtotal);
                 block += step;
@@ -1223,14 +1218,11 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi,
         }
       }
     }
-    cfl_store_inter_block(cm, xd);
+    td->cfl_store_inter_block_visit(cm, xd);
   }
 
   av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
                     set_color_index_map_offset);
-
-  int reader_corrupted_flag = aom_reader_has_error(r);
-  aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
 }
 
 static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
@@ -1338,15 +1330,17 @@ static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter,
   }
 }
 
-static void decode_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
-                         int mi_row, int mi_col, aom_reader *r,
-                         PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
+                               int mi_row, int mi_col, aom_reader *r,
+                               PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &td->xd;
   decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize);
 
   av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
                     av1_decode_palette_tokens);
 
   AV1_COMMON *cm = &pbi->common;
+  const int num_planes = av1_num_planes(cm);
   MB_MODE_INFO *mbmi = xd->mi[0];
   int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
   if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
@@ -1368,7 +1362,63 @@ static void decode_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
                   mbmi->skip && is_inter_block(mbmi), xd);
   }
 
-  decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize);
+  if (cm->delta_q_present_flag) {
+    for (int i = 0; i < MAX_SEGMENTS; i++) {
+      const int current_qindex =
+          av1_get_qindex(&cm->seg, i, xd->current_qindex);
+      for (int j = 0; j < num_planes; ++j) {
+        const int dc_delta_q =
+            j == 0 ? cm->y_dc_delta_q
+                   : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q);
+        const int ac_delta_q =
+            j == 0 ? 0 : (j == 1 ? cm->u_ac_delta_q : cm->v_ac_delta_q);
+        xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX(
+            current_qindex, dc_delta_q, cm->seq_params.bit_depth);
+        xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX(
+            current_qindex, ac_delta_q, cm->seq_params.bit_depth);
+      }
+    }
+  }
+  if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+
+  decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize);
+
+  int reader_corrupted_flag = aom_reader_has_error(r);
+  aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
+}
+
+static void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
+                                           ThreadData *const td, int mi_row,
+                                           int mi_col, BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &td->xd;
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  const int num_planes = av1_num_planes(cm);
+
+  const int offset = mi_row * cm->mi_stride + mi_col;
+  const TileInfo *const tile = &xd->tile;
+
+  xd->mi = cm->mi_grid_visible + offset;
+  xd->cfl.mi_row = mi_row;
+  xd->cfl.mi_col = mi_col;
+
+  set_plane_n4(xd, bw, bh, num_planes);
+
+  // Distance of Mb to the various image edges. These are specified to 8th pel
+  // as they are always compared to values that are in 1/8th pel units
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+  av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+                       mi_col, 0, num_planes);
+}
+
+static void decode_block(AV1Decoder *const pbi, ThreadData *const td,
+                         int mi_row, int mi_col, aom_reader *r,
+                         PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+  (void)partition;
+  set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize);
+  decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize);
 }
 
 static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
@@ -1401,10 +1451,11 @@ static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
 }
 
 // TODO(slavarnway): eliminate bsize and subsize in future commits
-static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+static void decode_partition(AV1Decoder *const pbi, ThreadData *const td,
                              int mi_row, int mi_col, aom_reader *r,
-                             BLOCK_SIZE bsize) {
+                             BLOCK_SIZE bsize, int parse_decode_flag) {
   AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &td->xd;
   const int bw = mi_size_wide[bsize];
   const int hbs = bw >> 1;
   PARTITION_TYPE partition;
@@ -1416,25 +1467,36 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
-  const int num_planes = av1_num_planes(cm);
-  for (int plane = 0; plane < num_planes; ++plane) {
-    int rcol0, rcol1, rrow0, rrow1, tile_tl_idx;
-    if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
-                                           &rcol0, &rcol1, &rrow0, &rrow1,
-                                           &tile_tl_idx)) {
-      const int rstride = cm->rst_info[plane].horz_units_per_tile;
-      for (int rrow = rrow0; rrow < rrow1; ++rrow) {
-        for (int rcol = rcol0; rcol < rcol1; ++rcol) {
-          const int runit_idx = tile_tl_idx + rcol + rrow * rstride;
-          loop_restoration_read_sb_coeffs(cm, xd, r, plane, runit_idx);
+  // parse_decode_flag is a bit mask; it takes the following (binary) values:
+  // 01 - do parse only
+  // 10 - do decode only
+  // 11 - do parse and decode
+  static const block_visitor_fn_t block_visit[4] = {
+    NULL, parse_decode_block, decode_block, parse_decode_block
+  };
+
+  if (parse_decode_flag & 1) {
+    const int num_planes = av1_num_planes(cm);
+    for (int plane = 0; plane < num_planes; ++plane) {
+      int rcol0, rcol1, rrow0, rrow1;
+      if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+                                             &rcol0, &rcol1, &rrow0, &rrow1)) {
+        const int rstride = cm->rst_info[plane].horz_units_per_tile;
+        for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+          for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+            const int runit_idx = rcol + rrow * rstride;
+            loop_restoration_read_sb_coeffs(cm, xd, r, plane, runit_idx);
+          }
         }
       }
     }
-  }
 
-  partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
-                                  : read_partition(xd, mi_row, mi_col, r,
-                                                   has_rows, has_cols, bsize);
+    partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
+                                    : read_partition(xd, mi_row, mi_col, r,
+                                                     has_rows, has_cols, bsize);
+  } else {
+    partition = get_partition(cm, mi_row, mi_col, bsize);
+  }
   subsize = get_partition_subsize(bsize, partition);
 
   // Check the bitstream is conformant: if there is subsampling on the
@@ -1442,18 +1504,19 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
   const struct macroblockd_plane *const pd_u = &xd->plane[1];
   if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) ==
       BLOCK_INVALID) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
                        "Block size %dx%d invalid with this subsampling mode",
                        block_size_wide[subsize], block_size_high[subsize]);
   }
 
 #define DEC_BLOCK_STX_ARG
 #define DEC_BLOCK_EPT_ARG partition,
-#define DEC_BLOCK(db_r, db_c, db_subsize)                   \
-  decode_block(pbi, xd, DEC_BLOCK_STX_ARG(db_r), (db_c), r, \
-               DEC_BLOCK_EPT_ARG(db_subsize))
-#define DEC_PARTITION(db_r, db_c, db_subsize) \
-  decode_partition(pbi, xd, DEC_BLOCK_STX_ARG(db_r), (db_c), r, (db_subsize))
+#define DEC_BLOCK(db_r, db_c, db_subsize)                                     \
+  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), r, \
+                                 DEC_BLOCK_EPT_ARG(db_subsize))
+#define DEC_PARTITION(db_r, db_c, db_subsize)                                 \
+  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), r, (db_subsize), \
+                   parse_decode_flag)
 
   switch (partition) {
     case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break;
@@ -1513,7 +1576,8 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
 #undef DEC_BLOCK_EPT_ARG
 #undef DEC_BLOCK_STX_ARG
 
-  update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+  if (parse_decode_flag & 1)
+    update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 }
 
 static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end,
@@ -1650,7 +1714,7 @@ static void decode_restoration_mode(AV1_COMMON *cm,
   }
 
   if (num_planes > 1) {
-    int s = AOMMIN(cm->subsampling_x, cm->subsampling_y);
+    int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
     if (s && !chroma_none) {
       cm->rst_info[1].restoration_unit_size =
           cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s);
@@ -1872,12 +1936,13 @@ static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) {
 
 static void setup_quantization(AV1_COMMON *const cm,
                                struct aom_read_bit_buffer *rb) {
+  const SequenceHeader *const seq_params = &cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   cm->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
   cm->y_dc_delta_q = read_delta_q(rb);
   if (num_planes > 1) {
     int diff_uv_delta = 0;
-    if (cm->separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb);
+    if (seq_params->separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb);
     cm->u_dc_delta_q = read_delta_q(rb);
     cm->u_ac_delta_q = read_delta_q(rb);
     if (diff_uv_delta) {
@@ -1888,12 +1953,12 @@ static void setup_quantization(AV1_COMMON *const cm,
       cm->v_ac_delta_q = cm->u_ac_delta_q;
     }
   }
-  cm->dequant_bit_depth = cm->bit_depth;
+  cm->dequant_bit_depth = seq_params->bit_depth;
   cm->using_qmatrix = aom_rb_read_bit(rb);
   if (cm->using_qmatrix) {
     cm->qm_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
     cm->qm_u = aom_rb_read_literal(rb, QM_LEVEL_BITS);
-    if (!cm->separate_uv_delta_q)
+    if (!seq_params->separate_uv_delta_q)
       cm->qm_v = cm->qm_u;
     else
       cm->qm_v = aom_rb_read_literal(rb, QM_LEVEL_BITS);
@@ -1906,6 +1971,7 @@ static void setup_quantization(AV1_COMMON *const cm,
 
 // Build y/uv dequant values based on segmentation.
 static void setup_segmentation_dequant(AV1_COMMON *const cm) {
+  const int bit_depth = cm->seq_params.bit_depth;
   const int using_qm = cm->using_qmatrix;
   // When segmentation is disabled, only the first value is used.  The
   // remaining are don't cares.
@@ -1913,16 +1979,16 @@ static void setup_segmentation_dequant(AV1_COMMON *const cm) {
   for (int i = 0; i < max_segments; ++i) {
     const int qindex = av1_get_qindex(&cm->seg, i, cm->base_qindex);
     cm->y_dequant_QTX[i][0] =
-        av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, cm->bit_depth);
-    cm->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, cm->bit_depth);
+        av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, bit_depth);
+    cm->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth);
     cm->u_dequant_QTX[i][0] =
-        av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, cm->bit_depth);
+        av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, bit_depth);
     cm->u_dequant_QTX[i][1] =
-        av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, cm->bit_depth);
+        av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, bit_depth);
     cm->v_dequant_QTX[i][0] =
-        av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, cm->bit_depth);
+        av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, bit_depth);
     cm->v_dequant_QTX[i][1] =
-        av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, cm->bit_depth);
+        av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, bit_depth);
     const int lossless = qindex == 0 && cm->y_dc_delta_q == 0 &&
                          cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
                          cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
@@ -1994,9 +2060,15 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
     // Allocations in av1_alloc_context_buffers() depend on individual
     // dimensions as well as the overall size.
     if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) {
-      if (av1_alloc_context_buffers(cm, width, height))
+      if (av1_alloc_context_buffers(cm, width, height)) {
+        // The cm->mi_* values have been cleared and any existing context
+        // buffers have been freed. Clear cm->width and cm->height to be
+        // consistent and to force a realloc next time.
+        cm->width = 0;
+        cm->height = 0;
         aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                            "Failed to allocate context buffers");
+      }
     } else {
       av1_set_mb_mi(cm, width, height);
     }
@@ -2012,21 +2084,22 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
 
 static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
                              struct aom_read_bit_buffer *rb) {
+  const SequenceHeader *const seq_params = &cm->seq_params;
   int width, height;
   BufferPool *const pool = cm->buffer_pool;
 
   if (frame_size_override_flag) {
-    int num_bits_width = cm->seq_params.num_bits_width;
-    int num_bits_height = cm->seq_params.num_bits_height;
+    int num_bits_width = seq_params->num_bits_width;
+    int num_bits_height = seq_params->num_bits_height;
     av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
-    if (width > cm->seq_params.max_frame_width ||
-        height > cm->seq_params.max_frame_height) {
+    if (width > seq_params->max_frame_width ||
+        height > seq_params->max_frame_height) {
       aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Frame dimensions are larger than the maximum values");
     }
   } else {
-    width = cm->seq_params.max_frame_width;
-    height = cm->seq_params.max_frame_height;
+    width = seq_params->max_frame_width;
+    height = seq_params->max_frame_height;
   }
 
   setup_superres(cm, rb, &width, &height);
@@ -2035,8 +2108,9 @@ static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
 
   lock_buffer_pool(pool);
   if (aom_realloc_frame_buffer(
-          get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
-          cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          get_frame_new_buffer(cm), cm->width, cm->height,
+          seq_params->subsampling_x, seq_params->subsampling_y,
+          seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
           cm->byte_alignment,
           &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
           pool->cb_priv)) {
@@ -2046,18 +2120,22 @@ static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
   }
   unlock_buffer_pool(pool);
 
-  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
-  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
-  pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
-  pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = cm->color_primaries;
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x =
+      seq_params->subsampling_x;
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y =
+      seq_params->subsampling_y;
+  pool->frame_bufs[cm->new_fb_idx].buf.bit_depth =
+      (unsigned int)seq_params->bit_depth;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_primaries =
+      seq_params->color_primaries;
   pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics =
-      cm->transfer_characteristics;
+      seq_params->transfer_characteristics;
   pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients =
-      cm->matrix_coefficients;
-  pool->frame_bufs[cm->new_fb_idx].buf.monochrome = cm->seq_params.monochrome;
+      seq_params->matrix_coefficients;
+  pool->frame_bufs[cm->new_fb_idx].buf.monochrome = seq_params->monochrome;
   pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position =
-      cm->chroma_sample_position;
-  pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
+      seq_params->chroma_sample_position;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_range = seq_params->color_range;
   pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
   pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
 }
@@ -2095,9 +2173,10 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
     }
   }
 
+  const SequenceHeader *const seq_params = &cm->seq_params;
   if (!found) {
-    int num_bits_width = cm->seq_params.num_bits_width;
-    int num_bits_height = cm->seq_params.num_bits_height;
+    int num_bits_width = seq_params->num_bits_width;
+    int num_bits_height = seq_params->num_bits_height;
 
     av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
     setup_superres(cm, rb, &width, &height);
@@ -2122,18 +2201,19 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
                        "Referenced frame has invalid size");
   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
     RefBuffer *const ref_frame = &cm->frame_refs[i];
-    if (!valid_ref_frame_img_fmt(ref_frame->buf->bit_depth,
-                                 ref_frame->buf->subsampling_x,
-                                 ref_frame->buf->subsampling_y, cm->bit_depth,
-                                 cm->subsampling_x, cm->subsampling_y))
+    if (!valid_ref_frame_img_fmt(
+            ref_frame->buf->bit_depth, ref_frame->buf->subsampling_x,
+            ref_frame->buf->subsampling_y, seq_params->bit_depth,
+            seq_params->subsampling_x, seq_params->subsampling_y))
       aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Referenced frame has incompatible color format");
   }
 
   lock_buffer_pool(pool);
   if (aom_realloc_frame_buffer(
-          get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
-          cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          get_frame_new_buffer(cm), cm->width, cm->height,
+          seq_params->subsampling_x, seq_params->subsampling_y,
+          seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
           cm->byte_alignment,
           &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
           pool->cb_priv)) {
@@ -2143,18 +2223,22 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
   }
   unlock_buffer_pool(pool);
 
-  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
-  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
-  pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
-  pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = cm->color_primaries;
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x =
+      seq_params->subsampling_x;
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y =
+      seq_params->subsampling_y;
+  pool->frame_bufs[cm->new_fb_idx].buf.bit_depth =
+      (unsigned int)seq_params->bit_depth;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_primaries =
+      seq_params->color_primaries;
   pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics =
-      cm->transfer_characteristics;
+      seq_params->transfer_characteristics;
   pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients =
-      cm->matrix_coefficients;
-  pool->frame_bufs[cm->new_fb_idx].buf.monochrome = cm->seq_params.monochrome;
+      seq_params->matrix_coefficients;
+  pool->frame_bufs[cm->new_fb_idx].buf.monochrome = seq_params->monochrome;
   pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position =
-      cm->chroma_sample_position;
-  pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
+      seq_params->chroma_sample_position;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_range = seq_params->color_range;
   pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
   pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
 }
@@ -2500,8 +2584,15 @@ static void get_tile_buffers(AV1Decoder *pbi, const uint8_t *data,
   }
 }
 
-static void set_cb_buffer(MACROBLOCKD *const xd, CB_BUFFER *cb_buffer,
-                          const int num_planes) {
+static void set_cb_buffer(AV1Decoder *pbi, MACROBLOCKD *const xd,
+                          CB_BUFFER *cb_buffer_base, const int num_planes,
+                          int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &pbi->common;
+  int mib_size_log2 = cm->seq_params.mib_size_log2;
+  int stride = (cm->mi_cols >> mib_size_log2) + 1;
+  int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
+  CB_BUFFER *cb_buffer = cb_buffer_base + offset;
+
   for (int plane = 0; plane < num_planes; ++plane) {
     xd->plane[plane].dqcoeff_block = cb_buffer->dqcoeff[plane];
     xd->plane[plane].eob_data = cb_buffer->eob_data[plane];
@@ -2514,18 +2605,189 @@ static void set_cb_buffer(MACROBLOCKD *const xd, CB_BUFFER *cb_buffer,
   xd->color_index_map_offset[1] = 0;
 }
 
+static void decoder_alloc_tile_data(AV1Decoder *pbi, const int n_tiles) {
+  AV1_COMMON *const cm = &pbi->common;
+  aom_free(pbi->tile_data);  // release any previous tile array
+  CHECK_MEM_ERROR(cm, pbi->tile_data,
+                  aom_memalign(32, n_tiles * sizeof(*pbi->tile_data)));
+  pbi->allocated_tiles = n_tiles;
+  for (int i = 0; i < n_tiles; i++) {
+    TileDataDec *const tile_data = pbi->tile_data + i;
+    av1_zero(tile_data->dec_row_mt_sync);  // no mutex/cond/cur_sb_col yet
+  }
+  pbi->allocated_row_mt_sync_rows = 0;
+}
+
+// Return the row-sync range (nsync); currently always 1, whatever the width.
+static INLINE int get_sync_range(int width) {
+// The width-based nsync values below were picked by testing but are disabled.
+#if 0
+  if (width < 640)
+    return 1;
+  else if (width <= 1280)
+    return 2;
+  else if (width <= 4096)
+    return 4;
+  else
+    return 8;
+#else
+  (void)width;
+#endif
+  return 1;
+}
+
+// Allocate decoder row-sync state for 'rows' superblock rows.
+static void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync, AV1_COMMON *cm,
+                             int rows) {
+  dec_row_mt_sync->allocated_sb_rows = rows;  // loop bound used on dealloc
+#if CONFIG_MULTITHREAD
+  {
+    int i;
+
+    CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_,
+                    aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows));
+    if (dec_row_mt_sync->mutex_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_mutex_init(&dec_row_mt_sync->mutex_[i], NULL);
+      }
+    }
+
+    CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_,
+                    aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows));
+    if (dec_row_mt_sync->cond_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_cond_init(&dec_row_mt_sync->cond_[i], NULL);
+      }
+    }
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col,
+                  aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows));
+
+  // Set up nsync.
+  dec_row_mt_sync->sync_range = get_sync_range(cm->width);
+}
+
+// Destroy and free the decoder row-sync mutexes, conds and column counters.
+void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) {
+  if (dec_row_mt_sync != NULL) {  // a NULL argument is a no-op
+#if CONFIG_MULTITHREAD
+    int i;
+    if (dec_row_mt_sync->mutex_ != NULL) {
+      for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) {
+        pthread_mutex_destroy(&dec_row_mt_sync->mutex_[i]);
+      }
+      aom_free(dec_row_mt_sync->mutex_);
+    }
+    if (dec_row_mt_sync->cond_ != NULL) {
+      for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) {
+        pthread_cond_destroy(&dec_row_mt_sync->cond_[i]);
+      }
+      aom_free(dec_row_mt_sync->cond_);
+    }
+#endif  // CONFIG_MULTITHREAD
+    aom_free(dec_row_mt_sync->cur_sb_col);
+
+    // Zero the struct: this call may come from a resize, in which case it is
+    // followed by an _alloc() which may fail.
+    av1_zero(*dec_row_mt_sync);
+  }
+}
+
+static INLINE void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r,
+                             int c) {
+#if CONFIG_MULTITHREAD
+  const int nsync = dec_row_mt_sync->sync_range;
+
+  if (r && !(c & (nsync - 1))) {  // row 0 has nothing above to wait on
+    pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[r - 1];
+    pthread_mutex_lock(mutex);
+
+    while (c > dec_row_mt_sync->cur_sb_col[r - 1] - nsync) {
+      pthread_cond_wait(&dec_row_mt_sync->cond_[r - 1], mutex);
+    }
+    pthread_mutex_unlock(mutex);
+  }
+#else  // !CONFIG_MULTITHREAD
+  (void)dec_row_mt_sync;
+  (void)r;
+  (void)c;
+#endif  // CONFIG_MULTITHREAD
+}
+
+static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r,
+                              int c, const int sb_cols) {
+#if CONFIG_MULTITHREAD
+  const int nsync = dec_row_mt_sync->sync_range;
+  int cur;
+  int sig = 1;  // whether to publish progress and signal
+
+  if (c < sb_cols - 1) {
+    cur = c;
+    if (c % nsync) sig = 0;  // only signal every nsync columns
+  } else {
+    cur = sb_cols + nsync;  // lets every sync_read() wait on row r pass
+  }
+
+  if (sig) {
+    pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]);
+
+    dec_row_mt_sync->cur_sb_col[r] = cur;
+
+    pthread_cond_signal(&dec_row_mt_sync->cond_[r]);
+    pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]);
+  }
+#else  // !CONFIG_MULTITHREAD
+  (void)dec_row_mt_sync;
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+static INLINE int get_sb_rows_in_tile(AV1Decoder *pbi, TileInfo tile) {
+  AV1_COMMON *cm = &pbi->common;
+  int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO(
+      tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2);
+  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
+  return sb_rows;  // tile height in superblocks, rounded up
+}
+
+static INLINE int get_sb_cols_in_tile(AV1Decoder *pbi, TileInfo tile) {
+  AV1_COMMON *cm = &pbi->common;
+  int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO(
+      tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2);
+  int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
+  return sb_cols;  // tile width in superblocks, rounded up
+}
+
 static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
                                TileInfo tile_info, const int mi_row) {
   AV1_COMMON *const cm = &pbi->common;
   const int num_planes = av1_num_planes(cm);
-  av1_zero_left_context(&td->xd);
+  TileDataDec *const tile_data =
+      pbi->tile_data + tile_info.tile_row * cm->tile_cols + tile_info.tile_col;
+  const int sb_cols_in_tile = get_sb_cols_in_tile(pbi, tile_info);
+  const int sb_row_in_tile =
+      (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2;
+  int sb_col_in_tile = 0;
 
   for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-       mi_col += cm->seq_params.mib_size) {
-    set_cb_buffer(&td->xd, &td->cb_buffer_base, num_planes);
+       mi_col += cm->seq_params.mib_size, sb_col_in_tile++) {
+    set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
+                  mi_col);
+
+    sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile);
 
-    decode_partition(pbi, &td->xd, mi_row, mi_col, td->bit_reader,
-                     cm->seq_params.sb_size);
+    // Decoding of the super-block
+    decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+                     cm->seq_params.sb_size, 0x2);
+
+    sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile,
+               sb_cols_in_tile);
   }
 }
 
@@ -2555,6 +2817,27 @@ static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
   return 0;
 }
 
+static void set_decode_func_pointers(ThreadData *td, int parse_decode_flag) {
+  td->read_coeffs_tx_intra_block_visit = decode_block_void;
+  td->predict_and_recon_intra_block_visit = decode_block_void;
+  td->read_coeffs_tx_inter_block_visit = decode_block_void;
+  td->inverse_tx_inter_block_visit = decode_block_void;
+  td->predict_inter_block_visit = predict_inter_block_void;
+  td->cfl_store_inter_block_visit = cfl_store_inter_block_void;
+
+  if (parse_decode_flag & 0x1) {  // bit 0: parse (read coefficients)
+    td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block;
+    td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb_facade;
+  }
+  if (parse_decode_flag & 0x2) {  // bit 1: decode (predict and reconstruct)
+    td->predict_and_recon_intra_block_visit =
+        predict_and_reconstruct_intra_block;
+    td->inverse_tx_inter_block_visit = inverse_transform_inter_block;
+    td->predict_inter_block_visit = predict_inter_block;
+    td->cfl_store_inter_block_visit = cfl_store_inter_block;
+  }
+}
+
 static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row,
                         int tile_col) {
   TileInfo tile_info;
@@ -2564,13 +2847,23 @@ static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row,
 
   av1_tile_set_row(&tile_info, cm, tile_row);
   av1_tile_set_col(&tile_info, cm, tile_col);
-  av1_zero_above_context(cm, tile_info.mi_col_start, tile_info.mi_col_end,
-                         tile_row);
+  av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start,
+                         tile_info.mi_col_end, tile_row);
+  av1_reset_loop_filter_delta(&td->xd, num_planes);
   av1_reset_loop_restoration(&td->xd, num_planes);
 
   for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
        mi_row += cm->seq_params.mib_size) {
-    decode_tile_sb_row(pbi, td, tile_info, mi_row);
+    av1_zero_left_context(&td->xd);
+
+    for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+         mi_col += cm->seq_params.mib_size) {
+      set_cb_buffer(pbi, &td->xd, &td->cb_buffer_base, num_planes, 0, 0);
+
+      // Bit-stream parsing and decoding of the superblock
+      decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+                       cm->seq_params.sb_size, 0x3);
+    }
   }
 
   int corrupted =
@@ -2582,6 +2875,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
                                    const uint8_t *data_end, int start_tile,
                                    int end_tile) {
   AV1_COMMON *const cm = &pbi->common;
+  ThreadData *const td = &pbi->td;
   const int tile_cols = cm->tile_cols;
   const int tile_rows = cm->tile_rows;
   const int n_tiles = tile_cols * tile_rows;
@@ -2641,23 +2935,26 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
     get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
 
   if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
-    aom_free(pbi->tile_data);
-    CHECK_MEM_ERROR(cm, pbi->tile_data,
-                    aom_memalign(32, n_tiles * (sizeof(*pbi->tile_data))));
-    pbi->allocated_tiles = n_tiles;
+    decoder_alloc_tile_data(pbi, n_tiles);
   }
 #if CONFIG_ACCOUNTING
   if (pbi->acct_enabled) {
     aom_accounting_reset(&pbi->accounting);
   }
 #endif
+
+  set_decode_func_pointers(&pbi->td, 0x3);
+
   // Load all tile information into thread_data.
+  td->xd = pbi->mb;
+  td->xd.corrupted = 0;
+  td->xd.mc_buf[0] = td->mc_buf[0];
+  td->xd.mc_buf[1] = td->mc_buf[1];
   for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
     const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
 
     for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
       const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
-      ThreadData *const td = &pbi->td;
       TileDataDec *const tile_data = pbi->tile_data + row * cm->tile_cols + col;
       const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col];
 
@@ -2665,13 +2962,10 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
           row * cm->tile_cols + col > end_tile)
         continue;
 
-      td->xd = pbi->mb;
-      td->xd.corrupted = 0;
-      td->xd.mc_buf[0] = pbi->td.mc_buf[0];
-      td->xd.mc_buf[1] = pbi->td.mc_buf[1];
       td->bit_reader = &tile_data->bit_reader;
       av1_zero(td->dqcoeff);
       av1_tile_init(&td->xd.tile, cm, row, col);
+      td->xd.current_qindex = cm->base_qindex;
       setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size,
                          &cm->error, td->bit_reader, allow_update_cdf);
 #if CONFIG_ACCOUNTING
@@ -2691,7 +2985,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
       td->xd.tile_ctx = &tile_data->tctx;
 
       // decode tile
-      decode_tile(pbi, &pbi->td, row, col);
+      decode_tile(pbi, td, row, col);
       aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted);
       if (pbi->mb.corrupted)
         aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
@@ -2729,6 +3023,47 @@ static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) {
   return cur_job_info;
 }
 
+static void tile_worker_hook_init(AV1Decoder *const pbi,
+                                  DecWorkerData *const thread_data,
+                                  const TileBufferDec *const tile_buffer,
+                                  TileDataDec *const tile_data,
+                                  uint8_t allow_update_cdf) {
+  AV1_COMMON *cm = &pbi->common;
+  ThreadData *const td = thread_data->td;
+  int tile_row = tile_data->tile_info.tile_row;
+  int tile_col = tile_data->tile_info.tile_col;
+
+  td->bit_reader = &tile_data->bit_reader;
+  av1_zero(td->dqcoeff);
+  av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
+  td->xd.current_qindex = cm->base_qindex;
+  setup_bool_decoder(tile_buffer->data, thread_data->data_end,
+                     tile_buffer->size, &thread_data->error_info,
+                     td->bit_reader, allow_update_cdf);
+#if CONFIG_ACCOUNTING
+  if (pbi->acct_enabled) {
+    td->bit_reader->accounting = &pbi->accounting;
+    td->bit_reader->accounting->last_tell_frac =
+        aom_reader_tell_frac(td->bit_reader);
+  } else {
+    td->bit_reader->accounting = NULL;
+  }
+#endif  // CONFIG_ACCOUNTING
+  av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+  td->xd.error_info = &thread_data->error_info;  // this worker's error state
+  av1_init_above_context(cm, &td->xd, tile_row);
+
+  // Initialise the tile context from the frame context
+  tile_data->tctx = *cm->fc;
+  td->xd.tile_ctx = &tile_data->tctx;
+#if CONFIG_ACCOUNTING
+  if (pbi->acct_enabled) {
+    tile_data->bit_reader.accounting->last_tell_frac =
+        aom_reader_tell_frac(&tile_data->bit_reader);
+  }
+#endif  // CONFIG_ACCOUNTING
+}
+
 static int tile_worker_hook(void *arg1, void *arg2) {
   DecWorkerData *const thread_data = (DecWorkerData *)arg1;
   AV1Decoder *const pbi = (AV1Decoder *)arg2;
@@ -2736,14 +3071,21 @@ static int tile_worker_hook(void *arg1, void *arg2) {
   ThreadData *const td = thread_data->td;
   uint8_t allow_update_cdf;
 
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
   if (setjmp(thread_data->error_info.jmp)) {
     thread_data->error_info.setjmp = 0;
     thread_data->td->xd.corrupted = 1;
     return 0;
   }
+  thread_data->error_info.setjmp = 1;
+
   allow_update_cdf = cm->large_scale_tile ? 0 : 1;
   allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
 
+  set_decode_func_pointers(td, 0x3);
+
   assert(cm->tile_cols > 0);
   while (1) {
     TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
@@ -2751,46 +3093,248 @@ static int tile_worker_hook(void *arg1, void *arg2) {
     if (cur_job_info != NULL && !td->xd.corrupted) {
       const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
       TileDataDec *const tile_data = cur_job_info->tile_data;
-      volatile int tile_row = tile_data->tile_info.tile_row;
-      volatile int tile_col = tile_data->tile_info.tile_col;
+      tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
+                            allow_update_cdf);
+      // decode tile
+      int tile_row = tile_data->tile_info.tile_row;
+      int tile_col = tile_data->tile_info.tile_col;
+      decode_tile(pbi, td, tile_row, tile_col);
+    } else {
+      break;
+    }
+  }
+  thread_data->error_info.setjmp = 0;
+  return !td->xd.corrupted;
+}
 
-      td->xd = pbi->mb;
-      td->xd.corrupted = 0;
-      td->xd.mc_buf[0] = td->mc_buf[0];
-      td->xd.mc_buf[1] = td->mc_buf[1];
-      td->bit_reader = &tile_data->bit_reader;
-      av1_zero(td->dqcoeff);
-      av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
-      setup_bool_decoder(tile_buffer->data, thread_data->data_end,
-                         tile_buffer->size, &cm->error, td->bit_reader,
-                         allow_update_cdf);
-#if CONFIG_ACCOUNTING
-      if (pbi->acct_enabled) {
-        td->bit_reader->accounting = &pbi->accounting;
-        td->bit_reader->accounting->last_tell_frac =
-            aom_reader_tell_frac(td->bit_reader);
-      } else {
-        td->bit_reader->accounting = NULL;
+static int get_next_job_info(AV1Decoder *const pbi,
+                             AV1DecRowMTJobInfo *next_job_info,
+                             int *end_of_frame) {
+  AV1_COMMON *cm = &pbi->common;
+  TileDataDec *tile_data;
+  AV1DecRowMTSync *dec_row_mt_sync;
+  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+  TileInfo tile_info;
+  const int tile_rows_start = frame_row_mt_info->tile_rows_start;
+  const int tile_rows_end = frame_row_mt_info->tile_rows_end;
+  const int tile_cols_start = frame_row_mt_info->tile_cols_start;
+  const int tile_cols_end = frame_row_mt_info->tile_cols_end;
+  const int start_tile = frame_row_mt_info->start_tile;
+  const int end_tile = frame_row_mt_info->end_tile;
+  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  int num_mis_to_decode, num_threads_working;
+  int num_mis_waiting_for_decode;
+  int min_threads_working = INT_MAX;
+  int max_mis_to_decode = 0;
+  int tile_row_idx, tile_col_idx;
+  int tile_row = 0;
+  int tile_col = 0;
+
+  memset(next_job_info, 0, sizeof(*next_job_info));
+
+  // Frame decode is completed or error is encountered.
+  *end_of_frame = (frame_row_mt_info->mi_rows_decode_started ==
+                   frame_row_mt_info->mi_rows_to_decode) ||
+                  (frame_row_mt_info->row_mt_exit == 1);
+  if (*end_of_frame) {
+    return 1;
+  }
+
+  // Decoding cannot start as bit-stream parsing is not complete.
+  if (frame_row_mt_info->mi_rows_parse_done -
+          frame_row_mt_info->mi_rows_decode_started ==
+      0)
+    return 0;
+
+  // Choose the tile to decode.
+  for (tile_row_idx = tile_rows_start; tile_row_idx < tile_rows_end;
+       ++tile_row_idx) {
+    for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end;
+         ++tile_col_idx) {
+      if (tile_row_idx * cm->tile_cols + tile_col_idx < start_tile ||
+          tile_row_idx * cm->tile_cols + tile_col_idx > end_tile)
+        continue;
+
+      tile_data = pbi->tile_data + tile_row_idx * cm->tile_cols + tile_col_idx;
+      dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+
+      num_threads_working = dec_row_mt_sync->num_threads_working;
+      num_mis_waiting_for_decode = (dec_row_mt_sync->mi_rows_parse_done -
+                                    dec_row_mt_sync->mi_rows_decode_started) *
+                                   dec_row_mt_sync->mi_cols;
+      num_mis_to_decode =
+          (dec_row_mt_sync->mi_rows - dec_row_mt_sync->mi_rows_decode_started) *
+          dec_row_mt_sync->mi_cols;
+
+      assert(num_mis_to_decode >= num_mis_waiting_for_decode);
+
+      // Pick the tile which has minimum number of threads working on it.
+      if (num_mis_waiting_for_decode > 0) {
+        if (num_threads_working < min_threads_working) {
+          min_threads_working = num_threads_working;
+          max_mis_to_decode = 0;
+        }
+        if (num_threads_working == min_threads_working &&
+            num_mis_to_decode > max_mis_to_decode) {
+          max_mis_to_decode = num_mis_to_decode;
+          tile_row = tile_row_idx;
+          tile_col = tile_col_idx;
+        }
       }
+    }
+  }
+
+  tile_data = pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+  tile_info = tile_data->tile_info;
+  dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+
+  next_job_info->tile_row = tile_row;
+  next_job_info->tile_col = tile_col;
+  next_job_info->mi_row =
+      dec_row_mt_sync->mi_rows_decode_started + tile_info.mi_row_start;
+
+  dec_row_mt_sync->num_threads_working++;
+  dec_row_mt_sync->mi_rows_decode_started += sb_mi_size;
+  frame_row_mt_info->mi_rows_decode_started += sb_mi_size;
+
+  return 1;
+}
+
+static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi,
+                                            TileDataDec *const tile_data,
+                                            const int sb_mi_size) {
+  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(pbi->row_mt_mutex_);
 #endif
-      av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
-      av1_init_above_context(cm, &td->xd, tile_row);
+  tile_data->dec_row_mt_sync.mi_rows_parse_done += sb_mi_size;
+  frame_row_mt_info->mi_rows_parse_done += sb_mi_size;
+#if CONFIG_MULTITHREAD
+  pthread_cond_broadcast(pbi->row_mt_cond_);
+  pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+}
 
-      // Initialise the tile context from the frame context
-      tile_data->tctx = *cm->fc;
-      td->xd.tile_ctx = &tile_data->tctx;
-#if CONFIG_ACCOUNTING
-      if (pbi->acct_enabled) {
-        tile_data->bit_reader.accounting->last_tell_frac =
-            aom_reader_tell_frac(&tile_data->bit_reader);
-      }
+static int row_mt_worker_hook(void *arg1, void *arg2) {
+  DecWorkerData *const thread_data = (DecWorkerData *)arg1;
+  AV1Decoder *const pbi = (AV1Decoder *)arg2;
+  AV1_COMMON *cm = &pbi->common;
+  ThreadData *const td = thread_data->td;
+  uint8_t allow_update_cdf;
+  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+  td->xd.corrupted = 0;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(thread_data->error_info.jmp)) {
+    thread_data->error_info.setjmp = 0;
+    thread_data->td->xd.corrupted = 1;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(pbi->row_mt_mutex_);
 #endif
+    frame_row_mt_info->row_mt_exit = 1;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+    return 0;
+  }
+  thread_data->error_info.setjmp = 1;
+
+  const int num_planes = av1_num_planes(cm);
+  allow_update_cdf = cm->large_scale_tile ? 0 : 1;
+  allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+
+  assert(cm->tile_cols > 0);
+  while (1) {
+    TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
+
+    if (cur_job_info != NULL && !td->xd.corrupted) {
+      const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
+      TileDataDec *const tile_data = cur_job_info->tile_data;
+      tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
+                            allow_update_cdf);
+
+      set_decode_func_pointers(td, 0x1);
+
       // decode tile
-      decode_tile(pbi, td, tile_row, tile_col);
+      TileInfo tile_info = tile_data->tile_info;
+      int tile_row = tile_info.tile_row;
+
+      av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start,
+                             tile_info.mi_col_end, tile_row);
+      av1_reset_loop_filter_delta(&td->xd, num_planes);
+      av1_reset_loop_restoration(&td->xd, num_planes);
+
+      for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+           mi_row += cm->seq_params.mib_size) {
+        av1_zero_left_context(&td->xd);
+
+        for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+             mi_col += cm->seq_params.mib_size) {
+          set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
+                        mi_col);
+
+          // Bit-stream parsing of the superblock
+          decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+                           cm->seq_params.sb_size, 0x1);
+        }
+        signal_parse_sb_row_done(pbi, tile_data, sb_mi_size);
+      }
+
+      int corrupted =
+          (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
+      aom_merge_corrupted_flag(&td->xd.corrupted, corrupted);
     } else {
       break;
     }
   }
+
+  set_decode_func_pointers(td, 0x2);
+
+  while (1) {
+    AV1DecRowMTJobInfo next_job_info;
+    int end_of_frame = 0;
+
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+    while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) {
+#if CONFIG_MULTITHREAD
+      pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_);
+#endif
+    }
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+
+    if (end_of_frame) break;
+
+    int tile_row = next_job_info.tile_row;
+    int tile_col = next_job_info.tile_col;
+    int mi_row = next_job_info.mi_row;
+
+    TileDataDec *tile_data =
+        pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+    AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+    TileInfo tile_info = tile_data->tile_info;
+
+    av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
+    av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+    td->xd.error_info = &thread_data->error_info;
+
+    decode_tile_sb_row(pbi, td, tile_info, mi_row);
+
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+    dec_row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+  }
+  thread_data->error_info.setjmp = 0;
   return !td->xd.corrupted;
 }
 
@@ -2842,8 +3386,7 @@ static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm,
                   aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles));
 }
 
-void av1_free_mc_tmp_buf(void *td, int use_highbd) {
-  ThreadData *thread_data = (ThreadData *)td;
+void av1_free_mc_tmp_buf(ThreadData *thread_data, int use_highbd) {
   int ref;
   for (ref = 0; ref < 2; ref++) {
     if (use_highbd)
@@ -2855,10 +3398,8 @@ void av1_free_mc_tmp_buf(void *td, int use_highbd) {
   thread_data->mc_buf_size = 0;
 }
 
-static void allocate_mc_tmp_buf(AV1_COMMON *const cm, void *td, int buf_size,
-                                int use_highbd) {
-  ThreadData *thread_data = (ThreadData *)td;
-
+static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data,
+                                int buf_size, int use_highbd) {
   for (int ref = 0; ref < 2; ref++) {
     if (use_highbd) {
       uint16_t *hbd_mc_buf;
@@ -2872,11 +3413,130 @@ static void allocate_mc_tmp_buf(AV1_COMMON *const cm, void *td, int buf_size,
   thread_data->mc_buf_size = buf_size;
 }
 
+static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
+                              int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+  // Reset tile decoding hook
+  for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
+    AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+    DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+    thread_data->td->xd = pbi->mb;
+    thread_data->td->xd.corrupted = 0;
+    thread_data->td->xd.mc_buf[0] = thread_data->td->mc_buf[0];
+    thread_data->td->xd.mc_buf[1] = thread_data->td->mc_buf[1];
+    winterface->sync(worker);
+
+    worker->hook = worker_hook;
+    worker->data1 = thread_data;
+    worker->data2 = pbi;
+  }
+#if CONFIG_ACCOUNTING
+  if (pbi->acct_enabled) {
+    aom_accounting_reset(&pbi->accounting);
+  }
+#endif
+}
+
+static void launch_dec_workers(AV1Decoder *pbi, const uint8_t *data_end,
+                               int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+  for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
+    AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+    DecWorkerData *const thread_data = (DecWorkerData *)worker->data1;
+
+    thread_data->data_end = data_end;
+
+    worker->had_error = 0;
+    if (worker_idx == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+}
+
+static void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int corrupted = 0;
+
+  for (int worker_idx = num_workers; worker_idx > 0; --worker_idx) {
+    AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1];
+    aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker));
+  }
+
+  pbi->mb.corrupted = corrupted;
+}
+
+static void decode_mt_init(AV1Decoder *pbi) {
+  AV1_COMMON *const cm = &pbi->common;
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int worker_idx;
+
+  // Create workers and thread_data
+  if (pbi->num_workers == 0) {
+    const int num_threads = pbi->max_threads;
+    CHECK_MEM_ERROR(cm, pbi->tile_workers,
+                    aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
+    CHECK_MEM_ERROR(cm, pbi->thread_data,
+                    aom_malloc(num_threads * sizeof(*pbi->thread_data)));
+
+    for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) {
+      AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+      DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+      ++pbi->num_workers;
+
+      winterface->init(worker);
+      if (worker_idx < num_threads - 1 && !winterface->reset(worker)) {
+        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                           "Tile decoder thread creation failed");
+      }
+
+      if (worker_idx < num_threads - 1) {
+        // Allocate thread data.
+        CHECK_MEM_ERROR(cm, thread_data->td,
+                        aom_memalign(32, sizeof(*thread_data->td)));
+        av1_zero(*thread_data->td);
+      } else {
+        // Main thread acts as a worker and uses the thread data in pbi
+        thread_data->td = &pbi->td;
+      }
+      thread_data->error_info.error_code = AOM_CODEC_OK;
+      thread_data->error_info.setjmp = 0;
+    }
+  }
+  const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+  const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
+  for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
+    DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+    if (thread_data->td->mc_buf_size != buf_size) {
+      av1_free_mc_tmp_buf(thread_data->td, use_highbd);
+      allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
+    }
+  }
+}
+
+static void tile_mt_queue(AV1Decoder *pbi, int tile_cols, int tile_rows,
+                          int tile_rows_start, int tile_rows_end,
+                          int tile_cols_start, int tile_cols_end,
+                          int start_tile, int end_tile) {
+  AV1_COMMON *const cm = &pbi->common;
+  if (pbi->tile_mt_info.alloc_tile_cols != tile_cols ||
+      pbi->tile_mt_info.alloc_tile_rows != tile_rows) {
+    av1_dealloc_dec_jobs(&pbi->tile_mt_info);
+    alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols);
+  }
+  enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start,
+                    tile_cols_end, start_tile, end_tile);
+  qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued,
+        sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers);
+}
+
 static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
                                       const uint8_t *data_end, int start_tile,
                                       int end_tile) {
   AV1_COMMON *const cm = &pbi->common;
-  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   const int tile_cols = cm->tile_cols;
   const int tile_rows = cm->tile_rows;
   const int n_tiles = tile_cols * tile_rows;
@@ -2891,7 +3551,6 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
   int tile_cols_end;
   int tile_count_tg;
   int num_workers;
-  int worker_idx;
   const uint8_t *raw_data_end = NULL;
 
   if (cm->large_scale_tile) {
@@ -2923,48 +3582,188 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
   assert(start_tile <= end_tile);
   assert(start_tile >= 0 && end_tile < n_tiles);
 
-  // Create workers and thread_data
-  if (pbi->num_workers == 0) {
-    const int num_threads = pbi->max_threads;
-    CHECK_MEM_ERROR(cm, pbi->tile_workers,
-                    aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
-    CHECK_MEM_ERROR(cm, pbi->thread_data,
-                    aom_malloc(num_threads * sizeof(*pbi->thread_data)));
+  decode_mt_init(pbi);
 
-    for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) {
-      AVxWorker *const worker = &pbi->tile_workers[worker_idx];
-      DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
-      ++pbi->num_workers;
+  // get tile size in tile group
+#if EXT_TILE_DEBUG
+  if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1);
+  if (cm->large_scale_tile)
+    raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+  else
+#endif  // EXT_TILE_DEBUG
+    get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
 
-      winterface->init(worker);
-      if (worker_idx < num_threads - 1 && !winterface->reset(worker)) {
-        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
-                           "Tile decoder thread creation failed");
-      }
+  if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+    decoder_alloc_tile_data(pbi, n_tiles);
+  }
 
-      if (worker_idx < num_threads - 1) {
-        // Allocate thread data.
-        CHECK_MEM_ERROR(cm, thread_data->td,
-                        aom_memalign(32, sizeof(*thread_data->td)));
-        av1_zero(*thread_data->td);
-      } else {
-        // Main thread acts as a worker and uses the thread data in pbi
-        thread_data->td = &pbi->td;
-      }
+  for (int row = 0; row < tile_rows; row++) {
+    for (int col = 0; col < tile_cols; col++) {
+      TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
+      av1_tile_init(&tile_data->tile_info, cm, row, col);
     }
   }
-  const int use_highbd = cm->use_highbitdepth ? 1 : 0;
-  const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
-  for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
-    DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
-    if (thread_data->td->mc_buf_size != buf_size) {
-      av1_free_mc_tmp_buf(thread_data->td, use_highbd);
-      allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
+
+  tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
+                tile_cols_start, tile_cols_end, start_tile, end_tile);
+
+  reset_dec_workers(pbi, tile_worker_hook, num_workers);
+  launch_dec_workers(pbi, data_end, num_workers);
+  sync_dec_workers(pbi, num_workers);
+
+  if (pbi->mb.corrupted)
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Failed to decode tile data");
+
+  if (cm->large_scale_tile) {
+    if (n_tiles == 1) {
+      // Find the end of the single tile buffer
+      return aom_reader_find_end(&pbi->tile_data->bit_reader);
     }
+    // Return the end of the last tile buffer
+    return raw_data_end;
+  }
+  TileDataDec *const tile_data = pbi->tile_data + end_tile;
+
+  return aom_reader_find_end(&tile_data->bit_reader);
+}
+
+static void dec_alloc_cb_buf(AV1Decoder *pbi) {
+  AV1_COMMON *const cm = &pbi->common;
+  int size = ((cm->mi_rows >> cm->seq_params.mib_size_log2) + 1) *
+             ((cm->mi_cols >> cm->seq_params.mib_size_log2) + 1);
+
+  if (pbi->cb_buffer_alloc_size < size) {
+    av1_dec_free_cb_buf(pbi);
+    CHECK_MEM_ERROR(cm, pbi->cb_buffer_base,
+                    aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size));
+    pbi->cb_buffer_alloc_size = size;
+  }
+}
+
+static void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start,
+                              int tile_rows_end, int tile_cols_start,
+                              int tile_cols_end, int start_tile, int end_tile,
+                              int max_sb_rows) {
+  AV1_COMMON *const cm = &pbi->common;
+  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+
+  frame_row_mt_info->tile_rows_start = tile_rows_start;
+  frame_row_mt_info->tile_rows_end = tile_rows_end;
+  frame_row_mt_info->tile_cols_start = tile_cols_start;
+  frame_row_mt_info->tile_cols_end = tile_cols_end;
+  frame_row_mt_info->start_tile = start_tile;
+  frame_row_mt_info->end_tile = end_tile;
+  frame_row_mt_info->mi_rows_to_decode = 0;
+  frame_row_mt_info->mi_rows_parse_done = 0;
+  frame_row_mt_info->mi_rows_decode_started = 0;
+  frame_row_mt_info->row_mt_exit = 0;
+
+  for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+    for (int tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
+      if (tile_row * cm->tile_cols + tile_col < start_tile ||
+          tile_row * cm->tile_cols + tile_col > end_tile)
+        continue;
+
+      TileDataDec *const tile_data =
+          pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+      TileInfo tile_info = tile_data->tile_info;
+
+      tile_data->dec_row_mt_sync.mi_rows_parse_done = 0;
+      tile_data->dec_row_mt_sync.mi_rows_decode_started = 0;
+      tile_data->dec_row_mt_sync.num_threads_working = 0;
+      tile_data->dec_row_mt_sync.mi_rows =
+          ALIGN_POWER_OF_TWO(tile_info.mi_row_end - tile_info.mi_row_start,
+                             cm->seq_params.mib_size_log2);
+      tile_data->dec_row_mt_sync.mi_cols =
+          ALIGN_POWER_OF_TWO(tile_info.mi_col_end - tile_info.mi_col_start,
+                             cm->seq_params.mib_size_log2);
+
+      frame_row_mt_info->mi_rows_to_decode +=
+          tile_data->dec_row_mt_sync.mi_rows;
+
+      // Initialize cur_sb_col to -1 for all SB rows.
+      memset(tile_data->dec_row_mt_sync.cur_sb_col, -1,
+             sizeof(*tile_data->dec_row_mt_sync.cur_sb_col) * max_sb_rows);
+    }
+  }
+
+#if CONFIG_MULTITHREAD
+  if (pbi->row_mt_mutex_ == NULL) {
+    CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_,
+                    aom_malloc(sizeof(*(pbi->row_mt_mutex_))));
+    if (pbi->row_mt_mutex_) {
+      pthread_mutex_init(pbi->row_mt_mutex_, NULL);
+    }
+  }
+
+  if (pbi->row_mt_cond_ == NULL) {
+    CHECK_MEM_ERROR(cm, pbi->row_mt_cond_,
+                    aom_malloc(sizeof(*(pbi->row_mt_cond_))));
+    if (pbi->row_mt_cond_) {
+      pthread_cond_init(pbi->row_mt_cond_, NULL);
+    }
+  }
+#endif
+}
+
+static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
+                                          const uint8_t *data_end,
+                                          int start_tile, int end_tile) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  const int n_tiles = tile_cols * tile_rows;
+  TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+  const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+  const int single_row = pbi->dec_tile_row >= 0;
+  const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+  const int single_col = pbi->dec_tile_col >= 0;
+  int tile_rows_start;
+  int tile_rows_end;
+  int tile_cols_start;
+  int tile_cols_end;
+  int tile_count_tg;
+  int num_workers;
+  const uint8_t *raw_data_end = NULL;
+  int max_sb_rows = 0;
+
+  if (cm->large_scale_tile) {
+    tile_rows_start = single_row ? dec_tile_row : 0;
+    tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+    tile_cols_start = single_col ? dec_tile_col : 0;
+    tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+  } else {
+    tile_rows_start = 0;
+    tile_rows_end = tile_rows;
+    tile_cols_start = 0;
+    tile_cols_end = tile_cols;
   }
+  tile_count_tg = end_tile - start_tile + 1;
+  num_workers = pbi->max_threads;
 
-    // get tile size in tile group
+  // No tiles to decode.
+  if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+      // First tile is larger than end_tile.
+      tile_rows_start * tile_cols + tile_cols_start > end_tile ||
+      // Last tile is smaller than start_tile.
+      (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile)
+    return data;
+
+  assert(tile_rows <= MAX_TILE_ROWS);
+  assert(tile_cols <= MAX_TILE_COLS);
+  assert(tile_count_tg > 0);
+  assert(num_workers > 0);
+  assert(start_tile <= end_tile);
+  assert(start_tile >= 0 && end_tile < n_tiles);
+
+  (void)tile_count_tg;
+
+  decode_mt_init(pbi);
+
+  // get tile size in tile group
 #if EXT_TILE_DEBUG
+  if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1);
   if (cm->large_scale_tile)
     raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
   else
@@ -2972,74 +3771,43 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
     get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
 
   if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
-    aom_free(pbi->tile_data);
-    CHECK_MEM_ERROR(cm, pbi->tile_data,
-                    aom_memalign(32, n_tiles * sizeof(*pbi->tile_data)));
-    pbi->allocated_tiles = n_tiles;
+    for (int i = 0; i < pbi->allocated_tiles; i++) {
+      TileDataDec *const tile_data = pbi->tile_data + i;
+      av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+    }
+    decoder_alloc_tile_data(pbi, n_tiles);
   }
 
-  // Reset tile decoding hook
-  for (worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
-    AVxWorker *const worker = &pbi->tile_workers[worker_idx];
-    DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
-    winterface->sync(worker);
-
-    worker->hook = tile_worker_hook;
-    worker->data1 = thread_data;
-    worker->data2 = pbi;
-  }
-#if CONFIG_ACCOUNTING
-  if (pbi->acct_enabled) {
-    aom_accounting_reset(&pbi->accounting);
-  }
-#endif
   for (int row = 0; row < tile_rows; row++) {
     for (int col = 0; col < tile_cols; col++) {
       TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
       av1_tile_init(&tile_data->tile_info, cm, row, col);
+
+      max_sb_rows =
+          AOMMAX(max_sb_rows, get_sb_rows_in_tile(pbi, tile_data->tile_info));
     }
   }
 
-  if (pbi->tile_mt_info.alloc_tile_cols != tile_cols ||
-      pbi->tile_mt_info.alloc_tile_rows != tile_rows) {
-    av1_dealloc_dec_jobs(&pbi->tile_mt_info);
-    alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols);
+  if (pbi->allocated_row_mt_sync_rows != max_sb_rows) {
+    for (int i = 0; i < n_tiles; ++i) {
+      TileDataDec *const tile_data = pbi->tile_data + i;
+      av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+      dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows);
+    }
+    pbi->allocated_row_mt_sync_rows = max_sb_rows;
   }
-  enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start,
-                    tile_cols_end, start_tile, end_tile);
-  qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued,
-        sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers);
 
-  {
-    const int base = tile_count_tg / num_workers;
-    const int remain = tile_count_tg % num_workers;
-    int tile_start = start_tile;
-    int corrupted = 0;
-
-    for (worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
-      // compute number of tiles assign to each worker
-      const int count = base + (remain + worker_idx) / num_workers;
-      AVxWorker *const worker = &pbi->tile_workers[worker_idx];
-      DecWorkerData *const thread_data = (DecWorkerData *)worker->data1;
-
-      thread_data->data_end = data_end;
-      tile_start += count;
+  tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
+                tile_cols_start, tile_cols_end, start_tile, end_tile);
 
-      worker->had_error = 0;
-      if (worker_idx == num_workers - 1) {
-        winterface->execute(worker);
-      } else {
-        winterface->launch(worker);
-      }
-    }
+  dec_alloc_cb_buf(pbi);
 
-    for (; worker_idx > 0; --worker_idx) {
-      AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1];
-      aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker));
-    }
+  row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start,
+                    tile_cols_end, start_tile, end_tile, max_sb_rows);
 
-    pbi->mb.corrupted = corrupted;
-  }
+  reset_dec_workers(pbi, row_mt_worker_hook, num_workers);
+  launch_dec_workers(pbi, data_end, num_workers);
+  sync_dec_workers(pbi, num_workers);
 
   if (pbi->mb.corrupted)
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
@@ -3064,17 +3832,20 @@ static void error_handler(void *data) {
 }
 
 // Reads the high_bitdepth and twelve_bit fields in color_config() and sets
-// cm->bit_depth based on the values of those fields and cm->profile. Reports
-// errors by calling rb->error_handler() or aom_internal_error().
-static void av1_read_bitdepth(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+// seq_params->bit_depth based on the values of those fields and
+// seq_params->profile. Reports errors by calling rb->error_handler() or
+// aom_internal_error().
+static void read_bitdepth(struct aom_read_bit_buffer *rb,
+                          SequenceHeader *seq_params,
+                          struct aom_internal_error_info *error_info) {
   const int high_bitdepth = aom_rb_read_bit(rb);
-  if (cm->profile == PROFILE_2 && high_bitdepth) {
+  if (seq_params->profile == PROFILE_2 && high_bitdepth) {
     const int twelve_bit = aom_rb_read_bit(rb);
-    cm->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10;
-  } else if (cm->profile <= PROFILE_2) {
-    cm->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8;
+    seq_params->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10;
+  } else if (seq_params->profile <= PROFILE_2) {
+    seq_params->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8;
   } else {
-    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+    aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
                        "Unsupported profile/bit-depth combination");
   }
 }
@@ -3082,6 +3853,7 @@ static void av1_read_bitdepth(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
 void av1_read_film_grain_params(AV1_COMMON *cm,
                                 struct aom_read_bit_buffer *rb) {
   aom_film_grain_t *pars = &cm->film_grain_params;
+  const SequenceHeader *const seq_params = &cm->seq_params;
 
   pars->apply_grain = aom_rb_read_bit(rb);
   if (!pars->apply_grain) {
@@ -3095,6 +3867,8 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
   else
     pars->update_parameters = 1;
 
+  pars->bit_depth = seq_params->bit_depth;
+
   if (!pars->update_parameters) {
     // inherit parameters from a previous reference frame
     RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
@@ -3129,11 +3903,11 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
     pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8);
   }
 
-  if (!cm->seq_params.monochrome)
+  if (!seq_params->monochrome)
     pars->chroma_scaling_from_luma = aom_rb_read_bit(rb);
 
-  if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
-      ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) &&
+  if (seq_params->monochrome || pars->chroma_scaling_from_luma ||
+      ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
        (pars->num_y_points == 0))) {
     pars->num_cb_points = 0;
     pars->num_cr_points = 0;
@@ -3168,7 +3942,7 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
       pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8);
     }
 
-    if ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) &&
+    if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
         (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) ||
          ((pars->num_cb_points != 0) && (pars->num_cr_points == 0))))
       aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
@@ -3222,89 +3996,93 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
 }
 
 static void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
-  if (cm->film_grain_params_present && (cm->show_frame || cm->showable_frame)) {
+  if (cm->seq_params.film_grain_params_present &&
+      (cm->show_frame || cm->showable_frame)) {
     av1_read_film_grain_params(cm, rb);
   } else {
     memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
   }
-  cm->film_grain_params.bit_depth = cm->bit_depth;
+  cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
   memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params,
          sizeof(aom_film_grain_t));
 }
 
-void av1_read_color_config(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
-                           int allow_lowbitdepth) {
-  av1_read_bitdepth(cm, rb);
+void av1_read_color_config(struct aom_read_bit_buffer *rb,
+                           int allow_lowbitdepth, SequenceHeader *seq_params,
+                           struct aom_internal_error_info *error_info) {
+  read_bitdepth(rb, seq_params, error_info);
 
-  cm->use_highbitdepth = cm->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
+  seq_params->use_highbitdepth =
+      seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
   // monochrome bit (not needed for PROFILE_1)
-  const int is_monochrome = cm->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0;
-  cm->seq_params.monochrome = is_monochrome;
+  const int is_monochrome =
+      seq_params->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0;
+  seq_params->monochrome = is_monochrome;
   int color_description_present_flag = aom_rb_read_bit(rb);
   if (color_description_present_flag) {
-    cm->color_primaries = aom_rb_read_literal(rb, 8);
-    cm->transfer_characteristics = aom_rb_read_literal(rb, 8);
-    cm->matrix_coefficients = aom_rb_read_literal(rb, 8);
+    seq_params->color_primaries = aom_rb_read_literal(rb, 8);
+    seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8);
+    seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8);
   } else {
-    cm->color_primaries = AOM_CICP_CP_UNSPECIFIED;
-    cm->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
-    cm->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
+    seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED;
+    seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
+    seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
   }
   if (is_monochrome) {
     // [16,235] (including xvycc) vs [0,255] range
-    cm->color_range = aom_rb_read_bit(rb);
-    cm->subsampling_y = cm->subsampling_x = 1;
-    cm->chroma_sample_position = AOM_CSP_UNKNOWN;
-    cm->separate_uv_delta_q = 0;
+    seq_params->color_range = aom_rb_read_bit(rb);
+    seq_params->subsampling_y = seq_params->subsampling_x = 1;
+    seq_params->chroma_sample_position = AOM_CSP_UNKNOWN;
+    seq_params->separate_uv_delta_q = 0;
     return;
   }
-  if (cm->color_primaries == AOM_CICP_CP_BT_709 &&
-      cm->transfer_characteristics == AOM_CICP_TC_SRGB &&
-      cm->matrix_coefficients == AOM_CICP_MC_IDENTITY) {  // it would be better
-                                                          // to remove this
-                                                          // dependency too
-    cm->subsampling_y = cm->subsampling_x = 0;
-    cm->color_range = 1;  // assume full color-range
-    if (!(cm->profile == PROFILE_1 ||
-          (cm->profile == PROFILE_2 && cm->bit_depth == AOM_BITS_12))) {
+  if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+      seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+      seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+    // It would be good to remove this dependency.
+    seq_params->subsampling_y = seq_params->subsampling_x = 0;
+    seq_params->color_range = 1;  // assume full color-range
+    if (!(seq_params->profile == PROFILE_1 ||
+          (seq_params->profile == PROFILE_2 &&
+           seq_params->bit_depth == AOM_BITS_12))) {
       aom_internal_error(
-          &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          error_info, AOM_CODEC_UNSUP_BITSTREAM,
           "sRGB colorspace not compatible with specified profile");
     }
   } else {
     // [16,235] (including xvycc) vs [0,255] range
-    cm->color_range = aom_rb_read_bit(rb);
-    if (cm->profile == PROFILE_0) {
+    seq_params->color_range = aom_rb_read_bit(rb);
+    if (seq_params->profile == PROFILE_0) {
       // 420 only
-      cm->subsampling_x = cm->subsampling_y = 1;
-    } else if (cm->profile == PROFILE_1) {
+      seq_params->subsampling_x = seq_params->subsampling_y = 1;
+    } else if (seq_params->profile == PROFILE_1) {
       // 444 only
-      cm->subsampling_x = cm->subsampling_y = 0;
+      seq_params->subsampling_x = seq_params->subsampling_y = 0;
     } else {
-      assert(cm->profile == PROFILE_2);
-      if (cm->bit_depth == AOM_BITS_12) {
-        cm->subsampling_x = aom_rb_read_bit(rb);
-        if (cm->subsampling_x)
-          cm->subsampling_y = aom_rb_read_bit(rb);  // 422 or 420
+      assert(seq_params->profile == PROFILE_2);
+      if (seq_params->bit_depth == AOM_BITS_12) {
+        seq_params->subsampling_x = aom_rb_read_bit(rb);
+        if (seq_params->subsampling_x)
+          seq_params->subsampling_y = aom_rb_read_bit(rb);  // 422 or 420
         else
-          cm->subsampling_y = 0;  // 444
+          seq_params->subsampling_y = 0;  // 444
       } else {
         // 422
-        cm->subsampling_x = 1;
-        cm->subsampling_y = 0;
+        seq_params->subsampling_x = 1;
+        seq_params->subsampling_y = 0;
       }
     }
-    if (cm->matrix_coefficients == AOM_CICP_MC_IDENTITY &&
-        (cm->subsampling_x || cm->subsampling_y)) {
+    if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY &&
+        (seq_params->subsampling_x || seq_params->subsampling_y)) {
       aom_internal_error(
-          &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          error_info, AOM_CODEC_UNSUP_BITSTREAM,
           "Identity CICP Matrix incompatible with non 4:4:4 color sampling");
     }
-    if (cm->subsampling_x && cm->subsampling_y) {
-      cm->chroma_sample_position = aom_rb_read_literal(rb, 2);
+    if (seq_params->subsampling_x && seq_params->subsampling_y) {
+      seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2);
     }
   }
-  cm->separate_uv_delta_q = aom_rb_read_bit(rb);
+  seq_params->separate_uv_delta_q = aom_rb_read_bit(rb);
 }
 
 void av1_read_timing_info_header(AV1_COMMON *cm,
@@ -3338,8 +4116,8 @@ void av1_read_decoder_model_info(AV1_COMMON *cm,
       aom_rb_read_literal(rb, 5) + 1;
   cm->buffer_model.num_units_in_decoding_tick = aom_rb_read_unsigned_literal(
       rb, 32);  // Number of units in a decoding tick
-  cm->buffer_model.buffer_removal_delay_length = aom_rb_read_literal(rb, 5) + 1;
-  cm->buffer_model.frame_presentation_delay_length =
+  cm->buffer_model.buffer_removal_time_length = aom_rb_read_literal(rb, 5) + 1;
+  cm->buffer_model.frame_presentation_time_length =
       aom_rb_read_literal(rb, 5) + 1;
 }
 
@@ -3352,32 +4130,27 @@ void av1_read_op_parameters_info(AV1_COMMON *const cm,
                        op_num + 1);
   }
 
-  cm->op_params[op_num].decoder_buffer_delay = aom_rb_read_literal(
+  cm->op_params[op_num].decoder_buffer_delay = aom_rb_read_unsigned_literal(
       rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
 
-  cm->op_params[op_num].encoder_buffer_delay = aom_rb_read_literal(
+  cm->op_params[op_num].encoder_buffer_delay = aom_rb_read_unsigned_literal(
       rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
 
   cm->op_params[op_num].low_delay_mode_flag = aom_rb_read_bit(rb);
 }
 
-static void av1_read_tu_pts_info(AV1_COMMON *const cm,
-                                 struct aom_read_bit_buffer *rb) {
-  cm->tu_presentation_delay =
-      aom_rb_read_literal(rb, cm->buffer_model.frame_presentation_delay_length);
-}
-
-void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
-  // rb->error_handler may be triggered during aom_rb_read_bit(), raising
-  // internal errors and immediate decoding termination. We use a local variable
-  // to store the info. as we decode. At the end, if no errors have occurred,
-  // cm->seq_params is updated.
-  SequenceHeader sh = cm->seq_params;
-  SequenceHeader *const seq_params = &sh;
-  int num_bits_width = aom_rb_read_literal(rb, 4) + 1;
-  int num_bits_height = aom_rb_read_literal(rb, 4) + 1;
-  int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1;
-  int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1;
+static void av1_read_temporal_point_info(AV1_COMMON *const cm,
+                                         struct aom_read_bit_buffer *rb) {
+  cm->frame_presentation_time = aom_rb_read_unsigned_literal(
+      rb, cm->buffer_model.frame_presentation_time_length);
+}
+
+void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+                              SequenceHeader *seq_params) {
+  const int num_bits_width = aom_rb_read_literal(rb, 4) + 1;
+  const int num_bits_height = aom_rb_read_literal(rb, 4) + 1;
+  const int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1;
+  const int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1;
 
   seq_params->num_bits_width = num_bits_width;
   seq_params->num_bits_height = num_bits_height;
@@ -3452,7 +4225,6 @@ void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
   seq_params->enable_superres = aom_rb_read_bit(rb);
   seq_params->enable_cdef = aom_rb_read_bit(rb);
   seq_params->enable_restoration = aom_rb_read_bit(rb);
-  cm->seq_params = *seq_params;
 }
 
 static int read_global_motion_params(WarpedMotionParams *params,
@@ -3640,9 +4412,12 @@ static void show_existing_frame_reset(AV1Decoder *const pbi,
   *cm->fc = cm->frame_contexts[existing_frame_idx];
 }
 
+// On success, returns 0. On failure, calls aom_internal_error and does not
+// return.
 static int read_uncompressed_header(AV1Decoder *pbi,
                                     struct aom_read_bit_buffer *rb) {
   AV1_COMMON *const cm = &pbi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
   MACROBLOCKD *const xd = &pbi->mb;
   BufferPool *const pool = cm->buffer_pool;
   RefCntBuffer *const frame_bufs = pool->frame_bufs;
@@ -3658,7 +4433,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   // NOTE: By default all coded frames to be used as a reference
   cm->is_reference_frame = 1;
 
-  if (cm->seq_params.reduced_still_picture_hdr) {
+  if (seq_params->reduced_still_picture_hdr) {
     cm->show_existing_frame = 0;
     cm->show_frame = 1;
     cm->frame_type = KEY_FRAME;
@@ -3671,12 +4446,12 @@ static int read_uncompressed_header(AV1Decoder *pbi,
       // Show an existing frame directly.
       const int existing_frame_idx = aom_rb_read_literal(rb, 3);
       const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
-      if (cm->seq_params.decoder_model_info_present_flag &&
+      if (seq_params->decoder_model_info_present_flag &&
           cm->timing_info.equal_picture_interval == 0) {
-        av1_read_tu_pts_info(cm, rb);
+        av1_read_temporal_point_info(cm, rb);
       }
-      if (cm->seq_params.frame_id_numbers_present_flag) {
-        int frame_id_length = cm->seq_params.frame_id_length;
+      if (seq_params->frame_id_numbers_present_flag) {
+        int frame_id_length = seq_params->frame_id_length;
         int display_frame_id = aom_rb_read_literal(rb, frame_id_length);
         /* Compare display_frame_id with ref_frame_id and check valid for
          * referencing */
@@ -3719,16 +4494,16 @@ static int read_uncompressed_header(AV1Decoder *pbi,
 
     cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2);  // 2 bits
     cm->show_frame = aom_rb_read_bit(rb);
-    if (cm->seq_params.still_picture &&
+    if (seq_params->still_picture &&
         (cm->frame_type != KEY_FRAME || !cm->show_frame)) {
       aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Still pictures must be coded as shown keyframes");
     }
     cm->showable_frame = cm->frame_type != KEY_FRAME;
     if (cm->show_frame) {
-      if (cm->seq_params.decoder_model_info_present_flag &&
+      if (seq_params->decoder_model_info_present_flag &&
           cm->timing_info.equal_picture_interval == 0)
-        av1_read_tu_pts_info(cm, rb);
+        av1_read_temporal_point_info(cm, rb);
     } else {
       // See if this frame can be used as show_existing_frame in future
       cm->showable_frame = aom_rb_read_bit(rb);
@@ -3742,17 +4517,17 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   }
 
   cm->disable_cdf_update = aom_rb_read_bit(rb);
-  if (cm->seq_params.force_screen_content_tools == 2) {
+  if (seq_params->force_screen_content_tools == 2) {
     cm->allow_screen_content_tools = aom_rb_read_bit(rb);
   } else {
-    cm->allow_screen_content_tools = cm->seq_params.force_screen_content_tools;
+    cm->allow_screen_content_tools = seq_params->force_screen_content_tools;
   }
 
   if (cm->allow_screen_content_tools) {
-    if (cm->seq_params.force_integer_mv == 2) {
+    if (seq_params->force_integer_mv == 2) {
       cm->cur_frame_force_integer_mv = aom_rb_read_bit(rb);
     } else {
-      cm->cur_frame_force_integer_mv = cm->seq_params.force_integer_mv;
+      cm->cur_frame_force_integer_mv = seq_params->force_integer_mv;
     }
   } else {
     cm->cur_frame_force_integer_mv = 0;
@@ -3763,10 +4538,10 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   cm->allow_intrabc = 0;
   cm->primary_ref_frame = PRIMARY_REF_NONE;
 
-  if (!cm->seq_params.reduced_still_picture_hdr) {
-    if (cm->seq_params.frame_id_numbers_present_flag) {
-      int frame_id_length = cm->seq_params.frame_id_length;
-      int diff_len = cm->seq_params.delta_frame_id_length;
+  if (!seq_params->reduced_still_picture_hdr) {
+    if (seq_params->frame_id_numbers_present_flag) {
+      int frame_id_length = seq_params->frame_id_length;
+      int diff_len = seq_params->delta_frame_id_length;
       int prev_frame_id = 0;
       int have_prev_frame_id = !pbi->decoding_first_frame &&
                                !(cm->frame_type == KEY_FRAME && cm->show_frame);
@@ -3811,7 +4586,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
         frame_is_sframe(cm) ? 1 : aom_rb_read_literal(rb, 1);
 
     cm->frame_offset =
-        aom_rb_read_literal(rb, cm->seq_params.order_hint_bits_minus_1 + 1);
+        aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1);
     cm->current_video_frame = cm->frame_offset;
 
     if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
@@ -3819,27 +4594,27 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     }
   }
 
-  if (cm->seq_params.decoder_model_info_present_flag) {
-    cm->buffer_removal_delay_present = aom_rb_read_bit(rb);
-    if (cm->buffer_removal_delay_present) {
+  if (seq_params->decoder_model_info_present_flag) {
+    cm->buffer_removal_time_present = aom_rb_read_bit(rb);
+    if (cm->buffer_removal_time_present) {
       for (int op_num = 0;
-           op_num < cm->seq_params.operating_points_cnt_minus_1 + 1; op_num++) {
+           op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
         if (cm->op_params[op_num].decoder_model_param_present_flag) {
-          if ((((cm->seq_params.operating_point_idc[op_num] >>
+          if ((((seq_params->operating_point_idc[op_num] >>
                  cm->temporal_layer_id) &
                 0x1) &&
-               ((cm->seq_params.operating_point_idc[op_num] >>
+               ((seq_params->operating_point_idc[op_num] >>
                  (cm->spatial_layer_id + 8)) &
                 0x1)) ||
-              cm->seq_params.operating_point_idc[op_num] == 0) {
-            cm->op_frame_timing[op_num].buffer_removal_delay =
-                aom_rb_read_literal(
-                    rb, cm->buffer_model.buffer_removal_delay_length);
+              seq_params->operating_point_idc[op_num] == 0) {
+            cm->op_frame_timing[op_num].buffer_removal_time =
+                aom_rb_read_unsigned_literal(
+                    rb, cm->buffer_model.buffer_removal_time_length);
           } else {
-            cm->op_frame_timing[op_num].buffer_removal_delay = 0;
+            cm->op_frame_timing[op_num].buffer_removal_time = 0;
           }
         } else {
-          cm->op_frame_timing[op_num].buffer_removal_delay = 0;
+          cm->op_frame_timing[op_num].buffer_removal_time = 0;
         }
       }
     }
@@ -3882,11 +4657,11 @@ static int read_uncompressed_header(AV1Decoder *pbi,
 
   if (!frame_is_intra_only(cm) || pbi->refresh_frame_flags != 0xFF) {
     // Read all ref frame order hints if error_resilient_mode == 1
-    if (cm->error_resilient_mode && cm->seq_params.enable_order_hint) {
+    if (cm->error_resilient_mode && seq_params->enable_order_hint) {
       for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
         // Read order hint from bit stream
         unsigned int frame_offset =
-            aom_rb_read_literal(rb, cm->seq_params.order_hint_bits_minus_1 + 1);
+            aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1);
         // Get buffer index
         int buf_idx = cm->ref_frame_map[ref_idx];
         assert(buf_idx < FRAME_BUFFERS);
@@ -3906,10 +4681,10 @@ static int read_uncompressed_header(AV1Decoder *pbi,
           }
           lock_buffer_pool(pool);
           if (aom_realloc_frame_buffer(
-                  &frame_bufs[buf_idx].buf, cm->seq_params.max_frame_width,
-                  cm->seq_params.max_frame_height, cm->subsampling_x,
-                  cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                  cm->byte_alignment,
+                  &frame_bufs[buf_idx].buf, seq_params->max_frame_width,
+                  seq_params->max_frame_height, seq_params->subsampling_x,
+                  seq_params->subsampling_y, seq_params->use_highbitdepth,
+                  AOM_BORDER_IN_PIXELS, cm->byte_alignment,
                   &pool->frame_bufs[buf_idx].raw_frame_buffer, pool->get_fb_cb,
                   pool->cb_priv)) {
             unlock_buffer_pool(pool);
@@ -3917,7 +4692,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
                                "Failed to allocate frame buffer");
           }
           unlock_buffer_pool(pool);
-          set_planes_to_neutral_grey(cm, &frame_bufs[buf_idx].buf, 0);
+          set_planes_to_neutral_grey(seq_params, &frame_bufs[buf_idx].buf, 0);
 
           cm->ref_frame_map[ref_idx] = buf_idx;
           frame_bufs[buf_idx].cur_frame_offset = frame_offset;
@@ -3937,7 +4712,8 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     cm->allow_ref_frame_mvs = 0;
 
     if (cm->intra_only) {
-      cm->cur_frame->film_grain_params_present = cm->film_grain_params_present;
+      cm->cur_frame->film_grain_params_present =
+          seq_params->film_grain_params_present;
       setup_frame_size(cm, frame_size_override_flag, rb);
       if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
         cm->allow_intrabc = aom_rb_read_bit(rb);
@@ -3945,7 +4721,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     } else if (pbi->need_resync != 1) { /* Skip if need resync */
 
       // Frame refs short signaling is off when error resilient mode is on.
-      if (cm->seq_params.enable_order_hint)
+      if (seq_params->enable_order_hint)
         cm->frame_refs_short_signaling = aom_rb_read_bit(rb);
 
       if (cm->frame_refs_short_signaling) {
@@ -3999,9 +4775,9 @@ static int read_uncompressed_header(AV1Decoder *pbi,
 
         cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
 
-        if (cm->seq_params.frame_id_numbers_present_flag) {
-          int frame_id_length = cm->seq_params.frame_id_length;
-          int diff_len = cm->seq_params.delta_frame_id_length;
+        if (seq_params->frame_id_numbers_present_flag) {
+          int frame_id_length = seq_params->frame_id_length;
+          int diff_len = seq_params->delta_frame_id_length;
           int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len);
           int ref_frame_id =
               ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) +
@@ -4064,7 +4840,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only;
   cm->cur_frame->frame_type = cm->frame_type;
 
-  if (cm->seq_params.frame_id_numbers_present_flag) {
+  if (seq_params->frame_id_numbers_present_flag) {
     /* If bitmask is set, update reference frame id values and
        mark frames as valid for reference */
     int refresh_frame_flags = pbi->refresh_frame_flags;
@@ -4077,7 +4853,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   }
 
   const int might_bwd_adapt =
-      !(cm->seq_params.reduced_still_picture_hdr) && !(cm->disable_cdf_update);
+      !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
   if (might_bwd_adapt) {
     cm->refresh_frame_context = aom_rb_read_bit(rb)
                                     ? REFRESH_FRAME_CONTEXT_DISABLED
@@ -4086,14 +4862,16 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
   }
 
-  get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
-  get_frame_new_buffer(cm)->color_primaries = cm->color_primaries;
+  get_frame_new_buffer(cm)->bit_depth = seq_params->bit_depth;
+  get_frame_new_buffer(cm)->color_primaries = seq_params->color_primaries;
   get_frame_new_buffer(cm)->transfer_characteristics =
-      cm->transfer_characteristics;
-  get_frame_new_buffer(cm)->matrix_coefficients = cm->matrix_coefficients;
-  get_frame_new_buffer(cm)->monochrome = cm->seq_params.monochrome;
-  get_frame_new_buffer(cm)->chroma_sample_position = cm->chroma_sample_position;
-  get_frame_new_buffer(cm)->color_range = cm->color_range;
+      seq_params->transfer_characteristics;
+  get_frame_new_buffer(cm)->matrix_coefficients =
+      seq_params->matrix_coefficients;
+  get_frame_new_buffer(cm)->monochrome = seq_params->monochrome;
+  get_frame_new_buffer(cm)->chroma_sample_position =
+      seq_params->chroma_sample_position;
+  get_frame_new_buffer(cm)->color_range = seq_params->color_range;
   get_frame_new_buffer(cm)->render_width = cm->render_width;
   get_frame_new_buffer(cm)->render_height = cm->render_height;
 
@@ -4145,7 +4923,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
 
   read_tile_info(pbi, rb);
   setup_quantization(cm, rb);
-  xd->bd = (int)cm->bit_depth;
+  xd->bd = (int)seq_params->bit_depth;
 
   if (cm->num_allocated_above_context_planes < av1_num_planes(cm) ||
       cm->num_allocated_above_context_mi_col < cm->mi_cols ||
@@ -4196,22 +4974,22 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     cm->lf.filter_level[0] = 0;
     cm->lf.filter_level[1] = 0;
   }
-  if (cm->coded_lossless || !cm->seq_params.enable_cdef) {
+  if (cm->coded_lossless || !seq_params->enable_cdef) {
     cm->cdef_bits = 0;
     cm->cdef_strengths[0] = 0;
     cm->cdef_uv_strengths[0] = 0;
   }
-  if (cm->all_lossless || !cm->seq_params.enable_restoration) {
+  if (cm->all_lossless || !seq_params->enable_restoration) {
     cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
     cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
     cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
   }
   setup_loopfilter(cm, rb);
 
-  if (!cm->coded_lossless && cm->seq_params.enable_cdef) {
+  if (!cm->coded_lossless && seq_params->enable_cdef) {
     setup_cdef(cm, rb);
   }
-  if (!cm->all_lossless && cm->seq_params.enable_restoration) {
+  if (!cm->all_lossless && seq_params->enable_restoration) {
     decode_restoration_mode(cm, rb);
   }
 
@@ -4236,7 +5014,8 @@ static int read_uncompressed_header(AV1Decoder *pbi,
 
   if (!frame_is_intra_only(cm)) read_global_motion(cm, rb);
 
-  cm->cur_frame->film_grain_params_present = cm->film_grain_params_present;
+  cm->cur_frame->film_grain_params_present =
+      seq_params->film_grain_params_present;
   read_film_grain(cm, rb);
 
 #if EXT_TILE_DEBUG
@@ -4282,11 +5061,11 @@ void superres_post_decode(AV1Decoder *pbi) {
   unlock_buffer_pool(pool);
 }
 
-int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
-                                       struct aom_read_bit_buffer *rb,
-                                       const uint8_t *data,
-                                       const uint8_t **p_data_end,
-                                       int trailing_bits_present) {
+uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
+                                            struct aom_read_bit_buffer *rb,
+                                            const uint8_t *data,
+                                            const uint8_t **p_data_end,
+                                            int trailing_bits_present) {
   AV1_COMMON *const cm = &pbi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &pbi->mb;
@@ -4316,7 +5095,8 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
     pbi->dec_tile_col = -1;
   }
 
-  pbi->uncomp_hdr_size = aom_rb_bytes_read(rb);
+  const uint32_t uncomp_hdr_size =
+      (uint32_t)aom_rb_bytes_read(rb);  // Size of the uncompressed header
   YV12_BUFFER_CONFIG *new_fb = get_frame_new_buffer(cm);
   xd->cur_buf = new_fb;
   if (av1_allow_intrabc(cm)) {
@@ -4327,7 +5107,7 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
 
   if (cm->show_existing_frame) {
     // showing a frame directly
-    *p_data_end = data + aom_rb_bytes_read(rb);
+    *p_data_end = data + uncomp_hdr_size;
     if (cm->reset_decoder_state) {
       // Use the default frame context values.
       *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
@@ -4335,7 +5115,7 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
         aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                            "Uninitialized entropy context.");
     }
-    return 0;
+    return uncomp_hdr_size;
   }
 
   cm->setup_mi(cm);
@@ -4344,7 +5124,8 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
 
   av1_setup_motion_field(cm);
 
-  av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y, num_planes);
+  av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
+                         cm->seq_params.subsampling_y, num_planes);
   if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
     // use the default frame context values
     *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
@@ -4356,7 +5137,7 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
                        "Uninitialized entropy context.");
 
   xd->corrupted = 0;
-  return 0;
+  return uncomp_hdr_size;
 }
 
 // Once-per-frame initialization
@@ -4368,7 +5149,7 @@ static void setup_frame_info(AV1Decoder *pbi) {
       cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
     av1_alloc_restoration_buffers(cm);
   }
-  const int use_highbd = cm->use_highbitdepth ? 1 : 0;
+  const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
   const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
   if (pbi->td.mc_buf_size != buf_size) {
     av1_free_mc_tmp_buf(&pbi->td, use_highbd);
@@ -4386,14 +5167,21 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
 
   if (initialize_flag) setup_frame_info(pbi);
 
-  if (pbi->max_threads > 1 && tile_count_tg > 1 && !cm->large_scale_tile)
+  if (pbi->max_threads > 1 && !(cm->large_scale_tile && !pbi->ext_tile_debug) &&
+      pbi->row_mt)
+    *p_data_end =
+        decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile);
+  else if (pbi->max_threads > 1 && tile_count_tg > 1 &&
+           !(cm->large_scale_tile && !pbi->ext_tile_debug))
     *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile);
   else
     *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
 
   const int num_planes = av1_num_planes(cm);
   // If the bit stream is monochrome, set the U and V buffers to a constant.
-  if (num_planes < 3) set_planes_to_neutral_grey(cm, xd->cur_buf, 1);
+  if (num_planes < 3) {
+    set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1);
+  }
 
   if (end_tile != cm->tile_rows * cm->tile_cols - 1) {
     return;
diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h
index 330cedcdc..d289b31f2 100644
--- a/third_party/aom/av1/decoder/decodeframe.h
+++ b/third_party/aom/av1/decoder/decodeframe.h
@@ -18,12 +18,13 @@ extern "C" {
 
 struct AV1Decoder;
 struct aom_read_bit_buffer;
+struct ThreadData;
 
 // Reads the middle part of the sequence header OBU (from
-// frame_width_bits_minus_1 to enable_restoration) into cm->seq_params (a
-// SequenceHeader). Reports errors by calling rb->error_handler() or
-// aom_internal_error().
-void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb);
+// frame_width_bits_minus_1 to enable_restoration) into seq_params.
+// Reports errors by calling rb->error_handler() or aom_internal_error().
+void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+                              SequenceHeader *seq_params);
 
 void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width,
                          int num_bits_height, int *width, int *height);
@@ -34,11 +35,14 @@ BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb);
 int av1_check_trailing_bits(struct AV1Decoder *pbi,
                             struct aom_read_bit_buffer *rb);
 
-int av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi,
-                                       struct aom_read_bit_buffer *rb,
-                                       const uint8_t *data,
-                                       const uint8_t **p_data_end,
-                                       int trailing_bits_present);
+// On success, returns the frame header size. On failure, calls
+// aom_internal_error and does not return.
+// TODO(wtc): Figure out and document the p_data_end parameter.
+uint32_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi,
+                                            struct aom_read_bit_buffer *rb,
+                                            const uint8_t *data,
+                                            const uint8_t **p_data_end,
+                                            int trailing_bits_present);
 
 void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data,
                                     const uint8_t *data_end,
@@ -47,8 +51,9 @@ void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data,
 
 // Implements the color_config() function in the spec. Reports errors by
 // calling rb->error_handler() or aom_internal_error().
-void av1_read_color_config(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
-                           int allow_lowbitdepth);
+void av1_read_color_config(struct aom_read_bit_buffer *rb,
+                           int allow_lowbitdepth, SequenceHeader *seq_params,
+                           struct aom_internal_error_info *error_info);
 
 // Implements the timing_info() function in the spec. Reports errors by calling
 // rb->error_handler().
@@ -69,7 +74,7 @@ struct aom_read_bit_buffer *av1_init_read_bit_buffer(
     struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
     const uint8_t *data_end);
 
-void av1_free_mc_tmp_buf(void *td, int use_highbd);
+void av1_free_mc_tmp_buf(struct ThreadData *thread_data, int use_highbd);
 
 void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
 
diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c
index cc8f4d29e..5e920b18d 100644
--- a/third_party/aom/av1/decoder/decodemv.c
+++ b/third_party/aom/av1/decoder/decodemv.c
@@ -290,7 +290,7 @@ static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd,
       av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1);
 
   if (segment_id < 0 || segment_id > seg->last_active_segid) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
                        "Corrupted segment_ids");
   }
   return segment_id;
@@ -573,7 +573,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
           aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
                           PALETTE_SIZES, ACCT_STR) +
           2;
-      read_palette_colors_y(xd, cm->bit_depth, pmi, r);
+      read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r);
     }
   }
   if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED &&
@@ -587,7 +587,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
           aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
                           PALETTE_SIZES, ACCT_STR) +
           2;
-      read_palette_colors_uv(xd, cm->bit_depth, pmi, r);
+      read_palette_colors_uv(xd, cm->seq_params.bit_depth, pmi, r);
     }
   }
 }
@@ -1299,7 +1299,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
   }
 
   if (is_compound != is_inter_compound_mode(mbmi->mode)) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
                        "Prediction mode %d invalid with ref frame %d %d",
                        mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   }
@@ -1480,8 +1480,9 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
     }
   }
 
-  xd->cfl.is_chroma_reference = is_chroma_reference(
-      mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
+  xd->cfl.is_chroma_reference =
+      is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+                          cm->seq_params.subsampling_y);
   xd->cfl.store_y = store_cfl_required(cm, xd);
 
 #if DEC_MISMATCH_DEBUG
diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c
index 2e91d27d3..e978fad6c 100644
--- a/third_party/aom/av1/decoder/decoder.c
+++ b/third_party/aom/av1/decoder/decoder.c
@@ -71,6 +71,7 @@ static void dec_free_mi(AV1_COMMON *cm) {
   cm->mip = NULL;
   aom_free(cm->mi_grid_base);
   cm->mi_grid_base = NULL;
+  cm->mi_alloc_size = 0;
 }
 
 AV1Decoder *av1_decoder_create(BufferPool *const pool) {
@@ -81,6 +82,9 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
 
   av1_zero(*pbi);
 
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
   if (setjmp(cm->error.jmp)) {
     cm->error.setjmp = 0;
     av1_decoder_remove(pbi);
@@ -98,7 +102,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
   memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts));
 
   pbi->need_resync = 1;
-  once(initialize_dec);
+  aom_once(initialize_dec);
 
   // Initialize the references to not point to any frame buffers.
   memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
@@ -108,7 +112,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
   pbi->decoding_first_frame = 1;
   pbi->common.buffer_pool = pool;
 
-  cm->bit_depth = AOM_BITS_8;
+  cm->seq_params.bit_depth = AOM_BITS_8;
   cm->dequant_bit_depth = AOM_BITS_8;
 
   cm->alloc_mi = av1_dec_alloc_mi;
@@ -146,6 +150,12 @@ void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) {
   }
 }
 
+void av1_dec_free_cb_buf(AV1Decoder *pbi) {
+  aom_free(pbi->cb_buffer_base);
+  pbi->cb_buffer_base = NULL;
+  pbi->cb_buffer_alloc_size = 0;
+}
+
 void av1_decoder_remove(AV1Decoder *pbi) {
   int i;
 
@@ -161,7 +171,7 @@ void av1_decoder_remove(AV1Decoder *pbi) {
   if (pbi->thread_data) {
     for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) {
       DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
-      const int use_highbd = pbi->common.use_highbitdepth ? 1 : 0;
+      const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0;
       av1_free_mc_tmp_buf(thread_data->td, use_highbd);
       aom_free(thread_data->td);
     }
@@ -172,6 +182,20 @@ void av1_decoder_remove(AV1Decoder *pbi) {
     AVxWorker *const worker = &pbi->tile_workers[i];
     aom_get_worker_interface()->end(worker);
   }
+#if CONFIG_MULTITHREAD
+  if (pbi->row_mt_mutex_ != NULL) {
+    pthread_mutex_destroy(pbi->row_mt_mutex_);
+    aom_free(pbi->row_mt_mutex_);
+  }
+  if (pbi->row_mt_cond_ != NULL) {
+    pthread_cond_destroy(pbi->row_mt_cond_);
+    aom_free(pbi->row_mt_cond_);
+  }
+#endif
+  for (i = 0; i < pbi->allocated_tiles; i++) {
+    TileDataDec *const tile_data = pbi->tile_data + i;
+    av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+  }
   aom_free(pbi->tile_data);
   aom_free(pbi->tile_workers);
 
@@ -181,10 +205,11 @@ void av1_decoder_remove(AV1Decoder *pbi) {
     av1_dealloc_dec_jobs(&pbi->tile_mt_info);
   }
 
+  av1_dec_free_cb_buf(pbi);
 #if CONFIG_ACCOUNTING
   aom_accounting_clear(&pbi->accounting);
 #endif
-  const int use_highbd = pbi->common.use_highbitdepth ? 1 : 0;
+  const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0;
   av1_free_mc_tmp_buf(&pbi->td, use_highbd);
 
   aom_free(pbi);
@@ -279,7 +304,7 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
       ref_buf->y_buffer = sd->y_buffer;
       ref_buf->u_buffer = sd->u_buffer;
       ref_buf->v_buffer = sd->v_buffer;
-      ref_buf->use_external_refernce_buffers = 1;
+      ref_buf->use_external_reference_buffers = 1;
     }
   }
 
@@ -414,7 +439,10 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
 
   // Find a free frame buffer. Return error if can not find any.
   cm->new_fb_idx = get_free_fb(cm);
-  if (cm->new_fb_idx == INVALID_IDX) return AOM_CODEC_MEM_ERROR;
+  if (cm->new_fb_idx == INVALID_IDX) {
+    cm->error.error_code = AOM_CODEC_MEM_ERROR;
+    return 1;
+  }
 
   // Assign a MV array to the frame buffer.
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
@@ -423,6 +451,9 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
 
   pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
 
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
   if (setjmp(cm->error.jmp)) {
     const AVxWorkerInterface *const winterface = aom_get_worker_interface();
     int i;
@@ -474,7 +505,13 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
   int frame_decoded =
       aom_decode_frame_from_obus(pbi, source, source + size, psource);
 
-  if (cm->error.error_code != AOM_CODEC_OK) return 1;
+  if (cm->error.error_code != AOM_CODEC_OK) {
+    lock_buffer_pool(pool);
+    decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+    unlock_buffer_pool(pool);
+    cm->error.setjmp = 0;
+    return 1;
+  }
 
 #if TXCOEFF_TIMER
   cm->cum_txcoeff_timer += cm->txcoeff_timer;
@@ -493,7 +530,10 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
     pbi->decoding_first_frame = 0;
   }
 
-  if (cm->error.error_code != AOM_CODEC_OK) return 1;
+  if (cm->error.error_code != AOM_CODEC_OK) {
+    cm->error.setjmp = 0;
+    return 1;
+  }
 
   aom_clear_system_state();
 
diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h
index 42fcc1256..610b98d95 100644
--- a/third_party/aom/av1/decoder/decoder.h
+++ b/third_party/aom/av1/decoder/decoder.h
@@ -33,6 +33,20 @@
 extern "C" {
 #endif
 
+typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm,
+                                          MACROBLOCKD *const xd,
+                                          aom_reader *const r, const int plane,
+                                          const int row, const int col,
+                                          const TX_SIZE tx_size);
+
+typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
+                                                 MACROBLOCKD *const xd,
+                                                 int mi_row, int mi_col,
+                                                 BLOCK_SIZE bsize);
+
+typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
+                                                   MACROBLOCKD *const xd);
+
 typedef struct ThreadData {
   aom_reader *bit_reader;
   DECLARE_ALIGNED(32, MACROBLOCKD, xd);
@@ -41,12 +55,54 @@ typedef struct ThreadData {
   CB_BUFFER cb_buffer_base;
   uint8_t *mc_buf[2];
   int32_t mc_buf_size;
+
+  decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit;
+  decode_block_visitor_fn_t predict_and_recon_intra_block_visit;
+  decode_block_visitor_fn_t read_coeffs_tx_inter_block_visit;
+  decode_block_visitor_fn_t inverse_tx_inter_block_visit;
+  predict_inter_block_visitor_fn_t predict_inter_block_visit;
+  cfl_store_inter_block_visitor_fn_t cfl_store_inter_block_visit;
 } ThreadData;
 
+typedef struct AV1DecRowMTJobInfo {
+  int tile_row;
+  int tile_col;
+  int mi_row;
+} AV1DecRowMTJobInfo;
+
+typedef struct AV1DecRowMTSyncData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_;
+  pthread_cond_t *cond_;
+#endif
+  int allocated_sb_rows;
+  int *cur_sb_col;
+  int sync_range;
+  int mi_rows;
+  int mi_cols;
+  int mi_rows_parse_done;
+  int mi_rows_decode_started;
+  int num_threads_working;
+} AV1DecRowMTSync;
+
+typedef struct AV1DecRowMTInfo {
+  int tile_rows_start;
+  int tile_rows_end;
+  int tile_cols_start;
+  int tile_cols_end;
+  int start_tile;
+  int end_tile;
+  int mi_rows_parse_done;
+  int mi_rows_decode_started;
+  int mi_rows_to_decode;
+  int row_mt_exit;
+} AV1DecRowMTInfo;
+
 typedef struct TileDataDec {
   TileInfo tile_info;
   aom_reader bit_reader;
   DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+  AV1DecRowMTSync dec_row_mt_sync;
 } TileDataDec;
 
 typedef struct TileBufferDec {
@@ -139,9 +195,8 @@ typedef struct AV1Decoder {
   int acct_enabled;
   Accounting accounting;
 #endif
-  size_t uncomp_hdr_size;  // Size of the uncompressed header
-  int tg_size;             // Number of tiles in the current tilegroup
-  int tg_start;            // First tile in the current tilegroup
+  int tg_size;   // Number of tiles in the current tilegroup
+  int tg_start;  // First tile in the current tilegroup
   int tg_size_bit_offset;
   int sequence_header_ready;
 #if CONFIG_INSPECTION
@@ -162,12 +217,27 @@ typedef struct AV1Decoder {
   int tile_count_minus_1;
   uint32_t coded_tile_data_size;
   unsigned int ext_tile_debug;  // for ext-tile software debug & testing
+  unsigned int row_mt;
   EXTERNAL_REFERENCES ext_refs;
   size_t tile_list_size;
   uint8_t *tile_list_output;
   size_t buffer_sz;
+
+  CB_BUFFER *cb_buffer_base;
+  int cb_buffer_alloc_size;
+
+  int allocated_row_mt_sync_rows;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *row_mt_mutex_;
+  pthread_cond_t *row_mt_cond_;
+#endif
+
+  AV1DecRowMTInfo frame_row_mt_info;
 } AV1Decoder;
 
+// Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error
+// code and returns a nonzero value on failure.
 int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size,
                                 const uint8_t **dest);
 
@@ -192,6 +262,10 @@ struct AV1Decoder *av1_decoder_create(BufferPool *const pool);
 void av1_decoder_remove(struct AV1Decoder *pbi);
 void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_jobs_sync);
 
+void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync);
+
+void av1_dec_free_cb_buf(AV1Decoder *pbi);
+
 static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
                                       BufferPool *const pool) {
   if (idx >= 0) {
@@ -207,18 +281,6 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
   }
 }
 
-static INLINE int dec_is_ref_frame_buf(AV1Decoder *const pbi,
-                                       RefCntBuffer *frame_buf) {
-  AV1_COMMON *const cm = &pbi->common;
-  int i;
-  for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-    RefBuffer *const ref_frame = &cm->frame_refs[i];
-    if (ref_frame->idx == INVALID_IDX) continue;
-    if (frame_buf == &cm->buffer_pool->frame_bufs[ref_frame->idx]) break;
-  }
-  return (i < INTER_REFS_PER_FRAME);
-}
-
 #define ACCT_STR __func__
 static INLINE int av1_read_uniform(aom_reader *r, int n) {
   const int l = get_unsigned_bits(n);
@@ -238,6 +300,10 @@ void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row,
                        int mi_col, aom_reader *r, BLOCK_SIZE bsize,
                        palette_visitor_fn_t visit);
 
+typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td,
+                                   int mi_row, int mi_col, aom_reader *r,
+                                   PARTITION_TYPE partition, BLOCK_SIZE bsize);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/decoder/decodetxb.c b/third_party/aom/av1/decoder/decodetxb.c
index f9a3e8578..f3ef2d55e 100644
--- a/third_party/aom/av1/decoder/decodetxb.c
+++ b/third_party/aom/av1/decoder/decodetxb.c
@@ -320,10 +320,14 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
   return cul_level;
 }
 
-uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
-                                   MACROBLOCKD *const xd, aom_reader *const r,
-                                   const int row, const int col,
-                                   const int plane, const TX_SIZE tx_size) {
+void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
+                                MACROBLOCKD *const xd, aom_reader *const r,
+                                const int plane, const int row, const int col,
+                                const TX_SIZE tx_size) {
+#if TXCOEFF_TIMER
+  struct aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+#endif
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[plane];
 
@@ -337,5 +341,22 @@ uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
   const uint8_t cul_level =
       av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size);
   av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, row);
-  return cul_level;
+
+  if (is_inter_block(mbmi)) {
+    PLANE_TYPE plane_type = get_plane_type(plane);
+    // tx_type was already read out by av1_read_coeffs_txb() above.
+    const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, row, col, tx_size,
+                                            cm->reduced_tx_set_used);
+
+    if (plane == 0)
+      update_txk_array(mbmi->txk_type, mbmi->sb_type, row, col, tx_size,
+                       tx_type);
+  }
+
+#if TXCOEFF_TIMER
+  aom_usec_timer_mark(&timer);
+  const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+  cm->txcoeff_timer += elapsed_time;
+  ++cm->txb_count;
+#endif
 }
diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h
index d0b3d8c7a..687bba958 100644
--- a/third_party/aom/av1/decoder/decodetxb.h
+++ b/third_party/aom/av1/decoder/decodetxb.h
@@ -25,8 +25,8 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
                             const TXB_CTX *const txb_ctx,
                             const TX_SIZE tx_size);
 
-uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
-                                   MACROBLOCKD *const xd, aom_reader *const r,
-                                   const int row, const int col,
-                                   const int plane, const TX_SIZE tx_size);
+void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
+                                MACROBLOCKD *const xd, aom_reader *const r,
+                                const int plane, const int row, const int col,
+                                const TX_SIZE tx_size);
 #endif  //  DECODETXB_H_
diff --git a/third_party/aom/av1/decoder/dthread.c b/third_party/aom/av1/decoder/dthread.c
index ff03502e6..3946c787a 100644
--- a/third_party/aom/av1/decoder/dthread.c
+++ b/third_party/aom/av1/decoder/dthread.c
@@ -157,8 +157,8 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker,
   dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
   av1_frameworker_unlock_stats(src_worker);
 
-  dst_cm->bit_depth = src_cm->bit_depth;
-  dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
+  dst_cm->seq_params.bit_depth = src_cm->seq_params.bit_depth;
+  dst_cm->seq_params.use_highbitdepth = src_cm->seq_params.use_highbitdepth;
   // TODO(zoeliu): To handle parallel decoding
   dst_cm->prev_frame =
       src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame;
@@ -166,8 +166,8 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker,
       !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width;
   dst_cm->last_height =
       !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height;
-  dst_cm->subsampling_x = src_cm->subsampling_x;
-  dst_cm->subsampling_y = src_cm->subsampling_y;
+  dst_cm->seq_params.subsampling_x = src_cm->seq_params.subsampling_x;
+  dst_cm->seq_params.subsampling_y = src_cm->seq_params.subsampling_y;
   dst_cm->frame_type = src_cm->frame_type;
   dst_cm->last_show_frame = !src_cm->show_existing_frame
                                 ? src_cm->show_frame
diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h
index 33d89006e..9f854e015 100644
--- a/third_party/aom/av1/decoder/dthread.h
+++ b/third_party/aom/av1/decoder/dthread.h
@@ -39,7 +39,6 @@ typedef struct FrameWorkerData {
   const uint8_t *data_end;
   size_t data_size;
   void *user_priv;
-  int result;
   int worker_id;
   int received_frame;
 
diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c
index 482b6415e..715bc6837 100644
--- a/third_party/aom/av1/decoder/obu.c
+++ b/third_party/aom/av1/decoder/obu.c
@@ -161,6 +161,17 @@ static int is_obu_in_current_operating_point(AV1Decoder *pbi,
   return 0;
 }
 
+static int byte_alignment(AV1_COMMON *const cm,
+                          struct aom_read_bit_buffer *const rb) {
+  while (rb->bit_offset & 7) {
+    if (aom_rb_read_bit(rb)) {
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return -1;
+    }
+  }
+  return 0;
+}
+
 static uint32_t read_temporal_delimiter_obu() { return 0; }
 
 // Returns a boolean that indicates success.
@@ -173,6 +184,13 @@ static int read_bitstream_level(BitstreamLevel *bl,
   return 1;
 }
 
+// Returns whether two sequence headers are consistent with each other.
+// TODO(huisu,wtc@google.com): make sure the code matches the spec exactly.
+static int are_seq_headers_consistent(const SequenceHeader *seq_params_old,
+                                      const SequenceHeader *seq_params_new) {
+  return !memcmp(seq_params_old, seq_params_new, sizeof(SequenceHeader));
+}
+
 // On success, sets pbi->sequence_header_ready to 1 and returns the number of
 // bytes read from 'rb'.
 // On failure, sets pbi->common.error.error_code and returns 0.
@@ -184,14 +202,17 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
   // Verify rb has been configured to report errors.
   assert(rb->error_handler);
 
-  cm->profile = av1_read_profile(rb);
-  if (cm->profile > PROFILE_2) {
+  // Use a local variable to store the information as we decode. At the end,
+  // if no errors have occurred, cm->seq_params is updated.
+  SequenceHeader sh = cm->seq_params;
+  SequenceHeader *const seq_params = &sh;
+
+  seq_params->profile = av1_read_profile(rb);
+  if (seq_params->profile > PROFILE_2) {
     cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
     return 0;
   }
 
-  SequenceHeader *const seq_params = &cm->seq_params;
-
   // Still picture or not
   seq_params->still_picture = aom_rb_read_bit(rb);
   seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb);
@@ -252,7 +273,8 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
           (cm->timing_info.equal_picture_interval ||
            cm->op_params[i].decoder_model_param_present_flag)) {
         cm->op_params[i].bitrate = max_level_bitrate(
-            cm->profile, major_minor_to_seq_level_idx(seq_params->level[i]),
+            seq_params->profile,
+            major_minor_to_seq_level_idx(seq_params->level[i]),
             seq_params->tier[i]);
         // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
         // the check
@@ -305,30 +327,49 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
     return 0;
   }
 
-  read_sequence_header(cm, rb);
+  av1_read_sequence_header(cm, rb, seq_params);
 
-  av1_read_color_config(cm, rb, pbi->allow_lowbitdepth);
+  av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error);
+  if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) &&
+      !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) &&
+      !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) {
+    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                       "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, "
+                       "%d %d subsampling is not supported.\n",
+                       seq_params->subsampling_x, seq_params->subsampling_y);
+  }
 
-  cm->film_grain_params_present = aom_rb_read_bit(rb);
+  seq_params->film_grain_params_present = aom_rb_read_bit(rb);
 
   if (av1_check_trailing_bits(pbi, rb) != 0) {
     // cm->error.error_code is already set.
     return 0;
   }
 
+  // If a sequence header has been decoded before, we check if the new
+  // one is consistent with the old one.
+  if (pbi->sequence_header_ready) {
+    if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) {
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Inconsistent sequence headers received.");
+    }
+  }
+
+  cm->seq_params = *seq_params;
   pbi->sequence_header_ready = 1;
 
   return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
 }
 
+// On success, returns the frame header size. On failure, calls
+// aom_internal_error and does not return.
 static uint32_t read_frame_header_obu(AV1Decoder *pbi,
                                       struct aom_read_bit_buffer *rb,
                                       const uint8_t *data,
                                       const uint8_t **p_data_end,
                                       int trailing_bits_present) {
-  av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end,
-                                     trailing_bits_present);
-  return (uint32_t)(pbi->uncomp_hdr_size);
+  return av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end,
+                                            trailing_bits_present);
 }
 
 static int32_t read_tile_group_header(AV1Decoder *pbi,
@@ -353,7 +394,6 @@ static int32_t read_tile_group_header(AV1Decoder *pbi,
     aom_internal_error(
         &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
         "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0");
-    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
     return -1;
   }
   *start_tile =
@@ -371,9 +411,12 @@ static uint32_t read_one_tile_group_obu(
   int start_tile, end_tile;
   int32_t header_size, tg_payload_size;
 
+  assert((rb->bit_offset & 7) == 0);
+  assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data);
+
   header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile,
                                        tile_start_implicit);
-  if (header_size == -1) return 0;
+  if (header_size == -1 || byte_alignment(cm, rb)) return 0;
   if (start_tile > end_tile) return header_size;
   data += header_size;
   av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile,
@@ -386,44 +429,22 @@ static uint32_t read_one_tile_group_obu(
   return header_size + tg_payload_size;
 }
 
-// Only called while large_scale_tile = 1.
-static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
-                                              struct aom_read_bit_buffer *rb,
-                                              const uint8_t *data,
-                                              const uint8_t *data_end,
-                                              const uint8_t **p_data_end,
-                                              int *frame_decoding_finished) {
-  AV1_COMMON *const cm = &pbi->common;
-  uint32_t tile_list_payload_size = 0;
-  const int num_tiles = cm->tile_cols * cm->tile_rows;
-  const int start_tile = 0;
-  const int end_tile = num_tiles - 1;
-  int i = 0;
-
-  // Process the tile list info.
-  pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
-  pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
-  pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16);
-  if (pbi->tile_count_minus_1 > 511) {
-    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
-    return 0;
-  }
-
-  // Allocate output frame buffer for the tile list.
+static void alloc_tile_list_buffer(AV1Decoder *pbi) {
   // TODO(yunqing): for now, copy each tile's decoded YUV data directly to the
   // output buffer. This needs to be modified according to the application
   // requirement.
+  AV1_COMMON *const cm = &pbi->common;
   const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
   const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
-  const int ssy = cm->subsampling_y;
-  const int ssx = cm->subsampling_x;
+  const int ssy = cm->seq_params.subsampling_y;
+  const int ssx = cm->seq_params.subsampling_x;
   const int num_planes = av1_num_planes(cm);
   const size_t yplane_tile_size = tile_height_in_pixels * tile_width_in_pixels;
   const size_t uvplane_tile_size =
       (num_planes > 1)
           ? (tile_height_in_pixels >> ssy) * (tile_width_in_pixels >> ssx)
           : 0;
-  const size_t tile_size = (cm->use_highbitdepth ? 2 : 1) *
+  const size_t tile_size = (cm->seq_params.use_highbitdepth ? 2 : 1) *
                            (yplane_tile_size + 2 * uvplane_tile_size);
   pbi->tile_list_size = tile_size * (pbi->tile_count_minus_1 + 1);
 
@@ -437,6 +458,83 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
                          "Failed to allocate the tile list output buffer");
     pbi->buffer_sz = pbi->tile_list_size;
   }
+}
+
+static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi,
+                                                  uint8_t **output) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
+  const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
+  const int ssy = cm->seq_params.subsampling_y;
+  const int ssx = cm->seq_params.subsampling_x;
+  const int num_planes = av1_num_planes(cm);
+
+  // Copy decoded tile to the tile list output buffer.
+  YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm);
+  const int mi_row = pbi->dec_tile_row * cm->tile_height;
+  const int mi_col = pbi->dec_tile_col * cm->tile_width;
+  const int is_hbd = (cur_frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  uint8_t *bufs[MAX_MB_PLANE] = { NULL, NULL, NULL };
+  int strides[MAX_MB_PLANE] = { 0, 0, 0 };
+  int plane;
+
+  for (plane = 0; plane < num_planes; ++plane) {
+    int shift_x = plane > 0 ? ssx : 0;
+    int shift_y = plane > 0 ? ssy : 0;
+
+    bufs[plane] = cur_frame->buffers[plane];
+    strides[plane] =
+        (plane > 0) ? cur_frame->strides[1] : cur_frame->strides[0];
+
+    bufs[plane] += mi_row * (MI_SIZE >> shift_y) * strides[plane] +
+                   mi_col * (MI_SIZE >> shift_x);
+
+    if (is_hbd) {
+      bufs[plane] = (uint8_t *)CONVERT_TO_SHORTPTR(bufs[plane]);
+      strides[plane] *= 2;
+    }
+
+    int w, h;
+    w = (plane > 0 && shift_x > 0) ? ((tile_width_in_pixels + 1) >> shift_x)
+                                   : tile_width_in_pixels;
+    w *= (1 + is_hbd);
+    h = (plane > 0 && shift_y > 0) ? ((tile_height_in_pixels + 1) >> shift_y)
+                                   : tile_height_in_pixels;
+    int j;
+
+    for (j = 0; j < h; ++j) {
+      memcpy(*output, bufs[plane], w);
+      bufs[plane] += strides[plane];
+      *output += w;
+    }
+  }
+}
+
+// Only called while large_scale_tile = 1.
+static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
+                                              struct aom_read_bit_buffer *rb,
+                                              const uint8_t *data,
+                                              const uint8_t *data_end,
+                                              const uint8_t **p_data_end,
+                                              int *frame_decoding_finished) {
+  AV1_COMMON *const cm = &pbi->common;
+  uint32_t tile_list_payload_size = 0;
+  const int num_tiles = cm->tile_cols * cm->tile_rows;
+  const int start_tile = 0;
+  const int end_tile = num_tiles - 1;
+  int i = 0;
+
+  // Process the tile list info.
+  pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+  pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+  pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16);
+  if (pbi->tile_count_minus_1 > MAX_TILES - 1) {
+    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    return 0;
+  }
+
+  // Allocate output frame buffer for the tile list.
+  alloc_tile_list_buffer(pbi);
 
   uint32_t tile_list_info_bytes = 4;
   tile_list_payload_size += tile_list_info_bytes;
@@ -485,45 +583,8 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
     data = *p_data_end;
     assert(data <= data_end);
 
-    // Copy decoded tile to the tile list output buffer.
-    YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm);
-    const int mi_row = pbi->dec_tile_row * cm->tile_height;
-    const int mi_col = pbi->dec_tile_col * cm->tile_width;
-    const int is_hbd = (cur_frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-    uint8_t *bufs[MAX_MB_PLANE] = { NULL, NULL, NULL };
-    int strides[MAX_MB_PLANE] = { 0, 0, 0 };
-    int plane;
-
-    for (plane = 0; plane < num_planes; ++plane) {
-      int shift_x = plane > 0 ? ssx : 0;
-      int shift_y = plane > 0 ? ssy : 0;
-
-      bufs[plane] = cur_frame->buffers[plane];
-      strides[plane] =
-          (plane > 0) ? cur_frame->strides[1] : cur_frame->strides[0];
-      if (is_hbd) {
-        bufs[plane] = (uint8_t *)CONVERT_TO_SHORTPTR(cur_frame->buffers[plane]);
-        strides[plane] =
-            (plane > 0) ? 2 * cur_frame->strides[1] : 2 * cur_frame->strides[0];
-      }
-
-      bufs[plane] += mi_row * (MI_SIZE >> shift_y) * strides[plane] +
-                     mi_col * (MI_SIZE >> shift_x);
-
-      int w, h;
-      w = (plane > 0 && shift_x > 0) ? ((tile_width_in_pixels + 1) >> shift_x)
-                                     : tile_width_in_pixels;
-      w *= (1 + is_hbd);
-      h = (plane > 0 && shift_y > 0) ? ((tile_height_in_pixels + 1) >> shift_y)
-                                     : tile_height_in_pixels;
-      int j;
-
-      for (j = 0; j < h; ++j) {
-        memcpy(output, bufs[plane], w);
-        bufs[plane] += strides[plane];
-        output += w;
-      }
-    }
+    // Copy the decoded tile to the tile list output buffer.
+    copy_decoded_tile_to_tile_list_buffer(pbi, &output);
   }
 
   *frame_decoding_finished = 1;
@@ -710,7 +771,6 @@ aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
   return AOM_CODEC_OK;
 }
 
-#define EXT_TILE_DEBUG 0
 // On success, returns a boolean that indicates whether the decoding of the
 // current frame is finished. On failure, sets cm->error.error_code and
 // returns -1.
@@ -720,7 +780,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
   AV1_COMMON *const cm = &pbi->common;
   int frame_decoding_finished = 0;
   int is_first_tg_obu_received = 1;
-  int frame_header_size = 0;
+  uint32_t frame_header_size = 0;
   int seq_header_received = 0;
   size_t seq_header_size = 0;
   ObuHeader obu_header;
@@ -785,7 +845,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
       }
     }
 
-    av1_init_read_bit_buffer(pbi, &rb, data, data_end);
+    av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size);
 
     switch (obu_header.type) {
       case OBU_TEMPORAL_DELIMITER:
@@ -813,21 +873,35 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
         // Only decode first frame header received
         if (!pbi->seen_frame_header ||
             (cm->large_scale_tile && !pbi->camera_frame_header_ready)) {
-          pbi->seen_frame_header = 1;
           frame_header_size = read_frame_header_obu(
               pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME);
-          if (cm->large_scale_tile) pbi->camera_frame_header_ready = 1;
+          pbi->seen_frame_header = 1;
+          if (!pbi->ext_tile_debug && cm->large_scale_tile)
+            pbi->camera_frame_header_ready = 1;
+        } else {
+          // TODO(wtc): Verify that the frame_header_obu is identical to the
+          // original frame_header_obu. For now just skip frame_header_size
+          // bytes in the bit buffer.
+          if (frame_header_size > payload_size) {
+            cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+            return -1;
+          }
+          assert(rb.bit_offset == 0);
+          rb.bit_offset = 8 * frame_header_size;
         }
         decoded_payload_size = frame_header_size;
-        pbi->frame_header_size = (size_t)frame_header_size;
+        pbi->frame_header_size = frame_header_size;
 
         if (cm->show_existing_frame) {
+          if (obu_header.type == OBU_FRAME) {
+            cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+            return -1;
+          }
           frame_decoding_finished = 1;
           pbi->seen_frame_header = 0;
           break;
         }
 
-#if !EXT_TILE_DEBUG
         // In large scale tile coding, decode the common camera frame header
         // before any tile list OBU.
         if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) {
@@ -838,17 +912,18 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
           *p_data_end = data_end;
           break;
         }
-#endif  // EXT_TILE_DEBUG
 
         if (obu_header.type != OBU_FRAME) break;
         obu_payload_offset = frame_header_size;
+        // Byte align the reader before reading the tile group.
+        if (byte_alignment(cm, &rb)) return -1;
         AOM_FALLTHROUGH_INTENDED;  // fall through to read tile group.
       case OBU_TILE_GROUP:
         if (!pbi->seen_frame_header) {
           cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
-        if ((size_t)(data_end - data) < obu_payload_offset) {
+        if (obu_payload_offset > payload_size) {
           cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
@@ -904,4 +979,3 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
 
   return frame_decoding_finished;
 }
-#undef EXT_TILE_DEBUG
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
index c5a6bc831..b721b6d2b 100644
--- a/third_party/aom/av1/encoder/aq_complexity.c
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -66,7 +66,8 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     int segment;
-    const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+    const int aq_strength =
+        get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
 
     // Clear down the segment map.
     memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
@@ -93,7 +94,7 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
 
       qindex_delta = av1_compute_qdelta_by_rate(
           &cpi->rc, cm->frame_type, cm->base_qindex,
-          aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth);
+          aq_c_q_adj_factor[aq_strength][segment], cm->seq_params.bit_depth);
 
       // For AQ complexity mode, we dont allow Q0 in a segment if the base
       // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
@@ -138,7 +139,8 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
     const int target_rate = (int)(num / denom);
     double logvar;
     double low_var_thresh;
-    const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+    const int aq_strength =
+        get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
 
     aom_clear_system_state();
     low_var_thresh = (cpi->oxcf.pass == 2) ? AOMMAX(cpi->twopass.mb_av_energy,
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
index a1fe37d4a..dec2c730d 100644
--- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -137,8 +137,9 @@ static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
 static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
   const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const RATE_CONTROL *const rc = &cpi->rc;
-  int deltaq = av1_compute_qdelta_by_rate(rc, cpi->common.frame_type, q,
-                                          rate_factor, cpi->common.bit_depth);
+  int deltaq =
+      av1_compute_qdelta_by_rate(rc, cpi->common.frame_type, q, rate_factor,
+                                 cpi->common.seq_params.bit_depth);
   if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
     deltaq = -cr->max_qdelta_perc * q / 100;
   }
@@ -164,15 +165,16 @@ int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
   estimated_bits =
       (int)((1.0 - weight_segment1 - weight_segment2) *
                 av1_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
-                                       correction_factor, cm->bit_depth) +
-            weight_segment1 *
-                av1_estimate_bits_at_q(cm->frame_type,
-                                       cm->base_qindex + cr->qindex_delta[1],
-                                       mbs, correction_factor, cm->bit_depth) +
-            weight_segment2 *
-                av1_estimate_bits_at_q(cm->frame_type,
-                                       cm->base_qindex + cr->qindex_delta[2],
-                                       mbs, correction_factor, cm->bit_depth));
+                                       correction_factor,
+                                       cm->seq_params.bit_depth) +
+            weight_segment1 * av1_estimate_bits_at_q(
+                                  cm->frame_type,
+                                  cm->base_qindex + cr->qindex_delta[1], mbs,
+                                  correction_factor, cm->seq_params.bit_depth) +
+            weight_segment2 * av1_estimate_bits_at_q(
+                                  cm->frame_type,
+                                  cm->base_qindex + cr->qindex_delta[2], mbs,
+                                  correction_factor, cm->seq_params.bit_depth));
   return estimated_bits;
 }
 
@@ -197,12 +199,13 @@ int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
   // Compute delta-q corresponding to qindex i.
   int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
   // Take segment weighted average for bits per mb.
-  bits_per_mb = (int)((1.0 - weight_segment) *
-                          av1_rc_bits_per_mb(cm->frame_type, i,
-                                             correction_factor, cm->bit_depth) +
-                      weight_segment *
-                          av1_rc_bits_per_mb(cm->frame_type, i + deltaq,
-                                             correction_factor, cm->bit_depth));
+  bits_per_mb =
+      (int)((1.0 - weight_segment) *
+                av1_rc_bits_per_mb(cm->frame_type, i, correction_factor,
+                                   cm->seq_params.bit_depth) +
+            weight_segment * av1_rc_bits_per_mb(cm->frame_type, i + deltaq,
+                                                correction_factor,
+                                                cm->seq_params.bit_depth));
   return bits_per_mb;
 }
 
@@ -507,7 +510,8 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
   } else {
     int qindex_delta = 0;
     int qindex2;
-    const double q = av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
+    const double q =
+        av1_convert_qindex_to_q(cm->base_qindex, cm->seq_params.bit_depth);
     aom_clear_system_state();
     // Set rate threshold to some multiple (set to 2 for now) of the target
     // rate (target is given by sb64_target_rate and scaled by 256).
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
index 29a311447..6cb6adc42 100644
--- a/third_party/aom/av1/encoder/aq_variance.c
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -71,7 +71,7 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
     for (i = 0; i < MAX_SEGMENTS; ++i) {
       int qindex_delta =
           av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
-                                     rate_ratio[i], cm->bit_depth);
+                                     rate_ratio[i], cm->seq_params.bit_depth);
 
       // We don't allow qindex 0 in a segment if the base value is not 0.
       // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
@@ -235,9 +235,9 @@ int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
 
   const int rate_level = SEGMENT_ID(block_var_level);
   const AV1_COMMON *const cm = &cpi->common;
-  int qindex_delta =
-      av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
-                                 rate_ratio[rate_level], cm->bit_depth);
+  int qindex_delta = av1_compute_qdelta_by_rate(
+      &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[rate_level],
+      cm->seq_params.bit_depth);
 
   if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
     qindex_delta = -cm->base_qindex + 1;
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
index 1c5bdeb25..d0477b35b 100644
--- a/third_party/aom/av1/encoder/av1_quantize.c
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -613,9 +613,9 @@ void av1_init_quantizer(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   QUANTS *const quants = &cpi->quants;
   Dequants *const dequants = &cpi->dequants;
-  av1_build_quantizer(cm->bit_depth, cm->y_dc_delta_q, cm->u_dc_delta_q,
-                      cm->u_ac_delta_q, cm->v_dc_delta_q, cm->v_ac_delta_q,
-                      quants, dequants);
+  av1_build_quantizer(cm->seq_params.bit_depth, cm->y_dc_delta_q,
+                      cm->u_dc_delta_q, cm->u_ac_delta_q, cm->v_dc_delta_q,
+                      cm->v_ac_delta_q, quants, dequants);
 }
 
 void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
@@ -713,7 +713,7 @@ void av1_set_quantizer(AV1_COMMON *cm, int q) {
   cm->qm_u = aom_get_qmlevel(cm->base_qindex + cm->u_ac_delta_q,
                              cm->min_qmlevel, cm->max_qmlevel);
 
-  if (!cm->separate_uv_delta_q)
+  if (!cm->seq_params.separate_uv_delta_q)
     cm->qm_v = cm->qm_u;
   else
     cm->qm_v = aom_get_qmlevel(cm->base_qindex + cm->v_ac_delta_q,
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
index cdd7c2492..2070755cd 100644
--- a/third_party/aom/av1/encoder/bitstream.c
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -769,7 +769,7 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
       aom_write_symbol(w, n - PALETTE_MIN_SIZE,
                        xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
                        PALETTE_SIZES);
-      write_palette_colors_y(xd, pmi, cm->bit_depth, w);
+      write_palette_colors_y(xd, pmi, cm->seq_params.bit_depth, w);
     }
   }
 
@@ -786,7 +786,7 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
       aom_write_symbol(w, n - PALETTE_MIN_SIZE,
                        xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
                        PALETTE_SIZES);
-      write_palette_colors_uv(xd, pmi, cm->bit_depth, w);
+      write_palette_colors_uv(xd, pmi, cm->seq_params.bit_depth, w);
     }
   }
 }
@@ -1421,8 +1421,8 @@ static void write_inter_txb_coeff(AV1_COMMON *const cm, MACROBLOCK *const x,
     for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
          blk_col += bkw) {
       pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize,
-                      cm->bit_depth, *block, blk_row, blk_col, max_tx_size,
-                      token_stats);
+                      cm->seq_params.bit_depth, *block, blk_row, blk_col,
+                      max_tx_size, token_stats);
       *block += step;
     }
   }
@@ -1612,14 +1612,13 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
 
   const int num_planes = av1_num_planes(cm);
   for (int plane = 0; plane < num_planes; ++plane) {
-    int rcol0, rcol1, rrow0, rrow1, tile_tl_idx;
+    int rcol0, rcol1, rrow0, rrow1;
     if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
-                                           &rcol0, &rcol1, &rrow0, &rrow1,
-                                           &tile_tl_idx)) {
+                                           &rcol0, &rcol1, &rrow0, &rrow1)) {
       const int rstride = cm->rst_info[plane].horz_units_per_tile;
       for (int rrow = rrow0; rrow < rrow1; ++rrow) {
         for (int rcol = rcol0; rcol < rcol1; ++rcol) {
-          const int runit_idx = tile_tl_idx + rcol + rrow * rstride;
+          const int runit_idx = rcol + rrow * rstride;
           const RestorationUnitInfo *rui =
               &cm->rst_info[plane].unit_info[runit_idx];
           loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane,
@@ -1705,7 +1704,7 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
   const int mi_col_end = tile->mi_col_end;
   int mi_row, mi_col;
 
-  av1_zero_above_context(cm, mi_col_start, mi_col_end, tile->tile_row);
+  av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row);
   av1_init_above_context(cm, xd, tile->tile_row);
 
   if (cpi->common.delta_q_present_flag) {
@@ -1779,7 +1778,7 @@ static void encode_restoration_mode(AV1_COMMON *cm,
   }
 
   if (num_planes > 1) {
-    int s = AOMMIN(cm->subsampling_x, cm->subsampling_y);
+    int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
     if (s && !chroma_none) {
       aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size !=
                                cm->rst_info[0].restoration_unit_size);
@@ -2020,7 +2019,7 @@ static void encode_quantization(const AV1_COMMON *const cm,
   if (num_planes > 1) {
     int diff_uv_delta = (cm->u_dc_delta_q != cm->v_dc_delta_q) ||
                         (cm->u_ac_delta_q != cm->v_ac_delta_q);
-    if (cm->separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta);
+    if (cm->seq_params.separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta);
     write_delta_q(wb, cm->u_dc_delta_q);
     write_delta_q(wb, cm->u_ac_delta_q);
     if (diff_uv_delta) {
@@ -2032,7 +2031,7 @@ static void encode_quantization(const AV1_COMMON *const cm,
   if (cm->using_qmatrix) {
     aom_wb_write_literal(wb, cm->qm_y, QM_LEVEL_BITS);
     aom_wb_write_literal(wb, cm->qm_u, QM_LEVEL_BITS);
-    if (!cm->separate_uv_delta_q)
+    if (!cm->seq_params.separate_uv_delta_q)
       assert(cm->qm_u == cm->qm_v);
     else
       aom_wb_write_literal(wb, cm->qm_v, QM_LEVEL_BITS);
@@ -2240,7 +2239,8 @@ static int get_refresh_mask_gf16(AV1_COMP *cpi) {
 #endif  // USE_GF16_MULTI_LAYER
 
 static int get_refresh_mask(AV1_COMP *cpi) {
-  if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common))
+  if ((cpi->common.frame_type == KEY_FRAME && cpi->common.show_frame) ||
+      frame_is_sframe(&cpi->common))
     return 0xFF;
 
   int refresh_mask = 0;
@@ -2258,9 +2258,15 @@ static int get_refresh_mask(AV1_COMP *cpi) {
   //     LAST3_FRAME.
   refresh_mask |=
       (cpi->refresh_last_frame << cpi->ref_fb_idx[LAST_REF_FRAMES - 1]);
-
+#if USE_SYMM_MULTI_LAYER
+  refresh_mask |=
+      (cpi->new_bwdref_update_rule == 1)
+          ? (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[EXTREF_FRAME - 1])
+          : (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1]);
+#else
   refresh_mask |=
       (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1]);
+#endif
   refresh_mask |=
       (cpi->refresh_alt2_ref_frame << cpi->ref_fb_idx[ALTREF2_FRAME - 1]);
 
@@ -2419,80 +2425,82 @@ static void write_profile(BITSTREAM_PROFILE profile,
   aom_wb_write_literal(wb, profile, PROFILE_BITS);
 }
 
-static void write_bitdepth(AV1_COMMON *const cm,
+static void write_bitdepth(const SequenceHeader *const seq_params,
                            struct aom_write_bit_buffer *wb) {
   // Profile 0/1: [0] for 8 bit, [1]  10-bit
   // Profile   2: [0] for 8 bit, [10] 10-bit, [11] - 12-bit
-  aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_8 ? 0 : 1);
-  if (cm->profile == PROFILE_2 && cm->bit_depth != AOM_BITS_8) {
-    aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_10 ? 0 : 1);
+  aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ? 0 : 1);
+  if (seq_params->profile == PROFILE_2 && seq_params->bit_depth != AOM_BITS_8) {
+    aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_10 ? 0 : 1);
   }
 }
 
-static void write_color_config(AV1_COMMON *const cm,
+static void write_color_config(const SequenceHeader *const seq_params,
                                struct aom_write_bit_buffer *wb) {
-  write_bitdepth(cm, wb);
-  const int is_monochrome = cm->seq_params.monochrome;
+  write_bitdepth(seq_params, wb);
+  const int is_monochrome = seq_params->monochrome;
   // monochrome bit
-  if (cm->profile != PROFILE_1)
+  if (seq_params->profile != PROFILE_1)
     aom_wb_write_bit(wb, is_monochrome);
   else
     assert(!is_monochrome);
-  if (cm->color_primaries == AOM_CICP_CP_UNSPECIFIED &&
-      cm->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED &&
-      cm->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) {
+  if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED &&
+      seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED &&
+      seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) {
     aom_wb_write_bit(wb, 0);  // No color description present
   } else {
     aom_wb_write_bit(wb, 1);  // Color description present
-    aom_wb_write_literal(wb, cm->color_primaries, 8);
-    aom_wb_write_literal(wb, cm->transfer_characteristics, 8);
-    aom_wb_write_literal(wb, cm->matrix_coefficients, 8);
+    aom_wb_write_literal(wb, seq_params->color_primaries, 8);
+    aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8);
+    aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8);
   }
   if (is_monochrome) {
     // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
-    aom_wb_write_bit(wb, cm->color_range);
+    aom_wb_write_bit(wb, seq_params->color_range);
     return;
   }
-  if (cm->color_primaries == AOM_CICP_CP_BT_709 &&
-      cm->transfer_characteristics == AOM_CICP_TC_SRGB &&
-      cm->matrix_coefficients ==
+  if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+      seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+      seq_params->matrix_coefficients ==
           AOM_CICP_MC_IDENTITY) {  // it would be better to remove this
                                    // dependency too
-    assert(cm->subsampling_x == 0 && cm->subsampling_y == 0);
-    assert(cm->profile == PROFILE_1 ||
-           (cm->profile == PROFILE_2 && cm->bit_depth == AOM_BITS_12));
+    assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+    assert(seq_params->profile == PROFILE_1 ||
+           (seq_params->profile == PROFILE_2 &&
+            seq_params->bit_depth == AOM_BITS_12));
   } else {
     // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
-    aom_wb_write_bit(wb, cm->color_range);
-    if (cm->profile == PROFILE_0) {
+    aom_wb_write_bit(wb, seq_params->color_range);
+    if (seq_params->profile == PROFILE_0) {
       // 420 only
-      assert(cm->subsampling_x == 1 && cm->subsampling_y == 1);
-    } else if (cm->profile == PROFILE_1) {
+      assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1);
+    } else if (seq_params->profile == PROFILE_1) {
       // 444 only
-      assert(cm->subsampling_x == 0 && cm->subsampling_y == 0);
-    } else if (cm->profile == PROFILE_2) {
-      if (cm->bit_depth == AOM_BITS_12) {
+      assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+    } else if (seq_params->profile == PROFILE_2) {
+      if (seq_params->bit_depth == AOM_BITS_12) {
         // 420, 444 or 422
-        aom_wb_write_bit(wb, cm->subsampling_x);
-        if (cm->subsampling_x == 0) {
-          assert(cm->subsampling_y == 0 &&
+        aom_wb_write_bit(wb, seq_params->subsampling_x);
+        if (seq_params->subsampling_x == 0) {
+          assert(seq_params->subsampling_y == 0 &&
                  "4:4:0 subsampling not allowed in AV1");
         } else {
-          aom_wb_write_bit(wb, cm->subsampling_y);
+          aom_wb_write_bit(wb, seq_params->subsampling_y);
         }
       } else {
         // 422 only
-        assert(cm->subsampling_x == 1 && cm->subsampling_y == 0);
+        assert(seq_params->subsampling_x == 1 &&
+               seq_params->subsampling_y == 0);
       }
     }
-    if (cm->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
-      assert(cm->subsampling_x == 0 && cm->subsampling_y == 0);
+    if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+      assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
     }
-    if (cm->subsampling_x == 1 && cm->subsampling_y == 1) {
-      aom_wb_write_literal(wb, cm->chroma_sample_position, 2);
+    if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) {
+      aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2);
     }
   }
-  aom_wb_write_bit(wb, cm->separate_uv_delta_q);
+  aom_wb_write_bit(wb, seq_params->separate_uv_delta_q);
 }
 
 static void write_timing_info_header(AV1_COMMON *const cm,
@@ -2517,8 +2525,8 @@ static void write_decoder_model_info(AV1_COMMON *const cm,
       wb, cm->buffer_model.encoder_decoder_buffer_delay_length - 1, 5);
   aom_wb_write_unsigned_literal(wb, cm->buffer_model.num_units_in_decoding_tick,
                                 32);  // Number of units in decoding tick
-  aom_wb_write_literal(wb, cm->buffer_model.buffer_removal_delay_length - 1, 5);
-  aom_wb_write_literal(wb, cm->buffer_model.frame_presentation_delay_length - 1,
+  aom_wb_write_literal(wb, cm->buffer_model.buffer_removal_time_length - 1, 5);
+  aom_wb_write_literal(wb, cm->buffer_model.frame_presentation_time_length - 1,
                        5);
 }
 
@@ -2533,23 +2541,25 @@ static void write_dec_model_op_parameters(AV1_COMMON *const cm,
   //  aom_wb_write_bit(wb, cm->op_params[op_num].has_parameters);
   //  if (!cm->op_params[op_num].has_parameters) return;
 
-  aom_wb_write_literal(wb, cm->op_params[op_num].decoder_buffer_delay,
-                       cm->buffer_model.encoder_decoder_buffer_delay_length);
+  aom_wb_write_unsigned_literal(
+      wb, cm->op_params[op_num].decoder_buffer_delay,
+      cm->buffer_model.encoder_decoder_buffer_delay_length);
 
-  aom_wb_write_literal(wb, cm->op_params[op_num].encoder_buffer_delay,
-                       cm->buffer_model.encoder_decoder_buffer_delay_length);
+  aom_wb_write_unsigned_literal(
+      wb, cm->op_params[op_num].encoder_buffer_delay,
+      cm->buffer_model.encoder_decoder_buffer_delay_length);
 
   aom_wb_write_bit(wb, cm->op_params[op_num].low_delay_mode_flag);
 
-  cm->op_frame_timing[op_num].buffer_removal_delay =
+  cm->op_frame_timing[op_num].buffer_removal_time =
       0;  // reset the decoded frame counter
 }
 
 static void write_tu_pts_info(AV1_COMMON *const cm,
                               struct aom_write_bit_buffer *wb) {
   aom_wb_write_unsigned_literal(
-      wb, (uint32_t)cm->tu_presentation_delay,
-      cm->buffer_model.frame_presentation_delay_length);
+      wb, cm->frame_presentation_time,
+      cm->buffer_model.frame_presentation_time_length);
 }
 
 static void write_film_grain_params(AV1_COMP *cpi,
@@ -2601,8 +2611,8 @@ static void write_film_grain_params(AV1_COMP *cpi,
     pars->chroma_scaling_from_luma = 0;  // for monochrome override to 0
 
   if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
-      ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) &&
-       (pars->num_y_points == 0))) {
+      ((cm->seq_params.subsampling_x == 1) &&
+       (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) {
     pars->num_cb_points = 0;
     pars->num_cr_points = 0;
   } else {
@@ -2931,18 +2941,19 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
                                           struct aom_write_bit_buffer *saved_wb,
                                           struct aom_write_bit_buffer *wb) {
   AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
   // NOTE: By default all coded frames to be used as a reference
   cm->is_reference_frame = 1;
   cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type;
 
-  if (cm->seq_params.still_picture) {
+  if (seq_params->still_picture) {
     assert(cm->show_existing_frame == 0);
     assert(cm->show_frame == 1);
     assert(cm->frame_type == KEY_FRAME);
   }
-  if (!cm->seq_params.reduced_still_picture_hdr) {
+  if (!seq_params->reduced_still_picture_hdr) {
     if (cm->show_existing_frame) {
       RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
       const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
@@ -2957,12 +2968,12 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
       aom_wb_write_bit(wb, 1);  // show_existing_frame
       aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
 
-      if (cm->seq_params.decoder_model_info_present_flag &&
+      if (seq_params->decoder_model_info_present_flag &&
           cm->timing_info.equal_picture_interval == 0) {
         write_tu_pts_info(cm, wb);
       }
-      if (cm->seq_params.frame_id_numbers_present_flag) {
-        int frame_id_len = cm->seq_params.frame_id_length;
+      if (seq_params->frame_id_numbers_present_flag) {
+        int frame_id_len = seq_params->frame_id_length;
         int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
         aom_wb_write_literal(wb, display_frame_id, frame_id_len);
       }
@@ -2983,7 +2994,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
 
     aom_wb_write_bit(wb, cm->show_frame);
     if (cm->show_frame) {
-      if (cm->seq_params.decoder_model_info_present_flag &&
+      if (seq_params->decoder_model_info_present_flag &&
           cm->timing_info.equal_picture_interval == 0)
         write_tu_pts_info(cm, wb);
     } else {
@@ -2997,18 +3008,18 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
   }
   aom_wb_write_bit(wb, cm->disable_cdf_update);
 
-  if (cm->seq_params.force_screen_content_tools == 2) {
+  if (seq_params->force_screen_content_tools == 2) {
     aom_wb_write_bit(wb, cm->allow_screen_content_tools);
   } else {
     assert(cm->allow_screen_content_tools ==
-           cm->seq_params.force_screen_content_tools);
+           seq_params->force_screen_content_tools);
   }
 
   if (cm->allow_screen_content_tools) {
-    if (cm->seq_params.force_integer_mv == 2) {
+    if (seq_params->force_integer_mv == 2) {
       aom_wb_write_bit(wb, cm->cur_frame_force_integer_mv);
     } else {
-      assert(cm->cur_frame_force_integer_mv == cm->seq_params.force_integer_mv);
+      assert(cm->cur_frame_force_integer_mv == seq_params->force_integer_mv);
     }
   } else {
     assert(cm->cur_frame_force_integer_mv == 0);
@@ -3018,53 +3029,57 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
   int frame_size_override_flag = 0;
   cm->frame_refs_short_signaling = 0;
 
-  if (cm->seq_params.reduced_still_picture_hdr) {
-    assert(cm->width == cm->seq_params.max_frame_width &&
-           cm->height == cm->seq_params.max_frame_height);
+  if (seq_params->reduced_still_picture_hdr) {
+    assert(cm->width == seq_params->max_frame_width &&
+           cm->height == seq_params->max_frame_height);
   } else {
-    if (cm->seq_params.frame_id_numbers_present_flag) {
-      int frame_id_len = cm->seq_params.frame_id_length;
+    if (seq_params->frame_id_numbers_present_flag) {
+      int frame_id_len = seq_params->frame_id_length;
       aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
     }
 
-    if (cm->width > cm->seq_params.max_frame_width ||
-        cm->height > cm->seq_params.max_frame_height) {
+    if (cm->width > seq_params->max_frame_width ||
+        cm->height > seq_params->max_frame_height) {
       aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Frame dimensions are larger than the maximum values");
     }
 
     frame_size_override_flag =
         frame_is_sframe(cm) ? 1
-                            : (cm->width != cm->seq_params.max_frame_width ||
-                               cm->height != cm->seq_params.max_frame_height);
+                            : (cm->width != seq_params->max_frame_width ||
+                               cm->height != seq_params->max_frame_height);
     if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag);
 
-    if (cm->seq_params.enable_order_hint)
+    if (seq_params->enable_order_hint)
       aom_wb_write_literal(wb, cm->frame_offset,
-                           cm->seq_params.order_hint_bits_minus_1 + 1);
+                           seq_params->order_hint_bits_minus_1 + 1);
 
     if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
       aom_wb_write_literal(wb, cm->primary_ref_frame, PRIMARY_REF_BITS);
     }
   }
 
-  if (cm->seq_params.decoder_model_info_present_flag) {
-    aom_wb_write_bit(wb, cm->buffer_removal_delay_present);
-    if (cm->buffer_removal_delay_present) {
+  if (seq_params->decoder_model_info_present_flag) {
+    aom_wb_write_bit(wb, cm->buffer_removal_time_present);
+    if (cm->buffer_removal_time_present) {
       for (int op_num = 0;
-           op_num < cm->seq_params.operating_points_cnt_minus_1 + 1; op_num++) {
+           op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
         if (cm->op_params[op_num].decoder_model_param_present_flag) {
-          if (((cm->seq_params.operating_point_idc[op_num] >>
+          if (((seq_params->operating_point_idc[op_num] >>
                 cm->temporal_layer_id) &
                    0x1 &&
-               (cm->seq_params.operating_point_idc[op_num] >>
+               (seq_params->operating_point_idc[op_num] >>
                 (cm->spatial_layer_id + 8)) &
                    0x1) ||
-              cm->seq_params.operating_point_idc[op_num] == 0) {
-            aom_wb_write_literal(
-                wb, (uint32_t)cm->op_frame_timing[op_num].buffer_removal_delay,
-                cm->buffer_model.buffer_removal_delay_length);
-            cm->op_frame_timing[op_num].buffer_removal_delay++;
+              seq_params->operating_point_idc[op_num] == 0) {
+            aom_wb_write_unsigned_literal(
+                wb, cm->op_frame_timing[op_num].buffer_removal_time,
+                cm->buffer_model.buffer_removal_time_length);
+            cm->op_frame_timing[op_num].buffer_removal_time++;
+            if (cm->op_frame_timing[op_num].buffer_removal_time == 0) {
+              aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                                 "buffer_removal_time overflowed");
+            }
           }
         }
       }
@@ -3122,7 +3137,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
 
   if (!frame_is_intra_only(cm) || cpi->refresh_frame_mask != 0xFF) {
     // Write all ref frame order hints if error_resilient_mode == 1
-    if (cm->error_resilient_mode && cm->seq_params.enable_order_hint) {
+    if (cm->error_resilient_mode && seq_params->enable_order_hint) {
       RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
       for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
         // Get buffer index
@@ -3131,7 +3146,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
 
         // Write order hint to bit stream
         aom_wb_write_literal(wb, frame_bufs[buf_idx].cur_frame_offset,
-                             cm->seq_params.order_hint_bits_minus_1 + 1);
+                             seq_params->order_hint_bits_minus_1 + 1);
       }
     }
   }
@@ -3156,7 +3171,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
       //       automatically.
 #define FRAME_REFS_SHORT_SIGNALING 0
 #if FRAME_REFS_SHORT_SIGNALING
-      cm->frame_refs_short_signaling = cm->seq_params.enable_order_hint;
+      cm->frame_refs_short_signaling = seq_params->enable_order_hint;
 #endif  // FRAME_REFS_SHORT_SIGNALING
 
       if (cm->frame_refs_short_signaling) {
@@ -3167,7 +3182,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
         check_frame_refs_short_signaling(cpi);
       }
 
-      if (cm->seq_params.enable_order_hint)
+      if (seq_params->enable_order_hint)
         aom_wb_write_bit(wb, cm->frame_refs_short_signaling);
 
       if (cm->frame_refs_short_signaling) {
@@ -3183,10 +3198,10 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
         if (!cm->frame_refs_short_signaling)
           aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
                                REF_FRAMES_LOG2);
-        if (cm->seq_params.frame_id_numbers_present_flag) {
+        if (seq_params->frame_id_numbers_present_flag) {
           int i = get_ref_frame_map_idx(cpi, ref_frame);
-          int frame_id_len = cm->seq_params.frame_id_length;
-          int diff_len = cm->seq_params.delta_frame_id_length;
+          int frame_id_len = seq_params->frame_id_length;
+          int diff_len = seq_params->delta_frame_id_length;
           int delta_frame_id_minus_1 =
               ((cm->current_frame_id - cm->ref_frame_id[i] +
                 (1 << frame_id_len)) %
@@ -3222,7 +3237,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
   }
 
   const int might_bwd_adapt =
-      !(cm->seq_params.reduced_still_picture_hdr) && !(cm->disable_cdf_update);
+      !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
   if (cm->large_scale_tile)
     cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
@@ -3282,7 +3297,8 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
 
   if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
 
-  if (cm->film_grain_params_present && (cm->show_frame || cm->showable_frame)) {
+  if (seq_params->film_grain_params_present &&
+      (cm->show_frame || cm->showable_frame)) {
     int flip_back_update_parameters_flag = 0;
     if (cm->frame_type != INTER_FRAME &&
         cm->film_grain_params.update_parameters == 0) {
@@ -3497,7 +3513,7 @@ static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
   struct aom_write_bit_buffer wb = { dst, 0 };
   uint32_t size = 0;
 
-  write_profile(cm->profile, &wb);
+  write_profile(cm->seq_params.profile, &wb);
 
   // Still picture or not
   aom_wb_write_bit(&wb, cm->seq_params.still_picture);
@@ -3551,9 +3567,9 @@ static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
   }
   write_sequence_header(cpi, &wb);
 
-  write_color_config(cm, &wb);
+  write_color_config(&cm->seq_params, &wb);
 
-  aom_wb_write_bit(&wb, cm->film_grain_params_present);
+  aom_wb_write_bit(&wb, cm->seq_params.film_grain_params_present);
 
   add_trailing_bits(&wb);
 
@@ -3960,7 +3976,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
   // The TD is now written outside the frame encode loop
 
   // write sequence header obu if KEY_FRAME, preceded by 4-byte size
-  if (cm->frame_type == KEY_FRAME) {
+  if (cm->frame_type == KEY_FRAME && cm->show_frame) {
     obu_header_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data);
 
     obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size);
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
index 13fc11c31..003e59e39 100644
--- a/third_party/aom/av1/encoder/block.h
+++ b/third_party/aom/av1/encoder/block.h
@@ -224,6 +224,7 @@ struct macroblock {
   int sadperbit4;
   int rdmult;
   int mb_energy;
+  int sb_energy_level;
   int *m_search_count_ptr;
   int *ex_search_count_ptr;
 
@@ -258,7 +259,6 @@ struct macroblock {
   MvLimits mv_limits;
 
   uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  uint8_t blk_skip_drl[MAX_MIB_SIZE * MAX_MIB_SIZE];
 
   int skip;
   int skip_chroma_rd;
diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c
index 0a57ebcfb..04088b25f 100644
--- a/third_party/aom/av1/encoder/dwt.c
+++ b/third_party/aom/av1/encoder/dwt.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include <assert.h>
 #include <stdlib.h>
 #include <math.h>
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
index 9a86db2f1..03318e5b7 100644
--- a/third_party/aom/av1/encoder/dwt.h
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include "av1/common/common.h"
 #include "av1/common/enums.h"
 
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
index 027b80a16..27ca53761 100644
--- a/third_party/aom/av1/encoder/encodeframe.c
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -41,7 +41,6 @@
 #include "av1/common/seg_common.h"
 #include "av1/common/tile_common.h"
 
-#include "av1/encoder/ab_partition_model_weights.h"
 #include "av1/encoder/aq_complexity.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/aq_variance.h"
@@ -54,6 +53,7 @@
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/ml.h"
+#include "av1/encoder/partition_model_weights.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/segmentation.h"
@@ -2099,7 +2099,7 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
   // When use_square_partition_only is true, make sure at least one square
   // partition is allowed by selecting the next smaller square size as
   // *min_block_size.
-  if (cpi->sf.use_square_partition_only) {
+  if (min_size >= cpi->sf.use_square_partition_only_threshold) {
     min_size = AOMMIN(min_size, next_square_size[max_size]);
   }
 
@@ -2363,6 +2363,7 @@ static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x,
 static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
   pc_tree->partitioning = PARTITION_NONE;
   pc_tree->cb_search_range = SEARCH_FULL_PLANE;
+  pc_tree->none.skip = 0;
 
   if (bsize >= BLOCK_8X8) {
     BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
@@ -2876,6 +2877,168 @@ static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
   }
 }
 
+#define FEATURES 18
+#define LABELS 4
+// Use an ML model to predict if horz4 and vert4 should be considered.
+static void ml_prune_4_partition(const AV1_COMP *const cpi,
+                                 const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                 int part_ctx, int64_t best_rd,
+                                 int64_t horz_rd[2], int64_t vert_rd[2],
+                                 int64_t split_rd[4],
+                                 int *const partition_horz4_allowed,
+                                 int *const partition_vert4_allowed) {
+  if (best_rd >= 1000000000) return;
+  const NN_CONFIG *nn_config = NULL;
+  switch (bsize) {
+    case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
+    case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break;
+    case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!nn_config) return;
+
+  aom_clear_system_state();
+
+  // Generate features.
+  float features[FEATURES];
+  int feature_index = 0;
+  features[feature_index++] = (float)part_ctx;
+  features[feature_index++] = (float)get_unsigned_bits(x->source_variance);
+
+  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+  int sub_block_rdcost[8] = { 0 };
+  int rd_index = 0;
+  for (int i = 0; i < 2; ++i) {
+    if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)horz_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)vert_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 4; ++i) {
+    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)split_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 8; ++i) {
+    // Ratio between the sub-block RD and the whole-block RD.
+    float rd_ratio = 1.0f;
+    if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+      rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+    features[feature_index++] = rd_ratio;
+  }
+
+  // Get variance of the 1:4 and 4:1 sub-blocks.
+  unsigned int horz_4_source_var[4] = { 0 };
+  unsigned int vert_4_source_var[4] = { 0 };
+  {
+    BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+    BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+    const int src_stride = x->plane[0].src.stride;
+    const uint8_t *src = x->plane[0].src.buf;
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    for (int i = 0; i < 4; ++i) {
+      const uint8_t *horz_src =
+          src + i * block_size_high[horz_4_bs] * src_stride;
+      const uint8_t *vert_src = src + i * block_size_wide[vert_4_bs];
+      unsigned int horz_var, vert_var, sse;
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        switch (xd->bd) {
+          case 10:
+            horz_var = cpi->fn_ptr[horz_4_bs].vf(
+                horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10),
+                0, &sse);
+            vert_var = cpi->fn_ptr[vert_4_bs].vf(
+                vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10),
+                0, &sse);
+            break;
+          case 12:
+            horz_var = cpi->fn_ptr[horz_4_bs].vf(
+                horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12),
+                0, &sse);
+            vert_var = cpi->fn_ptr[vert_4_bs].vf(
+                vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12),
+                0, &sse);
+            break;
+          case 8:
+          default:
+            horz_var = cpi->fn_ptr[horz_4_bs].vf(
+                horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8),
+                0, &sse);
+            vert_var = cpi->fn_ptr[vert_4_bs].vf(
+                vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8),
+                0, &sse);
+            break;
+        }
+        horz_4_source_var[i] =
+            ROUND_POWER_OF_TWO(horz_var, num_pels_log2_lookup[horz_4_bs]);
+        vert_4_source_var[i] =
+            ROUND_POWER_OF_TWO(vert_var, num_pels_log2_lookup[vert_4_bs]);
+      } else {
+        horz_var = cpi->fn_ptr[horz_4_bs].vf(horz_src, src_stride, AV1_VAR_OFFS,
+                                             0, &sse);
+        vert_var = cpi->fn_ptr[vert_4_bs].vf(vert_src, src_stride, AV1_VAR_OFFS,
+                                             0, &sse);
+        horz_4_source_var[i] =
+            ROUND_POWER_OF_TWO(horz_var, num_pels_log2_lookup[horz_4_bs]);
+        vert_4_source_var[i] =
+            ROUND_POWER_OF_TWO(vert_var, num_pels_log2_lookup[vert_4_bs]);
+      }
+    }
+  }
+
+  const float denom = (float)(x->source_variance + 1);
+  const float low_b = 0.1f;
+  const float high_b = 10.0f;
+  for (int i = 0; i < 4; ++i) {
+    // Ratio between the 4:1 sub-block variance and the whole-block variance.
+    float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features[feature_index++] = var_ratio;
+  }
+  for (int i = 0; i < 4; ++i) {
+    // Ratio between the 1:4 sub-block variance and the whole-block variance.
+    float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features[feature_index++] = var_ratio;
+  }
+  assert(feature_index == FEATURES);
+
+  // Calculate scores using the NN model.
+  float score[LABELS] = { 0.0f };
+  av1_nn_predict(features, nn_config, score);
+  int int_score[LABELS];
+  int max_score = -1000;
+  for (int i = 0; i < LABELS; ++i) {
+    int_score[i] = (int)(100 * score[i]);
+    max_score = AOMMAX(int_score[i], max_score);
+  }
+
+  // Make decisions based on the model scores.
+  int thresh = max_score;
+  switch (bsize) {
+    case BLOCK_16X16: thresh -= 400; break;
+    case BLOCK_32X32: thresh -= 400; break;
+    case BLOCK_64X64: thresh -= 100; break;
+    default: break;
+  }
+  *partition_horz4_allowed = 0;
+  *partition_vert4_allowed = 0;
+  for (int i = 0; i < LABELS; ++i) {
+    if (int_score[i] >= thresh) {
+      if ((i >> 0) & 1) *partition_horz4_allowed = 1;
+      if ((i >> 1) & 1) *partition_vert4_allowed = 1;
+    }
+  }
+}
+#undef FEATURES
+#undef LABELS
+
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
@@ -3003,7 +3166,8 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
     partition_vert_allowed &= partition_allowed || !has_cols;
     do_square_split &= bsize > min_size;
   }
-  if (cpi->sf.use_square_partition_only) {
+
+  if (bsize > cpi->sf.use_square_partition_only_threshold) {
     partition_horz_allowed &= !has_rows;
     partition_vert_allowed &= !has_cols;
   }
@@ -3480,13 +3644,6 @@ BEGIN_PARTITION_SEARCH:
   const int ext_partition_allowed =
       do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
 
-  // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
-  // PARTITION_VERT_4 for this block. This is almost the same as
-  // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks,
-  // so we require that bsize is not BLOCK_128X128.
-  const int partition4_allowed =
-      ext_partition_allowed && bsize != BLOCK_128X128;
-
   // The standard AB partitions are allowed whenever ext-partition-types are
   // allowed
   int horzab_partition_allowed = ext_partition_allowed;
@@ -3642,15 +3799,34 @@ BEGIN_PARTITION_SEARCH:
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
-  // PARTITION_HORZ_4
+  // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
+  // PARTITION_VERT_4 for this block. This is almost the same as
+  // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks,
+  // so we require that bsize is not BLOCK_128X128.
+  const int partition4_allowed =
+      ext_partition_allowed && bsize != BLOCK_128X128;
   int partition_horz4_allowed = partition4_allowed && partition_horz_allowed;
+  int partition_vert4_allowed = partition4_allowed && partition_vert_allowed;
   if (cpi->sf.prune_ext_partition_types_search_level == 2) {
     partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
                                 pc_tree->partitioning == PARTITION_HORZ_A ||
                                 pc_tree->partitioning == PARTITION_HORZ_B ||
                                 pc_tree->partitioning == PARTITION_SPLIT ||
                                 pc_tree->partitioning == PARTITION_NONE);
+    partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+                                pc_tree->partitioning == PARTITION_VERT_A ||
+                                pc_tree->partitioning == PARTITION_VERT_B ||
+                                pc_tree->partitioning == PARTITION_SPLIT ||
+                                pc_tree->partitioning == PARTITION_NONE);
   }
+  if (cpi->sf.ml_prune_4_partition && partition4_allowed &&
+      partition_horz_allowed && partition_vert_allowed) {
+    ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning, best_rdc.rdcost,
+                         horz_rd, vert_rd, split_rd, &partition_horz4_allowed,
+                         &partition_vert4_allowed);
+  }
+
+  // PARTITION_HORZ_4
   if (partition_horz4_allowed && has_rows &&
       (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
     av1_init_rd_stats(&sum_rdc);
@@ -3687,14 +3863,6 @@ BEGIN_PARTITION_SEARCH:
   }
 
   // PARTITION_VERT_4
-  int partition_vert4_allowed = partition4_allowed && partition_vert_allowed;
-  if (cpi->sf.prune_ext_partition_types_search_level == 2) {
-    partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
-                                pc_tree->partitioning == PARTITION_VERT_A ||
-                                pc_tree->partitioning == PARTITION_VERT_B ||
-                                pc_tree->partitioning == PARTITION_SPLIT ||
-                                pc_tree->partitioning == PARTITION_NONE);
-  }
   if (partition_vert4_allowed && has_cols &&
       (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step))) {
     av1_init_rd_stats(&sum_rdc);
@@ -3857,6 +4025,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
     }
     xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv;
 
+    x->sb_energy_level = 0;
     if (cm->delta_q_present_flag) {
       // Delta-q modulation based on variance
       av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
@@ -3865,11 +4034,13 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
       if (DELTAQ_MODULATION == 1) {
         const int block_wavelet_energy_level =
             av1_block_wavelet_energy_level(cpi, x, cm->seq_params.sb_size);
+        x->sb_energy_level = block_wavelet_energy_level;
         offset_qindex = av1_compute_deltaq_from_energy_level(
             cpi, block_wavelet_energy_level);
       } else {
         const int block_var_level =
             av1_block_energy(cpi, x, cm->seq_params.sb_size);
+        x->sb_energy_level = block_var_level;
         offset_qindex =
             av1_compute_deltaq_from_energy_level(cpi, block_var_level);
       }
@@ -3943,6 +4114,8 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
       x->use_cb_search_range = 0;
       init_first_partition_pass_stats_tables(x->first_partition_pass_stats);
       if (cpi->sf.two_pass_partition_search &&
+          cpi->sf.use_square_partition_only_threshold <
+              cm->seq_params.sb_size &&
           mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows &&
           mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols &&
           cm->frame_type != KEY_FRAME) {
@@ -4030,7 +4203,8 @@ static void init_encode_frame_mb_context(AV1_COMP *cpi) {
   // Copy data over into macro block data structures.
   av1_setup_src_planes(x, cpi->source, 0, 0, num_planes);
 
-  av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y, num_planes);
+  av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
+                         cm->seq_params.subsampling_y, num_planes);
 }
 
 static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
@@ -4116,8 +4290,8 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
   TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
   int mi_row;
 
-  av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end,
-                         tile_row);
+  av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
+                         tile_info->mi_col_end, tile_row);
   av1_init_above_context(cm, &td->mb.e_mbd, tile_row);
 
   // Set up pointers to per thread motion search counters.
@@ -4128,7 +4302,7 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
   this_tile->tctx = *cm->fc;
   td->mb.e_mbd.tile_ctx = &this_tile->tctx;
 
-  cfl_init(&td->mb.e_mbd.cfl, cm);
+  cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
 
   av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator);
 
@@ -4263,25 +4437,24 @@ static int is_screen_content(const uint8_t *src, int use_hbd, int bd,
   return counts * blk_h * blk_w * 10 > width * height;
 }
 
+static const uint8_t ref_frame_flag_list[REF_FRAMES] = { 0,
+                                                         AOM_LAST_FLAG,
+                                                         AOM_LAST2_FLAG,
+                                                         AOM_LAST3_FLAG,
+                                                         AOM_GOLD_FLAG,
+                                                         AOM_BWD_FLAG,
+                                                         AOM_ALT2_FLAG,
+                                                         AOM_ALT_FLAG };
+
 // Enforce the number of references for each arbitrary frame limited to
 // (INTER_REFS_PER_FRAME - 1)
 static void enforce_max_ref_frames(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  static const int flag_list[REF_FRAMES] = { 0,
-                                             AOM_LAST_FLAG,
-                                             AOM_LAST2_FLAG,
-                                             AOM_LAST3_FLAG,
-                                             AOM_GOLD_FLAG,
-                                             AOM_BWD_FLAG,
-                                             AOM_ALT2_FLAG,
-                                             AOM_ALT_FLAG };
   MV_REFERENCE_FRAME ref_frame;
   int total_valid_refs = 0;
-
-  (void)flag_list;
-
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) total_valid_refs++;
+    if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])
+      total_valid_refs++;
   }
 
   // NOTE(zoeliu): When all the possible reference frames are availble, we
@@ -4617,7 +4790,6 @@ static void encode_frame_internal(AV1_COMP *cpi) {
   cm->prev_mi = cm->allow_ref_frame_mvs ? cm->prev_mip : NULL;
 
   x->txb_split_count = 0;
-  av1_zero(x->blk_skip_drl);
 
   av1_zero(rdc->global_motion_used);
   av1_zero(cpi->gmparams_cost);
@@ -4672,8 +4844,9 @@ static void encode_frame_internal(AV1_COMP *cpi) {
           }
 
           compute_global_motion_feature_based(
-              model, cpi->source, ref_buf[frame], cpi->common.bit_depth,
-              inliers_by_motion, params_by_motion, RANSAC_NUM_MOTIONS);
+              model, cpi->source, ref_buf[frame],
+              cpi->common.seq_params.bit_depth, inliers_by_motion,
+              params_by_motion, RANSAC_NUM_MOTIONS);
 
           for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
             if (inliers_by_motion[i] == 0) continue;
@@ -4734,6 +4907,15 @@ static void encode_frame_internal(AV1_COMP *cpi) {
           cpi->gmtype_cost[cm->global_motion[frame].wmtype] -
           cpi->gmtype_cost[IDENTITY];
     }
+    // clear disabled ref_frames
+    for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+      const int ref_disabled =
+          !(cpi->ref_frame_flags & ref_frame_flag_list[frame]);
+      if (ref_disabled && cpi->sf.recode_loop != DISALLOW_RECODE) {
+        cpi->gmparams_cost[frame] = 0;
+        cm->global_motion[frame] = default_warp_params;
+      }
+    }
     cpi->global_motion_search_done = 1;
   }
   memcpy(cm->cur_frame->global_motion, cm->global_motion,
@@ -5082,8 +5264,9 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
   }
 
   if (!is_inter) {
-    xd->cfl.is_chroma_reference = is_chroma_reference(
-        mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
+    xd->cfl.is_chroma_reference =
+        is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+                            cm->seq_params.subsampling_y);
     xd->cfl.store_y = store_cfl_required(cm, xd);
     mbmi->skip = 1;
     for (int plane = 0; plane < num_planes; ++plane) {
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
index 196e18d8a..13ea32e38 100644
--- a/third_party/aom/av1/encoder/encoder.c
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -56,6 +56,11 @@
 #include "av1/encoder/grain_test_vectors.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/noise_model.h"
+#endif
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
@@ -290,7 +295,8 @@ static void setup_frame(AV1_COMP *cpi) {
       cm->fb_of_context_type[i] = -1;
     }
     cm->fb_of_context_type[REGULAR_FRAME] =
-        get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+        cm->show_frame ? get_ref_frame_map_idx(cpi, GOLDEN_FRAME)
+                       : get_ref_frame_map_idx(cpi, ALTREF_FRAME);
     cm->frame_context_idx = REGULAR_FRAME;
   } else {
     const GF_GROUP *gf_group = &cpi->twopass.gf_group;
@@ -315,7 +321,7 @@ static void setup_frame(AV1_COMP *cpi) {
     }
   }
 
-  if (cm->frame_type == KEY_FRAME) {
+  if (cm->frame_type == KEY_FRAME && cm->show_frame) {
     cpi->refresh_golden_frame = 1;
     cpi->refresh_alt_ref_frame = 1;
     av1_zero(cpi->interp_filter_selected);
@@ -344,19 +350,20 @@ static void setup_frame(AV1_COMP *cpi) {
 
 static void enc_setup_mi(AV1_COMMON *cm) {
   int i;
+  int mi_rows_sb_aligned = calc_mi_size(cm->mi_rows);
   cm->mi = cm->mip;
-  memset(cm->mip, 0, cm->mi_stride * cm->mi_rows * sizeof(*cm->mip));
+  memset(cm->mip, 0, cm->mi_stride * mi_rows_sb_aligned * sizeof(*cm->mip));
   cm->prev_mi = cm->prev_mip;
   // Clear top border row
   memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
   // Clear left border column
-  for (i = 0; i < cm->mi_rows; ++i)
+  for (i = 0; i < mi_rows_sb_aligned; ++i)
     memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
   cm->mi_grid_visible = cm->mi_grid_base;
   cm->prev_mi_grid_visible = cm->prev_mi_grid_base;
 
   memset(cm->mi_grid_base, 0,
-         cm->mi_stride * cm->mi_rows * sizeof(*cm->mi_grid_base));
+         cm->mi_stride * mi_rows_sb_aligned * sizeof(*cm->mi_grid_base));
 }
 
 static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
@@ -441,32 +448,32 @@ static void update_film_grain_parameters(struct AV1_COMP *cpi,
   AV1_COMMON *const cm = &cpi->common;
   cpi->oxcf = *oxcf;
 
-  if (cm->film_grain_table) {
-    aom_film_grain_table_free(cm->film_grain_table);
-    aom_free(cm->film_grain_table);
+  if (cpi->film_grain_table) {
+    aom_film_grain_table_free(cpi->film_grain_table);
+    aom_free(cpi->film_grain_table);
+    cpi->film_grain_table = NULL;
   }
-  cm->film_grain_table = 0;
 
   if (oxcf->film_grain_test_vector) {
-    cm->film_grain_params_present = 1;
+    cm->seq_params.film_grain_params_present = 1;
     if (cm->frame_type == KEY_FRAME) {
       memcpy(&cm->film_grain_params,
              film_grain_test_vectors + oxcf->film_grain_test_vector - 1,
              sizeof(cm->film_grain_params));
 
-      cm->film_grain_params.bit_depth = cm->bit_depth;
-      if (cm->color_range == AOM_CR_FULL_RANGE) {
+      cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+      if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
         cm->film_grain_params.clip_to_restricted_range = 0;
       }
     }
   } else if (oxcf->film_grain_table_filename) {
-    cm->film_grain_table = aom_malloc(sizeof(*cm->film_grain_table));
-    memset(cm->film_grain_table, 0, sizeof(aom_film_grain_table_t));
+    cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+    memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t));
 
-    aom_film_grain_table_read(cm->film_grain_table,
+    aom_film_grain_table_read(cpi->film_grain_table,
                               oxcf->film_grain_table_filename, &cm->error);
   } else {
-    cm->film_grain_params_present = 0;
+    cm->seq_params.film_grain_params_present = 0;
     memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
   }
 }
@@ -523,6 +530,17 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
   av1_free_pc_tree(&cpi->td, num_planes);
 
   aom_free(cpi->td.mb.palette_buffer);
+
+#if CONFIG_DENOISE
+  if (cpi->denoise_and_model) {
+    aom_denoise_and_model_free(cpi->denoise_and_model);
+    cpi->denoise_and_model = NULL;
+  }
+#endif
+  if (cpi->film_grain_table) {
+    aom_film_grain_table_free(cpi->film_grain_table);
+    cpi->film_grain_table = NULL;
+  }
 }
 
 static void save_coding_context(AV1_COMP *cpi) {
@@ -596,8 +614,8 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
       seg->update_map = 1;
       seg->update_data = 1;
 
-      qi_delta =
-          av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth);
+      qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
+                                    cm->seq_params.bit_depth);
       av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
       av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
       av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
@@ -621,8 +639,8 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
         seg->update_map = 0;
         seg->update_data = 1;
 
-        qi_delta =
-            av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, cm->bit_depth);
+        qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125,
+                                      cm->seq_params.bit_depth);
         av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
         av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
 
@@ -705,53 +723,58 @@ static void update_reference_segmentation_map(AV1_COMP *cpi) {
 
 static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
 
   if (!cpi->lookahead)
-    cpi->lookahead = av1_lookahead_init(
-        oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y,
-        cm->use_highbitdepth, oxcf->lag_in_frames);
+    cpi->lookahead =
+        av1_lookahead_init(oxcf->width, oxcf->height, seq_params->subsampling_x,
+                           seq_params->subsampling_y,
+                           seq_params->use_highbitdepth, oxcf->lag_in_frames);
   if (!cpi->lookahead)
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
 
   // TODO(agrange) Check if ARF is enabled and skip allocation if not.
-  if (aom_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                               cm->byte_alignment, NULL, NULL, NULL))
+  if (aom_realloc_frame_buffer(
+          &cpi->alt_ref_buffer, oxcf->width, oxcf->height,
+          seq_params->subsampling_x, seq_params->subsampling_y,
+          seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
 
 static void alloc_util_frame_buffers(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  if (aom_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                               cm->byte_alignment, NULL, NULL, NULL))
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  if (aom_realloc_frame_buffer(
+          &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
   if (aom_realloc_frame_buffer(
           &cpi->trial_frame_rst, cm->superres_upscaled_width,
-          cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y,
-          cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
-          NULL, NULL))
+          cm->superres_upscaled_height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate trial restored frame buffer");
 
-  if (aom_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                               cm->byte_alignment, NULL, NULL, NULL))
+  if (aom_realloc_frame_buffer(
+          &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
-  if (aom_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                               cm->byte_alignment, NULL, NULL, NULL))
+  if (aom_realloc_frame_buffer(
+          &cpi->scaled_last_source, cm->width, cm->height,
+          seq_params->subsampling_x, seq_params->subsampling_y,
+          seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate scaled last source buffer");
 }
@@ -846,8 +869,6 @@ static void init_buffer_indices(AV1_COMP *cpi) {
   int fb_idx;
   for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
     cpi->ref_fb_idx[fb_idx] = fb_idx;
-  for (fb_idx = 0; fb_idx < MAX_EXT_ARFS + 1; ++fb_idx)
-    cpi->arf_map[fb_idx] = LAST_REF_FRAMES + 2 + fb_idx;
   cpi->rate_index = 0;
   cpi->rate_size = 0;
   cpi->cur_poc = -1;
@@ -941,7 +962,8 @@ static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm,
     // Set the maximum parameters for bitrate and buffer size for this profile,
     // level, and tier
     cm->op_params[i].bitrate = max_level_bitrate(
-        cm->profile, major_minor_to_seq_level_idx(seq->level[i]), seq->tier[i]);
+        cm->seq_params.profile, major_minor_to_seq_level_idx(seq->level[i]),
+        seq->tier[i]);
     // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
     // check
     if (cm->op_params[i].bitrate == 0)
@@ -1006,15 +1028,15 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
   cpi->oxcf = *oxcf;
   cpi->framerate = oxcf->init_framerate;
 
-  cm->profile = oxcf->profile;
-  cm->bit_depth = oxcf->bit_depth;
-  cm->use_highbitdepth = oxcf->use_highbitdepth;
-  cm->color_primaries = oxcf->color_primaries;
-  cm->transfer_characteristics = oxcf->transfer_characteristics;
-  cm->matrix_coefficients = oxcf->matrix_coefficients;
+  cm->seq_params.profile = oxcf->profile;
+  cm->seq_params.bit_depth = oxcf->bit_depth;
+  cm->seq_params.use_highbitdepth = oxcf->use_highbitdepth;
+  cm->seq_params.color_primaries = oxcf->color_primaries;
+  cm->seq_params.transfer_characteristics = oxcf->transfer_characteristics;
+  cm->seq_params.matrix_coefficients = oxcf->matrix_coefficients;
   cm->seq_params.monochrome = oxcf->monochrome;
-  cm->chroma_sample_position = oxcf->chroma_sample_position;
-  cm->color_range = oxcf->color_range;
+  cm->seq_params.chroma_sample_position = oxcf->chroma_sample_position;
+  cm->seq_params.color_range = oxcf->color_range;
   cm->timing_info_present = oxcf->timing_info_present;
   cm->timing_info.num_units_in_display_tick =
       oxcf->timing_info.num_units_in_display_tick;
@@ -1032,7 +1054,7 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
     // set the decoder model parameters in schedule mode
     cm->buffer_model.num_units_in_decoding_tick =
         oxcf->buffer_model.num_units_in_decoding_tick;
-    cm->buffer_removal_delay_present = 1;
+    cm->buffer_removal_time_present = 1;
     set_aom_dec_model_info(&cm->buffer_model);
     set_dec_model_op_parameters(&cm->op_params[0]);
   } else if (cm->timing_info_present &&
@@ -1365,8 +1387,8 @@ MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
 
 static void highbd_set_var_fns(AV1_COMP *const cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  if (cm->use_highbitdepth) {
-    switch (cm->bit_depth) {
+  if (cm->seq_params.use_highbitdepth) {
+    switch (cm->seq_params.bit_depth) {
       case AOM_BITS_8:
         HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8,
                    aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16,
@@ -2226,7 +2248,7 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
 
       default:
         assert(0 &&
-               "cm->bit_depth should be AOM_BITS_8, "
+               "cm->seq_params.bit_depth should be AOM_BITS_8, "
                "AOM_BITS_10 or AOM_BITS_12");
     }
   }
@@ -2253,20 +2275,22 @@ static void realloc_segmentation_maps(AV1_COMP *cpi) {
 
 void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = &cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   RATE_CONTROL *const rc = &cpi->rc;
   MACROBLOCK *const x = &cpi->td.mb;
 
-  if (cm->profile != oxcf->profile) cm->profile = oxcf->profile;
-  cm->bit_depth = oxcf->bit_depth;
-  cm->color_primaries = oxcf->color_primaries;
-  cm->transfer_characteristics = oxcf->transfer_characteristics;
-  cm->matrix_coefficients = oxcf->matrix_coefficients;
-  cm->seq_params.monochrome = oxcf->monochrome;
-  cm->chroma_sample_position = oxcf->chroma_sample_position;
-  cm->color_range = oxcf->color_range;
+  if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
+  seq_params->bit_depth = oxcf->bit_depth;
+  seq_params->color_primaries = oxcf->color_primaries;
+  seq_params->transfer_characteristics = oxcf->transfer_characteristics;
+  seq_params->matrix_coefficients = oxcf->matrix_coefficients;
+  seq_params->monochrome = oxcf->monochrome;
+  seq_params->chroma_sample_position = oxcf->chroma_sample_position;
+  seq_params->color_range = oxcf->color_range;
 
-  assert(IMPLIES(cm->profile <= PROFILE_1, cm->bit_depth <= AOM_BITS_10));
+  assert(IMPLIES(seq_params->profile <= PROFILE_1,
+                 seq_params->bit_depth <= AOM_BITS_10));
 
   cm->timing_info_present = oxcf->timing_info_present;
   cm->timing_info.num_units_in_display_tick =
@@ -2277,20 +2301,20 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   cm->timing_info.num_ticks_per_picture =
       oxcf->timing_info.num_ticks_per_picture;
 
-  cm->seq_params.display_model_info_present_flag =
+  seq_params->display_model_info_present_flag =
       oxcf->display_model_info_present_flag;
-  cm->seq_params.decoder_model_info_present_flag =
+  seq_params->decoder_model_info_present_flag =
       oxcf->decoder_model_info_present_flag;
   if (oxcf->decoder_model_info_present_flag) {
     // set the decoder model parameters in schedule mode
     cm->buffer_model.num_units_in_decoding_tick =
         oxcf->buffer_model.num_units_in_decoding_tick;
-    cm->buffer_removal_delay_present = 1;
+    cm->buffer_removal_time_present = 1;
     set_aom_dec_model_info(&cm->buffer_model);
     set_dec_model_op_parameters(&cm->op_params[0]);
   } else if (cm->timing_info_present &&
              cm->timing_info.equal_picture_interval &&
-             !cm->seq_params.decoder_model_info_present_flag) {
+             !seq_params->decoder_model_info_present_flag) {
     // set the decoder model parameters in resource availability mode
     set_resource_availability_parameters(&cm->op_params[0]);
   } else {
@@ -2302,7 +2326,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
 
   cpi->oxcf = *oxcf;
   cpi->common.options = oxcf->cfg;
-  x->e_mbd.bd = (int)cm->bit_depth;
+  x->e_mbd.bd = (int)seq_params->bit_depth;
   x->e_mbd.global_motion = cm->global_motion;
 
   if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
@@ -2360,15 +2384,15 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   cm->width = cpi->oxcf.width;
   cm->height = cpi->oxcf.height;
 
-  int sb_size = cm->seq_params.sb_size;
+  int sb_size = seq_params->sb_size;
   // Superblock size should not be updated after the first key frame.
   if (!cpi->seq_params_locked) {
     set_sb_size(&cm->seq_params, select_sb_size(cpi));
   }
 
-  if (cpi->initial_width || sb_size != cm->seq_params.sb_size) {
+  if (cpi->initial_width || sb_size != seq_params->sb_size) {
     if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
-        cm->seq_params.sb_size != sb_size) {
+        seq_params->sb_size != sb_size) {
       av1_free_context_buffers(cm);
       av1_free_pc_tree(&cpi->td, num_planes);
       alloc_compressor_data(cpi);
@@ -2395,7 +2419,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   // Init sequence level coding tools
   // This should not be called after the first key frame.
   if (!cpi->seq_params_locked) {
-    cm->seq_params.operating_points_cnt_minus_1 =
+    seq_params->operating_points_cnt_minus_1 =
         cm->number_spatial_layers > 1 ? cm->number_spatial_layers - 1 : 0;
     init_seq_coding_tools(&cm->seq_params, cm, oxcf);
   }
@@ -2411,6 +2435,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
 
   av1_zero(*cpi);
 
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
   if (setjmp(cm->error.jmp)) {
     cm->error.setjmp = 0;
     av1_remove_compressor(cpi);
@@ -3082,28 +3109,52 @@ static void check_show_existing_frame(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const FRAME_UPDATE_TYPE next_frame_update_type =
       gf_group->update_type[gf_group->index];
+#if USE_SYMM_MULTI_LAYER
+  const int which_arf = (cpi->new_bwdref_update_rule == 1)
+                            ? gf_group->arf_update_idx[gf_group->index] > 0
+                            : gf_group->arf_update_idx[gf_group->index];
+#else
   const int which_arf = gf_group->arf_update_idx[gf_group->index];
+#endif
 
   if (cm->show_existing_frame == 1) {
     cm->show_existing_frame = 0;
   } else if (cpi->rc.is_last_bipred_frame) {
-    // NOTE: If the current frame is a last bi-predictive frame, it is
-    //       needed next to show the BWDREF_FRAME, which is pointed by
-    //       the last_fb_idxes[0] after reference frame buffer update
-    cpi->rc.is_last_bipred_frame = 0;
-    cm->show_existing_frame = 1;
-    cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[0];
+#if USE_SYMM_MULTI_LAYER
+    // NOTE: When new structure is used, every bwdref will have one overlay
+    //       frame. Therefore, there is no need to find out which frame to
+    //       show in advance.
+    if (cpi->new_bwdref_update_rule == 0) {
+#endif
+      // NOTE: If the current frame is a last bi-predictive frame, it is
+      //       needed next to show the BWDREF_FRAME, which is pointed to by
+      //       ref_fb_idx[0] after the reference frame buffer update.
+      cpi->rc.is_last_bipred_frame = 0;
+      cm->show_existing_frame = 1;
+      cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[0];
+#if USE_SYMM_MULTI_LAYER
+    }
+#endif
   } else if (cpi->is_arf_filter_off[which_arf] &&
              (next_frame_update_type == OVERLAY_UPDATE ||
               next_frame_update_type == INTNL_OVERLAY_UPDATE)) {
+#if USE_SYMM_MULTI_LAYER
+    const int bwdref_to_show =
+        (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
+#else
+    const int bwdref_to_show = ALTREF2_FRAME;
+#endif
     // Other parameters related to OVERLAY_UPDATE will be taken care of
     // in av1_rc_get_second_pass_params(cpi)
     cm->show_existing_frame = 1;
     cpi->rc.is_src_frame_alt_ref = 1;
     cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE)
                                        ? cpi->ref_fb_idx[ALTREF_FRAME - 1]
-                                       : cpi->ref_fb_idx[ALTREF2_FRAME - 1];
-    cpi->is_arf_filter_off[which_arf] = 0;
+                                       : cpi->ref_fb_idx[bwdref_to_show - 1];
+#if USE_SYMM_MULTI_LAYER
+    if (cpi->new_bwdref_update_rule == 0)
+#endif
+      cpi->is_arf_filter_off[which_arf] = 0;
   }
   cpi->rc.is_src_frame_ext_arf = 0;
 }
@@ -3288,6 +3339,48 @@ static INLINE void shift_last_ref_frames(AV1_COMP *cpi) {
   }
 }
 
+#if USE_SYMM_MULTI_LAYER
+// Shifts the virtual indices (and the matching interp_filter_selected rows)
+// of the backward reference frames as follows:
+// BWDREF_FRAME -> ALTREF2_FRAME -> EXTREF_FRAME
+// which frees the BWDREF_FRAME slot for the closest bwdref.
+static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) {
+  // TODO(isbs): shift the scaled indices as well
+  static const int ordered_bwd[3] = { BWDREF_FRAME - 1, ALTREF2_FRAME - 1,
+                                      EXTREF_FRAME - 1 };
+
+  for (int i = 2; i > 0; --i) {
+    cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]];
+
+    // [0] is allocated to the current coded frame, i.e. bwdref
+    memcpy(
+        cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
+        cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME],
+        sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME]));
+  }
+}
+
+// Shifts the virtual indices (and the matching interp_filter_selected rows)
+// of the backward reference frames as follows:
+// BWDREF_FRAME <- ALTREF2_FRAME <- EXTREF_FRAME
+// to update the bwd reference frame for coding the next frame.
+static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) {
+  // TODO(isbs): shift the scaled indices as well
+  static const int ordered_bwd[3] = { BWDREF_FRAME - 1, ALTREF2_FRAME - 1,
+                                      EXTREF_FRAME - 1 };
+
+  for (int i = 0; i < 2; ++i) {
+    cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]];
+
+    // [0] is allocated to the current coded frame, i.e. bwdref
+    memcpy(
+        cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
+        cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME],
+        sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME]));
+  }
+}
+#endif  // USE_SYMM_MULTI_LAYER
+
 #if USE_GF16_MULTI_LAYER
 static void update_reference_frames_gf16(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
@@ -3343,7 +3436,9 @@ static void update_reference_frames(AV1_COMP *cpi) {
   // At this point the new frame has been encoded.
   // If any buffer copy / swapping is signaled it should be done here.
 
-  if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) {
+  // Only update all of the reference buffers if a KEY_FRAME is also a
+  // show_frame. This ensures a forward keyframe does not refresh every buffer.
+  if ((cm->frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) {
     for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
       ref_cnt_fb(pool->frame_bufs,
                  &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]],
@@ -3370,37 +3465,49 @@ static void update_reference_frames(AV1_COMP *cpi) {
     cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
     cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp;
 
-    // We need to modify the mapping accordingly
-    cpi->arf_map[0] = cpi->ref_fb_idx[ALTREF_FRAME - 1];
     // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
     // cpi->interp_filter_selected[GOLDEN_FRAME]?
   } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) {
+#if CONFIG_DEBUG
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE);
+#endif
+#if USE_SYMM_MULTI_LAYER
+    const int bwdref_to_show =
+        (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
+#else
+    const int bwdref_to_show = ALTREF2_FRAME;
+#endif
     // Deal with the special case for showing existing internal ALTREF_FRAME
     // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
     // by updating the virtual indices.
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    const int which_arf = gf_group->arf_ref_idx[gf_group->index];
-    assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE);
-
     const int tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
     shift_last_ref_frames(cpi);
 
-    cpi->ref_fb_idx[LAST_FRAME - 1] = cpi->ref_fb_idx[ALTREF2_FRAME - 1];
-    cpi->ref_fb_idx[ALTREF2_FRAME - 1] = tmp;
-    // We need to modify the mapping accordingly
-    cpi->arf_map[which_arf] = cpi->ref_fb_idx[ALTREF2_FRAME - 1];
+    cpi->ref_fb_idx[LAST_FRAME - 1] = cpi->ref_fb_idx[bwdref_to_show - 1];
 
     memcpy(cpi->interp_filter_selected[LAST_FRAME],
-           cpi->interp_filter_selected[ALTREF2_FRAME],
-           sizeof(cpi->interp_filter_selected[ALTREF2_FRAME]));
+           cpi->interp_filter_selected[bwdref_to_show],
+           sizeof(cpi->interp_filter_selected[bwdref_to_show]));
+#if USE_SYMM_MULTI_LAYER
+    if (cpi->new_bwdref_update_rule == 1) {
+      lshift_bwd_ref_frames(cpi);
+      // Recycle the outdated forward reference frame (the previous LAST3)
+      // into the slot vacated by the shift.
+      cpi->ref_fb_idx[EXTREF_FRAME - 1] = tmp;
+    } else {
+#endif
+      cpi->ref_fb_idx[bwdref_to_show - 1] = tmp;
+#if USE_SYMM_MULTI_LAYER
+    }
+#endif
   } else { /* For non key/golden frames */
     // === ALTREF_FRAME ===
     if (cpi->refresh_alt_ref_frame) {
       int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
-      int which_arf = 0;
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
 
-      memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
+      memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
     }
@@ -3418,10 +3525,25 @@ static void update_reference_frames(AV1_COMP *cpi) {
 
     // === BWDREF_FRAME ===
     if (cpi->refresh_bwd_ref_frame) {
-      ref_cnt_fb(pool->frame_bufs,
-                 &cm->ref_frame_map[cpi->ref_fb_idx[BWDREF_FRAME - 1]],
-                 cm->new_fb_idx);
-
+#if USE_SYMM_MULTI_LAYER
+      if (cpi->new_bwdref_update_rule) {
+        // We shift the backward reference frame as follows:
+        // BWDREF -> ALTREF2 -> EXTREF
+        // and assign the newly coded frame to BWDREF so that it always
+        // keeps the nearest future frame
+        int tmp = cpi->ref_fb_idx[EXTREF_FRAME - 1];
+        ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[tmp], cm->new_fb_idx);
+
+        rshift_bwd_ref_frames(cpi);
+        cpi->ref_fb_idx[BWDREF_FRAME - 1] = tmp;
+      } else {
+#endif  // USE_SYMM_MULTI_LAYER
+        ref_cnt_fb(pool->frame_bufs,
+                   &cm->ref_frame_map[cpi->ref_fb_idx[BWDREF_FRAME - 1]],
+                   cm->new_fb_idx);
+#if USE_SYMM_MULTI_LAYER
+      }
+#endif
       memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
@@ -3486,7 +3608,14 @@ static void update_reference_frames(AV1_COMP *cpi) {
            cpi->interp_filter_selected[0],
            sizeof(cpi->interp_filter_selected[0]));
 
+    // If the new structure is used, we will always have overlay frames coupled
+    // with bwdref frames. Therefore, we won't have to perform this update
+    // in advance (we do this update when the overlay frame shows up).
+#if USE_SYMM_MULTI_LAYER
+    if (cpi->new_bwdref_update_rule == 0 && cpi->rc.is_last_bipred_frame) {
+#else
     if (cpi->rc.is_last_bipred_frame) {
+#endif
       // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the
       // LAST3_FRAME by updating the virtual indices.
       //
@@ -3555,13 +3684,14 @@ static void scale_references(AV1_COMP *cpi) {
         if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
             new_fb_ptr->buf.y_crop_height != cm->height) {
           if (aom_realloc_frame_buffer(
-                  &new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x,
-                  cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                  &new_fb_ptr->buf, cm->width, cm->height,
+                  cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+                  cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
                   cm->byte_alignment, NULL, NULL, NULL))
             aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
-          av1_resize_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth,
-                                      num_planes);
+          av1_resize_and_extend_frame(
+              ref, &new_fb_ptr->buf, (int)cm->seq_params.bit_depth, num_planes);
           cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
         }
@@ -3706,13 +3836,14 @@ static void init_ref_frame_bufs(AV1_COMMON *cm) {
 static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
                                 int subsampling_x, int subsampling_y) {
   AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = &cm->seq_params;
 
-  if (!cpi->initial_width || cm->use_highbitdepth != use_highbitdepth ||
-      cm->subsampling_x != subsampling_x ||
-      cm->subsampling_y != subsampling_y) {
-    cm->subsampling_x = subsampling_x;
-    cm->subsampling_y = subsampling_y;
-    cm->use_highbitdepth = use_highbitdepth;
+  if (!cpi->initial_width || seq_params->use_highbitdepth != use_highbitdepth ||
+      seq_params->subsampling_x != subsampling_x ||
+      seq_params->subsampling_y != subsampling_y) {
+    seq_params->subsampling_x = subsampling_x;
+    seq_params->subsampling_y = subsampling_y;
+    seq_params->use_highbitdepth = use_highbitdepth;
 
     alloc_raw_frame_buffers(cpi);
     init_ref_frame_bufs(cm);
@@ -3730,8 +3861,9 @@ static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
 static int set_size_literal(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
-  check_initial_width(cpi, cm->use_highbitdepth, cm->subsampling_x,
-                      cm->subsampling_y);
+  check_initial_width(cpi, cm->seq_params.use_highbitdepth,
+                      cm->seq_params.subsampling_x,
+                      cm->seq_params.subsampling_y);
 
   if (width <= 0 || height <= 0) return 1;
 
@@ -3753,6 +3885,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) {
 
 static void set_frame_size(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int ref_frame;
@@ -3782,17 +3915,19 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
   }
 
   // Reset the frame pointers to the current frame size.
-  if (aom_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                               cm->byte_alignment, NULL, NULL, NULL))
+  if (aom_realloc_frame_buffer(
+          get_frame_new_buffer(cm), cm->width, cm->height,
+          seq_params->subsampling_x, seq_params->subsampling_y,
+          seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
 
   const int frame_width = cm->superres_upscaled_width;
   const int frame_height = cm->superres_upscaled_height;
-  set_restoration_unit_size(frame_width, frame_height, cm->subsampling_x,
-                            cm->subsampling_y, cm->rst_info);
+  set_restoration_unit_size(frame_width, frame_height,
+                            seq_params->subsampling_x,
+                            seq_params->subsampling_y, cm->rst_info);
   for (int i = 0; i < num_planes; ++i)
     cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
 
@@ -4038,16 +4173,16 @@ static void superres_post_encode(AV1_COMP *cpi) {
     // av1_superres_upscale
     if (aom_realloc_frame_buffer(
             &cpi->scaled_source, cm->superres_upscaled_width,
-            cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y,
-            cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment,
-            NULL, NULL, NULL))
+            cm->superres_upscaled_height, cm->seq_params.subsampling_x,
+            cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth,
+            AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to reallocate scaled source buffer for superres");
     assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width);
     assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height);
     av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source,
-                                (int)cm->bit_depth, num_planes);
+                                (int)cm->seq_params.bit_depth, num_planes);
     cpi->source = &cpi->scaled_source;
   }
 }
@@ -4331,7 +4466,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
         int64_t high_err_target = cpi->ambient_err;
         int64_t low_err_target = cpi->ambient_err >> 1;
 
-        if (cm->use_highbitdepth) {
+        if (cm->seq_params.use_highbitdepth) {
           kf_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
         } else {
           kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
@@ -4574,7 +4709,11 @@ static void set_ext_overrides(AV1_COMP *cpi) {
     cpi->ext_refresh_frame_flags_pending = 0;
   }
   cpi->common.allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs;
-  cpi->common.error_resilient_mode = cpi->ext_use_error_resilient;
+  // A keyframe is already error resilient, and a keyframe with
+  // error_resilient_mode set interferes with the use of show_existing_frame
+  // when forward reference keyframes are enabled.
+  cpi->common.error_resilient_mode =
+      cpi->ext_use_error_resilient && cpi->common.frame_type != KEY_FRAME;
 }
 
 static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
@@ -4725,10 +4864,17 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
 }
 #endif  // DUMP_RECON_FRAMES
 
+static INLINE int is_frame_droppable(AV1_COMP *cpi) {
+  return !cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame &&
+         !cpi->refresh_bwd_ref_frame && !cpi->refresh_golden_frame &&
+         !cpi->refresh_last_frame;  // 1 iff none of these five flags is set
+}
+
 static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
                                      int skip_adapt,
                                      unsigned int *frame_flags) {
   AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = &cm->seq_params;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
 
@@ -4744,7 +4890,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
 
   cm->large_scale_tile = cpi->oxcf.large_scale_tile;
   cm->single_tile_decoding = cpi->oxcf.single_tile_decoding;
-  if (cm->large_scale_tile) cm->seq_params.frame_id_numbers_present_flag = 0;
+  if (cm->large_scale_tile) seq_params->frame_id_numbers_present_flag = 0;
 
   cm->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
   // cm->allow_ref_frame_mvs needs to be written into the frame header while
@@ -4756,7 +4902,8 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
       cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm);
 
   // Reset the frame packet stamp index.
-  if (cm->frame_type == KEY_FRAME) cm->current_video_frame = 0;
+  if (cm->frame_type == KEY_FRAME && cm->show_frame)
+    cm->current_video_frame = 0;
 
   // NOTE:
   // (1) Move the setup of the ref_frame_flags upfront as it would be
@@ -4770,7 +4917,11 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   if (cm->show_existing_frame) {
     // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
     //               BWDREF_FRAME in the reference frame buffer.
-    cm->frame_type = INTER_FRAME;
+    if (cm->frame_type == KEY_FRAME) {
+      cm->reset_decoder_state = 1;
+    } else {
+      cm->frame_type = INTER_FRAME;
+    }
     cm->show_frame = 1;
     cpi->frame_flags = *frame_flags;
 
@@ -4839,6 +4990,10 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
       av1_rc_postencode_update(cpi, *size);
     }
 
+    // Decrement count down till next gf
+    if (cpi->rc.frames_till_gf_update_due > 0)
+      cpi->rc.frames_till_gf_update_due--;
+
     ++cm->current_video_frame;
 
     return AOM_CODEC_OK;
@@ -4889,7 +5044,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
          MAX_MODES * sizeof(*cpi->mode_chosen_counts));
 #endif
 
-  if (cm->seq_params.frame_id_numbers_present_flag) {
+  if (seq_params->frame_id_numbers_present_flag) {
     /* Non-normative definition of current_frame_id ("frame counter" with
      * wraparound) */
     const int frame_id_length = FRAME_ID_LENGTH;
@@ -4935,7 +5090,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
           (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
       break;
   }
-  cm->timing_info_present &= !cm->seq_params.reduced_still_picture_hdr;
+  cm->timing_info_present &= !seq_params->reduced_still_picture_hdr;
 
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
     if (encode_without_recode_loop(cpi) != AOM_CODEC_OK) return AOM_CODEC_ERROR;
@@ -4957,7 +5112,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   // fixed interval. Note the reconstruction error if it is the frame before
   // the force key frame
   if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
-    if (cm->use_highbitdepth) {
+    if (seq_params->use_highbitdepth) {
       cpi->ambient_err =
           aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
     } else {
@@ -4966,17 +5121,19 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   }
 
   // If the encoder forced a KEY_FRAME decision or if frame is an S_FRAME
-  if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) {
+  if ((cm->frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) {
     cpi->refresh_last_frame = 1;
   }
 
   cm->frame_to_show = get_frame_new_buffer(cm);
-  cm->frame_to_show->color_primaries = cm->color_primaries;
-  cm->frame_to_show->transfer_characteristics = cm->transfer_characteristics;
-  cm->frame_to_show->matrix_coefficients = cm->matrix_coefficients;
-  cm->frame_to_show->monochrome = cm->seq_params.monochrome;
-  cm->frame_to_show->chroma_sample_position = cm->chroma_sample_position;
-  cm->frame_to_show->color_range = cm->color_range;
+  cm->frame_to_show->color_primaries = seq_params->color_primaries;
+  cm->frame_to_show->transfer_characteristics =
+      seq_params->transfer_characteristics;
+  cm->frame_to_show->matrix_coefficients = seq_params->matrix_coefficients;
+  cm->frame_to_show->monochrome = seq_params->monochrome;
+  cm->frame_to_show->chroma_sample_position =
+      seq_params->chroma_sample_position;
+  cm->frame_to_show->color_range = seq_params->color_range;
   cm->frame_to_show->render_width = cm->render_width;
   cm->frame_to_show->render_height = cm->render_height;
 
@@ -5014,7 +5171,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
 
   if (skip_adapt) return AOM_CODEC_OK;
 
-  if (cm->seq_params.frame_id_numbers_present_flag) {
+  if (seq_params->frame_id_numbers_present_flag) {
     int i;
     // Update reference frame id values based on the value of refresh_frame_mask
     for (i = 0; i < REF_FRAMES; i++) {
@@ -5085,6 +5242,19 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   cm->seg.update_data = 0;
   cm->lf.mode_ref_delta_update = 0;
 
+  // A droppable frame might not be shown but it always
+  // takes a space in the gf group. Therefore, even when
+  // it is not shown, we still need to update the countdown.
+
+  // TODO(weitinglin): This is a work-around to handle the condition
+  // when a frame is dropped. We should fix the cm->show_frame flag
+  // instead of checking the other condition to update the counter properly.
+  if (cm->show_frame || is_frame_droppable(cpi)) {
+    // Decrement count down till next gf
+    if (cpi->rc.frames_till_gf_update_due > 0)
+      cpi->rc.frames_till_gf_update_due--;
+  }
+
   if (cm->show_frame) {
     // TODO(zoeliu): We may only swamp mi and prev_mi for those frames that
     // are
@@ -5092,6 +5262,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
     swap_mi_and_prev_mi(cm);
     // Don't increment frame counters if this was an altref buffer
     // update not a real frame
+
     ++cm->current_video_frame;
   }
 
@@ -5160,10 +5331,45 @@ static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   return AOM_CODEC_OK;
 }
 
+#if CONFIG_DENOISE
+// Runs denoise-and-model on |sd|. Returns 0, or -1 on allocation failure.
+static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
+                            int block_size, float noise_level,
+                            int64_t time_stamp, int64_t end_time) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (cpi->denoise_and_model == NULL) {
+    cpi->denoise_and_model = aom_denoise_and_model_alloc(
+        cm->seq_params.bit_depth, block_size, noise_level);
+    if (cpi->denoise_and_model == NULL) {
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Error allocating denoise and model");
+      return -1;
+    }
+  }
+  if (cpi->film_grain_table == NULL) {
+    cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+    if (cpi->film_grain_table == NULL) {
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Error allocating grain table");
+      return -1;
+    }
+    memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table));
+  }
+  if (aom_denoise_and_model_run(cpi->denoise_and_model, sd,
+                                &cm->film_grain_params) &&
+      cm->film_grain_params.apply_grain) {
+    aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time,
+                                &cm->film_grain_params);
+  }
+  return 0;
+}
+#endif
+
 int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
   AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
   struct aom_usec_timer timer;
   int res = 0;
   const int subsampling_x = sd->subsampling_x;
@@ -5174,25 +5380,33 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
 
   aom_usec_timer_start(&timer);
 
+#if CONFIG_DENOISE
+  if (cpi->oxcf.noise_level > 0)
+    if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size,
+                         cpi->oxcf.noise_level, time_stamp, end_time) < 0)
+      res = -1;
+#endif  //  CONFIG_DENOISE
+
   if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
                          use_highbitdepth, frame_flags))
     res = -1;
   aom_usec_timer_mark(&timer);
   cpi->time_receive_data += aom_usec_timer_elapsed(&timer);
 
-  if ((cm->profile == PROFILE_0) && !cm->seq_params.monochrome &&
+  if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
       (subsampling_x != 1 || subsampling_y != 1)) {
     aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
                        "Non-4:2:0 color format requires profile 1 or 2");
     res = -1;
   }
-  if ((cm->profile == PROFILE_1) &&
+  if ((seq_params->profile == PROFILE_1) &&
       !(subsampling_x == 0 && subsampling_y == 0)) {
     aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
                        "Profile 1 requires 4:4:4 color format");
     res = -1;
   }
-  if ((cm->profile == PROFILE_2) && (cm->bit_depth <= AOM_BITS_10) &&
+  if ((seq_params->profile == PROFILE_2) &&
+      (seq_params->bit_depth <= AOM_BITS_10) &&
       !(subsampling_x == 1 && subsampling_y == 0)) {
     aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
                        "Profile 2 bit-depth < 10 requires 4:2:2 color format");
@@ -5364,9 +5578,9 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
 #endif
   cpi->bytes += frame_bytes;
 
-  if (cm->use_highbitdepth) {
+  if (cm->seq_params.use_highbitdepth) {
     in_bit_depth = cpi->oxcf.input_bit_depth;
-    bit_depth = cm->bit_depth;
+    bit_depth = cm->seq_params.bit_depth;
   }
   if (cm->show_frame) {
     const YV12_BUFFER_CONFIG *orig = cpi->source;
@@ -5387,7 +5601,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
       cpi->total_samples += psnr.samples[0];
       samples = psnr.samples[0];
       // TODO(yaowu): unify these two versions into one.
-      if (cm->use_highbitdepth)
+      if (cm->seq_params.use_highbitdepth)
         frame_ssim2 =
             aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth);
       else
@@ -5412,7 +5626,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
 #endif
     }
     if (cpi->b_calculate_blockiness) {
-      if (!cm->use_highbitdepth) {
+      if (!cm->seq_params.use_highbitdepth) {
         const double frame_blockiness =
             av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
                                recon->y_stride, orig->y_width, orig->y_height);
@@ -5421,7 +5635,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
       }
 
       if (cpi->b_calculate_consistency) {
-        if (!cm->use_highbitdepth) {
+        if (!cm->seq_params.use_highbitdepth) {
           const double this_inconsistency = aom_get_ssim_metrics(
               orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
               orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1);
@@ -5622,18 +5836,17 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   if (oxcf->large_scale_tile)
     cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
-  cpi->refresh_last_frame = 1;
-  cpi->refresh_golden_frame = 0;
-  cpi->refresh_bwd_ref_frame = 0;
-  cpi->refresh_alt2_ref_frame = 0;
-  cpi->refresh_alt_ref_frame = 0;
+  // default reference buffers update config
+  av1_configure_buffer_updates_firstpass(cpi, LF_UPDATE);
 
-  // TODO(zoeliu@gmail.com): To support forward-KEY_FRAME and set up the
-  //                         following flag accordingly.
+  // Initialize fields related to forward keyframes
+  cpi->no_show_kf = 0;
   cm->reset_decoder_state = 0;
 
   // Don't allow a show_existing_frame to coincide with an error resilient or
-  // S-Frame
+  // S-Frame. An exception can be made in the case of a keyframe, since it
+  // does not depend on any previous frames. We must make this exception here
+  // because of the use of show_existing_frame with forward coded keyframes.
   struct lookahead_entry *lookahead_src = NULL;
   if (cm->current_video_frame > 0)
     lookahead_src = av1_lookahead_peek(cpi->lookahead, 0);
@@ -5641,7 +5854,8 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
       ((cpi->oxcf.error_resilient_mode |
         ((lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT) != 0)) ||
        (cpi->oxcf.s_frame_mode |
-        ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0)))) {
+        ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0))) &&
+      !(rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) {
     cm->show_existing_frame = 0;
   }
 
@@ -5719,22 +5933,29 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
     if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
       cm->showable_frame = 1;
       cpi->alt_ref_source = source;
-
-      if (oxcf->arnr_max_frames > 0) {
-        // Produce the filtered ARF frame.
-        av1_temporal_filter(cpi, arf_src_index);
-        aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
-        force_src_buffer = &cpi->alt_ref_buffer;
+      // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
+      if (arf_src_index == rc->frames_to_key) {
+        // Skip temporal filtering and flag a no-show keyframe for a fwd_kf
+        const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+        int which_arf = gf_group->arf_update_idx[gf_group->index];
+        cpi->is_arf_filter_off[which_arf] = 1;
+        cpi->no_show_kf = 1;
+      } else {
+        if (oxcf->arnr_max_frames > 0) {
+          // Produce the filtered ARF frame.
+          av1_temporal_filter(cpi, arf_src_index);
+          aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
+          force_src_buffer = &cpi->alt_ref_buffer;
+        }
       }
-
       cm->show_frame = 0;
       cm->intra_only = 0;
-      cpi->refresh_alt_ref_frame = 1;
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      rc->is_src_frame_alt_ref = 0;
+
+      if (oxcf->pass < 2) {
+        // In the second pass, the buffer update configuration is set in
+        // av1_rc_get_second_pass_params() instead.
+        av1_configure_buffer_updates_firstpass(cpi, ARF_UPDATE);
+      }
     }
     rc->source_alt_ref_pending = 0;
   }
@@ -5771,13 +5992,12 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
 
       cm->show_frame = 0;
       cm->intra_only = 0;
-      cpi->refresh_alt2_ref_frame = 1;
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-      rc->is_src_frame_alt_ref = 0;
-      rc->is_src_frame_ext_arf = 0;
+
+      if (oxcf->pass < 2) {
+        // In the second pass, the buffer update configuration is set in
+        // av1_rc_get_second_pass_params() instead.
+        av1_configure_buffer_updates_firstpass(cpi, INTNL_ARF_UPDATE);
+      }
     }
     rc->source_alt_ref_pending = 0;
   }
@@ -5791,13 +6011,11 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
       cm->show_frame = 0;
       cm->intra_only = 0;
 
-      cpi->refresh_bwd_ref_frame = 1;
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      rc->is_bwd_ref_frame = 1;
+      if (oxcf->pass < 2) {
+        // In the second pass, the buffer update configuration is set in
+        // av1_rc_get_second_pass_params() instead.
+        av1_configure_buffer_updates_firstpass(cpi, BIPRED_UPDATE);
+      }
     }
   }
 
@@ -5865,16 +6083,18 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
   cm->cur_frame->buf.buf_8bit_valid = 0;
 
-  if (cm->film_grain_table) {
-    cm->film_grain_params_present = aom_film_grain_table_lookup(
-        cm->film_grain_table, *time_stamp, *time_end, 0 /* erase */,
+  if (cpi->film_grain_table) {
+    cm->seq_params.film_grain_params_present = aom_film_grain_table_lookup(
+        cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
         &cm->film_grain_params);
   }
-  cm->cur_frame->film_grain_params_present = cm->film_grain_params_present;
+  cm->cur_frame->film_grain_params_present =
+      cm->seq_params.film_grain_params_present;
 
   // only one operating point supported now
-  cpi->common.tu_presentation_delay =
-      ticks_to_timebase_units(timebase, *time_stamp);
+  const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp);
+  if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
+  cpi->common.frame_presentation_time = (uint32_t)pts64;
 
   // Start with a 0 size frame.
   *size = 0;
@@ -6004,8 +6224,8 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
       *dest = *cm->frame_to_show;
       dest->y_width = cm->width;
       dest->y_height = cm->height;
-      dest->uv_width = cm->width >> cm->subsampling_x;
-      dest->uv_height = cm->height >> cm->subsampling_y;
+      dest->uv_width = cm->width >> cm->seq_params.subsampling_x;
+      dest->uv_height = cm->height >> cm->seq_params.subsampling_y;
       ret = 0;
     } else {
       ret = -1;
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
index 5212db2b1..2b7ab711d 100644
--- a/third_party/aom/av1/encoder/encoder.h
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -41,6 +41,9 @@
 #include "aom_dsp/ssim.h"
 #endif
 #include "aom_dsp/variance.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/noise_model.h"
+#endif
 #include "aom/internal/aom_codec_internal.h"
 #include "aom_util/aom_thread.h"
 
@@ -277,7 +280,7 @@ typedef struct AV1EncoderConfig {
   aom_timing_info_t timing_info;
   int decoder_model_info_present_flag;
   int display_model_info_present_flag;
-  int buffer_removal_delay_present;
+  int buffer_removal_time_present;
   aom_dec_model_info_t buffer_model;
   aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
   aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
@@ -301,6 +304,11 @@ typedef struct AV1EncoderConfig {
   int allow_warped_motion;
   int enable_superres;
   unsigned int save_as_annexb;
+
+#if CONFIG_DENOISE
+  float noise_level;
+  int noise_block_size;
+#endif
 } AV1EncoderConfig;
 
 static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
@@ -472,6 +480,7 @@ typedef struct AV1_COMP {
   AV1EncoderConfig oxcf;
   struct lookahead_ctx *lookahead;
   struct lookahead_entry *alt_ref_source;
+  int no_show_kf;
 
   int optimize_speed_feature;
   int optimize_seg_arr[MAX_SEGMENTS];
@@ -504,6 +513,9 @@ typedef struct AV1_COMP {
   int refresh_bwd_ref_frame;
   int refresh_alt2_ref_frame;
   int refresh_alt_ref_frame;
+#if USE_SYMM_MULTI_LAYER
+  int new_bwdref_update_rule;
+#endif
 
   int ext_refresh_frame_flags_pending;
   int ext_refresh_last_frame;
@@ -666,7 +678,6 @@ typedef struct AV1_COMP {
   int existing_fb_idx_to_show;
   int is_arf_filter_off[MAX_EXT_ARFS + 1];
   int num_extra_arfs;
-  int arf_map[MAX_EXT_ARFS + 1];
   int arf_pos_in_gf[MAX_EXT_ARFS + 1];
   int arf_pos_for_ovrly[MAX_EXT_ARFS + 1];
   int global_motion_search_done;
@@ -687,6 +698,11 @@ typedef struct AV1_COMP {
   AV1LfSync lf_row_sync;
   AV1LrSync lr_row_sync;
   AV1LrStruct lr_ctxt;
+
+  aom_film_grain_table_t *film_grain_table;
+#if CONFIG_DENOISE
+  struct aom_denoise_and_model_t *denoise_and_model;
+#endif
 } AV1_COMP;
 
 void av1_initialize_enc(void);
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
index 4d4802b46..81f360733 100644
--- a/third_party/aom/av1/encoder/encodetxb.c
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -792,9 +792,8 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
 }
 
 int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
-                        const int plane, const int blk_row, const int blk_col,
-                        const int block, const TX_SIZE tx_size,
-                        const TXB_CTX *const txb_ctx) {
+                        const int plane, const int block, const TX_SIZE tx_size,
+                        const TX_TYPE tx_type, const TXB_CTX *const txb_ctx) {
   const struct macroblock_plane *p = &x->plane[plane];
   const int eob = p->eobs[block];
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
@@ -806,8 +805,6 @@ int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
   }
 
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
   const TX_CLASS tx_class = tx_type_to_class[tx_type];
 
 #define WAREHOUSE_EFFICIENTS_TXB_CASE(tx_class_literal)                        \
@@ -1583,9 +1580,14 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
   const int64_t rdmult =
       ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
        2) >>
-      (sharpness + (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
-                        ? 7 - mbmi->segment_id
-                        : 2));
+      (sharpness +
+       (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
+            ? 7 - mbmi->segment_id
+            : 2) +
+       (cpi->oxcf.aq_mode != VARIANCE_AQ &&
+                cpi->oxcf.deltaq_mode > NO_DELTA_Q && x->sb_energy_level < 0
+            ? (3 - x->sb_energy_level)
+            : 0));
 
   uint8_t levels_buf[TX_PAD_2D];
   uint8_t *const levels = set_levels(levels_buf, width);
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
index aa847ad62..0442cc613 100644
--- a/third_party/aom/av1/encoder/encodetxb.h
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -50,9 +50,8 @@ typedef struct TxbInfo {
 void av1_alloc_txb_buf(AV1_COMP *cpi);
 void av1_free_txb_buf(AV1_COMP *cpi);
 int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
-                        const int plane, const int blk_row, const int blk_col,
-                        const int block, const TX_SIZE tx_size,
-                        const TXB_CTX *const txb_ctx);
+                        const int plane, const int block, const TX_SIZE tx_size,
+                        const TX_TYPE tx_type, const TXB_CTX *const txb_ctx);
 void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
                           aom_writer *w, int blk_row, int blk_col, int plane,
                           TX_SIZE tx_size, const tran_low_t *tcoeff,
@@ -77,9 +76,10 @@ void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
                           int mi_row, int mi_col);
 
 void hbt_destroy();
-int av1_optimize_txb_new(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                          int block, TX_SIZE tx_size, TX_TYPE tx_type,
-                         const TXB_CTX *txb_ctx, int *rate_cost, int sharpness);
+                         const TXB_CTX *const txb_ctx, int *rate_cost,
+                         int sharpness);
 #ifdef __cplusplus
 }
 #endif
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
index 404af2e7c..637d6824c 100644
--- a/third_party/aom/av1/encoder/ethread.c
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -44,7 +44,7 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
     av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
   }
 
-  return 0;
+  return 1;
 }
 
 void av1_encode_tiles_mt(AV1_COMP *cpi) {
@@ -126,12 +126,11 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
 
   for (i = 0; i < num_workers; i++) {
     AVxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *thread_data;
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
 
     worker->hook = (AVxWorkerHook)enc_worker_hook;
-    worker->data1 = &cpi->tile_thr_data[i];
+    worker->data1 = thread_data;
     worker->data2 = NULL;
-    thread_data = (EncWorkerData *)worker->data1;
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
index 113c068c1..ef0800c79 100644
--- a/third_party/aom/av1/encoder/firstpass.c
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -486,6 +486,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   int mb_row, mb_col;
   MACROBLOCK *const x = &cpi->td.mb;
   AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   TileInfo tile;
@@ -524,7 +525,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   double intra_factor;
   double brightness_factor;
   BufferPool *const pool = cm->buffer_pool;
-  const int qindex = find_fp_qindex(cm->bit_depth);
+  const int qindex = find_fp_qindex(seq_params->bit_depth);
   const int mb_scale = mi_size_wide[BLOCK_16X16];
 
   int *raw_motion_err_list;
@@ -555,11 +556,11 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   set_first_pass_params(cpi);
   av1_set_quantizer(cm, qindex);
 
-  av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y,
-                         num_planes);
+  av1_setup_block_planes(&x->e_mbd, seq_params->subsampling_x,
+                         seq_params->subsampling_y, num_planes);
 
   av1_setup_src_planes(x, cpi->source, 0, 0, num_planes);
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, new_yv12, 0, 0, 0,
+  av1_setup_dst_planes(xd->plane, seq_params->sb_size, new_yv12, 0, 0, 0,
                        num_planes);
 
   if (!frame_is_intra_only(cm)) {
@@ -654,14 +655,14 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
         image_data_start_row = mb_row;
       }
 
-      if (cm->use_highbitdepth) {
-        switch (cm->bit_depth) {
+      if (seq_params->use_highbitdepth) {
+        switch (seq_params->bit_depth) {
           case AOM_BITS_8: break;
           case AOM_BITS_10: this_error >>= 4; break;
           case AOM_BITS_12: this_error >>= 8; break;
           default:
             assert(0 &&
-                   "cm->bit_depth should be AOM_BITS_8, "
+                   "seq_params->bit_depth should be AOM_BITS_8, "
                    "AOM_BITS_10 or AOM_BITS_12");
             return;
         }
@@ -674,7 +675,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
       else
         intra_factor += 1.0;
 
-      if (cm->use_highbitdepth)
+      if (seq_params->use_highbitdepth)
         level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
       else
         level_sample = x->plane[0].src.buf[0];
@@ -1156,10 +1157,10 @@ static int get_twopass_worst_quality(const AV1_COMP *cpi,
     for (q = rc->best_quality; q < rc->worst_quality; ++q) {
       const double factor = calc_correction_factor(
           av_err_per_mb, ERR_DIVISOR - ediv_size_correction, FACTOR_PT_LOW,
-          FACTOR_PT_HIGH, q, cpi->common.bit_depth);
+          FACTOR_PT_HIGH, q, cpi->common.seq_params.bit_depth);
       const int bits_per_mb = av1_rc_bits_per_mb(
           INTER_FRAME, q, factor * speed_term * group_weight_factor,
-          cpi->common.bit_depth);
+          cpi->common.seq_params.bit_depth);
       if (bits_per_mb <= target_norm_bits_per_mb) break;
     }
 
@@ -1377,7 +1378,7 @@ static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame,
                                double this_frame_mv_in_out, double max_boost) {
   double frame_boost;
   const double lq = av1_convert_qindex_to_q(
-      cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
+      cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth);
   const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
   int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
                                                        : cpi->common.MBs;
@@ -2130,6 +2131,319 @@ static void define_gf_group_structure_16(AV1_COMP *cpi) {
 }
 #endif  // USE_GF16_MULTI_LAYER
 
+#if USE_SYMM_MULTI_LAYER
+// Debug helper: appends the parameters of each gf_group entry to GF_PARAMS.txt.
+void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
+                        int frame_nums) {
+  static const char *update_type_strings[] = {
+    "KF_UPDATE",          "LF_UPDATE",      "GF_UPDATE",
+    "ARF_UPDATE",         "OVERLAY_UPDATE", "BRF_UPDATE",
+    "LAST_BIPRED_UPDATE", "BIPRED_UPDATE",  "INTNL_OVERLAY_UPDATE",
+    "INTNL_ARF_UPDATE"
+  };
+  FILE *fid = fopen("GF_PARAMS.txt", "a");
+  if (fid == NULL) return;  // log file unavailable: skip the dump, don't crash
+  fprintf(fid, "\n{%d}\n", gf_interval);
+  for (int i = 0; i <= frame_nums; ++i)
+    fprintf(fid, "%s %d %d %d %d\n",
+            update_type_strings[gf_group->update_type[i]],
+            gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
+            gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
+  fclose(fid);
+}
+
+// Derives the rf_level for a frame from its update type: ARF_UPDATE maps to
+// GF_ARF_STD, BRF_UPDATE and INTNL_ARF_UPDATE map to GF_ARF_LOW, and every
+// other update type maps to INTER_NORMAL.
+static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
+  int rf_level = INTER_NORMAL;
+
+  if (update_type == ARF_UPDATE) {
+    rf_level = GF_ARF_STD;
+  } else if (update_type == BRF_UPDATE || update_type == INTNL_ARF_UPDATE) {
+    rf_level = GF_ARF_LOW;
+  }
+
+  return rf_level;
+}
+
+// Recursively fills the gf_group entries for the frames between l and r,
+// writing at *frame_ind and advancing it; recursion ends when r - l == 2.
+static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
+                                   int *frame_ind, int arf_ind, int level) {
+  if (r - l == 2) {
+    // leaf node, not a look-ahead frame
+    gf_group->update_type[*frame_ind] = LF_UPDATE;
+    gf_group->arf_src_offset[*frame_ind] = 0;
+    gf_group->arf_pos_in_gf[*frame_ind] = 0;
+    gf_group->arf_update_idx[*frame_ind] = arf_ind;
+    gf_group->pyramid_level[*frame_ind] = level;
+    ++(*frame_ind);
+  } else {
+    int m = (l + r) / 2;
+    int arf_pos_in_gf = *frame_ind;
+    gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+    gf_group->arf_src_offset[*frame_ind] = m - l - 1;
+    gf_group->arf_pos_in_gf[*frame_ind] = 0;
+    gf_group->arf_update_idx[*frame_ind] = 1;  // mark all internal ARF 1
+    gf_group->pyramid_level[*frame_ind] = level;
+    ++(*frame_ind);
+    // set parameters for frames displayed before this frame
+    set_multi_layer_params(gf_group, l, m, frame_ind, 1, level - 1);
+
+    // for overlay frames, we need to record the position of its corresponding
+    // arf frames for bit allocation
+    gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+    gf_group->arf_src_offset[*frame_ind] = 0;
+    gf_group->arf_pos_in_gf[*frame_ind] = arf_pos_in_gf;
+    gf_group->arf_update_idx[*frame_ind] = 1;
+    gf_group->pyramid_level[*frame_ind] = 0;
+    ++(*frame_ind);
+
+    // set parameters for frames displayed after this frame
+    set_multi_layer_params(gf_group, m, r, frame_ind, arf_ind, level - 1);
+  }
+}
+
+// Returns the pyramid height: 2, plus 1 if width >= 8, plus 1 more if == 16.
+static INLINE unsigned char get_pyramid_height(int pyramid_width) {
+  assert(pyramid_width >= 4 && pyramid_width <= 16 &&
+         "invalid gf interval for pyramid structure");
+  return (unsigned char)(2 + (pyramid_width >= 8) + (pyramid_width == 16));
+}
+
+// Fills gf_group with the pyramid structure for a GF group of gf_interval
+// frames and returns the number of entries written.
+static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
+                                              const int gf_interval) {
+  int frame_index = 0;
+  gf_group->pyramid_height = get_pyramid_height(gf_interval);
+
+  // Each GF group begins with a key or overlay frame; default to overlay here.
+  gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+  gf_group->arf_src_offset[frame_index] = 0;
+  gf_group->arf_pos_in_gf[frame_index] = 0;
+  gf_group->arf_update_idx[frame_index] = 0;
+  gf_group->pyramid_level[frame_index] = 0;
+  ++frame_index;
+
+  // ALT0
+  gf_group->update_type[frame_index] = ARF_UPDATE;
+  gf_group->arf_src_offset[frame_index] = gf_interval - 1;
+  gf_group->arf_pos_in_gf[frame_index] = 0;
+  gf_group->arf_update_idx[frame_index] = 0;
+  gf_group->pyramid_level[frame_index] = gf_group->pyramid_height;
+  ++frame_index;
+
+  // set parameters for the rest of the frames
+  set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0,
+                         gf_group->pyramid_height - 1);
+  // check_frame_params(gf_group, gf_interval, frame_index);
+  return frame_index;
+}
+
+// Builds the GF group through construct_multi_layer_gf_structure() and then
+// fixes up the first entry, counts internal ARFs and assigns the rf levels.
+void define_customized_gf_group_structure(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  const int key_frame = cpi->common.frame_type == KEY_FRAME;
+
+  assert(rc->baseline_gf_interval == 4 || rc->baseline_gf_interval == 8 ||
+         rc->baseline_gf_interval == 16);
+  const int gf_update_frames =
+      construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval);
+  int frame_index;
+  cpi->num_extra_arfs = 0;
+
+  for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
+    // Set unused variables to default values
+    gf_group->bidir_pred_enabled[frame_index] = 0;
+    gf_group->brf_src_offset[frame_index] = 0;
+
+    // Special handle for the first frame for assigning update_type
+    if (frame_index == 0) {
+      // For key frames the frame target rate is already set and it
+      // is also the golden frame.
+      if (key_frame) {
+        gf_group->update_type[frame_index] = KF_UPDATE;
+        continue;
+      }
+
+      if (rc->source_alt_ref_active) {
+        gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+      } else {
+        gf_group->update_type[frame_index] = GF_UPDATE;
+      }
+    } else {
+      if (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+        ++cpi->num_extra_arfs;
+    }
+
+    // Assign rf level based on update type
+    gf_group->rf_level[frame_index] =
+        update_type_2_rf_level(gf_group->update_type[frame_index]);
+  }
+
+  // NOTE: We need to configure the frame at the end of the sequence + 1 that
+  //       will be the start frame for the next group. Otherwise prior to the
+  //       call to av1_rc_get_second_pass_params() the data will be undefined.
+  if (rc->source_alt_ref_pending) {
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
+  } else {
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
+  }
+
+  gf_group->bidir_pred_enabled[frame_index] = 0;
+  gf_group->brf_src_offset[frame_index] = 0;
+  gf_group->arf_update_idx[frame_index] = 0;
+  // This value is only used for INTNL_OVERLAY_UPDATE
+  gf_group->arf_pos_in_gf[frame_index] = 0;
+
+  // This parameter is useless?
+  gf_group->arf_ref_idx[frame_index] = 0;
+  // Debug-only dump (appends to GF_PARAMS.txt); enable when debugging.
+  // check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames);
+}
+
+// It is an example of how to define a GF structure manually. The function will
+// result in exactly the same GF group structure as
+// define_customized_gf_group_structure() when rc->baseline_gf_interval == 4
+#if USE_MANUAL_GF4_STRUCT
+#define GF_INTERVAL_4 4
+static const unsigned char gf4_multi_layer_params[][GF_FRAME_PARAMS] = {
+  {
+      // gf_group->index == 0 (Frame 0)
+      // It can also be a KEY frame. The update_type used for this frame is
+      // chosen at run time in define_gf_group_structure_4()
+      OVERLAY_UPDATE,  // update_type (default value)
+      0,               // arf_src_offset
+      0,               // arf_pos_in_gf
+      0                // arf_update_idx
+  },
+  {
+      // gf_group->index == 1 (Frame 4)
+      ARF_UPDATE,         // update_type
+      GF_INTERVAL_4 - 1,  // arf_src_offset
+      0,                  // arf_pos_in_gf
+      0                   // arf_update_idx
+  },
+  {
+      // gf_group->index == 2 (Frame 2)
+      INTNL_ARF_UPDATE,          // update_type
+      (GF_INTERVAL_4 >> 1) - 1,  // arf_src_offset
+      0,                         // arf_pos_in_gf
+      0                          // arf_update_idx
+  },
+  {
+      // gf_group->index == 3 (Frame 1)
+      LAST_BIPRED_UPDATE,  // update_type
+      0,                   // arf_src_offset
+      0,                   // arf_pos_in_gf
+      0                    // arf_update_idx
+  },
+
+  {
+      // gf_group->index == 4 (Frame 2 - OVERLAY)
+      INTNL_OVERLAY_UPDATE,  // update_type
+      0,                     // arf_src_offset
+      2,                     // arf_pos_in_gf
+      0                      // arf_update_idx
+  },
+  {
+      // gf_group->index == 5 (Frame 3)
+      LF_UPDATE,  // update_type
+      0,          // arf_src_offset
+      0,          // arf_pos_in_gf
+      1           // arf_update_idx
+  }
+};
+
+// Manually tabulated GF group setup for rc->baseline_gf_interval == 4. Copies
+// gf4_multi_layer_params[] into gf_group and returns the number of entries
+// described by the table (the entry for the next group's start comes after).
+static int define_gf_group_structure_4(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  const int key_frame = cpi->common.frame_type == KEY_FRAME;
+
+  assert(rc->baseline_gf_interval == GF_INTERVAL_4);
+
+  const int gf_update_frames = rc->baseline_gf_interval + 2;
+  int frame_index;
+
+  for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
+    // Table column 0 is update_type; the other columns follow in order.
+    int param_idx = 0;
+
+    // Fields that the table does not describe get their default values.
+    gf_group->bidir_pred_enabled[frame_index] = 0;
+    gf_group->brf_src_offset[frame_index] = 0;
+
+    if (frame_index == 0) {
+      // For key frames the frame target rate is already set and it
+      // is also the golden frame.
+      if (key_frame) continue;
+
+      // The first frame's update type depends on the encoder state, not on
+      // the table, so only step over the update_type column here.
+      if (rc->source_alt_ref_active) {
+        gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+      } else {
+        gf_group->update_type[frame_index] = GF_UPDATE;
+      }
+      param_idx++;
+    } else {
+      gf_group->update_type[frame_index] =
+          gf4_multi_layer_params[frame_index][param_idx++];
+    }
+
+    // setup other parameters
+    gf_group->rf_level[frame_index] =
+        update_type_2_rf_level(gf_group->update_type[frame_index]);
+
+    // == arf_src_offset ==
+    gf_group->arf_src_offset[frame_index] =
+        gf4_multi_layer_params[frame_index][param_idx++];
+
+    // == arf_pos_in_gf ==
+    gf_group->arf_pos_in_gf[frame_index] =
+        gf4_multi_layer_params[frame_index][param_idx++];
+
+    // == arf_update_idx ==
+    gf_group->arf_update_idx[frame_index] =
+        gf4_multi_layer_params[frame_index][param_idx];
+  }
+
+  // NOTE: We need to configure the frame at the end of the sequence + 1 that
+  //       will be the start frame for the next group. Otherwise prior to the
+  //       call to av1_rc_get_second_pass_params() the data will be undefined.
+  gf_group->arf_update_idx[frame_index] = 0;
+  gf_group->arf_ref_idx[frame_index] = 0;
+
+  if (rc->source_alt_ref_pending) {
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
+  } else {
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
+  }
+
+  gf_group->bidir_pred_enabled[frame_index] = 0;
+  gf_group->brf_src_offset[frame_index] = 0;
+
+  // This value is only used for INTNL_OVERLAY_UPDATE
+  gf_group->arf_pos_in_gf[frame_index] = 0;
+
+  return gf_update_frames;
+}
+#endif  // USE_MANUAL_GF4_STRUCT
+#endif  // USE_SYMM_MULTI_LAYER
+
 static void define_gf_group_structure(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
 
@@ -2139,6 +2453,25 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
     return;
   }
 #endif  // USE_GF16_MULTI_LAYER
+#if USE_SYMM_MULTI_LAYER
+  const int valid_customized_gf_length = rc->baseline_gf_interval == 4 ||
+                                         rc->baseline_gf_interval == 8 ||
+                                         rc->baseline_gf_interval == 16;
+  // used the new structure only if extra_arf is allowed
+  if (valid_customized_gf_length && rc->source_alt_ref_pending &&
+      cpi->extra_arf_allowed > 0) {
+#if USE_MANUAL_GF4_STRUCT
+    if (rc->baseline_gf_interval == 4)
+      define_gf_group_structure_4(cpi);
+    else
+#endif
+      define_customized_gf_group_structure(cpi);
+    cpi->new_bwdref_update_rule = 1;
+    return;
+  } else {
+    cpi->new_bwdref_update_rule = 0;
+  }
+#endif
 
   TWO_PASS *const twopass = &cpi->twopass;
   GF_GROUP *const gf_group = &twopass->gf_group;
@@ -2322,9 +2655,8 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
   }
 
   // NOTE: We need to configure the frame at the end of the sequence + 1 that
-  // will
-  //       be the start frame for the next group. Otherwise prior to the call to
-  //       av1_rc_get_second_pass_params() the data will be undefined.
+  //       will be the start frame for the next group. Otherwise prior to the
+  //       call to av1_rc_get_second_pass_params() the data will be undefined.
   gf_group->arf_update_idx[frame_index] = 0;
   gf_group->arf_ref_idx[frame_index] = 0;
 
@@ -2438,6 +2770,17 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
       // TODO(zoeliu): To investigate whether the allocated bits on
       // BIPRED_UPDATE frames need to be further adjusted.
       gf_group->bit_allocation[frame_index] = target_frame_size;
+#if USE_SYMM_MULTI_LAYER
+    } else if (cpi->new_bwdref_update_rule == 1 &&
+               gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
+      int arf_pos = gf_group->arf_pos_in_gf[frame_index];
+      gf_group->bit_allocation[frame_index] = 0;
+
+      // Tried boosting up the allocated bits on backward reference frame
+      // by (target_frame_size >> 2) as in the original setting. However it
+      // does not bring gains for pyramid structure with GF length = 16.
+      gf_group->bit_allocation[arf_pos] = target_frame_size;
+#endif
     } else {
       assert(gf_group->update_type[frame_index] == LF_UPDATE ||
              gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
@@ -2453,10 +2796,11 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
     }
   }
 
-  // NOTE: We need to configure the frame at the end of the sequence + 1 that
-  //       will be the start frame for the next group. Otherwise prior to the
-  //       call to av1_rc_get_second_pass_params() the data will be undefined.
+#if USE_SYMM_MULTI_LAYER
+  if (cpi->new_bwdref_update_rule == 0 && rc->source_alt_ref_pending) {
+#else
   if (rc->source_alt_ref_pending) {
+#endif
     if (cpi->num_extra_arfs) {
       // NOTE: For bit allocation, move the allocated bits associated with
       //       INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE.
@@ -2489,7 +2833,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   int i;
 
   double boost_score = 0.0;
+#if !FIX_GF_INTERVAL_LENGTH
   double old_boost_score = 0.0;
+  double mv_ratio_accumulator_thresh;
+#endif
   double gf_group_err = 0.0;
 #if GROUP_ADAPTIVE_MAXQ
   double gf_group_raw_error = 0.0;
@@ -2509,7 +2856,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   double this_frame_mv_in_out = 0.0;
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
-  double mv_ratio_accumulator_thresh;
+
   unsigned int allow_alt_ref = is_altref_enabled(cpi);
 
   int f_boost = 0;
@@ -2551,18 +2898,18 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     gf_group_skip_pct -= this_frame->intra_skip_pct;
     gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
   }
-
+#if !FIX_GF_INTERVAL_LENGTH
   // Motion breakout threshold for loop below depends on image size.
   mv_ratio_accumulator_thresh =
       (cpi->initial_height + cpi->initial_width) / 4.0;
-
+#endif
   // Set a maximum and minimum interval for the GF group.
   // If the image appears almost completely static we can extend beyond this.
   {
-    int int_max_q = (int)(av1_convert_qindex_to_q(twopass->active_worst_quality,
-                                                  cpi->common.bit_depth));
-    int int_lbq = (int)(av1_convert_qindex_to_q(rc->last_boosted_qindex,
-                                                cpi->common.bit_depth));
+    int int_max_q = (int)(av1_convert_qindex_to_q(
+        twopass->active_worst_quality, cpi->common.seq_params.bit_depth));
+    int int_lbq = (int)(av1_convert_qindex_to_q(
+        rc->last_boosted_qindex, cpi->common.seq_params.bit_depth));
 
     active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200);
     if (active_min_gf_interval > rc->max_gf_interval)
@@ -2643,7 +2990,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     boost_score +=
         decay_accumulator *
         calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
-
+#if FIX_GF_INTERVAL_LENGTH
+    if (i == (FIXED_GF_LENGTH + 1)) break;
+#else
+    // Skip breaking condition for FIX_GF_INTERVAL_LENGTH
     // Break out conditions.
     if (
         // Break at active_max_gf_interval unless almost totally static.
@@ -2666,9 +3016,9 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
         break;
       }
     }
-
-    *this_frame = next_frame;
     old_boost_score = boost_score;
+#endif  // FIX_GF_INTERVAL_LENGTH
+    *this_frame = next_frame;
   }
   twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
 
@@ -2693,7 +3043,18 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
 
   // Set the interval until the next gf.
-  rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+  if (cpi->oxcf.fwd_kf_enabled) {
+    // Ensure the gf group before the next keyframe will contain an altref
+    if ((rc->frames_to_key - i < rc->min_gf_interval) &&
+        (rc->frames_to_key != i)) {
+      rc->baseline_gf_interval = AOMMIN(rc->frames_to_key - rc->min_gf_interval,
+                                        rc->static_scene_max_gf_interval);
+    } else {
+      rc->baseline_gf_interval = i;
+    }
+  } else {
+    rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+  }
   if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
 
   // Disable extra altrefs and backward refs for "still" gf group:
@@ -2711,12 +3072,23 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   if (!cpi->extra_arf_allowed) {
     cpi->num_extra_arfs = 0;
   } else {
+#if USE_SYMM_MULTI_LAYER
+    if (rc->baseline_gf_interval == 4 && rc->source_alt_ref_pending)
+      cpi->num_extra_arfs = 1;
+    else
+      cpi->num_extra_arfs = get_number_of_extra_arfs(
+          rc->baseline_gf_interval, rc->source_alt_ref_pending);
+#else
     // Compute how many extra alt_refs we can have
     cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
                                                    rc->source_alt_ref_pending);
+#endif  // USE_SYMM_MULTI_LAYER
   }
+
+#if !USE_SYMM_MULTI_LAYER
   // Currently at maximum two extra ARFs' are allowed
   assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
+#endif
 
   rc->frames_till_gf_update_due = rc->baseline_gf_interval;
 
@@ -3393,12 +3765,66 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
     case INTNL_ARF_UPDATE:
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 0;
+#if USE_SYMM_MULTI_LAYER
+      if (cpi->new_bwdref_update_rule == 1) {
+        cpi->refresh_bwd_ref_frame = 1;
+        cpi->refresh_alt2_ref_frame = 0;
+      } else {
+#endif
+        cpi->refresh_bwd_ref_frame = 0;
+        cpi->refresh_alt2_ref_frame = 1;
+#if USE_SYMM_MULTI_LAYER
+      }
+#endif
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+
+    default: assert(0); break;
+  }
+}
+
+void av1_configure_buffer_updates_firstpass(AV1_COMP *cpi,
+                                            FRAME_UPDATE_TYPE update_type) {
+  RATE_CONTROL *rc = &cpi->rc;
+
+  cpi->refresh_last_frame = 1;
+  cpi->refresh_golden_frame = 0;
+  cpi->refresh_bwd_ref_frame = 0;
+  cpi->refresh_alt2_ref_frame = 0;
+  cpi->refresh_alt_ref_frame = 0;
+
+  rc->is_bwd_ref_frame = 0;
+
+  switch (update_type) {
+    case ARF_UPDATE:
+      cpi->refresh_alt_ref_frame = 1;
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
       cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+
+      rc->is_src_frame_alt_ref = 0;
+      break;
+    case INTNL_ARF_UPDATE:
       cpi->refresh_alt2_ref_frame = 1;
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
       cpi->refresh_alt_ref_frame = 0;
+      rc->is_src_frame_alt_ref = 0;
+      rc->is_src_frame_ext_arf = 0;
+
       break;
+    case BIPRED_UPDATE:
+      cpi->refresh_bwd_ref_frame = 1;
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
 
-    default: assert(0); break;
+      rc->is_bwd_ref_frame = 1;
+      break;
+    default: break;
   }
 }
 
@@ -3444,7 +3870,12 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
     target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
     rc->base_frame_target = target_rate;
 
-    cm->frame_type = INTER_FRAME;
+    if (cpi->no_show_kf) {
+      assert(gf_group->update_type[gf_group->index] == ARF_UPDATE);
+      cm->frame_type = KEY_FRAME;
+    } else {
+      cm->frame_type = INTER_FRAME;
+    }
 
     // Do the firstpass stats indicate that this frame is skippable for the
     // partition search?
@@ -3479,7 +3910,7 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
     twopass->baseline_active_worst_quality = tmp_q;
     rc->ni_av_qi = tmp_q;
     rc->last_q[INTER_FRAME] = tmp_q;
-    rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->bit_depth);
+    rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
     rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
     rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
     rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
index 4ff0f73b0..b0c1a21e4 100644
--- a/third_party/aom/av1/encoder/firstpass.h
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -122,6 +122,11 @@ typedef struct {
   unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+#if USE_SYMM_MULTI_LAYER
+  unsigned char arf_pos_in_gf[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char pyramid_level[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char pyramid_height;
+#endif
   unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char ref_fb_idx_map[(MAX_LAG_BUFFERS * 2) + 1][REF_FRAMES];
@@ -186,6 +191,8 @@ void av1_end_first_pass(struct AV1_COMP *cpi);
 
 void av1_init_second_pass(struct AV1_COMP *cpi);
 void av1_rc_get_second_pass_params(struct AV1_COMP *cpi);
+void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi,
+                                            FRAME_UPDATE_TYPE update_type);
 
 // Post encode update of the rate control parameters for 2-pass
 void av1_twopass_postencode_update(struct AV1_COMP *cpi);
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
index 5a8f8cbba..f2ff5b495 100644
--- a/third_party/aom/av1/encoder/hash_motion.c
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include <assert.h>
 
 #include "config/av1_rtcd.h"
diff --git a/third_party/aom/av1/encoder/ab_partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h
index 5b918fae2..279d39495 100644
--- a/third_party/aom/av1/encoder/ab_partition_model_weights.h
+++ b/third_party/aom/av1/encoder/partition_model_weights.h
@@ -1311,6 +1311,481 @@ static const NN_CONFIG av1_ab_partition_nnconfig_16 = {
 #undef FEATURE_SIZE
 #undef LABEL_SIZE
 
+#define FEATURE_SIZE 18
+#define LABEL_SIZE 4
+
+static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 48] = {
+  0.121894f,  0.058485f,  0.702226f,  0.015457f,  -0.123380f, -0.573450f,
+  0.319576f,  0.118808f,  0.166057f,  0.526984f,  0.015211f,  -0.025050f,
+  0.085717f,  -0.028221f, -0.580062f, -0.270530f, -0.092371f, 0.037679f,
+  0.083573f,  0.007112f,  -0.358623f, -0.264443f, -0.064819f, 0.022013f,
+  -0.040077f, -0.291967f, -0.293100f, 0.072266f,  -0.270572f, -0.292253f,
+  -0.260105f, -0.294472f, -0.275752f, 0.054315f,  0.000085f,  0.105115f,
+  -0.363572f, -0.016542f, 0.185943f,  -0.359903f, 0.038765f,  -0.377668f,
+  0.172692f,  0.127749f,  -0.031275f, -0.242528f, -0.145880f, -0.055247f,
+  -0.000265f, -0.355224f, 0.089917f,  -0.377841f, -0.209766f, 0.030899f,
+  0.039546f,  -0.375030f, -0.041605f, 0.137677f,  0.021282f,  -0.150442f,
+  -0.189445f, 0.009293f,  -0.316033f, 0.038745f,  -0.278761f, 0.005692f,
+  -0.071763f, -0.302936f, -0.224572f, -0.211841f, 0.057503f,  0.005435f,
+  -0.930979f, 0.115513f,  0.689958f,  0.221318f,  1.003891f,  0.359540f,
+  -0.640534f, -0.162373f, -0.118105f, 0.205587f,  0.019710f,  0.025067f,
+  -0.025344f, 0.002831f,  0.033078f,  0.040175f,  -0.007502f, 0.026272f,
+  0.083443f,  -0.880884f, 0.436948f,  0.293297f,  0.051678f,  -0.133328f,
+  -0.180323f, 0.667835f,  0.070733f,  -0.003060f, -0.221804f, 0.146601f,
+  0.064024f,  0.056758f,  -0.077361f, 0.105587f,  -0.185500f, -0.133552f,
+  0.138269f,  0.165055f,  0.628284f,  0.846449f,  0.058825f,  0.223157f,
+  0.277896f,  -0.381303f, 0.408241f,  0.643301f,  0.067494f,  0.120822f,
+  -0.182491f, -0.111373f, -0.033374f, 0.131387f,  -0.114654f, 0.114318f,
+  0.094718f,  -0.052232f, 0.385903f,  1.212304f,  0.425305f,  -0.052993f,
+  0.291474f,  -0.319730f, 0.023090f,  -0.317259f, 0.011181f,  -0.034185f,
+  -0.100671f, 0.186185f,  -0.432511f, -0.115957f, -0.067746f, -0.177810f,
+  -0.226700f, 0.004464f,  0.006809f,  0.171360f,  -0.080723f, 0.099826f,
+  -0.062301f, -0.358755f, -0.202549f, -0.084616f, -0.042313f, -0.325560f,
+  0.010452f,  -0.341089f, -0.013566f, -0.340129f, 0.034675f,  -0.036518f,
+  -0.036473f, -0.192892f, 0.650235f,  0.609437f,  -0.160982f, 0.125535f,
+  -1.004575f, 0.521969f,  1.318091f,  0.614004f,  -0.106622f, -0.077453f,
+  -0.037328f, -0.081940f, 0.007640f,  0.026654f,  -0.080332f, -0.077356f,
+  -0.288170f, -0.319680f, -0.131712f, -0.150985f, 0.073218f,  0.089502f,
+  -0.280502f, 0.003941f,  -0.249937f, 0.244263f,  0.023269f,  0.080263f,
+  0.073172f,  -0.200036f, 0.022381f,  0.008592f,  -0.339517f, -0.135073f,
+  0.177199f,  0.208363f,  0.652360f,  0.272990f,  0.609535f,  0.145805f,
+  0.022527f,  -0.088378f, 0.205008f,  0.101021f,  -0.019673f, -0.252681f,
+  0.116034f,  -0.062052f, 0.009991f,  0.138933f,  -0.182428f, 0.052542f,
+  -0.350825f, -0.122654f, -0.154687f, 0.066747f,  0.021541f,  -0.212169f,
+  -0.087093f, -0.087488f, 0.178129f,  -0.146544f, 0.013919f,  -0.273899f,
+  0.223753f,  -0.187327f, -0.118795f, -0.191892f, -0.355979f, 0.023794f,
+  -0.135236f, 0.058918f,  0.069080f,  0.279287f,  0.369689f,  1.134526f,
+  0.659511f,  0.250223f,  0.286040f,  0.515284f,  0.067791f,  -0.156385f,
+  0.143283f,  0.050884f,  0.089956f,  -0.040850f, -0.003650f, -0.081162f,
+  0.086004f,  0.116578f,  0.826254f,  0.504869f,  -0.196022f, -0.207279f,
+  0.200503f,  -0.196801f, 0.008211f,  0.411158f,  -0.075855f, -0.036690f,
+  0.111519f,  -0.057838f, -0.005846f, 0.111067f,  0.174712f,  -0.078054f,
+  0.765897f,  0.018670f,  -0.306960f, -0.020034f, -0.332875f, 0.662707f,
+  -0.461233f, -1.007542f, -0.693995f, -1.243352f, -0.014745f, 0.004036f,
+  -0.009141f, 0.003325f,  -0.011233f, -0.000819f, 0.006369f,  0.002418f,
+  -0.035906f, -0.005135f, 1.073830f,  1.020736f,  -0.182611f, -1.038976f,
+  -0.226695f, -0.375663f, 0.364568f,  0.620995f,  -0.018615f, 0.011347f,
+  0.045786f,  0.041077f,  0.010886f,  -0.148428f, 0.028007f,  -0.022322f,
+  -0.165985f, 0.233315f,  -0.277531f, -0.329683f, -0.516967f, -0.390750f,
+  0.006948f,  0.133744f,  -0.375681f, -0.116877f, -0.009441f, -0.008597f,
+  -0.160679f, 0.102150f,  -0.142647f, -0.117501f, 0.035035f,  0.228687f,
+  -1.117397f, -0.005171f, -0.008708f, 0.413042f,  -0.298532f, 0.614909f,
+  -0.181084f, -0.711770f, 0.344033f,  0.287220f,  -0.112848f, -0.052866f,
+  -0.222466f, 0.025029f,  -0.107558f, 0.137036f,  -0.276661f, -0.038808f,
+  -0.057448f, 0.037563f,  0.526020f,  0.447997f,  0.288366f,  0.264815f,
+  0.319974f,  -0.193091f, 0.353830f,  0.412950f,  -0.280454f, 0.092737f,
+  0.070919f,  0.043336f,  0.041214f,  -0.052147f, 0.010860f,  0.191325f,
+  0.079783f,  -0.425672f, -0.053469f, -0.005495f, 0.184526f,  -0.166171f,
+  0.084459f,  -0.042165f, -0.261759f, -0.248723f, -0.073483f, -0.377884f,
+  -0.189614f, -0.054146f, -0.261279f, 0.196347f,  -0.087568f, 0.070533f,
+  -0.145492f, -0.041500f, -0.465861f, 0.077369f,  0.020645f,  -0.440232f,
+  -0.414585f, -0.168627f, -0.050011f, -0.336676f, -0.344943f, -0.288140f,
+  0.085513f,  -0.200425f, 0.218516f,  0.049604f,  -0.280952f, -0.242674f,
+  -1.969931f, 0.013374f,  -0.039643f, 1.113947f,  0.018568f,  0.916330f,
+  -0.302934f, -0.225816f, 0.189529f,  -0.361971f, 0.021073f,  -0.050143f,
+  -0.041415f, 0.015126f,  0.018091f,  -0.082401f, 0.017152f,  0.064856f,
+  0.156170f,  0.145323f,  -0.281409f, 0.213357f,  -0.058966f, 0.158668f,
+  0.033742f,  0.378820f,  -0.662875f, -0.455532f, -0.702928f, 0.234325f,
+  0.139627f,  -1.360650f, 0.040921f,  -0.044373f, -0.059999f, -0.048565f,
+  0.115339f,  -0.105888f, -0.170567f, -0.206097f, -0.349537f, 0.107941f,
+  -0.356286f, -0.374928f, 0.143257f,  -0.317790f, 0.079875f,  -0.359345f,
+  0.081321f,  -0.219772f, -0.077213f, 0.110624f,  -0.252329f, -0.266481f,
+  0.190135f,  0.121214f,  0.661064f,  -0.037820f, -0.373068f, -0.065209f,
+  -0.286154f, -0.120695f, -0.110670f, -0.193589f, -0.010867f, -0.048054f,
+  -0.032010f, 0.110627f,  0.054094f,  -0.884309f, -1.171623f, -0.386911f,
+  -0.756058f, 0.030362f,  0.563628f,  -0.334227f, -0.111213f, 1.143898f,
+  -0.940454f, 0.084510f,  0.671010f,  0.312244f,  -0.052592f, -0.014376f,
+  0.039965f,  -0.010763f, -0.114936f, -0.146020f, 0.015874f,  0.027439f,
+  -1.702315f, 0.148702f,  0.153021f,  0.363147f,  -0.488933f, 0.220772f,
+  0.640310f,  -0.173911f, -0.169523f, -0.082261f, -0.014854f, 0.024414f,
+  0.061041f,  -0.013998f, 0.086539f,  0.000466f,  0.037472f,  -0.010665f,
+  -0.326646f, 0.106971f,  0.405589f,  0.555345f,  -0.318315f, 0.526498f,
+  0.119246f,  0.022213f,  0.171237f,  0.214651f,  0.062904f,  -0.023764f,
+  0.011831f,  0.079644f,  -0.096530f, -0.054373f, -0.306309f, -0.203709f,
+  -0.353217f, -0.350005f, -0.329549f, 0.062679f,  -0.387625f, -0.237111f,
+  -0.025050f, -0.193987f, 0.002235f,  -0.380821f, -0.051036f, -0.136020f,
+  0.077989f,  -0.361691f, 0.120485f,  0.157746f,  0.073394f,  -0.284401f,
+  0.113221f,  0.109808f,  0.000197f,  0.122523f,  0.081411f,  -0.048544f,
+  -0.136577f, -0.007158f, -0.208952f, -0.276831f, 0.260479f,  -1.392915f,
+  -0.865248f, 0.114577f,  -0.000749f, -0.060338f, -0.091176f, -0.108421f,
+  0.221256f,  0.100176f,  -0.877560f, -1.248838f, 0.643005f,  0.064580f,
+  -0.049878f, 0.267988f,  -0.434340f, -0.299254f, -0.097572f, 0.009606f,
+  0.063810f,  -0.090525f, 0.027760f,  0.043484f,  0.041697f,  0.108024f,
+  -0.359586f, -0.197090f, 0.121397f,  0.152206f,  -0.391126f, -0.283145f,
+  0.008754f,  -0.059022f, -0.218745f, 0.043042f,  -0.056716f, 0.153051f,
+  -0.210372f, -0.029681f, -0.288354f, 0.065242f,  -0.189376f, 0.115013f,
+  -0.251488f, -0.533091f, 0.037768f,  -0.319107f, -0.161364f, -0.103967f,
+  0.063271f,  -0.313289f, -0.312093f, -0.045239f, 0.150607f,  0.001487f,
+  0.019602f,  -0.338031f, -0.036214f, 0.112736f,  -0.367762f, 0.122367f,
+  0.094670f,  0.175590f,  0.301041f,  -0.135257f, 0.539620f,  0.328619f,
+  -0.163971f, 0.137256f,  0.238805f,  0.483722f,  0.121353f,  0.083630f,
+  -0.283568f, 0.291661f,  -0.061122f, -0.195295f, 0.153459f,  -0.153727f,
+  -0.238839f, -0.071736f, 0.601437f,  -0.664072f, 0.230827f,  0.198753f,
+  -0.039196f, 0.206751f,  0.529020f,  0.904132f,  -0.219471f, 0.186694f,
+  -0.208608f, -0.093385f, -0.161617f, 0.003930f,  -0.429869f, -0.123563f,
+  0.626098f,  -0.002495f, -0.245511f, -1.069848f, 0.296115f,  -0.940267f,
+  -1.649122f, -0.512937f, -0.802874f, -1.000239f, -0.027629f, 0.020434f,
+  -0.003030f, 0.035986f,  -0.004812f, -0.009193f, -0.004644f, -0.024347f,
+  0.068439f,  -0.314339f, 0.095057f,  -0.212372f, 0.197523f,  -0.040878f,
+  -0.272164f, -0.243326f, -0.204955f, 0.157199f,  -0.049964f, -0.091537f,
+  -0.058012f, -0.306650f, 0.098621f,  -0.146778f, -0.154447f, -0.177889f,
+  -0.009698f, 0.025427f,  0.350576f,  -0.448237f, -0.068823f, 1.224960f,
+  -0.776883f, -0.692167f, -0.948497f, -0.492598f, 0.029440f,  -0.056460f,
+  0.021654f,  0.004352f,  0.041508f,  -0.027179f, 0.006789f,  -0.023573f,
+  0.207775f,  -0.280273f, -0.347984f, -0.129935f, 0.151512f,  -0.087294f,
+  -0.494352f, -0.341424f, 0.044084f,  -0.064080f, 0.073091f,  -0.145574f,
+  0.094715f,  -0.258786f, -0.020419f, -0.401823f, 0.009397f,  -0.138642f,
+  -0.034953f, -0.077419f, 0.636610f,  0.314980f,  1.110610f,  -0.343368f,
+  0.696647f,  -0.649667f, 0.653491f,  -0.096006f, -0.090469f, -0.066975f,
+  -0.105864f, -0.015666f, 0.102056f,  -0.105344f, -0.273495f, -0.014686f,
+  0.122031f,  0.139524f,  -1.042029f, -0.562510f, 0.885644f,  1.088059f,
+  0.189223f,  0.049404f,  -0.167371f, 0.018703f,  -0.208390f, -0.159002f,
+  -0.377130f, -0.151118f, 0.117861f,  0.026986f,  -0.032433f, 0.081603f,
+  -0.106729f, -0.040134f, 0.015161f,  0.290572f,  0.241446f,  1.390085f,
+  0.438915f,  -0.358097f, -0.171799f, 0.879758f,  -0.014110f, 0.029562f,
+  -0.073583f, -0.125817f, -0.036512f, -0.040275f, 0.037997f,  0.120979f,
+  0.064538f,  -0.038841f, 0.034797f,  0.110229f,  -0.239779f, -0.004558f,
+  0.226534f,  0.111286f,  -0.268198f, 0.237673f,  -0.328237f, -0.090774f,
+  -0.269690f, -0.202147f, -0.181808f, -0.305238f, 0.110058f,  -0.169217f,
+  -0.300125f, 0.069031f,  -0.081358f, -0.376174f, -0.349980f, 0.071443f,
+  -0.396278f, -0.389503f, -0.190410f, -0.014767f, -0.265229f, -0.099787f,
+  0.079847f,  -0.214580f, -0.235661f, -0.184227f, 0.111099f,  -0.083945f,
+  -0.153809f, -0.284092f, -0.132497f, -0.154841f, -0.517157f, -0.640603f,
+  -0.357036f, -0.486142f, -0.182819f, -0.475022f, 0.079282f,  0.081168f,
+  -0.120831f, -0.016048f, -0.232495f, 0.214329f,  -0.055058f, 0.032856f,
+  0.061753f,  0.003226f,  0.097028f,  0.084535f,  -1.563199f, 0.434928f,
+  -0.403710f, 0.520696f,  -0.401696f, 0.450568f,  -0.074121f, 0.076622f,
+  -0.098421f, 0.167036f,  -0.255250f, -0.526313f, -0.933693f, -0.558104f,
+  0.194341f,  0.173326f,  0.071112f,  -0.651961f, -1.327587f, -0.705289f,
+  -1.138889f, 0.197167f,  -0.714654f, -0.113891f, 0.080158f,  0.000301f,
+  0.057905f,  0.060718f,  -0.635995f, 0.100026f,  -0.038239f, -0.025530f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer0[48] = {
+  -0.079252f, -0.083606f, -0.112759f, -0.071622f, 0.444562f,  0.215649f,
+  -0.337661f, -0.242379f, -0.053829f, 0.165168f,  -0.076613f, -0.190579f,
+  -0.060175f, -0.571661f, -0.454075f, -1.462711f, -0.161563f, -0.088748f,
+  -0.030279f, -0.456293f, -0.134473f, -0.194976f, 0.044373f,  -0.503954f,
+  -0.083563f, 0.123344f,  0.011821f,  0.085445f,  -0.050294f, -0.135194f,
+  0.057815f,  0.543558f,  -0.090602f, -0.104671f, -0.285075f, 0.354335f,
+  1.037007f,  -0.023879f, -0.025025f, -0.094408f, -0.101200f, -0.142105f,
+  -0.380607f, -0.059067f, -0.113017f, -0.137448f, -0.177840f, 0.468505f,
+};
+
+static const float av1_4_partition_nn_weights_16_layer1[48 * LABEL_SIZE] = {
+  0.174954f,  -0.239117f, 0.073252f,  0.258881f,  0.579781f,  0.441827f,
+  0.372037f,  -0.062362f, 0.068477f,  0.376811f,  -0.130520f, 0.214951f,
+  -0.200674f, 0.240347f,  0.152954f,  1.360264f,  0.334630f,  -0.064789f,
+  -0.270826f, 0.212699f,  0.045669f,  -0.150852f, -0.412603f, 0.122481f,
+  -0.230246f, 0.005004f,  0.321417f,  -0.554083f, -0.186742f, -0.197687f,
+  -0.028669f, -0.138559f, -0.117773f, 0.024953f,  0.326367f,  -0.109951f,
+  -1.098959f, -0.136134f, 0.563218f,  0.191799f,  0.126191f,  -0.093113f,
+  0.185371f,  0.058468f,  0.245247f,  -0.138064f, -0.471573f, -0.209372f,
+  -0.111171f, 0.222275f,  -0.350556f, -0.106336f, 0.268877f,  0.090639f,
+  -0.083008f, -0.190791f, -0.243922f, -0.121182f, -0.133733f, -0.078450f,
+  0.099751f,  0.353020f,  -0.199079f, -0.463492f, -0.647884f, 0.166611f,
+  -0.464034f, 0.045096f,  -0.312178f, -0.190972f, -0.468297f, 0.662376f,
+  -0.197071f, -0.653123f, -0.354365f, -0.088501f, -0.302671f, 0.140713f,
+  0.885444f,  0.350273f,  -0.003345f, 0.217260f,  0.219156f,  0.240653f,
+  0.347840f,  0.101849f,  -0.244565f, -0.166971f, 0.091056f,  0.319912f,
+  0.268459f,  0.250726f,  -0.155819f, -0.087588f, 0.010749f,  -0.192344f,
+  0.344808f,  0.223482f,  -0.189563f, -0.067317f, -0.348191f, -0.085265f,
+  0.259318f,  0.102408f,  0.096675f,  -0.255564f, -0.168480f, -0.068189f,
+  -0.457704f, 0.010565f,  0.228573f,  -0.124421f, 0.202488f,  0.148519f,
+  0.002180f,  0.099099f,  -0.179019f, 0.245414f,  -0.038307f, 0.116897f,
+  -0.031377f, 0.368533f,  -0.793891f, 0.148614f,  0.075441f,  0.102465f,
+  -0.310002f, -0.355369f, -0.206713f, -0.262276f, 0.068578f,  -0.044980f,
+  0.092689f,  -0.181058f, 0.016279f,  0.155965f,  0.545361f,  -0.390699f,
+  -0.042457f, 0.110238f,  0.114640f,  0.112525f,  0.522221f,  0.533164f,
+  -0.331720f, -0.212966f, 0.140823f,  0.251311f,  -0.006092f, -0.800438f,
+  0.007981f,  -0.585140f, -0.006526f, 0.541683f,  -0.298498f, 0.084322f,
+  -0.056467f, -0.361806f, -0.256347f, -1.419173f, -0.159093f, 0.023017f,
+  0.667915f,  -0.176995f, 0.022307f,  -0.169493f, 0.581377f,  0.044929f,
+  0.044914f,  -0.056290f, 0.324196f,  0.648043f,  -0.089381f, -0.054971f,
+  0.064782f,  0.629356f,  -0.003760f, -0.123822f, 0.144133f,  -0.378821f,
+  1.116858f,  0.128552f,  -0.668783f, 0.207194f,  -0.437781f, -0.283321f,
+  -0.549404f, 0.010538f,  0.208997f,  0.231396f,  -0.174347f, 0.161910f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+  -0.197883f,
+  -0.136696f,
+  0.094115f,
+  0.612799f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_16 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      48,  // num_hidden_nodes
+  },
+  {
+      av1_4_partition_nn_weights_16_layer0,
+      av1_4_partition_nn_weights_16_layer1,
+  },
+  {
+      av1_4_partition_nn_bias_16_layer0,
+      av1_4_partition_nn_bias_16_layer1,
+  },
+};
+
+static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = {
+  0.114554f,  0.043669f,  0.313291f,  0.167688f,  -0.413357f, 0.088232f,
+  0.301915f,  -0.358117f, 0.267711f,  -0.252716f, -0.038531f, -0.032805f,
+  -0.025382f, 0.023624f,  -0.949694f, -0.065480f, -0.375721f, -0.697319f,
+  -0.117387f, -0.204309f, -0.190797f, -0.223867f, -0.190248f, 0.026668f,
+  0.199717f,  0.216902f,  -0.239241f, -0.096894f, -0.225046f, 0.246523f,
+  0.002333f,  -0.254385f, -0.205815f, 0.123139f,  -0.476923f, 0.137557f,
+  0.059686f,  -0.124013f, 0.974675f,  0.889753f,  0.378940f,  0.526413f,
+  -0.208747f, -0.001913f, 0.094081f,  0.848010f,  0.062042f,  0.159831f,
+  0.071016f,  0.024437f,  0.212611f,  0.039501f,  -0.149922f, -0.055229f,
+  -0.229270f, 0.129004f,  -0.182803f, 0.291223f,  -1.197804f, -0.916991f,
+  -0.024095f, 0.738729f,  -0.300326f, 0.402480f,  0.023944f,  -0.022613f,
+  -0.004554f, 0.001784f,  0.035143f,  -0.202237f, 0.080252f,  -0.003912f,
+  -0.040345f, -0.121881f, 0.126672f,  0.093507f,  -0.081305f, -0.081099f,
+  -0.218824f, -0.459254f, -0.055250f, -0.095096f, 0.207278f,  0.245259f,
+  -0.380849f, -0.334458f, -0.351449f, -0.513045f, -0.407823f, -0.222423f,
+  0.103205f,  -0.299965f, -0.211472f, -0.348690f, -0.283688f, -0.152743f,
+  -0.204005f, -0.173636f, 0.020302f,  -0.109112f, 0.081203f,  -0.137344f,
+  -0.364582f, -0.343133f, -0.176167f, -0.446541f, 0.144844f,  -0.268105f,
+  -0.003889f, -0.309560f, -0.236092f, -0.299450f, 0.248269f,  0.207510f,
+  -0.279023f, -0.272472f, -0.166427f, 0.205973f,  -0.345692f, -0.238400f,
+  -0.319178f, -0.327246f, -0.321756f, 0.043191f,  -0.027520f, -0.029310f,
+  0.161379f,  0.031154f,  -0.605365f, -0.230926f, 0.261142f,  -0.262678f,
+  -0.373351f, -0.326245f, 0.279222f,  0.684357f,  -0.864302f, 0.036132f,
+  0.239307f,  0.136262f,  0.124002f,  -0.410379f, -0.172722f, -0.376670f,
+  -0.195889f, 0.037292f,  -0.055295f, 1.022308f,  0.237600f,  -0.618435f,
+  0.366154f,  0.168308f,  -0.473467f, -0.756558f, -0.044830f, 0.019057f,
+  -0.084214f, -0.007789f, -0.066028f, -0.074562f, 0.002082f,  0.001007f,
+  -0.269676f, -0.164768f, -0.027271f, -0.098935f, 0.009431f,  0.254431f,
+  0.124238f,  -0.198181f, 0.142723f,  -0.112997f, -0.164224f, -0.355160f,
+  0.135330f,  -0.379557f, 0.079392f,  0.210607f,  -0.354927f, -0.277678f,
+  -0.931111f, 0.056208f,  -0.347710f, -0.355415f, 0.826145f,  0.390625f,
+  0.374414f,  -0.205685f, 0.562485f,  0.152288f,  0.130635f,  0.056622f,
+  0.057972f,  0.095526f,  -0.082436f, -0.085938f, -0.070570f, -0.087634f,
+  0.335934f,  0.084860f,  0.544424f,  -0.278917f, 0.476740f,  0.050927f,
+  -1.288817f, -0.078320f, -0.553041f, -0.160538f, -0.109365f, -0.127146f,
+  -0.032524f, -0.105117f, -0.182965f, -0.024723f, 0.083317f,  0.060073f,
+  -0.042945f, 0.015249f,  1.241504f,  0.662613f,  0.530496f,  -0.180519f,
+  -1.099086f, -0.825844f, 0.551856f,  -0.025009f, -0.006619f, -0.001049f,
+  0.014828f,  -0.035166f, -0.241091f, -0.136364f, -0.003219f, -0.014581f,
+  -0.379945f, -0.226191f, -0.161241f, -0.496390f, -0.147175f, -0.118004f,
+  -0.128206f, -0.389770f, -0.184288f, -0.119076f, -0.379211f, 0.236180f,
+  -0.468730f, -0.175170f, 0.136433f,  0.167739f,  -0.377602f, 0.135772f,
+  0.040972f,  -0.193974f, -0.319475f, -0.016469f, -0.412027f, -0.322605f,
+  0.111125f,  -0.078456f, -0.387234f, -0.401605f, -0.088717f, -0.340682f,
+  0.010556f,  0.058256f,  -0.127352f, 0.017665f,  0.072632f,  -0.171966f,
+  -0.117342f, -0.166050f, -0.182689f, -0.073182f, 0.096279f,  -0.260229f,
+  0.025216f,  -0.332236f, -0.218706f, -0.200153f, -0.110303f, 0.073499f,
+  -0.280123f, 0.132262f,  -0.308330f, -0.119036f, -0.303874f, -0.065445f,
+  -0.412137f, 0.057167f,  0.044582f,  -0.330952f, -0.232572f, 0.039732f,
+  -0.326877f, -0.300569f, -0.467164f, -0.371499f, 0.034430f,  0.058277f,
+  -0.042485f, -0.409028f, -0.110889f, -0.500758f, -0.343141f, 0.042023f,
+  -1.071050f, 0.086854f,  -0.004932f, -0.259698f, 0.125301f,  -0.742663f,
+  -0.370517f, -0.772840f, 0.193628f,  0.554676f,  0.051283f,  -0.196639f,
+  0.040344f,  0.027391f,  -0.040501f, 0.038303f,  0.032972f,  -0.014638f,
+  0.097720f,  -0.206897f, -0.015480f, 0.008543f,  0.034469f,  0.127234f,
+  -0.396463f, -0.390189f, 0.117538f,  -0.435622f, 0.043420f,  -0.241987f,
+  -0.118254f, -0.190349f, 0.190273f,  -0.085625f, -0.141253f, -0.377438f,
+  -0.249211f, 0.214512f,  -0.363191f, -0.754851f, 0.238045f,  1.127635f,
+  0.173947f,  -0.357620f, 0.073671f,  0.220617f,  0.072067f,  -0.076214f,
+  -0.044583f, -0.018371f, 0.010952f,  -0.135116f, 0.076597f,  0.034480f,
+  -0.070212f, -0.454429f, -0.135215f, 0.163851f,  -0.625990f, -0.283991f,
+  0.284051f,  0.182935f,  -0.048717f, 0.002484f,  -0.009086f, 0.321724f,
+  0.125162f,  -0.069624f, -0.430299f, -0.007224f, -0.284725f, -0.475662f,
+  0.123807f,  -0.313614f, -0.103142f, 0.072125f,  0.100320f,  -0.185558f,
+  -0.481522f, -0.247311f, -0.386762f, -0.258850f, 0.178844f,  -0.381231f,
+  -0.436001f, -0.374834f, 0.230104f,  -0.500679f, 0.170880f,  0.029657f,
+  -0.105857f, -0.366671f, -0.268833f, 0.036885f,  -0.026776f, 0.037837f,
+  -0.362095f, -0.254933f, 0.129650f,  0.007945f,  -0.304715f, -0.100813f,
+  -0.342849f, -0.269223f, 0.178490f,  0.186735f,  -0.353995f, 0.050381f,
+  -0.440186f, 0.025985f,  1.096969f,  1.132937f,  0.581545f,  0.271734f,
+  -0.109169f, -0.014239f, 0.688644f,  0.602702f,  0.048616f,  0.022335f,
+  0.037545f,  0.081667f,  -0.109038f, -0.088565f, -0.002506f, -0.041420f,
+  -0.132515f, 0.187312f,  0.677273f,  1.111182f,  0.199096f,  -0.211551f,
+  -0.896508f, 0.257981f,  0.007803f,  0.160343f,  -0.124864f, -0.097150f,
+  0.225090f,  0.242900f,  -0.195665f, 0.011310f,  0.160765f,  0.169195f,
+  -0.081994f, -0.017372f, -0.566190f, -0.902086f, 0.027768f,  0.511419f,
+  0.076009f,  -0.165861f, 0.240487f,  0.006298f,  -0.153334f, 0.041249f,
+  0.387092f,  0.313011f,  -0.032269f, 0.019024f,  0.052568f,  0.124247f,
+  0.197640f,  0.002537f,  0.651044f,  0.829828f,  -0.446444f, -0.402042f,
+  -0.469399f, -0.019842f, 0.371960f,  0.140373f,  -0.044808f, 0.008283f,
+  0.093791f,  0.052149f,  0.143123f,  -0.449571f, -0.868816f, -0.265661f,
+  -0.225232f, -0.014704f, 0.543836f,  -0.374498f, 0.561647f,  1.309445f,
+  0.056789f,  -0.048447f, 0.255758f,  0.644553f,  -0.124802f, 0.097419f,
+  -0.149336f, 0.021596f,  -0.043699f, 0.057591f,  -0.000077f, 0.034488f,
+  -0.049353f, -0.007799f, 0.437914f,  0.509369f,  0.674428f,  1.858949f,
+  -0.205964f, 0.060776f,  0.184213f,  0.037177f,  -0.062535f, -0.115408f,
+  0.076498f,  0.010235f,  -0.142253f, 0.009983f,  0.073436f,  0.038716f,
+  -0.369983f, -0.185959f, -0.137867f, 0.032134f,  0.213814f,  -0.125571f,
+  0.247874f,  -0.166871f, -0.160890f, 0.147029f,  0.267143f,  -0.298488f,
+  -0.210203f, -0.188313f, -0.085024f, -0.244962f, -0.189833f, -0.261242f,
+  0.399519f,  0.143200f,  -0.776419f, -0.374639f, -0.022066f, 0.582904f,
+  0.006430f,  -0.139134f, -0.491894f, -0.430579f, -0.358221f, -0.231365f,
+  -0.398255f, -0.173231f, 0.211789f,  -0.036121f, -0.266856f, 0.042956f,
+  -1.138513f, -0.070313f, 0.158803f,  0.406989f,  -0.015974f, 0.651020f,
+  -0.468982f, -0.310019f, 0.416922f,  0.895162f,  0.019921f,  0.004023f,
+  0.006962f,  0.000863f,  -0.216395f, -0.074913f, -0.002613f, 0.026703f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer0[32] = {
+  0.133615f,  -0.113389f, -0.575989f, 0.589389f,  -0.193574f, -0.132463f,
+  0.000000f,  0.060317f,  0.264577f,  -0.060599f, 0.540147f,  -0.127782f,
+  -0.548802f, -0.172235f, -0.193032f, -0.026301f, -0.177527f, 0.267821f,
+  -0.115455f, -0.137162f, -0.079595f, -0.041443f, -0.043856f, -0.657220f,
+  -0.448931f, 0.446300f,  0.250002f,  0.223559f,  -0.647723f, -0.014369f,
+  0.084333f,  -0.056270f,
+};
+
+static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = {
+  -0.069633f, -0.087239f, 0.365816f,  -0.068579f, 0.231198f,  -0.067856f,
+  -0.139892f, -0.100235f, -0.488166f, -0.150112f, -0.005546f, 0.210832f,
+  0.778888f,  0.169624f,  0.089968f,  -0.243569f, 0.353483f,  0.032296f,
+  -0.157408f, 0.286885f,  -0.063537f, -0.324055f, -0.161464f, 0.430600f,
+  0.277707f,  -0.196463f, 0.154647f,  0.059804f,  0.176408f,  0.303179f,
+  -0.040156f, 0.375810f,  -0.363032f, -0.186808f, -0.264561f, -0.158937f,
+  -0.007949f, -0.076394f, 0.056475f,  0.308528f,  0.695387f,  0.051336f,
+  0.433063f,  -0.229948f, -1.210712f, 0.036286f,  0.183868f,  -0.117660f,
+  0.230134f,  -0.093469f, 0.237918f,  0.625986f,  -0.236671f, -0.377172f,
+  0.331091f,  -0.394004f, -0.214349f, 0.243940f,  -0.600348f, 0.069843f,
+  0.088325f,  0.225775f,  0.276884f,  -0.604493f, 0.769812f,  0.259574f,
+  0.086220f,  0.511515f,  -0.282584f, -0.157719f, 0.278778f,  -0.332732f,
+  0.068985f,  -0.237236f, -0.006102f, -0.154883f, 0.710288f,  -0.245896f,
+  -0.255895f, -0.398038f, 0.304084f,  -0.317065f, 0.192609f,  -0.235613f,
+  0.461340f,  0.117194f,  0.116817f,  0.196150f,  0.421622f,  -0.264495f,
+  0.617852f,  -0.351756f, -0.310016f, 0.135932f,  -0.242622f, -0.073094f,
+  0.042077f,  0.039230f,  -0.482715f, 0.553187f,  0.360637f,  0.313484f,
+  -0.131540f, -0.104731f, 0.374704f,  0.222173f,  0.437657f,  0.029827f,
+  -0.545156f, -0.203176f, 0.267824f,  0.169237f,  -0.057871f, 0.552197f,
+  0.272243f,  0.025681f,  -0.262192f, 0.255934f,  -0.202407f, -0.483317f,
+  -0.204721f, 0.288807f,  -0.030735f, -0.047161f, -0.780724f, 0.381939f,
+  -0.295318f, 0.537378f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+  -0.332518f,
+  0.114452f,
+  0.098949f,
+  0.465896f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_32 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      32,  // num_hidden_nodes
+  },
+  {
+      av1_4_partition_nn_weights_32_layer0,
+      av1_4_partition_nn_weights_32_layer1,
+  },
+  {
+      av1_4_partition_nn_bias_32_layer0,
+      av1_4_partition_nn_bias_32_layer1,
+  },
+};
+
+static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
+  0.256343f,  -0.021774f, -0.117102f, 0.416930f,  0.188160f,  0.148768f,
+  -0.611181f, -0.121607f, -0.394825f, -0.875025f, -0.167071f, 0.016408f,
+  0.222769f,  -0.199332f, 0.058667f,  -0.679529f, 0.081744f,  0.044438f,
+  -0.182941f, -0.110339f, -0.137822f, -0.096164f, -0.132319f, 0.140036f,
+  -0.049503f, -0.309894f, -0.323991f, 0.166113f,  0.138104f,  -0.263629f,
+  -0.368460f, -0.273989f, 0.147239f,  0.044566f,  -0.363357f, -0.030792f,
+  0.020734f,  0.068506f,  -0.434214f, 0.581644f,  -1.244146f, -0.569162f,
+  0.179499f,  -0.188900f, 0.078431f,  -0.392126f, -0.006431f, 0.112146f,
+  -0.065892f, -0.051319f, 0.094607f,  0.251700f,  -0.000650f, 0.011911f,
+  0.080449f,  0.022816f,  0.322382f,  0.577070f,  0.927738f,  0.178707f,
+  -0.101237f, -0.212521f, 0.560261f,  -0.206492f, -0.077591f, -0.069960f,
+  0.025727f,  0.041122f,  -0.735228f, -0.506091f, -0.600776f, -0.117829f,
+  0.103556f,  0.141823f,  0.853448f,  0.339488f,  0.994022f,  0.121693f,
+  -2.065366f, -0.352510f, -0.174323f, -0.323400f, -0.002193f, 0.004161f,
+  0.042469f,  -0.005319f, -0.305784f, -0.371353f, 0.011194f,  -0.018597f,
+  0.209260f,  0.071577f,  0.242470f,  -0.856593f, 0.288842f,  1.062608f,
+  -0.300472f, 0.221623f,  -0.813563f, -0.250347f, -0.081455f, -0.092779f,
+  -0.168132f, -0.180640f, -0.075130f, -0.052906f, -0.015645f, 0.127158f,
+  -0.006546f, 0.051671f,  0.545608f,  1.101804f,  0.288086f,  1.107046f,
+  -0.200012f, 0.220182f,  -0.189220f, -0.554973f, 0.040711f,  -0.058029f,
+  0.043737f,  0.016164f,  -0.391790f, -0.287770f, -0.046545f, 0.045071f,
+  0.190005f,  -0.076963f, 0.836839f,  1.633266f,  0.902928f,  0.991972f,
+  -0.127932f, 0.293680f,  -0.035984f, 0.476179f,  -0.098024f, 0.068314f,
+  -0.058365f, 0.096221f,  -0.000321f, -0.128840f, 0.136441f,  -0.061853f,
+  0.270367f,  -0.184129f, -0.373670f, -0.177381f, 0.262109f,  -0.378013f,
+  -0.053249f, -0.456389f, 0.222972f,  -0.228067f, -0.115210f, -0.277797f,
+  0.096913f,  -0.014512f, -0.015533f, 0.026389f,  -0.360536f, -0.078477f,
+  -0.203186f, 0.199574f,  0.770476f,  0.595592f,  0.360828f,  0.547721f,
+  -0.804787f, 0.389690f,  -0.437645f, 0.576776f,  0.081903f,  0.082750f,
+  0.007166f,  -0.143755f, 0.114462f,  0.472432f,  -0.058974f, 0.077761f,
+  -2.015181f, -0.054942f, -0.110894f, 0.529188f,  -0.003300f, 0.913895f,
+  -0.324643f, 0.316135f,  -0.291729f, 1.072647f,  -0.029236f, 0.045592f,
+  -0.039399f, 0.043472f,  -0.303244f, -0.108761f, -0.011154f, 0.009693f,
+  -0.374985f, 0.027758f,  0.302075f,  -0.295758f, -0.165563f, -0.297259f,
+  -0.485624f, -0.469310f, -0.028247f, -0.124440f, -0.428082f, 0.096325f,
+  0.089003f,  -0.301585f, 0.022474f,  0.077477f,  -0.032233f, -0.231036f,
+  0.143206f,  0.169113f,  -0.556486f, 0.346327f,  -0.667790f, 0.126983f,
+  0.179727f,  0.397307f,  -0.490612f, -1.708789f, -0.040336f, -0.028547f,
+  -0.091319f, -0.119367f, -0.518796f, -0.543383f, 0.037162f,  0.031344f,
+  -0.131692f, 0.119353f,  0.799313f,  0.443848f,  -0.499919f, -1.002983f,
+  0.375477f,  0.221096f,  -0.238033f, 0.284849f,  0.021897f,  0.023338f,
+  -0.059067f, 0.117276f,  0.039540f,  0.049630f,  0.175150f,  0.014166f,
+  -0.071486f, 0.091234f,  -1.007432f, -1.417378f, 0.640528f,  1.442576f,
+  -0.257183f, -0.597016f, 0.861785f,  0.276121f,  -0.098017f, 0.120514f,
+  -0.133184f, 0.106529f,  0.171644f,  0.059513f,  0.215952f,  -0.009441f,
+  -0.505313f, 0.063174f,  0.229148f,  -0.344213f, 0.862721f,  1.549941f,
+  -0.220129f, 0.493094f,  0.264095f,  0.143641f,  0.084968f,  -0.078266f,
+  0.032335f,  -0.019006f, -0.098205f, 0.119213f,  -0.103465f, 0.072811f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer0[16] = {
+  0.111611f,  -0.067682f, 0.633594f,  0.143559f,  -1.051284f, -0.266625f,
+  -0.829789f, -0.956123f, -0.153484f, -0.787741f, 0.004832f,  -0.080769f,
+  0.235166f,  0.449468f,  0.294689f,  -0.395300f,
+};
+
+static const float av1_4_partition_nn_weights_64_layer1[16 * LABEL_SIZE] = {
+  -0.069999f, -0.093710f, -0.423714f, -0.028138f, 0.684415f,  0.141445f,
+  0.507161f,  0.435533f,  -0.263268f, 0.585105f,  0.235301f,  0.127536f,
+  -0.688639f, -0.217993f, -0.540066f, 0.406718f,  0.018210f,  -0.077349f,
+  -0.124823f, -0.488220f, -0.957026f, 0.302632f,  0.285490f,  -0.411356f,
+  0.091089f,  0.103862f,  -0.549291f, 0.148628f,  0.640603f,  -0.601018f,
+  0.178024f,  0.601370f,  0.313780f,  0.051938f,  0.524083f,  0.814631f,
+  -0.415522f, -0.738849f, 0.477881f,  -0.342864f, 0.105181f,  0.040010f,
+  -0.177521f, 0.400646f,  0.167093f,  0.388279f,  -0.898439f, -0.111936f,
+  0.469875f,  -0.099528f, -0.217370f, 0.283742f,  -0.033798f, -0.142797f,
+  -0.174057f, -1.293311f, -0.038777f, -0.003846f, 0.093642f,  -0.527150f,
+  -0.021259f, 0.194651f,  -0.276294f, -0.109514f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+  -0.688947f,
+  0.121075f,
+  0.289597f,
+  0.948091f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_64 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      16,  // num_hidden_nodes
+  },
+  {
+      av1_4_partition_nn_weights_64_layer0,
+      av1_4_partition_nn_weights_64_layer1,
+  },
+  {
+      av1_4_partition_nn_bias_64_layer0,
+      av1_4_partition_nn_bias_64_layer1,
+  },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
index 4f6265617..6d154a7d2 100644
--- a/third_party/aom/av1/encoder/pickcdef.c
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -296,7 +296,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
   int ydec[3];
   int pli;
   int cdef_count;
-  int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+  int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
   uint64_t best_tot_mse = (uint64_t)1 << 63;
   uint64_t tot_mse;
   int sb_count;
@@ -317,8 +317,8 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
   DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
   uint16_t *in;
   DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
-  quantizer =
-      av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
+  quantizer = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth) >>
+              (cm->seq_params.bit_depth - 8);
   lambda = .12 * quantizer * quantizer / 256.;
 
   av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
@@ -361,7 +361,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
 
     for (r = 0; r < frame_height; ++r) {
       for (c = 0; c < frame_width; ++c) {
-        if (cm->use_highbitdepth) {
+        if (cm->seq_params.use_highbitdepth) {
           src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(
               xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
           ref_coeff[pli][r * stride[pli] + c] =
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
index 5f802a707..461c3af83 100644
--- a/third_party/aom/av1/encoder/picklpf.c
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -82,10 +82,8 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
                           plane + 1, partial_frame);
 #endif
 
-  int highbd = 0;
-  highbd = cm->use_highbitdepth;
-
-  filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane, highbd);
+  filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane,
+                               cm->seq_params.use_highbitdepth);
 
   // Re-instate the unfiltered frame
   yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane);
@@ -202,7 +200,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
   } else if (method >= LPF_PICK_FROM_Q) {
     const int min_filter_level = 0;
     const int max_filter_level = av1_get_max_filter_level(cpi);
-    const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth);
+    const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth);
     // These values were determined by linear fitting the result of the
     // searched level for 8 bit depth:
     // Keyframes: filt_guess = q * 0.06699 - 1.60817
@@ -211,7 +209,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     // And high bit depth separately:
     // filt_guess = q * 0.316206 + 3.87252
     int filt_guess;
-    switch (cm->bit_depth) {
+    switch (cm->seq_params.bit_depth) {
       case AOM_BITS_8:
         filt_guess = (cm->frame_type == KEY_FRAME)
                          ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
@@ -229,7 +227,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                "or AOM_BITS_12");
         return;
     }
-    if (cm->bit_depth != AOM_BITS_8 && cm->frame_type == KEY_FRAME)
+    if (cm->seq_params.bit_depth != AOM_BITS_8 && cm->frame_type == KEY_FRAME)
       filt_guess -= 4;
     // TODO(chengchen): retrain the model for Y, U, V filter levels
     lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level);
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
index 93ea09690..28b693b08 100644
--- a/third_party/aom/av1/encoder/pickrst.c
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -163,8 +163,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
   const int is_uv = plane > 0;
   const RestorationInfo *rsi = &cm->rst_info[plane];
   RestorationLineBuffers rlbs;
-  const int bit_depth = cm->bit_depth;
-  const int highbd = cm->use_highbitdepth;
+  const int bit_depth = cm->seq_params.bit_depth;
+  const int highbd = cm->seq_params.use_highbitdepth;
 
   const YV12_BUFFER_CONFIG *fts = cm->frame_to_show;
   // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be
@@ -173,7 +173,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
 
   av1_loop_restoration_filter_unit(
       limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0,
-      is_uv && cm->subsampling_x, is_uv && cm->subsampling_y, highbd, bit_depth,
+      is_uv && cm->seq_params.subsampling_x,
+      is_uv && cm->seq_params.subsampling_y, highbd, bit_depth,
       fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
       rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr);
 
@@ -540,8 +541,8 @@ static void search_sgrproj(const RestorationTileLimits *limits,
 
   const MACROBLOCK *const x = rsc->x;
   const AV1_COMMON *const cm = rsc->cm;
-  const int highbd = cm->use_highbitdepth;
-  const int bit_depth = cm->bit_depth;
+  const int highbd = cm->seq_params.use_highbitdepth;
+  const int bit_depth = cm->seq_params.bit_depth;
 
   uint8_t *dgd_start =
       rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start;
@@ -549,8 +550,8 @@ static void search_sgrproj(const RestorationTileLimits *limits,
       rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start;
 
   const int is_uv = rsc->plane > 0;
-  const int ss_x = is_uv && cm->subsampling_x;
-  const int ss_y = is_uv && cm->subsampling_y;
+  const int ss_x = is_uv && cm->seq_params.subsampling_x;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
   const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
   const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
 
@@ -1067,7 +1068,7 @@ static void search_wiener(const RestorationTileLimits *limits,
   double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN];
 
   const AV1_COMMON *const cm = rsc->cm;
-  if (cm->use_highbitdepth) {
+  if (cm->seq_params.use_highbitdepth) {
     compute_stats_highbd(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
                          limits->h_start, limits->h_end, limits->v_start,
                          limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
@@ -1149,7 +1150,7 @@ static void search_norestore(const RestorationTileLimits *limits,
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
-  const int highbd = rsc->cm->use_highbitdepth;
+  const int highbd = rsc->cm->seq_params.use_highbitdepth;
   rusi->sse[RESTORE_NONE] = sse_restoration_unit(
       limits, rsc->src, rsc->cm->frame_to_show, rsc->plane, highbd);
 
@@ -1280,7 +1281,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
     double best_cost = 0;
     RestorationType best_rtype = RESTORE_NONE;
 
-    const int highbd = rsc.cm->use_highbitdepth;
+    const int highbd = rsc.cm->seq_params.use_highbitdepth;
     extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
                  rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
                  highbd);
diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h
index ef333b6d8..42a4c590b 100644
--- a/third_party/aom/av1/encoder/pustats.h
+++ b/third_party/aom/av1/encoder/pustats.h
@@ -18,91 +18,79 @@ extern "C" {
 
 #include "av1/encoder/ml.h"
 
-#define NUM_FEATURES 20
+#define NUM_FEATURES 11
 #define NUM_HIDDEN_LAYERS 2
-#define HIDDEN_LAYERS_0_NODES 10
+#define HIDDEN_LAYERS_0_NODES 12
 #define HIDDEN_LAYERS_1_NODES 10
 #define LOGITS_NODES 1
 
 static const float
     av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES *
                                           HIDDEN_LAYERS_0_NODES] = {
-      13.8498f,  19.6630f,   13.3036f,  5.2448f,   -18.0270f,  21.6671f,
-      -0.2135f,  -0.0060f,   0.1211f,   -0.3549f,  -0.3550f,   0.0190f,
-      0.0167f,   -0.1192f,   0.2003f,   8.6663f,   32.0264f,   9.9558f,
-      9.0935f,   -110.4994f, 51.8056f,  64.8041f,  58.5392f,   53.0189f,
-      -61.6300f, 4.7540f,    -0.0140f,  0.0185f,   -15.8050f,  0.0790f,
-      0.0707f,   0.0784f,    0.0766f,   -0.3030f,  0.0392f,    49.3312f,
-      63.3326f,  61.4025f,   54.2723f,  -62.2769f, -147.1736f, -84.9432f,
-      -82.5422f, -70.4857f,  46.7622f,  -1.0285f,  -0.4809f,   0.0068f,
-      1.0888f,   -0.0515f,   -0.0384f,  -0.0232f,  -0.0396f,   0.2429f,
-      0.2040f,   -144.4016f, -88.0868f, -80.3134f, -70.6685f,  66.8528f,
-      -53.8097f, -45.4011f,  -52.8680f, -58.7226f, 99.7830f,   2.3728f,
-      0.0229f,   0.0002f,    -0.3288f,  -0.0563f,  -0.0550f,   -0.0552f,
-      -0.0563f,  0.2214f,    0.0139f,   -60.8965f, -45.5251f,  -50.4188f,
-      -51.5623f, 85.7369f,   77.3415f,  47.4930f,  53.8120f,   58.2311f,
-      -45.9650f, -2.4938f,   0.1639f,   -0.5270f,  -75.4622f,  -0.0026f,
-      0.0031f,   0.0047f,    0.0015f,   0.0092f,   0.0654f,    75.6402f,
-      54.7447f,  54.8156f,   52.6834f,  -9.1246f,  -34.0108f,  -35.6423f,
-      -34.2911f, -38.5444f,  72.1123f,  10.9750f,  -0.1595f,   0.1983f,
-      22.5724f,  -0.0556f,   -0.0618f,  -0.0571f,  -0.0608f,   0.2439f,
-      -0.0805f,  -32.5107f,  -28.9688f, -33.7284f, -48.1365f,  61.5297f,
-      39.2492f,  -35.1928f,  -11.5000f, 7.7038f,   -94.2469f,  13.5586f,
-      0.7541f,   0.0105f,    4.4041f,   0.1799f,   0.1339f,    0.1567f,
-      -0.6668f,  -0.7384f,   0.2185f,   17.1700f,  -26.4601f,  -1.8970f,
-      38.9635f,  -30.1916f,  31.8139f,  14.6157f,  10.0565f,   3.3340f,
-      -40.6985f, -2.1186f,   0.0116f,   0.0962f,   0.7115f,    -1.4071f,
-      -1.3701f,  -1.4728f,   -1.3404f,  -1.7286f,  5.5632f,    28.4998f,
-      5.4087f,   16.2668f,   11.8693f,  -39.4153f, 106.3281f,  38.3075f,
-      39.4933f,  47.3805f,   -15.0514f, -21.2421f, -0.2358f,   -0.0024f,
-      0.3505f,   -0.0429f,   -0.0377f,  -0.0322f,  -0.0344f,   0.2020f,
-      0.1417f,   99.6711f,   35.3896f,  43.1117f,  59.8879f,   -17.8250f,
-      -16.6976f, 18.5100f,   6.3383f,   25.3020f,  -55.8824f,  25.1027f,
-      -0.9926f,  -0.0738f,   -1.4892f,  0.0269f,   -0.0051f,   -5.8168f,
-      -0.0579f,  -0.1500f,   0.7224f,   8.3066f,   -3.8805f,   -12.1482f,
-      14.3492f,  -20.8118f,
+      21.5067f,  22.6709f,  0.0049f,   0.9288f,  -0.0100f,  0.0060f,   -0.0071f,
+      -0.0085f,  0.0348f,   -0.1273f,  10.1154f, 6.3405f,   7.8589f,   -0.0652f,
+      -4.6352f,  0.0445f,   -3.2748f,  0.1025f,  -0.0385f,  -0.4505f,  1.1320f,
+      3.2634f,   23.2420f,  -7.9056f,  0.0522f,  -18.1555f, 0.0977f,   0.1155f,
+      -0.0138f,  0.0267f,   -0.3992f,  0.2735f,  22.8063f,  35.1043f,  3.8140f,
+      -0.0295f,  0.0771f,   -0.6938f,  0.0302f,  -0.0266f,  0.0989f,   -0.0794f,
+      0.2981f,   33.3333f,  -24.1150f, 1.4986f,  -0.0975f,  -15.3938f, -0.0858f,
+      -0.0845f,  -0.0869f,  -0.0858f,  0.3542f,  0.0155f,   -18.2629f, 9.6688f,
+      -11.9643f, -0.2904f,  -5.3026f,  -0.1011f, -0.1202f,  0.0127f,   -0.0269f,
+      0.3434f,   0.0595f,   16.6800f,  41.4730f, 6.9269f,   -0.0512f,  -1.4540f,
+      0.0468f,   0.0077f,   0.0983f,   0.1265f,  -0.5234f,  0.9477f,   36.6470f,
+      -0.4838f,  -0.2269f,  -0.1143f,  -0.3907f, -0.5005f,  -0.0179f,  -0.1057f,
+      0.1233f,   -0.4412f,  -0.0474f,  0.1140f,  -21.6813f, -0.9077f,  -0.0078f,
+      -3.3306f,  0.0417f,   0.0412f,   0.0427f,  0.0418f,   -0.1699f,  0.0072f,
+      -22.3335f, 16.1203f,  -10.1220f, -0.0019f, 0.0005f,   -0.0054f,  -0.0155f,
+      -0.0302f,  -0.0379f,  0.1276f,   0.1568f,  21.6175f,  12.2919f,  11.0327f,
+      -0.2000f,  -8.6691f,  -0.5593f,  -0.5952f, -0.4203f,  -0.4857f,  -1.1239f,
+      3.1404f,   -13.1098f, -5.9165f,  22.2060f, -0.0312f,  -3.9642f,  -0.0344f,
+      -0.0656f,  -0.0273f,  -0.0465f,  0.1412f,  -6.1974f,  9.3661f,
     };
 
 static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
     {
-      17.6566f,  62.2217f, -107.2644f, -56.2255f, 68.2252f,
-      -37.5662f, 9.587f,   18.5206f,   69.6873f,  4.3903f,
+      -14.3065f, 2.059f,   -62.9916f, -50.1209f, 57.643f,  -59.3737f,
+      -30.4737f, -0.1112f, 72.5427f,  55.402f,   24.9523f, 18.5834f,
     };
 
 static const float
     av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
                                           HIDDEN_LAYERS_1_NODES] = {
-      -0.0494f, 0.3505f,   -0.0461f, -1.3451f, 0.0198f,  -0.0746f, -0.2217f,
-      -0.9525f, 0.0633f,   -0.0737f, -0.3568f, 1.8569f,  -0.0189f, -1.8269f,
-      0.6281f,  -1.3266f,  -0.9202f, 2.8978f,  -0.6437f, -0.8709f, -1.5066f,
-      -1.0582f, -1.9509f,  -0.0417f, -0.1315f, -0.3368f, 0.0014f,  -0.5734f,
-      -1.4640f, -1.6042f,  3.3911f,  -1.6815f, -1.9026f, -4.8702f, -0.1012f,
-      -1.4517f, -3.2156f,  0.8448f,  0.2331f,  -0.1593f, 2.6627f,  -0.8451f,
-      -1.7382f, 0.9303f,   2.3003f,  -0.0659f, 0.5772f,  0.4253f,  0.2083f,
-      0.3649f,  -0.9198f,  -0.2183f, -0.5381f, -1.0831f, 2.0359f,  0.0040f,
-      -0.0871f, -0.1715f,  2.2453f,  0.5099f,  -0.5900f, -0.6313f, -1.3028f,
-      -1.7257f, 1.4130f,   -0.7189f, -0.4336f, 1.9266f,  1.7495f,  -0.3321f,
-      0.2827f,  0.4015f,   -0.5044f, -1.0420f, -0.1258f, -0.0342f, -0.1190f,
-      -3.1263f, 0.7485f,   -0.3161f, -0.2224f, 2.5533f,  -0.2121f, -1.3389f,
-      0.5556f,  -0.9407f,  -0.7456f, 1.4137f,  -0.0353f, -0.0521f, 2.4382f,
-      0.1493f,  -11.5631f, -1.6178f, 3.5538f,  -3.6538f, -0.5972f, -3.0038f,
-      -2.1640f, 0.5754f,
+      0.3883f,  -0.2784f, -0.2850f, 0.4894f,  -2.2450f, 0.4511f,  -0.1969f,
+      -0.0077f, -1.4924f, 0.1138f,  -2.9848f, 1.0211f,  -0.1712f, -0.1952f,
+      -0.4774f, 0.0761f,  -0.3186f, -0.1002f, 0.8663f,  0.5026f,  1.1920f,
+      0.9337f,  0.3911f,  -0.3841f, -0.0037f, 0.7295f,  -0.3183f, 0.1829f,
+      -1.3670f, -0.1046f, 0.6629f,  0.0619f,  -0.1551f, 0.8174f,  2.1521f,
+      -1.3323f, -0.0527f, -0.5772f, 0.2001f,  -0.6270f, -1.0625f, 0.3342f,
+      0.6676f,  0.4605f,  -2.0049f, 0.7781f,  0.0713f,  -0.0824f, -0.4529f,
+      0.1757f,  -0.1338f, -0.2319f, -0.2864f, 0.1248f,  0.3887f,  -0.1676f,
+      1.8422f,  0.6435f,  1.2123f,  -0.5667f, -0.2423f, -0.0314f, 0.2411f,
+      -0.5013f, 0.0422f,  0.2559f,  0.4435f,  -0.1223f, 1.5167f,  0.3939f,
+      1.0898f,  0.0795f,  -0.9251f, -0.0813f, -0.5929f, -0.0741f, 4.0687f,
+      -0.4368f, -0.0984f, 0.0837f,  3.6169f,  0.0662f,  -0.1679f, -0.8090f,
+      -0.2610f, -0.5791f, 0.0642f,  -0.2979f, -0.9036f, 0.2898f,  0.3265f,
+      0.4660f,  -1.6358f, -0.0347f, 0.1087f,  0.0353f,  0.5687f,  -0.5242f,
+      -0.4895f, 0.7693f,  -1.3829f, -0.2244f, -0.2880f, 0.0575f,  2.0563f,
+      -0.2322f, -1.1597f, 1.6125f,  -0.0925f, 1.3540f,  0.1432f,  0.3993f,
+      -0.0303f, -1.1438f, -1.7323f, -0.4329f, 2.9443f,  -0.5724f, 0.0122f,
+      -1.0829f,
     };
 
 static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
     {
-      69.1995f, 41.7369f, -1.4885f, -35.785f, 26.1678f,
-      58.4472f, 36.2223f, 66.327f,  50.8867f, 2.8306f,
+      -10.3717f, 37.304f,  -36.7221f, -52.7572f, 44.0877f,
+      41.1631f,  36.3299f, -48.6087f, -4.5189f,  13.0611f,
     };
 
 static const float
     av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
-      1.811f,  0.9009f, 0.0694f, -0.9985f, -0.039f,
-      0.2076f, 0.5643f, 0.5408f, 0.6071f,  0.277f,
+      0.8362f, 1.0615f, -1.5178f, -1.2959f, 1.3233f,
+      1.4909f, 1.3554f, -0.8626f, -0.618f,  -0.9458f,
     };
 
 static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = {
-  39.5529f,
+  30.6878f,
 };
 
 static const NN_CONFIG av1_pustats_rate_nnconfig = {
@@ -125,78 +113,70 @@ static const NN_CONFIG av1_pustats_rate_nnconfig = {
 static const float
     av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES *
                                           HIDDEN_LAYERS_0_NODES] = {
-      -39.0787f,  -212.9998f, -174.2088f, -264.1454f, 292.7151f,  -60.8750f,
-      -5.9915f,   0.0712f,    -60.2312f,  -0.2020f,   -0.2135f,   -0.1663f,
-      -0.0711f,   0.2267f,    0.9152f,    -36.1294f,  -159.9320f, -222.9809f,
-      -270.2556f, 300.7162f,  159.9224f,  -172.5735f, -7.6852f,   54.3985f,
-      110.6721f,  19.2907f,   -15.1039f,  -0.0457f,   0.3289f,    0.4529f,
-      -8.2222f,   1.3213f,    -0.8378f,   -0.2605f,   3.9600f,    17.3407f,
-      113.1116f,  34.6326f,   11.6688f,   109.3541f,  240.8123f,  45.0615f,
-      80.7443f,   39.2500f,   -21.0931f,  -27.1989f,  -0.4264f,   -0.1345f,
-      1.6269f,    -0.0716f,   0.0989f,    -0.1382f,   0.0248f,    0.0913f,
-      4.3903f,    244.1014f,  32.2567f,   58.6171f,   62.2273f,   -2.8647f,
-      -227.5659f, 16.0031f,   -70.5256f,  23.8071f,   290.7356f,  13.6094f,
-      -2.1842f,   0.0104f,    -2.8760f,   0.3708f,    0.8501f,    -3.2964f,
-      -0.2088f,   -0.4474f,   1.2248f,    40.5180f,   -130.7891f, -188.1583f,
-      -174.0906f, 205.9622f,  0.3425f,    0.2531f,    0.2822f,    0.0488f,
-      0.1416f,    -0.0433f,   -0.1195f,   -0.0413f,   -0.0708f,   -0.0787f,
-      -0.0889f,   -0.4022f,   -0.5055f,   -0.4715f,   0.2315f,    0.1021f,
-      -0.3676f,   -0.3499f,   -0.0715f,   0.1913f,    205.7521f,  125.2265f,
-      92.0640f,   77.5566f,   -164.4280f, -19.3715f,  -0.1346f,   -0.4060f,
-      0.5042f,    -0.2395f,   -0.1329f,   -0.1397f,   0.2175f,    0.2895f,
-      5.5019f,    198.9799f,  114.0018f,  94.9015f,   86.8434f,   -183.4237f,
-      121.5626f,  94.8945f,   65.0803f,   93.6487f,   -346.5279f, -47.6168f,
-      0.0633f,    0.0135f,    -0.0692f,   -0.1015f,   -0.1146f,   -0.1341f,
-      -0.1175f,   0.4186f,    0.1505f,    130.7402f,  107.8443f,  62.8497f,
-      65.3501f,   -312.7407f, 282.8321f,  98.1531f,   75.6648f,   25.8733f,
-      -176.9298f, -37.2695f,  -0.3760f,   0.0017f,    0.1030f,    -0.1483f,
-      0.0787f,    -0.0962f,   0.4109f,    -0.2292f,   9.1681f,    274.3607f,
-      60.9538f,   75.9405f,   68.3776f,   -167.3098f, -335.1045f, -69.2583f,
-      -76.3441f,  -16.5793f,  218.5244f,  28.2405f,   0.9169f,    -0.0026f,
-      -0.8077f,   -1.5756f,   -0.0804f,   0.1404f,    1.2656f,    0.0272f,
-      -0.2529f,   -340.8659f, -112.7778f, -58.3890f,  -4.1224f,   108.1709f,
-      -180.7382f, -93.7114f,  -77.8686f,  -131.8134f, 353.3893f,  4.8233f,
-      0.0205f,    0.0000f,    -1.1654f,   -0.0161f,   -0.0255f,   -0.0358f,
-      -0.0412f,   0.1103f,    0.1041f,    -188.9934f, -110.1792f, -88.6301f,
-      -93.7226f,  336.9746f,
+      0.7770f,   1.0881f,  0.0177f,  0.4939f,  -0.2541f, -0.2672f, -0.1705f,
+      -0.1940f,  -0.6395f, 1.2928f,  3.6240f,  2.4445f,  1.6790f,  0.0265f,
+      0.1897f,   0.1776f,  0.0422f,  0.0197f,  -0.0466f, 0.0462f,  -1.0827f,
+      2.0231f,   1.8044f,  2.7022f,  0.0064f,  0.2255f,  -0.0552f, -0.1010f,
+      -0.0581f,  -0.0781f, 0.2614f,  -3.4085f, 1.7478f,  0.1155f,  -0.1458f,
+      -0.0031f,  -0.1797f, -0.4378f, -0.0539f, 0.0607f,  -0.1347f, -0.3142f,
+      -0.2014f,  -0.4484f, -0.2808f, 1.5913f,  0.0046f,  -0.0610f, -0.6479f,
+      -0.7278f,  -0.5592f, -0.6695f, -0.8120f, 2.9056f,  -1.1501f, 9.3618f,
+      4.2486f,   0.0011f,  -0.1499f, -0.0834f, 0.1282f,  0.0409f,  0.1670f,
+      -0.1398f,  -0.4661f, 13.7700f, 8.2061f,  -0.0685f, 0.0061f,  -0.2951f,
+      0.0169f,   0.0520f,  0.0040f,  0.0374f,  0.0467f,  -0.0107f, 14.2664f,
+      -2.2489f,  -0.2516f, -0.0061f, -0.9921f, 0.1223f,  0.1212f,  0.1199f,
+      0.1185f,   -0.4867f, 0.0325f,  -5.0757f, -8.7853f, 1.0450f,  0.0169f,
+      0.5462f,   0.0051f,  0.1330f,  0.0143f,  0.1429f,  -0.0258f, 0.2769f,
+      -12.8839f, 22.3093f, 1.2761f,  0.0037f,  -1.2459f, -0.0466f, 0.0003f,
+      -0.0464f,  -0.0067f, 0.2361f,  0.0355f,  23.3833f, 10.9218f, 2.6811f,
+      0.0222f,   -1.1055f, 0.1825f,  0.0575f,  0.0114f,  -0.1259f, 0.3148f,
+      -2.0047f,  11.9559f, 5.7375f,  0.8802f,  0.0042f,  -0.2469f, -0.1040f,
+      -1.5679f,  0.1969f,  -0.0184f, 0.0157f,  0.6688f,  3.4492f,
     };
 
 static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
-    { -175.6918f, 43.4519f,  154.196f, -81.1015f,  -0.0758f,
-      136.5695f,  110.8713f, 142.029f, -153.0901f, -145.2688f };
+    {
+      4.5051f,  -4.5858f, 1.4693f, 0.f,      3.7968f, -3.6292f,
+      -7.3112f, 10.9743f, 8.027f,  -2.2692f, -8.748f, -1.3689f,
+    };
 
 static const float
     av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
                                           HIDDEN_LAYERS_1_NODES] = {
-      -0.1727f, -0.2859f,  -0.3757f, -0.4260f,  -0.5441f, -0.0666f, -0.3792f,
-      -0.1335f, -0.1521f,  -0.0821f, -3.1590f,  0.2711f,  0.5889f,  0.0878f,
-      0.4693f,  0.7773f,   -9.2989f, 0.0414f,   0.4485f,  22.8958f, -3.7024f,
-      -2.4672f, -43.2908f, 0.0956f,  0.4431f,   2.3429f,  1.7183f,  0.3985f,
-      -0.2275f, -3.1583f,  -0.3485f, 0.3280f,   0.3763f,  0.2069f,  0.4231f,
-      0.7366f,  -6.9527f,  0.0713f,  0.1359f,   16.6500f, -1.7655f, -0.1651f,
-      0.1280f,  -0.2678f,  -0.2120f, 1.6243f,   1.8773f,  -0.7543f, -0.3292f,
-      -0.7627f, -0.2001f,  -0.1125f, -0.8100f,  -0.1866f, 0.0567f,  -0.4002f,
-      3.2429f,  0.6427f,   -0.3759f, -11.6518f, -2.2893f, 0.7708f,  -1.8637f,
-      1.7148f,  0.3124f,   -0.7129f, -0.4927f,  0.1964f,  -0.2570f, -25.0783f,
-      2.5061f,  0.1457f,   -1.1239f, 0.0570f,   -0.2526f, -0.0669f, 0.6791f,
-      1.1531f,  -0.7246f,  -0.3180f, -0.0015f,  -0.0061f, -0.1626f, -0.0181f,
-      0.1271f,  -0.0140f,  -0.6027f, 0.0736f,   -0.0157f, 1.2420f,  -6.4055f,
-      0.2128f,  -0.0386f,  0.3446f,  0.1840f,   -0.7208f, -1.6979f, -0.0442f,
-      0.3230f,  -1.9745f,
+      -0.0182f, -0.0925f, -0.0311f, -0.2962f, 0.1177f,  -0.0027f, -0.2136f,
+      -1.2094f, 0.0935f,  -0.1403f, -0.1477f, -0.0752f, 0.1519f,  -0.4726f,
+      -0.3521f, 0.4199f,  -0.0168f, -0.2927f, -0.2510f, 0.0706f,  -0.2920f,
+      0.2046f,  -0.0400f, -0.2114f, 0.4240f,  -0.7070f, 0.4964f,  0.4471f,
+      0.3841f,  -0.0918f, -0.6140f, 0.6056f,  -0.1123f, 0.3944f,  -0.0178f,
+      -1.7702f, -0.4434f, 0.0560f,  0.1565f,  -0.0793f, -0.0041f, 0.0052f,
+      -0.1843f, 0.2400f,  -0.0605f, 0.3196f,  -0.0286f, -0.0002f, -0.0595f,
+      -0.0493f, -0.2636f, -0.3994f, -0.1871f, -0.3298f, -0.0788f, -1.0685f,
+      0.1900f,  -0.5549f, -0.1350f, -0.0153f, -0.1195f, -0.5874f, 1.0468f,
+      0.0212f,  -0.2306f, -0.2677f, -0.3000f, -1.0702f, -0.1725f, -0.0656f,
+      -0.0226f, 0.0616f,  -0.3453f, 0.0810f,  0.4838f,  -0.3780f, -1.4486f,
+      0.7777f,  -0.0459f, -0.6568f, 0.0589f,  -1.0286f, -0.6001f, 0.0826f,
+      0.4794f,  -0.0586f, -0.1759f, 0.3811f,  -0.1313f, 0.3829f,  -0.0968f,
+      -2.0445f, -0.3566f, -0.1491f, -0.0745f, -0.0202f, 0.0839f,  0.0470f,
+      -0.2432f, 0.3013f,  -0.0743f, -0.3479f, 0.0749f,  -5.2490f, 0.0209f,
+      -0.1653f, -0.0826f, -0.0535f, 0.3225f,  -0.3786f, -0.0104f, 0.3091f,
+      0.3652f,  0.1757f,  -0.3252f, -1.1022f, -0.0574f, -0.4473f, 0.3469f,
+      -0.5539f,
     };
 
 static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
-    { 0.f,      70.3414f, 9.6036f,   -118.1096f, 49.2507f,
-      95.1849f, 81.8015f, 167.0967f, -337.7945f, 169.8344f };
+    {
+      11.9337f, -0.3681f, -6.1324f,  12.674f,  9.0956f,
+      4.6069f,  -4.4158f, -12.4848f, 10.8473f, 5.7633f,
+    };
 
 static const float
     av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
-      -0.3627f, 1.2272f,  0.2201f, -1.7406f, -0.6885f,
-      0.8487f,  -0.2761f, 0.7731f, -5.2096f, -0.7351f,
+      0.3245f,  0.2979f,  -0.157f,  -0.1441f, 0.1413f,
+      -0.7496f, -0.1737f, -0.5322f, 0.0748f,  0.2518f,
     };
 
 static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
-  48.2331f,
+  4.6065f,
 };
 
 static const NN_CONFIG av1_pustats_dist_nnconfig = {
diff --git a/third_party/aom/av1/encoder/rate_distortion_model_params.h b/third_party/aom/av1/encoder/rate_distortion_model_params.h
new file mode 100644
index 000000000..14d23f10f
--- /dev/null
+++ b/third_party/aom/av1/encoder/rate_distortion_model_params.h
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#define AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// 22 float features +
+// 2 categorical features with 4 possible values, converted to one-hot vectors.
+// So, total 22 + 2 * 4 = 30 features.
+#define NUM_FEATURES 30
+#define NUM_HIDDEN_LAYERS 1
+#define NUM_HIDDEN_NODES 96
+#define NUM_OUTPUTS 1
+
+//------------------------------------------------------------------------------
+// RDCost model
+
+static const float
+    av1_rdcost_model_nn_weights_layer0[NUM_FEATURES * NUM_HIDDEN_NODES] = {
+      -0.0699f,   0.2790f,    0.1915f,    0.2669f,    0.4637f,    0.4095f,
+      0.2129f,    0.0634f,    0.2306f,    -0.2232f,   -0.5711f,   -0.6493f,
+      -0.7406f,   -0.8440f,   0.4105f,    0.1392f,    0.5218f,    -0.1618f,
+      -0.1719f,   0.3409f,    0.1111f,    -0.3609f,   -0.2929f,   0.3869f,
+      -0.5373f,   0.0700f,    0.2572f,    0.2483f,    -0.0314f,   0.5228f,
+      0.0169f,    -0.1357f,   0.0419f,    -0.1722f,   0.1303f,    0.1198f,
+      -0.0013f,   0.1309f,    0.0293f,    -0.1941f,   0.0668f,    -0.0643f,
+      -0.0381f,   0.1249f,    -0.0731f,   -0.1649f,   0.0964f,    0.0270f,
+      0.1354f,    0.0538f,    -0.2064f,   -0.2067f,   -0.0569f,   0.0449f,
+      0.1680f,    -0.0732f,   -0.0785f,   0.1884f,    -0.2137f,   -0.0189f,
+      0.2976f,    0.2818f,    -0.0222f,   0.2658f,    0.0488f,    0.2778f,
+      -0.1110f,   0.2069f,    -0.0072f,   -0.0095f,   -0.1105f,   -0.1365f,
+      -0.4245f,   -0.4751f,   -0.0736f,   0.2333f,    0.0653f,    -0.0249f,
+      0.0055f,    -0.0838f,   -0.0489f,   -0.2597f,   0.2621f,    -0.0251f,
+      -0.0545f,   0.0816f,    -0.0816f,   0.3396f,    -0.1047f,   0.3678f,
+      0.1487f,    -0.0270f,   0.2574f,    0.1018f,    0.2560f,    -0.0598f,
+      -0.0446f,   -0.1792f,   0.5336f,    -0.1590f,   -0.9820f,   -0.6514f,
+      -0.6304f,   -0.8359f,   -0.0699f,   0.0295f,    -0.0057f,   -0.3088f,
+      -0.1466f,   0.2220f,    -0.1980f,   -0.3400f,   -0.1228f,   0.2667f,
+      -0.4816f,   0.0155f,    -0.0194f,   0.2051f,    0.0513f,    0.1575f,
+      -121.4240f, -126.6840f, -124.1106f, -127.6184f, -85.0333f,  -26.6396f,
+      2.7020f,    102.0452f,  -85.5128f,  0.0076f,    122.2206f,  107.5265f,
+      108.3773f,  93.4847f,   20.3705f,   -89.6993f,  -176.9070f, -41.7543f,
+      -123.0293f, -91.6437f,  -205.7099f, -62.5346f,  -83.2987f,  21.3830f,
+      56.6341f,   -120.8647f, -127.7562f, -121.6688f, -127.4225f, -74.8045f,
+      -15.9247f,  -14.6468f,  -14.7788f,  -15.4498f,  -18.5514f,  -11.1579f,
+      -5.8164f,   -3.4318f,   0.8100f,    0.0642f,    203.5111f,  189.6872f,
+      190.4776f,  176.4784f,  -4.9427f,   -12.5324f,  -7.6861f,   21.9182f,
+      -6.7864f,   -7.1906f,   -8.1292f,   21.4780f,   -7.8016f,   -5.2653f,
+      61.8526f,   -15.5105f,  -14.6900f,  -14.1459f,  -15.4350f,  -19.1379f,
+      -0.7876f,   -1.8558f,   -4.6035f,   -6.8405f,   -0.2904f,   2.3202f,
+      1.8127f,    -2.9397f,   -0.8187f,   -0.6098f,   22.6173f,   10.3668f,
+      12.9363f,   2.4541f,    6.6700f,    0.3804f,    -3.3117f,   8.5464f,
+      -25.8348f,  1.8698f,    -9.5753f,   8.5558f,    -16.3576f,  7.2217f,
+      35.3115f,   -1.1447f,   -2.6530f,   -4.7027f,   -5.7024f,   -0.9513f,
+      0.8393f,    0.7085f,    0.7879f,    0.3728f,    3.0574f,    1.1360f,
+      26.0531f,   4.1084f,    -1.7340f,   0.1683f,    -450.7927f, -444.5818f,
+      -442.5239f, -438.1168f, 2.4924f,    -0.0147f,   -0.0797f,   -47.5322f,
+      -1.7638f,   -0.8608f,   -0.6500f,   -44.4326f,  -0.9027f,   2.5560f,
+      -267.6517f, 0.2642f,    0.9457f,    0.7944f,    0.3609f,    3.2742f,
+      -74.3400f,  -81.6894f,  -76.2162f,  -69.2979f,  -90.2476f,  -39.7389f,
+      2.2545f,    36.5095f,   -60.1129f,  -1.0383f,   87.0348f,   83.9940f,
+      83.7199f,   80.8609f,   14.9075f,   -78.7405f,  -74.3549f,  -4.2382f,
+      -23.9739f,  -91.8469f,  -67.2654f,  -21.5293f,  -9.9857f,   11.8391f,
+      35.8223f,   -74.2551f,  -81.0729f,  -73.8347f,  -70.3798f,  -86.8052f,
+      0.1701f,    -0.1136f,   0.0060f,    -0.0496f,   -0.1727f,   0.0195f,
+      -0.1040f,   0.1027f,    0.0467f,    -0.2538f,   -0.1322f,   0.0860f,
+      0.0093f,    -0.2801f,   -0.0958f,   0.0497f,    -0.0582f,   -0.0311f,
+      0.1840f,    0.0752f,    0.0282f,    0.0297f,    0.0607f,    0.0650f,
+      0.0893f,    0.1297f,    0.0373f,    0.0040f,    -0.0973f,   0.0248f,
+      -0.1419f,   0.0322f,    -0.0712f,   0.0860f,    -0.0426f,   -0.1989f,
+      0.1393f,    -0.1183f,   0.0735f,    -0.1895f,   0.1447f,    -0.0056f,
+      -0.1833f,   0.0884f,    0.0949f,    0.0476f,    0.0551f,    0.2125f,
+      -0.1537f,   -0.0141f,   -0.2182f,   0.1567f,    0.0457f,    -0.1485f,
+      -0.1177f,   0.0391f,    0.1982f,    -0.1288f,   0.1165f,    -0.2019f,
+      0.4550f,    0.5179f,    0.4311f,    0.1861f,    0.6199f,    0.4542f,
+      0.2034f,    0.1128f,    1.3489f,    -0.2525f,   -2.1139f,   -2.2444f,
+      -2.3679f,   -2.3378f,   0.5682f,    0.1348f,    0.3032f,    -1.5835f,
+      0.2883f,    0.1693f,    0.0439f,    -1.4556f,   0.3818f,    0.4875f,
+      -1.8899f,   0.2510f,    0.6450f,    0.6082f,    0.5962f,    0.8131f,
+      12.0281f,   13.3899f,   13.6249f,   15.8068f,   -1.5453f,   6.7456f,
+      -6.0877f,   26.2596f,   6.2223f,    -0.5922f,   134.1428f,  128.8985f,
+      128.7538f,  123.0920f,  1.3207f,    18.3069f,   15.7436f,   46.5230f,
+      24.7455f,   15.0688f,   19.9965f,   34.7236f,   19.7171f,   1.2018f,
+      49.7274f,   11.8957f,   13.1578f,   14.0451f,   15.3544f,   -3.5601f,
+      1.0048f,    0.9479f,    1.1832f,    2.0635f,    -2.9808f,   2.0803f,
+      -7.5815f,   8.4733f,    -4.2008f,   0.1217f,    226.5257f,  210.7018f,
+      211.6235f,  195.2605f,  0.8283f,    1.0977f,    1.4858f,    41.1242f,
+      1.5822f,    0.8742f,    2.0440f,    33.6213f,   1.6177f,    0.9661f,
+      65.0014f,   1.4197f,    1.0109f,    1.3153f,    1.5470f,    -3.2833f,
+      2.0858f,    2.0012f,    2.1088f,    2.5593f,    -0.9422f,   1.8554f,
+      -6.5378f,   0.6780f,    2.3186f,    0.0506f,    218.3285f,  203.4055f,
+      204.0362f,  188.7854f,  0.3701f,    2.5257f,    3.5172f,    28.8144f,
+      2.1511f,    3.4676f,    2.6337f,    28.5113f,   2.4254f,    -0.0548f,
+      59.4511f,   2.0757f,    2.1551f,    2.2271f,    2.5300f,    -1.4173f,
+      91.9240f,   88.2142f,   83.6155f,   82.2482f,   -9.2566f,   10.9654f,
+      -2.6974f,   62.6750f,   -3.6298f,   -0.1245f,   69.6721f,   67.1340f,
+      66.9162f,   64.1994f,   -83.6778f,  76.8107f,   69.7832f,   64.9261f,
+      68.4901f,   76.3615f,   70.8108f,   63.5435f,   69.1973f,   -83.6034f,
+      24.8275f,   90.1923f,   87.6831f,   82.9783f,   81.8558f,   -7.1010f,
+      95.1656f,   88.3853f,   80.5835f,   79.5990f,   -3.0720f,   8.1290f,
+      -0.6151f,   63.6425f,   -4.5833f,   -0.0063f,   70.1861f,   66.6250f,
+      66.6148f,   63.0886f,   -89.2863f,  74.7684f,   64.8897f,   60.4134f,
+      62.5241f,   78.7076f,   61.7234f,   60.1688f,   61.9509f,   -89.4098f,
+      30.3361f,   92.9144f,   88.5954f,   79.6336f,   79.2453f,   -0.4101f,
+      0.6287f,    0.8050f,    0.4417f,    0.5419f,    0.5972f,    1.3037f,
+      0.4316f,    -0.0013f,   -0.3673f,   -0.4952f,   6.1773f,    5.7825f,
+      6.1705f,    5.3848f,    1.7607f,    -0.0152f,   -0.2924f,   0.8199f,
+      1.3326f,    0.7197f,    -0.6332f,   1.1127f,    1.0472f,    1.8468f,
+      3.4419f,    0.8233f,    0.7175f,    0.8514f,    0.6372f,    0.9472f,
+      -0.0813f,   -0.0197f,   -0.0096f,   -0.2015f,   0.1133f,    -0.0305f,
+      0.0578f,    0.1375f,    -0.0750f,   -0.1702f,   0.1246f,    -0.1782f,
+      0.2017f,    0.0425f,    -0.0602f,   0.1837f,    0.1044f,    -0.1273f,
+      -0.1431f,   0.0672f,    -0.1807f,   -0.1045f,   -0.1355f,   -0.0497f,
+      -0.0561f,   -0.0633f,   0.1907f,    -0.0777f,   0.1203f,    0.0754f,
+      0.4079f,    0.2001f,    0.0558f,    0.0622f,    0.2951f,    0.6541f,
+      -0.0068f,   0.1070f,    0.4469f,    -0.1266f,   -1.3035f,   -1.3324f,
+      -1.3612f,   -0.9966f,   0.7986f,    0.3192f,    -0.5028f,   -0.3844f,
+      -0.4079f,   0.6690f,    -0.5109f,   -0.2719f,   -0.4958f,   1.0310f,
+      -0.8044f,   0.1447f,    0.4221f,    0.3194f,    0.3063f,    0.5520f,
+      0.4667f,    -5.7238f,   -0.5602f,   12.6339f,   -15.1865f,  -14.9035f,
+      -3.0726f,   9.5347f,    -24.6225f,  -2.7086f,   89.8557f,   95.0657f,
+      93.8693f,   99.1085f,   -35.9483f,  -18.0363f,  -1.6298f,   25.3484f,
+      39.3975f,   -15.3199f,  5.7664f,    17.2367f,   25.2788f,   -36.5648f,
+      29.1426f,   0.3857f,    -5.2117f,   0.0533f,    12.1707f,   -11.1735f,
+      0.2673f,    0.0090f,    0.1574f,    0.0904f,    0.0281f,    0.1144f,
+      0.1123f,    -0.0061f,   0.0954f,    -0.0094f,   -0.4387f,   -0.5006f,
+      -0.2560f,   -0.2326f,   -0.1769f,   0.0465f,    0.1273f,    -0.1627f,
+      0.2987f,    -0.3041f,   0.1131f,    -0.3620f,   0.0932f,    -0.0649f,
+      -0.4597f,   0.2535f,    -0.0994f,   0.1390f,    0.1279f,    0.4207f,
+      -39.1159f,  -42.6382f,  -38.4225f,  -31.2301f,  -28.2382f,  -28.1176f,
+      -9.5822f,   1.1886f,    -1.2964f,   -0.7908f,   154.9819f,  147.1914f,
+      147.0482f,  138.7535f,  -21.7014f,  -35.7117f,  -28.8802f,  -3.8968f,
+      -21.5007f,  -28.2213f,  -28.4878f,  -3.7558f,   -26.8317f,  -22.8491f,
+      50.9464f,   -37.0918f,  -42.8811f,  -39.3079f,  -32.1904f,  -26.6354f,
+      -72.5346f,  -75.5751f,  -72.6896f,  -71.3671f,  -35.3279f,  -21.6077f,
+      -5.8259f,   38.7516f,   -6.8012f,   0.0172f,    170.0685f,  157.4452f,
+      158.2334f,  145.0102f,  10.0653f,   -45.1775f,  -56.4571f,  -5.1165f,
+      -75.8980f,  -46.8672f,  -55.3642f,  -6.5631f,   -81.0258f,  10.1348f,
+      55.9786f,   -70.8124f,  -75.7040f,  -73.9831f,  -70.8786f,  -34.9723f,
+      88.6239f,   86.5330f,   80.9333f,   79.6833f,   -10.0096f,  10.6312f,
+      -4.2350f,   62.6230f,   -3.2991f,   -0.0843f,   75.8659f,   72.7886f,
+      72.5301f,   68.8265f,   -81.8276f,  70.3025f,   62.9511f,   62.5706f,
+      69.1842f,   69.3637f,   65.4820f,   65.4357f,   71.5347f,   -82.1064f,
+      24.1925f,   86.2418f,   85.4985f,   80.4091f,   79.5378f,   -9.3877f,
+      -7.6594f,   -4.9581f,   -10.6385f,  -20.2307f,  -44.2261f,  -13.7557f,
+      -4.5344f,   18.1793f,   -10.5522f,  -1.5878f,   110.3187f,  102.4945f,
+      102.3305f,  94.1324f,   -25.2665f,  9.8172f,    -4.4791f,   69.4972f,
+      -6.7571f,   5.8378f,    -11.6101f,  70.7066f,   -4.9327f,   -24.0513f,
+      41.4598f,   -7.0600f,   -7.0940f,   -10.2478f,  -18.9616f,  -46.7505f,
+      90.9365f,   86.0260f,   73.2934f,   69.3406f,   3.3863f,    3.8524f,
+      0.6536f,    63.2150f,   -10.6304f,  0.0291f,    73.0071f,   69.7660f,
+      69.0457f,   65.5611f,   -92.3379f,  74.2756f,   54.5025f,   84.3183f,
+      53.7481f,   73.5624f,   55.3827f,   82.3242f,   53.5432f,   -92.5355f,
+      25.3457f,   89.1858f,   84.4763f,   72.9840f,   69.1889f,   4.6719f,
+      -0.0129f,   0.1995f,    0.2069f,    0.0358f,    0.1209f,    -0.1185f,
+      -0.1217f,   -0.1456f,   0.0125f,    -0.1354f,   0.0510f,    -0.0572f,
+      0.1397f,    0.1453f,    -0.0086f,   0.0107f,    0.0232f,    0.1508f,
+      0.0884f,    -0.0967f,   -0.1786f,   0.1361f,    -0.1399f,   -0.2021f,
+      -0.0242f,   -0.2169f,   0.0133f,    0.0116f,    -0.1489f,   -0.0093f,
+      -0.0796f,   0.1507f,    0.0906f,    0.0228f,    -0.0166f,   -0.1875f,
+      0.0471f,    0.1184f,    -0.0007f,   -0.2732f,   -0.1386f,   -0.2057f,
+      -0.0213f,   -0.1699f,   0.0996f,    0.1562f,    0.1850f,    -0.0362f,
+      -0.2059f,   0.0258f,    -0.0135f,   -0.1276f,   0.0034f,    0.2023f,
+      0.0857f,    -0.0085f,   -0.1955f,   -0.1666f,   -0.0920f,   0.0971f,
+      -0.0292f,   -0.0512f,   -0.0753f,   -0.0739f,   -0.0873f,   -0.1200f,
+      0.0220f,    -0.1359f,   0.2013f,    -0.0445f,   0.1143f,    -0.1484f,
+      -0.1556f,   -0.0003f,   0.1711f,    -0.0724f,   -0.0531f,   0.1126f,
+      0.0476f,    -0.0057f,   0.0088f,    0.0792f,    -0.0438f,   -0.1118f,
+      -0.0244f,   0.0712f,    0.0930f,    -0.0203f,   0.1662f,    -0.0695f,
+      -12.3872f,  -18.7022f,  -13.4237f,  -1.4731f,   -18.6843f,  -14.1515f,
+      -7.5057f,   40.2090f,   -2.7774f,   -1.8433f,   123.6006f,  119.0557f,
+      118.2758f,  113.6423f,  -32.6216f,  -19.5865f,  -16.2897f,  17.2068f,
+      6.3559f,    -17.8742f,  0.7098f,    11.5970f,   -10.1104f,  -33.1830f,
+      39.5617f,   -10.5499f,  -17.8137f,  -14.7185f,  -2.6172f,   -14.6004f,
+      0.3893f,    0.4443f,    0.5305f,    0.3049f,    0.8316f,    0.8679f,
+      0.2265f,    0.2393f,    1.1970f,    -0.2891f,   -1.8666f,   -1.8266f,
+      -1.6984f,   -1.8787f,   0.8706f,    0.4208f,    0.5076f,    -0.8436f,
+      -0.1623f,   0.8008f,    0.1512f,    -1.0839f,   -0.3002f,   0.9263f,
+      -1.3031f,   0.5964f,    0.3413f,    0.5551f,    0.2618f,    0.7018f,
+      -0.1320f,   -0.1944f,   -0.0209f,   -0.0877f,   0.0721f,    -0.0840f,
+      0.0589f,    0.1019f,    0.1927f,    -0.2011f,   -0.1117f,   0.1575f,
+      0.1080f,    -0.0516f,   0.2154f,    -0.1231f,   0.0426f,    -0.0522f,
+      -0.1824f,   -0.1923f,   -0.1206f,   -0.1724f,   -0.0798f,   0.0401f,
+      -0.2170f,   0.0293f,    -0.0853f,   0.1517f,    0.2128f,    -0.1934f,
+      0.0406f,    0.0517f,    0.0822f,    -0.0150f,   0.0943f,    -0.0989f,
+      -0.1802f,   -0.1453f,   -0.1967f,   -0.1797f,   0.1545f,    -0.1217f,
+      0.1755f,    -0.1604f,   -0.0515f,   0.0509f,    0.0310f,    -0.1220f,
+      -0.1770f,   -0.0157f,   0.1989f,    -0.0069f,   0.1766f,    0.1267f,
+      -0.0517f,   -0.0396f,   0.0346f,    0.1946f,    0.1162f,    -0.1345f,
+      -106.6179f, -110.5917f, -107.5476f, -108.0601f, -61.1687f,  -22.4247f,
+      2.6632f,    109.5208f,  -66.1177f,  0.0062f,    159.9339f,  144.7755f,
+      145.5032f,  128.9872f,  18.9180f,   -75.3569f,  -105.0866f, -52.0704f,
+      -119.1299f, -74.7543f,  -109.9468f, -59.0682f,  -104.5754f, 19.2878f,
+      67.2573f,   -104.8061f, -111.8610f, -106.6751f, -107.3537f, -56.4758f,
+      -0.6967f,   -0.8495f,   -0.9586f,   -1.0461f,   1.4522f,    -0.2762f,
+      28.2828f,   2.9157f,    -2.1062f,   0.1566f,    -467.2388f, -461.0685f,
+      -459.0092f, -453.8370f, 1.5422f,    -0.8186f,   -0.4884f,   -53.0399f,
+      -2.0255f,   -1.1348f,   -1.1039f,   -50.2489f,  -1.4821f,   1.8021f,
+      -258.0319f, -1.0865f,   -0.5542f,   -1.0443f,   -1.2732f,   1.8413f,
+      0.2377f,    0.1937f,    -0.0116f,   0.0935f,    -0.0599f,   0.0118f,
+      -0.0875f,   0.0455f,    -0.1301f,   -0.1081f,   -0.2622f,   -0.1960f,
+      0.0393f,    -0.1490f,   0.1852f,    -0.0964f,   -0.0741f,   0.0419f,
+      0.1162f,    -0.0274f,   0.1200f,    -0.0333f,   -0.1337f,   0.2141f,
+      0.0664f,    0.1044f,    -0.1744f,   0.1060f,    -0.1468f,   0.0679f,
+      0.0218f,    0.0494f,    0.1064f,    0.1363f,    0.0013f,    0.1331f,
+      -0.2095f,   0.2088f,    -0.0399f,   -0.1811f,   0.0678f,    -0.1974f,
+      0.1855f,    -0.0968f,   -0.2008f,   0.0162f,    -0.0096f,   -0.1493f,
+      0.2170f,    -0.1248f,   -0.2055f,   0.1276f,    -0.0269f,   -0.1697f,
+      -0.0662f,   0.1073f,    -0.0029f,   -0.1051f,   -0.1573f,   0.2106f,
+      -0.2020f,   -0.1565f,   0.0335f,    -0.1818f,   -0.1665f,   0.2169f,
+      0.1974f,    -0.1470f,   -0.1738f,   -0.2038f,   0.0558f,    -0.0441f,
+      0.0065f,    -0.1485f,   -0.1366f,   -0.2131f,   0.1042f,    0.0349f,
+      -0.1804f,   -0.1361f,   -0.0116f,   -0.1012f,   -0.0860f,   0.0606f,
+      -0.2077f,   0.1826f,    -0.1014f,   -0.0721f,   -0.1517f,   0.1022f,
+      -0.1110f,   -0.0186f,   0.1505f,    0.1797f,    0.0911f,    0.0340f,
+      0.1702f,    -0.1404f,   -0.0566f,   -0.2744f,   -0.1943f,   -0.1871f,
+      0.0046f,    0.0306f,    -0.0436f,   0.1625f,    -0.1302f,   0.0175f,
+      0.1570f,    -0.1425f,   0.0779f,    0.1398f,    0.0929f,    0.0897f,
+      0.0458f,    -0.0936f,   0.1321f,    -0.1355f,   0.0974f,    0.0457f,
+      -73.3516f,  -75.0655f,  -72.1062f,  -72.4624f,  -34.8640f,  -14.3727f,
+      -4.4720f,   66.4982f,   -18.8358f,  0.0397f,    174.2172f,  160.4959f,
+      161.1034f,  147.3250f,  9.5507f,    -45.0180f,  -73.1609f,  -1.5230f,
+      -74.8677f,  -43.8559f,  -68.7622f,  -4.8971f,   -82.1922f,  9.6490f,
+      64.7115f,   -71.8566f,  -75.3879f,  -72.5479f,  -71.7161f,  -34.8056f,
+      0.1442f,    0.1558f,    0.1267f,    -0.1261f,   -0.0506f,   -0.0823f,
+      -0.1807f,   -0.0889f,   -0.2098f,   -0.1295f,   -0.2046f,   -0.1749f,
+      -0.1197f,   -0.1380f,   0.0799f,    -0.0889f,   -0.1209f,   0.1919f,
+      0.1947f,    -0.2086f,   -0.1042f,   -0.0468f,   0.0232f,    0.1052f,
+      -0.0535f,   0.1398f,    0.1713f,    -0.1522f,   0.1453f,    0.0286f,
+      -64.8503f,  -67.6746f,  -63.6497f,  -60.4614f,  -35.6091f,  -20.1605f,
+      -3.6082f,   84.2801f,   -37.8552f,  -2.2371f,   132.4947f,  123.5057f,
+      123.5776f,  113.9060f,  -14.8772f,  -40.7130f,  -79.1391f,  -10.7024f,
+      -65.7831f,  -43.6078f,  -79.6847f,  -13.0743f,  -69.2533f,  -16.0171f,
+      50.4868f,   -64.3678f,  -68.7061f,  -64.0823f,  -59.3413f,  -28.9405f,
+      77.1601f,   75.4899f,   69.8696f,   67.8764f,   -22.7548f,  5.9814f,
+      -3.2826f,   57.9754f,   -5.9500f,   -0.0014f,   77.2251f,   74.0737f,
+      73.7004f,   70.5072f,   -80.9661f,  69.3065f,   55.8337f,   76.8831f,
+      57.9902f,   63.4765f,   56.4748f,   70.0282f,   61.0874f,   -81.3960f,
+      26.2594f,   76.0367f,   74.9115f,   69.2361f,   66.9262f,   -20.1637f,
+      0.1886f,    -0.1108f,   0.1262f,    0.0189f,    0.1382f,    0.0859f,
+      -0.1874f,   -0.1986f,   -0.0171f,   -0.1400f,   -0.2944f,   -0.0750f,
+      -0.0395f,   -0.2092f,   -0.0878f,   0.1216f,    -0.0870f,   -0.1613f,
+      0.2495f,    0.0754f,    0.0244f,    -0.1205f,   -0.0196f,   -0.1729f,
+      0.1170f,    0.1585f,    0.1482f,    -0.1705f,   -0.1337f,   0.0199f,
+      13.0897f,   9.1111f,    6.7413f,    6.3907f,    -28.1187f,  0.4556f,
+      -5.3116f,   30.7293f,   -16.3644f,  -0.0365f,   118.9118f,  111.6125f,
+      111.3227f,  103.4680f,  -30.1883f,  8.9328f,    -4.1876f,   79.3936f,
+      -9.0522f,   12.7861f,   -1.2736f,   78.0446f,   -5.9485f,   -30.5716f,
+      27.8951f,   13.9613f,   6.7173f,    5.2345f,    8.3271f,    -27.3705f,
+      1.0488f,    1.0864f,    1.0710f,    1.7332f,    -3.0561f,   1.1622f,
+      -7.6688f,   3.0491f,    -1.3865f,   0.0769f,    222.5451f,  207.8170f,
+      208.1767f,  193.1396f,  0.4447f,    2.1654f,    1.8929f,    35.1469f,
+      1.1783f,    2.6199f,    1.1611f,    26.2989f,   3.4446f,    0.1551f,
+      65.6529f,   1.2229f,    0.9851f,    1.0241f,    1.4373f,    -3.3421f,
+      0.1388f,    0.0756f,    0.2047f,    0.1140f,    0.0945f,    0.2038f,
+      0.1038f,    -0.2068f,   -0.0626f,   -0.1937f,   0.1347f,    -0.0464f,
+      -0.0866f,   0.0250f,    0.0264f,    -0.1556f,   -0.1625f,   0.1028f,
+      -0.1255f,   -0.0854f,   0.1033f,    0.0008f,    -0.2133f,   -0.0317f,
+      0.1725f,    -0.1054f,   -0.1900f,   0.0383f,    0.0440f,    -0.1900f,
+      -30.0811f,  -30.9929f,  -29.3194f,  -26.8347f,  -20.5957f,  -4.1595f,
+      -1.9066f,   42.4707f,   -9.0435f,   0.0064f,    175.7328f,  163.1350f,
+      163.5085f,  151.1648f,  4.4620f,    -20.6011f,  -19.3402f,  1.5468f,
+      -32.0920f,  -25.4581f,  -12.3706f,  -2.1636f,   -32.4569f,  3.9365f,
+      61.0117f,   -28.4195f,  -31.0837f,  -30.2749f,  -27.5522f,  -22.8688f,
+      -0.3000f,   0.0092f,    -0.3675f,   -0.4113f,   0.0033f,    0.1138f,
+      0.2182f,    -0.5803f,   0.7507f,    -0.2529f,   -1.7724f,   -1.4702f,
+      -1.5805f,   -1.4294f,   0.1435f,    -0.0168f,   0.2356f,    -0.4373f,
+      -0.4500f,   -0.4803f,   -0.0041f,   -0.3878f,   0.1321f,    0.2761f,
+      -1.1975f,   -0.3509f,   -0.0465f,   -0.4050f,   -0.1110f,   0.2233f,
+      0.0950f,    0.0974f,    -0.1600f,   -0.1753f,   -0.0328f,   0.0741f,
+      -0.0706f,   0.1839f,    -0.0833f,   -0.1367f,   -0.1094f,   -0.1739f,
+      -0.1069f,   0.0370f,    -0.1404f,   0.1631f,    -0.1570f,   0.2117f,
+      -0.1891f,   0.0395f,    0.1081f,    0.1760f,    0.0997f,    0.0853f,
+      -0.1018f,   0.1306f,    -0.0924f,   -0.2078f,   0.0801f,    -0.0949f,
+      0.5803f,    0.5578f,    0.4089f,    0.1912f,    0.6774f,    0.3145f,
+      0.3992f,    -0.1316f,   1.3142f,    -0.2457f,   -2.3536f,   -2.4939f,
+      -2.3165f,   -2.4879f,   0.2321f,    0.1901f,    0.1789f,    -1.5215f,
+      0.2645f,    0.2231f,    0.2411f,    -1.2361f,   0.2971f,    0.1421f,
+      -1.6715f,   0.3158f,    0.2476f,    0.3596f,    0.3029f,    0.9297f,
+      -88.8401f,  -89.5209f,  -86.1926f,  -87.4196f,  -39.6504f,  -17.9684f,
+      -4.2702f,   80.2017f,   -29.1676f,  -0.4190f,   150.2820f,  138.4751f,
+      139.1087f,  126.6569f,  13.7188f,   -57.0739f,  -80.3383f,  -18.8351f,
+      -87.4103f,  -56.0072f,  -82.7707f,  -23.1871f,  -93.6787f,  13.9287f,
+      59.6213f,   -87.4843f,  -90.4227f,  -86.2635f,  -86.6841f,  -37.9086f,
+      0.1184f,    -0.2169f,   -0.1915f,   0.0543f,    0.1253f,    -0.1370f,
+      0.0836f,    -0.1198f,   0.1544f,    -0.2004f,   -0.1118f,   -0.0786f,
+      0.1517f,    -0.1000f,   -0.1055f,   0.0936f,    -0.1579f,   0.1098f,
+      -0.0234f,   -0.0499f,   0.0951f,    -0.1711f,   0.0186f,    -0.2008f,
+      0.1777f,    0.1386f,    -0.1495f,   -0.0684f,   -0.2149f,   -0.1198f,
+      -0.6205f,   -0.7209f,   -0.5487f,   -0.9080f,   1.3400f,    0.0085f,
+      28.2837f,   3.2217f,    -1.8463f,   0.1620f,    -464.3599f, -458.4327f,
+      -455.9967f, -451.0393f, 1.6619f,    -0.6944f,   -0.3167f,   -52.3630f,
+      -1.6971f,   -0.7340f,   -0.8923f,   -49.2771f,  -1.1177f,   1.8810f,
+      -258.9386f, -1.0765f,   -0.7279f,   -0.5208f,   -0.8839f,   1.8175f,
+      -78.8510f,  -80.5740f,  -77.8843f,  -77.9798f,  -36.5560f,  -16.0818f,
+      -5.5362f,   66.4228f,   -16.8150f,  0.0036f,    181.8365f,  167.7181f,
+      168.2344f,  153.9725f,  11.2659f,   -47.5786f,  -92.6978f,  6.7573f,
+      -68.7704f,  -48.3850f,  -95.3637f,  8.8888f,    -76.9497f,  11.2243f,
+      60.9020f,   -77.6515f,  -80.7610f,  -78.4537f,  -77.4659f,  -36.2872f,
+      -0.0936f,   0.1966f,    -0.2121f,   0.0193f,    0.0489f,    -0.1445f,
+      0.0060f,    0.0358f,    -0.0783f,   -0.0985f,   -0.2072f,   -0.0802f,
+      -0.0185f,   0.1868f,    -0.0631f,   0.1260f,    -0.0675f,   0.2167f,
+      -0.2174f,   -0.1085f,   0.1483f,    -0.1655f,   -0.1040f,   0.1605f,
+      -0.1673f,   -0.0148f,   -0.1856f,   -0.1454f,   0.1603f,    -0.1620f,
+      -0.9205f,   -1.2716f,   -3.6561f,   -5.0834f,   -0.7934f,   1.8710f,
+      2.2999f,    -2.9516f,   -1.7631f,   -0.3804f,   41.2998f,   26.2358f,
+      28.9763f,   15.7315f,   5.2164f,    3.2963f,    -5.4457f,   18.6310f,
+      -25.0076f,  5.4368f,    -12.0085f,  17.1462f,   -14.6992f,  5.6365f,
+      48.6207f,   -1.0921f,   -1.8723f,   -3.5354f,   -5.1774f,   -1.0200f,
+      -0.1065f,   -0.2021f,   0.0332f,    0.1692f,    -0.1239f,   0.1325f,
+      -0.0660f,   -0.0567f,   0.2107f,    -0.2084f,   -0.0263f,   0.1411f,
+      0.0178f,    0.0451f,    0.2024f,    -0.1756f,   -0.0771f,   -0.1690f,
+      -0.2097f,   -0.2130f,   0.0714f,    0.0172f,    -0.0310f,   0.0649f,
+      -0.1550f,   0.0701f,    0.0306f,    -0.1750f,   -0.1988f,   -0.2060f,
+      0.0005f,    -0.1325f,   -0.1823f,   -0.0900f,   -0.1291f,   -0.1817f,
+      0.0144f,    0.0951f,    -0.1954f,   -0.0171f,   -0.1985f,   0.0875f,
+      0.0901f,    -0.0857f,   0.1681f,    0.0465f,    0.1023f,    0.0985f,
+      -0.2152f,   -0.1723f,   -0.0825f,   0.0203f,    -0.1206f,   -0.1431f,
+      -0.1552f,   0.1344f,    0.0398f,    0.0169f,    0.2180f,    -0.1530f,
+      2.7964f,    2.7312f,    2.8831f,    3.4729f,    -3.1366f,   2.4043f,
+      -7.2004f,   1.4128f,    2.8648f,    0.0578f,    225.5640f,  210.3712f,
+      210.6907f,  195.0339f,  0.3140f,    1.8060f,    2.7355f,    33.6917f,
+      3.3542f,    3.3682f,    1.7371f,    31.2424f,   3.4094f,    -0.1192f,
+      63.0864f,   3.0562f,    2.8633f,    2.6777f,    3.5495f,    -4.2616f,
+      -1.4034f,   0.3930f,    -4.6756f,   -9.9870f,   -27.8511f,  5.6071f,
+      -1.0862f,   34.4907f,   -10.4831f,  -0.0281f,   117.2617f,  104.9590f,
+      106.1515f,  93.9707f,   -16.8801f,  5.3036f,    -21.7458f,  98.5306f,
+      -20.7596f,  6.4733f,    -17.6440f,  98.3097f,   -31.9540f,  -17.0600f,
+      27.4543f,   -0.6140f,   -1.6182f,   -4.9167f,   -8.9017f,   -26.2485f,
+      -0.1952f,   -0.0462f,   -0.1958f,   0.1679f,    -0.1592f,   -0.1634f,
+      -0.0507f,   -0.0542f,   0.0038f,    -0.0343f,   0.0567f,    -0.1983f,
+      0.0250f,    -0.0762f,   0.0902f,    -0.0343f,   0.1240f,    0.1161f,
+      0.1237f,    0.1870f,    0.0346f,    0.0340f,    0.0625f,    -0.0355f,
+      0.0278f,    -0.1043f,   0.1755f,    0.0253f,    0.1750f,    -0.2070f,
+      -5.5531f,   -5.3122f,   -4.9348f,   -4.4782f,   -7.5686f,   -1.5478f,
+      -5.4341f,   0.5087f,    -2.1382f,   0.0798f,    208.3677f,  194.0083f,
+      194.4168f,  179.3082f,  1.4443f,    -1.5038f,   -1.4021f,   25.9363f,
+      -4.0635f,   -2.6785f,   -1.6640f,   22.2589f,   -1.4910f,   1.4715f,
+      59.1972f,   -4.9638f,   -5.1920f,   -4.9193f,   -5.2649f,   -8.0556f,
+      20.1226f,   12.0195f,   9.7385f,    10.7058f,   -27.4201f,  8.4869f,
+      -5.0826f,   32.9212f,   -2.0674f,   -0.0290f,   120.5002f,  112.3222f,
+      112.3287f,  104.1107f,  -20.6293f,  14.8534f,   -0.8748f,   103.1141f,
+      -1.1368f,   15.3716f,   2.7653f,    91.7285f,   -0.5991f,   -20.7338f,
+      35.9363f,   20.5104f,   11.1988f,   9.0368f,    10.6355f,   -26.5309f,
+      -0.2058f,   -0.2176f,   0.1331f,    -0.1415f,   -0.0825f,   -0.0470f,
+      -0.0615f,   0.1274f,    0.0076f,    -0.0575f,   -0.2065f,   0.0866f,
+      0.2166f,    -0.1942f,   -0.1952f,   0.1323f,    -0.1016f,   0.1803f,
+      -0.0424f,   0.1555f,    0.1118f,    0.1559f,    0.0337f,    -0.0341f,
+      -0.0430f,   0.1988f,    -0.0553f,   -0.0255f,   0.1817f,    0.0608f,
+      0.1431f,    0.0686f,    -0.0245f,   -0.2107f,   0.2001f,    -0.0964f,
+      -0.0090f,   0.1151f,    -0.0365f,   -0.1986f,   0.1740f,    -0.2098f,
+      0.0013f,    0.1369f,    0.1910f,    0.1801f,    -0.2019f,   0.0348f,
+      -0.1175f,   0.0627f,    -0.1929f,   -0.0099f,   0.1349f,    0.1804f,
+      -0.1071f,   -0.1651f,   -0.1146f,   -0.0259f,   0.1626f,    -0.0271f,
+      0.1393f,    0.1304f,    -0.0200f,   0.0924f,    -0.0839f,   -0.0031f,
+      -0.1311f,   0.0350f,    -0.1330f,   -0.0911f,   0.1949f,    -0.0209f,
+      -0.1883f,   0.0269f,    0.2040f,    0.1552f,    0.1532f,    0.1157f,
+      -0.1102f,   -0.1220f,   -0.0808f,   -0.1050f,   0.1716f,    0.0846f,
+      -0.0180f,   -0.1037f,   0.2063f,    0.1237f,    0.1253f,    -0.0496f,
+      -0.0183f,   0.0491f,    0.1703f,    -0.0824f,   -0.0702f,   -0.1100f,
+      -0.0965f,   0.0130f,    -0.1222f,   -0.1081f,   0.0329f,    0.2115f,
+      -0.1438f,   0.0799f,    -0.1602f,   -0.0330f,   0.0501f,    0.1072f,
+      -0.0744f,   -0.1783f,   -0.0240f,   0.0777f,    -0.1944f,   0.0438f,
+      -0.0033f,   -0.1873f,   0.0984f,    -0.0318f,   0.0773f,    0.1489f,
+      0.3966f,    0.4711f,    0.3972f,    0.0623f,    0.5970f,    0.1018f,
+      0.1375f,    -0.1881f,   0.8921f,    -0.1854f,   -2.1138f,   -2.1178f,
+      -1.8295f,   -2.1703f,   0.5784f,    -0.1937f,   -0.0728f,   -0.9953f,
+      0.2442f,    -0.4074f,   -0.1591f,   -1.1660f,   0.4832f,    0.2203f,
+      -1.4957f,   0.1544f,    0.1810f,    0.2275f,    0.4075f,    0.8153f,
+      0.0715f,    0.0222f,    0.0463f,    -0.0201f,   0.0396f,    0.5951f,
+      -0.2779f,   -0.0306f,   0.7532f,    -0.1596f,   -4.1080f,   -3.7925f,
+      -3.8522f,   -3.2468f,   0.7728f,    0.0188f,    -0.1448f,   0.4084f,
+      -0.4666f,   -0.1036f,   -1.1469f,   0.4243f,    0.2778f,    0.9023f,
+      -3.0216f,   0.0384f,    -0.3348f,   -0.0314f,   -0.2788f,   0.0479f,
+      139.0773f,  131.6164f,  115.0392f,  111.1817f,  41.7596f,   9.5379f,
+      1.8542f,    46.9890f,   -12.8221f,  0.0241f,    52.9779f,   51.5268f,
+      50.8060f,   48.7028f,   -132.9665f, 118.3478f,  101.1239f,  81.4608f,
+      75.4251f,   121.0643f,  97.8947f,   86.8911f,   74.5576f,   -133.7606f,
+      29.2657f,   135.8916f,  131.3661f,  114.1687f,  111.0784f,  31.3790f,
+      -0.0807f,   -0.0657f,   -0.0027f,   0.0410f,    0.0765f,    0.1194f,
+      0.0953f,    -0.0060f,   0.1531f,    -0.2339f,   0.1488f,    -0.0615f,
+      -0.0579f,   0.0761f,    0.1250f,    -0.0469f,   0.1480f,    0.0683f,
+      -0.0049f,   0.1558f,    0.2168f,    -0.0736f,   0.1135f,    -0.1244f,
+      0.0725f,    -0.1297f,   -0.0215f,   -0.0412f,   -0.1632f,   -0.0200f,
+      -0.1346f,   -0.1954f,   0.0053f,    0.0151f,    0.1379f,    -0.1497f,
+      -0.0102f,   -0.0336f,   0.0900f,    -0.1706f,   -0.0932f,   -0.2084f,
+      0.1242f,    -0.2027f,   0.0849f,    -0.2139f,   -0.2015f,   0.0944f,
+      -0.0984f,   0.2082f,    0.1625f,    -0.0227f,   -0.1676f,   0.1021f,
+      0.1516f,    0.0245f,    0.0955f,    -0.1488f,   -0.0057f,   0.1783f,
+      -0.8568f,   -0.8175f,   -0.6282f,   -1.3107f,   1.5712f,    0.1044f,
+      28.2289f,   3.0885f,    -1.9829f,   0.1600f,    -465.9583f, -459.5893f,
+      -457.5055f, -452.7600f, 1.7229f,    -0.6620f,   -0.1065f,   -52.8017f,
+      -2.0293f,   -0.8224f,   -1.0389f,   -49.9049f,  -1.2250f,   1.7647f,
+      -259.2465f, -1.0978f,   -0.5169f,   -0.8721f,   -0.8197f,   1.9158f,
+      16.2234f,   15.8523f,   13.8343f,   9.8509f,    -21.4326f,  15.7650f,
+      -6.4451f,   34.8575f,   1.1387f,    -0.0223f,   117.7213f,  109.8494f,
+      109.7624f,  101.8532f,  -20.3275f,  16.0812f,   4.9165f,    92.4919f,
+      4.1615f,    13.8451f,   9.2112f,    97.1580f,   -8.7037f,   -20.4420f,
+      27.1105f,   17.4922f,   13.9998f,   12.3888f,   11.4705f,   -20.9568f,
+      0.5457f,    0.5322f,    0.2823f,    0.3581f,    0.5359f,    0.1576f,
+      0.1969f,    -0.0136f,   -0.2748f,   -0.3168f,   -0.3918f,   -0.2167f,
+      -0.1797f,   -0.1869f,   0.2986f,    -0.2116f,   -0.4226f,   -0.2022f,
+      0.9452f,    0.5474f,    -0.1218f,   0.2067f,    -0.1600f,   0.1937f,
+      0.0808f,    0.4877f,    0.5106f,    0.2626f,    0.5076f,    0.6228f,
+      0.5124f,    0.4044f,    0.4023f,    0.1222f,    2.5446f,    0.9623f,
+      24.9875f,   4.7442f,    -2.0551f,   0.1642f,    -449.9478f, -444.1841f,
+      -442.0153f, -437.1498f, 2.3209f,    -0.6986f,   -0.3456f,   -47.4074f,
+      -1.2374f,   -1.0939f,   -0.9112f,   -41.1851f,  -0.5064f,   2.4209f,
+      -263.4446f, -0.0433f,   0.3460f,    0.1475f,    0.3770f,    2.9154f,
+      0.2032f,    0.1527f,    0.2161f,    -0.1981f,   0.1893f,    -0.2003f,
+      0.1734f,    0.1713f,    0.1207f,    -0.2073f,   -0.1018f,   0.0770f,
+      0.0728f,    0.1665f,    0.0689f,    0.1884f,    -0.1399f,   -0.1326f,
+      -0.0518f,   -0.1948f,   0.1576f,    -0.1835f,   0.1436f,    0.0497f,
+      0.0883f,    -0.1253f,   -0.0417f,   -0.0507f,   -0.1555f,   0.2076f,
+      -2.4080f,   6.1616f,    -0.8564f,   -13.6773f,  -32.7238f,  -16.3144f,
+      -1.9828f,   20.5110f,   -17.0191f,  -1.7154f,   103.6642f,  95.3675f,
+      95.5662f,   86.9504f,   -35.5340f,  19.6681f,   -2.4900f,   65.0847f,
+      -15.8119f,  13.7256f,   -4.6753f,   63.4713f,   -6.5992f,   -34.2369f,
+      41.3959f,   -1.5528f,   3.8106f,    -0.7762f,   -12.3204f,  -35.1734f,
+      -83.9509f,  -87.4861f,  -83.5925f,  -81.5047f,  -54.1256f,  -45.7506f,
+      -13.5325f,  -6.0331f,   -8.5062f,   0.0261f,    189.9450f,  177.7870f,
+      178.6945f,  164.9762f,  9.8521f,    -68.0619f,  -68.6145f,  6.5056f,
+      -55.9651f,  -66.9540f,  -65.3349f,  -2.1954f,   -57.2408f,  8.6577f,
+      60.6966f,   -82.1056f,  -88.5245f,  -83.3057f,  -80.7283f,  -50.5285f,
+      -0.1397f,   0.1862f,    -0.0691f,   -0.0906f,   0.1560f,    0.1377f,
+      -0.0066f,   -0.0213f,   0.0708f,    -0.0386f,   -0.0015f,   -0.0020f,
+      -0.2122f,   0.0747f,    0.0795f,    0.0229f,    0.1923f,    -0.1661f,
+      0.0895f,    0.1176f,    0.1398f,    -0.0443f,   0.0934f,    0.0638f,
+      -0.1924f,   0.0602f,    0.0404f,    0.1597f,    0.1387f,    -0.0601f,
+      -28.3967f,  -21.8483f,  -25.5175f,  -29.9252f,  2.0161f,    -3.0092f,
+      7.7435f,    28.2367f,   -35.0188f,  -0.1578f,   105.0164f,  93.4495f,
+      94.9134f,   81.0315f,   4.3602f,    8.1303f,    -37.7665f,  -16.6986f,
+      -40.8902f,  8.2542f,    -33.3215f,  -2.0457f,   -69.0245f,  4.1016f,
+      47.2770f,   -25.8268f,  -23.6034f,  -26.4339f,  -27.8305f,  8.4468f,
+      13.8742f,   8.3874f,    4.2044f,    1.4619f,    -40.2909f,  -0.6358f,
+      -0.7982f,   36.1931f,   -17.3147f,  -0.3348f,   106.8135f,  96.5298f,
+      97.8829f,   86.9994f,   -25.8170f,  15.0652f,   -0.9181f,   85.8544f,
+      2.5475f,    9.8009f,    -3.5931f,   89.2017f,   -3.7252f,   -25.2986f,
+      22.5505f,   14.0434f,   7.0708f,    4.6646f,    1.5807f,    -39.4024f,
+      -0.1436f,   0.0256f,    0.0274f,    -0.2126f,   0.0401f,    0.0745f,
+      -0.0379f,   -0.0357f,   0.0777f,    -0.0709f,   -0.1093f,   -0.2047f,
+      -0.0713f,   -0.0478f,   -0.0908f,   0.1963f,    0.1282f,    0.0977f,
+      0.1304f,    0.2058f,    0.0700f,    0.0518f,    0.0239f,    0.0686f,
+      -0.1909f,   0.0828f,    -0.1243f,   -0.1920f,   0.1908f,    -0.0808f,
+      90.8028f,   89.2894f,   84.5339f,   83.3491f,   -13.3838f,  12.0240f,
+      -3.9443f,   63.0867f,   -2.5321f,   -0.0099f,   68.9140f,   66.3206f,
+      66.0278f,   63.1498f,   -83.7261f,  74.3448f,   73.4998f,   64.8477f,
+      69.7701f,   74.5878f,   71.0331f,   63.2116f,   74.3162f,   -83.9282f,
+      20.8163f,   89.6818f,   88.6452f,   83.7338f,   82.9360f,   -13.2357f,
+      0.1299f,    -0.1765f,   -0.0168f,   -0.1372f,   -0.1183f,   0.0472f,
+      0.1312f,    0.0267f,    0.0194f,    -0.1593f,   0.0059f,    0.1775f,
+      0.0668f,    -0.1239f,   -0.1982f,   -0.1415f,   -0.1659f,   -0.1148f,
+      0.0136f,    0.0913f,    -0.1254f,   -0.0357f,   0.0892f,    0.0835f,
+      -0.0554f,   0.1969f,    -0.0888f,   -0.0623f,   -0.0236f,   -0.1492f,
+      0.4196f,    0.3218f,    0.2287f,    0.5095f,    0.7210f,    0.2279f,
+      0.4523f,    -0.1832f,   1.3095f,    -0.2041f,   -2.1443f,   -2.1947f,
+      -1.9292f,   -2.1142f,   0.5840f,    0.1018f,    0.1011f,    -1.6565f,
+      0.4325f,    0.0424f,    0.2836f,    -1.7183f,   0.2595f,    0.2686f,
+      -1.8784f,   0.3891f,    0.3050f,    0.6195f,    0.2896f,    0.5905f,
+      -5.3024f,   -3.2518f,   -12.5192f,  -29.1732f,  1.6538f,    -1.8315f,
+      9.9788f,    10.5155f,   6.3234f,    -0.3460f,   76.9925f,   51.3785f,
+      55.7120f,   29.0432f,   5.5901f,    25.6578f,   -3.9565f,   13.0509f,
+      -106.0371f, 23.2124f,   -18.2004f,  8.4618f,    -69.3585f,  5.5651f,
+      80.0565f,   -6.4941f,   -5.3742f,   -14.4209f,  -24.1565f,  6.6801f,
+      -22.0585f,  -20.9909f,  -26.7939f,  -29.6890f,  -14.5085f,  2.1866f,
+      -4.2608f,   17.3977f,   -30.8824f,  -0.4017f,   135.6957f,  126.9320f,
+      127.0044f,  118.1835f,  -1.8768f,   -0.8629f,   -32.0882f,  44.7862f,
+      -23.9174f,  1.6485f,    -27.9940f,  51.9078f,   -48.5279f,  -1.7550f,
+      49.9230f,   -19.9785f,  -22.4647f,  -27.6911f,  -27.3197f,  -10.6545f,
+      -0.1922f,   -0.1999f,   -0.1396f,   0.1065f,    0.0085f,    -0.1940f,
+      0.0351f,    0.1285f,    -0.0292f,   -0.1296f,   0.1543f,    -0.2082f,
+      -0.1758f,   0.0719f,    0.0764f,    0.1394f,    -0.0255f,   -0.0370f,
+      0.1615f,    -0.0568f,   0.1920f,    -0.1631f,   0.0199f,    0.1884f,
+      0.0693f,    0.1074f,    -0.0273f,   0.1540f,    0.0098f,    0.2111f,
+      0.1805f,    -0.0555f,   0.1159f,    0.0469f,    0.1789f,    -0.1711f,
+      -0.1304f,   0.1912f,    -0.0737f,   -0.1408f,   0.1804f,    -0.2023f,
+      -0.0467f,   -0.1019f,   -0.0136f,   0.0691f,    0.1454f,    -0.0213f,
+      0.0929f,    -0.0958f,   0.1299f,    0.1137f,    0.1175f,    0.1042f,
+      -0.2081f,   -0.0737f,   0.0582f,    0.1640f,    0.2120f,    -0.0646f,
+      -0.0326f,   0.1976f,    0.1182f,    -0.1365f,   -0.1784f,   0.2113f,
+      0.0469f,    0.0763f,    -0.0197f,   -0.1902f,   0.1259f,    0.1598f,
+      -0.0180f,   -0.1339f,   -0.1675f,   -0.1884f,   -0.1973f,   0.1529f,
+      0.1160f,    0.2154f,    -0.1446f,   -0.1395f,   0.0355f,    0.1513f,
+      -0.2086f,   -0.1135f,   -0.1502f,   -0.0018f,   0.0486f,    -0.0110f,
+      -0.0843f,   -0.0716f,   -0.1367f,   0.0753f,    0.0114f,    0.0475f,
+      -0.0632f,   0.2045f,    -0.0512f,   -0.0906f,   -0.1071f,   -0.1957f,
+      0.1361f,    0.1821f,    -0.1684f,   -0.1383f,   0.1059f,    0.1579f,
+      -0.0064f,   -0.1205f,   -0.0718f,   -0.1323f,   -0.0174f,   -0.1092f,
+      -0.1915f,   0.1978f,    -0.1245f,   0.1297f,    -0.1542f,   0.1556f,
+      -0.1752f,   0.0718f,    -0.1020f,   -0.1970f,   0.0518f,    -0.0888f,
+      0.0541f,    -0.1922f,   -0.1467f,   -0.0653f,   -0.1940f,   -0.0800f,
+      -0.1096f,   -0.0796f,   -0.1310f,   0.0191f,    -0.1077f,   -0.0973f,
+      0.1566f,    0.0074f,    0.0500f,    -0.0415f,   -0.2116f,   0.0227f,
+      0.0895f,    0.1528f,    0.1404f,    0.0467f,    0.0462f,    -0.0973f,
+      -0.1669f,   0.0551f,    0.1167f,    -0.1470f,   -0.0542f,   -0.1006f,
+      0.2104f,    0.1039f,    -0.0211f,   -0.1726f,   -0.0694f,   -0.0270f,
+      0.0277f,    -0.0715f,   -0.2055f,   -0.1502f,   -0.1718f,   -0.0043f,
+      0.0174f,    0.1019f,    -0.0233f,   -0.1518f,   -0.1331f,   -0.0001f,
+      -0.1483f,   -0.2115f,   0.0666f,    0.0014f,    0.1601f,    -0.0690f,
+    };
+
+static const float av1_rdcost_model_nn_biases_layer0[NUM_HIDDEN_NODES] = {
+  0.156824f,   0.f,         0.130013f,   0.084482f,  -129.058197f, -15.090252f,
+  -3.859116f,  0.736356f,   -81.361557f, -0.001922f, -0.000713f,   0.440181f,
+  14.982646f,  1.282223f,   2.23122f,    94.26635f,  93.920929f,   0.614672f,
+  0.f,         0.315858f,   4.746014f,   0.116901f,  -35.661354f,  -75.148285f,
+  92.006989f,  -14.112332f, 86.673157f,  -0.000307f, -0.000544f,   0.f,
+  -7.851313f,  0.505186f,   0.f,         0.f,        -111.681091f, -0.937782f,
+  0.035789f,   0.f,         0.f,         -0.00102f,  -75.180527f,  0.f,
+  -63.821148f, 79.592392f,  0.085068f,   11.184906f, 1.25406f,     0.f,
+  -29.779242f, -0.181732f,  0.f,         0.425554f,  -90.78405f,   0.f,
+  -0.828326f,  -81.132179f, 0.f,         -2.757063f, 0.f,          0.f,
+  2.967951f,   -4.440599f,  0.f,         -5.105355f, 14.734543f,   0.f,
+  0.f,         0.f,         0.f,         0.295342f,  -0.026907f,   133.375412f,
+  -0.000855f,  0.f,         -0.875029f,  15.665165f, 0.437296f,    0.321257f,
+  -0.001932f,  -4.235782f,  -87.187782f, 0.f,        -28.84696f,   7.055514f,
+  0.f,         95.548302f,  -0.000425f,  0.38969f,   -13.88008f,   -27.347931f,
+  0.f,         0.f,         0.f,         -0.000026f, 0.f,          0.f,
+};
+
+static const float
+    av1_rdcost_model_nn_weights_layer1[NUM_HIDDEN_NODES * NUM_OUTPUTS] = {
+      -0.101706f,   -0.14411f,    -0.139118f,   -0.132945f,   118.811302f,
+      3.137232f,    -32.969776f,  -4.150725f,   26.263071f,   0.092841f,
+      0.174125f,    -0.028195f,   15.712872f,   17.722702f,   5.666006f,
+      -121.143929f, -131.933731f, -3.000318f,   -0.032063f,   -0.380065f,
+      -1.660653f,   -0.164802f,   7.177527f,    87.759155f,   -119.564224f,
+      -98.051651f,  -110.581116f, -0.069982f,   0.023906f,    0.183792f,
+      40.606274f,   -0.080804f,   -0.053744f,   -0.187848f,   157.44313f,
+      -4.820149f,   0.089499f,    0.070232f,    -0.043038f,   0.072996f,
+      93.347313f,   0.225259f,    103.223228f,  -110.682541f, 0.14314f,
+      -89.827538f,  6.505952f,    -0.076949f,   73.816132f,   -0.063416f,
+      -0.23736f,    -0.066059f,   116.049599f,  0.120871f,    -4.708246f,
+      107.501671f,  -0.206708f,   -32.688675f,  0.047608f,    -0.105907f,
+      6.505825f,    -75.461891f,  -0.160341f,   6.532121f,    -84.868111f,
+      -0.065622f,   0.044756f,    0.008672f,    0.017155f,    0.046108f,
+      -0.218818f,   -126.507957f, 0.028271f,    0.180625f,    -4.707376f,
+      -121.524307f, -0.03853f,    -4.103166f,   -0.018947f,   -95.768463f,
+      15.941695f,   0.147154f,    -102.863029f, -72.521698f,  -0.037133f,
+      -138.1492f,   0.210016f,    -0.084692f,   -68.693665f,  -52.523472f,
+      -0.133385f,   -0.17438f,    0.008654f,    -0.035642f,   -0.145202f,
+      0.211135f,
+    };
+
+static const float av1_rdcost_model_nn_biases_layer1[NUM_OUTPUTS] = {
+  0.251909f
+};
+
+static const NN_CONFIG av1_rdcost_model_nnconfig = {
+  NUM_FEATURES,
+  NUM_OUTPUTS,
+  NUM_HIDDEN_LAYERS,
+  {
+      NUM_HIDDEN_NODES,
+  },
+  {
+      av1_rdcost_model_nn_weights_layer0,
+      av1_rdcost_model_nn_weights_layer1,
+  },
+  {
+      av1_rdcost_model_nn_biases_layer0,
+      av1_rdcost_model_nn_biases_layer1,
+  },
+};
+
+//------------------------------------------------------------------------------
+
+#undef NUM_FEATURES
+#undef NUM_HIDDEN_LAYERS
+#undef NUM_HIDDEN_NODES
+#undef NUM_OUTPUTS
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
index ac9392fa1..3aae0144e 100644
--- a/third_party/aom/av1/encoder/ratectrl.c
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -421,9 +421,9 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width,
     projected_size_based_on_q =
         av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
   } else {
-    projected_size_based_on_q =
-        av1_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, MBs,
-                               rate_correction_factor, cm->bit_depth);
+    projected_size_based_on_q = av1_estimate_bits_at_q(
+        cpi->common.frame_type, cm->base_qindex, MBs, rate_correction_factor,
+        cm->seq_params.bit_depth);
   }
   // Work out a size correction factor.
   if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
@@ -495,7 +495,7 @@ int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
           (int)av1_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
     } else {
       bits_per_mb_at_this_q = (int)av1_rc_bits_per_mb(
-          cm->frame_type, i, correction_factor, cm->bit_depth);
+          cm->frame_type, i, correction_factor, cm->seq_params.bit_depth);
     }
 
     if (bits_per_mb_at_this_q <= target_bits_per_mb) {
@@ -643,7 +643,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
   int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
   int q;
   int *rtc_minq;
-  ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq);
+  const int bit_depth = cm->seq_params.bit_depth;
+  ASSIGN_MINQ_TABLE(bit_depth, rtc_minq);
 
   if (frame_is_intra_only(cm)) {
     active_best_quality = rc->best_quality;
@@ -652,17 +653,17 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
     // based on the ambient Q to reduce the risk of popping.
     if (rc->this_key_frame_forced) {
       int qindex = rc->last_boosted_qindex;
-      double last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
-      int delta_qindex = av1_compute_qdelta(
-          rc, last_boosted_q, (last_boosted_q * 0.75), cm->bit_depth);
+      double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+      int delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+                                            (last_boosted_q * 0.75), bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
     } else if (cm->current_video_frame > 0) {
       // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
       double q_val;
 
-      active_best_quality = get_kf_active_quality(
-          rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
+      active_best_quality =
+          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((width * height) <= (352 * 288)) {
@@ -671,9 +672,9 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
 
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
-      q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+      q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
       active_best_quality +=
-          av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+          av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
@@ -686,7 +687,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
     } else {
       q = active_worst_quality;
     }
-    active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+    active_best_quality = get_gf_active_quality(rc, q, bit_depth);
   } else {
     // Use the lower of active_worst_quality and recent/average Q.
     if (cm->current_video_frame > 1) {
@@ -716,8 +717,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
       !(cm->current_video_frame == 0)) {
     int qdelta = 0;
     aom_clear_system_state();
-    qdelta = av1_compute_qdelta_by_rate(
-        &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);
+    qdelta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                        active_worst_quality, 2.0, bit_depth);
     *top_index = active_worst_quality + qdelta;
     *top_index = AOMMAX(*top_index, *bottom_index);
   }
@@ -768,27 +769,27 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
   int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
   int q;
   int *inter_minq;
-  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+  const int bit_depth = cm->seq_params.bit_depth;
+  ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
 
   if (frame_is_intra_only(cm)) {
     if (oxcf->rc_mode == AOM_Q) {
       const int qindex = cq_level;
-      const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+      const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
       const int delta_qindex =
-          av1_compute_qdelta(rc, q_val, q_val * 0.25, cm->bit_depth);
+          av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
     } else if (rc->this_key_frame_forced) {
       const int qindex = rc->last_boosted_qindex;
-      const double last_boosted_q =
-          av1_convert_qindex_to_q(qindex, cm->bit_depth);
+      const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
       const int delta_qindex = av1_compute_qdelta(
-          rc, last_boosted_q, last_boosted_q * 0.75, cm->bit_depth);
+          rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
     } else {  // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
 
-      active_best_quality = get_kf_active_quality(
-          rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
+      active_best_quality =
+          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((width * height) <= (352 * 288)) {
@@ -798,9 +799,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
       // Convert the adjustment factor to a qindex delta on active_best_quality.
       {
         const double q_val =
-            av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+            av1_convert_qindex_to_q(active_best_quality, bit_depth);
         active_best_quality +=
-            av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+            av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
       }
     }
   } else if (!rc->is_src_frame_alt_ref &&
@@ -815,30 +816,30 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
     // For constrained quality dont allow Q less than the cq level
     if (oxcf->rc_mode == AOM_CQ) {
       if (q < cq_level) q = cq_level;
-      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
     } else if (oxcf->rc_mode == AOM_Q) {
       const int qindex = cq_level;
-      const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+      const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
       const int delta_qindex =
           (cpi->refresh_alt_ref_frame)
-              ? av1_compute_qdelta(rc, q_val, q_val * 0.40, cm->bit_depth)
-              : av1_compute_qdelta(rc, q_val, q_val * 0.50, cm->bit_depth);
+              ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth)
+              : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
     } else {
-      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
     }
   } else {
     if (oxcf->rc_mode == AOM_Q) {
       const int qindex = cq_level;
-      const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+      const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
       const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
                                                      0.70, 1.0, 0.85, 1.0 };
       const int delta_qindex = av1_compute_qdelta(
           rc, q_val,
           q_val * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
-          cm->bit_depth);
+          bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
     } else {
       // Use the lower of active_worst_quality and recent/average Q.
@@ -868,12 +869,12 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
     aom_clear_system_state();
     if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
         !(cm->current_video_frame == 0)) {
-      qdelta = av1_compute_qdelta_by_rate(
-          &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);
+      qdelta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                          active_worst_quality, 2.0, bit_depth);
     } else if (!rc->is_src_frame_alt_ref &&
                (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
       qdelta = av1_compute_qdelta_by_rate(
-          &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth);
+          &cpi->rc, cm->frame_type, active_worst_quality, 1.75, bit_depth);
     }
     *top_index = active_worst_quality + qdelta;
     *top_index = AOMMAX(*top_index, *bottom_index);
@@ -908,9 +909,9 @@ int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
     INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME
   };
   const AV1_COMMON *const cm = &cpi->common;
-  int qdelta =
-      av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
-                                 rate_factor_deltas[rf_level], cm->bit_depth);
+  int qdelta = av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
+                                          rate_factor_deltas[rf_level],
+                                          cm->seq_params.bit_depth);
   return qdelta;
 }
 
@@ -927,7 +928,15 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
   int active_worst_quality = cpi->twopass.active_worst_quality;
   int q;
   int *inter_minq;
-  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+  const int bit_depth = cm->seq_params.bit_depth;
+  ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+#if CUSTOMIZED_GF
+  const int is_intrl_arf_boost =
+      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
+#else
+  const int is_intrl_arf_boost = cpi->refresh_alt2_ref_frame;
+#endif  // CUSTOMIZED_GF
 
   if (frame_is_intra_only(cm)) {
     // Handle the special case for key frames forced when we have reached
@@ -941,16 +950,16 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
       if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
         qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
         active_best_quality = qindex;
-        last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+        last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
         delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
-                                          last_boosted_q * 1.25, cm->bit_depth);
+                                          last_boosted_q * 1.25, bit_depth);
         active_worst_quality =
             AOMMIN(qindex + delta_qindex, active_worst_quality);
       } else {
         qindex = rc->last_boosted_qindex;
-        last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+        last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
         delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
-                                          last_boosted_q * 0.75, cm->bit_depth);
+                                          last_boosted_q * 0.75, bit_depth);
         active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
       }
     } else {
@@ -960,7 +969,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
 
       // Baseline value derived from cpi->active_worst_quality and kf boost.
       active_best_quality =
-          get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
+          get_kf_active_quality(rc, active_worst_quality, bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((width * height) <= (352 * 288)) {
@@ -972,12 +981,12 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
 
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
-      q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+      q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
       active_best_quality +=
-          av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+          av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+             (cpi->refresh_golden_frame || is_intrl_arf_boost ||
               cpi->refresh_alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
@@ -992,24 +1001,45 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
     if (oxcf->rc_mode == AOM_CQ) {
       if (q < cq_level) q = cq_level;
 
-      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
 
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
 
     } else if (oxcf->rc_mode == AOM_Q) {
-      if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) {
+      if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
         active_best_quality = cq_level;
       } else {
-        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
-
-        // Modify best quality for second level arfs. For mode AOM_Q this
-        // becomes the baseline frame q.
-        if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
-          active_best_quality = (active_best_quality + cq_level + 1) / 2;
+        active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+#if USE_SYMM_MULTI_LAYER
+        if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
+          int this_height = gf_group->pyramid_level[gf_group->index];
+          while (this_height < gf_group->pyramid_height) {
+            active_best_quality = (active_best_quality + cq_level + 1) / 2;
+            ++this_height;
+          }
+        } else {
+#endif
+          // Modify best quality for second level arfs. For mode AOM_Q this
+          // becomes the baseline frame q.
+          if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+            active_best_quality = (active_best_quality + cq_level + 1) / 2;
+#if USE_SYMM_MULTI_LAYER
+        }
+#endif
       }
     } else {
-      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+#if USE_SYMM_MULTI_LAYER
+      if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
+        int this_height = gf_group->pyramid_level[gf_group->index];
+        while (this_height < gf_group->pyramid_height) {
+          active_best_quality =
+              (active_best_quality + active_worst_quality + 1) / 2;
+          ++this_height;
+        }
+      }
+#endif
     }
   } else {
     if (oxcf->rc_mode == AOM_Q) {
@@ -1031,7 +1061,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
       (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
     if (frame_is_intra_only(cm) ||
         (!rc->is_src_frame_alt_ref &&
-         (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+         (cpi->refresh_golden_frame || is_intrl_arf_boost ||
           cpi->refresh_alt_ref_frame))) {
       active_best_quality -=
           (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
@@ -1056,7 +1086,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
   // Modify active_best_quality for downscaled normal frames.
   if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
     int qdelta = av1_compute_qdelta_by_rate(
-        rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth);
+        rc, cm->frame_type, active_best_quality, 2.0, bit_depth);
     active_best_quality =
         AOMMAX(active_best_quality + qdelta, rc->best_quality);
   }
@@ -1164,6 +1194,16 @@ static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
 
 static void update_golden_frame_stats(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
+#if CUSTOMIZED_GF
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const GF_GROUP *const gf_group = &twopass->gf_group;
+  const int is_intrnl_arf =
+      cpi->oxcf.pass == 2
+          ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
+          : cpi->refresh_alt2_ref_frame;
+#else  // !CUSTOMIZED_GF: must declare the same is_intrnl_arf used below.
+  const int is_intrnl_arf = cpi->refresh_alt2_ref_frame;
+#endif  // CUSTOMIZED_GF
 
   // Update the Golden frame usage counts.
   // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
@@ -1184,14 +1224,7 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
     } else if (!rc->source_alt_ref_pending) {
       rc->source_alt_ref_active = 0;
     }
-
-    // Decrement count down till next gf
-    if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
-
-  } else if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) {
-    // Decrement count down till next gf
-    if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
-
+  } else if (!cpi->refresh_alt_ref_frame && !is_intrnl_arf) {
     rc->frames_since_golden++;
   }
 }
@@ -1199,6 +1232,17 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
 void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
   const AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+#if CUSTOMIZED_GF
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const GF_GROUP *const gf_group = &twopass->gf_group;
+  const int is_intrnl_arf =
+      cpi->oxcf.pass == 2
+          ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
+          : cpi->refresh_alt2_ref_frame;
+#else  // !CUSTOMIZED_GF
+  const int is_intrnl_arf = cpi->refresh_alt2_ref_frame;
+#endif  // CUSTOMIZED_GF
+
   const int qindex = cm->base_qindex;
 
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
@@ -1218,13 +1262,13 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
   } else {
     if (!rc->is_src_frame_alt_ref &&
-        !(cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+        !(cpi->refresh_golden_frame || is_intrnl_arf ||
           cpi->refresh_alt_ref_frame)) {
       rc->last_q[INTER_FRAME] = qindex;
       rc->avg_frame_qindex[INTER_FRAME] =
           ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
       rc->ni_frames++;
-      rc->tot_q += av1_convert_qindex_to_q(qindex, cm->bit_depth);
+      rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params.bit_depth);
       rc->avg_q = rc->tot_q / rc->ni_frames;
       // Calculate the average Q for normal inter frames (not key or GFU
       // frames).
@@ -1240,7 +1284,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
   // This is used to help set quality in forced key frames to reduce popping
   if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) ||
       (!rc->constrained_gf_group &&
-       (cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
+       (cpi->refresh_alt_ref_frame || is_intrnl_arf ||
         (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
     rc->last_boosted_qindex = qindex;
   }
@@ -1591,6 +1635,10 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
     if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
       rc->max_gf_interval = rc->static_scene_max_gf_interval;
 
+#if FIX_GF_INTERVAL_LENGTH
+    rc->max_gf_interval = FIXED_GF_LENGTH + 1;
+#endif
+
     // Clamp min to max
     rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
   }
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
index 81157ce72..f0508da9e 100644
--- a/third_party/aom/av1/encoder/ratectrl.h
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -24,6 +24,20 @@ extern "C" {
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS 9
 
+#define CUSTOMIZED_GF 1
+#define FIX_GF_INTERVAL_LENGTH 0
+
+#if FIX_GF_INTERVAL_LENGTH
+#define FIXED_GF_LENGTH 16
+#define USE_SYMM_MULTI_LAYER 1
+#else  // !FIX_GF_INTERVAL_LENGTH
+#define USE_SYMM_MULTI_LAYER 0
+#endif  // FIX_GF_INTERVAL_LENGTH
+
+#if USE_SYMM_MULTI_LAYER
+#define USE_MANUAL_GF4_STRUCT 0
+#endif  // USE_SYMM_MULTI_LAYER
+
 #define MIN_GF_INTERVAL 4
 #define MAX_GF_INTERVAL 16
 #define FIXED_GF_INTERVAL 8  // Used in some testing modes only
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
index 17f23e5ec..c4d4777bf 100644
--- a/third_party/aom/av1/encoder/rd.c
+++ b/third_party/aom/av1/encoder/rd.c
@@ -44,9 +44,6 @@
 
 #define RD_THRESH_POW 1.25
 
-// Factor to weigh the rate for switchable interp filters.
-#define SWITCHABLE_INTERP_RATE_FACTOR 1
-
 // The baseline rd thresholds for breaking out of the rd loop for
 // certain modes are assumed to be based on 8x8 blocks.
 // This table is used to correct for block size.
@@ -357,9 +354,10 @@ static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
 };
 
 int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
-  const int64_t q = av1_dc_quant_Q3(qindex, 0, cpi->common.bit_depth);
+  const int64_t q =
+      av1_dc_quant_Q3(qindex, 0, cpi->common.seq_params.bit_depth);
   int64_t rdmult = 0;
-  switch (cpi->common.bit_depth) {
+  switch (cpi->common.seq_params.bit_depth) {
     case AOM_BITS_8: rdmult = 88 * q * q / 24; break;
     case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break;
     case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break;
@@ -394,7 +392,7 @@ static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
 }
 
 void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) {
-  switch (cpi->common.bit_depth) {
+  switch (cpi->common.seq_params.bit_depth) {
     case AOM_BITS_8:
       x->sadperbit16 = sad_per_bit16lut_8[qindex];
       x->sadperbit4 = sad_per_bit4lut_8[qindex];
@@ -420,7 +418,7 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) {
         clamp(av1_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
                   cm->y_dc_delta_q,
               0, MAXQ);
-    const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);
+    const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth);
 
     for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
       // Threshold here seems unnecessarily harsh but fine given actual
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
index 281b676b0..692367d7a 100644
--- a/third_party/aom/av1/encoder/rd.h
+++ b/third_party/aom/av1/encoder/rd.h
@@ -43,6 +43,9 @@ extern "C" {
 #define RD_THRESH_MAX_FACT 64
 #define RD_THRESH_INC 1
 
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
 // This enumerator type needs to be kept aligned with the mode order in
 // const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code.
 typedef enum {
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
index 6f4fced87..fef6d2875 100644
--- a/third_party/aom/av1/encoder/rdopt.c
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -58,8 +58,11 @@
 #include "av1/encoder/tokenize.h"
 #include "av1/encoder/tx_prune_model_weights.h"
 
+#define DNN_BASED_RD_INTERP_FILTER 0
+
 // Set this macro as 1 to collect data about tx size selection.
 #define COLLECT_TX_SIZE_DATA 0
+
 #if COLLECT_TX_SIZE_DATA
 static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
 #endif
@@ -916,9 +919,9 @@ static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w,
   int activity_masking = 0;
 
   int i, j;
-  DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
-  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
-  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, e[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
   for (i = 0; i < bsize_h; i++) {
     for (j = 0; j < bsize_w; j++) {
       e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
@@ -944,9 +947,9 @@ static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w,
 
   int activity_masking = 0;
 
-  DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]);
-  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
-  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, uint16_t, y[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
   int i, j;
   for (i = 0; i < bsize_h; i++) {
     for (j = 0; j < bsize_w; j++) {
@@ -975,8 +978,8 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
   int i, j;
   const MACROBLOCKD *xd = &x->e_mbd;
 
-  DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]);
-  DECLARE_ALIGNED(16, uint16_t, rec[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint16_t, rec[MAX_SB_SQUARE]);
 
   assert(bsw >= 8);
   assert(bsh >= 8);
@@ -1068,8 +1071,8 @@ static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
   int i, j;
   const MACROBLOCKD *xd = &x->e_mbd;
 
-  DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]);
-  DECLARE_ALIGNED(16, int16_t, diff16[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, int16_t, diff16[MAX_SB_SQUARE]);
 
   assert(bsw >= 8);
   assert(bsh >= 8);
@@ -1112,7 +1115,7 @@ static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
     d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex);
   } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
     int coeff_shift = AOMMAX(xd->bd - 8, 0);
-    DECLARE_ALIGNED(16, uint16_t, dst16[MAX_TX_SQUARE]);
+    DECLARE_ALIGNED(16, uint16_t, dst16[MAX_SB_SQUARE]);
 
     for (i = 0; i < bsh; i++) {
       for (j = 0; j < bsw; j++) {
@@ -1146,11 +1149,15 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
   const int bh = block_size_high[bsize];
   unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 
-  const int f_index = bsize - BLOCK_16X16;
-  if (f_index < 0) {
-    const int w_shift = bw == 8 ? 1 : 2;
-    const int h_shift = bh == 8 ? 1 : 2;
-    if (cpi->common.use_highbitdepth) {
+  if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) {
+    // Special cases: calculate 'esq' values manually, as we don't have 'vf'
+    // functions for the 16 (very small) sub-blocks of this block.
+    const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3;
+    const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3;
+    assert(bw <= 32);
+    assert(bh <= 32);
+    assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);
+    if (cpi->common.seq_params.use_highbitdepth) {
       const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
       const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
       for (int i = 0; i < bh; ++i)
@@ -1168,43 +1175,49 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
                         (src[j + i * src_stride] - dst[j + i * dst_stride]);
         }
     }
-  } else {
-    cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[0]);
-    cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+  } else {  // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks.
+    const int f_index =
+        (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16;
+    assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL);
+    const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index;
+    assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]);
+    assert(block_size_high[bsize] == 4 * block_size_high[subsize]);
+    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
+    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
                             &esq[1]);
-    cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
                             &esq[2]);
-    cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
                             dst_stride, &esq[3]);
     src += bh / 4 * src_stride;
     dst += bh / 4 * dst_stride;
 
-    cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[4]);
-    cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
+    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
                             &esq[5]);
-    cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
                             &esq[6]);
-    cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
                             dst_stride, &esq[7]);
     src += bh / 4 * src_stride;
     dst += bh / 4 * dst_stride;
 
-    cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[8]);
-    cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
+    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
                             &esq[9]);
-    cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
                             &esq[10]);
-    cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
                             dst_stride, &esq[11]);
     src += bh / 4 * src_stride;
     dst += bh / 4 * dst_stride;
 
-    cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[12]);
-    cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
+    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
                             &esq[13]);
-    cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
                             &esq[14]);
-    cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
                             dst_stride, &esq[15]);
   }
 
@@ -1371,16 +1384,27 @@ static void get_energy_distribution_finer(const int16_t *diff, int stride,
   unsigned int esq[256];
   const int w_shift = bw <= 8 ? 0 : 1;
   const int h_shift = bh <= 8 ? 0 : 1;
-  const int esq_w = bw <= 8 ? bw : bw / 2;
-  const int esq_h = bh <= 8 ? bh : bh / 2;
+  const int esq_w = bw >> w_shift;
+  const int esq_h = bh >> h_shift;
   const int esq_sz = esq_w * esq_h;
   int i, j;
   memset(esq, 0, esq_sz * sizeof(esq[0]));
-  for (i = 0; i < bh; i++) {
-    unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
-    const int16_t *cur_diff_row = diff + i * stride;
-    for (j = 0; j < bw; j++) {
-      cur_esq_row[j >> w_shift] += cur_diff_row[j] * cur_diff_row[j];
+  if (w_shift) {
+    for (i = 0; i < bh; i++) {
+      unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+      const int16_t *cur_diff_row = diff + i * stride;
+      for (j = 0; j < bw; j += 2) {
+        cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] +
+                                cur_diff_row[j + 1] * cur_diff_row[j + 1]);
+      }
+    }
+  } else {
+    for (i = 0; i < bh; i++) {
+      unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+      const int16_t *cur_diff_row = diff + i * stride;
+      for (j = 0; j < bw; j++) {
+        cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j];
+      }
     }
   }
 
@@ -1558,9 +1582,9 @@ static const float *prune_2D_adaptive_thresholds[] = {
   NULL,
 };
 
-static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
-                       int blk_row, int blk_col, TxSetType tx_set_type,
-                       TX_TYPE_PRUNE_MODE prune_mode) {
+static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            int blk_row, int blk_col, TxSetType tx_set_type,
+                            TX_TYPE_PRUNE_MODE prune_mode) {
   static const int tx_type_table_2D[16] = {
     DCT_DCT,      DCT_ADST,      DCT_FLIPADST,      V_DCT,
     ADST_DCT,     ADST_ADST,     ADST_FLIPADST,     V_ADST,
@@ -1636,7 +1660,7 @@ static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
   const float score_thresh =
       prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1];
 
-  int prune_bitmask = 0;
+  uint16_t prune_bitmask = 0;
   for (int i = 0; i < 16; i++) {
     if (scores_2D[i] < score_thresh && i != max_score_i)
       prune_bitmask |= (1 << tx_type_table_2D[i]);
@@ -1644,9 +1668,27 @@ static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
   return prune_bitmask;
 }
 
+// ((prune >> vtx_tab[tx_type]) & 1)
+static const uint16_t prune_v_mask[] = {
+  0x0000, 0x0425, 0x108a, 0x14af, 0x4150, 0x4575, 0x51da, 0x55ff,
+  0xaa00, 0xae25, 0xba8a, 0xbeaf, 0xeb50, 0xef75, 0xfbda, 0xffff,
+};
+
+// ((prune >> (htx_tab[tx_type] + 8)) & 1)
+static const uint16_t prune_h_mask[] = {
+  0x0000, 0x0813, 0x210c, 0x291f, 0x80e0, 0x88f3, 0xa1ec, 0xa9ff,
+  0x5600, 0x5e13, 0x770c, 0x7f1f, 0xd6e0, 0xdef3, 0xf7ec, 0xffff,
+};
+
+static INLINE uint16_t gen_tx_search_prune_mask(int tx_search_prune) {
+  uint8_t prune_v = tx_search_prune & 0x0F;
+  uint8_t prune_h = (tx_search_prune >> 8) & 0x0F;
+  return (prune_v_mask[prune_v] & prune_h_mask[prune_h]);
+}
+
 static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
                      const MACROBLOCKD *const xd, int tx_set_type) {
-  av1_zero(x->tx_search_prune);
+  x->tx_search_prune[tx_set_type] = 0;
   x->tx_split_prune_flag = 0;
   const MB_MODE_INFO *mbmi = xd->mi[0];
   if (!is_inter_block(mbmi) || cpi->sf.tx_type_search.prune_mode == NO_PRUNE ||
@@ -1656,24 +1698,24 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
   int tx_set = ext_tx_set_index[1][tx_set_type];
   assert(tx_set >= 0);
   const int *tx_set_1D = ext_tx_used_inter_1D[tx_set];
+  int prune = 0;
   switch (cpi->sf.tx_type_search.prune_mode) {
     case NO_PRUNE: return;
     case PRUNE_ONE:
       if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) return;
-      x->tx_search_prune[tx_set_type] = prune_one_for_sby(cpi, bsize, x, xd);
+      prune = prune_one_for_sby(cpi, bsize, x, xd);
+      x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
       break;
     case PRUNE_TWO:
       if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
         if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return;
-        x->tx_search_prune[tx_set_type] =
-            prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
-      }
-      if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) {
-        x->tx_search_prune[tx_set_type] =
-            prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
+        prune = prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
+      } else if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) {
+        prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
+      } else {
+        prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
       }
-      x->tx_search_prune[tx_set_type] =
-          prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
+      x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
       break;
     case PRUNE_2D_ACCURATE:
     case PRUNE_2D_FAST: break;
@@ -1681,17 +1723,6 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
   }
 }
 
-static int do_tx_type_search(TX_TYPE tx_type, int prune,
-                             TX_TYPE_PRUNE_MODE mode) {
-  // TODO(sarahparker) implement for non ext tx
-  if (mode >= PRUNE_2D_ACCURATE) {
-    return !((prune >> tx_type) & 1);
-  } else {
-    return !(((prune >> vtx_tab[tx_type]) & 1) |
-             ((prune >> (htx_tab[tx_type] + 8)) & 1));
-  }
-}
-
 static void model_rd_from_sse(const AV1_COMP *const cpi,
                               const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
                               int plane, int64_t sse, int *rate,
@@ -1764,9 +1795,11 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
   for (plane = plane_from; plane <= plane_to; ++plane) {
     struct macroblock_plane *const p = &x->plane[plane];
     struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE bs =
+    const BLOCK_SIZE plane_bsize =
         get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    unsigned int sse;
+    const int bw = block_size_wide[plane_bsize];
+    const int bh = block_size_high[plane_bsize];
+    int64_t sse;
     int rate;
     int64_t dist;
 
@@ -1774,14 +1807,14 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
 
     // TODO(geza): Write direct sse functions that do not compute
     // variance as well.
-    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
-                       &sse);
+    sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh);
+    sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
 
-    if (plane == 0) x->pred_sse[ref] = sse;
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
 
     total_sse += sse;
 
-    model_rd_from_sse(cpi, xd, bs, plane, sse, &rate, &dist);
+    model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &rate, &dist);
 
     rate_sum += rate;
     dist_sum += dist;
@@ -1934,7 +1967,8 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
 static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
                                       int blk_row, int blk_col,
                                       const BLOCK_SIZE plane_bsize,
-                                      const BLOCK_SIZE tx_bsize) {
+                                      const BLOCK_SIZE tx_bsize,
+                                      int force_sse) {
   int visible_rows, visible_cols;
   const MACROBLOCKD *xd = &x->e_mbd;
   get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
@@ -1944,13 +1978,17 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
 #if CONFIG_DIST_8X8
   int txb_height = block_size_high[tx_bsize];
   int txb_width = block_size_wide[tx_bsize];
-  if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8) {
+  if (!force_sse && x->using_dist_8x8 && plane == 0 && txb_width >= 8 &&
+      txb_height >= 8) {
     const int src_stride = x->plane[plane].src.stride;
     const int src_idx = (blk_row * src_stride + blk_col)
                         << tx_size_wide_log2[0];
+    const int diff_idx = (blk_row * diff_stride + blk_col)
+                         << tx_size_wide_log2[0];
     const uint8_t *src = &x->plane[plane].src.buf[src_idx];
-    return dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width,
-                         txb_height, visible_cols, visible_rows, x->qindex);
+    return dist_8x8_diff(x, src, src_stride, diff + diff_idx, diff_stride,
+                         txb_width, txb_height, visible_cols, visible_rows,
+                         x->qindex);
   }
 #endif
   diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]);
@@ -2182,10 +2220,14 @@ static void get_2x2_normalized_sses_and_sads(
       for (int col = 0; col < 2; ++col) {
         const int16_t *const this_src_diff =
             src_diff + row * half_height * diff_stride + col * half_width;
-        sse_norm_arr[row * 2 + col] =
-            get_sse_norm(this_src_diff, diff_stride, half_width, half_height);
-        sad_norm_arr[row * 2 + col] =
-            get_sad_norm(this_src_diff, diff_stride, half_width, half_height);
+        if (sse_norm_arr) {
+          sse_norm_arr[row * 2 + col] =
+              get_sse_norm(this_src_diff, diff_stride, half_width, half_height);
+        }
+        if (sad_norm_arr) {
+          sad_norm_arr[row * 2 + col] =
+              get_sad_norm(this_src_diff, diff_stride, half_width, half_height);
+        }
       }
     }
   } else {  // use function pointers to calculate stats
@@ -2199,28 +2241,35 @@ static void get_2x2_normalized_sses_and_sads(
         const uint8_t *const this_dst =
             dst + row * half_height * dst_stride + col * half_width;
 
-        unsigned int this_sse;
-        cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
-                                      dst_stride, &this_sse);
-        sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
+        if (sse_norm_arr) {
+          unsigned int this_sse;
+          cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
+                                        dst_stride, &this_sse);
+          sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
+        }
 
-        const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf(
-            this_src, src_stride, this_dst, dst_stride);
-        sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
+        if (sad_norm_arr) {
+          const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf(
+              this_src, src_stride, this_dst, dst_stride);
+          sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
+        }
       }
     }
   }
 }
 
 #if CONFIG_COLLECT_RD_STATS
-// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
-// 0: Do not collect any RD stats
-// 1: Collect RD stats for transform units
-// 2: Collect RD stats for partition units
+  // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+  // 0: Do not collect any RD stats
+  // 1: Collect RD stats for transform units
+  // 2: Collect RD stats for partition units
+
+#if CONFIG_COLLECT_RD_STATS == 1
 static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     const RD_STATS *const rd_stats, int blk_row,
                                     int blk_col, BLOCK_SIZE plane_bsize,
-                                    TX_SIZE tx_size, TX_TYPE tx_type) {
+                                    TX_SIZE tx_size, TX_TYPE tx_type,
+                                    int64_t rd) {
   if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
 
   // Generate small sample to restrict output size.
@@ -2304,9 +2353,12 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
   fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
           hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
 
+  fprintf(fout, " %d %" PRId64, x->rdmult, rd);
+
   fprintf(fout, "\n");
   fclose(fout);
 }
+#endif  // CONFIG_COLLECT_RD_STATS == 1
 
 #if CONFIG_COLLECT_RD_STATS == 2
 static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -2327,12 +2379,14 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
   const int plane = 0;
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int bw = block_size_wide[plane_bsize];
-  const int bh = block_size_high[plane_bsize];
+  const int diff_stride = block_size_wide[plane_bsize];
+  int bw, bh;
+  get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+                     &bh);
+  const int num_samples = bw * bh;
   const int dequant_shift =
       (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
   const int q_step = pd->dequant_Q3[1] >> dequant_shift;
-  const double num_samples = bw * bh;
 
   const double rate_norm = (double)rd_stats->rate / num_samples;
   const double dist_norm = (double)rd_stats->dist / num_samples;
@@ -2343,23 +2397,28 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
   const uint8_t *const src = p->src.buf;
   const int dst_stride = pd->dst.stride;
   const uint8_t *const dst = pd->dst.buf;
-  unsigned int sse;
-  cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+  const int16_t *const src_diff = p->src_diff;
+  const int shift = (xd->bd - 8);
+
+  int64_t sse = aom_sum_squares_2d_i16(src_diff, diff_stride, bw, bh);
+  sse = ROUND_POWER_OF_TWO(sse, shift * 2);
   const double sse_norm = (double)sse / num_samples;
 
   const unsigned int sad =
       cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
-  const double sad_norm = (double)sad / num_samples;
+  const double sad_norm =
+      (double)sad / (1 << num_pels_log2_lookup[plane_bsize]);
 
   fprintf(fout, " %g %g", sse_norm, sad_norm);
 
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int16_t *const src_diff = p->src_diff;
-
   double sse_norm_arr[4], sad_norm_arr[4];
   get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
                                    dst_stride, src_diff, diff_stride,
                                    sse_norm_arr, sad_norm_arr);
+  if (shift) {
+    for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
+    for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift);
+  }
   for (int i = 0; i < 4; ++i) {
     fprintf(fout, " %g", sse_norm_arr[i]);
   }
@@ -2376,7 +2435,8 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
   const double model_dist_norm = (double)model_dist / num_samples;
   fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
 
-  const double mean = get_mean(src_diff, diff_stride, bw, bh);
+  double mean = get_mean(src_diff, diff_stride, bw, bh);
+  mean /= (1 << shift);
   double hor_corr, vert_corr;
   get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
   fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
@@ -2393,20 +2453,19 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
 #endif  // CONFIG_COLLECT_RD_STATS == 2
 #endif  // CONFIG_COLLECT_RD_STATS
 
-static void model_rd_with_dnn(const AV1_COMP *const cpi,
-                              const MACROBLOCK *const x, BLOCK_SIZE bsize,
-                              int plane, unsigned int *rsse, int *rate,
-                              int64_t *dist) {
+static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                              BLOCK_SIZE plane_bsize, int plane, int64_t *rsse,
+                              int *rate, int64_t *dist) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
   const int log_numpels = num_pels_log2_lookup[plane_bsize];
-  const int num_samples = (1 << log_numpels);
 
   const struct macroblock_plane *const p = &x->plane[plane];
-  const int bw = block_size_wide[plane_bsize];
-  const int bh = block_size_high[plane_bsize];
+  int bw, bh;
+  const int diff_stride = block_size_wide[plane_bsize];
+  get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+                     &bh);
+  const int num_samples = bw * bh;
   const int dequant_shift =
       (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
   const int q_step = pd->dequant_Q3[1] >> dequant_shift;
@@ -2415,55 +2474,73 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi,
   const uint8_t *const src = p->src.buf;
   const int dst_stride = pd->dst.stride;
   const uint8_t *const dst = pd->dst.buf;
-  unsigned int sse;
-  cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+  const int16_t *const src_diff = p->src_diff;
+  const int shift = (xd->bd - 8);
+  int64_t sse = aom_sum_squares_2d_i16(p->src_diff, diff_stride, bw, bh);
+  sse = ROUND_POWER_OF_TWO(sse, shift * 2);
   const double sse_norm = (double)sse / num_samples;
 
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int16_t *const src_diff = p->src_diff;
+  if (sse == 0) {
+    if (rate) *rate = 0;
+    if (dist) *dist = 0;
+    if (rsse) *rsse = sse;
+    return;
+  }
+  if (plane) {
+    int model_rate;
+    int64_t model_dist;
+    model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate,
+                      &model_dist);
+    if (rate) *rate = model_rate;
+    if (dist) *dist = model_dist;
+    if (rsse) *rsse = sse;
+    return;
+  }
 
-  double sse_norm_arr[4], sad_norm_arr[4];
+  double sse_norm_arr[4];
   get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
                                    dst_stride, src_diff, diff_stride,
-                                   sse_norm_arr, sad_norm_arr);
-  const double mean = get_mean(src_diff, diff_stride, bw, bh);
+                                   sse_norm_arr, NULL);
+  double mean = get_mean(src_diff, bw, bw, bh);
+  if (shift) {
+    for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
+    mean /= (1 << shift);
+  }
   const double variance = sse_norm - mean * mean;
+  assert(variance >= 0.0);
   const double q_sqr = (double)(q_step * q_step);
-  const double q_sqr_by_variance = q_sqr / variance;
+  const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0);
   double hor_corr, vert_corr;
   get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
-  double hdist[4] = { 0 }, vdist[4] = { 0 };
-  get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst,
-                               dst_stride, 1, hdist, vdist);
 
-  float features[20];
-  features[0] = (float)hdist[0];
-  features[1] = (float)hdist[1];
-  features[2] = (float)hdist[2];
-  features[3] = (float)hdist[3];
-  features[4] = (float)hor_corr;
-  features[5] = (float)log_numpels;
-  features[6] = (float)mean;
-  features[7] = (float)q_sqr;
-  features[8] = (float)q_sqr_by_variance;
-  features[9] = (float)sse_norm_arr[0];
-  features[10] = (float)sse_norm_arr[1];
-  features[11] = (float)sse_norm_arr[2];
-  features[12] = (float)sse_norm_arr[3];
-  features[13] = (float)sse_norm_arr[3];
-  features[14] = (float)variance;
-  features[15] = (float)vdist[0];
-  features[16] = (float)vdist[1];
-  features[17] = (float)vdist[2];
-  features[18] = (float)vdist[3];
-  features[19] = (float)vert_corr;
-
-  float rate_f, dist_f;
-  av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_f);
+  float features[11];
+  features[0] = (float)hor_corr;
+  features[1] = (float)log_numpels;
+  features[2] = (float)q_sqr;
+  features[3] = (float)q_sqr_by_sse_norm;
+  features[4] = (float)sse_norm_arr[0];
+  features[5] = (float)sse_norm_arr[1];
+  features[6] = (float)sse_norm_arr[2];
+  features[7] = (float)sse_norm_arr[3];
+  features[8] = (float)sse_norm;
+  features[9] = (float)variance;
+  features[10] = (float)vert_corr;
+
+  float rate_f, dist_by_sse_norm_f;
+  av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f);
   av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f);
-  const int rate_i = (int)(AOMMAX(0.0, rate_f * (1 << log_numpels)) + 0.5);
-  const int64_t dist_i =
-      (int64_t)(AOMMAX(0.0, dist_f * (1 << log_numpels)) + 0.5);
+  const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm));
+  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+
+  // Check if skip is better
+  if (RDCOST(x->rdmult, rate_i, dist_i) >= RDCOST(x->rdmult, 0, (sse << 4))) {
+    dist_i = sse << 4;
+    rate_i = 0;
+  } else if (rate_i == 0) {
+    dist_i = sse << 4;
+  }
+
   if (rate) *rate = rate_i;
   if (dist) *dist = dist_i;
   if (rsse) *rsse = sse;
@@ -2488,15 +2565,18 @@ void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
   x->pred_sse[ref] = 0;
 
   for (int plane = plane_from; plane <= plane_to; ++plane) {
-    unsigned int sse;
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    int64_t sse;
     int rate;
     int64_t dist;
 
     if (x->skip_chroma_rd && plane) continue;
 
-    model_rd_with_dnn(cpi, x, bsize, plane, &sse, &rate, &dist);
+    model_rd_with_dnn(cpi, x, plane_bsize, plane, &sse, &rate, &dist);
 
-    if (plane == 0) x->pred_sse[ref] = sse;
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
 
     total_sse += sse;
     rate_sum += rate;
@@ -2586,27 +2666,16 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   int rate_cost = 0;
   TX_TYPE txk_start = DCT_DCT;
   TX_TYPE txk_end = TX_TYPES - 1;
-  if (!(!is_inter && x->use_default_intra_tx_type) &&
-      !(is_inter && x->use_default_inter_tx_type))
-    if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan)
-      if (plane == 0) txk_end = DCT_DCT;
+  if ((!is_inter && x->use_default_intra_tx_type) ||
+      (is_inter && x->use_default_inter_tx_type)) {
+    txk_start = txk_end = get_default_tx_type(0, xd, tx_size);
+  } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) {
+    if (plane == 0) txk_end = DCT_DCT;
+  }
 
   uint8_t best_txb_ctx = 0;
   const TxSetType tx_set_type =
       av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
-  int prune = 0;
-  const int do_prune = plane == 0 && !fast_tx_search && txk_end != DCT_DCT &&
-                       !(!is_inter && x->use_default_intra_tx_type) &&
-                       !(is_inter && x->use_default_inter_tx_type) &&
-                       cpi->sf.tx_type_search.prune_mode > NO_PRUNE;
-  if (do_prune && is_inter) {
-    if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) {
-      prune = prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col,
-                          tx_set_type, cpi->sf.tx_type_search.prune_mode);
-    } else {
-      prune = x->tx_search_prune[tx_set_type];
-    }
-  }
 
   TX_TYPE uv_tx_type = DCT_DCT;
   if (plane) {
@@ -2615,39 +2684,38 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
         av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size,
                         cm->reduced_tx_set_used);
   }
-  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
+  const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type];
+  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
+      ext_tx_used_flag == 0x0001) {
     txk_start = txk_end = DCT_DCT;
   }
-
-  int8_t allowed_tx_mask[TX_TYPES] = { 0 };  // 1: allow; 0: skip.
-  int allowed_tx_num = 0;
-  if (fast_tx_search) {
-    allowed_tx_mask[DCT_DCT] = 1;
-    allowed_tx_mask[H_DCT] = 1;
-    allowed_tx_mask[V_DCT] = 1;
+  uint16_t allowed_tx_mask = 0;  // 1: allow; 0: skip.
+  if (txk_start == txk_end) {
+    allowed_tx_mask = 1 << txk_start;
+    allowed_tx_mask &= ext_tx_used_flag;
+  } else if (fast_tx_search) {
+    allowed_tx_mask = 0x0c01;  // V_DCT, H_DCT, DCT_DCT
+    allowed_tx_mask &= ext_tx_used_flag;
   } else {
-    memset(allowed_tx_mask + txk_start, 1, txk_end - txk_start + 1);
-  }
-  for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
-    if (do_prune) {
-      if (!do_tx_type_search(tx_type, prune, cpi->sf.tx_type_search.prune_mode))
-        allowed_tx_mask[tx_type] = 0;
-    }
-    if (plane == 0 && allowed_tx_mask[tx_type]) {
-      if (!av1_ext_tx_used[tx_set_type][tx_type])
-        allowed_tx_mask[tx_type] = 0;
-      else if (!is_inter && x->use_default_intra_tx_type &&
-               tx_type != get_default_tx_type(0, xd, tx_size))
-        allowed_tx_mask[tx_type] = 0;
-      else if (is_inter && x->use_default_inter_tx_type &&
-               tx_type != get_default_tx_type(0, xd, tx_size))
-        allowed_tx_mask[tx_type] = 0;
-    }
-    allowed_tx_num += allowed_tx_mask[tx_type];
+    assert(plane == 0);
+    allowed_tx_mask = ext_tx_used_flag;
+    // !fast_tx_search && txk_end != txk_start && plane == 0
+    const int do_prune = cpi->sf.tx_type_search.prune_mode > NO_PRUNE;
+    if (do_prune && is_inter) {
+      if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) {
+        const uint16_t prune =
+            prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
+                        cpi->sf.tx_type_search.prune_mode);
+        allowed_tx_mask &= (~prune);
+      } else {
+        allowed_tx_mask &= (~x->tx_search_prune[tx_set_type]);
+      }
+    }
   }
   // Need to have at least one transform type allowed.
-  if (allowed_tx_num == 0) {
-    allowed_tx_mask[plane ? uv_tx_type : DCT_DCT] = 1;
+  if (allowed_tx_mask == 0) {
+    txk_start = txk_end = (plane ? uv_tx_type : DCT_DCT);
+    allowed_tx_mask = (1 << txk_start);
   }
 
   int use_transform_domain_distortion =
@@ -2664,20 +2732,21 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
       cpi->sf.use_transform_domain_distortion == 1 &&
       use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD &&
       !x->cb_partition_scan;
-  if (calc_pixel_domain_distortion_final && allowed_tx_num <= 1)
+  if (calc_pixel_domain_distortion_final &&
+      (txk_start == txk_end || allowed_tx_mask == 0x0001))
     calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
 
   const uint16_t *eobs_ptr = x->plane[plane].eobs;
 
   const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
   int64_t block_sse =
-      pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
+      pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize, 1);
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
   block_sse *= 16;
 
   for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
-    if (!allowed_tx_mask[tx_type]) continue;
+    if (!(allowed_tx_mask & (1 << tx_type))) continue;
     if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
     RD_STATS this_rd_stats;
     av1_invalid_rd_stats(&this_rd_stats);
@@ -2686,8 +2755,8 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
       av1_xform_quant(
           cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
           USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
-      rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block,
-                                  tx_size, txb_ctx, use_fast_coef_costing);
+      rate_cost = av1_cost_coeffs(cm, x, plane, block, tx_size, tx_type,
+                                  txb_ctx, use_fast_coef_costing);
     } else {
       av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                       tx_size, tx_type, AV1_XFORM_QUANT_FP);
@@ -2696,13 +2765,18 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
         // Calculate distortion quickly in transform domain.
         dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
                              &this_rd_stats.sse);
-        rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block,
-                                    tx_size, txb_ctx, use_fast_coef_costing);
+
+        const int64_t best_rd_ = AOMMIN(best_rd, ref_best_rd);
+        const int64_t dist_cost_estimate =
+            RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
+        if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
+
+        rate_cost = av1_cost_coeffs(cm, x, plane, block, tx_size, tx_type,
+                                    txb_ctx, use_fast_coef_costing);
         const int64_t rd_estimate =
             AOMMIN(RDCOST(x->rdmult, rate_cost, this_rd_stats.dist),
                    RDCOST(x->rdmult, 0, this_rd_stats.sse));
-        if (rd_estimate - (rd_estimate >> 3) > AOMMIN(best_rd, ref_best_rd))
-          continue;
+        if (rd_estimate - (rd_estimate >> 3) > best_rd_) continue;
       }
       av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1,
                      &rate_cost);
@@ -2741,7 +2815,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
 #if CONFIG_COLLECT_RD_STATS == 1
     if (plane == 0) {
       PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col,
-                              plane_bsize, tx_size, tx_type);
+                              plane_bsize, tx_size, tx_type, rd);
     }
 #endif  // CONFIG_COLLECT_RD_STATS == 1
 
@@ -3097,6 +3171,7 @@ static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
                                    MACROBLOCK *x, int *r, int64_t *d, int *s,
                                    int64_t *sse, int64_t ref_best_rd) {
   RD_STATS rd_stats;
+  av1_subtract_plane(x, bs, 0);
   x->rd_model = LOW_TXFM_RD;
   int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs,
                         max_txsize_rect_lookup[bs], FTXS_NONE);
@@ -3267,7 +3342,7 @@ static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x,
       const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
       palette_mode_cost +=
           av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
-                                   n_cache, cpi->common.bit_depth);
+                                   n_cache, cpi->common.seq_params.bit_depth);
       palette_mode_cost +=
           av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
       total_rate += palette_mode_cost;
@@ -3318,8 +3393,8 @@ static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x,
           write_uniform_cost(plt_size, color_map[0]);
       uint16_t color_cache[2 * PALETTE_MAX_SIZE];
       const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
-      palette_mode_cost += av1_palette_color_cost_uv(pmi, color_cache, n_cache,
-                                                     cpi->common.bit_depth);
+      palette_mode_cost += av1_palette_color_cost_uv(
+          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
       palette_mode_cost +=
           av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
       total_rate += palette_mode_cost;
@@ -3375,6 +3450,7 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
     }
   }
   // RD estimation.
+  av1_subtract_plane(x, bsize, 0);
   model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate,
                   &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL,
                   NULL, NULL);
@@ -3458,10 +3534,10 @@ static void palette_rd_y(
     return;
   }
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  if (cpi->common.use_highbitdepth)
+  if (cpi->common.seq_params.use_highbitdepth)
     for (int i = 0; i < k; ++i)
-      pmi->palette_colors[i] =
-          clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth);
+      pmi->palette_colors[i] = clip_pixel_highbd(
+          (int)centroids[i], cpi->common.seq_params.bit_depth);
   else
     for (int i = 0; i < k; ++i)
       pmi->palette_colors[i] = clip_pixel(centroids[i]);
@@ -3514,6 +3590,7 @@ static int rd_pick_palette_intra_sby(
   MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
   assert(av1_allow_palette(cpi->common.allow_screen_content_tools, bsize));
+  const SequenceHeader *const seq_params = &cpi->common.seq_params;
   int colors, n;
   const int src_stride = x->plane[0].src.stride;
   const uint8_t *const src = x->plane[0].src.buf;
@@ -3523,9 +3600,9 @@ static int rd_pick_palette_intra_sby(
                            &cols);
 
   int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
-  if (cpi->common.use_highbitdepth)
+  if (seq_params->use_highbitdepth)
     colors = av1_count_colors_highbd(src, src_stride, rows, cols,
-                                     cpi->common.bit_depth, count_buf);
+                                     seq_params->bit_depth, count_buf);
   else
     colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
   mbmi->filter_intra_mode_info.use_filter_intra = 0;
@@ -3537,12 +3614,12 @@ static int rd_pick_palette_intra_sby(
     int centroids[PALETTE_MAX_SIZE];
     int lb, ub, val;
     uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
-    if (cpi->common.use_highbitdepth)
+    if (seq_params->use_highbitdepth)
       lb = ub = src16[0];
     else
       lb = ub = src[0];
 
-    if (cpi->common.use_highbitdepth) {
+    if (seq_params->use_highbitdepth) {
       for (r = 0; r < rows; ++r) {
         for (c = 0; c < cols; ++c) {
           val = src16[r * src_stride + c];
@@ -3576,7 +3653,7 @@ static int rd_pick_palette_intra_sby(
     int top_colors[PALETTE_MAX_SIZE] = { 0 };
     for (i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
       int max_count = 0;
-      for (int j = 0; j < (1 << cpi->common.bit_depth); ++j) {
+      for (int j = 0; j < (1 << seq_params->bit_depth); ++j) {
         if (count_buf[j] > max_count) {
           max_count = count_buf[j];
           top_colors[i] = j;
@@ -4316,6 +4393,244 @@ static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row,
   return (int)(score * 100);
 }
 
+typedef struct {
+  int64_t rd;           // RD cost of the candidate; INT64_MAX until evaluated.
+  int txb_entropy_ctx;  // p->txb_entropy_ctx[block] captured after the search.
+  TX_TYPE tx_type;      // Transform type picked for it; TX_TYPES until set.
+} TxCandidateInfo;
+// Rates the luma block at (blk_row, blk_col) as one unsplit tx_size transform.
+static void try_tx_block_no_split(
+    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
+    const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
+    int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
+    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
+    TxCandidateInfo *no_split) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct macroblock_plane *const p = &x->plane[0];
+  const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+  // rd_stats receives rate/dist/skip; no_split receives the RD cost snapshot.
+  no_split->rd = INT64_MAX;
+  no_split->txb_entropy_ctx = 0;
+  no_split->tx_type = TX_TYPES;
+  // ta/tl are offset by block column/row to reach this block's contexts.
+  const ENTROPY_CONTEXT *const pta = ta + blk_col;
+  const ENTROPY_CONTEXT *const ptl = tl + blk_row;
+  // zero_blk_rate is the rate charged if the block ends up coded as skipped.
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
+  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+                                .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+
+  rd_stats->ref_rdcost = ref_best_rd;
+  rd_stats->zero_rate = zero_blk_rate;
+  const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
+  mbmi->inter_tx_size[index] = tx_size;
+  tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta,
+                ptl, rd_stats, ftxs_mode, ref_best_rd,
+                rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
+  assert(rd_stats->rate < INT_MAX);
+  // Zero the block if that is no costlier or skip is set; never if lossless.
+  if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+           RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+       rd_stats->skip == 1) &&
+      !xd->lossless[mbmi->segment_id]) {
+#if CONFIG_RD_DEBUG
+    av1_update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col,
+                              zero_blk_rate - rd_stats->rate);
+#endif  // CONFIG_RD_DEBUG
+    rd_stats->rate = zero_blk_rate;
+    rd_stats->dist = rd_stats->sse;
+    rd_stats->skip = 1;
+    x->blk_skip[blk_row * bw + blk_col] = 1;
+    p->eobs[block] = 0;
+    update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                     DCT_DCT);
+  } else {
+    x->blk_skip[blk_row * bw + blk_col] = 0;
+    rd_stats->skip = 0;
+  }
+  // Charge the no-split flag cost only where a split would be allowed.
+  if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+    rd_stats->rate += x->txfm_partition_cost[txfm_partition_ctx][0];
+  // Snapshot the result so the caller can restore it after trying a split.
+  no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+  no_split->txb_entropy_ctx = p->txb_entropy_ctx[block];
+  const int txk_type_idx =
+      av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+  no_split->tx_type = mbmi->txk_type[txk_type_idx];
+}
+// Forward declaration: try_tx_block_split() recurses into select_tx_block().
+static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+                            int blk_col, int block, TX_SIZE tx_size, int depth,
+                            BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+                            ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
+                            TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
+                            int64_t ref_best_rd, int *is_cost_valid,
+                            FAST_TX_SEARCH_MODE ftxs_mode,
+                            TXB_RD_INFO_NODE *rd_info_node);
+// Rates the split choice: recurses into each sub_tx_size_map[tx_size] child.
+static void try_tx_block_split(
+    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+    ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+    int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
+    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
+    RD_STATS *split_rd_stats, int64_t *split_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+  struct macroblock_plane *const p = &x->plane[0];
+  const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+  const int bsw = tx_size_wide_unit[sub_txs];
+  const int bsh = tx_size_high_unit[sub_txs];
+  const int sub_step = bsw * bsh;
+  RD_STATS this_rd_stats;
+  int this_cost_valid = 1;
+  int64_t tmp_rd = 0;
+#if CONFIG_DIST_8X8
+  int sub8x8_eob[4] = { 0, 0, 0, 0 };
+  struct macroblockd_plane *const pd = &xd->plane[0];
+#endif
+  split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
+
+  assert(tx_size < TX_SIZES_ALL);
+  // Visit sub-blocks in raster order; tmp_rd shrinks each child's RD budget.
+  int blk_idx = 0;
+  for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
+    for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) {
+      const int offsetr = blk_row + r;
+      const int offsetc = blk_col + c;
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+      assert(blk_idx < 4);
+      select_tx_block(
+          cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta,
+          tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd,
+          &this_cost_valid, ftxs_mode,
+          (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
+      // Bail out on an invalid child (kept going when dist_8x8 is in use).
+#if CONFIG_DIST_8X8
+      if (!x->using_dist_8x8)
+#endif
+        if (!this_cost_valid) goto LOOP_EXIT;
+#if CONFIG_DIST_8X8
+      if (x->using_dist_8x8 && tx_size == TX_8X8) {
+        sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block];
+      }
+#endif  // CONFIG_DIST_8X8
+      av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
+      // Without dist_8x8, abandon the split once it costs more than no-split.
+      tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
+#if CONFIG_DIST_8X8
+      if (!x->using_dist_8x8)
+#endif
+        if (no_split_rd < tmp_rd) {
+          this_cost_valid = 0;
+          goto LOOP_EXIT;
+        }
+      block += sub_step;
+    }
+  }
+
+LOOP_EXIT : {}
+
+#if CONFIG_DIST_8X8
+  if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) {
+    const int src_stride = p->src.stride;
+    const int dst_stride = pd->dst.stride;
+
+    const uint8_t *src =
+        &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+    const uint8_t *dst =
+        &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+
+    int64_t dist_8x8;
+    const int qindex = x->qindex;
+    const int pred_stride = block_size_wide[plane_bsize];
+    const int pred_idx = (blk_row * pred_stride + blk_col)
+                         << tx_size_wide_log2[0];
+    const int16_t *pred = &x->pred_luma[pred_idx];
+    int i, j;
+    int row, col;
+
+    uint8_t *pred8;
+    DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]);
+
+    dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8,
+                            8, 8, 8, 8, qindex) *
+               16;
+
+#ifdef DEBUG_DIST_8X8
+    if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
+      assert(split_rd_stats->sse == dist_8x8);
+#endif  // DEBUG_DIST_8X8
+
+    split_rd_stats->sse = dist_8x8;
+    // Build the 8x8 reference per 4x4 quadrant: pred if eob > 0, else dst.
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      pred8 = CONVERT_TO_BYTEPTR(pred8_16);
+    else
+      pred8 = (uint8_t *)pred8_16;
+
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      for (row = 0; row < 2; ++row) {
+        for (col = 0; col < 2; ++col) {
+          int idx = row * 2 + col;
+          int eob = sub8x8_eob[idx];
+
+          if (eob > 0) {
+            for (j = 0; j < 4; j++)
+              for (i = 0; i < 4; i++)
+                CONVERT_TO_SHORTPTR(pred8)
+                [(row * 4 + j) * 8 + 4 * col + i] =
+                    pred[(row * 4 + j) * pred_stride + 4 * col + i];
+          } else {
+            for (j = 0; j < 4; j++)
+              for (i = 0; i < 4; i++)
+                CONVERT_TO_SHORTPTR(pred8)
+                [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR(
+                    dst)[(row * 4 + j) * dst_stride + 4 * col + i];
+          }
+        }
+      }
+    } else {
+      for (row = 0; row < 2; ++row) {
+        for (col = 0; col < 2; ++col) {
+          int idx = row * 2 + col;
+          int eob = sub8x8_eob[idx];
+
+          if (eob > 0) {
+            for (j = 0; j < 4; j++)
+              for (i = 0; i < 4; i++)
+                pred8[(row * 4 + j) * 8 + 4 * col + i] =
+                    (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i];
+          } else {
+            for (j = 0; j < 4; j++)
+              for (i = 0; i < 4; i++)
+                pred8[(row * 4 + j) * 8 + 4 * col + i] =
+                    dst[(row * 4 + j) * dst_stride + 4 * col + i];
+          }
+        }
+      }
+    }
+    dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8,
+                            8, 8, qindex) *
+               16;
+
+#ifdef DEBUG_DIST_8X8
+    if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
+      assert(split_rd_stats->dist == dist_8x8);
+#endif  // DEBUG_DIST_8X8
+
+    split_rd_stats->dist = dist_8x8;
+    tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
+  }
+#endif  // CONFIG_DIST_8X8
+  if (this_cost_valid) *split_rd = tmp_rd;
+}
+
 // Search for the best tx partition/type for a given luma block.
 static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                             int blk_col, int block, TX_SIZE tx_size, int depth,
@@ -4338,8 +4653,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-  ENTROPY_CONTEXT *pta = ta + blk_col;
-  ENTROPY_CONTEXT *ptl = tl + blk_row;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
                                          mbmi->sb_type, tx_size);
@@ -4348,64 +4661,25 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
   const int try_no_split = 1;
   int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
 
-  int64_t no_split_rd = INT64_MAX;
-  int no_split_txb_entropy_ctx = 0;
-  TX_TYPE no_split_tx_type = TX_TYPES;
+  TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
+
   // TX no split
   if (try_no_split) {
-    const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-    TXB_CTX txb_ctx;
-    get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
-    const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
-                                  .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+    try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+                          plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
+                          ftxs_mode, rd_info_node, &no_split);
 
-    rd_stats->ref_rdcost = ref_best_rd;
-    rd_stats->zero_rate = zero_blk_rate;
-    const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
-    mbmi->inter_tx_size[index] = tx_size;
-    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta,
-                  ptl, rd_stats, ftxs_mode, ref_best_rd,
-                  rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
-    assert(rd_stats->rate < INT_MAX);
-
-    if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
-             RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
-         rd_stats->skip == 1) &&
-        !xd->lossless[mbmi->segment_id]) {
-#if CONFIG_RD_DEBUG
-      av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
-                                zero_blk_rate - rd_stats->rate);
-#endif  // CONFIG_RD_DEBUG
-      rd_stats->rate = zero_blk_rate;
-      rd_stats->dist = rd_stats->sse;
-      rd_stats->skip = 1;
-      x->blk_skip[blk_row * bw + blk_col] = 1;
-      p->eobs[block] = 0;
-      update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
-                       DCT_DCT);
-    } else {
-      x->blk_skip[blk_row * bw + blk_col] = 0;
-      rd_stats->skip = 0;
-    }
-
-    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-      rd_stats->rate += x->txfm_partition_cost[ctx][0];
-    no_split_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
     if (cpi->sf.adaptive_txb_search_level &&
-        (no_split_rd -
-         (no_split_rd >> (1 + cpi->sf.adaptive_txb_search_level))) >
+        (no_split.rd -
+         (no_split.rd >> (1 + cpi->sf.adaptive_txb_search_level))) >
             ref_best_rd) {
       *is_cost_valid = 0;
       return;
     }
 
-    no_split_txb_entropy_ctx = p->txb_entropy_ctx[block];
-    const int txk_type_idx =
-        av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
-    no_split_tx_type = mbmi->txk_type[txk_type_idx];
-
-    if (cpi->sf.txb_split_cap)
+    if (cpi->sf.txb_split_cap) {
       if (p->eobs[block] == 0) try_split = 0;
+    }
   }
 
   if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) {
@@ -4427,155 +4701,10 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
   RD_STATS split_rd_stats;
   av1_init_rd_stats(&split_rd_stats);
   if (try_split) {
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsw = tx_size_wide_unit[sub_txs];
-    const int bsh = tx_size_high_unit[sub_txs];
-    const int sub_step = bsw * bsh;
-    RD_STATS this_rd_stats;
-    int this_cost_valid = 1;
-    int64_t tmp_rd = 0;
-#if CONFIG_DIST_8X8
-    int sub8x8_eob[4] = { 0, 0, 0, 0 };
-    struct macroblockd_plane *const pd = &xd->plane[0];
-#endif
-    split_rd_stats.rate = x->txfm_partition_cost[ctx][1];
-
-    assert(tx_size < TX_SIZES_ALL);
-
-    ref_best_rd = AOMMIN(no_split_rd, ref_best_rd);
-
-    int blk_idx = 0;
-    for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
-      for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) {
-        const int offsetr = blk_row + r;
-        const int offsetc = blk_col + c;
-        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-        assert(blk_idx < 4);
-        select_tx_block(
-            cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize,
-            ta, tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd,
-            &this_cost_valid, ftxs_mode,
-            (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
-
-#if CONFIG_DIST_8X8
-        if (!x->using_dist_8x8)
-#endif
-          if (!this_cost_valid) goto LOOP_EXIT;
-#if CONFIG_DIST_8X8
-        if (x->using_dist_8x8 && tx_size == TX_8X8) {
-          sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block];
-        }
-#endif  // CONFIG_DIST_8X8
-        av1_merge_rd_stats(&split_rd_stats, &this_rd_stats);
-
-        tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist);
-#if CONFIG_DIST_8X8
-        if (!x->using_dist_8x8)
-#endif
-          if (no_split_rd < tmp_rd) {
-            this_cost_valid = 0;
-            goto LOOP_EXIT;
-          }
-        block += sub_step;
-      }
-    }
-
-  LOOP_EXIT : {}
-
-#if CONFIG_DIST_8X8
-    if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) {
-      const int src_stride = p->src.stride;
-      const int dst_stride = pd->dst.stride;
-
-      const uint8_t *src =
-          &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
-      const uint8_t *dst =
-          &pd->dst
-               .buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-
-      int64_t dist_8x8;
-      const int qindex = x->qindex;
-      const int pred_stride = block_size_wide[plane_bsize];
-      const int pred_idx = (blk_row * pred_stride + blk_col)
-                           << tx_size_wide_log2[0];
-      const int16_t *pred = &x->pred_luma[pred_idx];
-      int i, j;
-      int row, col;
-
-      uint8_t *pred8;
-      DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]);
-
-      dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
-                              BLOCK_8X8, 8, 8, 8, 8, qindex) *
-                 16;
-
-#ifdef DEBUG_DIST_8X8
-      if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
-        assert(sum_rd_stats.sse == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-
-      split_rd_stats.sse = dist_8x8;
-
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        pred8 = CONVERT_TO_BYTEPTR(pred8_16);
-      else
-        pred8 = (uint8_t *)pred8_16;
-
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        for (row = 0; row < 2; ++row) {
-          for (col = 0; col < 2; ++col) {
-            int idx = row * 2 + col;
-            int eob = sub8x8_eob[idx];
-
-            if (eob > 0) {
-              for (j = 0; j < 4; j++)
-                for (i = 0; i < 4; i++)
-                  CONVERT_TO_SHORTPTR(pred8)
-                  [(row * 4 + j) * 8 + 4 * col + i] =
-                      pred[(row * 4 + j) * pred_stride + 4 * col + i];
-            } else {
-              for (j = 0; j < 4; j++)
-                for (i = 0; i < 4; i++)
-                  CONVERT_TO_SHORTPTR(pred8)
-                  [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR(
-                      dst)[(row * 4 + j) * dst_stride + 4 * col + i];
-            }
-          }
-        }
-      } else {
-        for (row = 0; row < 2; ++row) {
-          for (col = 0; col < 2; ++col) {
-            int idx = row * 2 + col;
-            int eob = sub8x8_eob[idx];
-
-            if (eob > 0) {
-              for (j = 0; j < 4; j++)
-                for (i = 0; i < 4; i++)
-                  pred8[(row * 4 + j) * 8 + 4 * col + i] =
-                      (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i];
-            } else {
-              for (j = 0; j < 4; j++)
-                for (i = 0; i < 4; i++)
-                  pred8[(row * 4 + j) * 8 + 4 * col + i] =
-                      dst[(row * 4 + j) * dst_stride + 4 * col + i];
-            }
-          }
-        }
-      }
-      dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8,
-                              8, 8, 8, qindex) *
-                 16;
-
-#ifdef DEBUG_DIST_8X8
-      if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
-        assert(sum_rd_stats.dist == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-
-      split_rd_stats.dist = dist_8x8;
-      tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist);
-    }
-#endif  // CONFIG_DIST_8X8
-    if (this_cost_valid) split_rd = tmp_rd;
+    try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+                       plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
+                       AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
+                       rd_info_node, &split_rd_stats, &split_rd);
   }
 
 #if COLLECT_TX_SIZE_DATA
@@ -4626,9 +4755,11 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
   } while (0);
 #endif  // COLLECT_TX_SIZE_DATA
 
-  if (no_split_rd < split_rd) {
+  if (no_split.rd < split_rd) {
+    ENTROPY_CONTEXT *pta = ta + blk_col;
+    ENTROPY_CONTEXT *ptl = tl + blk_row;
     const TX_SIZE tx_size_selected = tx_size;
-    p->txb_entropy_ctx[block] = no_split_txb_entropy_ctx;
+    p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx;
     av1_set_txb_context(x, 0, block, tx_size_selected, pta, ptl);
     txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
                           tx_size);
@@ -4641,7 +4772,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
     }
     mbmi->tx_size = tx_size_selected;
     update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
-                     no_split_tx_type);
+                     no_split.tx_type);
     x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip;
   } else {
     *rd_stats = split_rd_stats;
@@ -4707,13 +4838,19 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
       }
     }
   }
-  int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse);
-  this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-  if (zero_rd < this_rd) {
-    this_rd = zero_rd;
-    rd_stats->rate = rd_stats->zero_rate;
+
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int s0 = x->skip_cost[skip_ctx][0];
+  const int s1 = x->skip_cost[skip_ctx][1];
+  int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+  this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+  if (skip_rd <= this_rd) {
+    this_rd = skip_rd;
+    rd_stats->rate = 0;
     rd_stats->dist = rd_stats->sse;
     rd_stats->skip = 1;
+  } else {
+    rd_stats->skip = 0;
   }
   if (this_rd > ref_best_rd) is_cost_valid = 0;
 
@@ -4921,11 +5058,15 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
       }
     }
   }
-  int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse);
-  this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-  if (zero_rd < this_rd) {
-    this_rd = zero_rd;
-    rd_stats->rate = rd_stats->zero_rate;
+
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int s0 = x->skip_cost[skip_ctx][0];
+  const int s1 = x->skip_cost[skip_ctx][1];
+  int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+  this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+  if (skip_rd < this_rd) {
+    this_rd = skip_rd;
+    rd_stats->rate = 0;
     rd_stats->dist = rd_stats->sse;
     rd_stats->skip = 1;
   }
@@ -5159,7 +5300,7 @@ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
   const MACROBLOCKD *xd = &x->e_mbd;
   const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
 
-  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
+  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, 1);
   const int64_t mse = *dist / bw / bh;
   // Normalized quantizer takes the transform upscaling factor (8 for tx size
   // smaller than 32) into account.
@@ -5215,23 +5356,7 @@ static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
   mbmi->tx_size = tx_size;
   memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4);
   rd_stats->skip = 1;
-
-  // Rate.
-  const int tx_size_ctx = get_txsize_entropy_ctx(tx_size);
-  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
-  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
-  av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
-  TXB_CTX txb_ctx;
-  // Because plane is 0, plane_bsize equal to bsize
-  get_txb_ctx(bsize, tx_size, 0, ctxa, ctxl, &txb_ctx);
-  int rate = x->coeff_costs[tx_size_ctx][PLANE_TYPE_Y]
-                 .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
-  if (tx_size > TX_4X4) {
-    int ctx = txfm_partition_context(
-        xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
-    rate += x->txfm_partition_cost[ctx][0];
-  }
-  rd_stats->rate = rate;
+  rd_stats->rate = 0;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
   rd_stats->dist = rd_stats->sse = (dist << 4);
@@ -5322,6 +5447,8 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
 
   rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
                                found_rd_info ? matched_rd_info : NULL);
+  assert(IMPLIES(this_rd_stats.skip && !this_rd_stats.invalid_rate,
+                 this_rd_stats.rate == 0));
 
   ref_best_rd = AOMMIN(rd, ref_best_rd);
   if (rd < best_rd) {
@@ -5455,6 +5582,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
       av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type));
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const BLOCK_SIZE bsize = mbmi->sb_type;
+  const SequenceHeader *const seq_params = &cpi->common.seq_params;
   int this_rate;
   int64_t this_rd;
   int colors_u, colors_v, colors;
@@ -5470,11 +5598,11 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
   mbmi->uv_mode = UV_DC_PRED;
 
   int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
-  if (cpi->common.use_highbitdepth) {
+  if (seq_params->use_highbitdepth) {
     colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
-                                       cpi->common.bit_depth, count_buf);
+                                       seq_params->bit_depth, count_buf);
     colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
-                                       cpi->common.bit_depth, count_buf);
+                                       seq_params->bit_depth, count_buf);
   } else {
     colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
     colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
@@ -5494,7 +5622,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
 
     uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
     uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
-    if (cpi->common.use_highbitdepth) {
+    if (seq_params->use_highbitdepth) {
       lb_u = src_u16[0];
       ub_u = src_u16[0];
       lb_v = src_v16[0];
@@ -5508,7 +5636,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
 
     for (r = 0; r < rows; ++r) {
       for (c = 0; c < cols; ++c) {
-        if (cpi->common.use_highbitdepth) {
+        if (seq_params->use_highbitdepth) {
           val_u = src_u16[r * src_stride + c];
           val_v = src_v16[r * src_stride + c];
           data[(r * cols + c) * 2] = val_u;
@@ -5557,9 +5685,9 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
       pmi->palette_size[1] = n;
       for (i = 1; i < 3; ++i) {
         for (j = 0; j < n; ++j) {
-          if (cpi->common.use_highbitdepth)
+          if (seq_params->use_highbitdepth)
             pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
-                (int)centroids[j * 2 + i - 1], cpi->common.bit_depth);
+                (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
           else
             pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
                 clip_pixel((int)centroids[j * 2 + i - 1]);
@@ -5907,8 +6035,9 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
     *mode_uv = UV_DC_PRED;
     return;
   }
-  xd->cfl.is_chroma_reference = is_chroma_reference(
-      mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
+  xd->cfl.is_chroma_reference =
+      is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+                          cm->seq_params.subsampling_y);
   bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
                              xd->plane[AOM_PLANE_U].subsampling_y);
   // Only store reconstructed luma when there's chroma RDO. When there's no
@@ -7038,7 +7167,9 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
 // Choose the best wedge index and sign
 static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
                           const BLOCK_SIZE bsize, const uint8_t *const p0,
-                          const uint8_t *const p1, int *const best_wedge_sign,
+                          const int16_t *const residual1,
+                          const int16_t *const diff10,
+                          int *const best_wedge_sign,
                           int *const best_wedge_index) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const src = &x->plane[0].src;
@@ -7056,34 +7187,22 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
   const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
 
-  DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
-
-  int64_t sign_limit;
-
+  DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]);  // src - pred0
   if (hbd) {
-    aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
-    aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
-    aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+    aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
                               CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
   } else {
-    aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
-    aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
-    aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+    aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
   }
 
-  sign_limit = ((int64_t)aom_sum_squares_i16(r0, N) -
-                (int64_t)aom_sum_squares_i16(r1, N)) *
-               (1 << WEDGE_WEIGHT_BITS) / 2;
-
+  int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
+                        (int64_t)aom_sum_squares_i16(residual1, N)) *
+                       (1 << WEDGE_WEIGHT_BITS) / 2;
+  int16_t *ds = residual0;
   if (N < 64)
-    av1_wedge_compute_delta_squares_c(ds, r0, r1, N);
+    av1_wedge_compute_delta_squares_c(ds, residual0, residual1, N);
   else
-    av1_wedge_compute_delta_squares(ds, r0, r1, N);
+    av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
 
   for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
     mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
@@ -7096,9 +7215,9 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
 
     mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
     if (N < 64)
-      sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
+      sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N);
     else
-      sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
+      sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
     sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
@@ -7117,12 +7236,15 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
 }
 
 // Choose the best wedge index the specified sign
-static int64_t pick_wedge_fixed_sign(
-    const AV1_COMP *const cpi, const MACROBLOCK *const x,
-    const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1,
-    const int wedge_sign, int *const best_wedge_index) {
+static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
+                                     const MACROBLOCK *const x,
+                                     const BLOCK_SIZE bsize,
+                                     const int16_t *const residual1,
+                                     const int16_t *const diff10,
+                                     const int wedge_sign,
+                                     int *const best_wedge_index) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const src = &x->plane[0].src;
+
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const int N = bw * bh;
@@ -7135,26 +7257,12 @@ static int64_t pick_wedge_fixed_sign(
   uint64_t sse;
   const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-
-  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
-
-  if (hbd) {
-    aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
-    aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
-                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
-  } else {
-    aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
-    aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
-  }
-
   for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
     mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
     if (N < 64)
-      sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
+      sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N);
     else
-      sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
+      sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
     sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
@@ -7166,16 +7274,14 @@ static int64_t pick_wedge_fixed_sign(
       best_rd = rd;
     }
   }
-
   return best_rd -
          RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
 }
 
-static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
-                                     MACROBLOCK *const x,
-                                     const BLOCK_SIZE bsize,
-                                     const uint8_t *const p0,
-                                     const uint8_t *const p1) {
+static int64_t pick_interinter_wedge(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize,
+    const uint8_t *const p0, const uint8_t *const p1,
+    const int16_t *const residual1, const int16_t *const diff10) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const int bw = block_size_wide[bsize];
@@ -7189,9 +7295,11 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
 
   if (cpi->sf.fast_wedge_sign_estimate) {
     wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
-    rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, wedge_sign, &wedge_index);
+    rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign,
+                               &wedge_index);
   } else {
-    rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index);
+    rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign,
+                    &wedge_index);
   }
 
   mbmi->interinter_comp.wedge_sign = wedge_sign;
@@ -7202,10 +7310,11 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
 static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
                                    MACROBLOCK *const x, const BLOCK_SIZE bsize,
                                    const uint8_t *const p0,
-                                   const uint8_t *const p1) {
+                                   const uint8_t *const p1,
+                                   const int16_t *const residual1,
+                                   const int16_t *const diff10) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  const struct buf_2d *const src = &x->plane[0].src;
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const int N = bw * bh;
@@ -7218,23 +7327,6 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
   DIFFWTD_MASK_TYPE best_mask_type = 0;
   const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-  DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
-
-  if (hbd) {
-    aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
-    aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
-    aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
-                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
-  } else {
-    aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
-    aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
-    aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
-  }
-
   // try each mask type and its inverse
   for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
     // build mask and inverse
@@ -7247,7 +7339,7 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
                                       bw, bh, bw);
 
     // compute rd for mask
-    sse = av1_wedge_sse_from_residuals(r1, d10, xd->seg_mask, N);
+    sse = av1_wedge_sse_from_residuals(residual1, diff10, xd->seg_mask, N);
     sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
@@ -7279,14 +7371,26 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
                                      const uint8_t *const p1) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-
-  int64_t rd;
-  int wedge_index = -1;
-
   assert(is_interintra_wedge_used(bsize));
   assert(cpi->common.seq_params.enable_interintra_compound);
 
-  rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index);
+  const struct buf_2d *const src = &x->plane[0].src;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]);  // src - pred1
+  DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]);     // pred1 - pred0
+  if (get_bitdepth_data_path_index(xd)) {
+    aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+    aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+  } else {
+    aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
+    aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
+  }
+  int wedge_index = -1;
+  int64_t rd =
+      pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0, &wedge_index);
 
   mbmi->interintra_wedge_sign = 0;
   mbmi->interintra_wedge_index = wedge_index;
@@ -7296,11 +7400,15 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
 static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     const BLOCK_SIZE bsize,
                                     const uint8_t *const p0,
-                                    const uint8_t *const p1) {
+                                    const uint8_t *const p1,
+                                    const int16_t *const residual1,
+                                    const int16_t *const diff10) {
   const COMPOUND_TYPE compound_type = x->e_mbd.mi[0]->interinter_comp.type;
   switch (compound_type) {
-    case COMPOUND_WEDGE: return pick_interinter_wedge(cpi, x, bsize, p0, p1);
-    case COMPOUND_DIFFWTD: return pick_interinter_seg(cpi, x, bsize, p0, p1);
+    case COMPOUND_WEDGE:
+      return pick_interinter_wedge(cpi, x, bsize, p0, p1, residual1, diff10);
+    case COMPOUND_DIFFWTD:
+      return pick_interinter_seg(cpi, x, bsize, p0, p1, residual1, diff10);
     default: assert(0); return 0;
   }
 }
@@ -7336,7 +7444,7 @@ static int64_t build_and_cost_compound_type(
     const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
     const BLOCK_SIZE bsize, const int this_mode, int *rs2, int rate_mv,
     BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1,
-    int *strides, int mi_row, int mi_col) {
+    int16_t *residual1, int16_t *diff10, int *strides, int mi_row, int mi_col) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -7348,7 +7456,8 @@ static int64_t build_and_cost_compound_type(
   int64_t tmp_skip_sse_sb;
   const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
 
-  best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1);
+  best_rd_cur =
+      pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10);
   *rs2 += get_interinter_compound_mask_rate(x, mbmi);
   best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
 
@@ -7357,6 +7466,7 @@ static int64_t build_and_cost_compound_type(
     *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize,
                                                      this_mode, mi_row, mi_col);
     av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+    av1_subtract_plane(x, bsize, 0);
     model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
                     &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
     rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
@@ -7367,7 +7477,6 @@ static int64_t build_and_cost_compound_type(
       av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
                                                preds1, strides);
     }
-    av1_subtract_plane(x, bsize, 0);
     rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
                              &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
     if (rd != INT64_MAX)
@@ -7377,7 +7486,6 @@ static int64_t build_and_cost_compound_type(
   } else {
     av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
                                              preds1, strides);
-    av1_subtract_plane(x, bsize, 0);
     rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
                              &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
     if (rd != INT64_MAX)
@@ -7393,11 +7501,11 @@ typedef struct {
   int above_pred_stride[MAX_MB_PLANE];
   uint8_t *left_pred_buf[MAX_MB_PLANE];
   int left_pred_stride[MAX_MB_PLANE];
-  int_mv *single_newmv;
+  int_mv (*single_newmv)[REF_FRAMES];
   // Pointer to array of motion vectors to use for each ref and their rates
   // Should point to first of 2 arrays in 2D array
-  int *single_newmv_rate;
-  int *single_newmv_valid;
+  int (*single_newmv_rate)[REF_FRAMES];
+  int (*single_newmv_valid)[REF_FRAMES];
   // Pointer to array of predicted rate-distortion
   // Should point to first of 2 arrays in 2D array
   int64_t (*modelled_rd)[REF_FRAMES];
@@ -7428,14 +7536,15 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
   const PREDICTION_MODE this_mode = mbmi->mode;
   const int refs[2] = { mbmi->ref_frame[0],
                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
+  const int ref_mv_idx = mbmi->ref_mv_idx;
   int i;
 
   (void)args;
 
   if (is_comp_pred) {
     if (this_mode == NEW_NEWMV) {
-      cur_mv[0].as_int = args->single_newmv[refs[0]].as_int;
-      cur_mv[1].as_int = args->single_newmv[refs[1]].as_int;
+      cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+      cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
 
       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
         joint_motion_search(cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, NULL,
@@ -7451,7 +7560,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
         }
       }
     } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
-      cur_mv[1].as_int = args->single_newmv[refs[1]].as_int;
+      cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
         compound_single_motion_search_interinter(
             cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1);
@@ -7464,7 +7573,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
       }
     } else {
       assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
-      cur_mv[0].as_int = args->single_newmv[refs[0]].as_int;
+      cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
         compound_single_motion_search_interinter(
             cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0);
@@ -7480,9 +7589,9 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
     single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv);
     if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
 
-    args->single_newmv[refs[0]] = x->best_mv;
-    args->single_newmv_rate[refs[0]] = *rate_mv;
-    args->single_newmv_valid[refs[0]] = 1;
+    args->single_newmv[ref_mv_idx][refs[0]] = x->best_mv;
+    args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
+    args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
 
     cur_mv[0].as_int = x->best_mv.as_int;
 
@@ -7508,12 +7617,25 @@ static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2],
   restore_dst_buf(xd, *dst_bufs[0], num_planes);
 }
 
+static INLINE int get_switchable_rate(MACROBLOCK *const x,
+                                      const InterpFilters filters,
+                                      const int ctx[2]) {
+  int inter_filter_cost;
+  const InterpFilter filter0 = av1_extract_interp_filter(filters, 0);
+  const InterpFilter filter1 = av1_extract_interp_filter(filters, 1);
+  inter_filter_cost = x->switchable_interp_costs[ctx[0]][filter0];
+  inter_filter_cost += x->switchable_interp_costs[ctx[1]][filter1];
+  return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+}
+
 // calculate the rdcost of given interpolation_filter
 static INLINE int64_t interpolation_filter_rd(
     MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
     int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
     int *const switchable_rate, int *const skip_txfm_sb,
-    int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx) {
+    int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx,
+    const int switchable_ctx[2], const int skip_pred, int *rate,
+    int64_t *dist) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -7523,23 +7645,136 @@ static INLINE int64_t interpolation_filter_rd(
 
   const InterpFilters last_best = mbmi->interp_filters;
   mbmi->interp_filters = filter_sets[filter_idx];
-  const int tmp_rs = av1_get_switchable_rate(cm, x, xd);
-  av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
-  model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist,
-                  &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL);
+  const int tmp_rs =
+      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+
+  if (!skip_pred) {
+    av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+    av1_subtract_plane(x, bsize, 0);
+#if DNN_BASED_RD_INTERP_FILTER
+    model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist,
+                             &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL);
+#else
+    model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, &tmp_skip_sb,
+                    &tmp_skip_sse, NULL, NULL, NULL);
+#endif
+    if (num_planes > 1) {
+      int64_t tmp_y_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+      if (tmp_y_rd > *rd) {
+        mbmi->interp_filters = last_best;
+        return 0;
+      }
+      int tmp_rate_uv, tmp_skip_sb_uv;
+      int64_t tmp_dist_uv, tmp_skip_sse_uv;
+      av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
+      for (int plane = 1; plane < num_planes; ++plane)
+        av1_subtract_plane(x, bsize, plane);
+#if DNN_BASED_RD_INTERP_FILTER
+      model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 1, num_planes - 1,
+                               &tmp_rate_uv, &tmp_dist_uv, &tmp_skip_sb_uv,
+                               &tmp_skip_sse_uv, NULL, NULL, NULL);
+#else
+      model_rd_for_sb(cpi, bsize, x, xd, 1, num_planes - 1, &tmp_rate_uv,
+                      &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL,
+                      NULL, NULL);
+#endif
+      tmp_rate += tmp_rate_uv;
+      tmp_skip_sb &= tmp_skip_sb_uv;
+      tmp_dist += tmp_dist_uv;
+      tmp_skip_sse += tmp_skip_sse_uv;
+    }
+  } else {
+    tmp_rate = *rate;
+    tmp_dist = *dist;
+  }
   int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
   if (tmp_rd < *rd) {
     *rd = tmp_rd;
     *switchable_rate = tmp_rs;
     *skip_txfm_sb = tmp_skip_sb;
     *skip_sse_sb = tmp_skip_sse;
-    swap_dst_buf(xd, dst_bufs, num_planes);
+    *rate = tmp_rate;
+    *dist = tmp_dist;
+    if (!skip_pred) {
+      swap_dst_buf(xd, dst_bufs, num_planes);
+    }
     return 1;
   }
   mbmi->interp_filters = last_best;
   return 0;
 }
 
+// Find the best rd filter in horizontal direction
+static INLINE int find_best_horiz_interp_filter_rd(
+    MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+    int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+    int *const switchable_rate, int *const skip_txfm_sb,
+    int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
+    const int switchable_ctx[2], const int skip_hor, int *rate, int64_t *dist,
+    int best_dual_mode) {
+  int i;
+  const int bw = block_size_wide[bsize];
+  assert(best_dual_mode == 0);
+  if ((bw <= 4) && (!skip_hor)) {
+    int skip_pred = 1;
+    // Process the filters in reverse order to enable reusing rate and
+    // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
+    for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
+      if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+                                  switchable_rate, skip_txfm_sb, skip_sse_sb,
+                                  dst_bufs, i, switchable_ctx, skip_pred, rate,
+                                  dist)) {
+        best_dual_mode = i;
+      }
+      skip_pred = 0;
+    }
+  } else {
+    for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
+      if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+                                  switchable_rate, skip_txfm_sb, skip_sse_sb,
+                                  dst_bufs, i, switchable_ctx, skip_hor, rate,
+                                  dist)) {
+        best_dual_mode = i;
+      }
+    }
+  }
+  return best_dual_mode;
+}
+
+// Find the best rd filter in vertical direction
+static INLINE void find_best_vert_interp_filter_rd(
+    MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+    int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+    int *const switchable_rate, int *const skip_txfm_sb,
+    int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
+    const int switchable_ctx[2], const int skip_ver, int *rate, int64_t *dist,
+    int best_dual_mode, int filter_set_size) {
+  int i;
+  const int bh = block_size_high[bsize];
+  if ((bh <= 4) && (!skip_ver)) {
+    int skip_pred = 1;
+    // Process the filters in reverse order to enable reusing rate and
+    // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
+    assert(filter_set_size == DUAL_FILTER_SET_SIZE);
+    for (i = (filter_set_size - SWITCHABLE_FILTERS + best_dual_mode);
+         i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) {
+      interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+                              switchable_rate, skip_txfm_sb, skip_sse_sb,
+                              dst_bufs, i, switchable_ctx, skip_pred, rate,
+                              dist);
+      skip_pred = 0;
+    }
+  } else {
+    for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
+         i += SWITCHABLE_FILTERS) {
+      interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+                              switchable_rate, skip_txfm_sb, skip_sse_sb,
+                              dst_bufs, i, switchable_ctx, skip_ver, rate,
+                              dist);
+    }
+  }
+}
+
 // check if there is saved result match with this search
 static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
                                          MB_MODE_INFO *const mi) {
@@ -7605,10 +7840,22 @@ static int64_t interpolation_filter_search(
   if (!need_search || match_found == -1) {
     set_default_interp_filters(mbmi, assign_filter);
   }
-  *switchable_rate = av1_get_switchable_rate(cm, x, xd);
+  int switchable_ctx[2];
+  switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
+  switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
+  *switchable_rate =
+      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
   av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+  for (int plane = 0; plane < num_planes; ++plane)
+    av1_subtract_plane(x, bsize, plane);
+#if DNN_BASED_RD_INTERP_FILTER
+  model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
+                           &tmp_dist, skip_txfm_sb, skip_sse_sb, NULL, NULL,
+                           NULL);
+#else
   model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist,
                   skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL);
+#endif  // DNN_BASED_RD_INTERP_FILTER
   *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist);
 
   if (assign_filter != SWITCHABLE || match_found != -1) {
@@ -7619,6 +7866,23 @@ static int64_t interpolation_filter_search(
            av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
     return 0;
   }
+  int skip_hor = 1;
+  int skip_ver = 1;
+  const int is_compound = has_second_ref(mbmi);
+  for (int k = 0; k < num_planes - 1; ++k) {
+    struct macroblockd_plane *const pd = &xd->plane[k];
+    const int bw = pd->width;
+    const int bh = pd->height;
+    for (int j = 0; j < 1 + is_compound; ++j) {
+      const MV mv = mbmi->mv[j].as_mv;
+      const MV mv_q4 = clamp_mv_to_umv_border_sb(
+          xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+      const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+      const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+      skip_hor &= (sub_x == 0);
+      skip_ver &= (sub_y == 0);
+    }
+  }
   // do interp_filter search
   const int filter_set_size = DUAL_FILTER_SET_SIZE;
   restore_dst_buf(xd, *tmp_dst, num_planes);
@@ -7629,20 +7893,16 @@ static int64_t interpolation_filter_search(
     int best_dual_mode = 0;
     // Find best of {R}x{R,Sm,Sh}
     // EIGHTTAP_REGULAR mode is calculated beforehand
-    for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
-      if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
-                                  switchable_rate, skip_txfm_sb, skip_sse_sb,
-                                  dst_bufs, i)) {
-        best_dual_mode = i;
-      }
-    }
+    best_dual_mode = find_best_horiz_interp_filter_rd(
+        x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+        skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_hor,
+        &tmp_rate, &tmp_dist, best_dual_mode);
+
     // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
-    for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
-         i += SWITCHABLE_FILTERS) {
-      interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
-                              switchable_rate, skip_txfm_sb, skip_sse_sb,
-                              dst_bufs, i);
-    }
+    find_best_vert_interp_filter_rd(
+        x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+        skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
+        &tmp_rate, &tmp_dist, best_dual_mode, filter_set_size);
   } else {
     // EIGHTTAP_REGULAR mode is calculated beforehand
     for (i = 1; i < filter_set_size; ++i) {
@@ -7653,7 +7913,8 @@ static int64_t interpolation_filter_search(
       }
       interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
                               switchable_rate, skip_txfm_sb, skip_sse_sb,
-                              dst_bufs, i);
+                              dst_bufs, i, switchable_ctx, 0, &tmp_rate,
+                              &tmp_dist);
     }
   }
   swap_dst_buf(xd, dst_bufs, num_planes);
@@ -7848,6 +8109,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
         av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
                                                   intrapred, bw);
         av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+        av1_subtract_plane(x, bsize, 0);
         model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
                         &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
         rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
@@ -7861,7 +8123,6 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
       av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
                                                 intrapred, bw);
       av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-      av1_subtract_plane(x, bsize, 0);
       rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
                                &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
       if (rd != INT64_MAX)
@@ -7908,6 +8169,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
             mbmi->mv[0].as_int = tmp_mv.as_int;
             av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
                                            bsize);
+            av1_subtract_plane(x, bsize, 0);
             model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
                             &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL,
                             NULL);
@@ -7925,7 +8187,6 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
             av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
           }
           // Evaluate closer to true rd
-          av1_subtract_plane(x, bsize, 0);
           rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
                                    &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
                                    INT64_MAX);
@@ -8323,6 +8584,148 @@ static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
   return cost;
 }
 
+static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,  // Tries each inter-inter compound type and keeps the best one in mbmi.
+                                   BLOCK_SIZE bsize, int mi_col, int mi_row,  // NOTE: mi_col is declared before mi_row here.
+                                   int_mv *cur_mv, int masked_compound_used,
+                                   BUFFER_SET *orig_dst, BUFFER_SET *tmp_dst,
+                                   int *rate_mv, int64_t *rd,  // *rd (out): best estimated RD; *rate_mv may be updated.
+                                   RD_STATS *rd_stats, int64_t ref_best_rd) {  // Returns the rate cost (rs2) recorded for the best type.
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int this_mode = mbmi->mode;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  int rate_sum, rs2;
+  int64_t dist_sum;
+
+  int_mv best_mv[2];
+  int best_tmp_rate_mv = *rate_mv;
+  int tmp_skip_txfm_sb;
+  int64_t tmp_skip_sse_sb;
+  INTERINTER_COMPOUND_DATA best_compound_data;
+  best_compound_data.type = COMPOUND_AVERAGE;
+  DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]);  // src - pred1
+  DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]);     // pred1 - pred0
+  uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE];  // copy of xd->seg_mask taken when a new best type is found
+  uint8_t *preds0[1] = { pred0 };
+  uint8_t *preds1[1] = { pred1 };
+  int strides[1] = { bw };
+  int tmp_rate_mv;
+  const int num_pix = 1 << num_pels_log2_lookup[bsize];
+  const int mask_len = 2 * num_pix * sizeof(uint8_t);
+  COMPOUND_TYPE cur_type;
+  int best_compmode_interinter_cost = 0;
+  int can_use_previous = cm->allow_warped_motion;
+
+  best_mv[0].as_int = cur_mv[0].as_int;
+  best_mv[1].as_int = cur_mv[1].as_int;
+  *rd = INT64_MAX;  // no compound type has been evaluated yet
+  if (masked_compound_used) {
+    // Get the inter predictors (pred0, pred1) used by the masked compound modes.
+    av1_build_inter_predictors_for_planes_single_buf(
+        xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous);
+    av1_build_inter_predictors_for_planes_single_buf(
+        xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
+    const struct buf_2d *const src = &x->plane[0].src;
+    if (get_bitdepth_data_path_index(xd)) {  // high-bitdepth subtract path
+      aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+                                CONVERT_TO_BYTEPTR(pred1), bw, xd->bd);
+      aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(pred1),
+                                bw, CONVERT_TO_BYTEPTR(pred0), bw, xd->bd);
+    } else {
+      aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, pred1,
+                         bw);
+      aom_subtract_block(bh, bw, diff10, bw, pred1, bw, pred0, bw);
+    }
+  }
+  const int orig_is_best = xd->plane[0].dst.buf == orig_dst->plane[0];  // is dst pointing at orig_dst on entry?
+  const BUFFER_SET *backup_buf = orig_is_best ? tmp_dst : orig_dst;
+  const BUFFER_SET *best_buf = orig_is_best ? orig_dst : tmp_dst;
+  for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
+    if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;  // only COMPOUND_AVERAGE without masked compound
+    if (!is_interinter_compound_used(cur_type, bsize)) continue;
+    tmp_rate_mv = *rate_mv;
+    int64_t best_rd_cur = INT64_MAX;
+    mbmi->interinter_comp.type = cur_type;
+    int masked_type_cost = 0;
+
+    const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+    const int comp_index_ctx = get_comp_index_context(cm, xd);
+    mbmi->compound_idx = 1;
+    if (cur_type == COMPOUND_AVERAGE) {
+      mbmi->comp_group_idx = 0;
+      if (masked_compound_used) {
+        masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0];
+      }
+      masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
+      rs2 = masked_type_cost;
+      // No need to call av1_build_inter_predictors_sby here
+      // 1. COMPOUND_AVERAGE is always the first candidate
+      // 2. av1_build_inter_predictors_sby has been called by
+      // interpolation_filter_search
+      int64_t est_rd =
+          estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                              &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+      // Point dst at the spare buffer for the compound types tried next.
+      restore_dst_buf(xd, *backup_buf, 1);
+      if (est_rd != INT64_MAX)
+        best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+    } else {
+      mbmi->comp_group_idx = 1;
+      masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
+      masked_type_cost += x->compound_type_cost[bsize][cur_type - 1];
+      rs2 = masked_type_cost;
+      if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+          *rd / 3 < ref_best_rd) {
+        best_rd_cur = build_and_cost_compound_type(
+            cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
+            &tmp_rate_mv, preds0, preds1, residual1, diff10, strides, mi_row,
+            mi_col);
+      }
+    }
+    if (best_rd_cur < *rd) {  // new best compound type
+      *rd = best_rd_cur;
+      best_compound_data = mbmi->interinter_comp;
+      if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) {  // the mask is not saved for the last type
+        memcpy(tmp_best_mask_buf, xd->seg_mask, mask_len);
+      }
+      best_compmode_interinter_cost = rs2;
+      if (have_newmv_in_inter_mode(this_mode)) {
+        if (use_masked_motion_search(cur_type)) {
+          best_tmp_rate_mv = tmp_rate_mv;
+          best_mv[0].as_int = mbmi->mv[0].as_int;
+          best_mv[1].as_int = mbmi->mv[1].as_int;
+        } else {
+          best_mv[0].as_int = cur_mv[0].as_int;
+          best_mv[1].as_int = cur_mv[1].as_int;
+        }
+      }
+    }
+    // Reset to the original mvs for the next iteration.
+    mbmi->mv[0].as_int = cur_mv[0].as_int;
+    mbmi->mv[1].as_int = cur_mv[1].as_int;
+  }
+  if (mbmi->interinter_comp.type != best_compound_data.type) {  // reinstate the best type and its saved mask
+    mbmi->comp_group_idx =
+        (best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1;
+    mbmi->interinter_comp = best_compound_data;
+    memcpy(xd->seg_mask, tmp_best_mask_buf, mask_len);
+  }
+  if (have_newmv_in_inter_mode(this_mode)) {
+    mbmi->mv[0].as_int = best_mv[0].as_int;
+    mbmi->mv[1].as_int = best_mv[1].as_int;
+    if (use_masked_motion_search(mbmi->interinter_comp.type)) {
+      rd_stats->rate += best_tmp_rate_mv - *rate_mv;  // replace *rate_mv with best_tmp_rate_mv in the total
+      *rate_mv = best_tmp_rate_mv;
+    }
+  }
+  restore_dst_buf(xd, *best_buf, 1);  // orig_dst if orig_is_best, else tmp_dst
+  return best_compmode_interinter_cost;
+}
+
 static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize, RD_STATS *rd_stats,
                                  RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
@@ -8344,63 +8747,24 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   int refs[2] = { mbmi->ref_frame[0],
                   (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int rate_mv = 0;
-  const int bw = block_size_wide[bsize];
   DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-  uint8_t *tmp_buf;
+  uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
   int64_t rd = INT64_MAX;
   BUFFER_SET orig_dst, tmp_dst;
 
   int skip_txfm_sb = 0;
   int64_t skip_sse_sb = INT64_MAX;
   int16_t mode_ctx;
-
-  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
-  mbmi->comp_group_idx = 0;
-  mbmi->compound_idx = 1;
-  if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
-
-  mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
-  else
-    tmp_buf = tmp_buf_;
-  // Make sure that we didn't leave the plane destination buffers set
-  // to tmp_buf at the end of the last iteration
-  assert(xd->plane[0].dst.buf != tmp_buf);
-
-  mbmi->num_proj_ref[0] = 0;
-  mbmi->num_proj_ref[1] = 0;
-
-  if (is_comp_pred) {
-    for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) {
-      const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred);
-      if (single_mode == NEWMV &&
-          args->single_newmv[mbmi->ref_frame[ref_idx]].as_int == INVALID_MV)
-        return INT64_MAX;
-    }
-  }
-
-  mbmi->motion_mode = SIMPLE_TRANSLATION;
   const int masked_compound_used = is_any_masked_compound_used(bsize) &&
                                    cm->seq_params.enable_masked_compound;
   int64_t ret_val = INT64_MAX;
   const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-  rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
-  rd_stats->rate +=
-      get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
-  const RD_STATS backup_rd_stats = *rd_stats;
-  const RD_STATS backup_rd_stats_y = *rd_stats_y;
-  const RD_STATS backup_rd_stats_uv = *rd_stats_uv;
-  const MB_MODE_INFO backup_mbmi = *mbmi;
-  INTERINTER_COMPOUND_DATA best_compound_data;
-  uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE];
   RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
   int64_t best_rd = INT64_MAX;
-  int64_t best_ret_val = INT64_MAX;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
   MB_MODE_INFO best_mbmi = *mbmi;
-  int64_t early_terminate = 0;
+  int best_disable_skip;
+  int best_xskip;
   int plane_rate[MAX_MB_PLANE] = { 0 };
   int64_t plane_sse[MAX_MB_PLANE] = { 0 };
   int64_t plane_dist[MAX_MB_PLANE] = { 0 };
@@ -8411,387 +8775,311 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   int comp_idx;
   const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp &
                               (mbmi->mode != GLOBAL_GLOBALMV);
-  // If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
-  for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
-    int rs = 0;
-    int compmode_interinter_cost = 0;
-    early_terminate = 0;
-    *rd_stats = backup_rd_stats;
-    *rd_stats_y = backup_rd_stats_y;
-    *rd_stats_uv = backup_rd_stats_uv;
-    *mbmi = backup_mbmi;
-    mbmi->compound_idx = comp_idx;
-
-    if (is_comp_pred && comp_idx == 0) {
-      mbmi->comp_group_idx = 0;
-      mbmi->compound_idx = 0;
 
-      const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
-      const int comp_index_ctx = get_comp_index_context(cm, xd);
-      if (masked_compound_used) {
-        compmode_interinter_cost +=
-            x->comp_group_idx_cost[comp_group_idx_ctx][0];
+  const int has_drl = (have_nearmv_in_inter_mode(mbmi->mode) &&
+                       mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
+                      ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
+                       mbmi_ext->ref_mv_count[ref_frame_type] > 1);
+
+  // TODO(jingning): This should be deprecated shortly.
+  const int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+  const int ref_set =
+      has_drl ? AOMMIN(MAX_REF_MV_SERCH,
+                       mbmi_ext->ref_mv_count[ref_frame_type] - idx_offset)
+              : 1;
+
+  for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+    if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) {
+      if (mbmi->ref_frame[0] == LAST2_FRAME ||
+          mbmi->ref_frame[0] == LAST3_FRAME ||
+          mbmi->ref_frame[1] == LAST2_FRAME ||
+          mbmi->ref_frame[1] == LAST3_FRAME) {
+        if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + idx_offset]
+                .weight < REF_CAT_LEVEL) {
+          continue;
+        }
       }
-      compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0];
     }
 
-    int_mv cur_mv[2];
-    if (!build_cur_mv(cur_mv, this_mode, cm, x)) {
-      early_terminate = INT64_MAX;
-      continue;
-    }
-    if (have_newmv_in_inter_mode(this_mode)) {
-      if (comp_idx == 0) {
-        cur_mv[0] = backup_mv[0];
-        cur_mv[1] = backup_mv[1];
-        rate_mv = backup_rate_mv;
-      }
+    av1_init_rd_stats(rd_stats);
 
-      // when jnt_comp_skip_mv_search flag is on, new mv will be searched once
-      if (!(search_jnt_comp && cpi->sf.jnt_comp_skip_mv_search &&
-            comp_idx == 0)) {
-        newmv_ret_val =
-            handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col, &rate_mv, args);
-
-        // Store cur_mv and rate_mv so that they can be restored in the next
-        // iteration of the loop
-        backup_mv[0] = cur_mv[0];
-        backup_mv[1] = cur_mv[1];
-        backup_rate_mv = rate_mv;
-      }
-
-      if (newmv_ret_val != 0) {
-        early_terminate = INT64_MAX;
-        continue;
-      } else {
-        rd_stats->rate += rate_mv;
-      }
-    }
-    for (i = 0; i < is_comp_pred + 1; ++i) {
-      mbmi->mv[i].as_int = cur_mv[i].as_int;
-    }
+    mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+    mbmi->comp_group_idx = 0;
+    mbmi->compound_idx = 1;
+    if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
 
-    // Initialise tmp_dst and orig_dst buffers to prevent "may be used
-    // uninitialized" warnings in GCC when the stream is monochrome.
-    memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane));
-    memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride));
-    memset(orig_dst.plane, 0, sizeof(tmp_dst.plane));
-    memset(orig_dst.stride, 0, sizeof(tmp_dst.stride));
+    mode_ctx =
+        av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
 
-    // do first prediction into the destination buffer. Do the next
-    // prediction into a temporary buffer. Then keep track of which one
-    // of these currently holds the best predictor, and use the other
-    // one for future predictions. In the end, copy from tmp_buf to
-    // dst if necessary.
-    for (i = 0; i < num_planes; i++) {
-      tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
-      tmp_dst.stride[i] = MAX_SB_SIZE;
-    }
-    for (i = 0; i < num_planes; i++) {
-      orig_dst.plane[i] = xd->plane[i].dst.buf;
-      orig_dst.stride[i] = xd->plane[i].dst.stride;
-    }
+    mbmi->num_proj_ref[0] = 0;
+    mbmi->num_proj_ref[1] = 0;
+    mbmi->motion_mode = SIMPLE_TRANSLATION;
+    mbmi->ref_mv_idx = ref_mv_idx;
 
-    const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
-#if USE_DISCOUNT_NEWMV_TEST
-    // We don't include the cost of the second reference here, because there
-    // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
-    // words if you present them in that order, the second one is always known
-    // if the first is known.
-    //
-    // Under some circumstances we discount the cost of new mv mode to encourage
-    // initiation of a motion field.
-    if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
-      // discount_newmv_test only applies discount on NEWMV mode.
-      assert(this_mode == NEWMV);
-      rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx),
-                               cost_mv_ref(x, NEARESTMV, mode_ctx));
-    } else {
-      rd_stats->rate += ref_mv_cost;
+    if (is_comp_pred) {
+      for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) {
+        const int single_mode =
+            get_single_mode(this_mode, ref_idx, is_comp_pred);
+        if (single_mode == NEWMV &&
+            args->single_newmv[mbmi->ref_mv_idx][mbmi->ref_frame[ref_idx]]
+                    .as_int == INVALID_MV)
+          continue;
+      }
     }
-#else
-    rd_stats->rate += ref_mv_cost;
-#endif
 
-    if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
-        mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
-      early_terminate = INT64_MAX;
-      continue;
-    }
+    rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
+    rd_stats->rate +=
+        get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
 
-    ret_val = interpolation_filter_search(
-        x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter,
-        &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
-    if (ret_val != 0) {
-      early_terminate = INT64_MAX;
-      restore_dst_buf(xd, orig_dst, num_planes);
-      continue;
-    } else if (cpi->sf.model_based_post_interp_filter_breakout &&
-               ref_best_rd != INT64_MAX && (rd / 6) > ref_best_rd) {
-      early_terminate = INT64_MAX;
-      restore_dst_buf(xd, orig_dst, num_planes);
-      if ((rd >> 4) > ref_best_rd) break;
-      continue;
-    }
+    const RD_STATS backup_rd_stats = *rd_stats;
+    const MB_MODE_INFO backup_mbmi = *mbmi;
+    int64_t best_rd2 = INT64_MAX;
 
-    if (is_comp_pred && comp_idx) {
-      int rate_sum, rs2;
-      int64_t dist_sum;
-      int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX;
-      int_mv best_mv[2];
-      int best_tmp_rate_mv = rate_mv;
-      int tmp_skip_txfm_sb;
-      int64_t tmp_skip_sse_sb;
-      DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]);
-      DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]);
-      uint8_t *preds0[1] = { pred0 };
-      uint8_t *preds1[1] = { pred1 };
-      int strides[1] = { bw };
-      int tmp_rate_mv;
-      const int num_pix = 1 << num_pels_log2_lookup[bsize];
-      COMPOUND_TYPE cur_type;
-      int best_compmode_interinter_cost = 0;
-      int can_use_previous = cm->allow_warped_motion;
-
-      best_mv[0].as_int = cur_mv[0].as_int;
-      best_mv[1].as_int = cur_mv[1].as_int;
+    // If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
+    for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
+      int rs = 0;
+      int compmode_interinter_cost = 0;
+      *rd_stats = backup_rd_stats;
+      *mbmi = backup_mbmi;
+      mbmi->compound_idx = comp_idx;
 
-      if (masked_compound_used) {
-        // get inter predictors to use for masked compound modes
-        av1_build_inter_predictors_for_planes_single_buf(
-            xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides,
-            can_use_previous);
-        av1_build_inter_predictors_for_planes_single_buf(
-            xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides,
-            can_use_previous);
-      }
-
-      int best_comp_group_idx = 0;
-      int best_compound_idx = 1;
-      for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
-        if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
-        if (!is_interinter_compound_used(cur_type, bsize)) continue;
-        tmp_rate_mv = rate_mv;
-        best_rd_cur = INT64_MAX;
-        mbmi->interinter_comp.type = cur_type;
-        int masked_type_cost = 0;
+      if (is_comp_pred && comp_idx == 0) {
+        mbmi->comp_group_idx = 0;
+        mbmi->compound_idx = 0;
 
         const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
         const int comp_index_ctx = get_comp_index_context(cm, xd);
         if (masked_compound_used) {
-          if (cur_type == COMPOUND_AVERAGE) {
-            mbmi->comp_group_idx = 0;
-            mbmi->compound_idx = 1;
-
-            masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0];
-            masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
-          } else {
-            mbmi->comp_group_idx = 1;
-            mbmi->compound_idx = 1;
-
-            masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
-            masked_type_cost +=
-                x->compound_type_cost[bsize][mbmi->interinter_comp.type - 1];
-          }
-        } else {
-          mbmi->comp_group_idx = 0;
-          mbmi->compound_idx = 1;
-
-          masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
+          compmode_interinter_cost +=
+              x->comp_group_idx_cost[comp_group_idx_ctx][0];
         }
-        rs2 = masked_type_cost;
+        compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0];
+      }
 
-        switch (cur_type) {
-          case COMPOUND_AVERAGE:
-            av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst,
-                                           bsize);
-            av1_subtract_plane(x, bsize, 0);
-            rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                                     &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
-                                     INT64_MAX);
-            if (rd != INT64_MAX)
-              best_rd_cur =
-                  RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum);
-            break;
-          case COMPOUND_WEDGE:
-            if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
-                best_rd_compound / 3 < ref_best_rd) {
-              best_rd_cur = build_and_cost_compound_type(
-                  cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst,
-                  &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
-            }
-            break;
-          case COMPOUND_DIFFWTD:
-            if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
-                best_rd_compound / 3 < ref_best_rd) {
-              best_rd_cur = build_and_cost_compound_type(
-                  cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst,
-                  &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
-            }
-            break;
-          default: assert(0); return INT64_MAX;
+      int_mv cur_mv[2];
+      if (!build_cur_mv(cur_mv, this_mode, cm, x)) {
+        continue;
+      }
+      if (have_newmv_in_inter_mode(this_mode)) {
+        if (comp_idx == 0) {
+          cur_mv[0] = backup_mv[0];
+          cur_mv[1] = backup_mv[1];
+          rate_mv = backup_rate_mv;
         }
 
-        if (best_rd_cur < best_rd_compound) {
-          best_comp_group_idx = mbmi->comp_group_idx;
-          best_compound_idx = mbmi->compound_idx;
-          best_rd_compound = best_rd_cur;
-          best_compound_data = mbmi->interinter_comp;
-          memcpy(tmp_best_mask_buf, xd->seg_mask,
-                 2 * num_pix * sizeof(uint8_t));
-          best_compmode_interinter_cost = rs2;
-          if (have_newmv_in_inter_mode(this_mode)) {
-            if (use_masked_motion_search(cur_type)) {
-              best_tmp_rate_mv = tmp_rate_mv;
-              best_mv[0].as_int = mbmi->mv[0].as_int;
-              best_mv[1].as_int = mbmi->mv[1].as_int;
-            } else {
-              best_mv[0].as_int = cur_mv[0].as_int;
-              best_mv[1].as_int = cur_mv[1].as_int;
-            }
-          }
+        // when jnt_comp_skip_mv_search flag is on, new mv will be searched once
+        if (!(search_jnt_comp && cpi->sf.jnt_comp_skip_mv_search &&
+              comp_idx == 0)) {
+          newmv_ret_val = handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col,
+                                       &rate_mv, args);
+
+          // Store cur_mv and rate_mv so that they can be restored in the next
+          // iteration of the loop
+          backup_mv[0] = cur_mv[0];
+          backup_mv[1] = cur_mv[1];
+          backup_rate_mv = rate_mv;
         }
-        // reset to original mvs for next iteration
-        mbmi->mv[0].as_int = cur_mv[0].as_int;
-        mbmi->mv[1].as_int = cur_mv[1].as_int;
-      }
-      mbmi->comp_group_idx = best_comp_group_idx;
-      mbmi->compound_idx = best_compound_idx;
-      mbmi->interinter_comp = best_compound_data;
-      assert(IMPLIES(mbmi->comp_group_idx == 1,
-                     mbmi->interinter_comp.type != COMPOUND_AVERAGE));
-      memcpy(xd->seg_mask, tmp_best_mask_buf, 2 * num_pix * sizeof(uint8_t));
-      if (have_newmv_in_inter_mode(this_mode)) {
-        mbmi->mv[0].as_int = best_mv[0].as_int;
-        mbmi->mv[1].as_int = best_mv[1].as_int;
-        if (use_masked_motion_search(mbmi->interinter_comp.type)) {
-          rd_stats->rate += best_tmp_rate_mv - rate_mv;
-          rate_mv = best_tmp_rate_mv;
+
+        if (newmv_ret_val != 0) {
+          continue;
+        } else {
+          rd_stats->rate += rate_mv;
         }
       }
+      for (i = 0; i < is_comp_pred + 1; ++i) {
+        mbmi->mv[i].as_int = cur_mv[i].as_int;
+      }
 
-      if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
-        restore_dst_buf(xd, orig_dst, num_planes);
-        early_terminate = INT64_MAX;
+      // Initialise tmp_dst and orig_dst buffers to prevent "may be used
+      // uninitialized" warnings in GCC when the stream is monochrome.
+      memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane));
+      memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride));
+      memset(orig_dst.plane, 0, sizeof(tmp_dst.plane));
+      memset(orig_dst.stride, 0, sizeof(tmp_dst.stride));
+
+      // do first prediction into the destination buffer. Do the next
+      // prediction into a temporary buffer. Then keep track of which one
+      // of these currently holds the best predictor, and use the other
+      // one for future predictions. In the end, copy from tmp_buf to
+      // dst if necessary.
+      for (i = 0; i < num_planes; i++) {
+        tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
+        tmp_dst.stride[i] = MAX_SB_SIZE;
+      }
+      for (i = 0; i < num_planes; i++) {
+        orig_dst.plane[i] = xd->plane[i].dst.buf;
+        orig_dst.stride[i] = xd->plane[i].dst.stride;
+      }
+
+      const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
+#if USE_DISCOUNT_NEWMV_TEST
+      // We don't include the cost of the second reference here, because there
+      // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in
+      // other words if you present them in that order, the second one is always
+      // known if the first is known.
+      //
+      // Under some circumstances we discount the cost of new mv mode to
+      // encourage initiation of a motion field.
+      if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
+        // discount_newmv_test only applies discount on NEWMV mode.
+        assert(this_mode == NEWMV);
+        rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx),
+                                 cost_mv_ref(x, NEARESTMV, mode_ctx));
+      } else {
+        rd_stats->rate += ref_mv_cost;
+      }
+#else
+      rd_stats->rate += ref_mv_cost;
+#endif
+
+      if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+          mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
         continue;
       }
-      compmode_interinter_cost = best_compmode_interinter_cost;
-    }
 
-    if (is_comp_pred) {
-      int tmp_rate;
-      int64_t tmp_dist;
-      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize);
-      model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
-                      &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate,
-                      plane_sse, plane_dist);
-      rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist);
-    }
-
-    if (search_jnt_comp) {
-      // if 1/2 model rd is larger than best_rd in jnt_comp mode,
-      // use jnt_comp mode, save additional search
-      if ((rd >> 1) > best_rd) {
+      ret_val = interpolation_filter_search(
+          x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
+          args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
+      if (ret_val != 0) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      } else if (cpi->sf.model_based_post_interp_filter_breakout &&
+                 ref_best_rd != INT64_MAX && (rd / 6 > ref_best_rd)) {
         restore_dst_buf(xd, orig_dst, num_planes);
+        if ((rd >> 4) > ref_best_rd) break;
         continue;
       }
-    }
 
-    if (!is_comp_pred)
-      args->single_filter[this_mode][refs[0]] =
-          av1_extract_interp_filter(mbmi->interp_filters, 0);
+      if (is_comp_pred && comp_idx) {
+        int64_t best_rd_compound;
+        compmode_interinter_cost = compound_type_rd(
+            cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used,
+            &orig_dst, &tmp_dst, &rate_mv, &best_rd_compound, rd_stats,
+            ref_best_rd);
+        if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
+          restore_dst_buf(xd, orig_dst, num_planes);
+          continue;
+        }
+        if (mbmi->interinter_comp.type != COMPOUND_AVERAGE) {
+          int tmp_rate;
+          int64_t tmp_dist;
+          av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst,
+                                        bsize);
+          for (int plane = 0; plane < num_planes; ++plane)
+            av1_subtract_plane(x, bsize, plane);
+          model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
+                          &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate,
+                          plane_sse, plane_dist);
+          rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist);
+        }
+      }
 
-    if (args->modelled_rd != NULL) {
-      if (is_comp_pred) {
-        const int mode0 = compound_ref0_mode(this_mode);
-        const int mode1 = compound_ref1_mode(this_mode);
-        const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
-                                   args->modelled_rd[mode1][refs[1]]);
-        if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+      if (search_jnt_comp) {
+        // When searching jnt_comp: if half of the modelled rd already exceeds
+        // best_rd, skip the remaining search for this comp_idx.
+        if ((rd >> 1) > best_rd) {
           restore_dst_buf(xd, orig_dst, num_planes);
-          early_terminate = INT64_MAX;
           continue;
         }
-      } else {
-        args->modelled_rd[this_mode][refs[0]] = rd;
       }
-    }
 
-    if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
-      // if current pred_error modeled rd is substantially more than the best
-      // so far, do not bother doing full rd
-      if (rd / 2 > ref_best_rd) {
-        restore_dst_buf(xd, orig_dst, num_planes);
-        early_terminate = INT64_MAX;
-        continue;
+      if (!is_comp_pred)
+        args->single_filter[this_mode][refs[0]] =
+            av1_extract_interp_filter(mbmi->interp_filters, 0);
+
+      if (args->modelled_rd != NULL) {
+        if (is_comp_pred) {
+          const int mode0 = compound_ref0_mode(this_mode);
+          const int mode1 = compound_ref1_mode(this_mode);
+          const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
+                                     args->modelled_rd[mode1][refs[1]]);
+          if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+            restore_dst_buf(xd, orig_dst, num_planes);
+            continue;
+          }
+        } else {
+          args->modelled_rd[this_mode][refs[0]] = rd;
+        }
       }
-    }
 
-    rd_stats->rate += compmode_interinter_cost;
+      if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+        // if current pred_error modeled rd is substantially more than the best
+        // so far, do not bother doing full rd
+        if (rd / 2 > ref_best_rd) {
+          restore_dst_buf(xd, orig_dst, num_planes);
+          continue;
+        }
+      }
 
-    if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
-      // TODO(chengchen): this speed feature introduces big loss.
-      // Need better estimation of rate distortion.
-      rd_stats->rate += rs;
-      rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2];
-      rd_stats_y->rate = plane_rate[0];
-      rd_stats_uv->rate = plane_rate[1] + plane_rate[2];
-      rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2];
-      rd_stats_y->sse = plane_sse[0];
-      rd_stats_uv->sse = plane_sse[1] + plane_sse[2];
-      rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2];
-      rd_stats_y->dist = plane_dist[0];
-      rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
-    } else {
+      rd_stats->rate += compmode_interinter_cost;
+
+      if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
+        // TODO(chengchen): this speed feature introduces big loss.
+        // Need better estimation of rate distortion.
+        rd_stats->rate += rs;
+        rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2];
+        rd_stats_y->rate = plane_rate[0];
+        rd_stats_uv->rate = plane_rate[1] + plane_rate[2];
+        rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2];
+        rd_stats_y->sse = plane_sse[0];
+        rd_stats_uv->sse = plane_sse[1] + plane_sse[2];
+        rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2];
+        rd_stats_y->dist = plane_dist[0];
+        rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
+      } else {
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
-      ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
-                               disable_skip, mi_row, mi_col, args, ref_best_rd,
-                               refs, rate_mv, &orig_dst, best_est_rd);
+        ret_val =
+            motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
+                           disable_skip, mi_row, mi_col, args, ref_best_rd,
+                           refs, rate_mv, &orig_dst, best_est_rd);
 #else
-      ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
-                               disable_skip, mi_row, mi_col, args, ref_best_rd,
-                               refs, rate_mv, &orig_dst);
+        ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y,
+                                 rd_stats_uv, disable_skip, mi_row, mi_col,
+                                 args, ref_best_rd, refs, rate_mv, &orig_dst);
 #endif
-    }
-    if (ret_val != INT64_MAX) {
-      if (search_jnt_comp) {
+      }
+      if (ret_val != INT64_MAX) {
         int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
         if (tmp_rd < best_rd) {
           best_rd_stats = *rd_stats;
           best_rd_stats_y = *rd_stats_y;
           best_rd_stats_uv = *rd_stats_uv;
-          best_ret_val = ret_val;
           best_rd = tmp_rd;
           best_mbmi = *mbmi;
+          best_disable_skip = *disable_skip;
+          best_xskip = x->skip;
           memcpy(best_blk_skip, x->blk_skip,
                  sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
         }
+
+        if (tmp_rd < best_rd2) {
+          best_rd2 = tmp_rd;
+        }
+
         if (tmp_rd < ref_best_rd) {
           ref_best_rd = tmp_rd;
         }
       }
-    }
-    if (!search_jnt_comp && ret_val != 0) {
       restore_dst_buf(xd, orig_dst, num_planes);
-      return ret_val;
     }
-    restore_dst_buf(xd, orig_dst, num_planes);
+
+    args->modelled_rd = NULL;
   }
 
+  if (best_rd == INT64_MAX) return INT64_MAX;
+
   // re-instate status of the best choice
-  if (is_comp_pred && best_ret_val != INT64_MAX) {
-    *rd_stats = best_rd_stats;
-    *rd_stats_y = best_rd_stats_y;
-    *rd_stats_uv = best_rd_stats_uv;
-    ret_val = best_ret_val;
-    *mbmi = best_mbmi;
-    assert(IMPLIES(mbmi->comp_group_idx == 1,
-                   mbmi->interinter_comp.type != COMPOUND_AVERAGE));
-    memcpy(x->blk_skip, best_blk_skip,
-           sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
-  }
-  if (early_terminate == INT64_MAX) return INT64_MAX;
-  if (ret_val != 0) return ret_val;
+  *rd_stats = best_rd_stats;
+  *rd_stats_y = best_rd_stats_y;
+  *rd_stats_uv = best_rd_stats_uv;
+  *mbmi = best_mbmi;
+  *disable_skip = best_disable_skip;
+  x->skip = best_xskip;
+  assert(IMPLIES(mbmi->comp_group_idx == 1,
+                 mbmi->interinter_comp.type != COMPOUND_AVERAGE));
+  memcpy(x->blk_skip, best_blk_skip,
+         sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
+
   return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
 }
 
@@ -8822,6 +9110,13 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
   av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv,
                                    0);
 
+  if (nearestmv.as_int == INVALID_MV) {
+    nearestmv.as_int = 0;
+  }
+  if (nearmv.as_int == INVALID_MV) {
+    nearmv.as_int = 0;
+  }
+
   int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
   if (dv_ref.as_int == 0)
     av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row, mi_col);
@@ -9013,8 +9308,9 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
   if (intra_yrd < best_rd) {
     // Only store reconstructed luma when there's chroma RDO. When there's no
     // chroma RDO, the reconstructed luma will be stored in encode_superblock().
-    xd->cfl.is_chroma_reference = is_chroma_reference(
-        mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
+    xd->cfl.is_chroma_reference =
+        is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+                            cm->seq_params.subsampling_y);
     xd->cfl.store_y = store_cfl_required_rdo(cm, x);
     if (xd->cfl.store_y) {
       // Restore reconstructed luma values.
@@ -9081,7 +9377,7 @@ static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
 
   for (r = 0; r < rows; ++r) {
     for (c = 0; c < cols; ++c) {
-      if (cpi->common.use_highbitdepth) {
+      if (cpi->common.seq_params.use_highbitdepth) {
         data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
         data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
       } else {
@@ -9760,6 +10056,8 @@ static int inter_mode_search_order_independent_skip(
   if (comp_pred) {
     if (!cpi->allow_comp_inter_inter) return 1;
 
+    if (cm->reference_mode == SINGLE_REFERENCE) return 1;
+
     // Skip compound inter modes if ARF is not available.
     if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]])) return 1;
 
@@ -9857,7 +10155,7 @@ static int handle_intra_mode(InterModeSearchState *search_state,
       av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
   const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
   const int intra_cost_penalty = av1_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+      cm->base_qindex, cm->y_dc_delta_q, cm->seq_params.bit_depth);
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
   const int num_planes = av1_num_planes(cm);
@@ -10050,7 +10348,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
   const int try_palette =
       av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const struct segmentation *const seg = &cm->seg;
   PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
@@ -10097,7 +10394,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     int64_t distortion2 = 0;
     int skippable = 0;
     int this_skip2 = 0;
-    uint8_t ref_frame_type;
 
     this_mode = av1_mode_order[mode_index].mode;
     ref_frame = av1_mode_order[mode_index].ref_frame[0];
@@ -10195,7 +10491,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       mbmi->angle_delta[PLANE_TYPE_UV] = 0;
       mbmi->filter_intra_mode_info.use_filter_intra = 0;
       mbmi->ref_mv_idx = 0;
-      ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
       int64_t ref_best_rd = search_state.best_rd;
       {
         RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
@@ -10203,9 +10498,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         rd_stats.rate = rate2;
 
         // Point to variables that are maintained between loop iterations
-        args.single_newmv = search_state.single_newmv[0];
-        args.single_newmv_rate = search_state.single_newmv_rate[0];
-        args.single_newmv_valid = search_state.single_newmv_valid[0];
+        args.single_newmv = search_state.single_newmv;
+        args.single_newmv_rate = search_state.single_newmv_rate;
+        args.single_newmv_valid = search_state.single_newmv_valid;
         args.modelled_rd = search_state.modelled_rd;
         args.single_comp_cost = real_compmode_cost;
         args.ref_frame_cost = ref_frame_cost;
@@ -10218,10 +10513,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
                                     &rd_stats_uv, &disable_skip, mi_row, mi_col,
                                     &args, ref_best_rd);
 #endif
-        if (this_rd < ref_best_rd) {
-          ref_best_rd = this_rd;
-        }
-
         rate2 = rd_stats.rate;
         skippable = rd_stats.skip;
         distortion2 = rd_stats.dist;
@@ -10229,108 +10520,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         rate_uv = rd_stats_uv.rate;
       }
 
-      // TODO(jingning): This needs some refactoring to improve code quality
-      // and reduce redundant steps.
-      if ((have_nearmv_in_inter_mode(mbmi->mode) &&
-           mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
-          ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
-           mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
-        MB_MODE_INFO backup_mbmi = *mbmi;
-        int backup_skip = x->skip;
-        int64_t tmp_ref_rd = this_rd;
-        int ref_idx;
-
-        // TODO(jingning): This should be deprecated shortly.
-        int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
-        int ref_set =
-            AOMMIN(MAX_REF_MV_SERCH - 1,
-                   mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
-        memcpy(x->blk_skip_drl, x->blk_skip,
-               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-
-        for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) {
-          int64_t tmp_alt_rd = INT64_MAX;
-          int dummy_disable_skip = 0;
-          int_mv cur_mv;
-          RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv;
-
-          av1_invalid_rd_stats(&tmp_rd_stats);
-
-          x->skip = 0;
-
-          mbmi->ref_mv_idx = 1 + ref_idx;
-
-          if (cpi->sf.reduce_inter_modes) {
-            if (mbmi->ref_frame[0] == LAST2_FRAME ||
-                mbmi->ref_frame[0] == LAST3_FRAME ||
-                mbmi->ref_frame[1] == LAST2_FRAME ||
-                mbmi->ref_frame[1] == LAST3_FRAME) {
-              if (mbmi_ext
-                      ->ref_mv_stack[ref_frame_type]
-                                    [mbmi->ref_mv_idx + idx_offset]
-                      .weight < REF_CAT_LEVEL) {
-                *mbmi = backup_mbmi;
-                x->skip = backup_skip;
-                continue;
-              }
-            }
-          }
-
-          cur_mv =
-              mbmi_ext->ref_mv_stack[ref_frame][mbmi->ref_mv_idx + idx_offset]
-                  .this_mv;
-          clamp_mv2(&cur_mv.as_mv, xd);
-
-          if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) {
-            av1_init_rd_stats(&tmp_rd_stats);
-
-            args.modelled_rd = NULL;
-            args.single_newmv = search_state.single_newmv[mbmi->ref_mv_idx];
-            args.single_newmv_rate =
-                search_state.single_newmv_rate[mbmi->ref_mv_idx];
-            args.single_newmv_valid =
-                search_state.single_newmv_valid[mbmi->ref_mv_idx];
-            args.single_comp_cost = real_compmode_cost;
-            args.ref_frame_cost = ref_frame_cost;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-            tmp_alt_rd =
-                handle_inter_mode(cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y,
-                                  &tmp_rd_stats_uv, &dummy_disable_skip, mi_row,
-                                  mi_col, &args, ref_best_rd, &best_est_rd);
-#else
-            tmp_alt_rd = handle_inter_mode(
-                cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv,
-                &dummy_disable_skip, mi_row, mi_col, &args, ref_best_rd);
-#endif
-
-            // Prevent pointers from escaping local scope
-            args.single_newmv = search_state.single_newmv[0];
-            args.single_newmv_rate = search_state.single_newmv_rate[0];
-            args.single_newmv_valid = search_state.single_newmv_valid[0];
-          }
-
-          if (tmp_ref_rd > tmp_alt_rd) {
-            rate2 = tmp_rd_stats.rate;
-            disable_skip = dummy_disable_skip;
-            distortion2 = tmp_rd_stats.dist;
-            skippable = tmp_rd_stats.skip;
-            rate_y = tmp_rd_stats_y.rate;
-            rate_uv = tmp_rd_stats_uv.rate;
-            this_rd = tmp_alt_rd;
-            tmp_ref_rd = tmp_alt_rd;
-            backup_mbmi = *mbmi;
-            backup_skip = x->skip;
-            memcpy(x->blk_skip_drl, x->blk_skip,
-                   sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-          } else {
-            *mbmi = backup_mbmi;
-            x->skip = backup_skip;
-          }
-        }
-
-        memcpy(x->blk_skip, x->blk_skip_drl,
-               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-      }
       if (this_rd == INT64_MAX) continue;
 
       this_skip2 = mbmi->skip;
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
index 1fa3d68ce..12df472c1 100644
--- a/third_party/aom/av1/encoder/rdopt.h
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -78,8 +78,8 @@ static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
 }
 
 static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x,
-                                  int plane, int blk_row, int blk_col,
-                                  int block, TX_SIZE tx_size,
+                                  int plane, int block, TX_SIZE tx_size,
+                                  const TX_TYPE tx_type,
                                   const TXB_CTX *const txb_ctx,
                                   int use_fast_coef_costing) {
 #if TXCOEFF_COST_TIMER
@@ -87,8 +87,8 @@ static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x,
   aom_usec_timer_start(&timer);
 #endif
   (void)use_fast_coef_costing;
-  const int cost = av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block,
-                                       tx_size, txb_ctx);
+  const int cost =
+      av1_cost_coeffs_txb(cm, x, plane, block, tx_size, tx_type, txb_ctx);
 #if TXCOEFF_COST_TIMER
   AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common;
   aom_usec_timer_mark(&timer);
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
index 49740817c..d4b4b19c4 100644
--- a/third_party/aom/av1/encoder/speed_features.c
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -89,9 +89,27 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
                                                        SPEED_FEATURES *sf,
                                                        int speed) {
   AV1_COMMON *const cm = &cpi->common;
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+
+  if (is_480p_or_larger) {
+    sf->use_square_partition_only_threshold = BLOCK_128X128;
+  } else {
+    sf->use_square_partition_only_threshold = BLOCK_64X64;
+  }
+
+  if (speed >= 1) {
+    if (is_720p_or_larger) {
+      sf->use_square_partition_only_threshold = BLOCK_128X128;
+    } else if (is_480p_or_larger) {
+      sf->use_square_partition_only_threshold = BLOCK_64X64;
+    } else {
+      sf->use_square_partition_only_threshold = BLOCK_32X32;
+    }
+  }
 
   if (speed >= 2) {
-    if (AOMMIN(cm->width, cm->height) >= 720) {
+    if (is_720p_or_larger) {
       sf->disable_split_mask =
           cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
       sf->adaptive_pred_interp_filter = 0;
@@ -106,7 +124,7 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
   }
 
   if (speed >= 3) {
-    if (AOMMIN(cm->width, cm->height) >= 720) {
+    if (is_720p_or_larger) {
       sf->disable_split_mask = DISABLE_ALL_SPLIT;
       sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
       sf->partition_search_breakout_dist_thr = (1 << 25);
@@ -130,7 +148,7 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
   }
 
   if (speed >= 4) {
-    if (AOMMIN(cm->width, cm->height) >= 720) {
+    if (is_720p_or_larger) {
       sf->partition_search_breakout_dist_thr = (1 << 26);
     } else {
       sf->partition_search_breakout_dist_thr = (1 << 24);
@@ -149,6 +167,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
   sf->reduce_inter_modes = 1;
   sf->prune_ext_partition_types_search_level = 1;
   sf->ml_prune_ab_partition = 1;
+  sf->ml_prune_4_partition = 1;
   sf->adaptive_txb_search_level = 1;
   sf->jnt_comp_skip_mv_search = 1;
   sf->model_based_prune_tx_search_level = 1;
@@ -195,7 +214,9 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
 
     sf->partition_search_breakout_rate_thr = 80;
-    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+    // Note: This speed feature is disabled as it seems to be worse in
+    // compression/quality and is also slower.
+    // sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->allow_partition_search_skip = 1;
     sf->disable_wedge_search_var_thresh = 100;
     sf->fast_wedge_sign_estimate = 1;
@@ -221,7 +242,8 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
   if (speed >= 4) {
     sf->tx_type_search.fast_intra_tx_type_search = 1;
     sf->tx_type_search.fast_inter_tx_type_search = 1;
-    sf->use_square_partition_only = !boosted;
+    sf->use_square_partition_only_threshold =
+        boosted ? BLOCK_128X128 : BLOCK_4X4;
     sf->tx_size_search_method =
         frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
     sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
@@ -242,7 +264,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
     sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
     sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
-    sf->use_square_partition_only = 1;
+    sf->use_square_partition_only_threshold = BLOCK_4X4;
     sf->tx_size_search_method = USE_LARGESTALL;
     sf->mv.search_method = BIGDIA;
     sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
@@ -363,9 +385,11 @@ static void set_dev_sf(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) {
   if (speed & PARTITION_SF) {
     if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
         has_internal_image_edge(cpi)) {
-      sf->use_square_partition_only = !frame_is_boosted(cpi);
+      sf->use_square_partition_only_threshold =
+          frame_is_boosted(cpi) ? BLOCK_128X128 : BLOCK_4X4;
     } else {
-      sf->use_square_partition_only = !frame_is_intra_only(cm);
+      sf->use_square_partition_only_threshold =
+          frame_is_intra_only(cm) ? BLOCK_128X128 : BLOCK_4X4;
     }
     sf->less_rectangular_check = 1;
     sf->prune_ext_partition_types_search_level = 2;
@@ -438,7 +462,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->tx_type_search.skip_tx_search = 0;
   sf->selective_ref_frame = 0;
   sf->less_rectangular_check = 0;
-  sf->use_square_partition_only = 0;
+  sf->use_square_partition_only_threshold = BLOCK_128X128;
   sf->auto_min_max_partition_size = NOT_IN_USE;
   sf->rd_auto_partition_min_limit = BLOCK_4X4;
   sf->default_max_partition_size = BLOCK_LARGEST;
@@ -493,6 +517,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->simple_model_rd_from_var = 0;
   sf->prune_ext_partition_types_search_level = 0;
   sf->ml_prune_ab_partition = 0;
+  sf->ml_prune_4_partition = 0;
   sf->fast_cdef_search = 0;
 
   // Set this at the appropriate speed levels
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
index 59cb6be58..d0408ba2f 100644
--- a/third_party/aom/av1/encoder/speed_features.h
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -400,6 +400,9 @@ typedef struct SPEED_FEATURES {
   // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
   int ml_prune_ab_partition;
 
+  // Use a ML model to prune horz4 and vert4 partitions.
+  int ml_prune_4_partition;
+
   int fast_cdef_search;
 
   // 2-pass coding block partition search
@@ -413,8 +416,8 @@ typedef struct SPEED_FEATURES {
   // rd than partition type split.
   int less_rectangular_check;
 
-  // Disable testing non square partitions. (eg 16x32)
-  int use_square_partition_only;
+  // Use square partition only beyond this block size.
+  BLOCK_SIZE use_square_partition_only_threshold;
 
   // Sets min and max partition sizes for this superblock based on the
   // same superblock in last encoded frame, and the left and above neighbor.
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
index 250feab81..d7e4f4eb3 100644
--- a/third_party/aom/av1/encoder/temporal_filter.c
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -535,10 +535,10 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
   // Adjust the strength based on active max q.
   if (cpi->common.current_video_frame > 1)
     q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
-                                      cpi->common.bit_depth));
+                                      cpi->common.seq_params.bit_depth));
   else
     q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[KEY_FRAME],
-                                      cpi->common.bit_depth));
+                                      cpi->common.seq_params.bit_depth));
   if (q > 16) {
     strength = oxcf->arnr_strength;
   } else {
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index 84065d6de..c71f2e74c 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
 
 void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
new file mode 100644
index 000000000..592462e20
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -0,0 +1,2068 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output,
+                                      int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+  // 16-point transform network: weight pairs first, then 7 stages on x1[].
+  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+
+  // stage 1: add/sub helper pairs input[i] with input[15 - i] for i = 0..7
+  __m256i x1[16];
+  btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);
+  btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
+  btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
+  btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
+  btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
+  btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
+  btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
+  btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);
+
+  // stage 2: add/sub on x1[0..7]; cospi[32] weight pairs on x1[10..13]
+  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+
+  // stage 3: add/sub on x1[0..3], x1[8..15]; cospi[32] pairs on x1[5], x1[6]
+  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+
+  // stage 4: weight x1[0..3], x1[9,10,13,14]; add/sub on x1[4..7]
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+  // stage 5: weight x1[4..7]; add/sub on x1[8..15]
+  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+  // stage 6: weight x1[8..15] pairwise (x1[8 + i] with x1[15 - i])
+  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+
+  // stage 7: output[k] = x1[k with its 4 index bits reversed]
+  output[0] = x1[0];
+  output[1] = x1[8];
+  output[2] = x1[4];
+  output[3] = x1[12];
+  output[4] = x1[2];
+  output[5] = x1[10];
+  output[6] = x1[6];
+  output[7] = x1[14];
+  output[8] = x1[1];
+  output[9] = x1[9];
+  output[10] = x1[5];
+  output[11] = x1[13];
+  output[12] = x1[3];
+  output[13] = x1[11];
+  output[14] = x1[7];
+  output[15] = x1[15];
+}
+
+static INLINE void fdct16x32_new_avx2(const __m256i *input, __m256i *output,
+                                      int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+  __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+  __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+  __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+  __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+  __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+  __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+  __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+  __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+  __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+  __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+  __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+  __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+  __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+  __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+  __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+  __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+  __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+  __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+
+  // stage 1
+  __m256i x1[32];
+  btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+  btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+  btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+  btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+  btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+  btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+  btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+  btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+  btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+  btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+  btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+  btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+  btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+  btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+  btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+  btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+  // stage 2
+  btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+  btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+  btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+  btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+
+  // stage 3
+  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+  btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+  btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+  btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+  btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+  btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+  btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+  btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+
+  // stage 4
+  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+
+  // stage 5
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+  btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+  btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+  btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+  btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+  btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+  btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+  btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+
+  // stage 6
+  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+
+  // stage 7
+  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+  btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+  btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+  btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+  btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+  btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+  btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+  btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+
+  // stage 8
+  btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+
+  // stage 9
+  output[0] = x1[0];
+  output[1] = x1[16];
+  output[2] = x1[8];
+  output[3] = x1[24];
+  output[4] = x1[4];
+  output[5] = x1[20];
+  output[6] = x1[12];
+  output[7] = x1[28];
+  output[8] = x1[2];
+  output[9] = x1[18];
+  output[10] = x1[10];
+  output[11] = x1[26];
+  output[12] = x1[6];
+  output[13] = x1[22];
+  output[14] = x1[14];
+  output[15] = x1[30];
+  output[16] = x1[1];
+  output[17] = x1[17];
+  output[18] = x1[9];
+  output[19] = x1[25];
+  output[20] = x1[5];
+  output[21] = x1[21];
+  output[22] = x1[13];
+  output[23] = x1[29];
+  output[24] = x1[3];
+  output[25] = x1[19];
+  output[26] = x1[11];
+  output[27] = x1[27];
+  output[28] = x1[7];
+  output[29] = x1[23];
+  output[30] = x1[15];
+  output[31] = x1[31];
+}
+
+static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
+                                      int8_t cos_bit) {  // 64-point, 11 stages
+  const int32_t *cospi = cospi_arr(cos_bit);  // table selected by cos_bit
+  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+  // cospi_<s>AA_<s>BB packs (+/-cospi[AA], +/-cospi[BB]); m = minus, p = plus.
+  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+  __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+  __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
+  __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+  __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
+  __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+  __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+  __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+  __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+  __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+  __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+  __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+  __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+  __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+  __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+  __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+  __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+  __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+  __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+  __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+  __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+  __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+  __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
+  __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
+  __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
+  __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
+  __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
+  __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
+  __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
+  __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
+  __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
+  __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
+  __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
+  __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
+  __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
+  __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
+  __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
+  __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
+  __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
+  __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
+  __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
+  __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
+  __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
+  __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
+  __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
+  __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
+  __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
+  __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
+  __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
+  __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
+  __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
+  __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
+  __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
+  __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);
+  // NOTE(review): btf_* helpers are defined elsewhere; confirm semantics.
+  // stage 1: pair input[i] with input[63 - i] -> x1[i], x1[63 - i]
+  __m256i x1[64];  // scratch; helpers take pointers into it
+  btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+  btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+  btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+  btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+  btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+  btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+  btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+  btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+  btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+  btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+  btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+  btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+  btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+  btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+  btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+  btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+  btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+  btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+  btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+  btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+  btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+  btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+  btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+  btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+  btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+  btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+  btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+  btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+  btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+  btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+  btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+  btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+  // stage 2: adds_subs on (i, 31 - i), i < 16; w16 on (40 + i, 55 - i), i < 8
+  btf_16_adds_subs_avx2(&x1[0], &x1[31]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[30]);
+  btf_16_adds_subs_avx2(&x1[2], &x1[29]);
+  btf_16_adds_subs_avx2(&x1[3], &x1[28]);
+  btf_16_adds_subs_avx2(&x1[4], &x1[27]);
+  btf_16_adds_subs_avx2(&x1[5], &x1[26]);
+  btf_16_adds_subs_avx2(&x1[6], &x1[25]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[24]);
+  btf_16_adds_subs_avx2(&x1[8], &x1[23]);
+  btf_16_adds_subs_avx2(&x1[9], &x1[22]);
+  btf_16_adds_subs_avx2(&x1[10], &x1[21]);
+  btf_16_adds_subs_avx2(&x1[11], &x1[20]);
+  btf_16_adds_subs_avx2(&x1[12], &x1[19]);
+  btf_16_adds_subs_avx2(&x1[13], &x1[18]);
+  btf_16_adds_subs_avx2(&x1[14], &x1[17]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[16]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);
+
+  // stage 3: adds_subs in 0..15 and 32..63; w16 on (20 + i, 27 - i), i < 4
+  btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+  btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+  btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+  btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[32], &x1[47]);
+  btf_16_adds_subs_avx2(&x1[33], &x1[46]);
+  btf_16_adds_subs_avx2(&x1[34], &x1[45]);
+  btf_16_adds_subs_avx2(&x1[35], &x1[44]);
+  btf_16_adds_subs_avx2(&x1[36], &x1[43]);
+  btf_16_adds_subs_avx2(&x1[37], &x1[42]);
+  btf_16_adds_subs_avx2(&x1[38], &x1[41]);
+  btf_16_adds_subs_avx2(&x1[39], &x1[40]);
+  btf_16_adds_subs_avx2(&x1[63], &x1[48]);
+  btf_16_adds_subs_avx2(&x1[62], &x1[49]);
+  btf_16_adds_subs_avx2(&x1[61], &x1[50]);
+  btf_16_adds_subs_avx2(&x1[60], &x1[51]);
+  btf_16_adds_subs_avx2(&x1[59], &x1[52]);
+  btf_16_adds_subs_avx2(&x1[58], &x1[53]);
+  btf_16_adds_subs_avx2(&x1[57], &x1[54]);
+  btf_16_adds_subs_avx2(&x1[56], &x1[55]);
+
+  // stage 4: adds_subs in 0..7 and 16..31; w16 in 10..13 and 36..59
+  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+  btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+  btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+  btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+  btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+  btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+  btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+  btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);
+
+  // stage 5: adds_subs in 0..3, 8..15, 32..63; w16 on (5, 6) and in 18..29
+  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[32], &x1[39]);
+  btf_16_adds_subs_avx2(&x1[33], &x1[38]);
+  btf_16_adds_subs_avx2(&x1[34], &x1[37]);
+  btf_16_adds_subs_avx2(&x1[35], &x1[36]);
+  btf_16_adds_subs_avx2(&x1[47], &x1[40]);
+  btf_16_adds_subs_avx2(&x1[46], &x1[41]);
+  btf_16_adds_subs_avx2(&x1[45], &x1[42]);
+  btf_16_adds_subs_avx2(&x1[44], &x1[43]);
+  btf_16_adds_subs_avx2(&x1[48], &x1[55]);
+  btf_16_adds_subs_avx2(&x1[49], &x1[54]);
+  btf_16_adds_subs_avx2(&x1[50], &x1[53]);
+  btf_16_adds_subs_avx2(&x1[51], &x1[52]);
+  btf_16_adds_subs_avx2(&x1[63], &x1[56]);
+  btf_16_adds_subs_avx2(&x1[62], &x1[57]);
+  btf_16_adds_subs_avx2(&x1[61], &x1[58]);
+  btf_16_adds_subs_avx2(&x1[60], &x1[59]);
+
+  // stage 6: w16 in 0..3, 9..14, 34..61; adds_subs in 4..7 and 16..31
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+  btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+  btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+  btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+  btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+  btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+  btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+  btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);
+
+  // stage 7: w16 in 4..7 and 17..30; adds_subs in 8..15 and 32..63
+  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[32], &x1[35]);
+  btf_16_adds_subs_avx2(&x1[33], &x1[34]);
+  btf_16_adds_subs_avx2(&x1[39], &x1[36]);
+  btf_16_adds_subs_avx2(&x1[38], &x1[37]);
+  btf_16_adds_subs_avx2(&x1[40], &x1[43]);
+  btf_16_adds_subs_avx2(&x1[41], &x1[42]);
+  btf_16_adds_subs_avx2(&x1[47], &x1[44]);
+  btf_16_adds_subs_avx2(&x1[46], &x1[45]);
+  btf_16_adds_subs_avx2(&x1[48], &x1[51]);
+  btf_16_adds_subs_avx2(&x1[49], &x1[50]);
+  btf_16_adds_subs_avx2(&x1[55], &x1[52]);
+  btf_16_adds_subs_avx2(&x1[54], &x1[53]);
+  btf_16_adds_subs_avx2(&x1[56], &x1[59]);
+  btf_16_adds_subs_avx2(&x1[57], &x1[58]);
+  btf_16_adds_subs_avx2(&x1[63], &x1[60]);
+  btf_16_adds_subs_avx2(&x1[62], &x1[61]);
+
+  // stage 8: w16 in 8..15 and 33..62; adds_subs in 16..31
+  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+  btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+  btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+  btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+  btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+  btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+  btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+  btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);
+
+  // stage 9: w16 on (16 + i, 31 - i), i < 8; adds_subs in 32..63
+  btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x1[32], &x1[33]);
+  btf_16_adds_subs_avx2(&x1[35], &x1[34]);
+  btf_16_adds_subs_avx2(&x1[36], &x1[37]);
+  btf_16_adds_subs_avx2(&x1[39], &x1[38]);
+  btf_16_adds_subs_avx2(&x1[40], &x1[41]);
+  btf_16_adds_subs_avx2(&x1[43], &x1[42]);
+  btf_16_adds_subs_avx2(&x1[44], &x1[45]);
+  btf_16_adds_subs_avx2(&x1[47], &x1[46]);
+  btf_16_adds_subs_avx2(&x1[48], &x1[49]);
+  btf_16_adds_subs_avx2(&x1[51], &x1[50]);
+  btf_16_adds_subs_avx2(&x1[52], &x1[53]);
+  btf_16_adds_subs_avx2(&x1[55], &x1[54]);
+  btf_16_adds_subs_avx2(&x1[56], &x1[57]);
+  btf_16_adds_subs_avx2(&x1[59], &x1[58]);
+  btf_16_adds_subs_avx2(&x1[60], &x1[61]);
+  btf_16_adds_subs_avx2(&x1[63], &x1[62]);
+
+  // stage 10: w16 on (32 + i, 63 - i), i < 16
+  btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);
+
+  // stage 11: output[k] = x1[r], r = k with its 6 index bits reversed
+  output[0] = x1[0];
+  output[1] = x1[32];
+  output[2] = x1[16];
+  output[3] = x1[48];
+  output[4] = x1[8];
+  output[5] = x1[40];
+  output[6] = x1[24];
+  output[7] = x1[56];
+  output[8] = x1[4];
+  output[9] = x1[36];
+  output[10] = x1[20];
+  output[11] = x1[52];
+  output[12] = x1[12];
+  output[13] = x1[44];
+  output[14] = x1[28];
+  output[15] = x1[60];
+  output[16] = x1[2];
+  output[17] = x1[34];
+  output[18] = x1[18];
+  output[19] = x1[50];
+  output[20] = x1[10];
+  output[21] = x1[42];
+  output[22] = x1[26];
+  output[23] = x1[58];
+  output[24] = x1[6];
+  output[25] = x1[38];
+  output[26] = x1[22];
+  output[27] = x1[54];
+  output[28] = x1[14];
+  output[29] = x1[46];
+  output[30] = x1[30];
+  output[31] = x1[62];
+  output[32] = x1[1];
+  output[33] = x1[33];
+  output[34] = x1[17];
+  output[35] = x1[49];
+  output[36] = x1[9];
+  output[37] = x1[41];
+  output[38] = x1[25];
+  output[39] = x1[57];
+  output[40] = x1[5];
+  output[41] = x1[37];
+  output[42] = x1[21];
+  output[43] = x1[53];
+  output[44] = x1[13];
+  output[45] = x1[45];
+  output[46] = x1[29];
+  output[47] = x1[61];
+  output[48] = x1[3];
+  output[49] = x1[35];
+  output[50] = x1[19];
+  output[51] = x1[51];
+  output[52] = x1[11];
+  output[53] = x1[43];
+  output[54] = x1[27];
+  output[55] = x1[59];
+  output[56] = x1[7];
+  output[57] = x1[39];
+  output[58] = x1[23];
+  output[59] = x1[55];
+  output[60] = x1[15];
+  output[61] = x1[47];
+  output[62] = x1[31];
+  output[63] = x1[63];
+}
+
+static INLINE void av1_fdct32_new_avx2(const __m256i *input, __m256i *output,
+                                       int8_t cos_bit) {  // 32-point, 9 stages
+  __m256i x1[32];  // scratch; helpers take pointers into it
+  const int32_t *cospi = cospi_arr(cos_bit);  // table selected by cos_bit
+  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+  // stage 0: no work here; stage 1 reads input[] directly
+  // stage 1: pair input[i] with input[31 - i] -> x1[i], x1[31 - i]
+  btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+  btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+  btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+  btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+  btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+  btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+  btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+  btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+  btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+  btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+  btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+  btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+  btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+  btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+  btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+  btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+  // stage 2: add_sub on (i, 15 - i), i < 8; type0 on (20 + i, 27 - i), i < 4
+  btf_32_add_sub_avx2(&x1[0], &x1[15]);
+  btf_32_add_sub_avx2(&x1[1], &x1[14]);
+  btf_32_add_sub_avx2(&x1[2], &x1[13]);
+  btf_32_add_sub_avx2(&x1[3], &x1[12]);
+  btf_32_add_sub_avx2(&x1[4], &x1[11]);
+  btf_32_add_sub_avx2(&x1[5], &x1[10]);
+  btf_32_add_sub_avx2(&x1[6], &x1[9]);
+  btf_32_add_sub_avx2(&x1[7], &x1[8]);
+  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
+  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
+  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
+  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);
+
+  // stage 3: add_sub in 0..7 and 16..31; type0 on (10, 13) and (11, 12)
+  btf_32_add_sub_avx2(&x1[0], &x1[7]);
+  btf_32_add_sub_avx2(&x1[1], &x1[6]);
+  btf_32_add_sub_avx2(&x1[2], &x1[5]);
+  btf_32_add_sub_avx2(&x1[3], &x1[4]);
+  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
+  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[16], &x1[23]);
+  btf_32_add_sub_avx2(&x1[17], &x1[22]);
+  btf_32_add_sub_avx2(&x1[18], &x1[21]);
+  btf_32_add_sub_avx2(&x1[19], &x1[20]);
+  btf_32_add_sub_avx2(&x1[31], &x1[24]);
+  btf_32_add_sub_avx2(&x1[30], &x1[25]);
+  btf_32_add_sub_avx2(&x1[29], &x1[26]);
+  btf_32_add_sub_avx2(&x1[28], &x1[27]);
+
+  // stage 4: add_sub in 0..3 and 8..15; type0 on (5, 6) and in 18..29
+  btf_32_add_sub_avx2(&x1[0], &x1[3]);
+  btf_32_add_sub_avx2(&x1[1], &x1[2]);
+  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[8], &x1[11]);
+  btf_32_add_sub_avx2(&x1[9], &x1[10]);
+  btf_32_add_sub_avx2(&x1[15], &x1[12]);
+  btf_32_add_sub_avx2(&x1[14], &x1[13]);
+  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
+  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
+  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
+  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);
+
+  // stage 5: type0/type1 in 0..3, type0 in 9..14; add_sub in 4..7, 16..31
+  btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
+  btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[4], &x1[5]);
+  btf_32_add_sub_avx2(&x1[7], &x1[6]);
+  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
+  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[16], &x1[19]);
+  btf_32_add_sub_avx2(&x1[17], &x1[18]);
+  btf_32_add_sub_avx2(&x1[23], &x1[20]);
+  btf_32_add_sub_avx2(&x1[22], &x1[21]);
+  btf_32_add_sub_avx2(&x1[24], &x1[27]);
+  btf_32_add_sub_avx2(&x1[25], &x1[26]);
+  btf_32_add_sub_avx2(&x1[31], &x1[28]);
+  btf_32_add_sub_avx2(&x1[30], &x1[29]);
+
+  // stage 6: type1 in 4..7; add_sub in 8..15; type0 in 17..30
+  btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
+  btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[8], &x1[9]);
+  btf_32_add_sub_avx2(&x1[11], &x1[10]);
+  btf_32_add_sub_avx2(&x1[12], &x1[13]);
+  btf_32_add_sub_avx2(&x1[15], &x1[14]);
+  btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
+  btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
+  btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
+  btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);
+
+  // stage 7: type1 on (8 + i, 15 - i), i < 4; add_sub in 16..31
+  btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
+  btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
+  btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
+  btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[16], &x1[17]);
+  btf_32_add_sub_avx2(&x1[19], &x1[18]);
+  btf_32_add_sub_avx2(&x1[20], &x1[21]);
+  btf_32_add_sub_avx2(&x1[23], &x1[22]);
+  btf_32_add_sub_avx2(&x1[24], &x1[25]);
+  btf_32_add_sub_avx2(&x1[27], &x1[26]);
+  btf_32_add_sub_avx2(&x1[28], &x1[29]);
+  btf_32_add_sub_avx2(&x1[31], &x1[30]);
+
+  // stage 8: type1 on (16 + i, 31 - i), i < 8
+  btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
+  btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
+  btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
+  btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
+  btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
+  btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
+  btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
+  btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);
+
+  // stage 9: output[k] = x1[r], r = k with its 5 index bits reversed
+  output[0] = x1[0];
+  output[1] = x1[16];
+  output[2] = x1[8];
+  output[3] = x1[24];
+  output[4] = x1[4];
+  output[5] = x1[20];
+  output[6] = x1[12];
+  output[7] = x1[28];
+  output[8] = x1[2];
+  output[9] = x1[18];
+  output[10] = x1[10];
+  output[11] = x1[26];
+  output[12] = x1[6];
+  output[13] = x1[22];
+  output[14] = x1[14];
+  output[15] = x1[30];
+  output[16] = x1[1];
+  output[17] = x1[17];
+  output[18] = x1[9];
+  output[19] = x1[25];
+  output[20] = x1[5];
+  output[21] = x1[21];
+  output[22] = x1[13];
+  output[23] = x1[29];
+  output[24] = x1[3];
+  output[25] = x1[19];
+  output[26] = x1[11];
+  output[27] = x1[27];
+  output[28] = x1[7];
+  output[29] = x1[23];
+  output[30] = x1[15];
+  output[31] = x1[31];
+}
+
+// 64-point forward DCT stage network (going by the function name) working on
+// 32-bit lanes of __m256i vectors.
+//
+// Reads input[0..63], runs ten butterfly stages in place on a local 64-entry
+// working array, and finally writes output[0..63] in bit-reversed index
+// order. Every input is consumed in stage 1 and outputs are only written in
+// stage 11.
+//
+// cos_bit selects the cosine table through cospi_arr() and also fixes the
+// rounding offset 1 << (cos_bit - 1) that is handed to the rotation helpers.
+//
+// NOTE(review): the btf_32_* helpers are defined elsewhere. The stage notes
+// below assume the add_sub helpers produce (a + b, a - b) and that the
+// type0/type1 helpers are rounded two-weight rotations -- confirm against
+// their definitions.
+static INLINE void av1_fdct64_new_avx2(const __m256i *input, __m256i *output,
+                                       int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  // Rounding offset passed to every rotation helper call.
+  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+  // Each cosine-table entry used below is broadcast to all 32-bit lanes.
+  // Naming: cospi_pNN holds +cospi[NN], cospi_mNN holds -cospi[NN].
+  __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+  __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+  __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+  __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+  __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+  __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+  __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+  __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+  __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+  __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+  __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+  __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+  __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+  __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+  __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+  __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+  __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+  __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+  __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+  __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+  __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+  __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+  __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+  __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+  __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+  __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+  __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+  __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+  __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+  __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+  __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+  __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+  __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+  __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+  __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+  __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+  __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+  __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+  __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+  __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+  __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+  __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+  __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+  __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+  __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+  __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+  __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
+  __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+  __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+  __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+  __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+  __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+  __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+  __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+  __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+  __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+  __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+  __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+  __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+  __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+  __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+  __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+  __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+  __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+  __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+  __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+  __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+  __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+  __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+  __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+  __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+  __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+  __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+  __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+  __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+  __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+  __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+  __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+
+  // stage 1: pair input[i] with input[63 - i] for i = 0..31, filling
+  // x1[i] and x1[63 - i]. This is the only stage that reads from input.
+  __m256i x1[64];
+  btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+  btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+  btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+  btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+  btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+  btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+  btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+  btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+  btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+  btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+  btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+  btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+  btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+  btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+  btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+  btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+  btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+  btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+  btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+  btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+  btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+  btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+  btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+  btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+  btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+  btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+  btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+  btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+  btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+  btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+  btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+  btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+  // stage 2: in-place add/sub on pairs (i, 31 - i) of the lower half, and
+  // cospi[32] rotations on pairs (40 + i, 55 - i) of the upper half.
+  // x1[32..39] and x1[56..63] are not touched in this stage.
+  btf_32_add_sub_avx2(&x1[0], &x1[31]);
+  btf_32_add_sub_avx2(&x1[1], &x1[30]);
+  btf_32_add_sub_avx2(&x1[2], &x1[29]);
+  btf_32_add_sub_avx2(&x1[3], &x1[28]);
+  btf_32_add_sub_avx2(&x1[4], &x1[27]);
+  btf_32_add_sub_avx2(&x1[5], &x1[26]);
+  btf_32_add_sub_avx2(&x1[6], &x1[25]);
+  btf_32_add_sub_avx2(&x1[7], &x1[24]);
+  btf_32_add_sub_avx2(&x1[8], &x1[23]);
+  btf_32_add_sub_avx2(&x1[9], &x1[22]);
+  btf_32_add_sub_avx2(&x1[10], &x1[21]);
+  btf_32_add_sub_avx2(&x1[11], &x1[20]);
+  btf_32_add_sub_avx2(&x1[12], &x1[19]);
+  btf_32_add_sub_avx2(&x1[13], &x1[18]);
+  btf_32_add_sub_avx2(&x1[14], &x1[17]);
+  btf_32_add_sub_avx2(&x1[15], &x1[16]);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);
+
+  // stage 3: add/sub on x1[0..15], cospi[32] rotations on x1[20..27], and
+  // add/sub across the two halves of x1[32..63].
+  btf_32_add_sub_avx2(&x1[0], &x1[15]);
+  btf_32_add_sub_avx2(&x1[1], &x1[14]);
+  btf_32_add_sub_avx2(&x1[2], &x1[13]);
+  btf_32_add_sub_avx2(&x1[3], &x1[12]);
+  btf_32_add_sub_avx2(&x1[4], &x1[11]);
+  btf_32_add_sub_avx2(&x1[5], &x1[10]);
+  btf_32_add_sub_avx2(&x1[6], &x1[9]);
+  btf_32_add_sub_avx2(&x1[7], &x1[8]);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[32], &x1[47]);
+  btf_32_add_sub_avx2(&x1[33], &x1[46]);
+  btf_32_add_sub_avx2(&x1[34], &x1[45]);
+  btf_32_add_sub_avx2(&x1[35], &x1[44]);
+  btf_32_add_sub_avx2(&x1[36], &x1[43]);
+  btf_32_add_sub_avx2(&x1[37], &x1[42]);
+  btf_32_add_sub_avx2(&x1[38], &x1[41]);
+  btf_32_add_sub_avx2(&x1[39], &x1[40]);
+  btf_32_add_sub_avx2(&x1[63], &x1[48]);
+  btf_32_add_sub_avx2(&x1[62], &x1[49]);
+  btf_32_add_sub_avx2(&x1[61], &x1[50]);
+  btf_32_add_sub_avx2(&x1[60], &x1[51]);
+  btf_32_add_sub_avx2(&x1[59], &x1[52]);
+  btf_32_add_sub_avx2(&x1[58], &x1[53]);
+  btf_32_add_sub_avx2(&x1[57], &x1[54]);
+  btf_32_add_sub_avx2(&x1[56], &x1[55]);
+
+  // stage 4: add/sub on x1[0..7] and x1[16..31], cospi[32] rotations on
+  // x1[10..13], and cospi[16]/cospi[48] rotations on x1[36..43] paired with
+  // x1[52..59].
+  btf_32_add_sub_avx2(&x1[0], &x1[7]);
+  btf_32_add_sub_avx2(&x1[1], &x1[6]);
+  btf_32_add_sub_avx2(&x1[2], &x1[5]);
+  btf_32_add_sub_avx2(&x1[3], &x1[4]);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[16], &x1[23]);
+  btf_32_add_sub_avx2(&x1[17], &x1[22]);
+  btf_32_add_sub_avx2(&x1[18], &x1[21]);
+  btf_32_add_sub_avx2(&x1[19], &x1[20]);
+  btf_32_add_sub_avx2(&x1[31], &x1[24]);
+  btf_32_add_sub_avx2(&x1[30], &x1[25]);
+  btf_32_add_sub_avx2(&x1[29], &x1[26]);
+  btf_32_add_sub_avx2(&x1[28], &x1[27]);
+  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);
+
+  // stage 5: add/sub on x1[0..3], x1[8..15] and x1[32..63]; a cospi[32]
+  // rotation on (5, 6); cospi[16]/cospi[48] rotations on x1[18..21] paired
+  // with x1[26..29].
+  btf_32_add_sub_avx2(&x1[0], &x1[3]);
+  btf_32_add_sub_avx2(&x1[1], &x1[2]);
+  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[8], &x1[11]);
+  btf_32_add_sub_avx2(&x1[9], &x1[10]);
+  btf_32_add_sub_avx2(&x1[15], &x1[12]);
+  btf_32_add_sub_avx2(&x1[14], &x1[13]);
+  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[32], &x1[39]);
+  btf_32_add_sub_avx2(&x1[33], &x1[38]);
+  btf_32_add_sub_avx2(&x1[34], &x1[37]);
+  btf_32_add_sub_avx2(&x1[35], &x1[36]);
+  btf_32_add_sub_avx2(&x1[47], &x1[40]);
+  btf_32_add_sub_avx2(&x1[46], &x1[41]);
+  btf_32_add_sub_avx2(&x1[45], &x1[42]);
+  btf_32_add_sub_avx2(&x1[44], &x1[43]);
+  btf_32_add_sub_avx2(&x1[48], &x1[55]);
+  btf_32_add_sub_avx2(&x1[49], &x1[54]);
+  btf_32_add_sub_avx2(&x1[50], &x1[53]);
+  btf_32_add_sub_avx2(&x1[51], &x1[52]);
+  btf_32_add_sub_avx2(&x1[63], &x1[56]);
+  btf_32_add_sub_avx2(&x1[62], &x1[57]);
+  btf_32_add_sub_avx2(&x1[61], &x1[58]);
+  btf_32_add_sub_avx2(&x1[60], &x1[59]);
+
+  // stage 6: the first two rotations produce the final values of x1[0..3]
+  // (no later stage modifies them). The rest continues the add/sub and
+  // rotation pattern on x1[4..63].
+  btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[4], &x1[5]);
+  btf_32_add_sub_avx2(&x1[7], &x1[6]);
+  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[16], &x1[19]);
+  btf_32_add_sub_avx2(&x1[17], &x1[18]);
+  btf_32_add_sub_avx2(&x1[23], &x1[20]);
+  btf_32_add_sub_avx2(&x1[22], &x1[21]);
+  btf_32_add_sub_avx2(&x1[24], &x1[27]);
+  btf_32_add_sub_avx2(&x1[25], &x1[26]);
+  btf_32_add_sub_avx2(&x1[31], &x1[28]);
+  btf_32_add_sub_avx2(&x1[30], &x1[29]);
+  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);
+
+  // stage 7: the first two rotations produce the final values of x1[4..7].
+  btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[8], &x1[9]);
+  btf_32_add_sub_avx2(&x1[11], &x1[10]);
+  btf_32_add_sub_avx2(&x1[12], &x1[13]);
+  btf_32_add_sub_avx2(&x1[15], &x1[14]);
+  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[32], &x1[35]);
+  btf_32_add_sub_avx2(&x1[33], &x1[34]);
+  btf_32_add_sub_avx2(&x1[39], &x1[36]);
+  btf_32_add_sub_avx2(&x1[38], &x1[37]);
+  btf_32_add_sub_avx2(&x1[40], &x1[43]);
+  btf_32_add_sub_avx2(&x1[41], &x1[42]);
+  btf_32_add_sub_avx2(&x1[47], &x1[44]);
+  btf_32_add_sub_avx2(&x1[46], &x1[45]);
+  btf_32_add_sub_avx2(&x1[48], &x1[51]);
+  btf_32_add_sub_avx2(&x1[49], &x1[50]);
+  btf_32_add_sub_avx2(&x1[55], &x1[52]);
+  btf_32_add_sub_avx2(&x1[54], &x1[53]);
+  btf_32_add_sub_avx2(&x1[56], &x1[59]);
+  btf_32_add_sub_avx2(&x1[57], &x1[58]);
+  btf_32_add_sub_avx2(&x1[63], &x1[60]);
+  btf_32_add_sub_avx2(&x1[62], &x1[61]);
+
+  // stage 8: the first four rotations produce the final values of
+  // x1[8..15].
+  btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[16], &x1[17]);
+  btf_32_add_sub_avx2(&x1[19], &x1[18]);
+  btf_32_add_sub_avx2(&x1[20], &x1[21]);
+  btf_32_add_sub_avx2(&x1[23], &x1[22]);
+  btf_32_add_sub_avx2(&x1[24], &x1[25]);
+  btf_32_add_sub_avx2(&x1[27], &x1[26]);
+  btf_32_add_sub_avx2(&x1[28], &x1[29]);
+  btf_32_add_sub_avx2(&x1[31], &x1[30]);
+  btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
+  btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);
+
+  // stage 9: the eight rotations produce the final values of x1[16..31];
+  // the add/sub calls prepare x1[32..63] for the last rotation stage.
+  btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
+  btf_32_add_sub_avx2(&x1[32], &x1[33]);
+  btf_32_add_sub_avx2(&x1[35], &x1[34]);
+  btf_32_add_sub_avx2(&x1[36], &x1[37]);
+  btf_32_add_sub_avx2(&x1[39], &x1[38]);
+  btf_32_add_sub_avx2(&x1[40], &x1[41]);
+  btf_32_add_sub_avx2(&x1[43], &x1[42]);
+  btf_32_add_sub_avx2(&x1[44], &x1[45]);
+  btf_32_add_sub_avx2(&x1[47], &x1[46]);
+  btf_32_add_sub_avx2(&x1[48], &x1[49]);
+  btf_32_add_sub_avx2(&x1[51], &x1[50]);
+  btf_32_add_sub_avx2(&x1[52], &x1[53]);
+  btf_32_add_sub_avx2(&x1[55], &x1[54]);
+  btf_32_add_sub_avx2(&x1[56], &x1[57]);
+  btf_32_add_sub_avx2(&x1[59], &x1[58]);
+  btf_32_add_sub_avx2(&x1[60], &x1[61]);
+  btf_32_add_sub_avx2(&x1[63], &x1[62]);
+
+  // stage 10: rotations on pairs (32 + i, 63 - i) for i = 0..15, using the
+  // odd-indexed cosine entries. These produce the final x1[32..63].
+  btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
+  btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
+
+  // stage 11: write the results in bit-reversed order, i.e. output[k] takes
+  // x1[j] where j is k with its 6 index bits reversed (1 -> 32, 2 -> 16,
+  // 3 -> 48, ...).
+  output[0] = x1[0];
+  output[1] = x1[32];
+  output[2] = x1[16];
+  output[3] = x1[48];
+  output[4] = x1[8];
+  output[5] = x1[40];
+  output[6] = x1[24];
+  output[7] = x1[56];
+  output[8] = x1[4];
+  output[9] = x1[36];
+  output[10] = x1[20];
+  output[11] = x1[52];
+  output[12] = x1[12];
+  output[13] = x1[44];
+  output[14] = x1[28];
+  output[15] = x1[60];
+  output[16] = x1[2];
+  output[17] = x1[34];
+  output[18] = x1[18];
+  output[19] = x1[50];
+  output[20] = x1[10];
+  output[21] = x1[42];
+  output[22] = x1[26];
+  output[23] = x1[58];
+  output[24] = x1[6];
+  output[25] = x1[38];
+  output[26] = x1[22];
+  output[27] = x1[54];
+  output[28] = x1[14];
+  output[29] = x1[46];
+  output[30] = x1[30];
+  output[31] = x1[62];
+  output[32] = x1[1];
+  output[33] = x1[33];
+  output[34] = x1[17];
+  output[35] = x1[49];
+  output[36] = x1[9];
+  output[37] = x1[41];
+  output[38] = x1[25];
+  output[39] = x1[57];
+  output[40] = x1[5];
+  output[41] = x1[37];
+  output[42] = x1[21];
+  output[43] = x1[53];
+  output[44] = x1[13];
+  output[45] = x1[45];
+  output[46] = x1[29];
+  output[47] = x1[61];
+  output[48] = x1[3];
+  output[49] = x1[35];
+  output[50] = x1[19];
+  output[51] = x1[51];
+  output[52] = x1[11];
+  output[53] = x1[43];
+  output[54] = x1[27];
+  output[55] = x1[59];
+  output[56] = x1[7];
+  output[57] = x1[39];
+  output[58] = x1[23];
+  output[59] = x1[55];
+  output[60] = x1[15];
+  output[61] = x1[47];
+  output[62] = x1[31];
+  output[63] = x1[63];
+}
+
+// 16-point forward ADST stage network (going by the function name) working
+// on 16-bit lanes of __m256i vectors.
+//
+// Reads input[0..15], runs the stages in place on a local 16-entry working
+// array, and writes output[0..15] in a fixed permuted order. Every input is
+// consumed in stage 1 and outputs are only written in stage 9.
+//
+// cos_bit selects the cosine table through cospi_arr() and also fixes the
+// rounding offset 1 << (cos_bit - 1) handed to the rotation helper.
+//
+// NOTE(review): btf_16_w16_avx2, btf_16_adds_subs_avx2 and pair_set_w16_epi16
+// are defined elsewhere. The notes below assume the first is a rounded
+// two-weight rotation, the second an in-place (a + b, a - b) butterfly, and
+// the third a broadcast of a 16-bit weight pair -- confirm against their
+// definitions.
+static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
+                                       int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  // NOTE(review): identifiers that begin with a double underscore are
+  // reserved for the implementation in C; consider renaming.
+  const __m256i __zero = _mm256_setzero_si256();
+  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+  // Weight pairs for the rotations. Naming: cospi_<a>_<b>, where each part
+  // is pNN for +cospi[NN] or mNN for -cospi[NN].
+  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+  __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+  __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+  // stage 1: load the inputs in permuted order. Half of them are negated by
+  // a saturating subtract from zero, so a lane holding -32768 becomes 32767
+  // rather than wrapping.
+  __m256i x1[16];
+  x1[0] = input[0];
+  x1[1] = _mm256_subs_epi16(__zero, input[15]);
+  x1[2] = _mm256_subs_epi16(__zero, input[7]);
+  x1[3] = input[8];
+  x1[4] = _mm256_subs_epi16(__zero, input[3]);
+  x1[5] = input[12];
+  x1[6] = input[4];
+  x1[7] = _mm256_subs_epi16(__zero, input[11]);
+  x1[8] = _mm256_subs_epi16(__zero, input[1]);
+  x1[9] = input[14];
+  x1[10] = input[6];
+  x1[11] = _mm256_subs_epi16(__zero, input[9]);
+  x1[12] = input[2];
+  x1[13] = _mm256_subs_epi16(__zero, input[13]);
+  x1[14] = _mm256_subs_epi16(__zero, input[5]);
+  x1[15] = input[10];
+
+  // stage 2: cospi[32] rotation on the upper pair of every group of four.
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
+
+  // stage 3: butterflies between entries two apart.
+  btf_16_adds_subs_avx2(&x1[0], &x1[2]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[3]);
+  btf_16_adds_subs_avx2(&x1[4], &x1[6]);
+  btf_16_adds_subs_avx2(&x1[5], &x1[7]);
+  btf_16_adds_subs_avx2(&x1[8], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[9], &x1[11]);
+  btf_16_adds_subs_avx2(&x1[12], &x1[14]);
+  btf_16_adds_subs_avx2(&x1[13], &x1[15]);
+
+  // stage 4: cospi[16]/cospi[48] rotations on the upper four of every group
+  // of eight.
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
+
+  // stage 5: butterflies between entries four apart.
+  btf_16_adds_subs_avx2(&x1[0], &x1[4]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[2], &x1[6]);
+  btf_16_adds_subs_avx2(&x1[3], &x1[7]);
+  btf_16_adds_subs_avx2(&x1[8], &x1[12]);
+  btf_16_adds_subs_avx2(&x1[9], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[10], &x1[14]);
+  btf_16_adds_subs_avx2(&x1[11], &x1[15]);
+
+  // stage 6: cospi[8]/[56] and cospi[24]/[40] rotations on x1[8..15].
+  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
+
+  // stage 7: butterflies between entries eight apart.
+  btf_16_adds_subs_avx2(&x1[0], &x1[8]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[9]);
+  btf_16_adds_subs_avx2(&x1[2], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[3], &x1[11]);
+  btf_16_adds_subs_avx2(&x1[4], &x1[12]);
+  btf_16_adds_subs_avx2(&x1[5], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[6], &x1[14]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[15]);
+
+  // stage 8: final rotation of every adjacent pair, each with its own pair
+  // of even-indexed cosine entries.
+  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
+
+  // stage 9: write the results out. Even outputs take the odd working
+  // entries in ascending order (1, 3, ..., 15); odd outputs take the even
+  // working entries in descending order (14, 12, ..., 0).
+  output[0] = x1[1];
+  output[1] = x1[14];
+  output[2] = x1[3];
+  output[3] = x1[12];
+  output[4] = x1[5];
+  output[5] = x1[10];
+  output[6] = x1[7];
+  output[7] = x1[8];
+  output[8] = x1[9];
+  output[9] = x1[6];
+  output[10] = x1[11];
+  output[11] = x1[4];
+  output[12] = x1[13];
+  output[13] = x1[2];
+  output[14] = x1[15];
+  output[15] = x1[0];
+}
+
+// Multiply-accumulates each adjacent pair of 16-bit lanes in `a` against the
+// weight pair built from (scale, 1 << (NewSqrt2Bits - 1)), then shifts every
+// 32-bit sum right arithmetically by NewSqrt2Bits.
+// The callers in this file interleave their data with lanes holding 1, so the
+// second weight presumably acts as the rounding term (this depends on how
+// pair_set_w16_epi16 orders its two arguments -- confirm there).
+static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
+  const int half = 1 << (NewSqrt2Bits - 1);
+  const __m256i weights = pair_set_w16_epi16(scale, half);
+  return _mm256_srai_epi32(_mm256_madd_epi16(a, weights), NewSqrt2Bits);
+}
+
+// Identity kernel for 16 vectors of 16-bit lanes.
+// Each lane is interleaved with a constant 1, passed through
+// scale_round_avx2() with a scale of 2 * NewSqrt2, and the 32-bit results
+// are packed back to 16 bits with signed saturation.
+// cos_bit is accepted only to match the transform_1d_avx2 signature.
+static INLINE void fidentity16x16_new_avx2(const __m256i *input,
+                                           __m256i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i ones = _mm256_set1_epi16(1);
+  const int scale = 2 * NewSqrt2;
+
+  for (int row = 0; row < 16; ++row) {
+    // Read the source vector once; input and output may be the same buffer.
+    const __m256i src = input[row];
+    const __m256i lo = _mm256_unpacklo_epi16(src, ones);
+    const __m256i hi = _mm256_unpackhi_epi16(src, ones);
+    output[row] = _mm256_packs_epi32(scale_round_avx2(lo, scale),
+                                     scale_round_avx2(hi, scale));
+  }
+}
+
+// Identity kernel for 32 vectors of 16-bit lanes: every lane is shifted left
+// by two bits (a multiply by 4 without saturation).
+// cos_bit is accepted only to match the transform_1d_avx2 signature.
+static INLINE void fidentity16x32_new_avx2(const __m256i *input,
+                                           __m256i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i *src = input;
+  __m256i *dst = output;
+  for (int remaining = 32; remaining > 0; --remaining) {
+    *dst++ = _mm256_slli_epi16(*src++, 2);
+  }
+}
+
+// Applies a shift to `size` vectors of 32-bit lanes, reading from input and
+// writing to output.
+//   bit > 0  : each vector goes through av1_round_shift_32_avx2(v, bit).
+//   bit <= 0 : each vector is shifted left by -bit.
+static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
+                                                 __m256i *output,
+                                                 const int size,
+                                                 const int bit) {
+  int idx;
+  if (bit <= 0) {
+    // Non-positive shift: plain left shift, no rounding involved.
+    for (idx = 0; idx < size; ++idx) {
+      output[idx] = _mm256_slli_epi32(input[idx], -bit);
+    }
+    return;
+  }
+  for (idx = 0; idx < size; ++idx) {
+    output[idx] = av1_round_shift_32_avx2(input[idx], bit);
+  }
+}
+
+// Same shift as av1_round_shift_array_32_avx2(), followed by the rectangular
+// scaling step: every 32-bit lane is multiplied by NewSqrt2 (low 32 bits of
+// the product are kept) and then passed through
+// av1_round_shift_32_avx2(v, NewSqrt2Bits).
+//   bit > 0  : the first step is av1_round_shift_32_avx2(v, bit).
+//   bit <= 0 : the first step is a left shift by -bit.
+static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input,
+                                                      __m256i *output,
+                                                      const int size,
+                                                      const int bit) {
+  const __m256i sqrt2 = _mm256_set1_epi32(NewSqrt2);
+  for (int idx = 0; idx < size; ++idx) {
+    __m256i shifted;
+    if (bit > 0) {
+      shifted = av1_round_shift_32_avx2(input[idx], bit);
+    } else {
+      shifted = _mm256_slli_epi32(input[idx], -bit);
+    }
+    const __m256i scaled = _mm256_mullo_epi32(sqrt2, shifted);
+    output[idx] = av1_round_shift_32_avx2(scaled, NewSqrt2Bits);
+  }
+}
+
+// Transposes an 8x8 block of 32-bit values.
+// The eight source rows are inputA[0..7] (consecutive vectors); the eight
+// result rows are written to output[0], output[stride], ...,
+// output[7 * stride]. All source rows are read before any result is written.
+static INLINE void transpose_32_8x8_avx2(int stride, const __m256i *inputA,
+                                         __m256i *output) {
+  // quad[g][c] holds, for row group g (rows 4g..4g+3), column c in its low
+  // 128-bit lane and column c + 4 in its high 128-bit lane.
+  __m256i quad[2][4];
+
+  for (int group = 0; group < 2; ++group) {
+    const __m256i *rows = inputA + 4 * group;
+    const __m256i r02_lo = _mm256_unpacklo_epi32(rows[0], rows[2]);
+    const __m256i r02_hi = _mm256_unpackhi_epi32(rows[0], rows[2]);
+    const __m256i r13_lo = _mm256_unpacklo_epi32(rows[1], rows[3]);
+    const __m256i r13_hi = _mm256_unpackhi_epi32(rows[1], rows[3]);
+
+    quad[group][0] = _mm256_unpacklo_epi32(r02_lo, r13_lo);
+    quad[group][1] = _mm256_unpackhi_epi32(r02_lo, r13_lo);
+    quad[group][2] = _mm256_unpacklo_epi32(r02_hi, r13_hi);
+    quad[group][3] = _mm256_unpackhi_epi32(r02_hi, r13_hi);
+  }
+
+  // Join the matching 128-bit halves of both row groups: 0x20 takes the two
+  // low lanes (columns 0..3), 0x31 the two high lanes (columns 4..7).
+  for (int col = 0; col < 4; ++col) {
+    output[col * stride] =
+        _mm256_permute2x128_si256(quad[0][col], quad[1][col], 0x20);
+    output[(col + 4) * stride] =
+        _mm256_permute2x128_si256(quad[0][col], quad[1][col], 0x31);
+  }
+}
+
+// Stores out_size rows of sixteen 16-bit values each, sign-extended to
+// 32 bits. Row i comes from in[i] and lands at out + i * stride.
+// The stores are aligned, so every destination row must be 32-byte aligned.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+                                                        int32_t *out,
+                                                        const int stride,
+                                                        const int out_size) {
+  for (int row = 0; row < out_size; ++row) {
+    int32_t *const dst = out + row * stride;
+    // Low 128 bits -> first eight 32-bit values.
+    const __m128i lo_half = _mm256_castsi256_si128(in[row]);
+    _mm256_store_si256((__m256i *)dst, _mm256_cvtepi16_epi32(lo_half));
+    // High 128 bits -> next eight 32-bit values.
+    const __m128i hi_half = _mm256_extracti128_si256(in[row], 1);
+    _mm256_store_si256((__m256i *)(dst + 8), _mm256_cvtepi16_epi32(hi_half));
+  }
+}
+
+// Widens the sixteen 16-bit lanes of `a` to 32 bits through
+// scale_round_avx2() with a scale of NewSqrt2 and stores them, in lane
+// order, to b[0..15]. The stores are aligned, so b must be 32-byte aligned.
+static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a,
+                                                  int32_t *const b) {
+  const __m256i ones = _mm256_set1_epi16(1);
+  // 0xd8 reorders the 64-bit quarters to 0, 2, 1, 3. The unpacks below work
+  // per 128-bit lane, so after this swap "lo" covers source lanes 0..7 and
+  // "hi" covers source lanes 8..15.
+  const __m256i swapped = _mm256_permute4x64_epi64(a, 0xd8);
+  const __m256i lo_pairs = _mm256_unpacklo_epi16(swapped, ones);
+  const __m256i hi_pairs = _mm256_unpackhi_epi16(swapped, ones);
+  _mm256_store_si256((__m256i *)b, scale_round_avx2(lo_pairs, NewSqrt2));
+  _mm256_store_si256((__m256i *)(b + 8), scale_round_avx2(hi_pairs, NewSqrt2));
+}
+
+// Stores out_size rows through store_rect_16bit_to_32bit_avx2(): row i is
+// taken from in[i] and written at out + i * stride.
+static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2(
+    const __m256i *const in, int32_t *const out, const int stride,
+    const int out_size) {
+  const __m256i *src = in;
+  for (int row = 0; row < out_size; ++row) {
+    int32_t *const dst_row = out + row * stride;
+    store_rect_16bit_to_32bit_avx2(*src++, dst_row);
+  }
+}
+
+// Column-pass 1-D kernels for the 16x32 kernel family, one slot per
+// transform type in the order named by the trailing comments.
+// Only DCT_DCT, IDTX, V_DCT and H_DCT have a kernel here; every other slot
+// is NULL.
+// NOTE(review): presumably callers never reach a NULL slot for these sizes
+// -- confirm at the call sites.
+static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
+  fdct16x32_new_avx2,       // DCT_DCT
+  NULL,                     // ADST_DCT
+  NULL,                     // DCT_ADST
+  NULL,                     // ADST_ADST
+  NULL,                     // FLIPADST_DCT
+  NULL,                     // DCT_FLIPADST
+  NULL,                     // FLIPADST_FLIPADST
+  NULL,                     // ADST_FLIPADST
+  NULL,                     // FLIPADST_ADST
+  fidentity16x32_new_avx2,  // IDTX
+  fdct16x32_new_avx2,       // V_DCT
+  fidentity16x32_new_avx2,  // H_DCT
+  NULL,                     // V_ADST
+  NULL,                     // H_ADST
+  NULL,                     // V_FLIPADST
+  NULL                      // H_FLIPADST
+};
+
+// Row-pass 1-D kernels for the 16x32 kernel family, one slot per transform
+// type in the order named by the trailing comments.
+// Only DCT_DCT, IDTX, V_DCT and H_DCT have a kernel here; every other slot
+// is NULL. Compared with the column table, V_DCT and H_DCT swap between the
+// DCT and identity kernels.
+// NOTE(review): presumably callers never reach a NULL slot for these sizes
+// -- confirm at the call sites.
+static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
+  fdct16x32_new_avx2,       // DCT_DCT
+  NULL,                     // ADST_DCT
+  NULL,                     // DCT_ADST
+  NULL,                     // ADST_ADST
+  NULL,                     // FLIPADST_DCT
+  NULL,                     // DCT_FLIPADST
+  NULL,                     // FLIPADST_FLIPADST
+  NULL,                     // ADST_FLIPADST
+  NULL,                     // FLIPADST_ADST
+  fidentity16x32_new_avx2,  // IDTX
+  fidentity16x32_new_avx2,  // V_DCT
+  fdct16x32_new_avx2,       // H_DCT
+  NULL,                     // V_ADST
+  NULL,                     // H_ADST
+  NULL,                     // V_FLIPADST
+  NULL                      // H_FLIPADST
+};
+
+// Column-pass 1-D kernels for 16x16 blocks, indexed by transform type (the
+// trailing comments name each slot). Every slot is populated.
+// For the two-part type names the kernel follows the first part; the V_*
+// types use their named kernel and the H_* types use the identity kernel.
+// ADST and FLIPADST share fadst16x16_new_avx2, so any flipping has to happen
+// outside the kernel.
+static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
+  fdct16x16_new_avx2,       // DCT_DCT
+  fadst16x16_new_avx2,      // ADST_DCT
+  fdct16x16_new_avx2,       // DCT_ADST
+  fadst16x16_new_avx2,      // ADST_ADST
+  fadst16x16_new_avx2,      // FLIPADST_DCT
+  fdct16x16_new_avx2,       // DCT_FLIPADST
+  fadst16x16_new_avx2,      // FLIPADST_FLIPADST
+  fadst16x16_new_avx2,      // ADST_FLIPADST
+  fadst16x16_new_avx2,      // FLIPADST_ADST
+  fidentity16x16_new_avx2,  // IDTX
+  fdct16x16_new_avx2,       // V_DCT
+  fidentity16x16_new_avx2,  // H_DCT
+  fadst16x16_new_avx2,      // V_ADST
+  fidentity16x16_new_avx2,  // H_ADST
+  fadst16x16_new_avx2,      // V_FLIPADST
+  fidentity16x16_new_avx2   // H_FLIPADST
+};
+
+// Row-pass 1-D kernels for 16x16 blocks, indexed by transform type (the
+// trailing comments name each slot). Every slot is populated.
+// For the two-part type names the kernel follows the second part; the H_*
+// types use their named kernel and the V_* types use the identity kernel.
+// ADST and FLIPADST share fadst16x16_new_avx2, so any flipping has to happen
+// outside the kernel.
+static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
+  fdct16x16_new_avx2,       // DCT_DCT
+  fdct16x16_new_avx2,       // ADST_DCT
+  fadst16x16_new_avx2,      // DCT_ADST
+  fadst16x16_new_avx2,      // ADST_ADST
+  fdct16x16_new_avx2,       // FLIPADST_DCT
+  fadst16x16_new_avx2,      // DCT_FLIPADST
+  fadst16x16_new_avx2,      // FLIPADST_FLIPADST
+  fadst16x16_new_avx2,      // ADST_FLIPADST
+  fadst16x16_new_avx2,      // FLIPADST_ADST
+  fidentity16x16_new_avx2,  // IDTX
+  fidentity16x16_new_avx2,  // V_DCT
+  fdct16x16_new_avx2,       // H_DCT
+  fidentity16x16_new_avx2,  // V_ADST
+  fadst16x16_new_avx2,      // H_ADST
+  fidentity16x16_new_avx2,  // V_FLIPADST
+  fadst16x16_new_avx2       // H_FLIPADST
+};
+
+static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;  // bd is not used by this kernel
+  const TX_SIZE tx_size = TX_16X16;
+  __m256i buf0[16], buf1[16];  // buf0: column work area, buf1: transposed
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];  // three shift stages
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+  int ud_flip, lr_flip;
+  // Column pass over the single 16-wide strip, then transpose into buf1.
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int32_t i = 0;  // strip index; constant because there is one strip
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+  }
+  round_shift_16bit_w16_avx2(buf0, height, shift[0]);  // stage-0 shift
+  col_txfm(buf0, buf0, cos_bit_col);                   // in place
+  round_shift_16bit_w16_avx2(buf0, height, shift[1]);  // stage-1 shift
+  transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+  // Row pass: runs in buf1, or in buf0 when a left/right flip is requested.
+  __m256i *buf;
+  if (lr_flip) {
+    buf = buf0;  // buf0 is free again after the transpose above
+    flip_buf_avx2(buf1 + width * i, buf, width);  // presumably reverses order
+  } else {
+    buf = buf1 + width * i;
+  }
+  row_txfm(buf, buf, cos_bit_row);                   // in place
+  round_shift_16bit_w16_avx2(buf, width, shift[2]);  // stage-2 shift
+  transpose_16bit_16x16_avx2(buf, buf);
+  store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, 16);
+}
+
+static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;  // bd is not used by this kernel
+  const TX_SIZE tx_size = TX_32X32;
+  __m256i buf0[32], buf1[128];  // buf0: one column strip, buf1: transposed
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];  // three shift stages
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+  // NOTE(review): row_txfm16x32_arr is NULL for ADST-family types; unchecked.
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // Column pass: one iteration per 16-column strip of the input.
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+                                           height);
+    } else {
+      load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    }
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);  // stage-0 shift
+    col_txfm(buf0, buf0, cos_bit_col);                   // in place
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);  // stage-1 shift
+    transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i);
+    transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i);
+  }
+  // Row pass: one iteration per group of `width` transposed vectors in buf1.
+  for (int i = 0; i < 2; i++) {
+    __m256i *buf;
+    if (lr_flip) {
+      buf = buf0;  // buf0 is free once the column pass has finished
+      flip_buf_avx2(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row);                   // in place
+    round_shift_16bit_w16_avx2(buf, width, shift[2]);  // stage-2 shift
+    transpose_16bit_16x16_avx2(buf, buf);  // first 16 vectors of the group
+    store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width,
+                                         16);
+    transpose_16bit_16x16_avx2(buf + 16, buf + 16);  // remaining 16 vectors
+    store_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16 * width * i + 16,
+                                         width, 16);
+  }
+}
+
+static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;       // bd is not used by this kernel
+  (void)tx_type;  // only read by the assert, which vanishes under NDEBUG
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X64;
+  __m256i buf0[64], buf1[256];  // buf0: one column strip, buf1: transposed
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];  // three shift stages
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+  const int width_div16 = (width >> 4);
+  const int height_div16 = (height >> 4);
+  // Column pass per 16-column strip; at most 2 groups of 16 rows are kept.
+  for (int i = 0; i < width_div16; i++) {
+    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);  // stage-0 shift
+    col_txfm(buf0, buf0, cos_bit_col);                   // in place
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);  // stage-1 shift
+    for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
+      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+    }
+  }
+  // Row pass in 32-bit precision over the kept row groups.
+  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+    __m256i bufA[64];
+    __m256i bufB[64];
+    __m128i *buf = (__m128i *)(buf1 + width * i);  // view as 128-bit halves
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);      // low 8 words -> s32
+      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);  // high 8 words -> s32
+    }
+    av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
+    av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
+    av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);  // first 32 only
+    av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);  // first 32 only
+    // Each group writes 16 * 32 coefficients; output rows appear 32 wide.
+    int32_t *output8 = output + 16 * 32 * i;
+    for (int j = 0; j < 4; ++j) {
+      __m256i *out = (__m256i *)(output8 + 8 * j);
+      transpose_32_8x8_avx2(4, bufA + 8 * j, out);
+      transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);  // 256 int32 later
+    }
+  }
+}
+
+static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;  // bd is not used by this kernel
+  const TX_SIZE tx_size = TX_16X32;
+  __m256i buf0[32], buf1[32];  // buf0: column work area, buf1: transposed
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];  // three shift stages
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+  // Columns use the 16x32 table, rows the 16x16 table.
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // Column pass over the single strip loaded from `input`.
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
+  }
+  round_shift_16bit_w16_avx2(buf0, height, shift[0]);  // stage-0 shift
+  col_txfm(buf0, buf0, cos_bit_col);                   // in place
+  round_shift_16bit_w16_avx2(buf0, height, shift[1]);  // stage-1 shift
+  transpose_16bit_16x16_avx2(buf0, buf1);
+  transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16);
+  // Row pass: one iteration per 16x16 tile produced by the transposes above.
+  for (int i = 0; i < 2; i++) {
+    __m256i *buf;
+    if (lr_flip) {
+      buf = buf0;  // buf0 is free again after the transposes
+      flip_buf_avx2(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row);                   // in place
+    round_shift_16bit_w16_avx2(buf, width, shift[2]);  // stage-2 shift
+    transpose_16bit_16x16_avx2(buf, buf);
+    store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i,
+                                              width, 16);
+  }
+}
+
+static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;  // bd is not used by this kernel
+  __m256i buf0[32], buf1[64];  // buf0: column work area, buf1: transposed
+  const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];  // three shift stages
+  const int txw_idx = get_txw_idx(TX_32X16);
+  const int txh_idx = get_txh_idx(TX_32X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;   // hard-coded here rather than read from a table
+  const int height = 16;  // hard-coded here rather than read from a table
+  const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+  // NOTE(review): row_txfm16x32_arr is NULL for ADST-family types; unchecked.
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // Column pass: one iteration per 16-column strip of the input.
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+                                           height);
+    } else {
+      load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    }
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);  // stage-0 shift
+    col_txfm(buf0, buf0, cos_bit_col);                   // in place
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);  // stage-1 shift
+    transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+  }
+  // Row pass over the 32 transposed vectors, done once.
+  __m256i *buf;
+  if (lr_flip) {
+    buf = buf0;  // buf0 is free once the column pass has finished
+    flip_buf_avx2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);                   // in place
+  round_shift_16bit_w16_avx2(buf, width, shift[2]);  // stage-2 shift
+  transpose_16bit_16x16_avx2(buf, buf);  // first 16 vectors
+  store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, width, 16);
+  // Remaining 16 vectors go 16 coefficients further along each output row.
+  transpose_16bit_16x16_avx2(buf + 16, buf + 16);
+  store_rect_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16, width, 16);
+}
+
+static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;  // bd is not used by this kernel
+  const TX_SIZE tx_size = TX_64X32;
+  __m256i buf0[64], buf1[256];  // buf0: one column strip, buf1: transposed
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];  // three shift stages
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+  const int width_div16 = (width >> 4);
+  const int height_div16 = (height >> 4);
+  // Column pass: one iteration per 16-column strip; no flip handling here.
+  for (int i = 0; i < width_div16; i++) {
+    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);  // stage-0 shift
+    col_txfm(buf0, buf0, cos_bit_col);                   // in place
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);  // stage-1 shift
+    for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
+      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+    }
+  }
+  assert(tx_type == DCT_DCT);  // the row pass below is a fixed 64-point DCT
+  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+    __m256i bufA[64];
+    __m256i bufB[64];
+    __m128i *buf = (__m128i *)(buf1 + width * i);  // view as 128-bit halves
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);      // low 8 words -> s32
+      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);  // high 8 words -> s32
+    }
+    av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
+    av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
+    av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]);  // 32 kept
+    av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]);  // 32 kept
+    // Each group writes 16 * 32 coefficients; output rows appear 32 wide.
+    int32_t *output8 = output + 16 * 32 * i;
+    for (int j = 0; j < 4; ++j) {
+      __m256i *out = (__m256i *)(output8 + 8 * j);
+      transpose_32_8x8_avx2(4, bufA + 8 * j, out);
+      transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);  // 256 int32 later
+    }
+  }
+}
+
+static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;       // bd is not used by this kernel
+  (void)tx_type;  // only read by the assert, which vanishes under NDEBUG
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_32X64;
+  __m256i buf0[64], buf1[256];  // buf0: one column strip, buf1: transposed
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];  // three shift stages
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+  const int width_div16 = (width >> 4);
+  const int height_div16 = (height >> 4);
+  // Column pass per 16-column strip; at most 2 groups of 16 rows are kept.
+  for (int i = 0; i < width_div16; i++) {
+    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);  // stage-0 shift
+    col_txfm(buf0, buf0, cos_bit_col);                   // in place
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);  // stage-1 shift
+    for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
+      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+    }
+  }
+  // Row pass in 32-bit precision with the 32-point DCT.
+  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+    __m256i bufA[32];
+    __m256i bufB[32];
+    __m128i *buf = (__m128i *)(buf1 + width * i);  // view as 128-bit halves
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);      // low 8 words -> s32
+      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);  // high 8 words -> s32
+    }
+    av1_fdct32_new_avx2(bufA, bufA, cos_bit_row);
+    av1_fdct32_new_avx2(bufB, bufB, cos_bit_row);
+    av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]);
+    av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]);
+    // Each group writes 16 * 32 coefficients; output rows appear 32 wide.
+    int32_t *output8 = output + 16 * 32 * i;
+    for (int j = 0; j < 4; ++j) {
+      __m256i *out = (__m256i *)(output8 + 8 * j);
+      transpose_32_8x8_avx2(4, bufA + 8 * j, out);
+      transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);  // 256 int32 later
+    }
+  }
+}
+
+static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;       // bd is not used by this kernel
+  (void)tx_type;  // only read by the assert, which vanishes under NDEBUG
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_16X64;
+  __m256i buf0[64], buf1[64];  // buf0: column work area, buf1: transposed
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];  // three shift stages
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+  const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
+  const int width_div16 = (width >> 4);
+  const int height_div16 = (height >> 4);
+  // Column pass: 64-point DCT per strip, every 16-row group is transposed.
+  for (int i = 0; i < width_div16; i++) {
+    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);  // stage-0 shift
+    col_txfm(buf0, buf0, cos_bit_col);                   // in place
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);  // stage-1 shift
+    for (int j = 0; j < height_div16; ++j) {
+      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+    }
+  }
+  // NOTE(review): if width == 16, groups i = 2, 3 land in the memset range.
+  for (int i = 0; i < AOMMIN(4, height_div16); i++) {
+    __m256i *buf = buf1 + width * i;
+    row_txfm(buf, buf, cos_bit_row);                   // in place
+    round_shift_16bit_w16_avx2(buf, width, shift[2]);  // stage-2 shift
+    int32_t *output16 = output + 16 * width * i;
+    for (int j = 0; j < width_div16; ++j) {
+      __m256i *buf16 = buf + 16 * j;
+      transpose_16bit_16x16_avx2(buf16, buf16);
+      store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, width, 16);
+    }
+  }
+  // Zero out the bottom 16x32 area (the 512 coefficients after the first 512).
+  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;       // bd is not used by this kernel
+  (void)tx_type;  // only read by the assert, which vanishes under NDEBUG
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X16;
+  __m256i buf0[64], buf1[64];  // buf0: column work area, buf1: transposed
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];  // three shift stages
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
+  const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
+  const int width_div16 = (width >> 4);
+  const int height_div16 = (height >> 4);
+  // Column pass: 16-point DCT on each 16-column strip.
+  for (int i = 0; i < width_div16; i++) {
+    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);  // stage-0 shift
+    col_txfm(buf0, buf0, cos_bit_col);                   // in place
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);  // stage-1 shift
+    for (int j = 0; j < height_div16; ++j) {
+      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+    }
+  }
+  // Row pass: 64-point DCT; only two 16-vector tiles are stored per group.
+  for (int i = 0; i < height_div16; i++) {
+    __m256i *buf = buf1 + width * i;
+    row_txfm(buf, buf, cos_bit_row);                   // in place
+    round_shift_16bit_w16_avx2(buf, width, shift[2]);  // stage-2 shift
+    int32_t *output16 = output + 16 * 32 * i;  // output row stride is 32
+    for (int j = 0; j < 2; ++j) {
+      __m256i *buf16 = buf + 16 * j;
+      transpose_16bit_16x16_avx2(buf16, buf16);
+      store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, 32, 16);
+    }
+  }
+}
+
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {  // by tx_size
+  av1_lowbd_fwd_txfm2d_4x4_sse2,   // 4x4 transform (SSE2 kernel)
+  av1_lowbd_fwd_txfm2d_8x8_sse2,   // 8x8 transform (SSE2 kernel)
+  lowbd_fwd_txfm2d_16x16_avx2,     // 16x16 transform
+  lowbd_fwd_txfm2d_32x32_avx2,     // 32x32 transform
+  lowbd_fwd_txfm2d_64x64_avx2,     // 64x64 transform
+  av1_lowbd_fwd_txfm2d_4x8_sse2,   // 4x8 transform (SSE2 kernel)
+  av1_lowbd_fwd_txfm2d_8x4_sse2,   // 8x4 transform (SSE2 kernel)
+  av1_lowbd_fwd_txfm2d_8x16_sse2,  // 8x16 transform (SSE2 kernel)
+  av1_lowbd_fwd_txfm2d_16x8_sse2,  // 16x8 transform (SSE2 kernel)
+  lowbd_fwd_txfm2d_16x32_avx2,     // 16x32 transform
+  lowbd_fwd_txfm2d_32x16_avx2,     // 32x16 transform
+  lowbd_fwd_txfm2d_32x64_avx2,     // 32x64 transform
+  lowbd_fwd_txfm2d_64x32_avx2,     // 64x32 transform
+  av1_lowbd_fwd_txfm2d_4x16_sse2,  // 4x16 transform (SSE2 kernel)
+  av1_lowbd_fwd_txfm2d_16x4_sse2,  // 16x4 transform (SSE2 kernel)
+  av1_lowbd_fwd_txfm2d_8x32_sse2,  // 8x32 transform (SSE2 kernel)
+  av1_lowbd_fwd_txfm2d_32x8_sse2,  // 32x8 transform (SSE2 kernel)
+  lowbd_fwd_txfm2d_16x64_avx2,     // 16x64 transform
+  lowbd_fwd_txfm2d_64x16_avx2,     // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
+                             int diff_stride, TxfmParam *txfm_param) {
+  FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+  if ((fwd_txfm2d_func == NULL) ||  // no kernel registered for this size
+      (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
+    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);  // C path
+  } else {
+    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+                    txfm_param->bd);  // per-size kernel from the table
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
new file mode 100644
index 000000000..c582ca0e3
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_FWD_TXFM_AVX2_H_
+#define AV1_FWD_TXFM_AVX2_H_
+#include <immintrin.h>
+
+static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
+  __m256i tmp, round;
+  round = _mm256_set1_epi32(1 << (bit - 1));  // half an LSB; needs bit >= 1
+  tmp = _mm256_add_epi32(vec, round);         // per 32-bit lane
+  return _mm256_srai_epi32(tmp, bit);         // arithmetic shift keeps sign
+}
+
+// out0 = (in0*w0 + in1*w1 + _r) >> cos_bit
+// out1 = (in0*w1 - in1*w0 + _r) >> cos_bit; both are written back in place
+static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  __m256i _in0 = *in0;  // copies taken first because *in0 / *in1 are outputs
+  __m256i _in1 = *in1;
+  const __m256i ww0 = _mm256_set1_epi32(w0);  // broadcast scalar weights
+  const __m256i ww1 = _mm256_set1_epi32(w1);
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);  // low 32 bits kept
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);  // caller-supplied rounding term
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+  const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+  __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+  temp1 = _mm256_add_epi32(temp1, _r);
+  *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  __m256i _in0 = *in0;  // out0 = (in0*w0 + in1*w1 + _r) >> cos_bit
+  __m256i _in1 = *in1;  // out1 = (in1*w0 - in0*w1 + _r) >> cos_bit
+  const __m256i ww0 = _mm256_set1_epi32(w0);  // broadcast scalar weights
+  const __m256i ww1 = _mm256_set1_epi32(w1);
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);  // low 32 bits kept
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);  // caller-supplied rounding term
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+  const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+  __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1);  // opposite sign to type0
+  temp1 = _mm256_add_epi32(temp1, _r);
+  *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = (in0*w0 + in1*w1 + _r) >> cos_bit
+// out1 = (in0*w1 - in1*w0 + _r) >> cos_bit; weights arrive as vectors
+static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  __m256i _in0 = *in0;  // copies taken first because *in0 / *in1 are outputs
+  __m256i _in1 = *in1;
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);  // low 32 bits kept
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);  // caller-supplied rounding term
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+  const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+  __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+  temp1 = _mm256_add_epi32(temp1, _r);
+  *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = (in0*w0 + in1*w1 + _r) >> cos_bit
+// out1 = (in1*w0 - in0*w1 + _r) >> cos_bit; weights arrive as vectors
+static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  __m256i _in0 = *in0;  // copies taken first because *in0 / *in1 are outputs
+  __m256i _in1 = *in1;
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);  // low 32 bits kept
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);  // caller-supplied rounding term
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+  const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+  __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1);  // opposite sign to type0
+  temp1 = _mm256_add_epi32(temp1, _r);
+  *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+#endif  // AV1_FWD_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
index 381f757da..93f37b71d 100644
--- a/third_party/aom/av1/encoder/x86/corner_match_sse4.c
+++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include <stdlib.h>
 #include <memory.h>
 #include <math.h>
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
new file mode 100644
index 000000000..f776e84c7
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c. Consumes 16 samples per loop iteration.
+ */
+uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
+                                           const uint8_t *m, int N) {
+  int n = -N;  // negative index that counts up to 0 (pointers biased below)
+
+  uint64_t csse;
+
+  const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
+  const __m256i v_zext_q = _mm256_set1_epi64x(0xffffffff);  // low-dword mask
+
+  __m256i v_acc0_q = _mm256_setzero_si256();  // four 64-bit partial sums
+  // NOTE(review): the do/while below assumes N > 0; the assert allows N == 0.
+  assert(N % 64 == 0);
+  // Bias the pointers to the end so that ptr[n] walks forward as n -> 0.
+  r1 += N;
+  d += N;
+  m += N;
+
+  do {
+    const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n));
+    const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n));
+    const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n));
+    // Interleave d with r1; widen the 16 mask bytes to 16-bit words.
+    const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w);
+    const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w);
+    const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b);
+    // Interleave m with MAX_MASK_VALUE, matching the (d, r1) pairing above.
+    const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w);
+    const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w);
+    // Per sample: d * m + r1 * MAX_MASK_VALUE, as a 32-bit value.
+    const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w);
+    const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w);
+    // Narrow back to 16 bits with signed saturation.
+    const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d);
+    // Square each value and add adjacent pairs into 32-bit lanes.
+    const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w);
+    // Zero-extend both 32-bit halves of every qword to 64 bits and add them.
+    const __m256i v_sum0_q = _mm256_add_epi64(
+        _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32));
+
+    v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q);
+
+    n += 16;
+  } while (n);
+  // Horizontal add: upper qword of each 128-bit lane, then the two lanes.
+  v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8));
+  __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q);
+  __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1);
+  v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+#if ARCH_X86_64
+  csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
+#else
+  xx_storel_64(&csse, v_acc_q_0);
+#endif
+
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c. Returns sum(ds[i] * m[i]) > limit.
+ */
+int av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
+                                       int N, int64_t limit) {
+  int64_t acc;
+  __m256i v_acc0_d = _mm256_setzero_si256();  // eight 32-bit partial sums
+
+  // Input size limited to 8192 by the use of 32 bit accumulators and m
+  // being between [0, 64]. Overflow might happen at larger sizes,
+  // though it is practically impossible on real video input.
+  assert(N < 8192);
+  assert(N % 64 == 0);
+  // NOTE(review): the do/while assumes N > 0; N == 0 passes both asserts.
+  do {
+    const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m));
+    const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32));
+
+    const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds));
+    const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16));
+    const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32));
+    const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48));
+    // Zero-extend the 64 mask bytes to 16-bit words, 16 at a time.
+    const __m256i v_m0_w =
+        _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b));
+    const __m256i v_m1_w =
+        _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1));
+    const __m256i v_m2_w =
+        _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b));
+    const __m256i v_m3_w =
+        _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1));
+    // ds * m, with adjacent products summed into 32-bit lanes.
+    const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w);
+    const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w);
+    const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w);
+    const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w);
+
+    const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d);
+    const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d);
+
+    const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d);
+
+    v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d);
+
+    ds += 64;
+    m += 64;
+
+    N -= 64;
+  } while (N);
+  // Sign-extend the 32-bit sums to 64 bits by pairing each with its sign mask.
+  __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31);
+  v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d),
+                              _mm256_unpackhi_epi32(v_acc0_d, v_sign_d));
+  // Horizontal add: upper qword of each 128-bit lane, then the two lanes.
+  __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8));
+
+  __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q);
+  __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1);
+  v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+
+#if ARCH_X86_64
+  acc = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
+#else
+  xx_storel_64(&acc, v_acc_q_0);
+#endif
+
+  return acc > limit;
+}
+
+/**
+ * See av1_wedge_compute_delta_squares_c. d[i] = sat16(a[i]^2 - b[i]^2).
+ */
+void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a,
+                                          const int16_t *b, int N) {
+  const __m256i v_neg_w = _mm256_set1_epi32(0xffff0001);  // words (+1, -1)
+  // NOTE(review): the do/while assumes N > 0; N == 0 passes the assert.
+  assert(N % 64 == 0);
+
+  do {
+    const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a));
+    const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b));
+    const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16));
+    const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16));
+    const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32));
+    const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32));
+    const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48));
+    const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48));
+    // Interleave into (a, b) word pairs, a in the low word of each dword.
+    const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w);
+    const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w);
+    const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w);
+    const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w);
+    const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w);
+    const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w);
+    const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w);
+    const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w);
+
+    // Negate top word of pairs, giving (a, -b)
+    const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w);
+    const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w);
+    const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w);
+    const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w);
+    const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w);
+    const __m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w);
+    const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w);
+    const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w);
+    // (a, b) . (a, -b) = a*a - b*b, one 32-bit result per sample.
+    const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w);
+    const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w);
+    const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w);
+    const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w);
+    const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w);
+    const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w);
+    const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w);
+    const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w);
+    // Narrow to 16 bits with signed saturation.
+    const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w);
+    const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w);
+    const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w);
+    const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w);
+    // NOTE(review): aligned stores, so d must be 32-byte aligned (a, b need not).
+    _mm256_store_si256((__m256i *)(d), v_r0_w);
+    _mm256_store_si256((__m256i *)(d + 16), v_r1_w);
+    _mm256_store_si256((__m256i *)(d + 32), v_r2_w);
+    _mm256_store_si256((__m256i *)(d + 48), v_r3_w);
+
+    a += 64;
+    b += 64;
+    d += 64;
+    N -= 64;
+  } while (N);
+}
diff --git a/third_party/aom/build/cmake/aom_config_defaults.cmake b/third_party/aom/build/cmake/aom_config_defaults.cmake
index c7252f064..19af5c43b 100644
--- a/third_party/aom/build/cmake/aom_config_defaults.cmake
+++ b/third_party/aom/build/cmake/aom_config_defaults.cmake
@@ -76,6 +76,7 @@ set(CONFIG_MISMATCH_DEBUG 0 CACHE NUMBER "Mismatch debugging flag.")
 set(CONFIG_ACCOUNTING 0 CACHE NUMBER "Enables bit accounting.")
 set(CONFIG_ANALYZER 0 CACHE NUMBER "Enables bit stream analyzer.")
 set(CONFIG_COEFFICIENT_RANGE_CHECKING 0 CACHE NUMBER "Coefficient range check.")
+set(CONFIG_DENOISE 0 CACHE NUMBER "Denoise/noise modeling support in encoder.")
 set(CONFIG_FILEOPTIONS 1 CACHE NUMBER "Enables encoder config file support.")
 set(CONFIG_INSPECTION 0 CACHE NUMBER "Enables bitstream inspection.")
 set(CONFIG_INTERNAL_STATS 0 CACHE NUMBER "Enables internal encoder stats.")
diff --git a/third_party/aom/build/cmake/aom_configure.cmake b/third_party/aom/build/cmake/aom_configure.cmake
index 5d782aaf9..a12389778 100644
--- a/third_party/aom/build/cmake/aom_configure.cmake
+++ b/third_party/aom/build/cmake/aom_configure.cmake
@@ -40,14 +40,6 @@ include("${AOM_ROOT}/build/cmake/compiler_flags.cmake")
 include("${AOM_ROOT}/build/cmake/compiler_tests.cmake")
 include("${AOM_ROOT}/build/cmake/util.cmake")
 
-# Build a list of all configurable variables.
-get_cmake_property(cmake_cache_vars CACHE_VARIABLES)
-foreach(var ${cmake_cache_vars})
-  if("${var}" MATCHES "^CONFIG_")
-    list(APPEND AOM_CONFIG_VARS ${var})
-  endif()
-endforeach()
-
 # Detect target CPU.
 if(NOT AOM_TARGET_CPU)
   if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64" OR
@@ -269,7 +261,7 @@ else()
   add_compiler_flag_if_supported("-Wlogical-op")
   add_compiler_flag_if_supported("-Wpointer-arith")
   add_compiler_flag_if_supported("-Wsign-compare")
-  add_compiler_flag_if_supported("-Wstack-usage=320000")
+  add_compiler_flag_if_supported("-Wstack-usage=360000")
   add_compiler_flag_if_supported("-Wstring-conversion")
   add_compiler_flag_if_supported("-Wtype-limits")
   add_compiler_flag_if_supported("-Wuninitialized")
@@ -334,9 +326,6 @@ if(NOT PERL_FOUND)
   message(FATAL_ERROR "Perl is required to build libaom.")
 endif()
 
-configure_file("${AOM_CONFIG_DIR}/rtcd_config.cmake"
-               "${AOM_CONFIG_DIR}/${AOM_TARGET_CPU}_rtcd_config.rtcd")
-
 set(AOM_RTCD_CONFIG_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
     "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
     "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl")
@@ -355,13 +344,12 @@ foreach(NUM RANGE ${AOM_RTCD_CUSTOM_COMMAND_COUNT})
   list(GET AOM_RTCD_HEADER_FILE_LIST ${NUM} AOM_RTCD_HEADER_FILE)
   list(GET AOM_RTCD_SOURCE_FILE_LIST ${NUM} AOM_RTCD_SOURCE_FILE)
   list(GET AOM_RTCD_SYMBOL_LIST ${NUM} AOM_RTCD_SYMBOL)
-  execute_process(
-    COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/make/rtcd.pl"
-            --arch=${AOM_TARGET_CPU}
-            --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS}
-            --config=${AOM_CONFIG_DIR}/${AOM_TARGET_CPU}_rtcd_config.rtcd
-            ${AOM_RTCD_CONFIG_FILE}
-    OUTPUT_FILE ${AOM_RTCD_HEADER_FILE})
+  execute_process(COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/make/rtcd.pl"
+                          --arch=${AOM_TARGET_CPU}
+                          --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS}
+                          --config=${AOM_CONFIG_DIR}/config/aom_config.h
+                          ${AOM_RTCD_CONFIG_FILE}
+                  OUTPUT_FILE ${AOM_RTCD_HEADER_FILE})
 endforeach()
 
 # Generate aom_version.h.
diff --git a/third_party/aom/build/cmake/aom_optimization.cmake b/third_party/aom/build/cmake/aom_optimization.cmake
index 069ea1bb9..ce3dc0340 100644
--- a/third_party/aom/build/cmake/aom_optimization.cmake
+++ b/third_party/aom/build/cmake/aom_optimization.cmake
@@ -197,16 +197,16 @@ endfunction()
 # include file, $source is the C source file, and $symbol is used for the symbol
 # argument passed to rtcd.pl.
 function(add_rtcd_build_step config output source symbol)
-  add_custom_command(
-    OUTPUT ${output}
-    COMMAND ${PERL_EXECUTABLE} ARGS "${AOM_ROOT}/build/make/rtcd.pl"
-            --arch=${AOM_TARGET_CPU}
-            --sym=${symbol} ${AOM_RTCD_FLAGS}
-            --config=${AOM_CONFIG_DIR}/${AOM_TARGET_CPU}_rtcd_config.rtcd
-            ${config} > ${output}
-    DEPENDS ${config}
-    COMMENT "Generating ${output}"
-    WORKING_DIRECTORY ${AOM_CONFIG_DIR} VERBATIM)
+  add_custom_command(OUTPUT ${output}
+                     COMMAND ${PERL_EXECUTABLE} ARGS
+                             "${AOM_ROOT}/build/make/rtcd.pl"
+                             --arch=${AOM_TARGET_CPU}
+                             --sym=${symbol} ${AOM_RTCD_FLAGS}
+                             --config=${AOM_CONFIG_DIR}/config/aom_config.h
+                             ${config} > ${output}
+                     DEPENDS ${config}
+                     COMMENT "Generating ${output}"
+                     WORKING_DIRECTORY ${AOM_CONFIG_DIR} VERBATIM)
   set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output})
   set_property(SOURCE ${output} PROPERTY GENERATED)
 endfunction()
diff --git a/third_party/aom/build/cmake/cpu.cmake b/third_party/aom/build/cmake/cpu.cmake
index 6f866d04d..6e8089e63 100644
--- a/third_party/aom/build/cmake/cpu.cmake
+++ b/third_party/aom/build/cmake/cpu.cmake
@@ -91,9 +91,3 @@ elseif("${AOM_TARGET_CPU}" MATCHES "^x86")
     endif()
   endforeach()
 endif()
-
-foreach(config_var ${AOM_CONFIG_VARS})
-  if(${${config_var}})
-    set(RTCD_${config_var} yes)
-  endif()
-endforeach()
diff --git a/third_party/aom/build/cmake/generate_aom_config_templates.cmake b/third_party/aom/build/cmake/generate_aom_config_templates.cmake
index 6ea02295c..8fbb4737b 100644
--- a/third_party/aom/build/cmake/generate_aom_config_templates.cmake
+++ b/third_party/aom/build/cmake/generate_aom_config_templates.cmake
@@ -98,13 +98,3 @@ foreach(cache_var ${cmake_cache_vars})
                 "${cache_var} equ \${${cache_var}}\n")
   endif()
 endforeach()
-
-set(aom_rtcd_config_template "${AOM_CONFIG_DIR}/rtcd_config.cmake")
-file(WRITE "${aom_rtcd_config_template}" ${cmake_file_header_block})
-foreach(cache_var ${cmake_cache_vars})
-  if(NOT "${cache_var}" MATCHES "AOM_CONFIG_DIR\|AOM_ROOT\|^CMAKE_\|INLINE")
-    file(APPEND "${aom_rtcd_config_template}"
-                "${cache_var}=\${RTCD_${cache_var}}\n")
-  endif()
-endforeach()
-
diff --git a/third_party/aom/build/make/iosbuild.sh b/third_party/aom/build/make/iosbuild.sh
index 75f0b1b08..167ece200 100755
--- a/third_party/aom/build/make/iosbuild.sh
+++ b/third_party/aom/build/make/iosbuild.sh
@@ -245,7 +245,7 @@ build_framework() {
 # Trap function. Cleans up the subtree used to build all targets contained in
 # $TARGETS.
 cleanup() {
-  local readonly res=$?
+  local res=$?
   cd "${ORIG_PWD}"
 
   if [ $res -ne 0 ]; then
diff --git a/third_party/aom/build/make/rtcd.pl b/third_party/aom/build/make/rtcd.pl
index 8d8be25c0..b849a1eba 100755
--- a/third_party/aom/build/make/rtcd.pl
+++ b/third_party/aom/build/make/rtcd.pl
@@ -58,11 +58,15 @@ open CONFIG_FILE, $opts{config} or
 
 my %config = ();
 while (<CONFIG_FILE>) {
-  next if !/^(?:CONFIG_|HAVE_)/;
+  next if !/^#define\s+(?:CONFIG_|HAVE_)/;
   chomp;
-  s/\r$//;
-  my @pair = split /=/;
-  $config{$pair[0]} = $pair[1];
+  my @line_components = split /\s+/;
+  scalar @line_components > 2 or
+    die "Invalid input passed to rtcd.pl via $opts{config}.";
+  # $line_components[0] = #define (fields are split on runs of whitespace)
+  # $line_components[1] = flag name (CONFIG_SOMETHING or HAVE_SOMETHING)
+  # $line_components[2] = flag value (0 or 1)
+  $config{$line_components[1]} = "$line_components[2]" eq "1" ? "yes" : "";
 }
 close CONFIG_FILE;
 
@@ -415,19 +419,11 @@ if ($opts{arch} eq 'x86') {
   x86;
 } elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') {
   @ALL_ARCHS = filter("$opts{arch}");
-  open CONFIG_FILE, $opts{config} or
-    die "Error opening config file '$opts{config}': $!\n";
-  while (<CONFIG_FILE>) {
-    if (/HAVE_DSPR2=yes/) {
-      @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
-      last;
-    }
-    if (/HAVE_MSA=yes/) {
-      @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
-      last;
-    }
+  if (aom_config("HAVE_DSPR2") eq "yes") {
+    @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
+  } elsif (aom_config("HAVE_MSA") eq "yes") {
+    @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
   }
-  close CONFIG_FILE;
   mips;
 } elsif ($opts{arch} =~ /armv7\w?/) {
   @ALL_ARCHS = filter(qw/neon/);
@@ -466,4 +462,4 @@ Options:
   --disable-EXT     Disable support for EXT extensions
   --require-EXT     Require support for EXT extensions
   --sym=SYMBOL      Unique symbol to use for RTCD initialization function
-  --config=FILE     File with CONFIG_FOO=yes lines to parse
+  --config=FILE     Path to file containing C preprocessor directives to parse
diff --git a/third_party/aom/common/tools_common.c b/third_party/aom/common/tools_common.c
index 359ec7341..21cd80026 100644
--- a/third_party/aom/common/tools_common.c
+++ b/third_party/aom/common/tools_common.c
@@ -236,7 +236,7 @@ double sse_to_psnr(double samples, double peak, double sse) {
 }
 
 // TODO(debargha): Consolidate the functions below into a separate file.
-static void highbd_img_upshift(aom_image_t *dst, aom_image_t *src,
+static void highbd_img_upshift(aom_image_t *dst, const aom_image_t *src,
                                int input_shift) {
   // Note the offset is 1 less than half.
   const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
@@ -262,8 +262,8 @@ static void highbd_img_upshift(aom_image_t *dst, aom_image_t *src,
       h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
     }
     for (y = 0; y < h; y++) {
-      uint16_t *p_src =
-          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      const uint16_t *p_src =
+          (const uint16_t *)(src->planes[plane] + y * src->stride[plane]);
       uint16_t *p_dst =
           (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
       for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset;
@@ -271,7 +271,7 @@ static void highbd_img_upshift(aom_image_t *dst, aom_image_t *src,
   }
 }
 
-static void lowbd_img_upshift(aom_image_t *dst, aom_image_t *src,
+static void lowbd_img_upshift(aom_image_t *dst, const aom_image_t *src,
                               int input_shift) {
   // Note the offset is 1 less than half.
   const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
@@ -297,7 +297,7 @@ static void lowbd_img_upshift(aom_image_t *dst, aom_image_t *src,
       h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
     }
     for (y = 0; y < h; y++) {
-      uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+      const uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
       uint16_t *p_dst =
           (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
       for (x = 0; x < w; x++) {
@@ -307,7 +307,8 @@ static void lowbd_img_upshift(aom_image_t *dst, aom_image_t *src,
   }
 }
 
-void aom_img_upshift(aom_image_t *dst, aom_image_t *src, int input_shift) {
+void aom_img_upshift(aom_image_t *dst, const aom_image_t *src,
+                     int input_shift) {
   if (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
     highbd_img_upshift(dst, src, input_shift);
   } else {
@@ -315,7 +316,7 @@ void aom_img_upshift(aom_image_t *dst, aom_image_t *src, int input_shift) {
   }
 }
 
-void aom_img_truncate_16_to_8(aom_image_t *dst, aom_image_t *src) {
+void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src) {
   int plane;
   if (dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH != src->fmt || dst->d_w != src->d_w ||
       dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift ||
@@ -337,8 +338,8 @@ void aom_img_truncate_16_to_8(aom_image_t *dst, aom_image_t *src) {
       h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
     }
     for (y = 0; y < h; y++) {
-      uint16_t *p_src =
-          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      const uint16_t *p_src =
+          (const uint16_t *)(src->planes[plane] + y * src->stride[plane]);
       uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
       for (x = 0; x < w; x++) {
         *p_dst++ = (uint8_t)(*p_src++);
@@ -347,7 +348,7 @@ void aom_img_truncate_16_to_8(aom_image_t *dst, aom_image_t *src) {
   }
 }
 
-static void highbd_img_downshift(aom_image_t *dst, aom_image_t *src,
+static void highbd_img_downshift(aom_image_t *dst, const aom_image_t *src,
                                  int down_shift) {
   int plane;
   if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
@@ -371,8 +372,8 @@ static void highbd_img_downshift(aom_image_t *dst, aom_image_t *src,
       h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
     }
     for (y = 0; y < h; y++) {
-      uint16_t *p_src =
-          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      const uint16_t *p_src =
+          (const uint16_t *)(src->planes[plane] + y * src->stride[plane]);
       uint16_t *p_dst =
           (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
       for (x = 0; x < w; x++) *p_dst++ = *p_src++ >> down_shift;
@@ -380,7 +381,7 @@ static void highbd_img_downshift(aom_image_t *dst, aom_image_t *src,
   }
 }
 
-static void lowbd_img_downshift(aom_image_t *dst, aom_image_t *src,
+static void lowbd_img_downshift(aom_image_t *dst, const aom_image_t *src,
                                 int down_shift) {
   int plane;
   if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
@@ -404,8 +405,8 @@ static void lowbd_img_downshift(aom_image_t *dst, aom_image_t *src,
       h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
     }
     for (y = 0; y < h; y++) {
-      uint16_t *p_src =
-          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      const uint16_t *p_src =
+          (const uint16_t *)(src->planes[plane] + y * src->stride[plane]);
       uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
       for (x = 0; x < w; x++) {
         *p_dst++ = *p_src++ >> down_shift;
@@ -414,7 +415,8 @@ static void lowbd_img_downshift(aom_image_t *dst, aom_image_t *src,
   }
 }
 
-void aom_img_downshift(aom_image_t *dst, aom_image_t *src, int down_shift) {
+void aom_img_downshift(aom_image_t *dst, const aom_image_t *src,
+                       int down_shift) {
   if (dst->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
     highbd_img_downshift(dst, src, down_shift);
   } else {
diff --git a/third_party/aom/common/tools_common.h b/third_party/aom/common/tools_common.h
index abee4ea63..587903650 100644
--- a/third_party/aom/common/tools_common.h
+++ b/third_party/aom/common/tools_common.h
@@ -152,9 +152,10 @@ void aom_img_write(const aom_image_t *img, FILE *file);
 int aom_img_read(aom_image_t *img, FILE *file);
 
 double sse_to_psnr(double samples, double peak, double mse);
-void aom_img_upshift(aom_image_t *dst, aom_image_t *src, int input_shift);
-void aom_img_downshift(aom_image_t *dst, aom_image_t *src, int down_shift);
-void aom_img_truncate_16_to_8(aom_image_t *dst, aom_image_t *src);
+void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift);
+void aom_img_downshift(aom_image_t *dst, const aom_image_t *src,
+                       int down_shift);
+void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src);
 
 #ifdef __cplusplus
 } /* extern "C" */
diff --git a/third_party/aom/common/video_common.h b/third_party/aom/common/video_common.h
index f96af4b7e..965038d39 100644
--- a/third_party/aom/common/video_common.h
+++ b/third_party/aom/common/video_common.h
@@ -19,6 +19,7 @@ typedef struct {
   int frame_width;
   int frame_height;
   struct AvxRational time_base;
+  unsigned int is_annexb;
 } AvxVideoInfo;
 
 #endif  // VIDEO_COMMON_H_
diff --git a/third_party/aom/common/video_reader.c b/third_party/aom/common/video_reader.c
index f5327c928..b54c250c2 100644
--- a/third_party/aom/common/video_reader.c
+++ b/third_party/aom/common/video_reader.c
@@ -8,19 +8,20 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#include "common/video_reader.h"
-
 #include <stdlib.h>
 #include <string.h>
+#include <assert.h>
 
 #include "aom_ports/mem_ops.h"
 #include "common/ivfdec.h"
-
-static const char *const kIVFSignature = "DKIF";
+#include "common/obudec.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
 
 struct AvxVideoReaderStruct {
   AvxVideoInfo info;
-  FILE *file;
+  struct AvxInputContext input_ctx;
+  struct ObuDecInputContext obu_ctx;
   uint8_t *buffer;
   size_t buffer_size;
   size_t frame_size;
@@ -28,42 +29,64 @@ struct AvxVideoReaderStruct {
 };
 
 AvxVideoReader *aom_video_reader_open(const char *filename) {
-  char header[32];
   AvxVideoReader *reader = NULL;
   FILE *const file = fopen(filename, "rb");
   if (!file) return NULL;  // Can't open file
 
-  if (fread(header, 1, 32, file) != 32) return NULL;  // Can't read file header
-
-  if (memcmp(kIVFSignature, header, 4) != 0)
-    return NULL;  // Wrong IVF signature
-
-  if (mem_get_le16(header + 4) != 0) return NULL;  // Wrong IVF version
-
   reader = (AvxVideoReader *)calloc(1, sizeof(*reader));
-  if (!reader) return NULL;  // Can't allocate AvxVideoReader
+  if (!reader) {
+    fclose(file);
+    return NULL;  // Can't allocate AvxVideoReader
+  }
 
-  reader->file = file;
-  reader->info.codec_fourcc = mem_get_le32(header + 8);
-  reader->info.frame_width = mem_get_le16(header + 12);
-  reader->info.frame_height = mem_get_le16(header + 14);
-  reader->info.time_base.numerator = mem_get_le32(header + 16);
-  reader->info.time_base.denominator = mem_get_le32(header + 20);
+  reader->input_ctx.filename = filename;
+  reader->input_ctx.file = file;
+  reader->obu_ctx.avx_ctx = &reader->input_ctx;
+  reader->obu_ctx.is_annexb = 1;
+
+  if (file_is_ivf(&reader->input_ctx)) {
+    reader->input_ctx.file_type = FILE_TYPE_IVF;
+    reader->info.codec_fourcc = reader->input_ctx.fourcc;
+    reader->info.frame_width = reader->input_ctx.width;
+    reader->info.frame_height = reader->input_ctx.height;
+  } else if (file_is_obu(&reader->obu_ctx)) {
+    reader->input_ctx.file_type = FILE_TYPE_OBU;
+    // assume AV1
+    reader->info.codec_fourcc = AV1_FOURCC;
+    reader->info.is_annexb = reader->obu_ctx.is_annexb;
+  } else {
+    fclose(file);
+    free(reader);
+    return NULL;  // Unknown file type
+  }
 
   return reader;
 }
 
 void aom_video_reader_close(AvxVideoReader *reader) {
   if (reader) {
-    fclose(reader->file);
+    fclose(reader->input_ctx.file);
+    if (reader->input_ctx.file_type == FILE_TYPE_OBU) {
+      obudec_free(&reader->obu_ctx);
+    }
     free(reader->buffer);
     free(reader);
   }
 }
 
 int aom_video_reader_read_frame(AvxVideoReader *reader) {
-  return !ivf_read_frame(reader->file, &reader->buffer, &reader->frame_size,
-                         &reader->buffer_size, &reader->pts);
+  if (reader->input_ctx.file_type == FILE_TYPE_IVF) {
+    return !ivf_read_frame(reader->input_ctx.file, &reader->buffer,
+                           &reader->frame_size, &reader->buffer_size,
+                           &reader->pts);
+  } else if (reader->input_ctx.file_type == FILE_TYPE_OBU) {
+    return !obudec_read_temporal_unit(&reader->obu_ctx, &reader->buffer,
+                                      &reader->frame_size,
+                                      &reader->buffer_size);
+  } else {
+    assert(0);
+    return 0;
+  }
 }
 
 const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader,
@@ -77,7 +100,9 @@ int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader) {
   return (int64_t)reader->pts;
 }
 
-FILE *aom_video_reader_get_file(AvxVideoReader *reader) { return reader->file; }
+FILE *aom_video_reader_get_file(AvxVideoReader *reader) {
+  return reader->input_ctx.file;
+}
 
 const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) {
   return &reader->info;
diff --git a/third_party/aom/examples/aom_cx_set_ref.c b/third_party/aom/examples/aom_cx_set_ref.c
index e02e94c07..fc037d484 100644
--- a/third_party/aom/examples/aom_cx_set_ref.c
+++ b/third_party/aom/examples/aom_cx_set_ref.c
@@ -163,7 +163,7 @@ static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img,
 
         // Copy out first decoded frame, and use it as reference later.
         if (*frame_out == 1 && ext_ref != NULL)
-          if (aom_codec_control(dcodec, AV1_GET_NEW_FRAME_IMAGE, ext_ref))
+          if (aom_codec_control(dcodec, AV1_COPY_NEW_FRAME_IMAGE, ext_ref))
             die_codec(dcodec, "Failed to get decoder new frame");
       }
     }
diff --git a/third_party/aom/examples/inspect.c b/third_party/aom/examples/inspect.c
index 4887fc4a3..9d5f0dcfc 100644
--- a/third_party/aom/examples/inspect.c
+++ b/third_party/aom/examples/inspect.c
@@ -630,7 +630,9 @@ int read_frame() {
     die_codec(&codec, "Failed to decode frame.");
   }
   int got_any_frames = 0;
-  while ((img = aom_codec_get_frame(&codec, &iter))) {
+  aom_image_t *frame_img;
+  while ((frame_img = aom_codec_get_frame(&codec, &iter))) {
+    img = frame_img;
     ++frame_count;
     got_any_frames = 1;
   }
diff --git a/third_party/aom/examples/lightfield_bitstream_parsing.c b/third_party/aom/examples/lightfield_bitstream_parsing.c
index d13f3f172..71d4dec77 100644
--- a/third_party/aom/examples/lightfield_bitstream_parsing.c
+++ b/third_party/aom/examples/lightfield_bitstream_parsing.c
@@ -12,15 +12,14 @@
 // Lightfield Bitstream Parsing
 // ============================
 //
-// This is an lightfield bitstream parsing example. It takes an input file
+// This is a lightfield bitstream parsing example. It takes an input file
 // containing the whole compressed lightfield bitstream(ivf file), and parses it
 // and constructs and outputs a new bitstream that can be decoded by an AV1
-// decoder. The output bitstream contains tile list OBUs. The lf_width and
-// lf_height arguments are the number of lightfield images in each dimension.
-// The lf_blocksize determines the number of reference images used.
+// decoder. The output bitstream contains reference frames(i.e. anchor frames),
+// camera frame header, and tile list OBUs. num_references is the number of
+// anchor frames coded at the beginning of the light field file.
 // After running the lightfield encoder, run lightfield bitstream parsing:
-// examples/lightfield_bitstream_parsing vase10x10.ivf vase_tile_list.ivf 10 10
-// 5
+// examples/lightfield_bitstream_parsing vase10x10.ivf vase_tile_list.ivf 4
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -35,13 +34,13 @@
 #include "common/video_reader.h"
 #include "common/video_writer.h"
 
+#define MAX_TILES 512
+
 static const char *exec_name;
 
 void usage_exit(void) {
-  fprintf(
-      stderr,
-      "Usage: %s <infile> <outfile> <lf_width> <lf_height> <lf_blocksize> \n",
-      exec_name);
+  fprintf(stderr, "Usage: %s <infile> <outfile> <num_references> \n",
+          exec_name);
   exit(EXIT_FAILURE);
 }
 
@@ -95,39 +94,36 @@ const TILE_LIST_INFO tile_list[2][9] = {
     { 50, 2, 5, 4 } },
 };
 
+static int get_image_bps(aom_img_fmt_t fmt) {  // Bits per pixel for |fmt|.
+  switch (fmt) {
+    case AOM_IMG_FMT_I420: return 12;
+    case AOM_IMG_FMT_I422: return 16;
+    case AOM_IMG_FMT_I444: return 24;
+    case AOM_IMG_FMT_I42016: return 24;
+    case AOM_IMG_FMT_I42216: return 32;
+    case AOM_IMG_FMT_I44416: return 48;
+    default: die("Invalid image format");  // Assumes die() never returns.
+  }
+}
+
 int main(int argc, char **argv) {
   aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
   AvxVideoWriter *writer = NULL;
   const AvxInterface *decoder = NULL;
   const AvxVideoInfo *info = NULL;
-  const char *lf_width_arg;
-  const char *lf_height_arg;
-  const char *lf_blocksize_arg;
-  int width, height;
-  int lf_width, lf_height;
-  int lf_blocksize;
-  int u_blocks, v_blocks;
+  int num_references;
   int n, i;
   aom_codec_pts_t pts;
 
   exec_name = argv[0];
-  if (argc != 6) die("Invalid number of arguments.");
+  if (argc != 4) die("Invalid number of arguments.");
 
   reader = aom_video_reader_open(argv[1]);
   if (!reader) die("Failed to open %s for reading.", argv[1]);
 
-  lf_width_arg = argv[3];
-  lf_height_arg = argv[4];
-  lf_blocksize_arg = argv[5];
-
-  lf_width = (int)strtol(lf_width_arg, NULL, 0);
-  lf_height = (int)strtol(lf_height_arg, NULL, 0);
-  lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0);
-
+  num_references = (int)strtol(argv[3], NULL, 0);
   info = aom_video_reader_get_info(reader);
-  width = info->frame_width;
-  height = info->frame_height;
 
   // The writer to write out ivf file in tile list OBU, which can be decoded by
   // AV1 decoder.
@@ -144,11 +140,6 @@ int main(int argc, char **argv) {
   // Decode anchor frames.
   aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
 
-  // How many anchor frames we have.
-  u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
-  v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
-
-  int num_references = v_blocks * u_blocks;
   for (i = 0; i < num_references; ++i) {
     aom_video_reader_read_frame(reader);
 
@@ -223,10 +214,20 @@ int main(int argc, char **argv) {
       die_codec(&codec, "Failed to copy compressed camera frame header.");
   }
 
-  // Allocate a buffer to store tile list bitstream. Image format
-  // AOM_IMG_FMT_I420.
-  size_t data_sz =
-      ALIGN_POWER_OF_TWO(width, 5) * ALIGN_POWER_OF_TWO(height, 5) * 12 / 8;
+  // Read out the image format.
+  aom_img_fmt_t ref_fmt = 0;
+  if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+    die_codec(&codec, "Failed to get the image format");
+  const int bps = get_image_bps(ref_fmt);
+  // Read out the tile size (width in the high 16 bits, height in the low 16).
+  unsigned int tile_size = 0;
+  if (aom_codec_control(&codec, AV1D_GET_TILE_SIZE, &tile_size))
+    die_codec(&codec, "Failed to get the tile size");
+  const unsigned int tile_width = tile_size >> 16;
+  const unsigned int tile_height = tile_size & 65535;
+  // Allocate the tile list buffer; size_t math avoids 32-bit overflow.
+  const size_t data_sz = (size_t)MAX_TILES * ALIGN_POWER_OF_TWO(tile_width, 5) *
+                         ALIGN_POWER_OF_TWO(tile_height, 5) * bps / 8;
   unsigned char *tl_buf = (unsigned char *)malloc(data_sz);
   if (tl_buf == NULL) die_codec(&codec, "Failed to allocate tile list buffer.");
 
@@ -251,7 +252,8 @@ int main(int argc, char **argv) {
 
     // Write the OBU size using a fixed length_field_size of 4 bytes.
     saved_obu_size_loc = tl;
-    aom_wb_write_literal(&wb, 0, 32);
+    // aom_wb_write_unsigned_literal(&wb, data, bits) requires that bits <= 32.
+    aom_wb_write_unsigned_literal(&wb, 0, 32);
     tl += 4;
     tile_list_obu_header_size += 4;
 
diff --git a/third_party/aom/examples/lightfield_decoder.c b/third_party/aom/examples/lightfield_decoder.c
index 625cddcac..5da468413 100644
--- a/third_party/aom/examples/lightfield_decoder.c
+++ b/third_party/aom/examples/lightfield_decoder.c
@@ -14,14 +14,10 @@
 //
 // This is an example of a simple lightfield decoder. It builds upon the
 // simple_decoder.c example.  It takes an input file containing the compressed
-// data (in ivf format), treating it as a lightfield instead of a video and
-// will decode a single lightfield tile. The lf_width and lf_height arguments
-// are the number of lightfield images in each dimension. The tile to decode
-// is specified by the tile_u, tile_v, tile_s, tile_t arguments. The tile_u,
-// tile_v specify the image and tile_s, tile_t specify the tile in the image.
+// data (in ivf format), treating it as a lightfield instead of a video.
 // After running the lightfield encoder, run lightfield decoder to decode a
-// single tile:
-// examples/lightfield_decoder vase10x10.ivf vase_reference.yuv 10 10 5
+// batch of tiles:
+// examples/lightfield_decoder vase10x10.ivf vase_reference.yuv 4
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -38,10 +34,7 @@
 static const char *exec_name;
 
 void usage_exit(void) {
-  fprintf(
-      stderr,
-      "Usage: %s <infile> <outfile> <lf_width> <lf_height> <lf_blocksize>\n",
-      exec_name);
+  fprintf(stderr, "Usage: %s <infile> <outfile> <num_references>\n", exec_name);
   exit(EXIT_FAILURE);
 }
 
@@ -85,22 +78,14 @@ int main(int argc, char **argv) {
   AvxVideoReader *reader = NULL;
   const AvxInterface *decoder = NULL;
   const AvxVideoInfo *info = NULL;
-  const char *lf_width_arg;
-  const char *lf_height_arg;
-  const char *lf_blocksize_arg;
-  int width, height;
-  int lf_width, lf_height;
-  int lf_blocksize;
-  int u_blocks;
-  int v_blocks;
+  int num_references;
   aom_image_t reference_images[MAX_EXTERNAL_REFERENCES];
   size_t frame_size = 0;
   const unsigned char *frame = NULL;
-  int n, i;
-
+  int n, i, j;
   exec_name = argv[0];
 
-  if (argc != 6) die("Invalid number of arguments.");
+  if (argc != 4) die("Invalid number of arguments.");
 
   reader = aom_video_reader_open(argv[1]);
   if (!reader) die("Failed to open %s for reading.", argv[1]);
@@ -108,16 +93,9 @@ int main(int argc, char **argv) {
   if (!(outfile = fopen(argv[2], "wb")))
     die("Failed to open %s for writing.", argv[2]);
 
-  lf_width_arg = argv[3];
-  lf_height_arg = argv[4];
-  lf_blocksize_arg = argv[5];
-  lf_width = (int)strtol(lf_width_arg, NULL, 0);
-  lf_height = (int)strtol(lf_height_arg, NULL, 0);
-  lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0);
+  num_references = (int)strtol(argv[3], NULL, 0);
 
   info = aom_video_reader_get_info(reader);
-  width = info->frame_width;
-  height = info->frame_height;
 
   decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
@@ -126,33 +104,39 @@ int main(int argc, char **argv) {
   if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
     die_codec(&codec, "Failed to initialize decoder.");
 
-  // How many anchor frames we have.
-  u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
-  v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
-
-  int num_references = v_blocks * u_blocks;
-
-  // Allocate memory to store decoded references.
-  aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
-  if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
-  // Allocate memory with the border so that it can be used as a reference.
-  for (i = 0; i < num_references; i++) {
-    unsigned int border = AOM_BORDER_IN_PIXELS;
-    if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, width, height,
-                                   32, 8, border)) {
-      die("Failed to allocate references.");
-    }
+  if (aom_codec_control(&codec, AV1D_SET_IS_ANNEXB, info->is_annexb)) {
+    die("Failed to set annex b status");
   }
 
   // Decode anchor frames.
   aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
-
   for (i = 0; i < num_references; ++i) {
     aom_video_reader_read_frame(reader);
     frame = aom_video_reader_get_frame(reader, &frame_size);
     if (aom_codec_decode(&codec, frame, frame_size, NULL))
       die_codec(&codec, "Failed to decode frame.");
 
+    if (i == 0) {
+      aom_img_fmt_t ref_fmt = 0;
+      if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+        die_codec(&codec, "Failed to get the image format");
+
+      int frame_res[2];
+      if (aom_codec_control(&codec, AV1D_GET_FRAME_SIZE, frame_res))
+        die_codec(&codec, "Failed to get the image frame size");
+
+      // Allocate memory to store decoded references. Allocate memory with the
+      // border so that it can be used as a reference.
+      for (j = 0; j < num_references; j++) {
+        unsigned int border = AOM_BORDER_IN_PIXELS;
+        if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
+                                       frame_res[0], frame_res[1], 32, 8,
+                                       border)) {
+          die("Failed to allocate references.");
+        }
+      }
+    }
+
     if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE,
                           &reference_images[i]))
       die_codec(&codec, "Failed to copy decoded reference frame");
diff --git a/third_party/aom/examples/lightfield_encoder.c b/third_party/aom/examples/lightfield_encoder.c
index 22daf622c..f8c37fbb0 100644
--- a/third_party/aom/examples/lightfield_encoder.c
+++ b/third_party/aom/examples/lightfield_encoder.c
@@ -240,7 +240,8 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
   AvxVideoInfo info = { encoder->fourcc,
                         cfg->g_w,
                         cfg->g_h,
-                        { cfg->g_timebase.num, cfg->g_timebase.den } };
+                        { cfg->g_timebase.num, cfg->g_timebase.den },
+                        0 };
   AvxVideoWriter *writer = NULL;
   aom_codec_ctx_t codec;
   int frame_count = 0;
diff --git a/third_party/aom/examples/lightfield_tile_list_decoder.c b/third_party/aom/examples/lightfield_tile_list_decoder.c
index cec6baa2c..2e4f3898d 100644
--- a/third_party/aom/examples/lightfield_tile_list_decoder.c
+++ b/third_party/aom/examples/lightfield_tile_list_decoder.c
@@ -16,12 +16,12 @@
 // contains the anchor frames that are references of the coded tiles, the camera
 // frame header, and tile list OBUs that include the tile information and the
 // compressed tile data. This input file is reconstructed from the encoded
-// lightfield ivf file, and is decodable by AV1 decoder. The lf_width and
-// lf_height arguments are the number of lightfield images in each dimension.
-// The lf_blocksize determines the number of reference images used.
+// lightfield ivf file, and is decodable by AV1 decoder. num_references is
+// the number of anchor frames coded at the beginning of the light field file.
+// num_tile_lists is the number of tile lists that need to be decoded.
 // Run lightfield tile list decoder to decode an AV1 tile list file:
 // examples/lightfield_tile_list_decoder vase_tile_list.ivf vase_tile_list.yuv
-// 10 10 5 2
+// 4 2
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -40,8 +40,7 @@ static const char *exec_name;
 
 void usage_exit(void) {
   fprintf(stderr,
-          "Usage: %s <infile> <outfile> <lf_width> <lf_height> <lf_blocksize> "
-          "<num_tile_lists>\n",
+          "Usage: %s <infile> <outfile> <num_references> <num_tile_lists>\n",
           exec_name);
   exit(EXIT_FAILURE);
 }
@@ -52,21 +51,16 @@ int main(int argc, char **argv) {
   AvxVideoReader *reader = NULL;
   const AvxInterface *decoder = NULL;
   const AvxVideoInfo *info = NULL;
-  const char *lf_width_arg;
-  const char *lf_height_arg;
-  const char *lf_blocksize_arg;
-  int width, height;
-  int lf_width, lf_height, lf_blocksize;
-  int u_blocks, v_blocks;
+  int num_references;
   int num_tile_lists;
   aom_image_t reference_images[MAX_EXTERNAL_REFERENCES];
   size_t frame_size = 0;
   const unsigned char *frame = NULL;
-  int i, n;
+  int i, j, n;
 
   exec_name = argv[0];
 
-  if (argc != 7) die("Invalid number of arguments.");
+  if (argc != 5) die("Invalid number of arguments.");
 
   reader = aom_video_reader_open(argv[1]);
   if (!reader) die("Failed to open %s for reading.", argv[1]);
@@ -74,17 +68,10 @@ int main(int argc, char **argv) {
   if (!(outfile = fopen(argv[2], "wb")))
     die("Failed to open %s for writing.", argv[2]);
 
-  lf_width_arg = argv[3];
-  lf_height_arg = argv[4];
-  lf_blocksize_arg = argv[5];
-  lf_width = (int)strtol(lf_width_arg, NULL, 0);
-  lf_height = (int)strtol(lf_height_arg, NULL, 0);
-  lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0);
-  num_tile_lists = (int)strtol(argv[6], NULL, 0);
+  num_references = (int)strtol(argv[3], NULL, 0);
+  num_tile_lists = (int)strtol(argv[4], NULL, 0);
 
   info = aom_video_reader_get_info(reader);
-  width = info->frame_width;
-  height = info->frame_height;
 
   decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
@@ -93,33 +80,39 @@ int main(int argc, char **argv) {
   if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
     die_codec(&codec, "Failed to initialize decoder.");
 
-  // How many anchor frames we have.
-  u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
-  v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
-
-  int num_references = v_blocks * u_blocks;
-
-  // Allocate memory to store decoded references.
-  aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
-  if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
-  // Allocate memory with the border so that it can be used as a reference.
-  for (i = 0; i < num_references; i++) {
-    unsigned int border = AOM_BORDER_IN_PIXELS;
-    if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, width, height,
-                                   32, 8, border)) {
-      die("Failed to allocate references.");
-    }
+  if (aom_codec_control(&codec, AV1D_SET_IS_ANNEXB, info->is_annexb)) {
+    die("Failed to set annex b status");
   }
 
   // Decode anchor frames.
   aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
-
   for (i = 0; i < num_references; ++i) {
     aom_video_reader_read_frame(reader);
     frame = aom_video_reader_get_frame(reader, &frame_size);
     if (aom_codec_decode(&codec, frame, frame_size, NULL))
       die_codec(&codec, "Failed to decode frame.");
 
+    if (i == 0) {
+      aom_img_fmt_t ref_fmt = 0;
+      if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+        die_codec(&codec, "Failed to get the image format");
+
+      int frame_res[2];
+      if (aom_codec_control(&codec, AV1D_GET_FRAME_SIZE, frame_res))
+        die_codec(&codec, "Failed to get the image frame size");
+
+      // Allocate memory to store decoded references. Allocate memory with the
+      // border so that it can be used as a reference.
+      for (j = 0; j < num_references; j++) {
+        unsigned int border = AOM_BORDER_IN_PIXELS;
+        if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
+                                       frame_res[0], frame_res[1], 32, 8,
+                                       border)) {
+          die("Failed to allocate references.");
+        }
+      }
+    }
+
     if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE,
                           &reference_images[i]))
       die_codec(&codec, "Failed to copy decoded reference frame");
@@ -142,13 +135,11 @@ int main(int argc, char **argv) {
   // Set external references.
   av1_ext_ref_frame_t set_ext_ref = { &reference_images[0], num_references };
   aom_codec_control_(&codec, AV1D_SET_EXT_REF_PTR, &set_ext_ref);
-
   // Must decode the camera frame header first.
   aom_video_reader_read_frame(reader);
   frame = aom_video_reader_get_frame(reader, &frame_size);
   if (aom_codec_decode(&codec, frame, frame_size, NULL))
     die_codec(&codec, "Failed to decode the frame.");
-
   // Decode tile lists one by one.
   for (n = 0; n < num_tile_lists; n++) {
     aom_video_reader_read_frame(reader);
@@ -156,7 +147,6 @@ int main(int argc, char **argv) {
 
     if (aom_codec_decode(&codec, frame, frame_size, NULL))
       die_codec(&codec, "Failed to decode the tile list.");
-
     aom_codec_iter_t iter = NULL;
     aom_image_t *img;
     while ((img = aom_codec_get_frame(&codec, &iter)))
diff --git a/third_party/aom/examples/twopass_encoder.c b/third_party/aom/examples/twopass_encoder.c
index 1b134cce0..a03bc6cc2 100644
--- a/third_party/aom/examples/twopass_encoder.c
+++ b/third_party/aom/examples/twopass_encoder.c
@@ -148,7 +148,8 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
   AvxVideoInfo info = { encoder->fourcc,
                         cfg->g_w,
                         cfg->g_h,
-                        { cfg->g_timebase.num, cfg->g_timebase.den } };
+                        { cfg->g_timebase.num, cfg->g_timebase.den },
+                        0 };
   AvxVideoWriter *writer = NULL;
   aom_codec_ctx_t codec;
   int frame_count = 0;
diff --git a/third_party/aom/test/aomdec.sh b/third_party/aom/test/aomdec.sh
index 5f54ae0af..927142287 100755
--- a/third_party/aom/test/aomdec.sh
+++ b/third_party/aom/test/aomdec.sh
@@ -37,7 +37,7 @@ aomdec_verify_environment() {
 # input file path and shifted away. All remaining parameters are passed through
 # to aomdec.
 aomdec_pipe() {
-  local readonly input="$1"
+  local input="$1"
   shift
   if [ ! -e "${input}" ]; then
     elog "Input file ($input) missing in aomdec_pipe()"
@@ -51,8 +51,8 @@ aomdec_pipe() {
 # the directory containing aomdec. $1 one is used as the input file path and
 # shifted away. All remaining parameters are passed through to aomdec.
 aomdec() {
-  local readonly decoder="$(aom_tool_path aomdec)"
-  local readonly input="$1"
+  local decoder="$(aom_tool_path aomdec)"
+  local input="$1"
   shift
   eval "${AOM_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
 }
@@ -65,7 +65,7 @@ aomdec_can_decode_av1() {
 
 aomdec_av1_ivf() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
-    local readonly file="${AV1_IVF_FILE}"
+    local file="${AV1_IVF_FILE}"
     if [ ! -e "${file}" ]; then
       encode_yuv_raw_input_av1 "${file}" --ivf
     fi
@@ -75,7 +75,7 @@ aomdec_av1_ivf() {
 
 aomdec_av1_ivf_error_resilient() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
-    local readonly file="av1.error-resilient.ivf"
+    local file="av1.error-resilient.ivf"
     if [ ! -e "${file}" ]; then
       encode_yuv_raw_input_av1 "${file}" --ivf --error-resilient=1
     fi
@@ -85,7 +85,7 @@ aomdec_av1_ivf_error_resilient() {
 
 aomdec_av1_ivf_multithread() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
-    local readonly file="${AV1_IVF_FILE}"
+    local file="${AV1_IVF_FILE}"
     if [ ! -e "${file}" ]; then
       encode_yuv_raw_input_av1 "${file}" --ivf
     fi
@@ -97,7 +97,7 @@ aomdec_av1_ivf_multithread() {
 
 aomdec_aom_ivf_pipe_input() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
-    local readonly file="${AV1_IVF_FILE}"
+    local file="${AV1_IVF_FILE}"
     if [ ! -e "${file}" ]; then
       encode_yuv_raw_input_av1 "${file}" --ivf
     fi
@@ -107,7 +107,7 @@ aomdec_aom_ivf_pipe_input() {
 
 aomdec_av1_obu_annexb() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
-    local readonly file="${AV1_OBU_ANNEXB_FILE}"
+    local file="${AV1_OBU_ANNEXB_FILE}"
     if [ ! -e "${file}" ]; then
       encode_yuv_raw_input_av1 "${file}" --obu --annexb=1
     fi
@@ -117,7 +117,7 @@ aomdec_av1_obu_annexb() {
 
 aomdec_av1_obu_section5() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
-    local readonly file="${AV1_OBU_SEC5_FILE}"
+    local file="${AV1_OBU_SEC5_FILE}"
     if [ ! -e "${file}" ]; then
       encode_yuv_raw_input_av1 "${file}" --obu
     fi
@@ -128,7 +128,7 @@ aomdec_av1_obu_section5() {
 aomdec_av1_webm() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly file="${AV1_WEBM_FILE}"
+    local file="${AV1_WEBM_FILE}"
     if [ ! -e "${file}" ]; then
       encode_yuv_raw_input_av1 "${file}"
     fi
diff --git a/third_party/aom/test/aomenc.sh b/third_party/aom/test/aomenc.sh
index a0ab8c8aa..b030397a3 100755
--- a/third_party/aom/test/aomenc.sh
+++ b/third_party/aom/test/aomenc.sh
@@ -60,8 +60,8 @@ y4m_input_720p() {
 # input file path and shifted away. All remaining parameters are passed through
 # to aomenc.
 aomenc_pipe() {
-  local readonly encoder="$(aom_tool_path aomenc)"
-  local readonly input="$1"
+  local encoder="$(aom_tool_path aomenc)"
+  local input="$1"
   shift
   cat "${input}" | eval "${AOM_TEST_PREFIX}" "${encoder}" - \
     --test-decode=fatal \
@@ -72,8 +72,8 @@ aomenc_pipe() {
 # the directory containing aomenc. $1 one is used as the input file path and
 # shifted away. All remaining parameters are passed through to aomenc.
 aomenc() {
-  local readonly encoder="$(aom_tool_path aomenc)"
-  local readonly input="$1"
+  local encoder="$(aom_tool_path aomenc)"
+  local input="$1"
   shift
   eval "${AOM_TEST_PREFIX}" "${encoder}" "${input}" \
     --test-decode=fatal \
@@ -156,7 +156,7 @@ aomenc_av1_webm() {
 aomenc_av1_webm_1pass() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm"
+    local output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm"
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
       --passes=1 \
@@ -171,7 +171,7 @@ aomenc_av1_webm_1pass() {
 
 aomenc_av1_ivf_lossless() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
-    local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_lossless.ivf"
+    local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless.ivf"
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
       --ivf \
@@ -187,7 +187,7 @@ aomenc_av1_ivf_lossless() {
 
 aomenc_av1_ivf_minq0_maxq0() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
-    local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_lossless_minq0_maxq0.ivf"
+    local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless_minq0_maxq0.ivf"
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
       --ivf \
@@ -205,9 +205,9 @@ aomenc_av1_ivf_minq0_maxq0() {
 aomenc_av1_webm_lag5_frames10() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly lag_total_frames=10
-    local readonly lag_frames=5
-    local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_lag5_frames10.webm"
+    local lag_total_frames=10
+    local lag_frames=5
+    local output="${AOM_TEST_OUTPUT_DIR}/av1_lag5_frames10.webm"
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
       --limit=${lag_total_frames} \
@@ -225,7 +225,7 @@ aomenc_av1_webm_lag5_frames10() {
 aomenc_av1_webm_non_square_par() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_non_square_par.webm"
+    local output="${AOM_TEST_OUTPUT_DIR}/av1_non_square_par.webm"
     aomenc $(y4m_input_non_square_par) \
       $(aomenc_encode_test_fast_params) \
       --output="${output}"
@@ -241,7 +241,7 @@ aomenc_av1_webm_cdf_update_mode() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
     for mode in 0 1 2; do
-      local readonly output="${AOM_TEST_OUTPUT_DIR}/cdf_mode_${mode}.webm"
+      local output="${AOM_TEST_OUTPUT_DIR}/cdf_mode_${mode}.webm"
       aomenc $(yuv_raw_input) \
         $(aomenc_encode_test_fast_params) \
         --cdf-update-mode=${mode} \
diff --git a/third_party/aom/test/av1_convolve_2d_test_util.cc b/third_party/aom/test/av1_convolve_2d_test_util.cc
index cbe3f8c9f..1aa08044e 100644
--- a/third_party/aom/test/av1_convolve_2d_test_util.cc
+++ b/third_party/aom/test/av1_convolve_2d_test_util.cc
@@ -63,10 +63,10 @@ void AV1Convolve2DSrTest::RunCheckOutput(convolve_2d_func test_impl) {
     for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
       for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL;
            ++vfilter) {
-        InterpFilterParams filter_params_x =
+        const InterpFilterParams *filter_params_x =
             av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
                                                          out_w);
-        InterpFilterParams filter_params_y =
+        const InterpFilterParams *filter_params_y =
             av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
                                                          out_h);
         for (int do_average = 0; do_average < 1; ++do_average) {
@@ -83,11 +83,11 @@ void AV1Convolve2DSrTest::RunCheckOutput(convolve_2d_func test_impl) {
               const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
               const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
               av1_convolve_2d_sr_c(input + offset_r * w + offset_c, w, output,
-                                   MAX_SB_SIZE, out_w, out_h, &filter_params_x,
-                                   &filter_params_y, subx, suby, &conv_params1);
+                                   MAX_SB_SIZE, out_w, out_h, filter_params_x,
+                                   filter_params_y, subx, suby, &conv_params1);
               test_impl(input + offset_r * w + offset_c, w, output2,
-                        MAX_SB_SIZE, out_w, out_h, &filter_params_x,
-                        &filter_params_y, subx, suby, &conv_params2);
+                        MAX_SB_SIZE, out_w, out_h, filter_params_x,
+                        filter_params_y, subx, suby, &conv_params2);
 
               if (memcmp(output, output2, sizeof(output))) {
                 for (int i = 0; i < MAX_SB_SIZE; ++i) {
@@ -137,10 +137,10 @@ void AV1Convolve2DSrTest::RunSpeedTest(convolve_2d_func test_impl) {
     const int out_h = block_size_high[block_idx] >> shift;
     const int num_loops = 1000000000 / (out_w + out_h);
 
-    InterpFilterParams filter_params_x =
+    const InterpFilterParams *filter_params_x =
         av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
                                                      out_w);
-    InterpFilterParams filter_params_y =
+    const InterpFilterParams *filter_params_y =
         av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
                                                      out_h);
 
@@ -148,8 +148,8 @@ void AV1Convolve2DSrTest::RunSpeedTest(convolve_2d_func test_impl) {
     aom_usec_timer_start(&timer);
 
     for (int i = 0; i < num_loops; ++i)
-      test_impl(input, w, output, MAX_SB_SIZE, out_w, out_h, &filter_params_x,
-                &filter_params_y, subx, suby, &conv_params2);
+      test_impl(input, w, output, MAX_SB_SIZE, out_w, out_h, filter_params_x,
+                filter_params_y, subx, suby, &conv_params2);
 
     aom_usec_timer_mark(&timer);
     const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
@@ -188,10 +188,10 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
   const int out_h = block_size_high[block_idx];
   for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
     for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
-      InterpFilterParams filter_params_x =
+      const InterpFilterParams *filter_params_x =
           av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
                                                        out_w);
-      InterpFilterParams filter_params_y =
+      const InterpFilterParams *filter_params_y =
           av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
                                                        out_h);
       for (int do_average = 0; do_average <= 1; ++do_average) {
@@ -212,11 +212,11 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
             const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
             const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
             av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w, output8_1,
-                                  MAX_SB_SIZE, out_w, out_h, &filter_params_x,
-                                  &filter_params_y, subx, suby, &conv_params1);
+                                  MAX_SB_SIZE, out_w, out_h, filter_params_x,
+                                  filter_params_y, subx, suby, &conv_params1);
             test_impl(input + offset_r * w + offset_c, w, output8_2,
-                      MAX_SB_SIZE, out_w, out_h, &filter_params_x,
-                      &filter_params_y, subx, suby, &conv_params2);
+                      MAX_SB_SIZE, out_w, out_h, filter_params_x,
+                      filter_params_y, subx, suby, &conv_params2);
 
             for (int i = 0; i < out_h; ++i) {
               for (int j = 0; j < out_w; ++j) {
@@ -261,11 +261,11 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
                 const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
                 av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w,
                                       output8_1, MAX_SB_SIZE, out_w, out_h,
-                                      &filter_params_x, &filter_params_y, subx,
+                                      filter_params_x, filter_params_y, subx,
                                       suby, &conv_params1);
                 test_impl(input + offset_r * w + offset_c, w, output8_2,
-                          MAX_SB_SIZE, out_w, out_h, &filter_params_x,
-                          &filter_params_y, subx, suby, &conv_params2);
+                          MAX_SB_SIZE, out_w, out_h, filter_params_x,
+                          filter_params_y, subx, suby, &conv_params2);
 
                 for (int i = 0; i < out_h; ++i) {
                   for (int j = 0; j < out_w; ++j) {
@@ -323,10 +323,10 @@ void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) {
   const int num_loops = 1000000000 / (out_w + out_h);
   const int do_average = 0;
 
-  InterpFilterParams filter_params_x =
+  const InterpFilterParams *filter_params_x =
       av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
                                                    out_w);
-  InterpFilterParams filter_params_y =
+  const InterpFilterParams *filter_params_y =
       av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
                                                    out_h);
 
@@ -344,7 +344,7 @@ void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) {
 
   for (int i = 0; i < num_loops; ++i)
     test_impl(input + offset_r * w + offset_c, w, output8, MAX_SB_SIZE, out_w,
-              out_h, &filter_params_x, &filter_params_y, subx, suby,
+              out_h, filter_params_x, filter_params_y, subx, suby,
               &conv_params);
 
   aom_usec_timer_mark(&timer);
@@ -407,10 +407,10 @@ void AV1HighbdConvolve2DSrTest::RunSpeedTest(
     const int out_h = block_size_high[block_idx] >> shift;
     const int num_loops = 1000000000 / (out_w + out_h);
 
-    InterpFilterParams filter_params_x =
+    const InterpFilterParams *filter_params_x =
         av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
                                                      out_w);
-    InterpFilterParams filter_params_y =
+    const InterpFilterParams *filter_params_y =
         av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
                                                      out_h);
 
@@ -418,7 +418,7 @@ void AV1HighbdConvolve2DSrTest::RunSpeedTest(
     aom_usec_timer_start(&timer);
     for (int i = 0; i < num_loops; ++i)
       test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE, out_w,
-                out_h, &filter_params_x, &filter_params_y, subx, suby,
+                out_h, filter_params_x, filter_params_y, subx, suby,
                 &conv_params, bd);
 
     aom_usec_timer_mark(&timer);
@@ -456,10 +456,10 @@ void AV1HighbdConvolve2DSrTest::RunCheckOutput(
     for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
       for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL;
            ++vfilter) {
-        InterpFilterParams filter_params_x =
+        const InterpFilterParams *filter_params_x =
             av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
                                                          out_w);
-        InterpFilterParams filter_params_y =
+        const InterpFilterParams *filter_params_y =
             av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
                                                          out_h);
         for (int do_average = 0; do_average < 1; ++do_average) {
@@ -477,11 +477,11 @@ void AV1HighbdConvolve2DSrTest::RunCheckOutput(
               const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
               av1_highbd_convolve_2d_sr_c(input + offset_r * w + offset_c, w,
                                           output, MAX_SB_SIZE, out_w, out_h,
-                                          &filter_params_x, &filter_params_y,
+                                          filter_params_x, filter_params_y,
                                           subx, suby, &conv_params1, bd);
               test_impl(input + offset_r * w + offset_c, w, output2,
-                        MAX_SB_SIZE, out_w, out_h, &filter_params_x,
-                        &filter_params_y, subx, suby, &conv_params2, bd);
+                        MAX_SB_SIZE, out_w, out_h, filter_params_x,
+                        filter_params_y, subx, suby, &conv_params2, bd);
 
               if (memcmp(output, output2, sizeof(output))) {
                 for (int i = 0; i < MAX_SB_SIZE; ++i) {
@@ -530,10 +530,10 @@ void AV1HighbdJntConvolve2DTest::RunSpeedTest(
   const int out_w = block_size_wide[block_idx];
   const int out_h = block_size_high[block_idx];
 
-  InterpFilterParams filter_params_x =
+  const InterpFilterParams *filter_params_x =
       av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
                                                    out_w);
-  InterpFilterParams filter_params_y =
+  const InterpFilterParams *filter_params_y =
       av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
                                                    out_h);
 
@@ -554,8 +554,8 @@ void AV1HighbdJntConvolve2DTest::RunSpeedTest(
   aom_usec_timer_start(&timer);
   for (int i = 0; i < num_loops; ++i)
     test_impl(input + offset_r * w + offset_c, w, output16, MAX_SB_SIZE, out_w,
-              out_h, &filter_params_x, &filter_params_y, subx, suby,
-              &conv_params, bd);
+              out_h, filter_params_x, filter_params_y, subx, suby, &conv_params,
+              bd);
 
   aom_usec_timer_mark(&timer);
   const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
@@ -589,10 +589,10 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
   const int out_h = block_size_high[block_idx];
   for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
     for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
-      InterpFilterParams filter_params_x =
+      const InterpFilterParams *filter_params_x =
           av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
                                                        out_w);
-      InterpFilterParams filter_params_y =
+      const InterpFilterParams *filter_params_y =
           av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
                                                        out_h);
       for (int do_average = 0; do_average <= 1; ++do_average) {
@@ -614,11 +614,11 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
             const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
             av1_highbd_jnt_convolve_2d_c(input + offset_r * w + offset_c, w,
                                          output16_1, MAX_SB_SIZE, out_w, out_h,
-                                         &filter_params_x, &filter_params_y,
-                                         subx, suby, &conv_params1, bd);
+                                         filter_params_x, filter_params_y, subx,
+                                         suby, &conv_params1, bd);
             test_impl(input + offset_r * w + offset_c, w, output16_2,
-                      MAX_SB_SIZE, out_w, out_h, &filter_params_x,
-                      &filter_params_y, subx, suby, &conv_params2, bd);
+                      MAX_SB_SIZE, out_w, out_h, filter_params_x,
+                      filter_params_y, subx, suby, &conv_params2, bd);
 
             for (int i = 0; i < out_h; ++i) {
               for (int j = 0; j < out_w; ++j) {
@@ -664,11 +664,11 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
                 const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
                 av1_highbd_jnt_convolve_2d_c(
                     input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE,
-                    out_w, out_h, &filter_params_x, &filter_params_y, subx,
-                    suby, &conv_params1, bd);
+                    out_w, out_h, filter_params_x, filter_params_y, subx, suby,
+                    &conv_params1, bd);
                 test_impl(input + offset_r * w + offset_c, w, output16_2,
-                          MAX_SB_SIZE, out_w, out_h, &filter_params_x,
-                          &filter_params_y, subx, suby, &conv_params2, bd);
+                          MAX_SB_SIZE, out_w, out_h, filter_params_x,
+                          filter_params_y, subx, suby, &conv_params2, bd);
 
                 for (int i = 0; i < out_h; ++i) {
                   for (int j = 0; j < out_w; ++j) {
diff --git a/third_party/aom/test/av1_convolve_2d_test_util.h b/third_party/aom/test/av1_convolve_2d_test_util.h
index 3a53dbdfe..cd4607d68 100644
--- a/third_party/aom/test/av1_convolve_2d_test_util.h
+++ b/third_party/aom/test/av1_convolve_2d_test_util.h
@@ -28,8 +28,8 @@ namespace AV1Convolve2D {
 
 typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
-                                 InterpFilterParams *filter_params_x,
-                                 InterpFilterParams *filter_params_y,
+                                 const InterpFilterParams *filter_params_x,
+                                 const InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params);
 
@@ -71,8 +71,8 @@ class AV1JntConvolve2DTest : public ::testing::TestWithParam<Convolve2DParam> {
 namespace AV1HighbdConvolve2D {
 typedef void (*highbd_convolve_2d_func)(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, InterpFilterParams *filter_params_x,
-    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 
 typedef ::testing::tuple<int, highbd_convolve_2d_func, int, int, BLOCK_SIZE>
diff --git a/third_party/aom/test/av1_convolve_scale_test.cc b/third_party/aom/test/av1_convolve_scale_test.cc
index e0571423c..b99caaeeb 100644
--- a/third_party/aom/test/av1_convolve_scale_test.cc
+++ b/third_party/aom/test/av1_convolve_scale_test.cc
@@ -390,8 +390,8 @@ typedef tuple<int, int> BlockDimension;
 
 typedef void (*LowbdConvolveFunc)(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
-                                  InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
                                   const int subpel_x_qn, const int x_step_qn,
                                   const int subpel_y_qn, const int y_step_qn,
                                   ConvolveParams *conv_params);
@@ -463,8 +463,8 @@ INSTANTIATE_TEST_CASE_P(
 
 typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
-                                   InterpFilterParams *filter_params_x,
-                                   InterpFilterParams *filter_params_y,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd);
diff --git a/third_party/aom/test/av1_ext_tile_test.cc b/third_party/aom/test/av1_ext_tile_test.cc
index d2abbab7f..424d2f065 100644
--- a/third_party/aom/test/av1_ext_tile_test.cc
+++ b/third_party/aom/test/av1_ext_tile_test.cc
@@ -47,6 +47,7 @@ class AV1ExtTileTest
 
     decoder_ = codec_->CreateDecoder(cfg, 0);
     decoder_->Control(AV1_SET_TILE_MODE, 1);
+    decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
     decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
     decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
 
@@ -82,13 +83,14 @@ class AV1ExtTileTest
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
 
-      // The tile size is 64x64.
-      encoder->Control(AV1E_SET_TILE_COLUMNS, kTileSize);
-      encoder->Control(AV1E_SET_TILE_ROWS, kTileSize);
       // TODO(yunqingwang): test single_tile_decoding = 0.
       encoder->Control(AV1E_SET_SINGLE_TILE_DECODING, 1);
       // Always use 64x64 max partition.
       encoder->Control(AV1E_SET_SUPERBLOCK_SIZE, AOM_SUPERBLOCK_SIZE_64X64);
+      // Set tile_columns and tile_rows to MAX values, which guarantees the tile
+      // size of 64 x 64 pixels (i.e. 1 SB) for <= 4k resolution.
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
+      encoder->Control(AV1E_SET_TILE_ROWS, 6);
     }
 
     if (video->frame() == 1) {
@@ -195,7 +197,7 @@ class AV1ExtTileTest
   std::vector<std::string> tile_md5_;
 };
 
-TEST_P(AV1ExtTileTest, DISABLED_DecoderResultTest) { TestRoundTrip(); }
+TEST_P(AV1ExtTileTest, DecoderResultTest) { TestRoundTrip(); }
 
 AV1_INSTANTIATE_TEST_CASE(
     // Now only test 2-pass mode.
@@ -204,7 +206,7 @@ AV1_INSTANTIATE_TEST_CASE(
 
 class AV1ExtTileTestLarge : public AV1ExtTileTest {};
 
-TEST_P(AV1ExtTileTestLarge, DISABLED_DecoderResultTest) { TestRoundTrip(); }
+TEST_P(AV1ExtTileTestLarge, DecoderResultTest) { TestRoundTrip(); }
 
 AV1_INSTANTIATE_TEST_CASE(
     // Now only test 2-pass mode.
diff --git a/third_party/aom/test/av1_fwd_txfm2d_test.cc b/third_party/aom/test/av1_fwd_txfm2d_test.cc
index e0294be4e..6577e33b8 100644
--- a/third_party/aom/test/av1_fwd_txfm2d_test.cc
+++ b/third_party/aom/test/av1_fwd_txfm2d_test.cc
@@ -247,9 +247,9 @@ void AV1FwdTxfm2dMatchTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
 
     FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
     if (ref_func != NULL) {
-      DECLARE_ALIGNED(16, int16_t, input[64 * 64]) = { 0 };
-      DECLARE_ALIGNED(16, int32_t, output[64 * 64]);
-      DECLARE_ALIGNED(16, int32_t, ref_output[64 * 64]);
+      DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+      DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+      DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
       int input_stride = 64;
       ACMRandom rnd(ACMRandom::DeterministicSeed());
       for (int cnt = 0; cnt < 500; ++cnt) {
@@ -339,4 +339,16 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, AV1FwdTxfm2dTest,
                         Combine(ValuesIn(fwd_txfm_for_sse41),
                                 Values(av1_lowbd_fwd_txfm_sse4_1)));
 #endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+static TX_SIZE fwd_txfm_for_avx2[] = {
+  TX_4X4,  TX_8X8,  TX_16X16, TX_32X32, TX_64X64, TX_4X8,   TX_8X4,
+  TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16,
+  TX_16X4, TX_8X32, TX_32X8,  TX_16X64, TX_64X16,
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, AV1FwdTxfm2dTest,
+                        Combine(ValuesIn(fwd_txfm_for_avx2),
+                                Values(av1_lowbd_fwd_txfm_avx2)));
+#endif  // HAVE_AVX2
 }  // namespace
diff --git a/third_party/aom/test/av1_inv_txfm2d_test.cc b/third_party/aom/test/av1_inv_txfm2d_test.cc
index 461e7ebcd..11e231ba6 100644
--- a/third_party/aom/test/av1_inv_txfm2d_test.cc
+++ b/third_party/aom/test/av1_inv_txfm2d_test.cc
@@ -364,4 +364,15 @@ INSTANTIATE_TEST_CASE_P(AVX2, AV1LbdInvTxfm2d,
                         ::testing::Values(av1_lowbd_inv_txfm2d_add_avx2));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+
+extern "C" void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input,
+                                              uint8_t *output, int stride,
+                                              TX_TYPE tx_type, TX_SIZE tx_size,
+                                              int eob);
+
+INSTANTIATE_TEST_CASE_P(NEON, AV1LbdInvTxfm2d,
+                        ::testing::Values(av1_lowbd_inv_txfm2d_add_neon));
+#endif  // HAVE_NEON
+
 }  // namespace
diff --git a/third_party/aom/test/av1_wedge_utils_test.cc b/third_party/aom/test/av1_wedge_utils_test.cc
index cfdf2d36c..e8fbe69a4 100644
--- a/third_party/aom/test/av1_wedge_utils_test.cc
+++ b/third_party/aom/test/av1_wedge_utils_test.cc
@@ -217,14 +217,6 @@ TEST_P(WedgeUtilsSSEOptTest, ExtremeValues) {
   }
 }
 
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, WedgeUtilsSSEOptTest,
-    ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c,
-                                    av1_wedge_sse_from_residuals_sse2)));
-
-#endif  // HAVE_SSE2
-
 //////////////////////////////////////////////////////////////////////////////
 // av1_wedge_sign_from_residuals
 //////////////////////////////////////////////////////////////////////////////
@@ -325,15 +317,6 @@ TEST_P(WedgeUtilsSignOptTest, ExtremeValues) {
   }
 }
 
-#if HAVE_SSE2
-
-INSTANTIATE_TEST_CASE_P(
-    SSE2, WedgeUtilsSignOptTest,
-    ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c,
-                                     av1_wedge_sign_from_residuals_sse2)));
-
-#endif  // HAVE_SSE2
-
 //////////////////////////////////////////////////////////////////////////////
 // av1_wedge_compute_delta_squares
 //////////////////////////////////////////////////////////////////////////////
@@ -371,12 +354,37 @@ TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) {
 }
 
 #if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, WedgeUtilsSSEOptTest,
+    ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c,
+                                    av1_wedge_sse_from_residuals_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, WedgeUtilsSignOptTest,
+    ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c,
+                                     av1_wedge_sign_from_residuals_sse2)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, WedgeUtilsDeltaSquaresOptTest,
     ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_c,
                                    av1_wedge_compute_delta_squares_sse2)));
-
 #endif  // HAVE_SSE2
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, WedgeUtilsSSEOptTest,
+    ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_sse2,
+                                    av1_wedge_sse_from_residuals_avx2)));
+
+INSTANTIATE_TEST_CASE_P(
+    AVX2, WedgeUtilsSignOptTest,
+    ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_sse2,
+                                     av1_wedge_sign_from_residuals_avx2)));
+
+INSTANTIATE_TEST_CASE_P(
+    AVX2, WedgeUtilsDeltaSquaresOptTest,
+    ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_sse2,
+                                   av1_wedge_compute_delta_squares_avx2)));
+#endif  // HAVE_AVX2
+
 }  // namespace
diff --git a/third_party/aom/test/codec_factory.h b/third_party/aom/test/codec_factory.h
index 65b76094c..e6ae7f8c3 100644
--- a/third_party/aom/test/codec_factory.h
+++ b/third_party/aom/test/codec_factory.h
@@ -71,6 +71,11 @@ class CodecTestWith4Params
     : public ::testing::TestWithParam< ::testing::tuple<
           const libaom_test::CodecFactory *, T1, T2, T3, T4> > {};
 
+template <class T1, class T2, class T3, class T4, class T5>
+class CodecTestWith5Params
+    : public ::testing::TestWithParam< ::testing::tuple<
+          const libaom_test::CodecFactory *, T1, T2, T3, T4, T5> > {};
+
 /*
  * AV1 Codec Definitions
  */
diff --git a/third_party/aom/test/comp_mask_variance_test.cc b/third_party/aom/test/comp_mask_variance_test.cc
index a5e3f3411..0016ddd59 100644
--- a/third_party/aom/test/comp_mask_variance_test.cc
+++ b/third_party/aom/test/comp_mask_variance_test.cc
@@ -33,6 +33,7 @@ typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
                                     int width, int height, const uint8_t *ref,
                                     int ref_stride, const uint8_t *mask,
                                     int mask_stride, int invert_mask);
+
 #if HAVE_SSSE3 || HAVE_AV2
 const BLOCK_SIZE kValidBlockSize[] = {
   BLOCK_8X8,   BLOCK_8X16, BLOCK_8X32,  BLOCK_16X8,  BLOCK_16X16,
@@ -270,4 +271,274 @@ INSTANTIATE_TEST_CASE_P(
 #endif
 
 #endif  // ifndef aom_comp_mask_pred
+
+typedef void (*highbd_comp_mask_pred_func)(uint16_t *comp_pred,
+                                           const uint8_t *pred8, int width,
+                                           int height, const uint8_t *ref8,
+                                           int ref_stride, const uint8_t *mask,
+                                           int mask_stride, int invert_mask);
+
+typedef ::testing::tuple<highbd_comp_mask_pred_func, BLOCK_SIZE, int>
+    HighbdCompMaskPredParam;
+
+class AV1HighbdCompMaskVarianceTest
+    : public ::testing::TestWithParam<HighbdCompMaskPredParam> {
+ public:
+  ~AV1HighbdCompMaskVarianceTest();
+  void SetUp();
+
+  void TearDown();
+
+ protected:
+  void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
+                      int inv);
+  void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
+  bool CheckResult(int width, int height) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int idx = y * width + x;
+        if (comp_pred1_[idx] != comp_pred2_[idx]) {
+          printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x);
+          printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  uint16_t *comp_pred1_;
+  uint16_t *comp_pred2_;
+  uint16_t *pred_;
+  uint16_t *ref_buffer_;
+  uint16_t *ref_;
+};
+
+AV1HighbdCompMaskVarianceTest::~AV1HighbdCompMaskVarianceTest() { ; }
+
+void AV1HighbdCompMaskVarianceTest::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  av1_init_wedge_masks();
+
+  comp_pred1_ =
+      (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_));
+  comp_pred2_ =
+      (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_));
+  pred_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_));
+  ref_buffer_ = (uint16_t *)aom_memalign(
+      16, (MAX_SB_SQUARE + (8 * MAX_SB_SIZE)) * sizeof(*ref_buffer_));
+  ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
+}
+
+void AV1HighbdCompMaskVarianceTest::TearDown() {
+  aom_free(comp_pred1_);
+  aom_free(comp_pred2_);
+  aom_free(pred_);
+  aom_free(ref_buffer_);
+  libaom_test::ClearSystemState();
+}
+
+void AV1HighbdCompMaskVarianceTest::RunCheckOutput(
+    highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) {
+  int bd_ = GET_PARAM(2);
+
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+  }
+  for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+    ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+  }
+
+  for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+    const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+    aom_highbd_comp_mask_pred_c(comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h,
+                                CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w,
+                                inv);
+
+    test_impl(comp_pred2_, CONVERT_TO_BYTEPTR(pred_), w, h,
+              CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv);
+
+    ASSERT_EQ(CheckResult(w, h), true)
+        << " wedge " << wedge_index << " inv " << inv;
+  }
+}
+
+void AV1HighbdCompMaskVarianceTest::RunSpeedTest(
+    highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize) {
+  int bd_ = GET_PARAM(2);
+
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  int wedge_index = wedge_types / 2;
+
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+  }
+  for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+    ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+  }
+
+  const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+  const int num_loops = 1000000000 / (w + h);
+
+  highbd_comp_mask_pred_func funcs[2] = { aom_highbd_comp_mask_pred_c,
+                                          test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    highbd_comp_mask_pred_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h,
+           CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+         elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1HighbdCompMaskVarianceTest, CheckOutput) {
+  // Run with invert_mask = 0, then 1.
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+TEST_P(AV1HighbdCompMaskVarianceTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, AV1HighbdCompMaskVarianceTest,
+    ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2),
+                       ::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Range(8, 13, 2)));
+#endif
+
+#ifndef aom_highbd_comp_mask_pred
+// This test cannot run when aom_highbd_comp_mask_pred is a macro defined to
+// aom_highbd_comp_mask_pred_c, because it reassigns that function pointer.
+class AV1HighbdCompMaskUpVarianceTest : public AV1HighbdCompMaskVarianceTest {
+ public:
+  ~AV1HighbdCompMaskUpVarianceTest();
+
+ protected:
+  void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
+                      int inv);
+  void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
+                    int havSub);
+};
+
+AV1HighbdCompMaskUpVarianceTest::~AV1HighbdCompMaskUpVarianceTest() { ; }
+
+void AV1HighbdCompMaskUpVarianceTest::RunCheckOutput(
+    highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) {
+  int bd_ = GET_PARAM(2);
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+  }
+  for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+    ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+  }
+
+  // Loop through all 8 x 8 (subx, suby) pairs, each component in 0..7.
+  for (int sub = 0; sub < 8 * 8; ++sub) {
+    int subx = sub & 0x7;
+    int suby = (sub >> 3);
+    for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+      const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+      aom_highbd_comp_mask_pred = aom_highbd_comp_mask_pred_c;  // ref
+      aom_highbd_comp_mask_upsampled_pred(
+          NULL, NULL, 0, 0, NULL, comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h,
+          subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_);
+
+      aom_highbd_comp_mask_pred = test_impl;  // test
+      aom_highbd_comp_mask_upsampled_pred(
+          NULL, NULL, 0, 0, NULL, comp_pred2_, CONVERT_TO_BYTEPTR(pred_), w, h,
+          subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_);
+      ASSERT_EQ(CheckResult(w, h), true)
+          << " wedge " << wedge_index << " inv " << inv << "sub (" << subx
+          << "," << suby << ")";
+    }
+  }
+}
+
+void AV1HighbdCompMaskUpVarianceTest::RunSpeedTest(
+    highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int havSub) {
+  int bd_ = GET_PARAM(2);
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int subx = havSub ? 3 : 0;
+  const int suby = havSub ? 4 : 0;
+
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  int wedge_index = wedge_types / 2;
+  const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+  }
+  for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+    ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+  }
+
+  const int num_loops = 1000000000 / (w + h);
+  highbd_comp_mask_pred_func funcs[2] = { &aom_highbd_comp_mask_pred_c,
+                                          test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    aom_highbd_comp_mask_pred = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      aom_highbd_comp_mask_upsampled_pred(
+          NULL, NULL, 0, 0, NULL, comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h,
+          subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0, bd_);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("CompMaskUp[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0],
+         elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1HighbdCompMaskUpVarianceTest, CheckOutput) {
+  // Run with invert_mask = 0, then 1.
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+TEST_P(AV1HighbdCompMaskUpVarianceTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, AV1HighbdCompMaskUpVarianceTest,
+    ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2),
+                       ::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Range(8, 13, 2)));
+#endif
+
+#endif  // ifndef aom_highbd_comp_mask_pred
 }  // namespace AV1CompMaskVariance
diff --git a/third_party/aom/test/convolve_test.cc b/third_party/aom/test/convolve_test.cc
index 7098e8af6..de3f47628 100644
--- a/third_party/aom/test/convolve_test.cc
+++ b/third_party/aom/test/convolve_test.cc
@@ -490,9 +490,9 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
     const InterpFilter filter = (InterpFilter)filter_bank;
     const InterpKernel *filters =
         (const InterpKernel *)av1_get_interp_filter_kernel(filter);
-    const InterpFilterParams filter_params =
+    const InterpFilterParams *filter_params =
         av1_get_interp_filter_params_with_block_size(filter, 8);
-    if (filter_params.taps != SUBPEL_TAPS) continue;
+    if (filter_params->taps != SUBPEL_TAPS) continue;
     for (int i = 0; i < kNumFilters; i++) {
       const int p0 = filters[i][0] + filters[i][1];
       const int p1 = filters[i][2] + filters[i][3];
@@ -528,9 +528,9 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
     const InterpFilter filter = (InterpFilter)filter_bank;
     const InterpKernel *filters =
         (const InterpKernel *)av1_get_interp_filter_kernel(filter);
-    const InterpFilterParams filter_params =
+    const InterpFilterParams *filter_params =
         av1_get_interp_filter_params_with_block_size(filter, 8);
-    if (filter_params.taps != SUBPEL_TAPS) continue;
+    if (filter_params->taps != SUBPEL_TAPS) continue;
 
     for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
       for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
@@ -614,9 +614,9 @@ TEST_P(ConvolveTest, FilterExtremes) {
         const InterpFilter filter = (InterpFilter)filter_bank;
         const InterpKernel *filters =
             (const InterpKernel *)av1_get_interp_filter_kernel(filter);
-        const InterpFilterParams filter_params =
+        const InterpFilterParams *filter_params =
             av1_get_interp_filter_params_with_block_size(filter, 8);
-        if (filter_params.taps != SUBPEL_TAPS) continue;
+        if (filter_params->taps != SUBPEL_TAPS) continue;
         for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
           for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
             wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x],
@@ -713,9 +713,9 @@ TEST_P(ConvolveTest, DISABLED_Speed) {
       const InterpFilter filter = (InterpFilter)filter_bank;
       const InterpKernel *filters =
           (const InterpKernel *)av1_get_interp_filter_kernel(filter);
-      const InterpFilterParams filter_params =
+      const InterpFilterParams *filter_params =
           av1_get_interp_filter_params_with_block_size(filter, 8);
-      if (filter_params.taps != SUBPEL_TAPS) continue;
+      if (filter_params->taps != SUBPEL_TAPS) continue;
 
       for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
         for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
@@ -832,20 +832,25 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
 #endif
 
 #if HAVE_AVX2
-const ConvolveFunctions convolve8_avx2(wrap_convolve_copy_avx2_8,
-                                       wrap_convolve8_horiz_avx2_8,
-                                       wrap_convolve8_vert_avx2_8, 8);
-const ConvolveFunctions convolve10_avx2(wrap_convolve_copy_avx2_10,
-                                        wrap_convolve8_horiz_avx2_10,
-                                        wrap_convolve8_vert_avx2_10, 10);
-const ConvolveFunctions convolve12_avx2(wrap_convolve_copy_avx2_12,
-                                        wrap_convolve8_horiz_avx2_12,
-                                        wrap_convolve8_vert_avx2_12, 12);
-const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES_64(convolve8_avx2),
-                                               ALL_SIZES_64(convolve10_avx2),
-                                               ALL_SIZES_64(convolve12_avx2) };
+const ConvolveFunctions convolve8_avx2(aom_convolve_copy_c,
+                                       aom_convolve8_horiz_avx2,
+                                       aom_convolve8_vert_avx2, 0);
+
+const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve_copy_avx2_8,
+                                            wrap_convolve8_horiz_avx2_8,
+                                            wrap_convolve8_vert_avx2_8, 8);
+const ConvolveFunctions wrap_convolve10_avx2(wrap_convolve_copy_avx2_10,
+                                             wrap_convolve8_horiz_avx2_10,
+                                             wrap_convolve8_vert_avx2_10, 10);
+const ConvolveFunctions wrap_convolve12_avx2(wrap_convolve_copy_avx2_12,
+                                             wrap_convolve8_horiz_avx2_12,
+                                             wrap_convolve8_vert_avx2_12, 12);
+const ConvolveParam kArray_Convolve8_avx2[] = {
+  ALL_SIZES_64(wrap_convolve8_avx2), ALL_SIZES_64(wrap_convolve10_avx2),
+  ALL_SIZES_64(wrap_convolve12_avx2), ALL_SIZES(convolve8_avx2)
+};
 INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve8_avx2));
+                        ::testing::ValuesIn(kArray_Convolve8_avx2));
 #endif  // HAVE_AVX2
 
 }  // namespace
diff --git a/third_party/aom/test/decode_multithreaded_test.cc b/third_party/aom/test/decode_multithreaded_test.cc
index ed9a9ceef..cea1d144f 100644
--- a/third_party/aom/test/decode_multithreaded_test.cc
+++ b/third_party/aom/test/decode_multithreaded_test.cc
@@ -26,13 +26,14 @@ namespace {
 static const int kNumMultiThreadDecoders = 3;
 
 class AV1DecodeMultiThreadedTest
-    : public ::libaom_test::CodecTestWith4Params<int, int, int, int>,
+    : public ::libaom_test::CodecTestWith5Params<int, int, int, int, int>,
       public ::libaom_test::EncoderTest {
  protected:
   AV1DecodeMultiThreadedTest()
       : EncoderTest(GET_PARAM(0)), md5_single_thread_(), md5_multi_thread_(),
         n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)),
-        n_tile_groups_(GET_PARAM(3)), set_cpu_used_(GET_PARAM(4)) {
+        n_tile_groups_(GET_PARAM(3)), set_cpu_used_(GET_PARAM(4)),
+        row_mt_(GET_PARAM(5)) {
     init_flags_ = AOM_CODEC_USE_PSNR;
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
     cfg.w = 704;
@@ -45,14 +46,17 @@ class AV1DecodeMultiThreadedTest
     for (int i = 0; i < kNumMultiThreadDecoders; ++i) {
       cfg.threads <<= 1;
       multi_thread_dec_[i] = codec_->CreateDecoder(cfg, 0);
+      multi_thread_dec_[i]->Control(AV1D_SET_ROW_MT, row_mt_);
     }
 
     if (single_thread_dec_->IsAV1()) {
+      single_thread_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
       single_thread_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
       single_thread_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
     }
     for (int i = 0; i < kNumMultiThreadDecoders; ++i) {
       if (multi_thread_dec_[i]->IsAV1()) {
+        multi_thread_dec_[i]->Control(AV1D_EXT_TILE_DEBUG, 1);
         multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_ROW, -1);
         multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_COL, -1);
       }
@@ -128,6 +132,7 @@ class AV1DecodeMultiThreadedTest
   int n_tile_rows_;
   int n_tile_groups_;
   int set_cpu_used_;
+  int row_mt_;
 };
 
 // run an encode and do the decode both in single thread
@@ -154,16 +159,17 @@ TEST_P(AV1DecodeMultiThreadedTestLarge, MD5Match) {
 // TODO(ranjit): More tests have to be added using pre-generated MD5.
 AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTest, ::testing::Values(1, 2),
                           ::testing::Values(1, 2), ::testing::Values(1),
-                          ::testing::Values(3));
+                          ::testing::Values(3), ::testing::Values(0, 1));
 AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTestLarge,
                           ::testing::Values(0, 1, 2, 6),
                           ::testing::Values(0, 1, 2, 6),
-                          ::testing::Values(1, 4), ::testing::Values(0));
+                          ::testing::Values(1, 4), ::testing::Values(0),
+                          ::testing::Values(0, 1));
 
 class AV1DecodeMultiThreadedLSTestLarge
     : public AV1DecodeMultiThreadedTestLarge {};
 
-TEST_P(AV1DecodeMultiThreadedLSTestLarge, DISABLED_MD5Match) {
+TEST_P(AV1DecodeMultiThreadedLSTestLarge, MD5Match) {
   cfg_.large_scale_tile = 1;
   single_thread_dec_->Control(AV1_SET_TILE_MODE, 1);
   for (int i = 0; i < kNumMultiThreadDecoders; ++i)
@@ -172,8 +178,8 @@ TEST_P(AV1DecodeMultiThreadedLSTestLarge, DISABLED_MD5Match) {
 }
 
 AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedLSTestLarge,
-                          ::testing::Values(1, 2, 32),
-                          ::testing::Values(1, 2, 32), ::testing::Values(1),
-                          ::testing::Values(0, 3));
+                          ::testing::Values(6), ::testing::Values(6),
+                          ::testing::Values(1), ::testing::Values(0, 3),
+                          ::testing::Values(0, 1));
 
 }  // namespace
diff --git a/third_party/aom/test/decode_test_driver.cc b/third_party/aom/test/decode_test_driver.cc
index ed261b527..70de0cff6 100644
--- a/third_party/aom/test/decode_test_driver.cc
+++ b/third_party/aom/test/decode_test_driver.cc
@@ -94,7 +94,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video,
     const aom_image_t *img = NULL;
 
     // Get decompressed data
-    while ((img = dec_iter.Next()))
+    while (!::testing::Test::HasFailure() && (img = dec_iter.Next()))
       DecompressedFrameHook(*img, video->frame_number());
   }
   delete decoder;
diff --git a/third_party/aom/test/dr_prediction_test.cc b/third_party/aom/test/dr_prediction_test.cc
index 22b9832a1..ff2c1de4e 100644
--- a/third_party/aom/test/dr_prediction_test.cc
+++ b/third_party/aom/test/dr_prediction_test.cc
@@ -143,8 +143,8 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
   static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16;
 
   DrPredTest()
-      : upsample_above_(0), upsample_left_(0), bw_(0), bh_(0), dx_(1), dy_(1),
-        bd_(8), txsize_(TX_4X4) {
+      : enable_upsample_(0), upsample_above_(0), upsample_left_(0), bw_(0),
+        bh_(0), dx_(1), dy_(1), bd_(8), txsize_(TX_4X4) {
     params_ = this->GetParam();
     start_angle_ = params_.start_angle;
     stop_angle_ = start_angle_ + 90;
@@ -193,7 +193,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
     OutputTimes(kNumTests, ref_time, tst_time, tx);
   }
 
-  void RunTest(bool speedtest) {
+  void RunTest(bool speedtest, int p_angle) {
     for (int i = 0; i < kBufSize; ++i) {
       above_data_[i] = left_data_[i] = (1 << bd_) - 1;
     }
@@ -212,6 +212,15 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
       bw_ = tx_size_wide[kTxSize[tx]];
       bh_ = tx_size_high[kTxSize[tx]];
 
+      if (enable_upsample_) {
+        upsample_above_ =
+            av1_use_intra_edge_upsample(bw_, bh_, p_angle - 90, 0);
+        upsample_left_ =
+            av1_use_intra_edge_upsample(bw_, bh_, p_angle - 180, 0);
+      } else {
+        upsample_above_ = upsample_left_ = 0;
+      }
+
       Predict(speedtest, tx);
 
       for (int r = 0; r < bh_; ++r) {
@@ -252,6 +261,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
   Pixel *left_;
   int dst_stride_;
 
+  int enable_upsample_;
   int upsample_above_;
   int upsample_left_;
   int bw_;
@@ -273,25 +283,25 @@ class LowbdDrPredTest : public DrPredTest<uint8_t, DrPred> {};
 
 TEST_P(LowbdDrPredTest, SaturatedValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    upsample_above_ = iter & 1;
+    enable_upsample_ = iter & 1;
     for (int angle = start_angle_; angle < stop_angle_; ++angle) {
       dx_ = av1_get_dx(angle);
       dy_ = av1_get_dy(angle);
-      if (dx_ && dy_) RunTest(false);
+      if (dx_ && dy_) RunTest(false, angle);
     }
   }
 }
 
 TEST_P(LowbdDrPredTest, DISABLED_Speed) {
   const int angles[] = { 3, 45, 87 };
-  for (upsample_above_ = 0; upsample_above_ < 2; ++upsample_above_) {
-    upsample_left_ = upsample_above_;
+  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
     for (int i = 0; i < 3; ++i) {
-      dx_ = av1_get_dx(angles[i] + start_angle_);
-      dy_ = av1_get_dy(angles[i] + start_angle_);
-      printf("upsample_above: %d upsample_left: %d angle: %d ~~~~~~~~~~~~~~~\n",
-             upsample_above_, upsample_left_, angles[i] + start_angle_);
-      if (dx_ && dy_) RunTest(true);
+      const int angle = angles[i] + start_angle_;
+      dx_ = av1_get_dx(angle);
+      dy_ = av1_get_dy(angle);
+      printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
+             enable_upsample_, angle);
+      if (dx_ && dy_) RunTest(true, angle);
     }
   }
 }
@@ -311,25 +321,25 @@ class HighbdDrPredTest : public DrPredTest<uint16_t, DrPred_Hbd> {};
 
 TEST_P(HighbdDrPredTest, SaturatedValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    upsample_above_ = iter & 1;
+    enable_upsample_ = iter & 1;
     for (int angle = start_angle_; angle < stop_angle_; ++angle) {
       dx_ = av1_get_dx(angle);
       dy_ = av1_get_dy(angle);
-      if (dx_ && dy_) RunTest(false);
+      if (dx_ && dy_) RunTest(false, angle);
     }
   }
 }
 
 TEST_P(HighbdDrPredTest, DISABLED_Speed) {
   const int angles[] = { 3, 45, 87 };
-  for (upsample_above_ = 0; upsample_above_ < 2; ++upsample_above_) {
-    upsample_left_ = upsample_above_;
+  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
     for (int i = 0; i < 3; ++i) {
-      dx_ = av1_get_dx(angles[i] + start_angle_);
-      dy_ = av1_get_dy(angles[i] + start_angle_);
-      printf("upsample_above: %d upsample_left: %d angle: %d ~~~~~~~~~~~~~~~\n",
-             upsample_above_, upsample_left_, angles[i] + start_angle_);
-      if (dx_ && dy_) RunTest(true);
+      const int angle = angles[i] + start_angle_;
+      dx_ = av1_get_dx(angle);
+      dy_ = av1_get_dy(angle);
+      printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
+             enable_upsample_, angle);
+      if (dx_ && dy_) RunTest(true, angle);
     }
   }
 }
diff --git a/third_party/aom/test/dump_obu.sh b/third_party/aom/test/dump_obu.sh
index 182e894f5..da44dd7e6 100755
--- a/third_party/aom/test/dump_obu.sh
+++ b/third_party/aom/test/dump_obu.sh
@@ -44,7 +44,7 @@ aomenc_available() {
 
 encode_test_file() {
   if [ "$(aomenc_available)" = "yes" ]; then
-    local readonly encoder="$(aom_tool_path aomenc)"
+    local encoder="$(aom_tool_path aomenc)"
 
     eval "${encoder}" \
       $(aomenc_encode_test_fast_params) \
diff --git a/third_party/aom/test/encode_test_driver.cc b/third_party/aom/test/encode_test_driver.cc
index b75d7be16..35908430d 100644
--- a/third_party/aom/test/encode_test_driver.cc
+++ b/third_party/aom/test/encode_test_driver.cc
@@ -217,6 +217,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
       // Set dec_cfg.tile_row = -1 and dec_cfg.tile_col = -1 so that the whole
       // frame is decoded.
       decoder->Control(AV1_SET_TILE_MODE, cfg_.large_scale_tile);
+      decoder->Control(AV1D_EXT_TILE_DEBUG, 1);
       decoder->Control(AV1_SET_DECODE_TILE_ROW, -1);
       decoder->Control(AV1_SET_DECODE_TILE_COL, -1);
     }
diff --git a/third_party/aom/test/ethread_test.cc b/third_party/aom/test/ethread_test.cc
index 3dcc2a707..dd9fc2f8d 100644
--- a/third_party/aom/test/ethread_test.cc
+++ b/third_party/aom/test/ethread_test.cc
@@ -20,12 +20,14 @@
 
 namespace {
 class AVxEncoderThreadTest
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+    : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+                                                 int, int>,
       public ::libaom_test::EncoderTest {
  protected:
   AVxEncoderThreadTest()
       : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
-        encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) {
+        encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)),
+        tile_cols_(GET_PARAM(3)), tile_rows_(GET_PARAM(4)) {
     init_flags_ = AOM_CODEC_USE_PSNR;
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
     cfg.w = 1280;
@@ -84,9 +86,8 @@ class AVxEncoderThreadTest
   }
 
   virtual void SetTileSize(libaom_test::Encoder *encoder) {
-    // Encode 4 tile columns.
-    encoder->Control(AV1E_SET_TILE_COLUMNS, 2);
-    encoder->Control(AV1E_SET_TILE_ROWS, 0);
+    encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+    encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
   }
 
   virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
@@ -153,6 +154,8 @@ class AVxEncoderThreadTest
   bool encoder_initialized_;
   ::libaom_test::TestMode encoding_mode_;
   int set_cpu_used_;
+  int tile_cols_;
+  int tile_rows_;
   ::libaom_test::Decoder *decoder_;
   std::vector<size_t> size_enc_;
   std::vector<std::string> md5_enc_;
@@ -177,42 +180,46 @@ TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) {
 AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTest,
                           ::testing::Values(::libaom_test::kTwoPassGood,
                                             ::libaom_test::kOnePassGood),
-                          ::testing::Range(2, 4));
+                          ::testing::Range(2, 4), ::testing::Values(1, 2),
+                          ::testing::Values(0, 1));
 
 AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge,
                           ::testing::Values(::libaom_test::kTwoPassGood,
                                             ::libaom_test::kOnePassGood),
-                          ::testing::Range(0, 2));
+                          ::testing::Range(0, 2), ::testing::Values(0, 1, 2, 6),
+                          ::testing::Values(0, 1, 2, 6));
 
 class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {
   virtual void SetTileSize(libaom_test::Encoder *encoder) {
-    encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
-    // TODO(geza): Start using multiple tile rows when the multi-threaded
-    // encoder can handle them
-    encoder->Control(AV1E_SET_TILE_ROWS, 32);
+    encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+    encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
   }
 };
 
-TEST_P(AVxEncoderThreadLSTest, DISABLED_EncoderResultTest) {
+TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) {
   cfg_.large_scale_tile = 1;
   decoder_->Control(AV1_SET_TILE_MODE, 1);
+  decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
   DoTest();
 }
 
 class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {};
 
-TEST_P(AVxEncoderThreadLSTestLarge, DISABLED_EncoderResultTest) {
+TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) {
   cfg_.large_scale_tile = 1;
   decoder_->Control(AV1_SET_TILE_MODE, 1);
+  decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
   DoTest();
 }
 
 AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTest,
                           ::testing::Values(::libaom_test::kTwoPassGood,
                                             ::libaom_test::kOnePassGood),
-                          ::testing::Range(2, 4));
+                          ::testing::Range(2, 4), ::testing::Values(6),
+                          ::testing::Values(0, 6));
 AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTestLarge,
                           ::testing::Values(::libaom_test::kTwoPassGood,
                                             ::libaom_test::kOnePassGood),
-                          ::testing::Range(0, 2));
+                          ::testing::Range(0, 2), ::testing::Values(6),
+                          ::testing::Values(0, 6));
 }  // namespace
diff --git a/third_party/aom/test/fft_test.cc b/third_party/aom/test/fft_test.cc
index 56187cdbb..5c8ec069c 100644
--- a/third_party/aom/test/fft_test.cc
+++ b/third_party/aom/test/fft_test.cc
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include <math.h>
 
 #include <algorithm>
diff --git a/third_party/aom/test/film_grain_table_test.cc b/third_party/aom/test/film_grain_table_test.cc
index 068814635..524d67d7b 100644
--- a/third_party/aom/test/film_grain_table_test.cc
+++ b/third_party/aom/test/film_grain_table_test.cc
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include <string>
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "aom_dsp/grain_table.h"
diff --git a/third_party/aom/test/intrapred_test.cc b/third_party/aom/test/intrapred_test.cc
index 82f191449..1a1c0fc42 100644
--- a/third_party/aom/test/intrapred_test.cc
+++ b/third_party/aom/test/intrapred_test.cc
@@ -37,6 +37,15 @@ typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
 typedef void (*IntraPred)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                           const uint8_t *left);
 
+}  // namespace
+
+// NOTE: Under gcc version 7.3.0 (Debian 7.3.0-5), if this template is in the
+// anonymous namespace, then we get a strange compiler warning in
+// the begin() and end() methods of the ParamGenerator template class in
+// gtest/internal/gtest-param-util.h:
+//   warning: ‘<anonymous>’ is used uninitialized in this function
+// As a workaround, put this template outside the anonymous namespace.
+// See bug aomedia:2003.
 template <typename FuncType>
 struct IntraPredFunc {
   IntraPredFunc(FuncType pred = NULL, FuncType ref = NULL,
@@ -52,6 +61,8 @@ struct IntraPredFunc {
   int bit_depth;
 };
 
+namespace {
+
 template <typename FuncType, typename Pixel>
 class AV1IntraPredTest
     : public ::testing::TestWithParam<IntraPredFunc<FuncType> > {
diff --git a/third_party/aom/test/lightfield_test.sh b/third_party/aom/test/lightfield_test.sh
new file mode 100755
index 000000000..b957a6b79
--- /dev/null
+++ b/third_party/aom/test/lightfield_test.sh
@@ -0,0 +1,98 @@
+#!/bin/sh
+## Copyright (c) 2018, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the lightfield example.
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: vase10x10.yuv must exist in LIBAOM_TEST_DATA_PATH.
+lightfield_test_verify_environment() {
+  local infile="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv"
+  if [ ! -e "${infile}" ]; then
+    echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+}
+
+# Lightfield round trip: encode, extract tile lists, decode twice, compare.
+lightfield_test() {
+  local img_width=1024
+  local img_height=1024
+  local lf_width=10
+  local lf_height=10
+  local lf_blocksize=5
+  local num_references=4
+  local num_tile_lists=2
+
+  # Encode the lightfield; returns 1 if the encoder binary is missing.
+  local encoder="${LIBAOM_BIN_PATH}/lightfield_encoder${AOM_TEST_EXE_SUFFIX}"
+  local yuv_file="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv"
+  local lf_file="${AOM_TEST_OUTPUT_DIR}/vase10x10.ivf"
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${encoder}" "${img_width}" "${img_height}" \
+      "${yuv_file}" "${lf_file}" "${lf_width}" \
+      "${lf_height}" "${lf_blocksize}" ${devnull}
+
+  [ -e "${lf_file}" ] || return 1
+
+  # Parse lightfield bitstream to construct and output a new bitstream that can
+  # be decoded by an AV1 decoder.
+  local bs_decoder="${LIBAOM_BIN_PATH}/lightfield_bitstream_parsing${AOM_TEST_EXE_SUFFIX}"
+  local tl_file="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.ivf"
+  if [ ! -x "${bs_decoder}" ]; then
+    elog "${bs_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${bs_decoder}" "${lf_file}" "${tl_file}" \
+      "${num_references}" ${devnull}
+
+  [ -e "${tl_file}" ] || return 1
+
+  # Run the lightfield tile list decoder on the tile list bitstream.
+  local tl_decoder="${LIBAOM_BIN_PATH}/lightfield_tile_list_decoder${AOM_TEST_EXE_SUFFIX}"
+  local tl_outfile="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.yuv"
+  if [ ! -x "${tl_decoder}" ]; then
+    elog "${tl_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${tl_decoder}" "${tl_file}" "${tl_outfile}" \
+      "${num_references}" "${num_tile_lists}" ${devnull}
+
+  [ -e "${tl_outfile}" ] || return 1
+
+  # Run the reference lightfield decoder on the original lightfield bitstream.
+  local ref_decoder="${LIBAOM_BIN_PATH}/lightfield_decoder${AOM_TEST_EXE_SUFFIX}"
+  local tl_reffile="${AOM_TEST_OUTPUT_DIR}/vase_reference.yuv"
+  if [ ! -x "${ref_decoder}" ]; then
+    elog "${ref_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${ref_decoder}" "${lf_file}" "${tl_reffile}" \
+      "${num_references}" ${devnull}
+
+  [ -e "${tl_reffile}" ] || return 1
+
+  # The tile list decoder must reproduce the reference decoder output exactly.
+  # Fail on any nonzero diff status: 1 = files differ, >1 = diff itself failed.
+  if ! diff "${tl_outfile}" "${tl_reffile}" > /dev/null; then
+    return 1
+  fi
+}
+
+lightfield_test_tests="lightfield_test"
+
+run_tests lightfield_test_verify_environment "${lightfield_test_tests}"
diff --git a/third_party/aom/test/lpf_test.cc b/third_party/aom/test/lpf_test.cc
index 1e2862ac8..451bffd2a 100644
--- a/third_party/aom/test/lpf_test.cc
+++ b/third_party/aom/test/lpf_test.cc
@@ -581,8 +581,12 @@ INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_hbd,
 const loop_param_t kLoop8Test6[] = {
   make_tuple(&aom_lpf_vertical_14_neon, &aom_lpf_vertical_14_c, 8),
   make_tuple(&aom_lpf_vertical_8_neon, &aom_lpf_vertical_8_c, 8),
+  make_tuple(&aom_lpf_vertical_6_neon, &aom_lpf_vertical_6_c, 8),
+  make_tuple(&aom_lpf_vertical_4_neon, &aom_lpf_vertical_4_c, 8),
+  make_tuple(&aom_lpf_horizontal_14_neon, &aom_lpf_horizontal_14_c, 8),
   make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8),
-  make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8)
+  make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8),
+  make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8)
 };
 
 INSTANTIATE_TEST_CASE_P(NEON, Loop8Test6Param_lbd,
diff --git a/third_party/aom/test/masked_sad_test.cc b/third_party/aom/test/masked_sad_test.cc
index 1a393a001..311f1877d 100644
--- a/third_party/aom/test/masked_sad_test.cc
+++ b/third_party/aom/test/masked_sad_test.cc
@@ -44,14 +44,14 @@ class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> {
   }
 
   virtual void TearDown() { libaom_test::ClearSystemState(); }
+  void runMaskedSADTest(int run_times);
 
  protected:
   MaskedSADFunc maskedSAD_op_;
   MaskedSADFunc ref_maskedSAD_op_;
 };
-
-TEST_P(MaskedSADTest, OperationCheck) {
-  unsigned int ref_ret, ret;
+void MaskedSADTest::runMaskedSADTest(int run_times) {
+  unsigned int ref_ret = 0, ret = 1;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
   DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
@@ -62,7 +62,8 @@ TEST_P(MaskedSADTest, OperationCheck) {
   int src_stride = MAX_SB_SIZE;
   int ref_stride = MAX_SB_SIZE;
   int msk_stride = MAX_SB_SIZE;
-  for (int i = 0; i < number_of_iterations; ++i) {
+  const int iters = run_times == 1 ? number_of_iterations : 1;
+  for (int i = 0; i < iters; ++i) {
     for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
       src_ptr[j] = rnd.Rand8();
       ref_ptr[j] = rnd.Rand8();
@@ -72,24 +73,48 @@ TEST_P(MaskedSADTest, OperationCheck) {
     }
 
     for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
-      ref_ret =
-          ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
+      aom_usec_timer timer;
+      aom_usec_timer_start(&timer);
+      for (int repeat = 0; repeat < run_times; ++repeat) {
+        ref_ret = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
+                                    second_pred_ptr, msk_ptr, msk_stride,
+                                    invert_mask);
+      }
+      aom_usec_timer_mark(&timer);
+      const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      aom_usec_timer_start(&timer);
+      if (run_times == 1) {
+        ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride,
+                                                     ref_ptr, ref_stride,
+                                                     second_pred_ptr, msk_ptr,
+                                                     msk_stride, invert_mask));
+      } else {
+        for (int repeat = 0; repeat < run_times; ++repeat) {
+          ret =
+              maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
                             second_pred_ptr, msk_ptr, msk_stride, invert_mask);
-      ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride, ref_ptr,
-                                                   ref_stride, second_pred_ptr,
-                                                   msk_ptr, msk_stride,
-                                                   invert_mask));
+        }
+      }
+      aom_usec_timer_mark(&timer);
+      const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      if (run_times > 10) {
+        printf("%7.2f/%7.2fns", time1, time2);
+        printf("(%3.2f)\n", time1 / time2);
+      }
       if (ret != ref_ret) {
         err_count++;
         if (first_failure == -1) first_failure = i;
       }
     }
   }
-  EXPECT_EQ(0, err_count)
-      << "Error: Masked SAD Test, C output doesn't match SSSE3 output. "
-      << "First failed at test case " << first_failure;
+  EXPECT_EQ(0, err_count) << "Error: Masked SAD Test,  output doesn't match. "
+                          << "First failed at test case " << first_failure;
 }
 
+TEST_P(MaskedSADTest, OperationCheck) { runMaskedSADTest(1); }
+
+TEST_P(MaskedSADTest, DISABLED_Speed) { runMaskedSADTest(2000000); }
+
 typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride,
                                             const uint8_t *second_pred,
@@ -108,14 +133,14 @@ class HighbdMaskedSADTest
   }
 
   virtual void TearDown() { libaom_test::ClearSystemState(); }
+  void runHighbdMaskedSADTest(int run_times);
 
  protected:
   HighbdMaskedSADFunc maskedSAD_op_;
   HighbdMaskedSADFunc ref_maskedSAD_op_;
 };
-
-TEST_P(HighbdMaskedSADTest, OperationCheck) {
-  unsigned int ref_ret, ret;
+void HighbdMaskedSADTest::runHighbdMaskedSADTest(int run_times) {
+  unsigned int ref_ret = 0, ret = 1;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
   DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
@@ -129,7 +154,8 @@ TEST_P(HighbdMaskedSADTest, OperationCheck) {
   int src_stride = MAX_SB_SIZE;
   int ref_stride = MAX_SB_SIZE;
   int msk_stride = MAX_SB_SIZE;
-  for (int i = 0; i < number_of_iterations; ++i) {
+  const int iters = run_times == 1 ? number_of_iterations : 1;
+  for (int i = 0; i < iters; ++i) {
     for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
       src_ptr[j] = rnd.Rand16() & 0xfff;
       ref_ptr[j] = rnd.Rand16() & 0xfff;
@@ -138,13 +164,34 @@ TEST_P(HighbdMaskedSADTest, OperationCheck) {
     }
 
     for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
-      ref_ret =
-          ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+      aom_usec_timer timer;
+      aom_usec_timer_start(&timer);
+      for (int repeat = 0; repeat < run_times; ++repeat) {
+        ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+                                    second_pred8_ptr, msk_ptr, msk_stride,
+                                    invert_mask);
+      }
+      aom_usec_timer_mark(&timer);
+      const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      aom_usec_timer_start(&timer);
+      if (run_times == 1) {
+        ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
+                                                     ref8_ptr, ref_stride,
+                                                     second_pred8_ptr, msk_ptr,
+                                                     msk_stride, invert_mask));
+      } else {
+        for (int repeat = 0; repeat < run_times; ++repeat) {
+          ret =
+              maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
                             second_pred8_ptr, msk_ptr, msk_stride, invert_mask);
-      ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
-                                                   ref8_ptr, ref_stride,
-                                                   second_pred8_ptr, msk_ptr,
-                                                   msk_stride, invert_mask));
+        }
+      }
+      aom_usec_timer_mark(&timer);
+      const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      if (run_times > 10) {
+        printf("%7.2f/%7.2fns", time1, time2);
+        printf("(%3.2f)\n", time1 / time2);
+      }
       if (ret != ref_ret) {
         err_count++;
         if (first_failure == -1) first_failure = i;
@@ -152,57 +199,144 @@ TEST_P(HighbdMaskedSADTest, OperationCheck) {
     }
   }
   EXPECT_EQ(0, err_count)
-      << "Error: High BD Masked SAD Test, C output doesn't match SSSE3 output. "
+      << "Error: High BD Masked SAD Test, output doesn't match. "
       << "First failed at test case " << first_failure;
 }
 
+TEST_P(HighbdMaskedSADTest, OperationCheck) { runHighbdMaskedSADTest(1); }
+
+TEST_P(HighbdMaskedSADTest, DISABLED_Speed) { runHighbdMaskedSADTest(1000000); }
+
 using ::testing::make_tuple;
 
 #if HAVE_SSSE3
 const MaskedSADParam msad_test[] = {
-  make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
-  make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
-  make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
-  make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c),
-  make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c),
-  make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c),
-  make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c),
-  make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c),
-  make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c),
-  make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c),
-  make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c),
-  make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c),
-  make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c),
-  make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c),
+  make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c),
   make_tuple(&aom_masked_sad4x8_ssse3, &aom_masked_sad4x8_c),
-  make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c)
+  make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c),
+  make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c),
+  make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c),
+  make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c),
+  make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c),
+  make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c),
+  make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c),
+  make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c),
+  make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c),
+  make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c),
+  make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c),
+  make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
+  make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
+  make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
+  make_tuple(&aom_masked_sad4x16_ssse3, &aom_masked_sad4x16_c),
+  make_tuple(&aom_masked_sad16x4_ssse3, &aom_masked_sad16x4_c),
+  make_tuple(&aom_masked_sad8x32_ssse3, &aom_masked_sad8x32_c),
+  make_tuple(&aom_masked_sad32x8_ssse3, &aom_masked_sad32x8_c),
+  make_tuple(&aom_masked_sad16x64_ssse3, &aom_masked_sad16x64_c),
+  make_tuple(&aom_masked_sad64x16_ssse3, &aom_masked_sad64x16_c),
 };
 
-INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, MaskedSADTest,
-                        ::testing::ValuesIn(msad_test));
+INSTANTIATE_TEST_CASE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test));
+
 const HighbdMaskedSADParam hbd_msad_test[] = {
-  make_tuple(&aom_highbd_masked_sad128x128_ssse3,
-             &aom_highbd_masked_sad128x128_c),
-  make_tuple(&aom_highbd_masked_sad128x64_ssse3,
-             &aom_highbd_masked_sad128x64_c),
+  make_tuple(&aom_highbd_masked_sad4x4_ssse3, &aom_highbd_masked_sad4x4_c),
+  make_tuple(&aom_highbd_masked_sad4x8_ssse3, &aom_highbd_masked_sad4x8_c),
+  make_tuple(&aom_highbd_masked_sad8x4_ssse3, &aom_highbd_masked_sad8x4_c),
+  make_tuple(&aom_highbd_masked_sad8x8_ssse3, &aom_highbd_masked_sad8x8_c),
+  make_tuple(&aom_highbd_masked_sad8x16_ssse3, &aom_highbd_masked_sad8x16_c),
+  make_tuple(&aom_highbd_masked_sad16x8_ssse3, &aom_highbd_masked_sad16x8_c),
+  make_tuple(&aom_highbd_masked_sad16x16_ssse3, &aom_highbd_masked_sad16x16_c),
+  make_tuple(&aom_highbd_masked_sad16x32_ssse3, &aom_highbd_masked_sad16x32_c),
+  make_tuple(&aom_highbd_masked_sad32x16_ssse3, &aom_highbd_masked_sad32x16_c),
+  make_tuple(&aom_highbd_masked_sad32x32_ssse3, &aom_highbd_masked_sad32x32_c),
+  make_tuple(&aom_highbd_masked_sad32x64_ssse3, &aom_highbd_masked_sad32x64_c),
+  make_tuple(&aom_highbd_masked_sad64x32_ssse3, &aom_highbd_masked_sad64x32_c),
+  make_tuple(&aom_highbd_masked_sad64x64_ssse3, &aom_highbd_masked_sad64x64_c),
   make_tuple(&aom_highbd_masked_sad64x128_ssse3,
              &aom_highbd_masked_sad64x128_c),
-  make_tuple(&aom_highbd_masked_sad64x64_ssse3, &aom_highbd_masked_sad64x64_c),
-  make_tuple(&aom_highbd_masked_sad64x32_ssse3, &aom_highbd_masked_sad64x32_c),
-  make_tuple(&aom_highbd_masked_sad32x64_ssse3, &aom_highbd_masked_sad32x64_c),
-  make_tuple(&aom_highbd_masked_sad32x32_ssse3, &aom_highbd_masked_sad32x32_c),
-  make_tuple(&aom_highbd_masked_sad32x16_ssse3, &aom_highbd_masked_sad32x16_c),
-  make_tuple(&aom_highbd_masked_sad16x32_ssse3, &aom_highbd_masked_sad16x32_c),
-  make_tuple(&aom_highbd_masked_sad16x16_ssse3, &aom_highbd_masked_sad16x16_c),
-  make_tuple(&aom_highbd_masked_sad16x8_ssse3, &aom_highbd_masked_sad16x8_c),
-  make_tuple(&aom_highbd_masked_sad8x16_ssse3, &aom_highbd_masked_sad8x16_c),
-  make_tuple(&aom_highbd_masked_sad8x8_ssse3, &aom_highbd_masked_sad8x8_c),
-  make_tuple(&aom_highbd_masked_sad8x4_ssse3, &aom_highbd_masked_sad8x4_c),
-  make_tuple(&aom_highbd_masked_sad4x8_ssse3, &aom_highbd_masked_sad4x8_c),
-  make_tuple(&aom_highbd_masked_sad4x4_ssse3, &aom_highbd_masked_sad4x4_c)
+  make_tuple(&aom_highbd_masked_sad128x64_ssse3,
+             &aom_highbd_masked_sad128x64_c),
+  make_tuple(&aom_highbd_masked_sad128x128_ssse3,
+             &aom_highbd_masked_sad128x128_c),
+  make_tuple(&aom_highbd_masked_sad4x16_ssse3, &aom_highbd_masked_sad4x16_c),
+  make_tuple(&aom_highbd_masked_sad16x4_ssse3, &aom_highbd_masked_sad16x4_c),
+  make_tuple(&aom_highbd_masked_sad8x32_ssse3, &aom_highbd_masked_sad8x32_c),
+  make_tuple(&aom_highbd_masked_sad32x8_ssse3, &aom_highbd_masked_sad32x8_c),
+  make_tuple(&aom_highbd_masked_sad16x64_ssse3, &aom_highbd_masked_sad16x64_c),
+  make_tuple(&aom_highbd_masked_sad64x16_ssse3, &aom_highbd_masked_sad64x16_c),
 };
 
-INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, HighbdMaskedSADTest,
+INSTANTIATE_TEST_CASE_P(SSSE3, HighbdMaskedSADTest,
                         ::testing::ValuesIn(hbd_msad_test));
 #endif  // HAVE_SSSE3
+
+#if HAVE_AVX2
+const MaskedSADParam msad_avx2_test[] = {
+  make_tuple(&aom_masked_sad4x4_avx2, &aom_masked_sad4x4_ssse3),
+  make_tuple(&aom_masked_sad4x8_avx2, &aom_masked_sad4x8_ssse3),
+  make_tuple(&aom_masked_sad8x4_avx2, &aom_masked_sad8x4_ssse3),
+  make_tuple(&aom_masked_sad8x8_avx2, &aom_masked_sad8x8_ssse3),
+  make_tuple(&aom_masked_sad8x16_avx2, &aom_masked_sad8x16_ssse3),
+  make_tuple(&aom_masked_sad16x8_avx2, &aom_masked_sad16x8_ssse3),
+  make_tuple(&aom_masked_sad16x16_avx2, &aom_masked_sad16x16_ssse3),
+  make_tuple(&aom_masked_sad16x32_avx2, &aom_masked_sad16x32_ssse3),
+  make_tuple(&aom_masked_sad32x16_avx2, &aom_masked_sad32x16_ssse3),
+  make_tuple(&aom_masked_sad32x32_avx2, &aom_masked_sad32x32_ssse3),
+  make_tuple(&aom_masked_sad32x64_avx2, &aom_masked_sad32x64_ssse3),
+  make_tuple(&aom_masked_sad64x32_avx2, &aom_masked_sad64x32_ssse3),
+  make_tuple(&aom_masked_sad64x64_avx2, &aom_masked_sad64x64_ssse3),
+  make_tuple(&aom_masked_sad64x128_avx2, &aom_masked_sad64x128_ssse3),
+  make_tuple(&aom_masked_sad128x64_avx2, &aom_masked_sad128x64_ssse3),
+  make_tuple(&aom_masked_sad128x128_avx2, &aom_masked_sad128x128_ssse3),
+  make_tuple(&aom_masked_sad4x16_avx2, &aom_masked_sad4x16_ssse3),
+  make_tuple(&aom_masked_sad16x4_avx2, &aom_masked_sad16x4_ssse3),
+  make_tuple(&aom_masked_sad8x32_avx2, &aom_masked_sad8x32_ssse3),
+  make_tuple(&aom_masked_sad32x8_avx2, &aom_masked_sad32x8_ssse3),
+  make_tuple(&aom_masked_sad16x64_avx2, &aom_masked_sad16x64_ssse3),
+  make_tuple(&aom_masked_sad64x16_avx2, &aom_masked_sad64x16_ssse3)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, MaskedSADTest,
+                        ::testing::ValuesIn(msad_avx2_test));
+
+const HighbdMaskedSADParam hbd_msad_avx2_test[] = {
+  make_tuple(&aom_highbd_masked_sad4x4_avx2, &aom_highbd_masked_sad4x4_ssse3),
+  make_tuple(&aom_highbd_masked_sad4x8_avx2, &aom_highbd_masked_sad4x8_ssse3),
+  make_tuple(&aom_highbd_masked_sad8x4_avx2, &aom_highbd_masked_sad8x4_ssse3),
+  make_tuple(&aom_highbd_masked_sad8x8_avx2, &aom_highbd_masked_sad8x8_ssse3),
+  make_tuple(&aom_highbd_masked_sad8x16_avx2, &aom_highbd_masked_sad8x16_ssse3),
+  make_tuple(&aom_highbd_masked_sad16x8_avx2, &aom_highbd_masked_sad16x8_ssse3),
+  make_tuple(&aom_highbd_masked_sad16x16_avx2,
+             &aom_highbd_masked_sad16x16_ssse3),
+  make_tuple(&aom_highbd_masked_sad16x32_avx2,
+             &aom_highbd_masked_sad16x32_ssse3),
+  make_tuple(&aom_highbd_masked_sad32x16_avx2,
+             &aom_highbd_masked_sad32x16_ssse3),
+  make_tuple(&aom_highbd_masked_sad32x32_avx2,
+             &aom_highbd_masked_sad32x32_ssse3),
+  make_tuple(&aom_highbd_masked_sad32x64_avx2,
+             &aom_highbd_masked_sad32x64_ssse3),
+  make_tuple(&aom_highbd_masked_sad64x32_avx2,
+             &aom_highbd_masked_sad64x32_ssse3),
+  make_tuple(&aom_highbd_masked_sad64x64_avx2,
+             &aom_highbd_masked_sad64x64_ssse3),
+  make_tuple(&aom_highbd_masked_sad64x128_avx2,
+             &aom_highbd_masked_sad64x128_ssse3),
+  make_tuple(&aom_highbd_masked_sad128x64_avx2,
+             &aom_highbd_masked_sad128x64_ssse3),
+  make_tuple(&aom_highbd_masked_sad128x128_avx2,
+             &aom_highbd_masked_sad128x128_ssse3),
+  make_tuple(&aom_highbd_masked_sad4x16_avx2, &aom_highbd_masked_sad4x16_ssse3),
+  make_tuple(&aom_highbd_masked_sad16x4_avx2, &aom_highbd_masked_sad16x4_ssse3),
+  make_tuple(&aom_highbd_masked_sad8x32_avx2, &aom_highbd_masked_sad8x32_ssse3),
+  make_tuple(&aom_highbd_masked_sad32x8_avx2, &aom_highbd_masked_sad32x8_ssse3),
+  make_tuple(&aom_highbd_masked_sad16x64_avx2,
+             &aom_highbd_masked_sad16x64_ssse3),
+  make_tuple(&aom_highbd_masked_sad64x16_avx2,
+             &aom_highbd_masked_sad64x16_ssse3)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, HighbdMaskedSADTest,
+                        ::testing::ValuesIn(hbd_msad_avx2_test));
+#endif  // HAVE_AVX2
+
 }  // namespace
diff --git a/third_party/aom/test/noise_model_test.cc b/third_party/aom/test/noise_model_test.cc
index 9b7fff8a2..b5b387e31 100644
--- a/third_party/aom/test/noise_model_test.cc
+++ b/third_party/aom/test/noise_model_test.cc
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include <math.h>
 #include <algorithm>
 #include <vector>
diff --git a/third_party/aom/test/obmc_sad_test.cc b/third_party/aom/test/obmc_sad_test.cc
index 1820da266..6cef86961 100644
--- a/third_party/aom/test/obmc_sad_test.cc
+++ b/third_party/aom/test/obmc_sad_test.cc
@@ -108,6 +108,29 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcSadTest,
                         ::testing::ValuesIn(sse4_functions));
 #endif  // HAVE_SSE4_1
 
+#if HAVE_AVX2
+const ObmcSadTest::ParamType avx2_functions[] = {
+  TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_avx2),
+  TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_avx2),
+  TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_avx2),
+  TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_avx2),
+  TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_avx2),
+  TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_avx2),
+  TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_avx2),
+  TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_avx2),
+  TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_avx2),
+  TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_avx2),
+  TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_avx2),
+  TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_avx2),
+  TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_avx2),
+  TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_avx2),
+  TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_avx2),
+  TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_avx2)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, ObmcSadTest, ::testing::ValuesIn(avx2_functions));
+#endif  // HAVE_AVX2
+
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
@@ -187,4 +210,28 @@ ObmcSadHBDTest::ParamType sse4_functions_hbd[] = {
 INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcSadHBDTest,
                         ::testing::ValuesIn(sse4_functions_hbd));
 #endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+ObmcSadHBDTest::ParamType avx2_functions_hbd[] = {
+  TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_avx2),
+  TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_avx2)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, ObmcSadHBDTest,
+                        ::testing::ValuesIn(avx2_functions_hbd));
+#endif  // HAVE_AVX2
 }  // namespace
diff --git a/third_party/aom/test/reconinter_test.cc b/third_party/aom/test/reconinter_test.cc
index 4f74c817e..9b849404c 100644
--- a/third_party/aom/test/reconinter_test.cc
+++ b/third_party/aom/test/reconinter_test.cc
@@ -28,12 +28,30 @@
 namespace {
 using libaom_test::ACMRandom;
 
-class BuildCompDiffwtdMaskTest : public ::testing::TestWithParam<int> {
+typedef void (*buildcompdiffwtdmaskd_func)(uint8_t *mask,
+                                           DIFFWTD_MASK_TYPE mask_type,
+                                           const uint8_t *src0, int src0_stride,
+                                           const uint8_t *src1, int src1_stride,
+                                           int h, int w);
+
+typedef ::testing::tuple<BLOCK_SIZE, buildcompdiffwtdmaskd_func>
+    BuildCompDiffwtdMaskDParam;
+
+#if HAVE_SSE4_1
+::testing::internal::ParamGenerator<BuildCompDiffwtdMaskDParam> BuildParams(
+    buildcompdiffwtdmaskd_func filter) {
+  return ::testing::Combine(::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL),
+                            ::testing::Values(filter));
+}
+#endif
+
+class BuildCompDiffwtdMaskTest
+    : public ::testing::TestWithParam<BuildCompDiffwtdMaskDParam> {
  public:
   virtual ~BuildCompDiffwtdMaskTest() {}
 
   virtual void TearDown() { libaom_test::ClearSystemState(); }
-  void RunTest(const int sb_type, const int is_speed,
+  void RunTest(buildcompdiffwtdmaskd_func test_impl, const int is_speed,
                const DIFFWTD_MASK_TYPE type);
 
  private:
@@ -159,8 +177,10 @@ void BuildCompDiffwtdMaskD16Test::RunSpeedTest(
          width, height, 1000.0 * elapsed_time1 / num_loops);
 }
 #if HAVE_SSE4_1
-void BuildCompDiffwtdMaskTest::RunTest(const int sb_type, const int is_speed,
+void BuildCompDiffwtdMaskTest::RunTest(buildcompdiffwtdmaskd_func test_impl,
+                                       const int is_speed,
                                        const DIFFWTD_MASK_TYPE type) {
+  const int sb_type = GET_PARAM(0);
   const int width = block_size_wide[sb_type];
   const int height = block_size_high[sb_type];
   DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]);
@@ -182,8 +202,7 @@ void BuildCompDiffwtdMaskTest::RunTest(const int sb_type, const int is_speed,
   const double t1 = get_time_mark(&timer);
   aom_usec_timer_start(&timer);
   for (int i = 0; i < run_times; ++i) {
-    av1_build_compound_diffwtd_mask_sse4_1(mask_test, type, src0, width, src1,
-                                           width, height, width);
+    test_impl(mask_test, type, src0, width, src1, width, height, width);
   }
   const double t2 = get_time_mark(&timer);
   if (is_speed) {
@@ -200,12 +219,12 @@ void BuildCompDiffwtdMaskTest::RunTest(const int sb_type, const int is_speed,
 }
 
 TEST_P(BuildCompDiffwtdMaskTest, match) {
-  RunTest(GetParam(), 0, DIFFWTD_38);
-  RunTest(GetParam(), 0, DIFFWTD_38_INV);
+  RunTest(GET_PARAM(1), 0, DIFFWTD_38);
+  RunTest(GET_PARAM(1), 0, DIFFWTD_38_INV);
 }
 TEST_P(BuildCompDiffwtdMaskTest, DISABLED_Speed) {
-  RunTest(GetParam(), 1, DIFFWTD_38);
-  RunTest(GetParam(), 1, DIFFWTD_38_INV);
+  RunTest(GET_PARAM(1), 1, DIFFWTD_38);
+  RunTest(GET_PARAM(1), 1, DIFFWTD_38_INV);
 }
 #endif
 TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
@@ -218,8 +237,7 @@ TEST_P(BuildCompDiffwtdMaskD16Test, DISABLED_Speed) {
 
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(SSE4_1, BuildCompDiffwtdMaskTest,
-                        ::testing::Range(0, static_cast<int>(BLOCK_SIZES_ALL),
-                                         1));
+                        BuildParams(av1_build_compound_diffwtd_mask_sse4_1));
 
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, BuildCompDiffwtdMaskD16Test,
diff --git a/third_party/aom/test/resize_test.cc b/third_party/aom/test/resize_test.cc
index e1c4e9fa5..b270b8362 100644
--- a/third_party/aom/test/resize_test.cc
+++ b/third_party/aom/test/resize_test.cc
@@ -546,12 +546,6 @@ TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDownUpChangeBitRate) {
 #endif
 }
 
-aom_img_fmt_t CspForFrameNumber(int frame) {
-  if (frame < 10) return AOM_IMG_FMT_I420;
-  if (frame < 20) return AOM_IMG_FMT_I444;
-  return AOM_IMG_FMT_I420;
-}
-
 class ResizeCspTest : public ResizeTest {
  protected:
 #if WRITE_COMPRESSED_STREAM
@@ -580,20 +574,6 @@ class ResizeCspTest : public ResizeTest {
 #endif
   }
 
-  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
-                                  libaom_test::Encoder *encoder) {
-    if (CspForFrameNumber(video->frame()) != AOM_IMG_FMT_I420 &&
-        cfg_.g_profile != 1) {
-      cfg_.g_profile = 1;
-      encoder->Config(&cfg_);
-    }
-    if (CspForFrameNumber(video->frame()) == AOM_IMG_FMT_I420 &&
-        cfg_.g_profile != 0) {
-      cfg_.g_profile = 0;
-      encoder->Config(&cfg_);
-    }
-  }
-
   virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
@@ -621,19 +601,13 @@ class ResizeCspTest : public ResizeTest {
 
 class ResizingCspVideoSource : public ::libaom_test::DummyVideoSource {
  public:
-  ResizingCspVideoSource() {
+  explicit ResizingCspVideoSource(aom_img_fmt_t image_format) {
     SetSize(kInitialWidth, kInitialHeight);
+    SetImageFormat(image_format);
     limit_ = 30;
   }
 
   virtual ~ResizingCspVideoSource() {}
-
- protected:
-  virtual void Next() {
-    ++frame_;
-    SetImageFormat(CspForFrameNumber(frame_));
-    FillFrame();
-  }
 };
 
 #if (defined(DISABLE_TRELLISQ_SEARCH) && DISABLE_TRELLISQ_SEARCH)
@@ -641,14 +615,19 @@ TEST_P(ResizeCspTest, DISABLED_TestResizeCspWorks) {
 #else
 TEST_P(ResizeCspTest, TestResizeCspWorks) {
 #endif
-  ResizingCspVideoSource video;
-  init_flags_ = AOM_CODEC_USE_PSNR;
-  cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
-  cfg_.g_lag_in_frames = 0;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const aom_img_fmt_t image_formats[] = { AOM_IMG_FMT_I420, AOM_IMG_FMT_I444 };
+  for (size_t i = 0; i < GTEST_ARRAY_SIZE_(image_formats); ++i) {
+    ResizingCspVideoSource video(image_formats[i]);
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_profile = (image_formats[i] == AOM_IMG_FMT_I420) ? 0 : 1;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
-  // Check we decoded the same number of frames as we attempted to encode
-  ASSERT_EQ(frame_info_list_.size(), video.limit());
+    // Check we decoded the same number of frames as we attempted to encode
+    ASSERT_EQ(frame_info_list_.size(), video.limit());
+    frame_info_list_.clear();
+  }
 }
 
 AV1_INSTANTIATE_TEST_CASE(ResizeTest,
diff --git a/third_party/aom/test/selfguided_filter_test.cc b/third_party/aom/test/selfguided_filter_test.cc
index 4506a90db..d2d5c6105 100644
--- a/third_party/aom/test/selfguided_filter_test.cc
+++ b/third_party/aom/test/selfguided_filter_test.cc
@@ -208,6 +208,11 @@ INSTANTIATE_TEST_CASE_P(AVX2, AV1SelfguidedFilterTest,
                         ::testing::Values(apply_selfguided_restoration_avx2));
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, AV1SelfguidedFilterTest,
+                        ::testing::Values(apply_selfguided_restoration_neon));
+#endif
+
 // Test parameter list:
 //  <tst_fun_, bit_depth>
 typedef tuple<SgrFunc, int> HighbdFilterTestParam;
@@ -395,5 +400,11 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Combine(::testing::Values(apply_selfguided_restoration_avx2),
                        ::testing::ValuesIn(highbd_params_avx2)));
 #endif
-
+#if HAVE_NEON
+const int highbd_params_neon[] = { 8, 10, 12 };
+INSTANTIATE_TEST_CASE_P(
+    NEON, AV1HighbdSelfguidedFilterTest,
+    ::testing::Combine(::testing::Values(apply_selfguided_restoration_neon),
+                       ::testing::ValuesIn(highbd_params_neon)));
+#endif
 }  // namespace
diff --git a/third_party/aom/test/test-data.sha1 b/third_party/aom/test/test-data.sha1
index 67aeb5208..723e06104 100644
--- a/third_party/aom/test/test-data.sha1
+++ b/third_party/aom/test/test-data.sha1
@@ -356,4 +356,134 @@ c8baedb48fd5d4c956aa8d73fd957370f718f047 *av1-1-b8-01-size-208x226.ivf.md5
 25a4940922761239809d82c45c2be1c5e4f48785 *av1-1-b8-01-size-224x226.ivf
 87ae7e7558241bf3575a333f56fbad4dfdade8ff *av1-1-b8-01-size-224x226.ivf.md5
 40dd208eb525cd90d7c0674cf787097fb909afae *av1-1-b8-01-size-226x226.ivf
-34bdef682a4eae0e0a05e4486a968af1df8b220a *av1-1-b8-01-size-226x226.ivf.md5
-\ No newline at end of file
+34bdef682a4eae0e0a05e4486a968af1df8b220a *av1-1-b8-01-size-226x226.ivf.md5
+9bbe8499796aa588ff02e313fb0d4349940d2fea *av1-1-b10-00-quantizer-00.ivf
+36b402eedad2bacee8ac09acce44e2fc356dd80b *av1-1-b10-00-quantizer-00.ivf.md5
+1d5e1d2827624f328020bf123df213bb175577e0 *av1-1-b10-00-quantizer-01.ivf
+16c529be5502369e43ce9c6fe99a9709968e3daf *av1-1-b10-00-quantizer-01.ivf.md5
+39abc20739242a8f05efd4b35d7603c8ad7ff45d *av1-1-b10-00-quantizer-02.ivf
+81faa72c3d43b003966fe09ffaae51b07b1059be *av1-1-b10-00-quantizer-02.ivf.md5
+92ebf349b803333a43824a83d997b8cf76f656f9 *av1-1-b10-00-quantizer-03.ivf
+5e7556dc998cb8b506a43cc078e30802d7e600e6 *av1-1-b10-00-quantizer-03.ivf.md5
+1c496177c66e49f2e3556af87ec67afb5060170b *av1-1-b10-00-quantizer-04.ivf
+560fea4800a44fe19ed8d3e74f425bdbf1fb8abd *av1-1-b10-00-quantizer-04.ivf.md5
+7de864b8475ce0acd0ecb01827f2c9add815352b *av1-1-b10-00-quantizer-05.ivf
+1c1aea3db3f54a91866d89fd3b1a0d285ca10310 *av1-1-b10-00-quantizer-05.ivf.md5
+b6501c165619b036d0f7864fd4739973d2d18970 *av1-1-b10-00-quantizer-06.ivf
+d758c8eff275651006c41e7dd447cac13b489ad7 *av1-1-b10-00-quantizer-06.ivf.md5
+e4df6f588f156dffaafd9517b64f753cfc9ccf05 *av1-1-b10-00-quantizer-07.ivf
+3c577f67dade4537de642fd457ea2b367424f336 *av1-1-b10-00-quantizer-07.ivf.md5
+07e9c4c18abb36c8699c1c12bebcc727f090b525 *av1-1-b10-00-quantizer-08.ivf
+4981568ade3170f311cb114fa2689edc4bc35e67 *av1-1-b10-00-quantizer-08.ivf.md5
+2268ecd2899f1b41ae9898925b1d62cfefa30282 *av1-1-b10-00-quantizer-09.ivf
+029b03029b65b7c4c208961f0820467ad42fd3d6 *av1-1-b10-00-quantizer-09.ivf.md5
+3d2adaf6441cfa9585dcbf7d19d65bf6992a29a3 *av1-1-b10-00-quantizer-10.ivf
+017b7fb4c3ba0747c2d5688d493da33ef993d110 *av1-1-b10-00-quantizer-10.ivf.md5
+006535760bd7dc1cfc95e648b05215954a2e76c2 *av1-1-b10-00-quantizer-11.ivf
+c0ae083deb8e820aa49034af4d100944dd977018 *av1-1-b10-00-quantizer-11.ivf.md5
+840e0cbfe1acc8a7a45c823dc55ab44a0b6b553e *av1-1-b10-00-quantizer-12.ivf
+49232ea38bdef650c94808f53834f1137cd4bf39 *av1-1-b10-00-quantizer-12.ivf.md5
+04b0e5a7387e07474f51be4b2c3e05211b40f0d0 *av1-1-b10-00-quantizer-13.ivf
+a51b5ec4b890df3a64f9f0d866b8c41296c9e081 *av1-1-b10-00-quantizer-13.ivf.md5
+5dc47a140fbcbf08bf91481ee3585e9e067561ab *av1-1-b10-00-quantizer-14.ivf
+2625319eef69d6225e6ab6e5ce7790491406cb5d *av1-1-b10-00-quantizer-14.ivf.md5
+f866be86d8d8aa08ded30e42988b0936c1a16064 *av1-1-b10-00-quantizer-15.ivf
+03b7c1eefb54d99e30051c7123c0453f04a6579d *av1-1-b10-00-quantizer-15.ivf.md5
+548df2371dfb485419ed9baf28e3f495c64f364a *av1-1-b10-00-quantizer-16.ivf
+8a0d6bf1626b05b65c77331305414fe9be54e8c6 *av1-1-b10-00-quantizer-16.ivf.md5
+0077c82f96a2e095a3cb8de9bfa63715e3c9f438 *av1-1-b10-00-quantizer-17.ivf
+5d85f77f3087f4b206930722a945c60039262be4 *av1-1-b10-00-quantizer-17.ivf.md5
+1e0f1245ecb4c903b5dc7072d959fc43a7bba381 *av1-1-b10-00-quantizer-18.ivf
+06316ae2b45f2359a70cc3855ffd6ab81048b41a *av1-1-b10-00-quantizer-18.ivf.md5
+f197198f7ec058110185fda5297a1a43993654df *av1-1-b10-00-quantizer-19.ivf
+bac522c7f234d506c75b5495d74b3fa57c83a4df *av1-1-b10-00-quantizer-19.ivf.md5
+c2f57324d000b349323f37d5ebebde8c2b861f30 *av1-1-b10-00-quantizer-20.ivf
+999c6110786cbc25e67792234a5a02f2cb4553c7 *av1-1-b10-00-quantizer-20.ivf.md5
+2ffad9adfd19286fe2166ba877289d201c9a634f *av1-1-b10-00-quantizer-21.ivf
+d55713eaa791cfd7bf69b6c26d5032029d9a0f06 *av1-1-b10-00-quantizer-21.ivf.md5
+382528db53328c1a38976f5d9b579eef35d839f4 *av1-1-b10-00-quantizer-22.ivf
+cb5bd459e1a90126da9264cff4281515f95755b2 *av1-1-b10-00-quantizer-22.ivf.md5
+b52cc6160fc66f72ad66c198d275a1c73f925022 *av1-1-b10-00-quantizer-23.ivf
+c0f9d6659e1f283e9356fd7b4ac9f7cc5544cdc2 *av1-1-b10-00-quantizer-23.ivf.md5
+e11f15e3b63e7606b1122bb3670ee77c09c04840 *av1-1-b10-00-quantizer-24.ivf
+e9f141b924440e044270c81a68458fe498599a8e *av1-1-b10-00-quantizer-24.ivf.md5
+fb91793b69824c99b0218788dcea0a74ebd7e84e *av1-1-b10-00-quantizer-25.ivf
+434e33d609b2683c3cfbcc3a2cdfc26339590fb6 *av1-1-b10-00-quantizer-25.ivf.md5
+d82e38f31cdcf8b43479e6ddaa83373de38f70a2 *av1-1-b10-00-quantizer-26.ivf
+183943b851ba383a536f13c83b93f61ac8961ad5 *av1-1-b10-00-quantizer-26.ivf.md5
+6bf5e4e8e0aca699e493b9eb3672d2117494d74d *av1-1-b10-00-quantizer-27.ivf
+f0fb7e0a99180828b0e38b2cfe0622eecc2d26b8 *av1-1-b10-00-quantizer-27.ivf.md5
+d5adee2567544c3ae4223b3f3528a770377878d2 *av1-1-b10-00-quantizer-28.ivf
+14edf588efc67570e529b0ff8aeb8e7a0c69238b *av1-1-b10-00-quantizer-28.ivf.md5
+e6dcdc106847956035e3f00aabf4470f97e1887e *av1-1-b10-00-quantizer-29.ivf
+413c5cb778611c7c1a810b53861b9ab1fb391f17 *av1-1-b10-00-quantizer-29.ivf.md5
+b5e98b3f6b1db04d46bf43064c6ac64f797aff00 *av1-1-b10-00-quantizer-30.ivf
+d1a603661d76c28658c7cd2892b408e91d77893e *av1-1-b10-00-quantizer-30.ivf.md5
+80168371d1150e82e3f46bcbbcabba458b835b19 *av1-1-b10-00-quantizer-31.ivf
+904ecd033d4af5239c4d5b3f86e51ed5c3c2e3fb *av1-1-b10-00-quantizer-31.ivf.md5
+96291f6ace85980892d135a5b74188cd629c325f *av1-1-b10-00-quantizer-32.ivf
+a5ceace390d4a75d48281fe29060c21557e4f5ae *av1-1-b10-00-quantizer-32.ivf.md5
+0f80495de34eae07c4905b72573a315a879390ec *av1-1-b10-00-quantizer-33.ivf
+72b8f662973a660412946687dff878b276ae518e *av1-1-b10-00-quantizer-33.ivf.md5
+24905e3be7db320994b7fb8311dfd50a7c9e54da *av1-1-b10-00-quantizer-34.ivf
+cea514bb1b7b064c4d31914a2cb266611c278577 *av1-1-b10-00-quantizer-34.ivf.md5
+083012960dd7c17d3b00fa0e807759c98faded8f *av1-1-b10-00-quantizer-35.ivf
+de5fdb9e1e581484af1cc7d2dd3c3e84c90cebb2 *av1-1-b10-00-quantizer-35.ivf.md5
+f725f179aeee5b413620c0dd81b007b245c2a7ed *av1-1-b10-00-quantizer-36.ivf
+246b1931c04c02df1f168090e2650827cd5dbabd *av1-1-b10-00-quantizer-36.ivf.md5
+f6aa824156e9848f237481889a8103eb6130f31d *av1-1-b10-00-quantizer-37.ivf
+a8f78dd15fc2994369a08c2ddddcd0760c62ea5b *av1-1-b10-00-quantizer-37.ivf.md5
+a8dd662338c493aea266b99203e70af25982633f *av1-1-b10-00-quantizer-38.ivf
+09f36d998e85d0450060f540e50b075ae1432fc6 *av1-1-b10-00-quantizer-38.ivf.md5
+d97428871720ed658da6ed0e3f7c15da83387e4c *av1-1-b10-00-quantizer-39.ivf
+8c5230048909ee8f86f87c116f153cd910d0141f *av1-1-b10-00-quantizer-39.ivf.md5
+86e754e55e9b63c6e0a4fef01761414f8a6b61ca *av1-1-b10-00-quantizer-40.ivf
+99a71accf6457264e45ca80d3b1f082ee5acdecc *av1-1-b10-00-quantizer-40.ivf.md5
+9d18b7236506ab7e107c062620b64096ec0cf423 *av1-1-b10-00-quantizer-41.ivf
+5771159a9a7c7b66c9e13bb13ec3d53b37860208 *av1-1-b10-00-quantizer-41.ivf.md5
+54b72bc879a80e66613f421e67db62bba1c0041b *av1-1-b10-00-quantizer-42.ivf
+bf958236883ee7209ef4cb0b7503b430634a291e *av1-1-b10-00-quantizer-42.ivf.md5
+a06d5321a51d90404dd7085ae511d7df5d5e1e05 *av1-1-b10-00-quantizer-43.ivf
+ddb25723d976043d863634b9dc3b5fb84a245803 *av1-1-b10-00-quantizer-43.ivf.md5
+2ea0b64c170d7299dae1c14a8a49349aee8e0d08 *av1-1-b10-00-quantizer-44.ivf
+d18bde1b4893792173fa2014665e9364395ad5e9 *av1-1-b10-00-quantizer-44.ivf.md5
+73e506a32d3518e23424f231c7b5323d7a34a3d6 *av1-1-b10-00-quantizer-45.ivf
+be6224ebc77a3e5fb9c1645b876007e584a09d89 *av1-1-b10-00-quantizer-45.ivf.md5
+841223871374464194edc739c48dc7cefd1ff255 *av1-1-b10-00-quantizer-46.ivf
+4766d616f923496a8dc113c9b7f875f0c0735f9a *av1-1-b10-00-quantizer-46.ivf.md5
+8bbbbea130aaea453f7b826956a5520d10a0eccf *av1-1-b10-00-quantizer-47.ivf
+3ea21fac0c492b03d8ec25e4ee0971cd57e5f71a *av1-1-b10-00-quantizer-47.ivf.md5
+3ce83e0f1e1835b9a6c10fe502a16fd3650839e0 *av1-1-b10-00-quantizer-48.ivf
+b468de2c09fca5a6b2bb7a20bab4afd8d192c31d *av1-1-b10-00-quantizer-48.ivf.md5
+f3a757c678aa00f9a9c4c4658d37733fd935925a *av1-1-b10-00-quantizer-49.ivf
+f888dc88db576122695d4eb41c486aacd28a2d1d *av1-1-b10-00-quantizer-49.ivf.md5
+a9d78aaef105cc5a95b7ebb54783f37e75673123 *av1-1-b10-00-quantizer-50.ivf
+06d0c5e79cc794030c4be022089b1d12c1383f71 *av1-1-b10-00-quantizer-50.ivf.md5
+165c20ee372f83682d094541097e375227353239 *av1-1-b10-00-quantizer-51.ivf
+b3d90214b8c6e6f6d9357bb5784d10081325c356 *av1-1-b10-00-quantizer-51.ivf.md5
+5b3ea7a18654d943065f5c176974c3960b56664e *av1-1-b10-00-quantizer-52.ivf
+dc61a6e4e2549074130023b14b137fb4fe442ce3 *av1-1-b10-00-quantizer-52.ivf.md5
+74c3b5851b6a94d33b575a689eb8d34592e95d5f *av1-1-b10-00-quantizer-53.ivf
+a80e43a0fb2b852426bd941b8d4b8f56690e9bc9 *av1-1-b10-00-quantizer-53.ivf.md5
+d05b8dea2cddd4f0d9e792f42f71afbd29f7811c *av1-1-b10-00-quantizer-54.ivf
+432937893321f4bd25fa400b8988c5788cb06ecf *av1-1-b10-00-quantizer-54.ivf.md5
+4eaee0f1970426be0bbeb7d4fccdc7e804e9bea4 *av1-1-b10-00-quantizer-55.ivf
+710ab95ce1dcd2540db4477ff4ee6ab771fe0759 *av1-1-b10-00-quantizer-55.ivf.md5
+fe637930c9faa8744cba37effc4cb5510315d1c0 *av1-1-b10-00-quantizer-56.ivf
+2f9431b30523fb6a3e4122f22c6c3ff7b96a7987 *av1-1-b10-00-quantizer-56.ivf.md5
+ed54fc7fcec194eef1f50adbbe12a6a36ab6836b *av1-1-b10-00-quantizer-57.ivf
+43bccac7800b399210cf15520a83739c23a5d9c7 *av1-1-b10-00-quantizer-57.ivf.md5
+a7b8d628ba3e4c5f37aa6a3d7b82afda73ac89dc *av1-1-b10-00-quantizer-58.ivf
+b26638272b787df54f45a46629b852acbcb73e3d *av1-1-b10-00-quantizer-58.ivf.md5
+c077f22ff547fb5ffd020e8dac91d05942fb52df *av1-1-b10-00-quantizer-59.ivf
+4efd99cc0891bf345b8cd2ae8e21709d61be497b *av1-1-b10-00-quantizer-59.ivf.md5
+301ab53039d75e1ffa8cc6a0874d9ea94e4a6a0d *av1-1-b10-00-quantizer-60.ivf
+4729bd734a6edd2d8d0432a3f66b3d91d565050e *av1-1-b10-00-quantizer-60.ivf.md5
+c78640d3211034df9fcb273bdfc18625819652f2 *av1-1-b10-00-quantizer-61.ivf
+3d823eb2b33ccfea68db506626bcbecf49b0f167 *av1-1-b10-00-quantizer-61.ivf.md5
+bf241a449a28773b93e6e529a06dfc28109577e4 *av1-1-b10-00-quantizer-62.ivf
+75457d8476f1927f737d089dcf3d0f7f99f3c4fb *av1-1-b10-00-quantizer-62.ivf.md5
+8b6eb3fff2e0db7eac775b08c745250ca591e2d9 *av1-1-b10-00-quantizer-63.ivf
+63ea689d025593e5d91760785b8e446d04d4671e *av1-1-b10-00-quantizer-63.ivf.md5
+a9f7ea6312a533cc6426a6145edd190d45813c37 *av1-1-b8-02-allintra.ivf
+8fd8f789cfee1069d20f3e2c241f5cad7292239e *av1-1-b8-02-allintra.ivf.md5
+\ No newline at end of file
diff --git a/third_party/aom/test/test.cmake b/third_party/aom/test/test.cmake
index 8594d059c..7b584880f 100644
--- a/third_party/aom/test/test.cmake
+++ b/third_party/aom/test/test.cmake
@@ -405,7 +405,7 @@ function(setup_aom_test_targets)
         OR (CONFIG_AV1_DECODER AND  "${var}" MATCHES "_TEST_DECODER_"))
       list(APPEND aom_test_source_vars ${var})
     endif()
-    # cmake-format:on
+    # cmake-format: on
   endforeach()
 
   # Libaom_test_srcs.txt generation.
diff --git a/third_party/aom/test/test_data_util.cmake b/third_party/aom/test/test_data_util.cmake
index bbdd5f4a2..9fe00a07d 100644
--- a/third_party/aom/test/test_data_util.cmake
+++ b/third_party/aom/test/test_data_util.cmake
@@ -165,6 +165,134 @@ if(CONFIG_AV1_DECODER)
               "av1-1-b8-00-quantizer-62.ivf.md5"
               "av1-1-b8-00-quantizer-63.ivf"
               "av1-1-b8-00-quantizer-63.ivf.md5"
+              "av1-1-b10-00-quantizer-00.ivf"
+              "av1-1-b10-00-quantizer-00.ivf.md5"
+              "av1-1-b10-00-quantizer-01.ivf"
+              "av1-1-b10-00-quantizer-01.ivf.md5"
+              "av1-1-b10-00-quantizer-02.ivf"
+              "av1-1-b10-00-quantizer-02.ivf.md5"
+              "av1-1-b10-00-quantizer-03.ivf"
+              "av1-1-b10-00-quantizer-03.ivf.md5"
+              "av1-1-b10-00-quantizer-04.ivf"
+              "av1-1-b10-00-quantizer-04.ivf.md5"
+              "av1-1-b10-00-quantizer-05.ivf"
+              "av1-1-b10-00-quantizer-05.ivf.md5"
+              "av1-1-b10-00-quantizer-06.ivf"
+              "av1-1-b10-00-quantizer-06.ivf.md5"
+              "av1-1-b10-00-quantizer-07.ivf"
+              "av1-1-b10-00-quantizer-07.ivf.md5"
+              "av1-1-b10-00-quantizer-08.ivf"
+              "av1-1-b10-00-quantizer-08.ivf.md5"
+              "av1-1-b10-00-quantizer-09.ivf"
+              "av1-1-b10-00-quantizer-09.ivf.md5"
+              "av1-1-b10-00-quantizer-10.ivf"
+              "av1-1-b10-00-quantizer-10.ivf.md5"
+              "av1-1-b10-00-quantizer-11.ivf"
+              "av1-1-b10-00-quantizer-11.ivf.md5"
+              "av1-1-b10-00-quantizer-12.ivf"
+              "av1-1-b10-00-quantizer-12.ivf.md5"
+              "av1-1-b10-00-quantizer-13.ivf"
+              "av1-1-b10-00-quantizer-13.ivf.md5"
+              "av1-1-b10-00-quantizer-14.ivf"
+              "av1-1-b10-00-quantizer-14.ivf.md5"
+              "av1-1-b10-00-quantizer-15.ivf"
+              "av1-1-b10-00-quantizer-15.ivf.md5"
+              "av1-1-b10-00-quantizer-16.ivf"
+              "av1-1-b10-00-quantizer-16.ivf.md5"
+              "av1-1-b10-00-quantizer-17.ivf"
+              "av1-1-b10-00-quantizer-17.ivf.md5"
+              "av1-1-b10-00-quantizer-18.ivf"
+              "av1-1-b10-00-quantizer-18.ivf.md5"
+              "av1-1-b10-00-quantizer-19.ivf"
+              "av1-1-b10-00-quantizer-19.ivf.md5"
+              "av1-1-b10-00-quantizer-20.ivf"
+              "av1-1-b10-00-quantizer-20.ivf.md5"
+              "av1-1-b10-00-quantizer-21.ivf"
+              "av1-1-b10-00-quantizer-21.ivf.md5"
+              "av1-1-b10-00-quantizer-22.ivf"
+              "av1-1-b10-00-quantizer-22.ivf.md5"
+              "av1-1-b10-00-quantizer-23.ivf"
+              "av1-1-b10-00-quantizer-23.ivf.md5"
+              "av1-1-b10-00-quantizer-24.ivf"
+              "av1-1-b10-00-quantizer-24.ivf.md5"
+              "av1-1-b10-00-quantizer-25.ivf"
+              "av1-1-b10-00-quantizer-25.ivf.md5"
+              "av1-1-b10-00-quantizer-26.ivf"
+              "av1-1-b10-00-quantizer-26.ivf.md5"
+              "av1-1-b10-00-quantizer-27.ivf"
+              "av1-1-b10-00-quantizer-27.ivf.md5"
+              "av1-1-b10-00-quantizer-28.ivf"
+              "av1-1-b10-00-quantizer-28.ivf.md5"
+              "av1-1-b10-00-quantizer-29.ivf"
+              "av1-1-b10-00-quantizer-29.ivf.md5"
+              "av1-1-b10-00-quantizer-30.ivf"
+              "av1-1-b10-00-quantizer-30.ivf.md5"
+              "av1-1-b10-00-quantizer-31.ivf"
+              "av1-1-b10-00-quantizer-31.ivf.md5"
+              "av1-1-b10-00-quantizer-32.ivf"
+              "av1-1-b10-00-quantizer-32.ivf.md5"
+              "av1-1-b10-00-quantizer-33.ivf"
+              "av1-1-b10-00-quantizer-33.ivf.md5"
+              "av1-1-b10-00-quantizer-34.ivf"
+              "av1-1-b10-00-quantizer-34.ivf.md5"
+              "av1-1-b10-00-quantizer-35.ivf"
+              "av1-1-b10-00-quantizer-35.ivf.md5"
+              "av1-1-b10-00-quantizer-36.ivf"
+              "av1-1-b10-00-quantizer-36.ivf.md5"
+              "av1-1-b10-00-quantizer-37.ivf"
+              "av1-1-b10-00-quantizer-37.ivf.md5"
+              "av1-1-b10-00-quantizer-38.ivf"
+              "av1-1-b10-00-quantizer-38.ivf.md5"
+              "av1-1-b10-00-quantizer-39.ivf"
+              "av1-1-b10-00-quantizer-39.ivf.md5"
+              "av1-1-b10-00-quantizer-40.ivf"
+              "av1-1-b10-00-quantizer-40.ivf.md5"
+              "av1-1-b10-00-quantizer-41.ivf"
+              "av1-1-b10-00-quantizer-41.ivf.md5"
+              "av1-1-b10-00-quantizer-42.ivf"
+              "av1-1-b10-00-quantizer-42.ivf.md5"
+              "av1-1-b10-00-quantizer-43.ivf"
+              "av1-1-b10-00-quantizer-43.ivf.md5"
+              "av1-1-b10-00-quantizer-44.ivf"
+              "av1-1-b10-00-quantizer-44.ivf.md5"
+              "av1-1-b10-00-quantizer-45.ivf"
+              "av1-1-b10-00-quantizer-45.ivf.md5"
+              "av1-1-b10-00-quantizer-46.ivf"
+              "av1-1-b10-00-quantizer-46.ivf.md5"
+              "av1-1-b10-00-quantizer-47.ivf"
+              "av1-1-b10-00-quantizer-47.ivf.md5"
+              "av1-1-b10-00-quantizer-48.ivf"
+              "av1-1-b10-00-quantizer-48.ivf.md5"
+              "av1-1-b10-00-quantizer-49.ivf"
+              "av1-1-b10-00-quantizer-49.ivf.md5"
+              "av1-1-b10-00-quantizer-50.ivf"
+              "av1-1-b10-00-quantizer-50.ivf.md5"
+              "av1-1-b10-00-quantizer-51.ivf"
+              "av1-1-b10-00-quantizer-51.ivf.md5"
+              "av1-1-b10-00-quantizer-52.ivf"
+              "av1-1-b10-00-quantizer-52.ivf.md5"
+              "av1-1-b10-00-quantizer-53.ivf"
+              "av1-1-b10-00-quantizer-53.ivf.md5"
+              "av1-1-b10-00-quantizer-54.ivf"
+              "av1-1-b10-00-quantizer-54.ivf.md5"
+              "av1-1-b10-00-quantizer-55.ivf"
+              "av1-1-b10-00-quantizer-55.ivf.md5"
+              "av1-1-b10-00-quantizer-56.ivf"
+              "av1-1-b10-00-quantizer-56.ivf.md5"
+              "av1-1-b10-00-quantizer-57.ivf"
+              "av1-1-b10-00-quantizer-57.ivf.md5"
+              "av1-1-b10-00-quantizer-58.ivf"
+              "av1-1-b10-00-quantizer-58.ivf.md5"
+              "av1-1-b10-00-quantizer-59.ivf"
+              "av1-1-b10-00-quantizer-59.ivf.md5"
+              "av1-1-b10-00-quantizer-60.ivf"
+              "av1-1-b10-00-quantizer-60.ivf.md5"
+              "av1-1-b10-00-quantizer-61.ivf"
+              "av1-1-b10-00-quantizer-61.ivf.md5"
+              "av1-1-b10-00-quantizer-62.ivf"
+              "av1-1-b10-00-quantizer-62.ivf.md5"
+              "av1-1-b10-00-quantizer-63.ivf"
+              "av1-1-b10-00-quantizer-63.ivf.md5"
               "av1-1-b8-01-size-16x16.ivf"
               "av1-1-b8-01-size-16x16.ivf.md5"
               "av1-1-b8-01-size-16x18.ivf"
@@ -364,7 +492,9 @@ if(CONFIG_AV1_DECODER)
               "av1-1-b8-01-size-66x64.ivf"
               "av1-1-b8-01-size-66x64.ivf.md5"
               "av1-1-b8-01-size-66x66.ivf"
-              "av1-1-b8-01-size-66x66.ivf.md5")
+              "av1-1-b8-01-size-66x66.ivf.md5"
+              "av1-1-b8-02-allintra.ivf"
+              "av1-1-b8-02-allintra.ivf.md5")
 endif()
 
 if(ENABLE_ENCODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
diff --git a/third_party/aom/test/test_vector_test.cc b/third_party/aom/test/test_vector_test.cc
index 85223177c..1bfeacba1 100644
--- a/third_party/aom/test/test_vector_test.cc
+++ b/third_party/aom/test/test_vector_test.cc
@@ -30,8 +30,9 @@ namespace {
 
 const int kThreads = 0;
 const int kFileName = 1;
+const int kRowMT = 2;
 
-typedef ::testing::tuple<int, const char *> DecodeParam;
+typedef ::testing::tuple<int, const char *, int> DecodeParam;
 
 class TestVectorTest : public ::libaom_test::DecoderTest,
                        public ::libaom_test::CodecTestWithParam<DecodeParam> {
@@ -48,6 +49,12 @@ class TestVectorTest : public ::libaom_test::DecoderTest,
         << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
+  virtual void PreDecodeFrameHook(
+      const libaom_test::CompressedVideoSource &video,
+      libaom_test::Decoder *decoder) {
+    if (video.frame_number() == 0) decoder->Control(AV1D_SET_ROW_MT, row_mt_);
+  }
+
   virtual void DecompressedFrameHook(const aom_image_t &img,
                                      const unsigned int frame_number) {
     ASSERT_TRUE(md5_file_ != NULL);
@@ -60,14 +67,32 @@ class TestVectorTest : public ::libaom_test::DecoderTest,
     expected_md5[32] = '\0';
 
     ::libaom_test::MD5 md5_res;
-    md5_res.Add(&img);
-    const char *actual_md5 = md5_res.Get();
+#if !CONFIG_LOWBITDEPTH
+    const aom_img_fmt_t shifted_fmt =
+        (aom_img_fmt)(img.fmt & ~AOM_IMG_FMT_HIGHBITDEPTH);
+    if (img.bit_depth == 8 && shifted_fmt != img.fmt) {
+      aom_image_t *img_shifted =
+          aom_img_alloc(NULL, shifted_fmt, img.d_w, img.d_h, 16);
+      img_shifted->bit_depth = img.bit_depth;
+      img_shifted->monochrome = img.monochrome;
+      aom_img_downshift(img_shifted, &img, 0);
+      md5_res.Add(img_shifted);
+      aom_img_free(img_shifted);
+    } else {
+#endif
+      md5_res.Add(&img);
+#if !CONFIG_LOWBITDEPTH
+    }
+#endif
 
+    const char *actual_md5 = md5_res.Get();
     // Check md5 match.
     ASSERT_STREQ(expected_md5, actual_md5)
         << "Md5 checksums don't match: frame number = " << frame_number;
   }
 
+  unsigned int row_mt_;
+
  private:
   FILE *md5_file_;
 };
@@ -84,6 +109,7 @@ TEST_P(TestVectorTest, MD5Match) {
   char str[256];
 
   cfg.threads = ::testing::get<kThreads>(input);
+  row_mt_ = ::testing::get<kRowMT>(input);
 
   snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d",
            filename.c_str(), cfg.threads);
@@ -118,17 +144,14 @@ TEST_P(TestVectorTest, MD5Match) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
 }
 
-// TODO(yaowu): Current md5 check works only when CONFIG_LOWBITDEPTH is enabled,
-// remove CONFIG_LOWBITDEPTH when md5 check is reworked to be compatible with
-// CONFIG_LOWBITDEPTH = 0
-#if CONFIG_AV1_DECODER && CONFIG_LOWBITDEPTH
+#if CONFIG_AV1_DECODER
 AV1_INSTANTIATE_TEST_CASE(
     TestVectorTest,
-    ::testing::Combine(
-        ::testing::Values(1),  // Single thread.
-        ::testing::ValuesIn(libaom_test::kAV1TestVectors,
-                            libaom_test::kAV1TestVectors +
-                                libaom_test::kNumAV1TestVectors)));
+    ::testing::Combine(::testing::Values(1),  // Single thread.
+                       ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+                                           libaom_test::kAV1TestVectors +
+                                               libaom_test::kNumAV1TestVectors),
+                       ::testing::Values(0)));
 
 // Test AV1 decode in with different numbers of threads.
 INSTANTIATE_TEST_CASE_P(
@@ -140,7 +163,8 @@ INSTANTIATE_TEST_CASE_P(
             ::testing::Range(2, 9),  // With 2 ~ 8 threads.
             ::testing::ValuesIn(libaom_test::kAV1TestVectors,
                                 libaom_test::kAV1TestVectors +
-                                    libaom_test::kNumAV1TestVectors))));
+                                    libaom_test::kNumAV1TestVectors),
+            ::testing::Range(0, 2))));
 
 #endif  // CONFIG_AV1_DECODER
 
diff --git a/third_party/aom/test/test_vectors.cc b/third_party/aom/test/test_vectors.cc
index a9edf7520..f478c0183 100644
--- a/third_party/aom/test/test_vectors.cc
+++ b/third_party/aom/test/test_vectors.cc
@@ -17,88 +17,121 @@ namespace libaom_test {
 
 #if CONFIG_AV1_DECODER
 const char *const kAV1TestVectors[] = {
-  "av1-1-b8-00-quantizer-00.ivf", "av1-1-b8-00-quantizer-01.ivf",
-  "av1-1-b8-00-quantizer-02.ivf", "av1-1-b8-00-quantizer-03.ivf",
-  "av1-1-b8-00-quantizer-04.ivf", "av1-1-b8-00-quantizer-05.ivf",
-  "av1-1-b8-00-quantizer-06.ivf", "av1-1-b8-00-quantizer-07.ivf",
-  "av1-1-b8-00-quantizer-08.ivf", "av1-1-b8-00-quantizer-09.ivf",
-  "av1-1-b8-00-quantizer-10.ivf", "av1-1-b8-00-quantizer-11.ivf",
-  "av1-1-b8-00-quantizer-12.ivf", "av1-1-b8-00-quantizer-13.ivf",
-  "av1-1-b8-00-quantizer-14.ivf", "av1-1-b8-00-quantizer-15.ivf",
-  "av1-1-b8-00-quantizer-16.ivf", "av1-1-b8-00-quantizer-17.ivf",
-  "av1-1-b8-00-quantizer-18.ivf", "av1-1-b8-00-quantizer-19.ivf",
-  "av1-1-b8-00-quantizer-20.ivf", "av1-1-b8-00-quantizer-21.ivf",
-  "av1-1-b8-00-quantizer-22.ivf", "av1-1-b8-00-quantizer-23.ivf",
-  "av1-1-b8-00-quantizer-24.ivf", "av1-1-b8-00-quantizer-25.ivf",
-  "av1-1-b8-00-quantizer-26.ivf", "av1-1-b8-00-quantizer-27.ivf",
-  "av1-1-b8-00-quantizer-28.ivf", "av1-1-b8-00-quantizer-29.ivf",
-  "av1-1-b8-00-quantizer-30.ivf", "av1-1-b8-00-quantizer-31.ivf",
-  "av1-1-b8-00-quantizer-32.ivf", "av1-1-b8-00-quantizer-33.ivf",
-  "av1-1-b8-00-quantizer-34.ivf", "av1-1-b8-00-quantizer-35.ivf",
-  "av1-1-b8-00-quantizer-36.ivf", "av1-1-b8-00-quantizer-37.ivf",
-  "av1-1-b8-00-quantizer-38.ivf", "av1-1-b8-00-quantizer-39.ivf",
-  "av1-1-b8-00-quantizer-40.ivf", "av1-1-b8-00-quantizer-41.ivf",
-  "av1-1-b8-00-quantizer-42.ivf", "av1-1-b8-00-quantizer-43.ivf",
-  "av1-1-b8-00-quantizer-44.ivf", "av1-1-b8-00-quantizer-45.ivf",
-  "av1-1-b8-00-quantizer-46.ivf", "av1-1-b8-00-quantizer-47.ivf",
-  "av1-1-b8-00-quantizer-48.ivf", "av1-1-b8-00-quantizer-49.ivf",
-  "av1-1-b8-00-quantizer-50.ivf", "av1-1-b8-00-quantizer-51.ivf",
-  "av1-1-b8-00-quantizer-52.ivf", "av1-1-b8-00-quantizer-53.ivf",
-  "av1-1-b8-00-quantizer-54.ivf", "av1-1-b8-00-quantizer-55.ivf",
-  "av1-1-b8-00-quantizer-56.ivf", "av1-1-b8-00-quantizer-57.ivf",
-  "av1-1-b8-00-quantizer-58.ivf", "av1-1-b8-00-quantizer-59.ivf",
-  "av1-1-b8-00-quantizer-60.ivf", "av1-1-b8-00-quantizer-61.ivf",
-  "av1-1-b8-00-quantizer-62.ivf", "av1-1-b8-00-quantizer-63.ivf",
-  "av1-1-b8-01-size-16x16.ivf",   "av1-1-b8-01-size-16x18.ivf",
-  "av1-1-b8-01-size-16x32.ivf",   "av1-1-b8-01-size-16x34.ivf",
-  "av1-1-b8-01-size-16x64.ivf",   "av1-1-b8-01-size-16x66.ivf",
-  "av1-1-b8-01-size-18x16.ivf",   "av1-1-b8-01-size-18x18.ivf",
-  "av1-1-b8-01-size-18x32.ivf",   "av1-1-b8-01-size-18x34.ivf",
-  "av1-1-b8-01-size-18x64.ivf",   "av1-1-b8-01-size-18x66.ivf",
-  "av1-1-b8-01-size-196x196.ivf", "av1-1-b8-01-size-196x198.ivf",
-  "av1-1-b8-01-size-196x200.ivf", "av1-1-b8-01-size-196x202.ivf",
-  "av1-1-b8-01-size-196x208.ivf", "av1-1-b8-01-size-196x210.ivf",
-  "av1-1-b8-01-size-196x224.ivf", "av1-1-b8-01-size-196x226.ivf",
-  "av1-1-b8-01-size-198x196.ivf", "av1-1-b8-01-size-198x198.ivf",
-  "av1-1-b8-01-size-198x200.ivf", "av1-1-b8-01-size-198x202.ivf",
-  "av1-1-b8-01-size-198x208.ivf", "av1-1-b8-01-size-198x210.ivf",
-  "av1-1-b8-01-size-198x224.ivf", "av1-1-b8-01-size-198x226.ivf",
-  "av1-1-b8-01-size-200x196.ivf", "av1-1-b8-01-size-200x198.ivf",
-  "av1-1-b8-01-size-200x200.ivf", "av1-1-b8-01-size-200x202.ivf",
-  "av1-1-b8-01-size-200x208.ivf", "av1-1-b8-01-size-200x210.ivf",
-  "av1-1-b8-01-size-200x224.ivf", "av1-1-b8-01-size-200x226.ivf",
-  "av1-1-b8-01-size-202x196.ivf", "av1-1-b8-01-size-202x198.ivf",
-  "av1-1-b8-01-size-202x200.ivf", "av1-1-b8-01-size-202x202.ivf",
-  "av1-1-b8-01-size-202x208.ivf", "av1-1-b8-01-size-202x210.ivf",
-  "av1-1-b8-01-size-202x224.ivf", "av1-1-b8-01-size-202x226.ivf",
-  "av1-1-b8-01-size-208x196.ivf", "av1-1-b8-01-size-208x198.ivf",
-  "av1-1-b8-01-size-208x200.ivf", "av1-1-b8-01-size-208x202.ivf",
-  "av1-1-b8-01-size-208x208.ivf", "av1-1-b8-01-size-208x210.ivf",
-  "av1-1-b8-01-size-208x224.ivf", "av1-1-b8-01-size-208x226.ivf",
-  "av1-1-b8-01-size-210x196.ivf", "av1-1-b8-01-size-210x198.ivf",
-  "av1-1-b8-01-size-210x200.ivf", "av1-1-b8-01-size-210x202.ivf",
-  "av1-1-b8-01-size-210x208.ivf", "av1-1-b8-01-size-210x210.ivf",
-  "av1-1-b8-01-size-210x224.ivf", "av1-1-b8-01-size-210x226.ivf",
-  "av1-1-b8-01-size-224x196.ivf", "av1-1-b8-01-size-224x198.ivf",
-  "av1-1-b8-01-size-224x200.ivf", "av1-1-b8-01-size-224x202.ivf",
-  "av1-1-b8-01-size-224x208.ivf", "av1-1-b8-01-size-224x210.ivf",
-  "av1-1-b8-01-size-224x224.ivf", "av1-1-b8-01-size-224x226.ivf",
-  "av1-1-b8-01-size-226x196.ivf", "av1-1-b8-01-size-226x198.ivf",
-  "av1-1-b8-01-size-226x200.ivf", "av1-1-b8-01-size-226x202.ivf",
-  "av1-1-b8-01-size-226x208.ivf", "av1-1-b8-01-size-226x210.ivf",
-  "av1-1-b8-01-size-226x224.ivf", "av1-1-b8-01-size-226x226.ivf",
-  "av1-1-b8-01-size-32x16.ivf",   "av1-1-b8-01-size-32x18.ivf",
-  "av1-1-b8-01-size-32x32.ivf",   "av1-1-b8-01-size-32x34.ivf",
-  "av1-1-b8-01-size-32x64.ivf",   "av1-1-b8-01-size-32x66.ivf",
-  "av1-1-b8-01-size-34x16.ivf",   "av1-1-b8-01-size-34x18.ivf",
-  "av1-1-b8-01-size-34x32.ivf",   "av1-1-b8-01-size-34x34.ivf",
-  "av1-1-b8-01-size-34x64.ivf",   "av1-1-b8-01-size-34x66.ivf",
-  "av1-1-b8-01-size-64x16.ivf",   "av1-1-b8-01-size-64x18.ivf",
-  "av1-1-b8-01-size-64x32.ivf",   "av1-1-b8-01-size-64x34.ivf",
-  "av1-1-b8-01-size-64x64.ivf",   "av1-1-b8-01-size-64x66.ivf",
-  "av1-1-b8-01-size-66x16.ivf",   "av1-1-b8-01-size-66x18.ivf",
-  "av1-1-b8-01-size-66x32.ivf",   "av1-1-b8-01-size-66x34.ivf",
-  "av1-1-b8-01-size-66x64.ivf",   "av1-1-b8-01-size-66x66.ivf",
+  "av1-1-b8-00-quantizer-00.ivf",  "av1-1-b8-00-quantizer-01.ivf",
+  "av1-1-b8-00-quantizer-02.ivf",  "av1-1-b8-00-quantizer-03.ivf",
+  "av1-1-b8-00-quantizer-04.ivf",  "av1-1-b8-00-quantizer-05.ivf",
+  "av1-1-b8-00-quantizer-06.ivf",  "av1-1-b8-00-quantizer-07.ivf",
+  "av1-1-b8-00-quantizer-08.ivf",  "av1-1-b8-00-quantizer-09.ivf",
+  "av1-1-b8-00-quantizer-10.ivf",  "av1-1-b8-00-quantizer-11.ivf",
+  "av1-1-b8-00-quantizer-12.ivf",  "av1-1-b8-00-quantizer-13.ivf",
+  "av1-1-b8-00-quantizer-14.ivf",  "av1-1-b8-00-quantizer-15.ivf",
+  "av1-1-b8-00-quantizer-16.ivf",  "av1-1-b8-00-quantizer-17.ivf",
+  "av1-1-b8-00-quantizer-18.ivf",  "av1-1-b8-00-quantizer-19.ivf",
+  "av1-1-b8-00-quantizer-20.ivf",  "av1-1-b8-00-quantizer-21.ivf",
+  "av1-1-b8-00-quantizer-22.ivf",  "av1-1-b8-00-quantizer-23.ivf",
+  "av1-1-b8-00-quantizer-24.ivf",  "av1-1-b8-00-quantizer-25.ivf",
+  "av1-1-b8-00-quantizer-26.ivf",  "av1-1-b8-00-quantizer-27.ivf",
+  "av1-1-b8-00-quantizer-28.ivf",  "av1-1-b8-00-quantizer-29.ivf",
+  "av1-1-b8-00-quantizer-30.ivf",  "av1-1-b8-00-quantizer-31.ivf",
+  "av1-1-b8-00-quantizer-32.ivf",  "av1-1-b8-00-quantizer-33.ivf",
+  "av1-1-b8-00-quantizer-34.ivf",  "av1-1-b8-00-quantizer-35.ivf",
+  "av1-1-b8-00-quantizer-36.ivf",  "av1-1-b8-00-quantizer-37.ivf",
+  "av1-1-b8-00-quantizer-38.ivf",  "av1-1-b8-00-quantizer-39.ivf",
+  "av1-1-b8-00-quantizer-40.ivf",  "av1-1-b8-00-quantizer-41.ivf",
+  "av1-1-b8-00-quantizer-42.ivf",  "av1-1-b8-00-quantizer-43.ivf",
+  "av1-1-b8-00-quantizer-44.ivf",  "av1-1-b8-00-quantizer-45.ivf",
+  "av1-1-b8-00-quantizer-46.ivf",  "av1-1-b8-00-quantizer-47.ivf",
+  "av1-1-b8-00-quantizer-48.ivf",  "av1-1-b8-00-quantizer-49.ivf",
+  "av1-1-b8-00-quantizer-50.ivf",  "av1-1-b8-00-quantizer-51.ivf",
+  "av1-1-b8-00-quantizer-52.ivf",  "av1-1-b8-00-quantizer-53.ivf",
+  "av1-1-b8-00-quantizer-54.ivf",  "av1-1-b8-00-quantizer-55.ivf",
+  "av1-1-b8-00-quantizer-56.ivf",  "av1-1-b8-00-quantizer-57.ivf",
+  "av1-1-b8-00-quantizer-58.ivf",  "av1-1-b8-00-quantizer-59.ivf",
+  "av1-1-b8-00-quantizer-60.ivf",  "av1-1-b8-00-quantizer-61.ivf",
+  "av1-1-b8-00-quantizer-62.ivf",  "av1-1-b8-00-quantizer-63.ivf",
+  "av1-1-b10-00-quantizer-00.ivf", "av1-1-b10-00-quantizer-01.ivf",
+  "av1-1-b10-00-quantizer-02.ivf", "av1-1-b10-00-quantizer-03.ivf",
+  "av1-1-b10-00-quantizer-04.ivf", "av1-1-b10-00-quantizer-05.ivf",
+  "av1-1-b10-00-quantizer-06.ivf", "av1-1-b10-00-quantizer-07.ivf",
+  "av1-1-b10-00-quantizer-08.ivf", "av1-1-b10-00-quantizer-09.ivf",
+  "av1-1-b10-00-quantizer-10.ivf", "av1-1-b10-00-quantizer-11.ivf",
+  "av1-1-b10-00-quantizer-12.ivf", "av1-1-b10-00-quantizer-13.ivf",
+  "av1-1-b10-00-quantizer-14.ivf", "av1-1-b10-00-quantizer-15.ivf",
+  "av1-1-b10-00-quantizer-16.ivf", "av1-1-b10-00-quantizer-17.ivf",
+  "av1-1-b10-00-quantizer-18.ivf", "av1-1-b10-00-quantizer-19.ivf",
+  "av1-1-b10-00-quantizer-20.ivf", "av1-1-b10-00-quantizer-21.ivf",
+  "av1-1-b10-00-quantizer-22.ivf", "av1-1-b10-00-quantizer-23.ivf",
+  "av1-1-b10-00-quantizer-24.ivf", "av1-1-b10-00-quantizer-25.ivf",
+  "av1-1-b10-00-quantizer-26.ivf", "av1-1-b10-00-quantizer-27.ivf",
+  "av1-1-b10-00-quantizer-28.ivf", "av1-1-b10-00-quantizer-29.ivf",
+  "av1-1-b10-00-quantizer-30.ivf", "av1-1-b10-00-quantizer-31.ivf",
+  "av1-1-b10-00-quantizer-32.ivf", "av1-1-b10-00-quantizer-33.ivf",
+  "av1-1-b10-00-quantizer-34.ivf", "av1-1-b10-00-quantizer-35.ivf",
+  "av1-1-b10-00-quantizer-36.ivf", "av1-1-b10-00-quantizer-37.ivf",
+  "av1-1-b10-00-quantizer-38.ivf", "av1-1-b10-00-quantizer-39.ivf",
+  "av1-1-b10-00-quantizer-40.ivf", "av1-1-b10-00-quantizer-41.ivf",
+  "av1-1-b10-00-quantizer-42.ivf", "av1-1-b10-00-quantizer-43.ivf",
+  "av1-1-b10-00-quantizer-44.ivf", "av1-1-b10-00-quantizer-45.ivf",
+  "av1-1-b10-00-quantizer-46.ivf", "av1-1-b10-00-quantizer-47.ivf",
+  "av1-1-b10-00-quantizer-48.ivf", "av1-1-b10-00-quantizer-49.ivf",
+  "av1-1-b10-00-quantizer-50.ivf", "av1-1-b10-00-quantizer-51.ivf",
+  "av1-1-b10-00-quantizer-52.ivf", "av1-1-b10-00-quantizer-53.ivf",
+  "av1-1-b10-00-quantizer-54.ivf", "av1-1-b10-00-quantizer-55.ivf",
+  "av1-1-b10-00-quantizer-56.ivf", "av1-1-b10-00-quantizer-57.ivf",
+  "av1-1-b10-00-quantizer-58.ivf", "av1-1-b10-00-quantizer-59.ivf",
+  "av1-1-b10-00-quantizer-60.ivf", "av1-1-b10-00-quantizer-61.ivf",
+  "av1-1-b10-00-quantizer-62.ivf", "av1-1-b10-00-quantizer-63.ivf",
+  "av1-1-b8-01-size-16x16.ivf",    "av1-1-b8-01-size-16x18.ivf",
+  "av1-1-b8-01-size-16x32.ivf",    "av1-1-b8-01-size-16x34.ivf",
+  "av1-1-b8-01-size-16x64.ivf",    "av1-1-b8-01-size-16x66.ivf",
+  "av1-1-b8-01-size-18x16.ivf",    "av1-1-b8-01-size-18x18.ivf",
+  "av1-1-b8-01-size-18x32.ivf",    "av1-1-b8-01-size-18x34.ivf",
+  "av1-1-b8-01-size-18x64.ivf",    "av1-1-b8-01-size-18x66.ivf",
+  "av1-1-b8-01-size-196x196.ivf",  "av1-1-b8-01-size-196x198.ivf",
+  "av1-1-b8-01-size-196x200.ivf",  "av1-1-b8-01-size-196x202.ivf",
+  "av1-1-b8-01-size-196x208.ivf",  "av1-1-b8-01-size-196x210.ivf",
+  "av1-1-b8-01-size-196x224.ivf",  "av1-1-b8-01-size-196x226.ivf",
+  "av1-1-b8-01-size-198x196.ivf",  "av1-1-b8-01-size-198x198.ivf",
+  "av1-1-b8-01-size-198x200.ivf",  "av1-1-b8-01-size-198x202.ivf",
+  "av1-1-b8-01-size-198x208.ivf",  "av1-1-b8-01-size-198x210.ivf",
+  "av1-1-b8-01-size-198x224.ivf",  "av1-1-b8-01-size-198x226.ivf",
+  "av1-1-b8-01-size-200x196.ivf",  "av1-1-b8-01-size-200x198.ivf",
+  "av1-1-b8-01-size-200x200.ivf",  "av1-1-b8-01-size-200x202.ivf",
+  "av1-1-b8-01-size-200x208.ivf",  "av1-1-b8-01-size-200x210.ivf",
+  "av1-1-b8-01-size-200x224.ivf",  "av1-1-b8-01-size-200x226.ivf",
+  "av1-1-b8-01-size-202x196.ivf",  "av1-1-b8-01-size-202x198.ivf",
+  "av1-1-b8-01-size-202x200.ivf",  "av1-1-b8-01-size-202x202.ivf",
+  "av1-1-b8-01-size-202x208.ivf",  "av1-1-b8-01-size-202x210.ivf",
+  "av1-1-b8-01-size-202x224.ivf",  "av1-1-b8-01-size-202x226.ivf",
+  "av1-1-b8-01-size-208x196.ivf",  "av1-1-b8-01-size-208x198.ivf",
+  "av1-1-b8-01-size-208x200.ivf",  "av1-1-b8-01-size-208x202.ivf",
+  "av1-1-b8-01-size-208x208.ivf",  "av1-1-b8-01-size-208x210.ivf",
+  "av1-1-b8-01-size-208x224.ivf",  "av1-1-b8-01-size-208x226.ivf",
+  "av1-1-b8-01-size-210x196.ivf",  "av1-1-b8-01-size-210x198.ivf",
+  "av1-1-b8-01-size-210x200.ivf",  "av1-1-b8-01-size-210x202.ivf",
+  "av1-1-b8-01-size-210x208.ivf",  "av1-1-b8-01-size-210x210.ivf",
+  "av1-1-b8-01-size-210x224.ivf",  "av1-1-b8-01-size-210x226.ivf",
+  "av1-1-b8-01-size-224x196.ivf",  "av1-1-b8-01-size-224x198.ivf",
+  "av1-1-b8-01-size-224x200.ivf",  "av1-1-b8-01-size-224x202.ivf",
+  "av1-1-b8-01-size-224x208.ivf",  "av1-1-b8-01-size-224x210.ivf",
+  "av1-1-b8-01-size-224x224.ivf",  "av1-1-b8-01-size-224x226.ivf",
+  "av1-1-b8-01-size-226x196.ivf",  "av1-1-b8-01-size-226x198.ivf",
+  "av1-1-b8-01-size-226x200.ivf",  "av1-1-b8-01-size-226x202.ivf",
+  "av1-1-b8-01-size-226x208.ivf",  "av1-1-b8-01-size-226x210.ivf",
+  "av1-1-b8-01-size-226x224.ivf",  "av1-1-b8-01-size-226x226.ivf",
+  "av1-1-b8-01-size-32x16.ivf",    "av1-1-b8-01-size-32x18.ivf",
+  "av1-1-b8-01-size-32x32.ivf",    "av1-1-b8-01-size-32x34.ivf",
+  "av1-1-b8-01-size-32x64.ivf",    "av1-1-b8-01-size-32x66.ivf",
+  "av1-1-b8-01-size-34x16.ivf",    "av1-1-b8-01-size-34x18.ivf",
+  "av1-1-b8-01-size-34x32.ivf",    "av1-1-b8-01-size-34x34.ivf",
+  "av1-1-b8-01-size-34x64.ivf",    "av1-1-b8-01-size-34x66.ivf",
+  "av1-1-b8-01-size-64x16.ivf",    "av1-1-b8-01-size-64x18.ivf",
+  "av1-1-b8-01-size-64x32.ivf",    "av1-1-b8-01-size-64x34.ivf",
+  "av1-1-b8-01-size-64x64.ivf",    "av1-1-b8-01-size-64x66.ivf",
+  "av1-1-b8-01-size-66x16.ivf",    "av1-1-b8-01-size-66x18.ivf",
+  "av1-1-b8-01-size-66x32.ivf",    "av1-1-b8-01-size-66x34.ivf",
+  "av1-1-b8-01-size-66x64.ivf",    "av1-1-b8-01-size-66x66.ivf",
+  "av1-1-b8-02-allintra.ivf",
 };
 const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors);
 #endif  // CONFIG_AV1_DECODER
diff --git a/third_party/aom/test/tile_independence_test.cc b/third_party/aom/test/tile_independence_test.cc
index e8b2e1fe4..cf534c0c5 100644
--- a/third_party/aom/test/tile_independence_test.cc
+++ b/third_party/aom/test/tile_independence_test.cc
@@ -146,25 +146,28 @@ AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1),
 
 class TileIndependenceLSTest : public TileIndependenceTest {};
 
-TEST_P(TileIndependenceLSTest, DISABLED_MD5Match) {
+TEST_P(TileIndependenceLSTest, MD5Match) {
   cfg_.large_scale_tile = 1;
   fw_dec_->Control(AV1_SET_TILE_MODE, 1);
+  fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
   inv_dec_->Control(AV1_SET_TILE_MODE, 1);
+  inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
   DoTest();
 }
 
 class TileIndependenceLSTestLarge : public TileIndependenceTestLarge {};
 
-TEST_P(TileIndependenceLSTestLarge, DISABLED_MD5Match) {
+TEST_P(TileIndependenceLSTestLarge, MD5Match) {
   cfg_.large_scale_tile = 1;
   fw_dec_->Control(AV1_SET_TILE_MODE, 1);
+  fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
   inv_dec_->Control(AV1_SET_TILE_MODE, 1);
+  inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
   DoTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTest, ::testing::Values(1, 2, 32),
-                          ::testing::Values(1, 2, 32), ::testing::Values(1));
-AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTestLarge,
-                          ::testing::Values(1, 2, 32),
-                          ::testing::Values(1, 2, 32), ::testing::Values(1));
+AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTest, ::testing::Values(6),
+                          ::testing::Values(6), ::testing::Values(1));
+AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTestLarge, ::testing::Values(6),
+                          ::testing::Values(6), ::testing::Values(1));
 }  // namespace
diff --git a/third_party/aom/test/tools_common.sh b/third_party/aom/test/tools_common.sh
index 21a6b9b8e..c08710606 100755
--- a/third_party/aom/test/tools_common.sh
+++ b/third_party/aom/test/tools_common.sh
@@ -47,26 +47,16 @@ test_end() {
 
 # Echoes the target configuration being tested.
 test_configuration_target() {
-  aom_config_mk="${LIBAOM_CONFIG_PATH}/config.mk"
-  # TODO(tomfinegan): Remove the parts requiring config.mk when the configure
-  # script is removed from the repository.
-  if [ ! -f "${aom_config_mk}" ]; then
-    aom_config_c="${LIBAOM_CONFIG_PATH}/aom_config.c"
-    # Clean up the cfg pointer line from aom_config.c for easier re-use by
-    # someone examining a failure in the example tests.
-    # 1. Run grep on aom_config.c for cfg and limit the results to 1.
-    # 2. Split the line using ' = ' as separator.
-    # 3. Abuse sed to consume the leading " and trailing "; from the assignment
-    #    to the cfg pointer.
-    cmake_config=$(awk -F ' = ' '/cfg/ { print $NF; exit }' "${aom_config_c}" \
-      | sed -e s/\"// -e s/\"\;//)
-    echo cmake generated via command: cmake path/to/aom ${cmake_config}
-    return
-  fi
-  # Find the TOOLCHAIN line, split it using ':=' as the field separator, and
-  # print the last field to get the value. Then pipe the value to tr to consume
-  # any leading/trailing spaces while allowing tr to echo the output to stdout.
-  awk -F ':=' '/TOOLCHAIN/ { print $NF }' "${aom_config_mk}" | tr -d ' '
+  aom_config_c="${LIBAOM_CONFIG_PATH}/config/aom_config.c"
+  # Clean up the cfg pointer line from aom_config.c for easier re-use by
+  # someone examining a failure in the example tests.
+  # 1. Run awk on aom_config.c for cfg and stop at the first matching line.
+  # 2. Split the line using ' = ' as separator and print the last field.
+  # 3. Abuse sed to consume the leading " and trailing "; from the assignment
+  #    to the cfg pointer.
+  cmake_config=$(awk -F ' = ' '/cfg/ { print $NF; exit }' "${aom_config_c}" \
+    | sed -e s/\"// -e s/\"\;//)
+  echo cmake generated via command: cmake path/to/aom ${cmake_config}
 }
 
 # Trap function used for failure reports and tool output directory removal.
@@ -163,10 +153,10 @@ is_windows_target() {
 # included in $tool_paths, or an empty string. Caller is responsible for testing
 # the string once the function returns.
 aom_tool_path() {
-  local readonly tool_name="$1"
-  local readonly root_path="${LIBAOM_BIN_PATH}"
-  local readonly suffix="${AOM_TEST_EXE_SUFFIX}"
-  local readonly tool_paths="\
+  local tool_name="$1"
+  local root_path="${LIBAOM_BIN_PATH}"
+  local suffix="${AOM_TEST_EXE_SUFFIX}"
+  local tool_paths="\
     ${root_path}/${tool_name}${suffix} \
     ${root_path}/../${tool_name}${suffix} \
     ${root_path}/tools/${tool_name}${suffix} \
@@ -348,8 +338,8 @@ yuv_raw_input() {
 # Do a small encode for testing decoders.
 encode_yuv_raw_input_av1() {
   if [ "$(av1_encode_available)" = "yes" ]; then
-    local readonly output="$1"
-    local readonly encoder="$(aom_tool_path aomenc)"
+    local output="$1"
+    local encoder="$(aom_tool_path aomenc)"
     shift
     eval "${encoder}" $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
diff --git a/third_party/aom/test/variance_test.cc b/third_party/aom/test/variance_test.cc
index eb801b442..2f1b1fc5a 100644
--- a/third_party/aom/test/variance_test.cc
+++ b/third_party/aom/test/variance_test.cc
@@ -23,6 +23,7 @@
 #include "aom/aom_codec.h"
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
 
 namespace {
@@ -46,6 +47,10 @@ typedef unsigned int (*JntSubpixAvgVarMxNFunc)(
     const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
     int b_stride, uint32_t *sse, const uint8_t *second_pred,
     const JNT_COMP_PARAMS *jcp_param);
+typedef uint32_t (*ObmcSubpelVarFunc)(const uint8_t *pre, int pre_stride,
+                                      int xoffset, int yoffset,
+                                      const int32_t *wsrc, const int32_t *mask,
+                                      unsigned int *sse);
 
 using libaom_test::ACMRandom;
 
@@ -269,6 +274,56 @@ static uint32_t jnt_subpel_avg_variance_ref(
   return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
 
+static uint32_t obmc_subpel_variance_ref(const uint8_t *pre, int l2w, int l2h,
+                                         int xoff, int yoff,
+                                         const int32_t *wsrc,
+                                         const int32_t *mask, uint32_t *sse_ptr,
+                                         bool use_high_bit_depth_,
+                                         aom_bit_depth_t bit_depth) {
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  // pre is read with a stride of w + 1 and needs (w + 1) * (h + 1) samples.
+  xoff <<= 1;  // doubled: the blends below weight by offset / 16 (>> 4)
+  yoff <<= 1;  // same scaling for the vertical blend
+
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      // Bilinear interpolation at a 16th pel step.
+      if (!use_high_bit_depth_) {
+        const int a1 = pre[(w + 1) * (y + 0) + x + 0];
+        const int a2 = pre[(w + 1) * (y + 0) + x + 1];
+        const int b1 = pre[(w + 1) * (y + 1) + x + 0];
+        const int b2 = pre[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = ROUND_POWER_OF_TWO_SIGNED(
+            wsrc[w * y + x] - r * mask[w * y + x], 12);
+        se += diff;
+        sse += diff * diff;
+      } else {
+        uint16_t *pre16 = CONVERT_TO_SHORTPTR(pre);
+        const int a1 = pre16[(w + 1) * (y + 0) + x + 0];
+        const int a2 = pre16[(w + 1) * (y + 0) + x + 1];
+        const int b1 = pre16[(w + 1) * (y + 1) + x + 0];
+        const int b2 = pre16[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = ROUND_POWER_OF_TWO_SIGNED(
+            wsrc[w * y + x] - r * mask[w * y + x], 12);
+        se += diff;
+        sse += diff * diff;
+      }
+    }
+  }
+  RoundHighBitDepth(bit_depth, &se, &sse);
+  *sse_ptr = static_cast<uint32_t>(sse);
+  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
@@ -803,12 +858,170 @@ void SubpelVarianceTest<JntSubpixAvgVarMxNFunc>::RefTest() {
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+
+static const int kMaskMax = 64;
+
+typedef TestParams<ObmcSubpelVarFunc> ObmcSubpelVarianceParams;
+
+template <typename FunctionType>
+class ObmcVarianceTest
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  virtual void SetUp() {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    if (!use_high_bit_depth()) {
+      pre_ = reinterpret_cast<uint8_t *>(
+          aom_memalign(32, block_size() + width() + height() + 1));
+    } else {  // 16-bit samples: scale the whole element count to bytes.
+      pre_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(aom_memalign(
+          32, (block_size() + width() + height() + 1) * sizeof(uint16_t))));
+    }
+    wsrc_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, block_size() * sizeof(*wsrc_)));
+    mask_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, block_size() * sizeof(*mask_)));
+    ASSERT_TRUE(pre_ != NULL);
+    ASSERT_TRUE(wsrc_ != NULL);
+    ASSERT_TRUE(mask_ != NULL);
+  }
+
+  virtual void TearDown() {
+    if (!use_high_bit_depth()) {
+      aom_free(pre_);
+    } else {
+      aom_free(CONVERT_TO_SHORTPTR(pre_));
+    }
+    aom_free(wsrc_);
+    aom_free(mask_);
+    libaom_test::ClearSystemState();
+  }
+
+ protected:
+  void RefTest();
+  void ExtremeRefTest();
+  void SpeedTest();
+
+  ACMRandom rnd_;
+  uint8_t *pre_;    // 8-bit samples, or wrapped uint16_t ones if high bitdepth
+  int32_t *wsrc_;   // block_size() entries
+  int32_t *mask_;   // block_size() entries
+  TestParams<FunctionType> params_;
+
+  // Accessors relaying the fields of params_.
+  bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+  int byte_shift() const { return params_.bit_depth - 8; }
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int height() const { return params_.height; }
+  uint32_t bd_mask() const { return params_.mask; }
+};
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::RefTest() {
+  for (int x = 0; x < 8; ++x) {  // x, y: the xoffset/yoffset arguments
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth())  // new random inputs for every offset pair
+        for (int j = 0; j < block_size() + width() + height() + 1; j++)
+          pre_[j] = rnd_.Rand8();
+      else
+        for (int j = 0; j < block_size() + width() + height() + 1; j++)
+          CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask();
+      for (int j = 0; j < block_size(); j++) {
+        wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
+        mask_[j] = rnd_(kMaskMax * kMaskMax + 1);
+      }
+      // Compare the function under test against the reference on this data.
+      uint32_t sse1, sse2;
+      uint32_t var1, var2;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
+      var2 = obmc_subpel_variance_ref(
+          pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
+          &sse2, use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+    }
+  }
+}
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::ExtremeRefTest() {
+  // Pre and WSrc: first half of values at the maximum, the second half at 0.
+  // Mask: first half of values at 0, the second half at the maximum.
+  // NOTE(review): high-bitdepth path never writes pre_ past block_size().
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      const int half = block_size() / 2;
+      if (!use_high_bit_depth()) {
+        memset(pre_, 255, half);
+        memset(pre_ + half, 0, half + width() + height() + 1);
+      } else {
+        aom_memset16(CONVERT_TO_SHORTPTR(pre_), bd_mask(), half);
+        aom_memset16(CONVERT_TO_SHORTPTR(pre_) + half, 0, half);
+      }
+      for (int j = 0; j < half; j++) {
+        wsrc_[j] = bd_mask() * kMaskMax * kMaskMax;
+        mask_[j] = 0;
+      }
+      for (int j = half; j < block_size(); j++) {
+        wsrc_[j] = 0;
+        mask_[j] = kMaskMax * kMaskMax;
+      }
+
+      uint32_t sse1, sse2;
+      uint32_t var1, var2;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
+      var2 = obmc_subpel_variance_ref(
+          pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
+          &sse2, use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+    }
+  }
+}
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::SpeedTest() {
+  if (!use_high_bit_depth())
+    for (int j = 0; j < block_size() + width() + height() + 1; j++)
+      pre_[j] = rnd_.Rand8();
+  else
+    for (int j = 0; j < block_size() + width() + height() + 1; j++)
+      CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask();
+  for (int j = 0; j < block_size(); j++) {
+    wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
+    mask_[j] = rnd_(kMaskMax * kMaskMax + 1);
+  }
+  unsigned int sse1;
+  const int stride = width() + 1;
+  int run_time = 1000000000 / block_size();  // iteration count, not a duration
+  aom_usec_timer timer;
+  // The timed loop includes the two rnd_(8) calls made on every iteration.
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    ASM_REGISTER_STATE_CHECK(
+        params_.func(pre_, stride, x, y, wsrc_, mask_, &sse1));
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("obmc_sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
+         params_.bit_depth, elapsed_time);
+}
+
 typedef MainTestClass<Get4x4SseFunc> AvxSseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest;
 typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
 typedef SubpelVarianceTest<JntSubpixAvgVarMxNFunc> AvxJntSubpelAvgVarianceTest;
+typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest;
 
 TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
 TEST_P(AvxSseTest, MaxSse) { MaxTestSse(); }
@@ -825,6 +1038,9 @@ TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxJntSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 
 INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
                         ::testing::Values(aom_get_mb_ss_c));
@@ -934,10 +1150,32 @@ INSTANTIATE_TEST_CASE_P(
         JntSubpelAvgVarianceParams(2, 2, &aom_jnt_sub_pixel_avg_variance4x4_c,
                                    0)));
 
+INSTANTIATE_TEST_CASE_P(
+    C, AvxObmcSubpelVarianceTest,
+    ::testing::Values(
+        ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_c,
+                                 0),
+        ObmcSubpelVarianceParams(7, 6, &aom_obmc_sub_pixel_variance128x64_c, 0),
+        ObmcSubpelVarianceParams(6, 7, &aom_obmc_sub_pixel_variance64x128_c, 0),
+        ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_c, 0),
+        ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_c, 0),
+        ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_c, 0),
+        ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_c, 0),
+        ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_c, 0),
+        ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_c, 0),
+        ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_c, 0),
+        ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_c, 0),
+        ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_c, 0),
+        ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_c, 0),
+        ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_c, 0),
+        ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_c, 0),
+        ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_c, 0)));
+
 typedef MainTestClass<VarianceMxNFunc> AvxHBDMseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
 typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxHBDSubpelAvgVarianceTest;
+typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxHBDObmcSubpelVarianceTest;
 
 TEST_P(AvxHBDMseTest, RefMse) { RefTestMse(); }
 TEST_P(AvxHBDMseTest, MaxMse) { MaxTestMse(); }
@@ -1161,6 +1399,94 @@ const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = {
 INSTANTIATE_TEST_CASE_P(C, AvxHBDSubpelAvgVarianceTest,
                         ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
 
+const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = {
+  ObmcSubpelVarianceParams(7, 7, &aom_highbd_obmc_sub_pixel_variance128x128_c,
+                           8),
+  ObmcSubpelVarianceParams(7, 6, &aom_highbd_obmc_sub_pixel_variance128x64_c,
+                           8),
+  ObmcSubpelVarianceParams(6, 7, &aom_highbd_obmc_sub_pixel_variance64x128_c,
+                           8),
+  ObmcSubpelVarianceParams(6, 6, &aom_highbd_obmc_sub_pixel_variance64x64_c, 8),
+  ObmcSubpelVarianceParams(6, 5, &aom_highbd_obmc_sub_pixel_variance64x32_c, 8),
+  ObmcSubpelVarianceParams(5, 6, &aom_highbd_obmc_sub_pixel_variance32x64_c, 8),
+  ObmcSubpelVarianceParams(5, 5, &aom_highbd_obmc_sub_pixel_variance32x32_c, 8),
+  ObmcSubpelVarianceParams(5, 4, &aom_highbd_obmc_sub_pixel_variance32x16_c, 8),
+  ObmcSubpelVarianceParams(4, 5, &aom_highbd_obmc_sub_pixel_variance16x32_c, 8),
+  ObmcSubpelVarianceParams(4, 4, &aom_highbd_obmc_sub_pixel_variance16x16_c, 8),
+  ObmcSubpelVarianceParams(4, 3, &aom_highbd_obmc_sub_pixel_variance16x8_c, 8),
+  ObmcSubpelVarianceParams(3, 4, &aom_highbd_obmc_sub_pixel_variance8x16_c, 8),
+  ObmcSubpelVarianceParams(3, 3, &aom_highbd_obmc_sub_pixel_variance8x8_c, 8),
+  ObmcSubpelVarianceParams(3, 2, &aom_highbd_obmc_sub_pixel_variance8x4_c, 8),
+  ObmcSubpelVarianceParams(2, 3, &aom_highbd_obmc_sub_pixel_variance4x8_c, 8),
+  ObmcSubpelVarianceParams(2, 2, &aom_highbd_obmc_sub_pixel_variance4x4_c, 8),
+  ObmcSubpelVarianceParams(7, 7,
+                           &aom_highbd_10_obmc_sub_pixel_variance128x128_c, 10),
+  ObmcSubpelVarianceParams(7, 6, &aom_highbd_10_obmc_sub_pixel_variance128x64_c,
+                           10),
+  ObmcSubpelVarianceParams(6, 7, &aom_highbd_10_obmc_sub_pixel_variance64x128_c,
+                           10),
+  ObmcSubpelVarianceParams(6, 6, &aom_highbd_10_obmc_sub_pixel_variance64x64_c,
+                           10),
+  ObmcSubpelVarianceParams(6, 5, &aom_highbd_10_obmc_sub_pixel_variance64x32_c,
+                           10),
+  ObmcSubpelVarianceParams(5, 6, &aom_highbd_10_obmc_sub_pixel_variance32x64_c,
+                           10),
+  ObmcSubpelVarianceParams(5, 5, &aom_highbd_10_obmc_sub_pixel_variance32x32_c,
+                           10),
+  ObmcSubpelVarianceParams(5, 4, &aom_highbd_10_obmc_sub_pixel_variance32x16_c,
+                           10),
+  ObmcSubpelVarianceParams(4, 5, &aom_highbd_10_obmc_sub_pixel_variance16x32_c,
+                           10),
+  ObmcSubpelVarianceParams(4, 4, &aom_highbd_10_obmc_sub_pixel_variance16x16_c,
+                           10),
+  ObmcSubpelVarianceParams(4, 3, &aom_highbd_10_obmc_sub_pixel_variance16x8_c,
+                           10),
+  ObmcSubpelVarianceParams(3, 4, &aom_highbd_10_obmc_sub_pixel_variance8x16_c,
+                           10),
+  ObmcSubpelVarianceParams(3, 3, &aom_highbd_10_obmc_sub_pixel_variance8x8_c,
+                           10),
+  ObmcSubpelVarianceParams(3, 2, &aom_highbd_10_obmc_sub_pixel_variance8x4_c,
+                           10),
+  ObmcSubpelVarianceParams(2, 3, &aom_highbd_10_obmc_sub_pixel_variance4x8_c,
+                           10),
+  ObmcSubpelVarianceParams(2, 2, &aom_highbd_10_obmc_sub_pixel_variance4x4_c,
+                           10),
+  ObmcSubpelVarianceParams(7, 7,
+                           &aom_highbd_12_obmc_sub_pixel_variance128x128_c, 12),
+  ObmcSubpelVarianceParams(7, 6, &aom_highbd_12_obmc_sub_pixel_variance128x64_c,
+                           12),
+  ObmcSubpelVarianceParams(6, 7, &aom_highbd_12_obmc_sub_pixel_variance64x128_c,
+                           12),
+  ObmcSubpelVarianceParams(6, 6, &aom_highbd_12_obmc_sub_pixel_variance64x64_c,
+                           12),
+  ObmcSubpelVarianceParams(6, 5, &aom_highbd_12_obmc_sub_pixel_variance64x32_c,
+                           12),
+  ObmcSubpelVarianceParams(5, 6, &aom_highbd_12_obmc_sub_pixel_variance32x64_c,
+                           12),
+  ObmcSubpelVarianceParams(5, 5, &aom_highbd_12_obmc_sub_pixel_variance32x32_c,
+                           12),
+  ObmcSubpelVarianceParams(5, 4, &aom_highbd_12_obmc_sub_pixel_variance32x16_c,
+                           12),
+  ObmcSubpelVarianceParams(4, 5, &aom_highbd_12_obmc_sub_pixel_variance16x32_c,
+                           12),
+  ObmcSubpelVarianceParams(4, 4, &aom_highbd_12_obmc_sub_pixel_variance16x16_c,
+                           12),
+  ObmcSubpelVarianceParams(4, 3, &aom_highbd_12_obmc_sub_pixel_variance16x8_c,
+                           12),
+  ObmcSubpelVarianceParams(3, 4, &aom_highbd_12_obmc_sub_pixel_variance8x16_c,
+                           12),
+  ObmcSubpelVarianceParams(3, 3, &aom_highbd_12_obmc_sub_pixel_variance8x8_c,
+                           12),
+  ObmcSubpelVarianceParams(3, 2, &aom_highbd_12_obmc_sub_pixel_variance8x4_c,
+                           12),
+  ObmcSubpelVarianceParams(2, 3, &aom_highbd_12_obmc_sub_pixel_variance4x8_c,
+                           12),
+  ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_c,
+                           12)
+};
+INSTANTIATE_TEST_CASE_P(C, AvxHBDObmcSubpelVarianceTest,
+                        ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_c));
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
                         ::testing::Values(aom_get_mb_ss_sse2));
@@ -1519,6 +1845,44 @@ INSTANTIATE_TEST_CASE_P(
                                    0)));
 #endif  // HAVE_SSSE3
 
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, AvxObmcSubpelVarianceTest,
+    ::testing::Values(
+        ObmcSubpelVarianceParams(7, 7,
+                                 &aom_obmc_sub_pixel_variance128x128_sse4_1, 0),
+        ObmcSubpelVarianceParams(7, 6,
+                                 &aom_obmc_sub_pixel_variance128x64_sse4_1, 0),
+        ObmcSubpelVarianceParams(6, 7,
+                                 &aom_obmc_sub_pixel_variance64x128_sse4_1, 0),
+        ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_sse4_1,
+                                 0),
+        ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_sse4_1,
+                                 0)));
+#endif  // HAVE_SSE4_1
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(AVX2, AvxMseTest,
                         ::testing::Values(MseParams(4, 4, &aom_mse16x16_avx2)));